def __init__(self, We_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden
    self.num_labels = params.num_labels
    self.de_hidden_size = params.de_hidden_size
    self.en_hidden_size = params.en_hidden_size
    print params.de_hidden_size, hidden, params.num_labels
    self.lstm_layers_num = 1

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    target_var_in = T.imatrix(name='in_targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()
    length0 = T.iscalar()
    t_t = T.fscalar()
    t_t0 = T.fscalar()

    # pairwise transition matrix; the extra row/column is for the start/end symbol
    Wyy0 = np.random.uniform(-0.02, 0.02,
                             (self.num_labels + 1, self.num_labels + 1)).astype('float32')
    Wyy = theano.shared(Wyy0)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(
            l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, 512, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, 512, mask_input=l_mask_word,
                                            backwards=True)

    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * 512))
    l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units=self.num_labels,
                                        nonlinearity=lasagne.nonlinearities.linear)

    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    print len(network_params)

    # load the pretrained BiLSTM-CRF tagger parameters
    f = open('ccctag_CRF_Bilstm_Viterbi_.Batchsize_10_dropout_0_LearningRate_0.01_0.0512_tagversoin_2.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        #print data[idx].shape
        p.set_value(data[idx])

    self.params = []
    self.hos = []
    self.Cos = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []

    ei, di, dt = T.imatrices(3)  # place holders
    decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6)

    # the last row of the decoder lookup table is for the start symbol
    self.de_lookuptable = theano.shared(
        name="Decoder LookUpTable",
        value=init_xavier_uniform(self.num_labels + 1, self.de_hidden_size),
        borrow=True)
    self.linear = theano.shared(
        name="Linear",
        value=init_xavier_uniform(self.de_hidden_size + 2 * self.en_hidden_size,
                                  self.num_labels),
        borrow=True)
    self.linear_bias = theano.shared(
        name="Hidden to Bias",
        value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                         dtype=theano.config.floatX),
        borrow=True)
    #self.hidden_decode = theano.shared(name="Hidden to Decode", value=init_xavier_uniform(2*hidden, self.de_hidden_size), borrow=True)
    #self.hidden_bias = theano.shared(name="Hidden to Bias", value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX), borrow=True)

    input_var_shuffle = input_var.dimshuffle(1, 0)
    mask_var_shuffle = mask_var.dimshuffle(1, 0)
    target_var_in_shuffle = target_var_in.dimshuffle(1, 0)
    target_var_shuffle = target_var.dimshuffle(1, 0)

    self.params += [self.linear, self.linear_bias, self.de_lookuptable]

    # (max_sent_size, batch_size, embsize)
    state_below = We[input_var_shuffle.flatten()].reshape(
        (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))

    enclstm_f = LSTM(embsize, self.en_hidden_size)
    enclstm_b = LSTM(embsize, self.en_hidden_size, True)
    self.encoder_lstm_layers.append(enclstm_f)
    self.encoder_lstm_layers.append(enclstm_b)
    self.params += enclstm_f.params + enclstm_b.params

    hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
    hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)

    hs = T.concatenate([hs_f, hs_b], axis=2)
    Cs = T.concatenate([Cs_f, Cs_b], axis=2)
    hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
    Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
    #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
    #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
    # the decoder is initialized with zero states
    self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                        input_var_shuffle.shape[1], self.de_hidden_size),
    self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                        input_var_shuffle.shape[1], self.de_hidden_size),
    Encoder = hs

    ei, di, dt = T.imatrices(3)  # place holders
    em, dm, tf, di0 = T.fmatrices(4)

    self.encoder_function = theano.function(inputs=[ei, em], outputs=Encoder,
                                            givens={input_var: ei, mask_var: em})

    state_below = self.de_lookuptable[target_var_in_shuffle.flatten()].reshape(
        (target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1],
         self.de_hidden_size))

    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co)

    decoder_lstm_outputs = T.concatenate([Encoder, state_below], axis=2)
    linear_outputs = T.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
    softmax_outputs, updates = theano.scan(
        fn=lambda x: T.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * T.log(pred[T.arange(input_var.shape[0]), y])

    def _step2(ctx_, state_, hs_, Cs_):
        # one greedy decoding step: feed the previous predicted label back in
        hs, Cs = [], []
        token_idxs = T.cast(state_.argmax(axis=-1), "int32")
        msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, ctx_.shape[0], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h
        hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
        state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
        state_below0 = T.concatenate([ctx_, state_below0], axis=1)
        newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :]
        state_below = T.nnet.softmax(newpred)
        # the probability of the start symbol is 0
        extra_p = T.zeros_like(hs[:, :, 0])
        state_below = T.concatenate([state_below, extra_p.T], axis=1)
        return state_below, hs, Cs

    ctx_0, state_0 = T.fmatrices(2)
    hs_0 = T.ftensor3()
    Cs_0 = T.ftensor3()
    state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0)
    self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0],
                                  [state_below_tmp, hs_tmp, Cs_tmp], name='f_next')

    hs0, Cs0 = T.as_tensor_variable(self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2, sequences=[Encoder],
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=input_var_shuffle.shape[0])

    predy = train_outputs[0].dimshuffle(1, 0, 2)
    predy = predy[:, :, :-1] * mask_var[:, :, None]
    predy0 = predy.reshape((-1, self.num_labels))

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var,
                                                       l_mask_word: mask_var})
    local_energy = local_energy.reshape((-1, length, self.num_labels))
    local_energy = local_energy * mask_var[:, :, None]

    # add the transition score for the end symbol of a sequence
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var})
    predy_in = T.argmax(predy0, axis=1)
    A = T.extra_ops.to_one_hot(predy_in, self.num_labels)
    A = A.reshape((-1, length, self.num_labels))

    #predy = predy0.reshape((-1, length, 25))
    #predy = predy * mask_var[:, :, None]

    # CRF energy of the inference network's soft predictions
    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials,
                                          sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

    # CRF energy of the hard (argmax) predictions
    targets_shuffled0 = A.dimshuffle(1, 0, 2)
    target_time00 = targets_shuffled0[0]
    initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])
    initials0 = [target_time00, initial_energy00]
    [_, target_energies0], _ = theano.scan(fn=inner_function, outputs_info=initials0,
                                           sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
    cost110 = target_energies0[-1] + T.sum(T.sum(local_energy * A, axis=2) * mask_var, axis=1)

    #predy_f = predy.reshape((-1, 25))
    y_f = target_var.flatten()

    if params.annealing == 0:
        lamb = params.L3
    elif params.annealing == 1:
        lamb = params.L3 * (1 - 0.01 * t_t)

    if params.regutype == 0:
        ce_hinge = lasagne.objectives.categorical_crossentropy(predy0 + eps, y_f)
        ce_hinge = ce_hinge.reshape((-1, length))
        ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
        cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
    else:
        entropy_term = -T.sum(predy0 * T.log(predy0 + eps), axis=1)
        entropy_term = entropy_term.reshape((-1, length))
        entropy_term = T.sum(entropy_term * mask_var, axis=1)
        cost = T.mean(-cost11) - lamb * T.mean(entropy_term)

    """
    f = open('F0_simple.pickle')
    PARA = pickle.load(f)
    f.close()
    l2_term = sum(lasagne.regularization.l2(x - PARA[index]) for index, x in enumerate(a_params))
    cost = T.mean(-cost11) + params.L2 * l2_term
    """

    ##from adam import adam
    ##updates_a = adam(cost, self.params, params.eta)
    #updates_a = lasagne.updates.sgd(cost, self.params, params.eta)
    #updates_a = lasagne.updates.apply_momentum(updates_a, self.params, momentum=0.9)
    from momentum import momentum
    updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

    if params.regutype == 0:
        self.train_fn = theano.function(
            inputs=[ei, dt, em, em1, length0, t_t0, di0],
            outputs=[cost, ce_hinge],
            updates=updates_a,
            on_unused_input='ignore',
            givens={input_var: ei, target_var: dt, mask_var: em, mask_var1: em1,
                    length: length0, t_t: t_t0, decoderInputs0: di0})
        #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, ce_hinge], updates=updates_a, on_unused_input='ignore')
    else:
        self.train_fn = theano.function(
            inputs=[ei, dt, em, em1, length0, t_t0, di0],
            outputs=[cost, entropy_term],
            updates=updates_a,
            on_unused_input='ignore',
            givens={input_var: ei, target_var: dt, mask_var: em, mask_var1: em1,
                    length: length0, t_t: t_t0, decoderInputs0: di0})
        #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, entropy_term], updates=updates_a, on_unused_input='ignore')

    prediction = T.argmax(predy, axis=2)
    corr = T.eq(prediction, target_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    self.eval_fn = theano.function(
        inputs=[ei, dt, em, em1, length0, di0],
        outputs=[cost11, cost110, corr_train, num_tokens, prediction],
        on_unused_input='ignore',
        givens={input_var: ei, target_var: dt, mask_var: em, mask_var1: em1,
                length: length0, decoderInputs0: di0})
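    # --- Usage sketch (added for illustration; not part of the original source). ---
    # A minimal sketch of how the compiled functions above might be driven with
    # integer-encoded, padded batches. The array names, shapes, and the convention
    # that row `num_labels` of the decoder lookup table is the start symbol are
    # assumptions, not taken from the original code.
    #
    #   batch_size = x.shape[0]                                  # x: int32 [batch, sent_len]
    #   di0 = np.zeros((batch_size, self.num_labels + 1), dtype='float32')
    #   di0[:, self.num_labels] = 1.0                            # one-hot start-symbol distribution
    #   cost, reg = self.train_fn(x, y, mask, mask1, sent_len, epoch_t, di0)
    #   energies, gold_energies, n_correct, n_tokens, pred = \
    #       self.eval_fn(x, y, mask, mask1, sent_len, di0)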
def __init__(self, We_initial, char_embedd_table_initial, params):
    We = theano.shared(We_initial)
    # a separate embedding copy for the inference network
    We_inf = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden
    self.en_hidden_size = params.hidden_inf
    self.num_labels = 17
    self.de_hidden_size = params.de_hidden_size

    char_embedd_dim = params.char_embedd_dim
    char_dic_size = len(params.char_dic)
    char_embedd_table = theano.shared(char_embedd_table_initial)
    char_embedd_table_inf = theano.shared(char_embedd_table_initial)

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    target_var_in = T.imatrix(name='targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    char_input_var = T.itensor3(name='char-inputs')
    length = T.iscalar()
    length0 = T.iscalar()
    t_t = T.fscalar()
    t_t0 = T.fscalar()
    use_dropout = T.fscalar()
    use_dropout0 = T.fscalar()

    Wyy0 = np.random.uniform(-0.02, 0.02,
                             (self.num_labels + 1, self.num_labels + 1)).astype('float32')
    Wyy = theano.shared(Wyy0)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(
            l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We,
            name='word_embedding')
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length),
                                                 input_var=char_input_var, name='char-input')
    layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
    layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char,
                                                         input_size=char_dic_size,
                                                         output_size=char_embedd_dim,
                                                         W=char_embedd_table,
                                                         name='char_embedding')
    layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))

    # first get some necessary dimensions or parameters
    conv_window = 3
    num_filters = params.num_filters

    # construct convolution layer
    cnn_layer = lasagne.layers.Conv1DLayer(layer_char, num_filters=num_filters,
                                           filter_size=conv_window, pad='full',
                                           nonlinearity=lasagne.nonlinearities.tanh,
                                           name='cnn')
    # infer the pool size for pooling (pool size should go through all time steps of cnn)
    _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
    # reshape the layer to match the lstm incoming layer
    # [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

    # finally, concatenate the two incoming layers together
    incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)

    l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word,
                                            backwards=True)

    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units=self.num_labels + 1,
                                        nonlinearity=lasagne.nonlinearities.linear)

    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    print len(network_params)

    # load the pretrained BiLSTM-CNN-CRF NER tagger parameters
    f = open('NER_BiLSTM_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.005_0.0_50_hidden_200.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        p.set_value(data[idx])

    self.params = []
    self.hos = []
    self.Cos = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.lstm_layers_num = 1

    ei, di, dt = T.imatrices(3)  # place holders
    decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6)
    ci = T.itensor3()

    # the last row of the decoder lookup table is for the start symbol
    self.de_lookuptable = theano.shared(
        name="Decoder LookUpTable",
        value=init_xavier_uniform(self.num_labels + 1, self.de_hidden_size),
        borrow=True)
    self.linear = theano.shared(
        name="Linear",
        value=init_xavier_uniform(self.de_hidden_size + 2 * self.en_hidden_size,
                                  self.num_labels),
        borrow=True)
    self.linear_bias = theano.shared(
        name="Hidden to Bias",
        value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                         dtype=theano.config.floatX),
        borrow=True)
    #self.hidden_decode = theano.shared(name="Hidden to Decode", value=init_xavier_uniform(2*hidden, self.de_hidden_size), borrow=True)
    #self.hidden_bias = theano.shared(name="Hidden to Bias", value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX), borrow=True)

    input_var_shuffle = input_var.dimshuffle(1, 0)
    mask_var_shuffle = mask_var.dimshuffle(1, 0)
    target_var_in_shuffle = target_var_in.dimshuffle(1, 0)
    target_var_shuffle = target_var.dimshuffle(1, 0)

    self.params += [We_inf, self.linear, self.de_lookuptable, self.linear_bias]

    # [sent_length, batch, embsize]
    state_below = We_inf[input_var_shuffle.flatten()].reshape(
        (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))

    # character-level word representation for the inference network
    layer_char_input_inf = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length),
                                                     input_var=char_input_var,
                                                     name='char-input')
    layer_char_inf = lasagne.layers.reshape(layer_char_input_inf, (-1, [2]))
    layer_char_embedding_inf = lasagne.layers.EmbeddingLayer(layer_char_inf,
                                                             input_size=char_dic_size,
                                                             output_size=char_embedd_dim,
                                                             W=char_embedd_table_inf,
                                                             name='char_embedding_inf')
    layer_char_inf = lasagne.layers.DimshuffleLayer(layer_char_embedding_inf,
                                                    pattern=(0, 2, 1))
    #layer_char_inf = lasagne.layers.DropoutLayer(layer_char_inf, p=0.5)

    cnn_layer_inf = lasagne.layers.Conv1DLayer(layer_char_inf, num_filters=num_filters,
                                               filter_size=conv_window, pad='full',
                                               nonlinearity=lasagne.nonlinearities.tanh,
                                               name='cnn_inf')
    pool_layer_inf = lasagne.layers.MaxPool1DLayer(cnn_layer_inf, pool_size=pool_size)
    output_cnn_layer_inf = lasagne.layers.reshape(pool_layer_inf, (-1, length, [1]))

    char_params = lasagne.layers.get_all_params(output_cnn_layer_inf, trainable=True)
    self.params += char_params

    # [batch, sent_length, num_filters]
    #char_state_below = lasagne.layers.get_output(output_cnn_layer_inf, {layer_char_input_inf: char_input_var})
    char_state_below = lasagne.layers.get_output(output_cnn_layer_inf)
    char_state_below = dropout_layer(char_state_below, use_dropout, trng)
    char_state_shuff = char_state_below.dimshuffle(1, 0, 2)
    state_below = T.concatenate([state_below, char_state_shuff], axis=2)
    state_below = dropout_layer(state_below, use_dropout, trng)

    enclstm_f = LSTM(embsize + num_filters, self.en_hidden_size)
    enclstm_b = LSTM(embsize + num_filters, self.en_hidden_size, True)
    self.encoder_lstm_layers.append(enclstm_f)
    self.encoder_lstm_layers.append(enclstm_b)
    self.params += enclstm_f.params + enclstm_b.params

    hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
    hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)

    hs = T.concatenate([hs_f, hs_b], axis=2)
    Cs = T.concatenate([Cs_f, Cs_b], axis=2)
    hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
    Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
    #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
    #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
    # the decoder is initialized with zero states
    self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                        input_var_shuffle.shape[1], self.de_hidden_size),
    self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                        input_var_shuffle.shape[1], self.de_hidden_size),
    Encoder = hs

    state_below = self.de_lookuptable[target_var_in_shuffle.flatten()].reshape(
        (target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1],
         self.de_hidden_size))

    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co)

    decoder_lstm_outputs = T.concatenate([state_below, Encoder], axis=2)
    linear_outputs = T.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
    softmax_outputs, updates = theano.scan(
        fn=lambda x: T.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * T.log(pred[T.arange(input_var.shape[0]), y])

    """
    costs, _ = theano.scan(fn=_NLL, sequences=[softmax_outputs, target_var_shuffle, mask_var_shuffle])
    #loss = costs.sum() / mask_var.sum() + params.L2 * sum(lasagne.regularization.l2(x) for x in self.params)
    loss = costs.sum() / mask_var.sum()

    updates = lasagne.updates.sgd(loss, self.params, self.eta)
    updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

    ###################################################
    #### using the ground truth when training
    ###################################################
    self._train = theano.function(
        inputs=[ei, em, di, dm, dt],
        outputs=[loss, softmax_outputs],
        updates=updates,
        givens={input_var: ei, mask_var: em, target_var_in: di, decoderMask: dm, target_var: dt})
    """

    def _step2(ctx_, state_, hs_, Cs_):
        # one greedy decoding step: feed the previous predicted label back in
        hs, Cs = [], []
        token_idxs = T.cast(state_.argmax(axis=-1), "int32")
        msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1.)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, ctx_.shape[0], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h
        hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
        state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
        state_below0 = T.concatenate([ctx_, state_below0], axis=1)
        newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :]
        state_below = T.nnet.softmax(newpred)
        # the probability of the start symbol is 0
        extra_p = T.zeros_like(hs[:, :, 0])
        state_below = T.concatenate([state_below, extra_p.T], axis=1)
        return state_below, hs, Cs

    hs0, Cs0 = T.as_tensor_variable(self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2, sequences=[Encoder],
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=input_var_shuffle.shape[0])

    predy = train_outputs[0].dimshuffle(1, 0, 2)
    predy = predy[:, :, :-1] * mask_var[:, :, None]
    predy0 = predy.reshape((-1, 17))

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var,
                                                       l_mask_word: mask_var,
                                                       layer_char_input: char_input_var})
    local_energy = local_energy.reshape((-1, length, 17))
    local_energy = local_energy * mask_var[:, :, None]

    # add the transition score for the end symbol of a sequence
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var})
    predy_in = T.argmax(predy0, axis=1)
    A = T.extra_ops.to_one_hot(predy_in, 17)
    A = A.reshape((-1, length, 17))

    #predy = predy0.reshape((-1, length, 25))
    #predy = predy * mask_var[:, :, None]

    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials,
                                          sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

    cost = T.mean(-cost11)

    from momentum import momentum
    updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

    self.train_fn = theano.function(
        inputs=[ei, ci, em, em1, length0, di0, use_dropout0],
        outputs=[cost],
        updates=updates_a,
        on_unused_input='ignore',
        givens={input_var: ei, char_input_var: ci, mask_var: em, mask_var1: em1,
                length: length0, decoderInputs0: di0, use_dropout: use_dropout0})

    prediction = T.argmax(predy, axis=2)
    corr = T.eq(prediction, target_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    self.eval_fn = theano.function(
        inputs=[ei, ci, em, em1, length0, di0, use_dropout0],
        outputs=[prediction, -cost11],
        on_unused_input='ignore',
        givens={input_var: ei, char_input_var: ci, mask_var: em, mask_var1: em1,
                length: length0, decoderInputs0: di0, use_dropout: use_dropout0})
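    # --- Usage sketch (added for illustration; not part of the original source). ---
    # Assumed call pattern for the NER inference network compiled above; the batch
    # layout, the char tensor shape [batch, sent_len, Max_Char_Length], and the
    # start-symbol convention (row 17 of the decoder lookup table) are assumptions.
    #
    #   di0 = np.zeros((x.shape[0], 18), dtype='float32')   # 17 labels + start symbol
    #   di0[:, 17] = 1.0
    #   (cost,) = self.train_fn(x, chars, mask, mask1, sent_len, di0, 1.0)   # dropout on
    #   pred, neg_energy = self.eval_fn(x, chars, mask, mask1, sent_len, di0, 0.0)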
def __init__(self, We, params):
    lstm_layers_num = 1
    emb_size = We.shape[1]
    self.eta = params.eta
    self.num_labels = params.num_labels
    self.en_hidden_size = params.en_hidden_size
    self.de_hidden_size = params.de_hidden_size
    self.lstm_layers_num = params.lstm_layers_num
    self._train = None
    self._utter = None
    self.params = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.hos = []
    self.Cos = []

    encoderInputs = tensor.imatrix()
    decoderInputs, decoderTarget = tensor.imatrices(2)
    encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)

    self.lookuptable = theano.shared(We)

    # the last row of the decoder lookup table is for the start symbol
    self.de_lookuptable = theano.shared(
        name="Decoder LookUpTable",
        value=init_xavier_uniform(self.num_labels + 1, self.de_hidden_size),
        borrow=True)
    self.linear = theano.shared(
        name="Linear",
        value=init_xavier_uniform(self.de_hidden_size + 2 * self.en_hidden_size,
                                  self.num_labels),
        borrow=True)
    self.linear_bias = theano.shared(
        name="Hidden to Bias",
        value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                         dtype=theano.config.floatX),
        borrow=True)
    #self.hidden_decode = theano.shared(name="Hidden to Decode", value=init_xavier_uniform(2*en_hidden_size, self.de_hidden_size), borrow=True)
    #self.hidden_bias = theano.shared(name="Hidden to Bias", value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX), borrow=True)

    #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]
    self.params += [self.linear, self.linear_bias, self.de_lookuptable]

    # the initial hidden state of the decoder lstm is zeros
    # (max_sent_size, batch_size, hidden_size)
    state_below = self.lookuptable[encoderInputs.flatten()].reshape(
        (encoderInputs.shape[0], encoderInputs.shape[1], emb_size))

    for _ in range(self.lstm_layers_num):
        enclstm_f = LSTM(emb_size, self.en_hidden_size)
        enclstm_b = LSTM(emb_size, self.en_hidden_size, True)
        self.encoder_lstm_layers.append(enclstm_f)
        self.encoder_lstm_layers.append(enclstm_b)
        self.params += enclstm_f.params + enclstm_b.params

        hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
        hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

        hs = tensor.concatenate([hs_f, hs_b], axis=2)
        Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
        hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
        Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
        #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
        #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
        self.hos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX),
                                 encoderInputs.shape[1], self.de_hidden_size),
        self.Cos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX),
                                 encoderInputs.shape[1], self.de_hidden_size),
        state_below = hs

    Encoder = state_below

    state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
        (decoderInputs.shape[0], decoderInputs.shape[1], self.de_hidden_size))
    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

    # here we include the representation from the decoder
    decoder_lstm_outputs = tensor.concatenate([state_below, Encoder], axis=2)

    ei, di, dt = tensor.imatrices(3)  # place holders
    em, dm, tf, di0 = tensor.fmatrices(4)

    #####################################################
    #####################################################
    linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
    softmax_outputs, _ = theano.scan(
        fn=lambda x: tensor.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]), y])

    costs, _ = theano.scan(fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
    loss = costs.sum() / decoderMask.sum() + params.L2 * sum(
        lasagne.regularization.l2(x) for x in self.params)

    updates = lasagne.updates.adam(loss, self.params, self.eta)
    #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

    ###################################################
    #### using the ground truth when training
    ###################################################
    self._train = theano.function(
        inputs=[ei, em, di, dm, dt],
        outputs=[loss, softmax_outputs],
        updates=updates,
        givens={encoderInputs: ei, encoderMask: em, decoderInputs: di,
                decoderMask: dm, decoderTarget: dt})

    #########################################################################
    ### For scheduled sampling
    #########################################################################
    ###### always use the previous prediction as the next input
    def _step2(ctx_, state_, hs_, Cs_):
        hs, Cs = [], []
        token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
        msk_ = tensor.fill((tensor.zeros_like(token_idxs, dtype="float32")), 1)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, encoderInputs.shape[1], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h
        hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs)
        state_below0 = state_below0.reshape((encoderInputs.shape[1], self.de_hidden_size))
        state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
        newpred = tensor.dot(state_below0, self.linear) + self.linear_bias[None, :]
        state_below = tensor.nnet.softmax(newpred)
        # the probability of the start symbol is 0
        extra_p = tensor.zeros_like(hs[:, :, 0])
        state_below = tensor.concatenate([state_below, extra_p.T], axis=1)
        return state_below, hs, Cs

    hs0, Cs0 = tensor.as_tensor_variable(self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos, name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2, sequences=[Encoder],
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=encoderInputs.shape[0])

    train_predict = train_outputs[0]
    train_costs, _ = theano.scan(fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])
    train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum(
        lasagne.regularization.l2(x) for x in self.params)

    #from adam import adam
    #train_updates = adam(train_loss, self.params, self.eta)
    #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
    #train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta)
    #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
    from momentum import momentum
    train_updates = momentum(train_loss, self.params, params.eta, momentum=0.9)

    self._train2 = theano.function(
        inputs=[ei, em, di0, dm, dt],
        outputs=[train_loss, train_predict],
        updates=train_updates,
        givens={encoderInputs: ei, encoderMask: em, decoderInputs0: di0,
                decoderMask: dm, decoderTarget: dt}
        #givens={encoderInputs: ei, encoderMask: em, decoderInputs: di, decoderMask: dm, decoderTarget: dt, TF: tf}
    )

    listof_token_idx = train_predict.argmax(axis=-1)
    self._utter = theano.function(inputs=[ei, em, di0],
                                  outputs=listof_token_idx,
                                  givens={encoderInputs: ei, encoderMask: em,
                                          decoderInputs0: di0})
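    # --- Usage sketch (added for illustration; not part of the original source). ---
    # Assumed driver for the scheduled-sampling training function and the greedy
    # decoder compiled above. The time-major layout (sent_len, batch) for ei/em/dm/dt
    # and the start-symbol distribution di0 are assumptions, not taken from the repo.
    #
    #   # ei: int32 [sent_len, batch]; em, dm: float32 masks with the same layout;
    #   # dt: int32 targets; di0: float32 [batch, num_labels + 1] start distribution
    #   loss, soft_pred = self._train2(ei, em, di0, dm, dt)
    #   predicted_label_idx = self._utter(ei, em, di0)   # greedy argmax at each step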
def __init__(self, We, char_embedd_table_initial, params):
    lstm_layers_num = 1
    emb_size = We.shape[1]
    self.eta = params.eta
    self.num_labels = params.num_labels
    self.en_hidden_size = params.en_hidden_size
    self.de_hidden_size = params.de_hidden_size
    self.lstm_layers_num = params.lstm_layers_num
    self._train = None
    self._utter = None
    self.params = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.hos = []
    self.Cos = []

    char_embedd_dim = params.char_embedd_dim
    char_dic_size = len(params.char_dic)
    char_embedd_table = theano.shared(char_embedd_table_initial)

    encoderInputs = tensor.imatrix()
    decoderInputs, decoderTarget = tensor.imatrices(2)
    encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)
    char_input_var = tensor.itensor3(name='char-inputs')
    ci = tensor.itensor3()
    use_dropout = tensor.fscalar()
    use_dropout0 = tensor.fscalar()

    self.lookuptable = theano.shared(We)

    # the last row of the decoder lookup table is for the start symbol
    self.de_lookuptable = theano.shared(
        name="Decoder LookUpTable",
        value=init_xavier_uniform(self.num_labels + 1, self.de_hidden_size),
        borrow=True)
    self.linear = theano.shared(
        name="Linear",
        value=init_xavier_uniform(self.de_hidden_size + 2 * self.en_hidden_size,
                                  self.num_labels),
        borrow=True)
    self.linear_bias = theano.shared(
        name="Hidden to Bias",
        value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                         dtype=theano.config.floatX),
        borrow=True)
    #self.hidden_decode = theano.shared(name="Hidden to Decode", value=init_xavier_uniform(2*en_hidden_size, self.de_hidden_size), borrow=True)
    #self.hidden_bias = theano.shared(name="Hidden to Bias", value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX), borrow=True)

    #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]
    self.params += [self.lookuptable, self.linear, self.linear_bias, self.de_lookuptable]

    # the initial hidden state of the decoder lstm is zeros
    # (max_sent_size, batch_size, hidden_size)
    state_below = self.lookuptable[encoderInputs.flatten()].reshape(
        (encoderInputs.shape[0], encoderInputs.shape[1], emb_size))

    layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length),
                                                 input_var=char_input_var, name='char-input')
    layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
    layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char,
                                                         input_size=char_dic_size,
                                                         output_size=char_embedd_dim,
                                                         W=char_embedd_table,
                                                         name='char_embedding')
    layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))

    # first get some necessary dimensions or parameters
    conv_window = 3
    num_filters = params.num_filters

    # construct convolution layer
    cnn_layer = lasagne.layers.Conv1DLayer(layer_char, num_filters=num_filters,
                                           filter_size=conv_window, pad='full',
                                           nonlinearity=lasagne.nonlinearities.tanh,
                                           name='cnn')
    # infer the pool size for pooling (pool size should go through all time steps of cnn)
    _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
    # reshape the layer to match the lstm incoming layer
    # [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer,
                                              (-1, encoderInputs.shape[0], [1]))

    char_params = lasagne.layers.get_all_params(output_cnn_layer, trainable=True)
    self.params += char_params

    char_state_below = lasagne.layers.get_output(output_cnn_layer)
    char_state_below = dropout_layer(char_state_below, use_dropout, trng)
    char_state_shuff = char_state_below.dimshuffle(1, 0, 2)
    state_below = tensor.concatenate([state_below, char_state_shuff], axis=2)
    state_below = dropout_layer(state_below, use_dropout, trng)

    for _ in range(self.lstm_layers_num):
        enclstm_f = LSTM(emb_size + num_filters, self.en_hidden_size)
        enclstm_b = LSTM(emb_size + num_filters, self.en_hidden_size, True)
        self.encoder_lstm_layers.append(enclstm_f)
        self.encoder_lstm_layers.append(enclstm_b)
        self.params += enclstm_f.params + enclstm_b.params

        hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
        hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

        hs = tensor.concatenate([hs_f, hs_b], axis=2)
        Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
        hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
        Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
        #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
        #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
        self.hos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX),
                                 encoderInputs.shape[1], self.de_hidden_size),
        self.Cos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX),
                                 encoderInputs.shape[1], self.de_hidden_size),
        state_below = hs

    Encoder = state_below

    state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
        (decoderInputs.shape[0], decoderInputs.shape[1], self.de_hidden_size))
    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

    # here we include the representation from the decoder
    decoder_lstm_outputs = tensor.concatenate([state_below, Encoder], axis=2)

    ei, di, dt = tensor.imatrices(3)  # place holders
    em, dm, tf, di0 = tensor.fmatrices(4)

    #####################################################
    #####################################################
    linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
    softmax_outputs, _ = theano.scan(
        fn=lambda x: tensor.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]), y])

    costs, _ = theano.scan(fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
    loss = costs.sum() / decoderMask.sum() + params.L2 * sum(
        lasagne.regularization.l2(x) for x in self.params)

    #updates = lasagne.updates.adam(loss, self.params, self.eta)
    #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

    ###################################################
    #### using the ground truth when training
    ###################################################
    #self._train = theano.function(
    #    inputs=[ei, em, di, dm, dt],
    #    outputs=[loss, softmax_outputs],
    #    updates=updates,
    #    givens={encoderInputs: ei, encoderMask: em, decoderInputs: di, decoderMask: dm, decoderTarget: dt})

    #########################################################################
    ### For scheduled sampling
    #########################################################################
    ###### always use the previous prediction as the next input
    def _step2(ctx_, state_, hs_, Cs_):
        hs, Cs = [], []
        token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
        msk_ = tensor.fill((tensor.zeros_like(token_idxs, dtype="float32")), 1.)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, encoderInputs.shape[1], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h
        hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs)
        state_below0 = state_below0.reshape((encoderInputs.shape[1], self.de_hidden_size))
        state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
        newpred = tensor.dot(state_below0, self.linear) + self.linear_bias[None, :]
        state_below = tensor.nnet.softmax(newpred)
        # the probability of the start symbol is 0
        extra_p = tensor.zeros_like(hs[:, :, 0])
        state_below = tensor.concatenate([state_below, extra_p.T], axis=1)
        return state_below, hs, Cs

    hs0, Cs0 = tensor.as_tensor_variable(self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos, name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2, sequences=[Encoder],
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=encoderInputs.shape[0])

    train_predict = train_outputs[0]
    train_costs, _ = theano.scan(fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])
    train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum(
        lasagne.regularization.l2(x) for x in self.params)

    #from adam import adam
    #train_updates = adam(train_loss, self.params, self.eta)
    #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
    #train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta)
    #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
    from momentum import momentum
    train_updates = momentum(train_loss, self.params, params.eta, momentum=0.9)

    self._train2 = theano.function(
        inputs=[ei, ci, em, di0, dm, dt, use_dropout0],
        outputs=[train_loss, train_predict],
        updates=train_updates,
        givens={encoderInputs: ei, char_input_var: ci, encoderMask: em,
                decoderInputs0: di0, decoderMask: dm, decoderTarget: dt,
                use_dropout: use_dropout0}
        #givens={encoderInputs: ei, encoderMask: em, decoderInputs: di, decoderMask: dm, decoderTarget: dt, TF: tf}
    )

    listof_token_idx = train_predict.argmax(axis=-1)
    self._utter = theano.function(inputs=[ei, ci, em, di0, use_dropout0],
                                  outputs=listof_token_idx,
                                  givens={encoderInputs: ei, char_input_var: ci,
                                          encoderMask: em, decoderInputs0: di0,
                                          use_dropout: use_dropout0})
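    # --- Usage sketch (added for illustration; not part of the original source). ---
    # Assumed driver for the character-augmented seq2seq model above; the layouts
    # (time-major word inputs, [batch, sent_len, Max_Char_Length] char inputs) and
    # the dropout flag values are assumptions.
    #
    #   loss, soft_pred = self._train2(ei, ci, em, di0, dm, dt, 1.0)   # dropout on
    #   predicted_label_idx = self._utter(ei, ci, em, di0, 0.0)        # dropout off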