def test_accuracy_instance():
    from .metrics import accuracy_instance
    predictions_var, targets_var = T.imatrices('predictions', 'targets')
    accuracy_var = accuracy_instance(predictions_var, targets_var,
                                     nb_classes=5, nb_samples_per_class=10, batch_size=16)
    accuracy_fn = theano.function([predictions_var, targets_var], accuracy_var)

    # Generate sample data
    targets = np.kron(np.arange(5), np.ones((16, 10))).astype('int32')
    predictions = np.zeros((16, 50)).astype('int32')
    indices = np.zeros((16, 5)).astype('int32')
    accuracy = np.zeros((16, 10))
    for i in range(16):
        for j in range(50):
            correct = np.random.binomial(1, 0.5)
            predictions[i, j] = correct * targets[i, j] + \
                (1 - correct) * ((targets[i, j] + 1) % 5)
            accuracy[i, indices[i, targets[i, j]]] += correct
            indices[i, targets[i, j]] += 1
    numpy_accuracy = np.mean(accuracy, axis=0) / 5
    theano_accuracy = accuracy_fn(predictions, targets)

    assert np.allclose(theano_accuracy, numpy_accuracy)
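# All of the examples on this page revolve around `T.imatrices`, which creates
# one int32 matrix symbol per name passed in (or, given an int n, a list of n
# symbols). A minimal standalone sketch of the call pattern -- not part of the
# test above, just an illustration; the names `a`, `b`, `f` are made up here:
import theano
import theano.tensor as T

a, b = T.imatrices('a', 'b')
f = theano.function([a, b], a + b)
print f([[1, 2]], [[3, 4]])  # -> [[4 6]]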
def test_array():
    x = T.imatrices("x")
    y = T.log(x)[[0, 1], 0]
    f = theano.function(inputs=[x], outputs=y)
    print f([[1, 2, 3, 4, 5],
             [5, 4, 3, 2, 1]])
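# For the sample input, `T.log(x)[[0, 1], 0]` takes the elementwise log and
# then selects column 0 of rows 0 and 1, so the call prints roughly
# [ 0.  1.60943791] (log(1) and log(5)).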
def test_batch_size():
    input_var_1, input_var_2 = T.tensor3s('input1', 'input2')
    target_var_1, target_var_2 = T.imatrices('target1', 'target2')

    # First model with `batch_size=16`
    output_var_1, _, params1 = memory_augmented_neural_network(
        input_var_1, target_var_1,
        batch_size=16,
        nb_class=5,
        memory_shape=(128, 40),
        controller_size=200,
        input_size=20 * 20,
        nb_reads=4)

    # Second model with `batch_size=1`
    output_var_2, _, params2 = memory_augmented_neural_network(
        input_var_2, target_var_2,
        batch_size=1,
        nb_class=5,
        memory_shape=(128, 40),
        controller_size=200,
        input_size=20 * 20,
        nb_reads=4)

    for (param1, param2) in zip(params1, params2):
        param2.set_value(param1.get_value())

    posterior_fn1 = theano.function([input_var_1, target_var_1], output_var_1)
    posterior_fn2 = theano.function([input_var_2, target_var_2], output_var_2)

    # Input has shape (batch_size, timesteps, vocabulary_size + actions_vocabulary_size + 3)
    test_input = np.random.rand(16, 50, 20 * 20)
    test_target = np.random.randint(5, size=(16, 50)).astype('int32')
    test_output1 = posterior_fn1(test_input, test_target)
    test_output2 = np.zeros_like(test_output1)
    for i in range(16):
        test_output2[i] = posterior_fn2(test_input[i][np.newaxis, :, :],
                                        test_target[i][np.newaxis, :])

    assert np.allclose(test_output1, test_output2)
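# The assertion checks that the model is batch-size invariant: with the
# parameters tied between the two instantiations, one forward pass over a
# batch of 16 sequences must match 16 independent batch_size=1 passes.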
def test_clone(self):
    # Data for unit testing
    X_unit = ['abcdef', 'abcdef', 'qwerty']
    X_unit = [[ord(c) for c in w] for w in X_unit]
    X_unit = np.array(X_unit, dtype='int8')
    n_alerts_unit, l_alerts_unit = X_unit.shape
    mask_unit = np.ones(X_unit.shape, dtype='int8')

    # Dimensions
    n_alerts = None
    l_alerts = None
    n_alphabet = 2**7  # All ASCII chars
    num_units = 10

    # Symbolic variables
    input_var, input_var2 = T.imatrices('inputs', 'inputs2')
    mask_var, mask_var2 = T.matrices('masks', 'masks2')
    target_var = T.dvector('targets')

    # build net for testing
    l_in = InputLayer(shape=(n_alerts, l_alerts), input_var=input_var,
                      name='INPUT-LAYER')
    l_emb = EmbeddingLayer(l_in, n_alphabet, n_alphabet, W=np.eye(n_alphabet),
                           name='EMBEDDING-LAYER')
    l_emb.params[l_emb.W].remove('trainable')  # Fix weight
    l_mask = InputLayer(shape=(n_alerts, l_alerts), input_var=mask_var,
                        name='MASK-INPUT-LAYER')
    l_lstm = LSTMLayer(l_emb, num_units=num_units, name='LSTM-LAYER',
                       mask_input=l_mask)
    l_slice = SliceLayer(l_lstm, indices=-1, axis=1, name="SLICE-LAYER")  # Only last timestep
    net = l_slice

    # clone
    l_in2 = InputLayer(shape=(n_alerts, l_alerts), input_var=input_var2,
                       name='INPUT-LAYER2')
    l_mask2 = InputLayer(shape=(n_alerts, l_alerts), input_var=mask_var2,
                         name='MASK-INPUT-LAYER2')
    net2 = lstm_rnn_tied_weights.clone(net, l_in2, l_mask2)

    self.assertNotEqual(repr(net), repr(net2))

    pred_unit = layers.get_output(
        net, inputs={l_in: input_var, l_mask: mask_var}
    ).eval({input_var: X_unit, mask_var: mask_unit})
    pred_unit2 = layers.get_output(
        net2, inputs={l_in2: input_var2, l_mask2: mask_var2}
    ).eval({input_var2: X_unit, mask_var2: mask_unit})
    self.assert_array_equal(pred_unit, pred_unit2)
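# `clone` should produce a structurally distinct network (hence the repr
# inequality) that nevertheless computes exactly the same outputs as the
# original when fed the same data and mask through its own input layers.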
def __init__(self, We_initial, char_embedd_table_initial, params):
    We = theano.shared(We_initial)
    # initial embedding for the InfNet
    We_inf = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden
    self.en_hidden_size = params.hidden_inf
    self.num_labels = 17
    self.de_hidden_size = params.de_hidden_size

    char_embedd_dim = params.char_embedd_dim
    char_dic_size = len(params.char_dic)
    char_embedd_table = theano.shared(char_embedd_table_initial)
    char_embedd_table_inf = theano.shared(char_embedd_table_initial)

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    target_var_in = T.imatrix(name='targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    char_input_var = T.itensor3(name='char-inputs')
    length = T.iscalar()
    length0 = T.iscalar()
    t_t = T.fscalar()
    t_t0 = T.fscalar()
    use_dropout = T.fscalar()
    use_dropout0 = T.fscalar()

    Wyy0 = np.random.uniform(-0.02, 0.02,
                             (self.num_labels + 1, self.num_labels + 1)).astype('float32')
    Wyy = theano.shared(Wyy0)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word,
                                                   input_size=We_initial.shape[0],
                                                   output_size=embsize, W=We,
                                                   name='word_embedding')
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length),
                                                 input_var=char_input_var,
                                                 name='char-input')
    layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
    layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char,
                                                         input_size=char_dic_size,
                                                         output_size=char_embedd_dim,
                                                         W=char_embedd_table,
                                                         name='char_embedding')
    layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))

    # first get some necessary dimensions or parameters
    conv_window = 3
    num_filters = params.num_filters

    # construct convolution layer
    cnn_layer = lasagne.layers.Conv1DLayer(layer_char, num_filters=num_filters,
                                           filter_size=conv_window, pad='full',
                                           nonlinearity=lasagne.nonlinearities.tanh,
                                           name='cnn')
    # infer the pool size for pooling (pool size should go through all time steps of cnn)
    _, _, pool_size = cnn_layer.output_shape

    # construct max pool layer
    pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
    # reshape the layer to match the lstm incoming layer
    # [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

    # finally, concatenate the two incoming layers together
    incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)

    l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word,
                                            backwards=True)
    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(l_reshape_concat,
                                        num_units=self.num_labels + 1,
                                        nonlinearity=lasagne.nonlinearities.linear)

    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    print len(network_params)

    # load the pretrained BiLSTM-CNN-CRF weights
    f = open('NER_BiLSTM_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.005_0.0_50_hidden_200.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        p.set_value(data[idx])

    self.params = []
    self.hos = []
    self.Cos = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.lstm_layers_num = 1

    ei, di, dt = T.imatrices(3)  # placeholders
    decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6)
    ci = T.itensor3()

    #### the last entry is for the start symbol
    self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                        value=init_xavier_uniform(self.num_labels + 1,
                                                                  self.de_hidden_size),
                                        borrow=True)
    self.linear = theano.shared(name="Linear",
                                value=init_xavier_uniform(self.de_hidden_size + 2 * self.en_hidden_size,
                                                          self.num_labels),
                                borrow=True)
    self.linear_bias = theano.shared(name="Hidden to Bias",
                                     value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                                                      dtype=theano.config.floatX),
                                     borrow=True)
    #self.hidden_decode = theano.shared(name="Hidden to Decode",
    #                                   value=init_xavier_uniform(2*hidden, self.de_hidden_size),
    #                                   borrow=True)
    #self.hidden_bias = theano.shared(name="Hidden to Bias",
    #                                 value=np.asarray(np.random.randn(self.de_hidden_size, )*0.,
    #                                                  dtype=theano.config.floatX),
    #                                 borrow=True)

    input_var_shuffle = input_var.dimshuffle(1, 0)
    mask_var_shuffle = mask_var.dimshuffle(1, 0)
    target_var_in_shuffle = target_var_in.dimshuffle(1, 0)
    target_var_shuffle = target_var.dimshuffle(1, 0)

    self.params += [We_inf, self.linear, self.de_lookuptable, self.linear_bias]

    ###### [batch, sent_length, embsize]
    state_below = We_inf[input_var_shuffle.flatten()].reshape(
        (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))

    ###### character word embedding
    layer_char_input_inf = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length),
                                                     input_var=char_input_var,
                                                     name='char-input')
    layer_char_inf = lasagne.layers.reshape(layer_char_input_inf, (-1, [2]))
    layer_char_embedding_inf = lasagne.layers.EmbeddingLayer(layer_char_inf,
                                                             input_size=char_dic_size,
                                                             output_size=char_embedd_dim,
                                                             W=char_embedd_table_inf,
                                                             name='char_embedding_inf')
    layer_char_inf = lasagne.layers.DimshuffleLayer(layer_char_embedding_inf,
                                                    pattern=(0, 2, 1))
    #layer_char_inf = lasagne.layers.DropoutLayer(layer_char_inf, p=0.5)

    cnn_layer_inf = lasagne.layers.Conv1DLayer(layer_char_inf, num_filters=num_filters,
                                               filter_size=conv_window, pad='full',
                                               nonlinearity=lasagne.nonlinearities.tanh,
                                               name='cnn_inf')
    pool_layer_inf = lasagne.layers.MaxPool1DLayer(cnn_layer_inf, pool_size=pool_size)
    output_cnn_layer_inf = lasagne.layers.reshape(pool_layer_inf, (-1, length, [1]))

    char_params = lasagne.layers.get_all_params(output_cnn_layer_inf, trainable=True)
    self.params += char_params

    ###### [batch, sent_length, num_filters]
    #char_state_below = lasagne.layers.get_output(output_cnn_layer_inf, {layer_char_input_inf: char_input_var})
    char_state_below = lasagne.layers.get_output(output_cnn_layer_inf)
    char_state_below = dropout_layer(char_state_below, use_dropout, trng)
    char_state_shuff = char_state_below.dimshuffle(1, 0, 2)
    state_below = T.concatenate([state_below, char_state_shuff], axis=2)
    state_below = dropout_layer(state_below, use_dropout, trng)

    enclstm_f = LSTM(embsize + num_filters, self.en_hidden_size)
    enclstm_b = LSTM(embsize + num_filters, self.en_hidden_size, True)
    self.encoder_lstm_layers.append(enclstm_f)  # append
    self.encoder_lstm_layers.append(enclstm_b)  # append
    self.params += enclstm_f.params + enclstm_b.params  # concatenate

    hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
    hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)

    hs = T.concatenate([hs_f, hs_b], axis=2)
    Cs = T.concatenate([Cs_f, Cs_b], axis=2)
    hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
    Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
    #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
    #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
    self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                        input_var_shuffle.shape[1], self.de_hidden_size),
    self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                        input_var_shuffle.shape[1], self.de_hidden_size),
    Encoder = hs

    state_below = self.de_lookuptable[target_var_in_shuffle.flatten()].reshape(
        (target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1],
         self.de_hidden_size))

    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co)

    decoder_lstm_outputs = T.concatenate([state_below, Encoder], axis=2)
    linear_outputs = T.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
    softmax_outputs, updates = theano.scan(
        fn=lambda x: T.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * T.log(pred[T.arange(input_var.shape[0]), y])

    """
    costs, _ = theano.scan(fn=_NLL, sequences=[softmax_outputs, target_var_shuffle, mask_var_shuffle])
    #loss = costs.sum() / mask_var.sum() + params.L2*sum(lasagne.regularization.l2(x) for x in self.params)
    loss = costs.sum() / mask_var.sum()

    updates = lasagne.updates.sgd(loss, self.params, self.eta)
    updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

    ###################################################
    #### using the ground truth when training
    ##################################################
    self._train = theano.function(
        inputs=[ei, em, di, dm, dt],
        outputs=[loss, softmax_outputs],
        updates=updates,
        givens={input_var: ei, mask_var: em, target_var_in: di, decoderMask: dm, target_var: dt})
    """

    def _step2(ctx_, state_, hs_, Cs_):
        hs, Cs = [], []
        token_idxs = T.cast(state_.argmax(axis=-1), "int32")
        msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1.)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, ctx_.shape[0], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h
        hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
        state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
        state_below0 = T.concatenate([ctx_, state_below0], axis=1)
        newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :]
        state_below = T.nnet.softmax(newpred)
        ##### the begin-symbol probability is 0
        extra_p = T.zeros_like(hs[:, :, 0])
        state_below = T.concatenate([state_below, extra_p.T], axis=1)
        return state_below, hs, Cs

    hs0, Cs0 = T.as_tensor_variable(self.hos, name="hs0"), \
               T.as_tensor_variable(self.Cos, name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2,
                                   sequences=[Encoder],
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=input_var_shuffle.shape[0])

    predy = train_outputs[0].dimshuffle(1, 0, 2)
    predy = predy[:, :, :-1] * mask_var[:, :, None]
    predy0 = predy.reshape((-1, 17))

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(l_local,
                                             {l_in_word: input_var,
                                              l_mask_word: mask_var,
                                              layer_char_input: char_input_var})
    local_energy = local_energy.reshape((-1, length, 17))
    local_energy = local_energy * mask_var[:, :, None]
    #####################
    # for the end symbol of a sequence
    ####################
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var})
    predy_in = T.argmax(predy0, axis=1)
    A = T.extra_ops.to_one_hot(predy_in, 17)
    A = A.reshape((-1, length, 17))

    #predy = predy0.reshape((-1, length, 25))
    #predy = predy*mask_var[:,:,None]

    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)

    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(fn=inner_function,
                                          outputs_info=initials,
                                          sequences=[targets_shuffled[1:],
                                                     masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * mask_var,
                                         axis=1)
    cost = T.mean(-cost11)

    from momentum import momentum
    updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

    self.train_fn = theano.function(
        inputs=[ei, ci, em, em1, length0, di0, use_dropout0],
        outputs=[cost],
        updates=updates_a,
        on_unused_input='ignore',
        givens={input_var: ei, char_input_var: ci, mask_var: em, mask_var1: em1,
                length: length0, decoderInputs0: di0, use_dropout: use_dropout0})

    prediction = T.argmax(predy, axis=2)
    corr = T.eq(prediction, target_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    self.eval_fn = theano.function(
        inputs=[ei, ci, em, em1, length0, di0, use_dropout0],
        outputs=[prediction, -cost11],
        on_unused_input='ignore',
        givens={input_var: ei, char_input_var: ci, mask_var: em, mask_var1: em1,
                length: length0, decoderInputs0: di0, use_dropout: use_dropout0})
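# `cost11` above is the CRF score of the soft prediction `predy`: pairwise
# transition energies accumulated through `inner_function` via `Wyy`, plus the
# masked unary `local_energy` terms; training minimizes its negative mean.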
def __init__(self,
             hidden_size=4,
             nclasses=3,
             num_embeddings=1000,
             embedding_dim=2,
             window_size=7,
             memory_size=6,
             n_memory_slots=8):

    questions, docs = T.imatrices('questions', 'docs')
    y_true_matrix = T.imatrix('y_true')

    n_question_slots = int(n_memory_slots / 4)  # TODO derive this from an arg
    n_doc_slots = n_memory_slots - n_question_slots
    n_instances = questions.shape[0]
    self.window_size = window_size

    randoms = {
        # attr: shape
        'emb': (num_embeddings + 1, embedding_dim),
        'Wg_q': (window_size * embedding_dim, n_question_slots),
        'Wg_d': (window_size * embedding_dim, n_doc_slots),
        'Wk': (hidden_size, memory_size),
        'Wb': (hidden_size, 1),
        'Wv': (hidden_size, memory_size),
        'We_q': (hidden_size, n_question_slots),
        'We_d': (hidden_size, n_doc_slots),
        'Wx': (window_size * embedding_dim, hidden_size),
        'Wh': (memory_size, hidden_size),
        'W': (hidden_size, nclasses),
        'h0': hidden_size,
        'w_q': (n_question_slots,),
        'w_d': (n_doc_slots,),
        'M_q': (memory_size, n_question_slots),  # TODO can we set M0 to zeros without having issues with cosine_dist?
        'M_d': (memory_size, n_doc_slots)  # TODO can we set M0 to zeros without having issues with cosine_dist?
    }

    zeros = {
        # attr: shape
        'bh': hidden_size,
        'bg_q': n_question_slots,
        'bg_d': n_doc_slots,
        'bk': memory_size,
        'bb': 1,
        'bv': memory_size,
        'be_q': n_question_slots,
        'be_d': n_doc_slots,
        'b': nclasses
    }

    def random_shared(shape):
        return theano.shared(
            0.2 * numpy.random.uniform(-1.0, 1.0, shape).astype(theano.config.floatX))

    def zeros_shared(shape):
        return theano.shared(numpy.zeros(shape, dtype=theano.config.floatX))

    for key in randoms:
        # create an attribute with associated shape and random values
        setattr(self, key, random_shared(randoms[key]))

    for key in zeros:
        # create an attribute with associated shape and values = 0
        setattr(self, key, zeros_shared(zeros[key]))

    def repeat_for_each_instance(param):
        """ repeat param along new axis once for each instance """
        return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0)

    for key in 'h0 w_q M_q w_d M_d'.split():
        setattr(self, key, repeat_for_each_instance(self.__getattribute__(key)))

    self.names = zeros.keys() + randoms.keys()
    # note: only `bh` ends up in self.params, so only it receives updates
    self.params = [eval('self.' + name) for name in 'bh'.split()]

    def recurrence(i, h_tm1, w_q, M_q, w_d=None, M_d=None, is_question=True):
        """
        notes
        Headers from paper in all caps
        mem = n_question_slots if is_question else n_doc_slots

        :param i: center index of sliding window
        :param h_tm1: h_{t-1} (hidden state)
        :param w_q: attention weights for question memory
        :param M_q: question memory
        :param w_d: attention weights for docs memory
        :param M_d: docs memory
        :param is_question: we use different parts of memory when working with a question
        :return: [y_t = model outputs, i + 1 = increment index, h_t, w_t, M_t (see above)]
        """
        if not is_question:
            assert w_d is not None and M_d is not None

        # get representation of word window
        idxs = questions if is_question else docs  # [instances, bucket_width]
        pad = T.zeros((idxs.shape[0], self.window_size // 2), dtype='int32')
        padded = T.concatenate([pad, idxs, pad], axis=1)
        window = padded[:, i:i + window_size]  # [instances, window_size]
        x_t = self.emb[window].flatten(ndim=2)  # [instances, window_size * embedding_dim]

        # EXTERNAL MEMORY READ
        # eqn 15
        if is_question:
            M_read = M_q  # [instances, memory_size, n_question_slots]
            w_read = w_q  # [instances, n_question_slots]
        else:
            M_read = T.concatenate([M_q, M_d], axis=2)  # [instances, memory_size, n_doc_slots]
            w_read = T.concatenate([w_q, w_d], axis=1)  # [instances, n_doc_slots]
        c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

        def get_attention(Wg, bg, M, w):
            g_t = T.nnet.sigmoid(T.dot(x_t, Wg) + bg)  # [instances, mem]

            # eqn 11
            k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

            # eqn 13
            beta = T.dot(h_tm1, self.Wb) + self.bb
            beta = T.log(1 + T.exp(beta))
            beta = T.addbroadcast(beta, 1)  # [instances, 1]

            # eqn 12
            w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

            # eqn 14
            return (1 - g_t) * w + g_t * w_hat  # [instances, mem]

        w_q = get_attention(self.Wg_q, self.bg_q, M_q, w_q)  # [instances, n_question_slots]
        if not is_question:
            w_d = get_attention(self.Wg_d, self.bg_d, M_d, w_d)  # [instances, n_doc_slots]

        # MODEL INPUT AND OUTPUT
        # eqn 9
        h_t = T.dot(x_t, self.Wx) + T.dot(c, self.Wh) + self.bh  # [instances, hidden_size]

        # eqn 10
        y_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)  # [instances, nclasses]

        # EXTERNAL MEMORY UPDATE
        def update_memory(We, be, w_update, M_update):
            # eqn 17
            e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
            f = 1. - w_update * e  # [instances, mem]

            # eqn 16
            v_t = T.dot(h_t, self.Wv) + self.bv  # [instances, memory_size]

            # need to add broadcast layers for memory update
            f_t = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            u_t = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            v_t = v_t.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

            # eqn 19
            return M_update * f_t + T.batched_dot(v_t, u_t)  # [instances, memory_size, mem]

        M_q = update_memory(self.We_q, self.be_q, w_q, M_q)
        attention_and_memory = [w_q, M_q]
        if not is_question:
            M_d = update_memory(self.We_d, self.be_d, w_d, M_d)
            attention_and_memory += [w_d, M_d]
        return [y_t, i + 1, h_t] + attention_and_memory

    outputs_info = [None, T.constant(0, dtype='int32'), self.h0, self.w_q, self.M_q]
    ask_question = partial(recurrence, is_question=True)
    answer_question = partial(recurrence, is_question=False)

    [_, _, h, w, M], _ = theano.scan(fn=ask_question,
                                     outputs_info=outputs_info,
                                     n_steps=questions.shape[1],
                                     name='ask_scan')
    outputs_info[2:] = [param[-1, :, :] for param in (h, w, M)]
    output, _ = theano.scan(fn=answer_question,
                            outputs_info=outputs_info + [self.w_d, self.M_d],
                            n_steps=docs.shape[1],
                            name='train_scan')

    y_dist = output[0].dimshuffle(2, 1, 0).flatten(ndim=2).T
    y_pred = y_dist.argmax(axis=1)
    y_true = y_true_matrix.ravel()
    counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
    weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)
    losses = T.nnet.categorical_crossentropy(y_dist, y_true)
    loss = lasagne.objectives.aggregate(losses, weights)
    self.test = theano.function(inputs=[questions, docs, y_true_matrix],
                                outputs=[T.grad(loss, self.bh)])
    updates = lasagne.updates.adadelta(loss, self.params, learning_rate=.001)

    # theano functions
    self.predict = theano.function(inputs=[questions, docs], outputs=y_pred)
    self.train = theano.function(inputs=[questions, docs, y_true_matrix],
                                 outputs=[y_pred, loss],
                                 updates=updates,
                                 allow_input_downcast=True)

    normalized_embeddings = self.emb / T.sqrt((self.emb ** 2).sum(axis=1)).dimshuffle(0, 'x')
    self.normalize = theano.function(inputs=[],
                                     updates={self.emb: normalized_embeddings})
def test_dot():
    x = T.imatrices("x")
    y = T.dot(x, [[1, 2], [1, 2], [1, 2]])
    f = theano.function(inputs=[x], outputs=y)
    print f([[1, 2, 3], [2, 3, 4]])
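# The (2, 3) input is multiplied by the constant (3, 2) matrix, so the call
# prints [[ 6 12]
#         [ 9 18]]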
def __init__(self, We_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden
    self.num_labels = params.num_labels
    self.de_hidden_size = params.de_hidden_size
    self.en_hidden_size = params.en_hidden_size
    print params.de_hidden_size, hidden, params.num_labels

    self.lstm_layers_num = 1

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    target_var_in = T.imatrix(name='in_targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()
    length0 = T.iscalar()
    t_t = T.fscalar()
    t_t0 = T.fscalar()

    Wyy0 = np.random.uniform(-0.02, 0.02,
                             (self.num_labels + 1, self.num_labels + 1)).astype('float32')
    Wyy = theano.shared(Wyy0)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word,
                                                   input_size=We_initial.shape[0],
                                                   output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, 512, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, 512, mask_input=l_mask_word,
                                            backwards=True)

    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * 512))
    l_local = lasagne.layers.DenseLayer(l_reshape_concat,
                                        num_units=self.num_labels,
                                        nonlinearity=lasagne.nonlinearities.linear)

    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    print len(network_params)

    # load the pretrained BiLSTM-CRF tagger weights
    f = open('ccctag_CRF_Bilstm_Viterbi_.Batchsize_10_dropout_0_LearningRate_0.01_0.0512_tagversoin_2.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        #print data[idx].shape
        p.set_value(data[idx])

    self.params = []
    self.hos = []
    self.Cos = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []

    ei, di, dt = T.imatrices(3)  # placeholders
    decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6)

    #### the last entry is for the start symbol
    self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                        value=init_xavier_uniform(self.num_labels + 1,
                                                                  self.de_hidden_size),
                                        borrow=True)
    self.linear = theano.shared(name="Linear",
                                value=init_xavier_uniform(self.de_hidden_size + 2 * self.en_hidden_size,
                                                          self.num_labels),
                                borrow=True)
    self.linear_bias = theano.shared(name="Hidden to Bias",
                                     value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                                                      dtype=theano.config.floatX),
                                     borrow=True)
    #self.hidden_decode = theano.shared(name="Hidden to Decode",
    #                                   value=init_xavier_uniform(2*hidden, self.de_hidden_size),
    #                                   borrow=True)
    #self.hidden_bias = theano.shared(name="Hidden to Bias",
    #                                 value=np.asarray(np.random.randn(self.de_hidden_size, )*0.,
    #                                                  dtype=theano.config.floatX),
    #                                 borrow=True)

    input_var_shuffle = input_var.dimshuffle(1, 0)
    mask_var_shuffle = mask_var.dimshuffle(1, 0)
    target_var_in_shuffle = target_var_in.dimshuffle(1, 0)
    target_var_shuffle = target_var.dimshuffle(1, 0)

    self.params += [self.linear, self.linear_bias, self.de_lookuptable]  # concatenate

    # (max_sent_size, batch_size, embsize)
    state_below = We[input_var_shuffle.flatten()].reshape(
        (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))

    enclstm_f = LSTM(embsize, self.en_hidden_size)
    enclstm_b = LSTM(embsize, self.en_hidden_size, True)
    self.encoder_lstm_layers.append(enclstm_f)  # append
    self.encoder_lstm_layers.append(enclstm_b)  # append
    self.params += enclstm_f.params + enclstm_b.params  # concatenate

    hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
    hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)

    hs = T.concatenate([hs_f, hs_b], axis=2)
    Cs = T.concatenate([Cs_f, Cs_b], axis=2)
    hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
    Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
    #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
    #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
    self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                        input_var_shuffle.shape[1], self.de_hidden_size),
    self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                        input_var_shuffle.shape[1], self.de_hidden_size),
    Encoder = hs

    ei, di, dt = T.imatrices(3)  # placeholders
    em, dm, tf, di0 = T.fmatrices(4)
    self.encoder_function = theano.function(inputs=[ei, em],
                                            outputs=Encoder,
                                            givens={input_var: ei, mask_var: em})

    state_below = self.de_lookuptable[target_var_in_shuffle.flatten()].reshape(
        (target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1],
         self.de_hidden_size))

    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co)

    decoder_lstm_outputs = T.concatenate([Encoder, state_below], axis=2)
    linear_outputs = T.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
    softmax_outputs, updates = theano.scan(
        fn=lambda x: T.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * T.log(pred[T.arange(input_var.shape[0]), y])

    def _step2(ctx_, state_, hs_, Cs_):
        #print ctx_.shape, state_.shape, hs_.shape, Cs_.shape
        hs, Cs = [], []
        token_idxs = T.cast(state_.argmax(axis=-1), "int32")
        msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, ctx_.shape[0], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h
        hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
        state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
        state_below0 = T.concatenate([ctx_, state_below0], axis=1)
        newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :]
        state_below = T.nnet.softmax(newpred)
        extra_p = T.zeros_like(hs[:, :, 0])
        state_below = T.concatenate([state_below, extra_p.T], axis=1)
        return state_below, hs, Cs

    ctx_0, state_0 = T.fmatrices(2)
    hs_0 = T.ftensor3()
    Cs_0 = T.ftensor3()
    state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0)
    self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0],
                                  [state_below_tmp, hs_tmp, Cs_tmp],
                                  name='f_next')

    hs0, Cs0 = T.as_tensor_variable(self.hos, name="hs0"), \
               T.as_tensor_variable(self.Cos, name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2,
                                   sequences=[Encoder],
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=input_var_shuffle.shape[0])

    predy = train_outputs[0].dimshuffle(1, 0, 2)
    predy = predy[:, :, :-1] * mask_var[:, :, None]
    predy0 = predy.reshape((-1, self.num_labels))

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(l_local,
                                             {l_in_word: input_var, l_mask_word: mask_var})
    local_energy = local_energy.reshape((-1, length, self.num_labels))
    local_energy = local_energy * mask_var[:, :, None]
    #####################
    # for the end symbol of a sequence
    ####################
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var})
    predy_in = T.argmax(predy0, axis=1)
    A = T.extra_ops.to_one_hot(predy_in, self.num_labels)
    A = A.reshape((-1, length, self.num_labels))

    #predy = predy0.reshape((-1, length, 25))
    #predy = predy*mask_var[:,:,None]

    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)

    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(fn=inner_function,
                                          outputs_info=initials,
                                          sequences=[targets_shuffled[1:],
                                                     masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * mask_var,
                                         axis=1)

    # compute the ground-truth energy
    targets_shuffled0 = A.dimshuffle(1, 0, 2)
    target_time00 = targets_shuffled0[0]
    initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])
    initials0 = [target_time00, initial_energy00]
    [_, target_energies0], _ = theano.scan(fn=inner_function,
                                           outputs_info=initials0,
                                           sequences=[targets_shuffled0[1:],
                                                      masks_shuffled[1:]])
    cost110 = target_energies0[-1] + T.sum(T.sum(local_energy * A, axis=2) * mask_var,
                                           axis=1)

    #predy_f = predy.reshape((-1, 25))
    y_f = target_var.flatten()

    if params.annealing == 0:
        lamb = params.L3
    elif params.annealing == 1:
        lamb = params.L3 * (1 - 0.01 * t_t)

    if params.regutype == 0:
        ce_hinge = lasagne.objectives.categorical_crossentropy(predy0 + eps, y_f)
        ce_hinge = ce_hinge.reshape((-1, length))
        ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
        cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
    else:
        entropy_term = -T.sum(predy0 * T.log(predy0 + eps), axis=1)
        entropy_term = entropy_term.reshape((-1, length))
        entropy_term = T.sum(entropy_term * mask_var, axis=1)
        cost = T.mean(-cost11) - lamb * T.mean(entropy_term)

    """
    f = open('F0_simple.pickle')
    PARA = pickle.load(f)
    f.close()
    l2_term = sum(lasagne.regularization.l2(x - PARA[index]) for index, x in enumerate(a_params))
    cost = T.mean(-cost11) + params.L2*l2_term
    """

    ##from adam import adam
    ##updates_a = adam(cost, self.params, params.eta)
    #updates_a = lasagne.updates.sgd(cost, self.params, params.eta)
    #updates_a = lasagne.updates.apply_momentum(updates_a, self.params, momentum=0.9)
    from momentum import momentum
    updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

    if params.regutype == 0:
        self.train_fn = theano.function(
            inputs=[ei, dt, em, em1, length0, t_t0, di0],
            outputs=[cost, ce_hinge],
            updates=updates_a,
            on_unused_input='ignore',
            givens={input_var: ei, target_var: dt, mask_var: em, mask_var1: em1,
                    length: length0, t_t: t_t0, decoderInputs0: di0})
        #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t],
        #                                [cost, ce_hinge], updates=updates_a, on_unused_input='ignore')
    else:
        self.train_fn = theano.function(
            inputs=[ei, dt, em, em1, length0, t_t0, di0],
            outputs=[cost, entropy_term],
            updates=updates_a,
            on_unused_input='ignore',
            givens={input_var: ei, target_var: dt, mask_var: em, mask_var1: em1,
                    length: length0, t_t: t_t0, decoderInputs0: di0})
        #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t],
        #                                [cost, entropy_term], updates=updates_a, on_unused_input='ignore')

    prediction = T.argmax(predy, axis=2)
    corr = T.eq(prediction, target_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    self.eval_fn = theano.function(
        inputs=[ei, dt, em, em1, length0, di0],
        outputs=[cost11, cost110, corr_train, num_tokens, prediction],
        on_unused_input='ignore',
        givens={input_var: ei, target_var: dt, mask_var: em, mask_var1: em1,
                length: length0, decoderInputs0: di0})
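# Depending on `params.regutype`, the margin objective is augmented with
# either a cross-entropy term (`ce_hinge`) or an entropy penalty
# (`entropy_term`) on the inference network's output, with the weight `lamb`
# optionally annealed through `t_t`.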
def __init__(self,
             hidden_size=100,
             nclasses=73,
             num_embeddings=11359,
             embedding_dim=100,
             window_size=1,
             memory_size=40,
             n_memory_slots=8,
             go_code=1,
             depth=2,
             load_dir=None):

    articles, titles = T.imatrices('articles', 'titles')
    n_article_slots = int(n_memory_slots / 2)  # TODO derive this from an arg
    n_title_slots = n_memory_slots - n_article_slots
    n_instances = articles.shape[0]
    self.window_size = window_size

    randoms = {
        # attr: shape
        # 'emb': (num_embeddings + 1, embedding_dim),
        'M_a': (memory_size, n_article_slots),
        'M_t': (memory_size, n_title_slots),
        'w_a': (n_article_slots,),
        'w_t': (n_title_slots,),
        'Wg_a': (window_size * embedding_dim, n_article_slots),
        'Wg_t': (window_size * embedding_dim, n_title_slots),
        'Wk': (hidden_size, memory_size),
        'Wb': (hidden_size, 1),
        'Wv': (hidden_size, memory_size),
        'We_a': (hidden_size, n_article_slots),
        'We_t': (hidden_size, n_title_slots),
        'Wx': (window_size * embedding_dim, hidden_size),
        'Wh': (memory_size, hidden_size),
        'W': (hidden_size, nclasses),
        'h0': hidden_size
    }

    zeros = {
        # attr: shape
        'bg_a': n_article_slots,
        'bg_t': n_title_slots,
        'bk': memory_size,
        'bb': 1,
        'bv': memory_size,
        'be_a': n_article_slots,
        'be_t': n_title_slots,
        'bh': hidden_size,
        'b': nclasses,
    }

    for l in range(depth):
        randoms['gru' + str(l)] = (1, embedding_dim)

    def random_shared(name):
        shape = randoms[name]
        return theano.shared(
            0.2 * np.random.normal(size=shape).astype(theano.config.floatX),
            name=name)

    def zeros_shared(name):
        shape = zeros[name]
        return theano.shared(np.zeros(shape, dtype=theano.config.floatX), name=name)

    for key in randoms:
        # create an attribute with associated shape and random values
        setattr(self, key, random_shared(key))

    for key in zeros:
        # create an attribute with associated shape and values equal to 0
        setattr(self, key, zeros_shared(key))

    self.names = randoms.keys() + zeros.keys()
    # self.names.remove('emb')  # no need to save or update embeddings
    scan_vars = 'h0 w_a M_a w_t M_t'.split()

    def repeat_for_each_instance(param):
        """ repeat param along new axis once for each instance """
        return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0)

    for key in scan_vars:
        setattr(self, key, repeat_for_each_instance(self.__getattribute__(key)))
        self.names.remove(key)

    if load_dir is not None:
        with open(os.path.join(load_dir, 'params.pkl')) as handle:
            params = pickle.load(handle)
            self.__dict__.update(params)

    def recurrence(i, h_tm1, w_a, M_a, *args, **kwargs):
        """
        notes
        Headers from paper in all caps
        mem = n_article_slots if is_article else n_title_slots

        :param i: center index of sliding window
        :param h_tm1: h_{t-1} (hidden state)
        :param w_a: attention weights for article memory
        :param M_a: article memory
        :param args: gru_weights, maybe w_t, maybe M_t
            gru_weights: weights with which to initialize GRULayer on each time step
            w_t: attention weights for titles memory
            M_t: titles memory
        :param kwargs: is_training, is_article
            is_training:
            is_article: we use different parts of memory when working with an article
        :return: [y = model outputs, i + 1 = increment index, h, w, M (see above)]
        """
        is_training = kwargs['is_training']
        is_article = kwargs['is_article']
        gru_weights = args[:depth]
        if len(args) > depth:
            w_t = args[depth]
            M_t = args[depth + 1]

        i_type = T.iscalar if is_article or is_training else T.ivector
        assert i.type == i_type

        if not is_article:
            assert w_t is not None and M_t is not None

        word_idxs = i
        if is_article or is_training:
            # get representation of word window
            document = articles if is_article else titles  # [instances, bucket_width]
            word_idxs = document[:, i:i + 1]  # [instances, 1]
        # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]
        input = InputLayer(shape=(None, 1), input_var=word_idxs)
        embed = EmbeddingLayer(input, num_embeddings, embedding_dim)
        gru = GRULayer(incoming=embed, num_units=embedding_dim, hid_init=self.gru0)
        for weight in gru_weights:
            gru = GRULayer(incoming=gru, num_units=embedding_dim, hid_init=weight)
        x_i = get_output(gru).flatten(ndim=2)
        x_i = Print('x_i')(x_i)  # [instances, embedding_dim]
        gru_weights = []

        if is_article:
            M_read = M_a  # [instances, memory_size, n_article_slots]
            w_read = w_a  # [instances, n_article_slots]
        else:
            M_read = T.concatenate([M_a, M_t], axis=2)  # [instances, memory_size, n_title_slots]
            w_read = T.concatenate([w_a, w_t], axis=1)  # [instances, n_title_slots]

        # eqn 15
        c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

        # EXTERNAL MEMORY READ
        def get_attention(Wg, bg, M, w):
            g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]

            # eqn 11
            k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

            # eqn 13
            beta = T.dot(h_tm1, self.Wb) + self.bb
            beta = T.nnet.softplus(beta)
            beta = T.addbroadcast(beta, 1)  # [instances, 1]

            # eqn 12
            w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

            # eqn 14
            return (1 - g) * w + g * w_hat  # [instances, mem]

        w_a = get_attention(self.Wg_a, self.bg_a, M_a, w_a)  # [instances, n_article_slots]
        if not is_article:
            w_t = get_attention(self.Wg_t, self.bg_t, M_t, w_t)  # [instances, n_title_slots]

        # MODEL INPUT AND OUTPUT
        # eqn 9
        h = T.dot(c, self.Wh) + T.dot(x_i, self.Wx) + self.bh  # [instances, hidden_size]

        # eqn 10
        y = T.nnet.softmax(T.dot(h, self.W) + self.b)  # [instances, nclasses]

        # EXTERNAL MEMORY UPDATE
        def update_memory(We, be, w_update, M_update):
            # eqn 17
            e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
            f = 1. - w_update * e  # [instances, mem]

            # eqn 16
            v = T.tanh(T.dot(h, self.Wv) + self.bv)  # [instances, memory_size]

            # need to add broadcast layers for memory update
            f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

            # eqn 19
            return M_update * f + T.batched_dot(v, u) * (1 - f)  # [instances, memory_size, mem]

        M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
        attention_and_memory = [w_a, M_a]
        if not is_article:
            M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
            attention_and_memory += [w_t, M_t]

        y_max = y.argmax(axis=1).astype(int32)
        next_idxs = i + 1 if is_training or is_article else y_max
        return [y, y_max, next_idxs, h] + attention_and_memory

    read_article = partial(recurrence, is_training=True, is_article=True)
    # for read_article, it actually doesn't matter whether is_training is true

    i0 = T.constant(0, dtype=int32, name='first_value_of_i')
    gru_weights = [eval('self.gru' + str(l)) for l in range(depth)]
    outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a] + gru_weights

    [_, _, _, h, w, M], _ = theano.scan(fn=read_article,
                                        outputs_info=outputs_info,
                                        n_steps=articles.shape[1],
                                        name='read_scan')

    produce_title = partial(recurrence, is_training=True, is_article=False)
    outputs_info[3:6] = [param[-1, :, :] for param in (h, w, M)]
    outputs_info.extend([self.w_t, self.M_t])
    bucket_width = titles.shape[1] - 1  # subtract 1 because <go> is omitted in y_true
    [y, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title,
                                                  outputs_info=outputs_info,
                                                  n_steps=bucket_width,
                                                  name='train_scan')

    # loss and updates
    y_clip = T.clip(y, .01, .99)
    y_flatten = y_clip.dimshuffle(2, 1, 0).flatten(ndim=2).T
    y_true = titles[:, 1:].ravel()  # [:, 1:] in order to omit <go>
    counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
    weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)
    losses = T.nnet.categorical_crossentropy(y_flatten, y_true)
    loss = objectives.aggregate(losses, weights, mode='sum')
    updates = adadelta(loss, self.params())

    self.learn = theano.function(inputs=[articles, titles],
                                 outputs=[y_max.T, loss],
                                 updates=updates,
                                 allow_input_downcast=True,
                                 name='learn')

    produce_title_test = partial(recurrence, is_training=False, is_article=False)

    self.test = theano.function(inputs=[articles, titles],
                                outputs=[y_max.T],
                                on_unused_input='ignore')

    outputs_info[2] = T.zeros([n_instances], dtype=int32) + go_code
    [_, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title_test,
                                                  outputs_info=outputs_info,
                                                  n_steps=bucket_width,
                                                  name='test_scan')

    self.predict = theano.function(inputs=[articles, titles],
                                   outputs=y_max.T,
                                   name='infer')
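# The same `recurrence` drives all three scans: reading the article, producing
# the title with teacher forcing (next_idxs = i + 1), and test-time decoding,
# where `next_idxs` switches to feeding back the argmax prediction `y_max`.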
def __init__(self, We, char_embedd_table_initial, params):
    lstm_layers_num = 1
    emb_size = We.shape[1]
    self.eta = params.eta
    self.num_labels = params.num_labels
    self.en_hidden_size = params.en_hidden_size
    self.de_hidden_size = params.de_hidden_size
    self.lstm_layers_num = params.lstm_layers_num
    self._train = None
    self._utter = None
    self.params = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.hos = []
    self.Cos = []

    char_embedd_dim = params.char_embedd_dim
    char_dic_size = len(params.char_dic)
    char_embedd_table = theano.shared(char_embedd_table_initial)

    encoderInputs = tensor.imatrix()
    decoderInputs, decoderTarget = tensor.imatrices(2)
    encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)
    char_input_var = tensor.itensor3(name='char-inputs')
    ci = tensor.itensor3()
    use_dropout = tensor.fscalar()
    use_dropout0 = tensor.fscalar()

    self.lookuptable = theano.shared(We)

    #### the last entry is for the start symbol
    self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                        value=init_xavier_uniform(self.num_labels + 1,
                                                                  self.de_hidden_size),
                                        borrow=True)
    self.linear = theano.shared(name="Linear",
                                value=init_xavier_uniform(self.de_hidden_size + 2 * self.en_hidden_size,
                                                          self.num_labels),
                                borrow=True)
    self.linear_bias = theano.shared(name="Hidden to Bias",
                                     value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                                                      dtype=theano.config.floatX),
                                     borrow=True)
    #self.hidden_decode = theano.shared(name="Hidden to Decode",
    #                                   value=init_xavier_uniform(2*en_hidden_size, self.de_hidden_size),
    #                                   borrow=True)
    #self.hidden_bias = theano.shared(name="Hidden to Bias",
    #                                 value=np.asarray(np.random.randn(self.de_hidden_size, )*0.,
    #                                                  dtype=theano.config.floatX),
    #                                 borrow=True)

    #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]  # concatenate
    self.params += [self.lookuptable, self.linear, self.linear_bias,
                    self.de_lookuptable]

    # the initial hidden state of the decoder lstm is zeros
    # (max_sent_size, batch_size, hidden_size)
    state_below = self.lookuptable[encoderInputs.flatten()].reshape(
        (encoderInputs.shape[0], encoderInputs.shape[1], emb_size))

    layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length),
                                                 input_var=char_input_var,
                                                 name='char-input')
    layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
    layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char,
                                                         input_size=char_dic_size,
                                                         output_size=char_embedd_dim,
                                                         W=char_embedd_table,
                                                         name='char_embedding')
    layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))

    # first get some necessary dimensions or parameters
    conv_window = 3
    num_filters = params.num_filters

    # construct convolution layer
    cnn_layer = lasagne.layers.Conv1DLayer(layer_char, num_filters=num_filters,
                                           filter_size=conv_window, pad='full',
                                           nonlinearity=lasagne.nonlinearities.tanh,
                                           name='cnn')
    # infer the pool size for pooling (pool size should go through all time steps of cnn)
    _, _, pool_size = cnn_layer.output_shape

    # construct max pool layer
    pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
    # reshape the layer to match the lstm incoming layer
    # [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer,
                                              (-1, encoderInputs.shape[0], [1]))

    char_params = lasagne.layers.get_all_params(output_cnn_layer, trainable=True)
    self.params += char_params

    char_state_below = lasagne.layers.get_output(output_cnn_layer)
    char_state_below = dropout_layer(char_state_below, use_dropout, trng)
    char_state_shuff = char_state_below.dimshuffle(1, 0, 2)
    state_below = tensor.concatenate([state_below, char_state_shuff], axis=2)
    state_below = dropout_layer(state_below, use_dropout, trng)

    for _ in range(self.lstm_layers_num):
        enclstm_f = LSTM(emb_size + num_filters, self.en_hidden_size)
        enclstm_b = LSTM(emb_size + num_filters, self.en_hidden_size, True)
        self.encoder_lstm_layers.append(enclstm_f)  # append
        self.encoder_lstm_layers.append(enclstm_b)  # append
        self.params += enclstm_f.params + enclstm_b.params  # concatenate

        hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
        hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

        hs = tensor.concatenate([hs_f, hs_b], axis=2)
        Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
        hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
        Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
        #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
        #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
        self.hos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX),
                                 encoderInputs.shape[1], self.de_hidden_size),
        self.Cos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX),
                                 encoderInputs.shape[1], self.de_hidden_size),
        state_below = hs

    Encoder = state_below

    state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
        (decoderInputs.shape[0], decoderInputs.shape[1], self.de_hidden_size))
    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

    ##### Here we include the representation from the decoder
    decoder_lstm_outputs = tensor.concatenate([state_below, Encoder], axis=2)

    ei, di, dt = tensor.imatrices(3)  # placeholders
    em, dm, tf, di0 = tensor.fmatrices(4)

    #####################################################
    #####################################################
    linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
    softmax_outputs, _ = theano.scan(
        fn=lambda x: tensor.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]), y])

    costs, _ = theano.scan(fn=_NLL,
                           sequences=[softmax_outputs, decoderTarget, decoderMask])
    loss = costs.sum() / decoderMask.sum() + params.L2 * sum(
        lasagne.regularization.l2(x) for x in self.params)

    #updates = lasagne.updates.adam(loss, self.params, self.eta)
    #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

    ###################################################
    #### using the ground truth when training
    ##################################################
    #self._train = theano.function(
    #    inputs=[ei, em, di, dm, dt],
    #    outputs=[loss, softmax_outputs],
    #    updates=updates,
    #    givens={encoderInputs: ei, encoderMask: em, decoderInputs: di, decoderMask: dm, decoderTarget: dt}
    #    )

    #########################################################################
    ### For schedule sampling
    #########################################################################
    ###### always use the previous prediction as the next input
    def _step2(ctx_, state_, hs_, Cs_):
        hs, Cs = [], []
        token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
        msk_ = tensor.fill((tensor.zeros_like(token_idxs, dtype="float32")), 1.)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, encoderInputs.shape[1], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h
        hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs)
        state_below0 = state_below0.reshape(
            (encoderInputs.shape[1], self.de_hidden_size))
        state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
        newpred = tensor.dot(state_below0, self.linear) + self.linear_bias[None, :]
        state_below = tensor.nnet.softmax(newpred)

        ##### the begin-symbol probability is 0
        extra_p = tensor.zeros_like(hs[:, :, 0])
        state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

        return state_below, hs, Cs

    hs0, Cs0 = tensor.as_tensor_variable(self.hos, name="hs0"), \
               tensor.as_tensor_variable(self.Cos, name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2,
                                   sequences=[Encoder],
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=encoderInputs.shape[0])

    train_predict = train_outputs[0]
    train_costs, _ = theano.scan(fn=_NLL,
                                 sequences=[train_predict, decoderTarget, decoderMask])
    train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum(
        lasagne.regularization.l2(x) for x in self.params)

    #from adam import adam
    #train_updates = adam(train_loss, self.params, self.eta)
    #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
    #train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta)
    #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
    from momentum import momentum
    train_updates = momentum(train_loss, self.params, params.eta, momentum=0.9)

    self._train2 = theano.function(
        inputs=[ei, ci, em, di0, dm, dt, use_dropout0],
        outputs=[train_loss, train_predict],
        updates=train_updates,
        givens={encoderInputs: ei, char_input_var: ci, encoderMask: em,
                decoderInputs0: di0, decoderMask: dm, decoderTarget: dt,
                use_dropout: use_dropout0}
        #givens={encoderInputs: ei, encoderMask: em, decoderInputs: di, decoderMask: dm, decoderTarget: dt, TF: tf}
    )

    listof_token_idx = train_predict.argmax(axis=-1)
    self._utter = theano.function(
        inputs=[ei, ci, em, di0, use_dropout0],
        outputs=listof_token_idx,
        givens={encoderInputs: ei, char_input_var: ci, encoderMask: em,
                decoderInputs0: di0, use_dropout: use_dropout0})
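# `_step2` realizes the schedule-sampling decoder: at every step the previous
# softmax output is argmaxed, embedded through `de_lookuptable`, and fed back
# as the next input instead of the gold label.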
def __init__(self, hidden_size=100, nclasses=73, num_embeddings=11359, embedding_dim=100, window_size=1, # TODO: do we want some kind of window? memory_size=40, n_memory_slots=8, go_code=1, load_dir=None): articles, titles = T.imatrices('articles', 'titles') n_article_slots = int(n_memory_slots / 2) # TODO derive this from an arg n_title_slots = n_memory_slots - n_article_slots n_instances = articles.shape[0] self.window_size = window_size randoms = { # attr: shape 'emb': (num_embeddings + 1, embedding_dim), 'M_a': (memory_size, n_article_slots), 'M_t': (memory_size, n_title_slots), 'w_a': (n_article_slots,), 'w_t': (n_title_slots,), 'Wg_a': (window_size * embedding_dim, n_article_slots), 'Wg_t': (window_size * embedding_dim, n_title_slots), 'Wk': (hidden_size, memory_size), 'Wb': (hidden_size, 1), 'Wv': (hidden_size, memory_size), 'We_a': (hidden_size, n_article_slots), 'We_t': (hidden_size, n_title_slots), 'Wx': (window_size * embedding_dim, hidden_size), 'Wh': (memory_size, hidden_size), 'W': (hidden_size, nclasses), 'h0': hidden_size } zeros = { # attr: shape 'bg_a': n_article_slots, 'bg_t': n_title_slots, 'bk': memory_size, 'bb': 1, 'bv': memory_size, 'be_a': n_article_slots, 'be_t': n_title_slots, 'bh': hidden_size, 'b': nclasses, } def random_shared(name): shape = randoms[name] return theano.shared( 0.2 * numpy.random.normal(size=shape).astype(theano.config.floatX), name=name) def zeros_shared(name): shape = zeros[name] return theano.shared(numpy.zeros(shape, dtype=theano.config.floatX), name=name) for key in randoms: # create an attribute with associated shape and random values setattr(self, key, random_shared(key)) for key in zeros: # create an attribute with associated shape and values equal to 0 setattr(self, key, zeros_shared(key)) if load_dir is not None: print('!!!!!!!!!!!!!!') self.load(load_dir) self.names = randoms.keys() + zeros.keys() self.names.remove('emb') # no need to save or update embeddings scan_vars = 'h0 w_a M_a w_t M_t'.split() def repeat_for_each_instance(param): """ repeat param along new axis once for each instance """ return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0) for key in scan_vars: setattr(self, key, repeat_for_each_instance(self.__getattribute__(key))) self.names.remove(key) def recurrence(i, h_tm1, w_a, M_a, w_t=None, M_t=None, is_training=True, is_article=True): """ notes Headers from paper in all caps mem = n_article slots if is_article else n_title_slots :param i: center index of sliding window :param h_tm1: h_{t-1} (hidden state) :param w_a: attention weights for article memory :param M_a: article memory :param w_t: attention weights for titles memory :param M_t: titles memory :param is_training: :param is_article: we use different parts of memory when working with a article :return: [y = model outputs, i + 1 = increment index, h w, M (see above)] """ # i_type = T.iscalar if is_article or is_training else T.ivector # assert i.type == i_type # # if not is_article: # assert w_t is not None and M_t is not None # # word_idxs = i # if is_article or is_training: # # get representation of word window # document = articles if is_article else titles # [instances, bucket_width] # word_idxs = document[:, i] # [instances, 1] # x_i = self.emb[word_idxs].flatten(ndim=2) # [instances, embedding_dim] # # if is_article: # M_read = M_a # [instances, memory_size, n_article_slots] # w_read = w_a # [instances, n_article_slots] # else: # M_read = T.concatenate([M_a, M_t], axis=2) # [instances, memory_size, n_title_slots] # w_read = 
T.concatenate([w_a, w_t], axis=1) # [instances, n_title_slots] # # # eqn 15 # c = T.batched_dot(M_read, w_read) # [instances, memory_size] # # # EXTERNAL MEMORY READ # def get_attention(Wg, bg, M, w): # g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg) # [instances, mem] # # # eqn 11 # k = T.dot(h_tm1, self.Wk) + self.bk # [instances, memory_size] # # # eqn 13 # beta = T.dot(h_tm1, self.Wb) + self.bb # beta = T.nnet.softplus(beta) # beta = T.addbroadcast(beta, 1) # [instances, 1] # # # eqn 12 # w_hat = T.nnet.softmax(beta * cosine_dist(M, k)) # # # eqn 14 # return (1 - g) * w + g * w_hat # [instances, mem] # # w_a = get_attention(self.Wg_a, self.bg_a, M_a, w_a) # [instances, n_article_slots] # if not is_article: # w_t = get_attention(self.Wg_t, self.bg_t, M_t, w_t) # [instances, n_title_slots] # # # MODEL INPUT AND OUTPUT # # eqn 9 # h = T.dot(c, self.Wh) + T.dot(x_i, self.Wx) + self.bh # [instances, hidden_size] # # # eqn 10 # y = T.nnet.softmax(T.dot(h, self.W) + self.b) # [instances, nclasses] # # # EXTERNAL MEMORY UPDATE # def update_memory(We, be, w_update, M_update): # # eqn 17 # e = T.nnet.sigmoid(T.dot(h_tm1, We) + be) # [instances, mem] # f = 1. - w_update * e # [instances, mem] # # # eqn 16 # v = T.tanh(T.dot(h, self.Wv) + self.bv) # [instances, memory_size] # # # need to add broadcast layers for memory update # f = f.dimshuffle(0, 'x', 1) # [instances, 1, mem] # u = w_update.dimshuffle(0, 'x', 1) # [instances, 1, mem] # v = v.dimshuffle(0, 1, 'x') # [instances, memory_size, 1] # # # eqn 19 # return M_update * f + T.batched_dot(v, u) * (1 - f) # [instances, memory_size, mem] # # M_a = update_memory(self.We_a, self.be_a, w_a, M_a) # attention_and_memory = [w_a, M_a] # if not is_article: # M_t = update_memory(self.We_t, self.be_t, w_t, M_t) # attention_and_memory += [w_t, M_t] # # y_max = y.argmax(axis=1).astype(int32) # next_idxs = i + 1 if is_training or is_article else y_max # return [y, y_max, next_idxs, h] + attention_and_memory + self.params() return self.params() read_article = partial(recurrence, is_article=True) i0 = T.constant(0, dtype=int32, name='first_value_of_i') outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a] self.test = theano.function([articles, titles], recurrence(*outputs_info[2:]), on_unused_input='ignore')
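#########################################################################
# A NumPy sketch of the gated content-based addressing in the commented-out
# recurrence above (eqns 11-14), reduced to a single instance. Shapes and
# values are hypothetical; cosine_dist is assumed to compare the key k
# against each memory slot (column of M).
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def cosine_dist(M, k):
    # similarity of the key against every memory slot (columns of M)
    return M.T.dot(k) / (np.linalg.norm(M, axis=0) * np.linalg.norm(k) + 1e-6)

def address(M, k, beta, g, w_prev):
    w_hat = softmax(beta * cosine_dist(M, k))  # eqn 12: sharpened content weights
    return (1 - g) * w_prev + g * w_hat        # eqn 14: interpolate with old weights

M = np.random.randn(40, 4)  # memory_size=40, 4 slots (toy)
w = address(M, k=np.random.randn(40), beta=2.0, g=0.5, w_prev=np.ones(4) / 4)
print w.sum()  # the weights still sum to 1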
def __init__(self,
             hidden_size=100,
             nclasses=73,
             num_embeddings=11359,
             embedding_dim=100,
             window_size=1,
             memory_size=40,
             n_memory_slots=8,
             go_code=1,
             depth=2,
             load_dir=None):

    articles, titles = T.imatrices('articles', 'titles')
    n_article_slots = int(n_memory_slots / 2)  # TODO derive this from an arg
    n_title_slots = n_memory_slots - n_article_slots
    n_instances = articles.shape[0]
    self.window_size = window_size

    randoms = {
        # attr: shape
        # 'emb': (num_embeddings + 1, embedding_dim),
        'M_a': (memory_size, n_article_slots),
        'M_t': (memory_size, n_title_slots),
        'w_a': (n_article_slots,),
        'w_t': (n_title_slots,),
        'Wg_a': (window_size * embedding_dim, n_article_slots),
        'Wg_t': (window_size * embedding_dim, n_title_slots),
        'Wk': (hidden_size, memory_size),
        'Wb': (hidden_size, 1),
        'Wv': (hidden_size, memory_size),
        'We_a': (hidden_size, n_article_slots),
        'We_t': (hidden_size, n_title_slots),
        'Wx': (window_size * embedding_dim, hidden_size),
        'Wh': (memory_size, hidden_size),
        'W': (hidden_size, nclasses),
        'h0': hidden_size
    }

    zeros = {
        # attr: shape
        'bg_a': n_article_slots,
        'bg_t': n_title_slots,
        'bk': memory_size,
        'bb': 1,
        'bv': memory_size,
        'be_a': n_article_slots,
        'be_t': n_title_slots,
        'bh': hidden_size,
        'b': nclasses,
    }

    for l in range(depth):
        randoms['gru' + str(l)] = (1, embedding_dim)

    def random_shared(name):
        shape = randoms[name]
        return theano.shared(
            0.2 * np.random.normal(size=shape).astype(theano.config.floatX),
            name=name)

    def zeros_shared(name):
        shape = zeros[name]
        return theano.shared(np.zeros(shape, dtype=theano.config.floatX),
                             name=name)

    for key in randoms:
        # create an attribute with associated shape and random values
        setattr(self, key, random_shared(key))

    for key in zeros:
        # create an attribute with associated shape and values equal to 0
        setattr(self, key, zeros_shared(key))

    self.names = randoms.keys() + zeros.keys()
    # self.names.remove('emb')  # no need to save or update embeddings
    scan_vars = 'h0 w_a M_a w_t M_t'.split()

    def repeat_for_each_instance(param):
        """ repeat param along new axis once for each instance """
        return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0)

    for key in scan_vars:
        setattr(self, key,
                repeat_for_each_instance(self.__getattribute__(key)))
        self.names.remove(key)

    if load_dir is not None:
        with open(os.path.join(load_dir, 'params.pkl')) as handle:
            params = pickle.load(handle)
            self.__dict__.update(params)

    def recurrence(i, h_tm1, w_a, M_a, *args, **kwargs):
        """
        notes
        Headers from paper in all caps
        mem = n_article_slots if is_article else n_title_slots

        :param i: center index of sliding window
        :param h_tm1: h_{t-1} (hidden state)
        :param w_a: attention weights for article memory
        :param M_a: article memory
        :param args: gru_weights, maybe w_t, maybe M_t
            gru_weights: weights with which to initialize GRULayer on each time step
            w_t: attention weights for titles memory
            M_t: titles memory
        :param kwargs: is_training, is_article
            is_article: we use different parts of memory when working with an article
        :return: [y = model outputs, i + 1 = incremented index, h, w, M (see above)]
        """
        is_training = kwargs['is_training']
        is_article = kwargs['is_article']
        gru_weights = args[:depth]
        if len(args) > depth:
            w_t = args[depth]
            M_t = args[depth + 1]

        i_type = T.iscalar if is_article or is_training else T.ivector
        assert i.type == i_type

        if not is_article:
            assert w_t is not None and M_t is not None

        word_idxs = i
        if is_article or is_training:
            # get representation of word window
            document = articles if is_article else titles  # [instances, bucket_width]
            word_idxs = document[:, i:i + 1]  # [instances, 1]
        # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]
        input = InputLayer(shape=(None, 1), input_var=word_idxs)
        embed = EmbeddingLayer(input, num_embeddings, embedding_dim)
        gru = GRULayer(incoming=embed, num_units=embedding_dim,
                       hid_init=self.gru0)
        for weight in gru_weights:
            gru = GRULayer(incoming=gru, num_units=embedding_dim,
                           hid_init=weight)
        x_i = get_output(gru).flatten(ndim=2)
        x_i = Print('x_i')(x_i)  # [instances, embedding_dim]
        gru_weights = []

        if is_article:
            M_read = M_a  # [instances, memory_size, n_article_slots]
            w_read = w_a  # [instances, n_article_slots]
        else:
            M_read = T.concatenate([M_a, M_t], axis=2)  # [instances, memory_size, n_article_slots + n_title_slots]
            w_read = T.concatenate([w_a, w_t], axis=1)  # [instances, n_article_slots + n_title_slots]

        # eqn 15
        c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

        # EXTERNAL MEMORY READ
        def get_attention(Wg, bg, M, w):
            g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]

            # eqn 11
            k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

            # eqn 13
            beta = T.dot(h_tm1, self.Wb) + self.bb
            beta = T.nnet.softplus(beta)
            beta = T.addbroadcast(beta, 1)  # [instances, 1]

            # eqn 12
            w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

            # eqn 14
            return (1 - g) * w + g * w_hat  # [instances, mem]

        w_a = get_attention(self.Wg_a, self.bg_a, M_a, w_a)  # [instances, n_article_slots]
        if not is_article:
            w_t = get_attention(self.Wg_t, self.bg_t, M_t, w_t)  # [instances, n_title_slots]

        # MODEL INPUT AND OUTPUT
        # eqn 9
        h = T.dot(c, self.Wh) + T.dot(x_i, self.Wx) + self.bh  # [instances, hidden_size]

        # eqn 10
        y = T.nnet.softmax(T.dot(h, self.W) + self.b)  # [instances, nclasses]

        # EXTERNAL MEMORY UPDATE
        def update_memory(We, be, w_update, M_update):
            # eqn 17
            e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
            f = 1. - w_update * e  # [instances, mem]

            # eqn 16
            v = T.tanh(T.dot(h, self.Wv) + self.bv)  # [instances, memory_size]

            # need to add broadcast dimensions for the memory update
            f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
            v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

            # eqn 19
            return M_update * f + T.batched_dot(v, u) * (1 - f)  # [instances, memory_size, mem]

        M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
        attention_and_memory = [w_a, M_a]
        if not is_article:
            M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
            attention_and_memory += [w_t, M_t]

        y_max = y.argmax(axis=1).astype('int32')
        next_idxs = i + 1 if is_training or is_article else y_max
        return [y, y_max, next_idxs, h] + attention_and_memory

    read_article = partial(recurrence, is_training=True, is_article=True)
    # for read_article, it actually doesn't matter whether is_training is true
    i0 = T.constant(0, dtype='int32', name='first_value_of_i')
    gru_weights = [getattr(self, 'gru' + str(l)) for l in range(depth)]
    outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a] + gru_weights

    [_, _, _, h, w, M], _ = theano.scan(fn=read_article,
                                        outputs_info=outputs_info,
                                        n_steps=articles.shape[1],
                                        name='read_scan')

    produce_title = partial(recurrence, is_training=True, is_article=False)
    outputs_info[3:6] = [param[-1, :, :] for param in (h, w, M)]
    outputs_info.extend([self.w_t, self.M_t])
    bucket_width = titles.shape[1] - 1  # subtract 1 because <go> is omitted in y_true
    [y, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title,
                                                  outputs_info=outputs_info,
                                                  n_steps=bucket_width,
                                                  name='train_scan')

    # loss and updates
    y_clip = T.clip(y, .01, .99)
    y_flatten = y_clip.dimshuffle(2, 1, 0).flatten(ndim=2).T
    y_true = titles[:, 1:].ravel()  # [:, 1:] in order to omit <go>
    counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
    weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)
    losses = T.nnet.categorical_crossentropy(y_flatten, y_true)
    loss = objectives.aggregate(losses, weights, mode='sum')
    updates = adadelta(loss, self.params())

    self.learn = theano.function(inputs=[articles, titles],
                                 outputs=[y_max.T, loss],
                                 updates=updates,
                                 allow_input_downcast=True,
                                 name='learn')

    produce_title_test = partial(recurrence, is_training=False,
                                 is_article=False)

    self.test = theano.function(inputs=[articles, titles],
                                outputs=[y_max.T],
                                on_unused_input='ignore')

    outputs_info[2] = T.zeros([n_instances], dtype='int32') + go_code
    [_, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title_test,
                                                  outputs_info=outputs_info,
                                                  n_steps=bucket_width,
                                                  name='test_scan')

    self.predict = theano.function(inputs=[articles, titles],
                                   outputs=y_max.T,
                                   name='infer')
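#########################################################################
# NumPy sketch of the erase/add write implemented by update_memory above
# (eqns 16-19), reduced to a single instance with toy shapes: a slot whose
# erase gate is fully open gets overwritten by the add vector v.
import numpy as np

memory_size, mem_slots = 3, 2
M = np.ones((memory_size, mem_slots))  # current memory
w = np.array([1., 0.])                 # write weights: only slot 0 is addressed
e = np.array([1., 1.])                 # erase gate fully open
v = np.array([.1, .2, .3])             # add (candidate) vector, eqn 16

f = 1. - w * e                         # eqn 17: per-slot keep fraction
M_new = M * f + np.outer(v, w) * (1 - f)  # eqn 19: erase, then add

print M_new[:, 0]  # slot 0 replaced by v: [0.1 0.2 0.3]
print M_new[:, 1]  # slot 1 untouched: [1. 1. 1.]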
def zeros_shared(shape):
    return theano.shared(numpy.zeros(shape, dtype=theano.config.floatX))

emb = T.constant(np.tile(np.arange(20).reshape(-1, 1), embedding_dim))

for (attributes, initializer) in ((weights, random_shared),
                                  (biases, zeros_shared)):
    for key in attributes:
        exec key + ' = initializer(attributes[key])'

names = weights.keys() + biases.keys()
params = map(eval, names)

# rows: words in the sentence; columns: context window size
questions, docs = T.imatrices(2)
is_question = True
inputs = questions if is_question else docs

def repeat_for_each_instance(param):
    """ repeat param along new axis once for each instance """
    return T.repeat(T.shape_padleft(param), repeats=inputs.shape[0], axis=0)

h0, w0, M0 = map(repeat_for_each_instance, [h0, w0, M0])

def recurrence(i, h_tm1, w_previous, M_previous, is_question):
    # get representation of word window
    idxs = questions if is_question else docs  # [instances, bucket_width]
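#########################################################################
# Minimal sketch of the shape_padleft + repeat idiom used by
# repeat_for_each_instance above: a shared initial state is tiled across a
# symbolic batch dimension. Toy sizes, hypothetical names.
import numpy as np
import theano
import theano.tensor as T

h0 = theano.shared(np.arange(3).astype('float32'), name='h0')  # [hidden]
inputs = T.imatrix('inputs')                                   # [instances, window]

# [hidden] -> [1, hidden] -> [instances, hidden]
h0_batch = T.repeat(T.shape_padleft(h0), repeats=inputs.shape[0], axis=0)

f = theano.function([inputs], h0_batch.shape)
print f(np.zeros((5, 2), dtype='int32'))  # prints [5 3]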
def __init__(self, We, params):
    lstm_layers_num = 1
    en_hidden_size = We.shape[1]
    self.eta = params.eta
    self.num_labels = params.num_labels
    self.en_hidden_size = en_hidden_size
    self.de_hidden_size = params.de_hidden_size
    self.lstm_layers_num = params.lstm_layers_num
    self._train = None
    self._utter = None
    self.params = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.hos = []
    self.Cos = []

    encoderInputs = tensor.imatrix()
    decoderInputs, decoderTarget = tensor.imatrices(2)
    encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)

    self.lookuptable = theano.shared(We)

    #### the last row is for the start symbol
    self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                        value=init_xavier_uniform(
                                            self.num_labels + 1,
                                            self.de_hidden_size),
                                        borrow=True)

    self.linear = theano.shared(name="Linear",
                                value=init_xavier_uniform(
                                    self.de_hidden_size, self.num_labels),
                                borrow=True)
    self.hidden_decode = theano.shared(name="Hidden to Decode",
                                       value=init_xavier_uniform(
                                           2 * en_hidden_size,
                                           self.de_hidden_size),
                                       borrow=True)

    self.hidden_bias = theano.shared(
        name="Hidden to Bias",
        value=np.asarray(np.random.randn(self.de_hidden_size, ) * 0.,
                         dtype=theano.config.floatX),
        borrow=True)

    self.params += [
        self.linear, self.de_lookuptable, self.hidden_decode,
        self.hidden_bias
    ]  # concatenate

    # (max_sent_size, batch_size, hidden_size)
    state_below = self.lookuptable[encoderInputs.flatten()].reshape(
        (encoderInputs.shape[0], encoderInputs.shape[1],
         self.en_hidden_size))
    for _ in range(self.lstm_layers_num):
        enclstm_f = LSTM(self.en_hidden_size)
        enclstm_b = LSTM(self.en_hidden_size, True)
        self.encoder_lstm_layers.append(enclstm_f)  # append
        self.encoder_lstm_layers.append(enclstm_b)  # append
        self.params += enclstm_f.params + enclstm_b.params  # concatenate

        hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
        hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

        hs = tensor.concatenate([hs_f, hs_b], axis=2)
        Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
        self.hos += tensor.tanh(
            tensor.dot(hs[-1], self.hidden_decode) + self.hidden_bias),
        self.Cos += tensor.tanh(
            tensor.dot(Cs[-1], self.hidden_decode) + self.hidden_bias),
        state_below = hs

    state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
        (decoderInputs.shape[0], decoderInputs.shape[1],
         self.de_hidden_size))
    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate

        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

    decoder_lstm_outputs = state_below

    ei, di, dt = tensor.imatrices(3)  # placeholders
    em, dm, tf, di0 = tensor.fmatrices(4)
    #####################################################
    #####################################################
    linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear)
    softmax_outputs, updates = theano.scan(
        fn=lambda x: tensor.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * tensor.log(
            pred[tensor.arange(encoderInputs.shape[1]), y])

    costs, _ = theano.scan(
        fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
    loss = costs.sum() / decoderMask.sum()

    updates = lasagne.updates.adam(loss, self.params, self.eta)
    #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

    ###################################################
    #### using the ground truth when training
    ##################################################
    self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                  outputs=[loss, softmax_outputs],
                                  updates=updates,
                                  givens={
                                      encoderInputs: ei,
                                      encoderMask: em,
                                      decoderInputs: di,
                                      decoderMask: dm,
                                      decoderTarget: dt
                                  })

    #########################################################################
    ### For scheduled sampling
    #########################################################################

    ###### always use the previous prediction as the next input
    def _step2(state_, hs_, Cs_):
        hs, Cs = [], []
        token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
        msk_ = tensor.fill(
            (tensor.zeros_like(token_idxs, dtype="float32")), 1)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, encoderInputs.shape[1], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i],
                                Cs_[i])  # mind the mask
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h

        hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
            Cs)
        newpred = tensor.dot(state_below0, self.linear).reshape(
            (encoderInputs.shape[1], self.num_labels))
        state_below = tensor.nnet.softmax(newpred)

        return state_below, hs, Cs

    hs0, Cs0 = tensor.as_tensor_variable(
        self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                         name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2,
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=encoderInputs.shape[0])

    train_predict = train_outputs[0]
    train_costs, _ = theano.scan(
        fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])

    train_loss = train_costs.sum() / decoderMask.sum()

    train_updates = lasagne.updates.adam(train_loss, self.params, self.eta)
    #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)

    self._train2 = theano.function(
        inputs=[ei, em, di0, dm, dt],
        outputs=[train_loss, train_predict],
        updates=train_updates,
        givens={
            encoderInputs: ei,
            encoderMask: em,
            decoderInputs0: di0,
            decoderMask: dm,
            decoderTarget: dt
        }
        #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
    )

    listof_token_idx = train_predict.argmax(axis=-1)
    self._utter = theano.function(inputs=[ei, em, di0],
                                  outputs=listof_token_idx,
                                  givens={
                                      encoderInputs: ei,
                                      encoderMask: em,
                                      decoderInputs0: di0
                                  })
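#########################################################################
# NumPy sketch of the masked NLL computed by _NLL above, for one timestep:
# index each row's predicted probability of its gold label, then zero out
# padded positions with the mask. Values are made up for illustration.
import numpy as np

pred = np.array([[.7, .2, .1],  # softmax outputs, [batch, num_labels]
                 [.1, .8, .1]])
y = np.array([0, 1])            # gold labels
m = np.array([1., 0.])          # mask: the second example is padding

nll = -m * np.log(pred[np.arange(2), y])
print nll  # ~[0.357, 0.] -- the padded step contributes nothing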
def __init__(self, voca_size, hidden_size, lstm_layers_num,
             learning_rate=0.2):
    self.voca_size = voca_size
    self.hidden_size = hidden_size
    self.lstm_layers_num = lstm_layers_num
    self.learning_rate = learning_rate
    self._train = None
    self._utter = None
    self.params = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.hos = []
    self.Cos = []

    encoderInputs, encoderMask = tensor.imatrices(2)
    decoderInputs, decoderMask, decoderTarget = tensor.imatrices(3)

    self.lookuptable = theano.shared(name="Encoder LookUpTable",
                                     value=utils.init_norm(
                                         self.voca_size, self.hidden_size),
                                     borrow=True)
    self.linear = theano.shared(name="Linear",
                                value=utils.init_norm(
                                    self.hidden_size, self.voca_size),
                                borrow=True)
    self.params += [self.lookuptable, self.linear]  # concatenate

    # (max_sent_size, batch_size, hidden_size)
    state_below = self.lookuptable[encoderInputs.flatten()].reshape(
        (encoderInputs.shape[0], encoderInputs.shape[1], self.hidden_size))
    for _ in range(self.lstm_layers_num):
        enclstm = LSTM(self.hidden_size)
        self.encoder_lstm_layers += enclstm,  # append
        self.params += enclstm.params  # concatenate
        hs, Cs = enclstm.forward(state_below, encoderMask)
        self.hos += hs[-1],
        self.Cos += Cs[-1],
        state_below = hs

    state_below = self.lookuptable[decoderInputs.flatten()].reshape(
        (decoderInputs.shape[0], decoderInputs.shape[1], self.hidden_size))
    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

    decoder_lstm_outputs = state_below

    ei, em, di, dm, dt = tensor.imatrices(5)  # placeholders
    #####################################################
    #####################################################
    linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear)
    softmax_outputs, updates = theano.scan(
        fn=lambda x: tensor.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * tensor.log(
            pred[tensor.arange(decoderInputs.shape[1]), y])

    costs, updates = theano.scan(
        fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
    loss = costs.sum() / decoderMask.sum()

    gparams = [tensor.grad(loss, param) for param in self.params]
    updates = [(param, param - self.learning_rate * gparam)
               for param, gparam in zip(self.params, gparams)]

    self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                  outputs=[loss, costs],
                                  updates=updates,
                                  givens={
                                      encoderInputs: ei,
                                      encoderMask: em,
                                      decoderInputs: di,
                                      decoderMask: dm,
                                      decoderTarget: dt
                                  })
    #####################################################
    #####################################################
    hs0, Cs0 = tensor.as_tensor_variable(
        self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                         name="Cs0")
    token_idxs = tensor.fill(
        tensor.zeros_like(decoderInputs, dtype="int32"), utils.idx_start)
    msk = tensor.fill((tensor.zeros_like(decoderInputs, dtype="int32")), 1)

    def _step(token_idxs, hs_, Cs_):
        hs, Cs = [], []
        state_below = self.lookuptable[token_idxs].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1],
             self.hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below, msk, hs_[i],
                                Cs_[i])  # mind the mask
            hs += h[-1],
            Cs += C[-1],
            state_below = h
        hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
            Cs)
        next_token_idx = tensor.cast(
            tensor.dot(state_below, self.linear).argmax(axis=-1), "int32")
        return next_token_idx, hs, Cs

    outputs, updates = theano.scan(fn=_step,
                                   outputs_info=[token_idxs, hs0, Cs0],
                                   n_steps=utils.max_sent_size)
    listof_token_idx = outputs[0]
    self._utter = theano.function(
        inputs=[ei, em, di],
        outputs=listof_token_idx,
        givens={
            encoderInputs: ei,
            encoderMask: em,
            decoderInputs: di
        }
        #givens={encoderInputs:ei, encoderMask:em}
    )
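#########################################################################
# Minimal self-contained sketch of the manual SGD update pattern used above
# (param <- param - learning_rate * grad), on a toy scalar model.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(0.0, name='w')
x = T.dscalar('x')
loss = (w * x - 1.) ** 2
gparams = [T.grad(loss, param) for param in [w]]
updates = [(param, param - 0.1 * gparam)
           for param, gparam in zip([w], gparams)]

train = theano.function([x], loss, updates=updates)
for _ in range(100):
    train(1.0)
print w.get_value()  # approaches 1.0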
def load_model(model_selected): input_var = T.tensor3('input') target_var, mislabel = T.imatrices('target', 'mis_label') generator = modified_Generator( data_folder_train=train_data_dir, data_folder_valid=valid_data_dir, batch_size=1, num_feed_train= 1, # number of training samples from each class as baseline nb_classes=nb_classes, nb_samples_per_class=nb_samples_per_class, img_size=img_size, max_rotation=0, max_shift=0, var=0, amount=0, max_iter=None) # Model output_var, output_var_flatten, params1 = memory_augmented_neural_network( input_var, target_var, batch_size=generator.batch_size, nb_class=generator.nb_classes, memory_shape=memory_shape, controller_size=controller_size, input_size=img_size[0] * img_size[1], nb_reads=nb_reads) accuracies = accuracy_instance( T.argmax(output_var, axis=2), target_var, mislabel, nb_classes=generator.nb_classes, nb_samples_per_class=generator.nb_samples_per_class, batch_size=generator.batch_size) cost = T.mean( T.nnet.categorical_crossentropy(output_var_flatten, target_var.flatten())) posterior_fn = theano.function([input_var, target_var], output_var) accuracy_fn = theano.function([input_var, target_var, mislabel], accuracies) cost_fn = theano.function([input_var, target_var], cost) # load the best trained model f = open(model_selected, 'rb') loaded_params = cPickle.load(f) f.close() # set the parameters for i in range(len(loaded_params)): params1[i].set_value(loaded_params[i]) d = dict([]) all_acc = all_mis = np.zeros((0, generator.nb_samples_per_class)) all_loss, accs = [], np.zeros(generator.nb_samples_per_class) all_names = all_classes = np.zeros( (0, generator.nb_samples_per_class * generator.nb_classes), dtype=np.int32) for i, (test_input, test_target, image_names) in generator: test_output = np.argmax(posterior_fn(test_input, test_target), axis=2) test_mislabel = count_mislabel(generator, image_names) acc1, _, mis = accuracy_fn(test_input, test_target, test_mislabel) d = update_dict(d, acc1, image_names, generator.num_feed_train) all_acc = np.concatenate( (all_acc, acc1.reshape([-1, generator.nb_samples_per_class]))) all_mis = np.concatenate( (all_mis, mis.reshape([-1, generator.nb_samples_per_class]))) cls_label, image_name = prepare_img_names_to_save(image_names) all_names = np.concatenate( (all_names, image_name.reshape( -1, generator.nb_samples_per_class * generator.nb_classes)), axis=0) all_classes = np.concatenate( (all_classes, cls_label.reshape( -1, generator.nb_samples_per_class * generator.nb_classes)), axis=0) loss = cost_fn(test_input, test_target) all_loss.append(loss) accs += acc1 if i > 0 and not (i % DISPLAY_FREQ): print('Episode %05d: %.6f' % (i, loss)) print(accs / 100.) accs = np.zeros(generator.nb_samples_per_class) # save the model parameters, loss and accuracy values every 500 episodes if i > 0 and not (i % MODEL_FREQ): mislabel_count = prepare_dict_to_save(d) h5f = h5py.File(test_path + '/test_Results.h5', 'w') h5f.create_dataset('all_acc_episode', data=all_acc) h5f.create_dataset('loss_episode', data=all_loss) h5f.create_dataset('names_episode', data=all_names) h5f.create_dataset('classes_episode', data=all_classes) h5f.create_dataset('mis_episode', data=all_mis) h5f.create_dataset('mislabel_count', data=mislabel_count) h5f.close() print( '****************************************************************************************' )
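#########################################################################
# Sketch of the save side matching the cPickle load in load_model above.
# Assumes params1 is the model's list of Theano shared variables; the path
# below is hypothetical.
import cPickle

def save_params(params, path):
    values = [p.get_value() for p in params]  # plain NumPy arrays
    with open(path, 'wb') as f:
        cPickle.dump(values, f, protocol=cPickle.HIGHEST_PROTOCOL)

# save_params(params1, 'best_model.pkl')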
def __init__(self, We, params):
    lstm_layers_num = 1
    en_hidden_size = We.shape[1]
    self.eta = params.eta
    self.num_labels = params.num_labels
    self.en_hidden_size = en_hidden_size
    self.de_hidden_size = params.de_hidden_size
    self.lstm_layers_num = params.lstm_layers_num
    self._train = None
    self._utter = None
    self.params = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.hos = []
    self.Cos = []

    encoderInputs = tensor.imatrix()
    decoderInputs, decoderTarget = tensor.imatrices(2)
    encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)

    self.lookuptable = theano.shared(We)

    #### the last row is for the start symbol
    self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                        value=init_xavier_uniform(
                                            self.num_labels + 1,
                                            self.de_hidden_size),
                                        borrow=True)

    self.linear = theano.shared(
        name="Linear",
        value=init_xavier_uniform(self.de_hidden_size + 2 * en_hidden_size,
                                  self.num_labels),
        borrow=True)
    self.linear_bias = theano.shared(
        name="Hidden to Bias",
        value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                         dtype=theano.config.floatX),
        borrow=True)

    #self.hidden_decode = theano.shared(name="Hidden to Decode", value=init_xavier_uniform(2*en_hidden_size, self.de_hidden_size), borrow=True)
    #self.hidden_bias = theano.shared(
    #    name="Hidden to Bias",
    #    value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX),
    #    borrow=True
    #    )

    #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]  # concatenate
    self.params += [self.linear, self.linear_bias, self.de_lookuptable
                    ]  # the initial hidden state of the decoder lstm is zeros

    # (max_sent_size, batch_size, hidden_size)
    state_below = self.lookuptable[encoderInputs.flatten()].reshape(
        (encoderInputs.shape[0], encoderInputs.shape[1],
         self.en_hidden_size))
    for _ in range(self.lstm_layers_num):
        enclstm_f = LSTM(self.en_hidden_size)
        enclstm_b = LSTM(self.en_hidden_size, True)
        self.encoder_lstm_layers.append(enclstm_f)  # append
        self.encoder_lstm_layers.append(enclstm_b)  # append
        self.params += enclstm_f.params + enclstm_b.params  # concatenate

        hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
        hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

        hs = tensor.concatenate([hs_f, hs_b], axis=2)
        Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
        hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
        Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
        #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
        #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
        self.hos += tensor.alloc(
            np.asarray(0., dtype=theano.config.floatX),
            encoderInputs.shape[1], self.de_hidden_size),
        self.Cos += tensor.alloc(
            np.asarray(0., dtype=theano.config.floatX),
            encoderInputs.shape[1], self.de_hidden_size),
        state_below = hs

    Encoder = state_below

    ei, di, dt = tensor.imatrices(3)  # placeholders
    em, dm, tf, di0 = tensor.fmatrices(4)

    self.encoder_function = theano.function(inputs=[ei, em],
                                            outputs=Encoder,
                                            givens={
                                                encoderInputs: ei,
                                                encoderMask: em
                                            })

    #####################################################
    #####################################################
    state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
        (decoderInputs.shape[0], decoderInputs.shape[1],
         self.de_hidden_size))
    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate

        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

    ##### include the encoder representation alongside the decoder outputs
    decoder_lstm_outputs = tensor.concatenate([state_below, Encoder],
                                              axis=2)

    linear_outputs = tensor.dot(decoder_lstm_outputs,
                                self.linear) + self.linear_bias[None,
                                                                None, :]
    softmax_outputs, _ = theano.scan(
        fn=lambda x: tensor.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * tensor.log(
            pred[tensor.arange(encoderInputs.shape[1]), y])

    costs, _ = theano.scan(
        fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
    loss = costs.sum() / decoderMask.sum() + params.L2 * sum(
        lasagne.regularization.l2(x) for x in self.params)

    updates = lasagne.updates.adam(loss, self.params, self.eta)
    #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

    ###################################################
    #### using the ground truth when training
    ##################################################
    self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                  outputs=[loss, softmax_outputs],
                                  updates=updates,
                                  givens={
                                      encoderInputs: ei,
                                      encoderMask: em,
                                      decoderInputs: di,
                                      decoderMask: dm,
                                      decoderTarget: dt
                                  })

    #########################################################################
    ### For scheduled sampling
    #########################################################################

    ###### always use the previous prediction as the next input
    def _step2(ctx_, state_, hs_, Cs_):
        ### ctx_: b x h
        ### state_: b x h
        ### hs_: 1 x b x h   the first dimension is the number of decoder layers
        ### Cs_: 1 x b x h   the first dimension is the number of decoder layers
        hs, Cs = [], []
        token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
        msk_ = tensor.fill(
            (tensor.zeros_like(token_idxs, dtype="float32")), 1)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, ctx_.shape[0], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i],
                                Cs_[i])  # mind the mask
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h

        hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
            Cs)
        state_below0 = state_below0.reshape(
            (ctx_.shape[0], self.de_hidden_size))
        state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
        newpred = tensor.dot(state_below0,
                             self.linear) + self.linear_bias[None, :]
        state_below = tensor.nnet.softmax(newpred)
        ##### the begin-symbol probability is 0
        extra_p = tensor.zeros_like(hs[:, :, 0])
        state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

        return state_below, hs, Cs

    ctx_0, state_0 = tensor.fmatrices(2)
    hs_0 = tensor.ftensor3()
    Cs_0 = tensor.ftensor3()

    state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0)
    self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0],
                                  [state_below_tmp, hs_tmp, Cs_tmp],
                                  name='f_next')

    hs0, Cs0 = tensor.as_tensor_variable(
        self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                         name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2,
                                   sequences=[Encoder],
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=encoderInputs.shape[0])

    train_predict = train_outputs[0]
    train_costs, _ = theano.scan(
        fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])

    train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum(
        lasagne.regularization.l2(x) for x in self.params)

    #from adam import adam
    #train_updates = adam(train_loss, self.params, self.eta)
    #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
    #train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta)
    #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
    from momentum import momentum
    train_updates = momentum(train_loss,
                             self.params,
                             params.eta,
                             momentum=0.9)

    self._train2 = theano.function(
        inputs=[ei, em, di0, dm, dt],
        outputs=[train_loss, train_predict],
        updates=train_updates,
        givens={
            encoderInputs: ei,
            encoderMask: em,
            decoderInputs0: di0,
            decoderMask: dm,
            decoderTarget: dt
        }
        #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
    )

    listof_token_idx = train_predict.argmax(axis=-1)
    self._utter = theano.function(inputs=[ei, em, di0],
                                  outputs=listof_token_idx,
                                  givens={
                                      encoderInputs: ei,
                                      encoderMask: em,
                                      decoderInputs0: di0
                                  })
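#########################################################################
# `from momentum import momentum` above pulls in a project-local optimizer
# whose call signature mirrors lasagne.updates.momentum. A hypothetical
# sketch of what such a helper could look like (an assumption, not the
# project's actual momentum.py):
from collections import OrderedDict
import numpy as np
import theano
import theano.tensor as T

def momentum(loss, params, learning_rate, momentum=0.9):
    updates = OrderedDict()
    grads = T.grad(loss, params)
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        # one velocity buffer per parameter, same shape and dtype
        velocity = theano.shared(np.zeros(value.shape, dtype=value.dtype))
        new_velocity = momentum * velocity - learning_rate * grad
        updates[velocity] = new_velocity
        updates[param] = param + new_velocity
    return updates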