# Dependencies used throughout this file. `owl` is the Minerva ndarray package and
# `owl.elewise` its element-wise operators. The `softmax` helper and the `model`
# objects (with their Layers list and gate/decoder/embedding weights) are assumed
# to be provided elsewhere in this repo.
import math
import time

import numpy as np

import owl
import owl.elewise as ele


def ff(self, x, phase):
    # Cell-input ("ff") activation: element-wise tanh.
    return ele.tanh(x)
def LSTM_train(model, sents, words, learning_rate, EPOCH, tanhC_version=1):
    # Constants
    N = model.Layers[1]  # Number of units
    K = model.Layers[2]  # Vocabulary size
    last_time = time.time()
    # For each epoch
    for epoch_id in range(1, EPOCH + 1):
        epoch_ll = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            ##### Initialize activations #####
            Tau = len(sent)
            sent_ll = 0  # Sentence log likelihood
            data = [None] * Tau
            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])
            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau
            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            dY = [None] * Tau
            dBd = owl.zeros([model.Layers[2], 1])
            dWd = owl.zeros([model.Layers[2], model.Layers[1]])
            dHout = [None] * Tau
            dEmb = [None] * Tau

            ##### Forward pass #####
            # For each time step: predict the (t+1)'th word from the t'th word
            for t in range(1, Tau):
                data[t] = model.emb_weight[sent[t - 1]]
                NVector = np.zeros((K, 1))
                NVector[sent[t]] = 1
                target = owl.from_numpy(NVector).trans()

                # Gates and candidate cell input
                act_ig[t] = model.ig_weight_data * data[t] + model.ig_weight_prev * Hout[t - 1] + model.ig_weight_cell * C[t - 1] + model.ig_weight_bias
                act_ig[t] = ele.sigm(act_ig[t])
                act_fg[t] = model.fg_weight_data * data[t] + model.fg_weight_prev * Hout[t - 1] + model.fg_weight_cell * C[t - 1] + model.fg_weight_bias
                act_fg[t] = ele.sigm(act_fg[t])
                act_ff[t] = model.ff_weight_data * data[t] + model.ff_weight_prev * Hout[t - 1] + model.ff_weight_bias
                act_ff[t] = ele.tanh(act_ff[t])
                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
                act_og[t] = model.og_weight_data * data[t] + model.og_weight_prev * Hout[t - 1] + model.og_weight_cell * C[t] + model.og_weight_bias
                act_og[t] = ele.sigm(act_og[t])
                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])

                Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

                # BP to Hout
                dY[t] = Y - target
                dBd += dY[t]
                dWd += dY[t] * Hout[t].trans()
                dHout[t] = model.decoder_weights.trans() * dY[t]

                # Evaluation: base-2 log probability of the target word
                output = Y.to_numpy()  # can directly read a single element from Y
                sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)

            ##### Initialize gradient accumulators #####
            weight_update_ig_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_ig_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])
            weight_update_fg_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_fg_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])
            weight_update_og_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_og_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])
            weight_update_ff_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_ff_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])

            dC = [None] * Tau
            for t in xrange(Tau):
                dC[t] = owl.zeros(C[t].shape)

            ##### Backward pass: calculate the error and accumulate it #####
            for t in reversed(range(1, Tau)):
                # BP from the og-controlled gate and og
                if tanhC_version:
                    tanhC = ele.tanh(C[t])
                    dTanhC = ele.mult(dHout[t], act_og[t])
                    sen_og = ele.mult(dHout[t], tanhC)
                    dC[t] += ele.mult((1 - ele.mult(tanhC, tanhC)), dTanhC)
                else:
                    sen_og = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                # BP from og
                sen_og = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og)
                dHout[t - 1] = model.og_weight_prev.trans() * sen_og
                dC[t] += model.og_weight_cell.trans() * sen_og
                dEmb[t] = model.og_weight_data.trans() * sen_og

                # BP from the fg-controlled gate
                sen_fg = ele.mult(C[t - 1], dC[t])
                dC[t - 1] += ele.mult(act_fg[t], dC[t])
                # BP from the ig-controlled gate
                sen_ig = ele.mult(act_ff[t], dC[t])
                sen_ff = ele.mult(act_ig[t], dC[t])
                sen_ff = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff)
                dEmb[t] += model.ff_weight_data.trans() * sen_ff

                # BP from fg
                sen_fg = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg)
                dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg
                dC[t - 1] += model.fg_weight_cell.trans() * sen_fg
                dEmb[t] += model.fg_weight_data.trans() * sen_fg

                # BP from ig
                sen_ig = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig)
                dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig
                dC[t - 1] += model.ig_weight_cell.trans() * sen_ig
                dEmb[t] += model.ig_weight_data.trans() * sen_ig

                # Derivatives w.r.t. weight matrices and biases
                weight_update_ig_data += sen_ig * data[t].trans()
                weight_update_ig_prev += sen_ig * Hout[t - 1].trans()
                weight_update_ig_cell += sen_ig * C[t - 1].trans()
                weight_update_ig_bias += sen_ig
                weight_update_fg_data += sen_fg * data[t].trans()
                weight_update_fg_prev += sen_fg * Hout[t - 1].trans()
                weight_update_fg_cell += sen_fg * C[t - 1].trans()
                weight_update_fg_bias += sen_fg
                weight_update_og_data += sen_og * data[t].trans()
                weight_update_og_prev += sen_og * Hout[t - 1].trans()
                weight_update_og_cell += sen_og * C[t].trans()
                weight_update_og_bias += sen_og
                weight_update_ff_data += sen_ff * data[t].trans()
                weight_update_ff_prev += sen_ff * Hout[t - 1].trans()
                weight_update_ff_bias += sen_ff

            # Normalize the gradients by the sentence length
            rate = learning_rate / Tau

            # Weight update (gradient descent, hence -=)
            model.ig_weight_prev -= rate * weight_update_ig_prev
            model.ig_weight_data -= rate * weight_update_ig_data
            model.ig_weight_cell -= rate * weight_update_ig_cell
            model.ig_weight_bias -= rate * weight_update_ig_bias
            model.fg_weight_prev -= rate * weight_update_fg_prev
            model.fg_weight_data -= rate * weight_update_fg_data
            model.fg_weight_cell -= rate * weight_update_fg_cell
            model.fg_weight_bias -= rate * weight_update_fg_bias
            model.og_weight_prev -= rate * weight_update_og_prev
            model.og_weight_data -= rate * weight_update_og_data
            model.og_weight_cell -= rate * weight_update_og_cell
            model.og_weight_bias -= rate * weight_update_og_bias
            model.ff_weight_prev -= rate * weight_update_ff_prev
            model.ff_weight_data -= rate * weight_update_ff_data
            model.ff_weight_bias -= rate * weight_update_ff_bias
            model.decoder_weights -= rate * dWd
            model.decoder_bias -= rate * dBd
            for t in range(1, Tau):
                model.emb_weight[sent[t - 1]] -= rate * dEmb[t]

            # Accumulate results for this epoch
            epoch_ll += sent_ll

        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 2 ** epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl))
        print " time consumed:", cur_time - last_time
        last_time = cur_time
    return model, learning_rate
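# The epoch statistics above turn a summed log likelihood into perplexity via
# PPL = base ** (-LL / words). The following tiny helper is an illustrative
# restatement of that arithmetic, not part of the original script (the loop
# above inlines it with base 2; the variants further down use base 10).
def perplexity(total_log_prob, word_count, base=2):
    # total_log_prob: sum of log_base probabilities of the predicted words
    # word_count:     number of predicted word positions
    entropy = -total_log_prob / float(word_count)
    return base ** entropy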
def LSTM_test(model, sents, words, tanhC_version=1):
    N = model.Layers[1]
    K = model.Layers[2]
    test_ll = 0
    # For each sentence
    for sent_id, sent in enumerate(sents):
        ##### Initialize activations #####
        Tau = len(sent)
        sent_ll = 0  # Sentence log likelihood
        data = [None] * Tau
        Hout = [None] * Tau
        Hout[0] = owl.zeros([N, 1])
        act_ig = [None] * Tau
        act_fg = [None] * Tau
        act_og = [None] * Tau
        act_ff = [None] * Tau
        C = [None] * Tau
        C[0] = owl.zeros([N, 1])

        ##### Forward pass #####
        # For each time step: predict the (t+1)'th word from the t'th word
        for t in range(1, Tau):
            data[t] = model.emb_weight[sent[t - 1]]
            act_ig[t] = model.ig_weight_data * data[t] + model.ig_weight_prev * Hout[t - 1] + model.ig_weight_cell * C[t - 1] + model.ig_weight_bias
            act_ig[t] = ele.sigm(act_ig[t])
            act_fg[t] = model.fg_weight_data * data[t] + model.fg_weight_prev * Hout[t - 1] + model.fg_weight_cell * C[t - 1] + model.fg_weight_bias
            act_fg[t] = ele.sigm(act_fg[t])
            act_ff[t] = model.ff_weight_data * data[t] + model.ff_weight_prev * Hout[t - 1] + model.ff_weight_bias
            act_ff[t] = ele.tanh(act_ff[t])
            C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
            act_og[t] = model.og_weight_data * data[t] + model.og_weight_prev * Hout[t - 1] + model.og_weight_cell * C[t] + model.og_weight_bias
            act_og[t] = ele.sigm(act_og[t])
            if tanhC_version:
                Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
            else:
                Hout[t] = ele.mult(act_og[t], C[t])

            Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

            # Evaluation: base-2 log probability of the target word
            output = Y.to_numpy()  # can directly read a single element from Y
            sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)

        test_ll += sent_ll

    test_ent = test_ll * (-1) / words
    test_ppl = 2 ** test_ent
    print "Test PPL =", test_ppl
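# Illustrative driver for the embedding-based LSTM_train/LSTM_test pair defined
# directly above; it is not part of the original script. Everything here is an
# assumption: the `model` object must expose the Layers list and the ig_/fg_/og_/
# ff_/decoder_/emb_ weights used above, `train_sents`/`test_sents` are lists of
# integer word-id sequences, and `words` is taken to be the number of predicted
# word positions.
def _example_run(model, train_sents, test_sents):
    train_words = sum(len(s) - 1 for s in train_sents)  # predicted positions
    test_words = sum(len(s) - 1 for s in test_sents)
    model, alpha = LSTM_train(model, train_sents, train_words,
                              learning_rate=0.1, EPOCH=5)
    LSTM_test(model, test_sents, test_words)
    return model, alpha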
def LSTM_train(model, sents, vocab_size, words, NUM_EPOCHS=100, tanhC_version=1):
    # Constants
    ALPHA = 1  # Learning rate (unused)
    N = 10  # Number of units
    learning_rate = 1
    K = vocab_size  # Vocabulary size
    last_ll = 1e99
    last_time = time.time()
    # For each epoch
    for epoch_id in range(1, NUM_EPOCHS + 1):
        epoch_ll = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            ##### Initialize activations #####
            Tau = len(sent)
            sent_ll = 0  # Sentence log likelihood
            batch_size = Tau
            data = [None] * Tau
            prev = [None] * Tau
            embed = np.zeros((K, 1))
            embed[sent[0]] = 1
            data[0] = owl.from_numpy(embed).trans()
            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])
            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau
            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            Ym = [None] * Tau
            dY = [None] * Tau
            dBd = owl.zeros([model.Layers[2], 1])
            dWd = owl.zeros([model.Layers[1], model.Layers[2]])
            dHout = [None] * Tau

            ##### Forward pass #####
            # For each time step: one-hot input is the previous word, target is the current word
            for t in range(1, Tau):
                prev[t] = Hout[t - 1]
                embed = np.zeros((K, 1))
                embed[sent[t]] = 1
                data[t] = owl.from_numpy(embed).trans()

                act_ig[t] = model.ig_weight_data.trans() * data[t - 1] + model.ig_weight_prev.trans() * prev[t] + model.ig_weight_bias
                act_fg[t] = model.fg_weight_data.trans() * data[t - 1] + model.fg_weight_prev.trans() * prev[t] + model.fg_weight_bias
                act_og[t] = model.og_weight_data.trans() * data[t - 1] + model.og_weight_prev.trans() * prev[t] + model.og_weight_bias
                act_ff[t] = model.ff_weight_data.trans() * data[t - 1] + model.ff_weight_prev.trans() * prev[t] + model.ff_weight_bias
                act_ig[t] = ele.sigm(act_ig[t])
                act_fg[t] = ele.sigm(act_fg[t])
                act_og[t] = ele.sigm(act_og[t])
                act_ff[t] = ele.tanh(act_ff[t])

                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])
                Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] + model.decoder_bias)

                # Decoder gradients (target minus prediction)
                dY[t] = data[t] - Ym[t]
                dBd += dY[t] / batch_size
                dWd += Hout[t] * dY[t].trans() / batch_size
                dHout[t] = model.decoder_weights * dY[t]

            # Sentence log likelihood (base 10): probability assigned to the observed word
            for t in range(1, Tau):
                output = Ym[t].trans() * data[t]
                sent_ll += math.log10(max(np.sum(output.to_numpy()), 1e-20))

            ##### Initialize gradient vectors #####
            sen_ig = [None] * Tau
            sen_fg = [None] * Tau
            sen_og = [None] * Tau
            sen_ff = [None] * Tau
            weight_update_ig_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_ig_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])
            weight_update_fg_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_fg_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])
            weight_update_og_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_og_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])
            weight_update_ff_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_ff_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])
            dHin = owl.zeros([model.Layers[1], model.Layers[1]])  # (unused)
            dC = [None] * Tau
            for t in xrange(Tau):
                dC[t] = owl.zeros(C[t].shape)

            ##### Backward pass: calculate the error and accumulate it #####
            for t in reversed(range(1, len(sent))):
                if tanhC_version:
                    tanhCt = ele.tanh(C[t])
                    sen_og[t] = ele.mult(tanhCt, dHout[t])
                    dC[t] += ele.mult((1 - ele.mult(tanhCt, tanhCt)), ele.mult(act_og[t], dHout[t]))
                else:
                    sen_og[t] = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                sen_fg[t] = owl.zeros([model.Layers[1], 1])
                if t > 0:
                    sen_fg[t] = ele.mult(C[t - 1], dC[t])
                    # Cell gradient flows back through the forget gate
                    dC[t - 1] += ele.mult(act_fg[t], dC[t])
                sen_ig[t] = ele.mult(act_ff[t], dC[t])
                sen_ff[t] = ele.mult(act_ig[t], dC[t])

                # Backprop through the activation functions
                sen_ff[t] = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff[t])
                sen_ig[t] = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig[t])
                sen_fg[t] = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg[t])
                sen_og[t] = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og[t])

                # Backprop through the matrix multiplies
                weight_update_ig_data += data[t] * sen_ig[t].trans()
                weight_update_ig_prev += prev[t] * sen_ig[t].trans()
                weight_update_ig_bias += sen_ig[t]
                weight_update_fg_data += data[t] * sen_fg[t].trans()
                weight_update_fg_prev += prev[t] * sen_fg[t].trans()
                weight_update_fg_bias += sen_fg[t]
                weight_update_og_data += data[t] * sen_og[t].trans()
                weight_update_og_prev += prev[t] * sen_og[t].trans()
                weight_update_og_bias += sen_og[t]
                weight_update_ff_data += data[t] * sen_ff[t].trans()
                weight_update_ff_prev += prev[t] * sen_ff[t].trans()
                weight_update_ff_bias += sen_ff[t]

                if t > 1:
                    dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig[t]
                    dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg[t]
                    dHout[t - 1] += model.og_weight_prev.trans() * sen_og[t]
                    dHout[t - 1] += model.ff_weight_prev.trans() * sen_ff[t]

            # Normalize the gradients by the sentence length (batch_size == Tau)
            weight_update_ig_prev /= batch_size
            weight_update_ig_data /= batch_size
            weight_update_ig_bias /= batch_size
            weight_update_fg_prev /= batch_size
            weight_update_fg_data /= batch_size
            weight_update_fg_bias /= batch_size
            weight_update_og_prev /= batch_size
            weight_update_og_data /= batch_size
            weight_update_og_bias /= batch_size
            weight_update_ff_prev /= batch_size
            weight_update_ff_data /= batch_size
            weight_update_ff_bias /= batch_size

            # Weight update (gradient ascent on the log likelihood, hence +=)
            model.ig_weight_prev += learning_rate * weight_update_ig_prev
            model.ig_weight_data += learning_rate * weight_update_ig_data
            model.ig_weight_bias += learning_rate * weight_update_ig_bias
            model.fg_weight_prev += learning_rate * weight_update_fg_prev
            model.fg_weight_data += learning_rate * weight_update_fg_data
            model.fg_weight_bias += learning_rate * weight_update_fg_bias
            model.og_weight_prev += learning_rate * weight_update_og_prev
            model.og_weight_data += learning_rate * weight_update_og_data
            model.og_weight_bias += learning_rate * weight_update_og_bias
            model.ff_weight_prev += learning_rate * weight_update_ff_prev
            model.ff_weight_data += learning_rate * weight_update_ff_data
            model.ff_weight_bias += learning_rate * weight_update_ff_bias
            model.decoder_weights += learning_rate * dWd
            model.decoder_bias += learning_rate * dBd

            # Accumulate results for this epoch
            epoch_ll += sent_ll

        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 10 ** epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl))
        print " time consumed:", cur_time - last_time
        # Halve the learning rate when the epoch log likelihood stops improving
        if last_ll > epoch_ll:
            learning_rate /= 2.0
        last_ll = epoch_ll
        last_time = cur_time
def LSTM_test(model, sents, vocab_size, words, tanhC_version=1):
    N = 10
    K = vocab_size
    test_ll = 0
    # For each sentence
    for sent_id, sent in enumerate(sents):
        ##### Initialize activations #####
        Tau = len(sent)
        sent_ll = 0  # Sentence log likelihood
        batch_size = Tau
        data = [None] * Tau
        prev = [None] * Tau
        embed = np.zeros((K, 1))
        embed[sent[0]] = 1
        data[0] = owl.from_numpy(embed).trans()
        Hout = [None] * Tau
        Hout[0] = owl.zeros([N, 1])
        act_ig = [None] * Tau
        act_fg = [None] * Tau
        act_og = [None] * Tau
        act_ff = [None] * Tau
        C = [None] * Tau
        C[0] = owl.zeros([N, 1])
        Ym = [None] * Tau

        ##### Forward pass #####
        # For each time step
        for t in range(1, Tau):
            prev[t] = Hout[t - 1]
            embed = np.zeros((K, 1))
            embed[sent[t]] = 1
            data[t] = owl.from_numpy(embed).trans()

            act_ig[t] = model.ig_weight_data.trans() * data[t - 1] + model.ig_weight_prev.trans() * prev[t] + model.ig_weight_bias
            act_fg[t] = model.fg_weight_data.trans() * data[t - 1] + model.fg_weight_prev.trans() * prev[t] + model.fg_weight_bias
            act_og[t] = model.og_weight_data.trans() * data[t - 1] + model.og_weight_prev.trans() * prev[t] + model.og_weight_bias
            act_ff[t] = model.ff_weight_data.trans() * data[t - 1] + model.ff_weight_prev.trans() * prev[t] + model.ff_weight_bias
            act_ig[t] = ele.sigm(act_ig[t])
            act_fg[t] = ele.sigm(act_fg[t])
            act_og[t] = ele.sigm(act_og[t])
            act_ff[t] = ele.tanh(act_ff[t])

            C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
            if tanhC_version:
                Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
            else:
                Hout[t] = ele.mult(act_og[t], C[t])
            Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] + model.decoder_bias)

            # Base-10 log probability assigned to the observed word at time t
            output = Ym[t].trans() * data[t]
            test_ll += math.log10(max(np.sum(output.to_numpy()), 1e-20))

    print test_ll
    test_ent = test_ll * (-1) / words
    test_ppl = 10 ** test_ent
    print("Test PPL = %f" % (test_ppl))
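# The one-hot training and test functions above repeatedly build a one-hot column
# for a word id with numpy and hand it to owl. This small helper captures that
# pattern for reference; it is illustrative only, and the originals inline it.
def one_hot_column(word_id, vocab_size):
    vec = np.zeros((vocab_size, 1))
    vec[word_id] = 1
    return owl.from_numpy(vec).trans()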
def LSTM_train(model, sents, words, learning_rate, EPOCH, tanhC_version=1):
    # Constants
    N = model.Layers[1]  # Number of units
    K = model.Layers[0]  # Vocabulary size
    last_ll = 1e99
    # For each epoch
    for epoch_id in range(EPOCH, EPOCH + 10):
        print 'Start epoch #', epoch_id
        last_time = time.time()
        epoch_ll = 0
        tau_sum = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            ##### Initialize activations #####
            Tau = len(sent)
            tau_sum += Tau
            sent_ll = 0  # Sentence log likelihood
            batch_size = Tau
            data = [None] * Tau
            prev = [None] * Tau
            # This variant only benchmarks the forward pass, so the real one-hot
            # inputs are replaced by zero vectors.
            data[0] = owl.zeros([K, 1])
            # embed = np.zeros((K, 1))
            # embed[sent[0]] = 1
            # data[0] = owl.from_numpy(embed).trans()
            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])
            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau
            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            Ym = [None] * Tau
            dY = [None] * Tau
            dBd = owl.zeros([model.Layers[2], 1])
            dWd = owl.zeros([model.Layers[1], model.Layers[2]])
            dHout = [None] * Tau

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                # prev[t] = Hout[t - 1]
                prev[t] = owl.zeros([N, 1])
                data[t] = owl.zeros([K, 1])
                # embed = np.zeros((K, 1))
                # embed[sent[t]] = 1
                # data[t] = owl.from_numpy(embed).trans()
                act_ig[t] = model.ig_weight_data.trans() * data[t - 1] + model.ig_weight_prev.trans() * prev[t] + model.ig_weight_bias
                act_fg[t] = model.fg_weight_data.trans() * data[t - 1] + model.fg_weight_prev.trans() * prev[t] + model.fg_weight_bias
                act_og[t] = model.og_weight_data.trans() * data[t - 1] + model.og_weight_prev.trans() * prev[t] + model.og_weight_bias
                act_ff[t] = model.ff_weight_data.trans() * data[t - 1] + model.ff_weight_prev.trans() * prev[t] + model.ff_weight_bias
                act_ig[t] = ele.sigm(act_ig[t])
                act_fg[t] = ele.sigm(act_fg[t])
                act_og[t] = ele.sigm(act_og[t])
                act_ff[t] = ele.tanh(act_ff[t])

                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])
                Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] + model.decoder_bias)
                dY[t] = data[t] - Ym[t]
                dBd += dY[t] / batch_size
                dWd += Hout[t] * dY[t].trans() / batch_size
                dHout[t] = model.decoder_weights * dY[t]

            # Force evaluation of the lazily built dataflow graph
            # Ym[-1].wait_for_eval()
            for t in range(1, Tau):
                Ym[t].wait_for_eval()
                # output = Ym[t].trans() * data[t]
                # sent_ll += math.log10(max(np.sum(output.to_numpy()), 1e-20))

            if sent_id % 100 == 0:
                cur_time = time.time()
                print 'Finished', sent_id, 'sentences. Time used:', cur_time - last_time, 's. sent/s:', float(sent_id) / (cur_time - last_time), 'tau_sum=', tau_sum
                # print owl.print_profiler_result()
                tau_sum = 0
            # The backward pass and weight update below are disabled in this
            # benchmarking variant.
            continue

            sen_ig = [None] * Tau
            sen_fg = [None] * Tau
            sen_og = [None] * Tau
            sen_ff = [None] * Tau
            weight_update_ig_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_ig_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])
            weight_update_fg_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_fg_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])
            weight_update_og_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_og_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])
            weight_update_ff_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_ff_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])
            dHin = owl.zeros([model.Layers[1], model.Layers[1]])  # (unused)
            dC = [None] * Tau
            for t in xrange(Tau):
                dC[t] = owl.zeros(C[t].shape)

            ##### Backward pass: calculate the error and accumulate it #####
            for t in reversed(range(1, len(sent))):
                if tanhC_version:
                    tanhCt = ele.tanh(C[t])
                    sen_og[t] = ele.mult(tanhCt, dHout[t])
                    dC[t] += ele.mult((1 - ele.mult(tanhCt, tanhCt)), ele.mult(act_og[t], dHout[t]))
                else:
                    sen_og[t] = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                sen_fg[t] = owl.zeros([model.Layers[1], 1])
                if t > 0:
                    sen_fg[t] = ele.mult(C[t - 1], dC[t])
                    dC[t - 1] += ele.mult(act_fg[t], dC[t])
                sen_ig[t] = ele.mult(act_ff[t], dC[t])
                sen_ff[t] = ele.mult(act_ig[t], dC[t])

                # Backprop through the activation functions
                sen_ff[t] = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff[t])
                sen_ig[t] = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig[t])
                sen_fg[t] = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg[t])
                sen_og[t] = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og[t])

                # Backprop through the matrix multiplies
                weight_update_ig_data += data[t] * sen_ig[t].trans()
                weight_update_ig_prev += prev[t] * sen_ig[t].trans()
                weight_update_ig_bias += sen_ig[t]
                weight_update_fg_data += data[t] * sen_fg[t].trans()
                weight_update_fg_prev += prev[t] * sen_fg[t].trans()
                weight_update_fg_bias += sen_fg[t]
                weight_update_og_data += data[t] * sen_og[t].trans()
                weight_update_og_prev += prev[t] * sen_og[t].trans()
                weight_update_og_bias += sen_og[t]
                weight_update_ff_data += data[t] * sen_ff[t].trans()
                weight_update_ff_prev += prev[t] * sen_ff[t].trans()
                weight_update_ff_bias += sen_ff[t]

                if t > 1:
                    dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig[t]
                    dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg[t]
                    dHout[t - 1] += model.og_weight_prev.trans() * sen_og[t]
                    dHout[t - 1] += model.ff_weight_prev.trans() * sen_ff[t]

            # Weight update (gradient ascent on the log likelihood), with the LSTM
            # gradients normalized by the sentence length (batch_size == Tau)
            model.ig_weight_prev += learning_rate / batch_size * weight_update_ig_prev
            model.ig_weight_data += learning_rate / batch_size * weight_update_ig_data
            model.ig_weight_bias += learning_rate / batch_size * weight_update_ig_bias
            model.fg_weight_prev += learning_rate / batch_size * weight_update_fg_prev
            model.fg_weight_data += learning_rate / batch_size * weight_update_fg_data
            model.fg_weight_bias += learning_rate / batch_size * weight_update_fg_bias
            model.og_weight_prev += learning_rate / batch_size * weight_update_og_prev
            model.og_weight_data += learning_rate / batch_size * weight_update_og_data
            model.og_weight_bias += learning_rate / batch_size * weight_update_og_bias
            model.ff_weight_prev += learning_rate / batch_size * weight_update_ff_prev
            model.ff_weight_data += learning_rate / batch_size * weight_update_ff_data
            model.ff_weight_bias += learning_rate / batch_size * weight_update_ff_bias
            model.decoder_weights += learning_rate * dWd
            model.decoder_bias += learning_rate * dBd

            # Accumulate results for this epoch
            epoch_ll += sent_ll

        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 10 ** epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl))
        print " time consumed:", cur_time - last_time
        last_time = cur_time
        # Halve the learning rate when the epoch log likelihood stops improving
        if last_ll > epoch_ll:
            learning_rate /= 2.0
        last_ll = epoch_ll
    return model, learning_rate
def ff(self, x):
    # Cell-input ("ff") activation: element-wise tanh.
    return ele.tanh(x)
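# `softmax` is used throughout this file but not defined here; the repo is assumed
# to provide one that operates on owl arrays. For reference, the math it must
# implement, written with numpy on a column vector (illustrative only):
def softmax_reference(column):
    # column: numpy array of shape (K, 1); returns a normalized probability column
    shifted = column - np.max(column)  # shift for numerical stability
    exp = np.exp(shifted)
    return exp / np.sum(exp)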