def test_rmsprop_cuda(self):
    lr = 0.1
    n, m = 2, 2
    p1 = np.random.rand(n, m)
    p2 = np.random.rand(n, m)
    g1 = np.random.rand(n, m) * 0.01
    g2 = np.random.rand(n, m) * 0.01
    v1 = np.zeros((n, m))
    v2 = np.zeros((n, m))
    t1 = tensor.from_numpy(p1)
    t2 = tensor.from_numpy(p2)
    tg1 = tensor.from_numpy(g1)
    tg2 = tensor.from_numpy(g2)
    # run the numpy reference implementation for three steps
    for t in range(1, 4):
        np_rmsprop([p1, p2], [g1, g2], [v1, v2], lr, t)
    rmsprop = opt.RMSProp(lr=lr)
    self.to_cuda()
    # run the singa optimizer on GPU for the same number of steps
    for t in range(1, 4):
        rmsprop.apply(0, tg1, t1, 'p1', t)
        rmsprop.apply(0, tg2, t2, 'p2', t)
    t1 = tensor.to_numpy(t1)
    t2 = tensor.to_numpy(t2)
    # both updates should agree to 2 decimal places
    for t, p in zip([t1, t2], [p1, p2]):
        for i in range(n):
            for j in range(m):
                self.assertAlmostEqual(t[i, j], p[i, j], 2)
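# The np_rmsprop reference helper is not shown in this snippet. Below is a
# minimal numpy sketch of what the test presumably compares against; the
# rho/eps defaults are assumptions and must match the defaults of the RMSProp
# optimizer under test for the assertAlmostEqual checks to pass.
import numpy as np


def np_rmsprop(params, grads, squares, lr, step, rho=0.9, eps=1e-8):
    """In-place RMSProp update on numpy arrays (step is unused by the rule)."""
    for p, g, s in zip(params, grads, squares):
        s *= rho                        # decay the running mean of squared grads
        s += (1 - rho) * g * g
        p -= lr * g / np.sqrt(s + eps)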
def train(self, data_path, max_epoch, model_path='model'):
    # RMSProp with L2-norm gradient clipping
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    # opt = optimizer.SGD(momentum=0.9, weight_decay=5e-4)

    # initialize the embedding layer
    embed_w = self.embed.param_values()[0]
    embed_b = self.embed.param_values()[1]
    # initializer.uniform(embed_w, 0, embed_w.shape[1])
    embed_w.uniform(-0.08, 0.08)
    embed_b.set_value(0)
    print 'embed weight l1 = %f' % (embed_w.l1())
    print 'embed b l1 = %f' % (embed_b.l1())

    # initialize the lstm layer
    lstm_w = self.lstm.param_values()[0]
    lstm_w.uniform(-0.08, 0.08)  # init all lstm parameters
    print 'lstm weight l1 = %f' % (lstm_w.l1())

    # initialize the dense layer
    dense_w = self.dense.param_values()[0]
    dense_b = self.dense.param_values()[1]
    dense_w.uniform(-0.1, 0.1)
    dense_b.set_value(0)
    print 'dense w ', dense_w.shape
    print 'dense b ', dense_b.shape
    print 'dense weight l1 = %f' % (dense_w.l1())
    print 'dense b l1 = %f' % (dense_b.l1())

    start = time()
    train_dat, train_label, val_dat, val_label = load_sample()
    # train_dat, train_label, val_dat, val_label = load_corpus(data_path)
    train_label = word2onehot(train_label, 2)
    val_label = word2onehot(val_label, 2)
    print 'loading time:', time() - start
    print "train data shape:", train_dat.shape, "train label shape:", train_label.shape
    print "val data shape:", val_dat.shape, "val label shape:", val_label.shape

    for epoch in range(max_epoch):
        train_loss = 0
        num_train_batch = train_dat.shape[0] / self.batchsize
        glb_acc = 0
        for b in range(num_train_batch):
            start = time()
            # load one batch of training data
            inputs_arr = train_dat[b * self.batchsize: (b + 1) * self.batchsize]
            labels = train_label[b * self.batchsize: (b + 1) * self.batchsize]
            lens = rm_padding(inputs_arr)
            acc = 0
            batch_loss = 0.0
            # zero the gradient accumulators for this batch
            g_dense_w = tensor.Tensor(dense_w.shape, self.dev)
            g_dense_w.set_value(0)
            g_dense_b = tensor.Tensor(dense_b.shape, self.dev)
            g_dense_b.set_value(0)
            g_lstm_w = tensor.Tensor(lstm_w.shape, self.dev)
            g_lstm_w.set_value(0)
            g_embed_w = tensor.Tensor(embed_w.shape, self.dev)
            g_embed_w.set_value(0)
            for idx_sam in range(len(inputs_arr)):
                sam_arr = inputs_arr[idx_sam]
                sam_arr = convert_sample(sam_arr, sam_arr.shape[0],
                                         self.vocab_size, self.dev)
                sample = tensor.from_numpy(sam_arr)
                sample.to_device(self.dev)
                # print sample.shape
                # forward the embedding layer; embed.shape is
                # (sequence length, embed size), e.g. (53, 128)
                embed = self.embed.forward(model_pb2.kTrain, sample)
                # slice the embedding into one tensor per time step
                embeded = []
                for idx_seq in range(self.seq_length):
                    if idx_seq >= embed.shape[0]:
                        embeded.append(tensor.Tensor())
                    else:
                        seq = tensor.Tensor((1, embed.shape[1]), self.dev)
                        tensor.copy_data_to_from(seq, embed, embed.shape[1],
                                                 0, idx_seq * embed.shape[1])
                        embeded.append(seq)
                embeded.append(tensor.Tensor())  # hx
                embeded.append(tensor.Tensor())  # cx
                # print 'forward embedding time:', time() - start
                # print tensor.to_numpy(embeded[self.seq_length - 1])

                # forward the lstm layer; outputs are [y1, ..., yn, hx, cx],
                # only the last valid output is used as the latent vector
                hidden = self.lstm.forward(model_pb2.kTrain, embeded)
                # print len(hidden), hidden[embed.shape[0] - 1]
                # print [hidden[i].l1() for i in range(len(hidden))]

                # forward the dense and loss layers
                act = self.dense.forward(model_pb2.kTrain,
                                         hidden[lens[idx_sam] - 1])
                label = tensor.from_numpy(labels[idx_sam])
                label.to_device(self.dev)
                lvalue = self.loss.forward(model_pb2.kTrain, act, label)
                # print 'forward dense time:', time() - start
                regularized_act = self.sft.forward(model_pb2.kEval, act)
                pred = tensor.to_numpy(regularized_act)
                gt = labels[idx_sam][1]
                if (gt and pred[0, 1] > pred[0, 0]) or \
                        (gt == 0 and pred[0, 1] <= pred[0, 0]):
                    acc += 1
                grads = []
                batch_loss += lvalue.l1() / self.batchsize
                # print batch_loss
                start = time()

                # backward the loss and dense layers
                grad = self.loss.backward() / self.batchsize
                grad, gwb = self.dense.backward(model_pb2.kTrain, grad)
                g_dense_w += gwb[0]
                g_dense_b += gwb[1]
                # print 'dense_w l1 = %f' % (gwb[0].l1())

                # only the last valid time step receives a non-zero gradient
                for i in range(self.seq_length):
                    if i == lens[idx_sam] - 1:
                        grads.append(grad)
                    else:
                        emp = tensor.Tensor(grad.shape, self.dev)
                        emp.set_value(0)
                        grads.append(emp)
                grads.append(tensor.Tensor())  # hy
                grads.append(tensor.Tensor())  # cy

                # backward the lstm layer
                lstm_input_grad, lstm_param_grad = self.lstm.backward(
                    model_pb2.kTrain, grads)
                g_lstm_w += lstm_param_grad[0]
                # print 'lstm_input l1 = %f' % (lstm_input_grad[0].l1())

                # gather the per-step input gradients and backward the embedding
                embed_grad = tensor.Tensor(embed.shape, self.dev)
                for idx in range(len(lstm_input_grad) - 2):
                    tensor.copy_data_to_from(embed_grad, lstm_input_grad[idx],
                                             embed.shape[1],
                                             idx * embed.shape[1], 0)
                _, grad_w = self.embed.backward(model_pb2.kTrain, embed_grad)
                # print 'backward embedding time:', time() - start
                # print 'embed weight l1 = %f' % (grad_w[0].l1())
                g_embed_w += grad_w[0]

            train_loss += batch_loss
            glb_acc += acc
            utils.update_progress(
                b * 1.0 / num_train_batch,
                'training loss = %f, acc = %f' %
                (batch_loss, acc * 1.0 / self.batchsize))
            opt.apply_with_lr(epoch, get_lr(epoch), g_lstm_w, lstm_w, 'lstm_w')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w, 'dense_w')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b, 'dense_b')
            opt.apply_with_lr(epoch, get_lr(epoch), g_embed_w, embed_w, 'embed_w')
            # opt.apply_with_lr(epoch, get_lr(epoch), grad_w[1], embed_b, 'embed_b')
        print '\nEpoch %d, train loss is %f, acc = %f' % \
            (epoch, train_loss / num_train_batch,
             glb_acc * 1. / (self.batchsize * num_train_batch))

        # evaluation
        eval_loss = 0
        val_acc = 0
        num_test_batch = min(5000, val_dat.shape[0] / self.batchsize)
        for b in range(num_test_batch):
            acc = 0
            val_arr = val_dat[b * self.batchsize: (b + 1) * self.batchsize]
            labels = val_label[b * self.batchsize: (b + 1) * self.batchsize]
            lens = rm_padding(val_arr)
            val_arr = convert(val_arr, self.batchsize, self.seq_length,
                              self.vocab_size, self.dev)
            val_arr = np.swapaxes(val_arr, 0, 1).reshape(
                (self.batchsize * self.seq_length, self.vocab_size))
            inputs = tensor.from_numpy(val_arr)
            inputs.to_device(self.dev)  # shape (128 * 53, 33366)
            embed = self.embed.forward(model_pb2.kEval, inputs)
            embed.reshape((self.seq_length, self.batchsize, self.embed_size))
            # slice the embedding into one tensor per time step
            embeded = []
            for idx in range(self.seq_length):
                embed_seq = tensor.Tensor(
                    (self.batchsize, self.embed_size), self.dev)
                tensor.copy_data_to_from(
                    embed_seq, embed, self.batchsize * self.embed_size, 0,
                    idx * self.batchsize * self.embed_size)
                embeded.append(embed_seq)
            embeded.append(tensor.Tensor())  # hx
            embeded.append(tensor.Tensor())  # cx

            hidden = self.lstm.forward(model_pb2.kEval, embeded)
            # gather the last valid hidden state of every sample into one batch
            hidden_batch = tensor.Tensor(
                (self.batchsize, self.hidden_size), self.dev)
            for idx in range(self.batchsize):
                tensor.copy_data_to_from(
                    hidden_batch, hidden[lens[idx] - 1], self.hidden_size,
                    idx * self.hidden_size, idx * self.hidden_size)
            act = self.dense.forward(model_pb2.kEval, hidden_batch)
            labels = tensor.from_numpy(labels)
            labels.to_device(self.dev)
            eval_loss += self.loss.forward(model_pb2.kEval, act, labels).l1()
            regularized_act = self.sft.forward(model_pb2.kEval, act)
            pred = tensor.to_numpy(regularized_act)
            gt = tensor.to_numpy(labels)[:, 1]
            for i in range(self.batchsize):
                if (gt[i] and pred[i, 1] > pred[i, 0]) or \
                        (gt[i] == 0 and pred[i, 1] <= pred[i, 0]):
                    acc += 1
            # print 'acc = %f' % (acc * 1. / self.batchsize)
            val_acc += acc
        print 'Epoch %d, evaluation loss is %f, acc = %f' % \
            (epoch, eval_loss / num_test_batch,
             val_acc * 1. / (num_test_batch * self.batchsize))

        # model saving
        if (epoch + 1) % 2 == 0 or epoch + 1 == max_epoch:
            print 'dense weight l1 = %f' % (dense_w.l1())
            print 'dense bias l1 = %f' % (dense_b.l1())
            print 'lstm weight l1 = %f' % (lstm_w.l1())
            print 'embed weight l1 = %f' % (embed_w.l1())
            # checkpoint the model to a file
            with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd:
                print 'saving model to %s' % model_path
                d = {}
                for name, w in zip(
                        ['embed_w', 'embed_b', 'lstm_w', 'dense_w', 'dense_b'],
                        [embed_w, embed_b, lstm_w, dense_w, dense_b]):
                    w.to_host()
                    d[name] = tensor.to_numpy(w)
                    w.to_device(self.dev)
                '''d['idx_to_char'] = data.idx_to_char
                d['char_to_idx'] = data.char_to_idx
                d['hidden_size'] = hidden_size
                d['num_stacks'] = num_stacks
                d['dropout'] = dropout'''
                pickle.dump(d, fd)
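# The helpers word2onehot and rm_padding used by the training loop above are
# defined elsewhere in the example. The sketch below is a hypothetical numpy
# version of the behaviour the loop appears to assume (two-class one-hot
# labels, and per-sequence lengths recovered from zero-padded rows); the real
# helpers may differ in padding token, dtype, or return type.
import numpy as np


def word2onehot(labels, num_classes=2):
    """Turn integer class ids into one-hot rows, shape (len(labels), num_classes)."""
    onehot = np.zeros((len(labels), num_classes), dtype=np.float32)
    onehot[np.arange(len(labels)), np.asarray(labels, dtype=np.int64)] = 1
    return onehot


def rm_padding(batch, pad_id=0):
    """Return the unpadded length of every zero-padded sequence in the batch."""
    return [int(np.count_nonzero(np.asarray(seq) != pad_id)) for seq in batch]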
def train(data, max_epoch, hidden_size=100, seq_length=100, batch_size=16,
          num_stacks=1, dropout=0.5, model_path='model'):
    # RMSProp with L2-norm gradient clipping
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    cuda = device.create_cuda_gpu()
    rnn = layer.LSTM(name='lstm', hidden_size=hidden_size,
                     num_stacks=num_stacks, dropout=dropout,
                     input_sample_shape=(data.vocab_size, ))
    rnn.to_device(cuda)
    print 'created rnn'
    rnn_w = rnn.param_values()[0]
    rnn_w.uniform(-0.08, 0.08)  # init all rnn parameters
    print 'rnn weight l1 = %f' % (rnn_w.l1())

    dense = layer.Dense('dense', data.vocab_size,
                        input_sample_shape=(hidden_size, ))
    dense.to_device(cuda)
    dense_w = dense.param_values()[0]
    dense_b = dense.param_values()[1]
    print 'dense w ', dense_w.shape
    print 'dense b ', dense_b.shape
    initializer.uniform(dense_w, dense_w.shape[0], 0)
    print 'dense weight l1 = %f' % (dense_w.l1())
    dense_b.set_value(0)
    print 'dense b l1 = %f' % (dense_b.l1())

    g_dense_w = tensor.Tensor(dense_w.shape, cuda)
    g_dense_b = tensor.Tensor(dense_b.shape, cuda)

    lossfun = loss.SoftmaxCrossEntropy()
    for epoch in range(max_epoch):
        train_loss = 0
        for b in range(data.num_train_batch):
            batch = data.train_dat[b * batch_size:(b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            inputs.append(tensor.Tensor())  # hx
            inputs.append(tensor.Tensor())  # cx
            # forward the rnn; drop the trailing hy, cy outputs
            outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]
            grads = []
            batch_loss = 0
            g_dense_w.set_value(0.0)
            g_dense_b.set_value(0.0)
            for output, label in zip(outputs, labels):
                act = dense.forward(model_pb2.kTrain, output)
                lvalue = lossfun.forward(model_pb2.kTrain, act, label)
                batch_loss += lvalue.l1()
                grad = lossfun.backward()
                grad /= batch_size
                grad, gwb = dense.backward(model_pb2.kTrain, grad)
                grads.append(grad)
                g_dense_w += gwb[0]
                g_dense_b += gwb[1]
                # print output.l1(), act.l1()
            utils.update_progress(
                b * 1.0 / data.num_train_batch,
                'training loss = %f' % (batch_loss / seq_length))
            train_loss += batch_loss

            grads.append(tensor.Tensor())  # hy
            grads.append(tensor.Tensor())  # cy
            g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]
            dense_w, dense_b = dense.param_values()
            opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w,
                              'dense_w')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b,
                              'dense_b')
        print '\nEpoch %d, train loss is %f' % \
            (epoch, train_loss / data.num_train_batch / seq_length)

        eval_loss = 0
        for b in range(data.num_test_batch):
            batch = data.val_dat[b * batch_size:(b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            inputs.append(tensor.Tensor())  # hx
            inputs.append(tensor.Tensor())  # cx
            outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2]
            for output, label in zip(outputs, labels):
                output = dense.forward(model_pb2.kEval, output)
                eval_loss += lossfun.forward(model_pb2.kEval, output,
                                             label).l1()
        print 'Epoch %d, evaluation loss is %f' % \
            (epoch, eval_loss / data.num_test_batch / seq_length)

        if (epoch + 1) % 30 == 0:
            # checkpoint the model to a file
            with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd:
                print 'saving model to %s' % model_path
                d = {}
                for name, w in zip(['rnn_w', 'dense_w', 'dense_b'],
                                   [rnn_w, dense_w, dense_b]):
                    w.to_host()
                    d[name] = tensor.to_numpy(w)
                    w.to_device(cuda)
                d['idx_to_char'] = data.idx_to_char
                d['char_to_idx'] = data.char_to_idx
                d['hidden_size'] = hidden_size
                d['num_stacks'] = num_stacks
                d['dropout'] = dropout
                pickle.dump(d, fd)
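# A checkpoint written by the loop above can be read back with pickle for
# later sampling. The sketch below only relies on the key names saved above;
# the load_checkpoint name itself is illustrative, not part of the example.
import pickle


def load_checkpoint(path):
    """Read back the parameter arrays and metadata pickled by train()."""
    with open(path, 'rb') as fd:
        d = pickle.load(fd)
    params = (d['rnn_w'], d['dense_w'], d['dense_b'])   # numpy arrays
    meta = dict((k, d[k]) for k in ('idx_to_char', 'char_to_idx',
                                    'hidden_size', 'num_stacks', 'dropout'))
    return params, meta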
from singa import device, initializer, layer, optimizer
from singa.proto import model_pb2
from tqdm import tnrange
from word2tensor import load_data, numpy2tensors, convert, labelconvert

import time


def get_lr(epoch):
    # step decay: halve the base learning rate of 0.001 every 50 epochs
    return 0.001 / float(1 << (epoch / 50))


if __name__ == "__main__":
    # RMSProp with L2-norm gradient clipping
    vocab_size = 7000
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    cuda = device.create_cuda_gpu_on(1)
    encoder = layer.LSTM(name='lstm1', hidden_size=64, num_stacks=5,
                         dropout=0.5, input_sample_shape=(vocab_size, ))
    decoder = layer.LSTM(name='lstm2', hidden_size=64, num_stacks=5,
                         dropout=0.5, input_sample_shape=(vocab_size, ))
    encoder.to_device(cuda)
    decoder.to_device(cuda)
    encoder_w = encoder.param_values()[0]
    encoder_w.uniform(-0.08, 0.08)
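# optimizer.L2Constraint(5) rescales a gradient whose L2 norm exceeds the
# threshold before the RMSProp update is applied. The numpy sketch below shows
# that clipping rule in isolation; the exact SINGA semantics (e.g. epsilon
# handling) may differ slightly.
import numpy as np


def l2_clip(grad, threshold=5.0, eps=1e-8):
    """Scale grad down so that its L2 norm does not exceed threshold."""
    norm = np.linalg.norm(grad)
    if norm > threshold:
        grad = grad * (threshold / (norm + eps))
    return grad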