def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, use_cpu=False):
    """Train and evaluate `net` with mini-batch SGD.

    Args:
        data: tuple (train_x, train_y, test_x, test_y) of numpy arrays;
            inputs are fed as (batch_size, 3, 32, 32) images -- presumably
            CIFAR-like data, TODO confirm with caller.
        net: SINGA feed-forward net providing train()/evaluate()/save().
        max_epoch: number of training epochs.
        get_lr: callable mapping epoch index -> learning rate.
        weight_decay: L2 regularization coefficient for SGD.
        batch_size: samples per mini-batch; trailing samples that do not
            fill a whole batch are silently dropped.
        use_cpu: if True run on the default host device, else on a CUDA GPU.
    """
    print('Start initialization............')  # fixed typo: 'intialization'
    if use_cpu:
        print('Using CPU')
        dev = device.get_default_device()
    else:
        print('Using GPU')
        dev = device.create_cuda_gpu()

    net.to_device(dev)
    opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
    for (p, specs) in zip(net.param_names(), net.param_specs()):
        opt.register(p, specs)

    # Device-resident buffers reused by every batch to avoid reallocation.
    tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
    ty = tensor.Tensor((batch_size, ), dev, core_pb2.kInt)
    train_x, train_y, test_x, test_y = data
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)
    for epoch in range(max_epoch):
        np.random.shuffle(idx)
        loss, acc = 0.0, 0.0
        lr = get_lr(epoch)  # loop-invariant: was re-evaluated per param per batch
        print('Epoch %d' % epoch)
        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            grads, (l, a) = net.train(tx, ty)
            loss += l
            acc += a
            for (s, p, g) in zip(net.param_names(), net.param_values(), grads):
                opt.apply_with_lr(epoch, lr, g, p, str(s), b)
            # update progress bar
            utils.update_progress(b * 1.0 / num_train_batch,
                                  'training loss = %f, accuracy = %f' % (l, a))
        info = '\ntraining loss = %f, training accuracy = %f, lr = %f' \
            % ((loss / num_train_batch), (acc / num_train_batch), lr)
        print(info)

        # Evaluate on the test set: sequential (unshuffled) batches.
        loss, acc = 0.0, 0.0
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            l, a = net.evaluate(tx, ty)
            loss += l
            acc += a

        print('test loss = %f, test accuracy = %f' %
              ((loss / num_test_batch), (acc / num_test_batch)))
    net.save('model', 20)  # save model params into checkpoint file
def train(data_dir, net, num_epoch=20, batch_size=250): print 'Start intialization............' cuda = device.create_cuda_gpu() net.to_device(cuda) opt = optimizer.SGD(momentum=0.9,weight_decay=0.04) for (p, specs) in zip(net.param_values(), net.param_specs()): filler = specs.filler if filler.type == 'gaussian': initializer.gaussian(p, filler.mean, filler.std) else: p.set_value(0) opt.register(p, specs) print specs.name, filler.type, p.l1() print 'Loading data ..................' train_x, train_y = load_dataset(data_dir,1) test_x, test_y = load_dataset(data_dir,2) tx = tensor.Tensor((batch_size,3), cuda) ty = tensor.Tensor((batch_size,),cuda, core_pb2.kInt) #ta = tensor.Tensor((batch_size,3), cuda) #tb = tensor.Tensor((batch_size,),cuda, core_pb2.kInt) num_train_batch = train_x.shape[0]/batch_size num_test_batch = test_x.shape[0]/batch_size idx = np.arange(train_x.shape[0], dtype=np.int32) id = np.arange(test_x.shape[0],dtype=np.int32) for epoch in range(num_epoch): np.random.shuffle(idx) loss, acc = 0.000,0.000 print 'Epoch %d' % epoch for b in range(num_train_batch): x = train_x[idx[b * batch_size:(b+1)* batch_size]] y = train_y[idx[b * batch_size:(b+1)* batch_size]] tx.copy_from_numpy(x) ty.copy_from_numpy(y) grads, (l, a) = net.train(tx, ty) loss += l acc += a for (s, p, g) in zip(net.param_specs(), net.param_values(), grads): opt.apply_with_lr(epoch, get_lr(epoch), g, p, str(s.name)) # update progress bar utils.update_progress(b * 1.0 / num_train_batch, 'training loss = %f, accuracy = %f' % (l, a)) info = '\ntraining loss = %f, training accuracy = %f' \ % (loss/num_train_batch, acc/num_train_batch) print info loss,acc=0.000,0.000 np.random.shuffle(id) for b in range(num_test_batch): x = test_x[b * batch_size:(b+1) * batch_size] y = test_y[b * batch_size:(b+1) * batch_size] tx.copy_from_numpy(x) ty.copy_from_numpy(y) l, a = net.evaluate(tx, ty) loss += l acc += a print 'test loss = %f, test accuracy = %f' \ % (loss / num_test_batch, acc / num_test_batch) 
net.save('model.bin') # save model params into checkpoint file
def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100, use_cpu=False):
    """Run the mini-batch SGD training/evaluation loop for `net`.

    `data` unpacks into (train_x, train_y, test_x, test_y) numpy arrays; each
    epoch shuffles the training indices, runs every full batch through
    `net.train`, applies per-parameter updates, then reports loss/accuracy on
    sequential test batches. Partial trailing batches are dropped.
    """
    print('Start intialization............')
    if use_cpu:
        print('Using CPU')
        dev = device.get_default_device()
    else:
        print('Using GPU')
        dev = device.create_cuda_gpu()

    net.to_device(dev)
    opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
    for name, spec in zip(net.param_names(), net.param_specs()):
        opt.register(name, spec)

    # Reusable device-side input/label buffers.
    x_buf = tensor.Tensor((batch_size, 3, 32, 32), dev)
    y_buf = tensor.Tensor((batch_size,), dev, tensor.int32)
    train_x, train_y, test_x, test_y = data
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    order = np.arange(train_x.shape[0], dtype=np.int32)

    for epoch in range(max_epoch):
        np.random.shuffle(order)
        loss, acc = 0.0, 0.0
        print('Epoch %d' % epoch)

        for step in range(num_train_batch):
            lo = step * batch_size
            picked = order[lo:lo + batch_size]
            x_buf.copy_from_numpy(train_x[picked])
            y_buf.copy_from_numpy(train_y[picked])
            grads, (batch_loss, batch_acc) = net.train(x_buf, y_buf)
            loss += batch_loss
            acc += batch_acc
            for name, value, grad in zip(net.param_names(),
                                         net.param_values(), grads):
                opt.apply_with_lr(epoch, get_lr(epoch), grad, value,
                                  str(name), step)
            # update progress bar
            utils.update_progress(
                step * 1.0 / num_train_batch,
                'training loss = %f, accuracy = %f' % (batch_loss, batch_acc))

        print('\ntraining loss = %f, training accuracy = %f, lr = %f'
              % ((loss / num_train_batch), (acc / num_train_batch),
                 get_lr(epoch)))

        # Test pass: plain sequential slices, no shuffling.
        loss, acc = 0.0, 0.0
        for step in range(num_test_batch):
            lo = step * batch_size
            x_buf.copy_from_numpy(test_x[lo:lo + batch_size])
            y_buf.copy_from_numpy(test_y[lo:lo + batch_size])
            batch_loss, batch_acc = net.evaluate(x_buf, y_buf)
            loss += batch_loss
            acc += batch_acc

        print('test loss = %f, test accuracy = %f' %
              ((loss / num_test_batch), (acc / num_test_batch)))
    net.save('model', 20)  # save model params into checkpoint file
def train(inputfolder, outputfolder, visfolder, trainratio, validationratio, testratio, dev, agent, max_epoch, use_cpu, batch_size=100):
    """Train a conv net on feature/label text files, reporting to `agent`.

    Loads features.txt/label.txt from `inputfolder`, trains for `max_epoch`
    epochs with SGD (fixed lr 0.005), treats the first half of each epoch's
    batches as a "validation" tally, and writes the final validation loss to
    `outputfolder`/final_results.txt. Progress/metrics are pushed through the
    `agent` message queue; `handle_cmd(agent)` lets a client stop training.
    NOTE(review): trainratio/validationratio/testratio/visfolder are accepted
    but never used in this body.
    """
    opt = optimizer.SGD(momentum=0.9, weight_decay=0.01)
    agent.push(MsgType.kStatus, 'Downlaoding data...')
    # all_feature, all_label = get_data(os.path.join(inputfolder, 'features.txt'), os.path.join(inputfolder, 'label.txt'))  # PUT THE DATA on/to dbsystem
    all_feature, all_label = get_data(
        os.path.join(inputfolder, 'features.txt'),
        os.path.join(inputfolder, 'label.txt'))  # PUT THE DATA on/to dbsystem
    agent.push(MsgType.kStatus, 'Finish downloading data')
    n_folds = 5  # NOTE(review): unused since the k-fold loop below is disabled
    print("all_label shape: ", all_label.shape)
    # keep only the second label column -- presumably the binary class; confirm
    all_label = all_label[:, 1]
    # for i, (train_index, test_index) in enumerate(StratifiedKFold(all_label.reshape(all_label.shape[0]), n_folds=n_folds)):
    # Cross-validation is disabled: a fixed slice of the first 1404 samples
    # is used as the training fold and the loop exits on the first pass.
    for i in range(3):
        train_index = np.arange(0, 1404)
        train_feature, train_label = all_feature[train_index], all_label[
            train_index]
        if i == 0:
            print("fold: ", i)
            break
    print("train label sum: ", train_label.sum())
    # input is treated as a (1, 12, 375) "image" per sample
    in_shape = np.array([1, 12, 375])
    trainx = tensor.Tensor(
        (batch_size, int(in_shape[0]), int(in_shape[1]), int(in_shape[2])),
        dev)
    trainy = tensor.Tensor((batch_size, ), dev, tensor.int32)
    num_train_batch = train_feature.shape[0] / batch_size
    idx = np.arange(train_feature.shape[0], dtype=np.int32)
    # height = 12
    # width = 375
    # kernel_y = 3
    # kernel_x = 80
    # stride_y = 1
    # stride_x = 20
    # conv hyper-parameters: [height, width, kernel_y, kernel_x, stride_y, stride_x]
    hyperpara = np.array([12, 375, 3, 10, 1, 3])
    height, width, kernel_y, kernel_x, stride_y, stride_x = hyperpara[
        0], hyperpara[1], hyperpara[2], hyperpara[3], hyperpara[4], hyperpara[
        5]
    print('kernel_y: ', kernel_y)
    print('kernel_x: ', kernel_x)
    print('stride_y: ', stride_y)
    print('stride_x: ', stride_x)
    net = model.create_net(in_shape, hyperpara, use_cpu)
    net.to_device(dev)
    test_epoch = 10  # NOTE(review): unused in this body
    occlude_test_epoch = 100  # NOTE(review): unused in this body
    for epoch in range(max_epoch):
        if handle_cmd(agent):  # client requested stop
            break
        # NOTE(review): re-seeding with a constant every epoch makes every
        # shuffle identical -- likely unintended; confirm before changing.
        np.random.seed(10)
        np.random.shuffle(idx)
        train_feature, train_label = train_feature[idx], train_label[idx]
        print('Epoch %d' % epoch)
        loss, acc = 0.0, 0.0
        val_loss, val_acc = 0.0, 0.0
        # using the first half as validation
        for b in range(int(num_train_batch)):
            x, y = train_feature[b * batch_size:(b + 1) * batch_size], \
                train_label[b * batch_size:(b + 1) * batch_size]
            x = x.reshape((batch_size, in_shape[0], in_shape[1], in_shape[2]))
            trainx.copy_from_numpy(x)
            trainy.copy_from_numpy(y)
            grads, (l, a), probs = net.train(trainx, trainy)
            loss += l
            acc += a
            # first half of the (shuffled) batches also counted as validation
            if b < (int(num_train_batch / 2)):
                val_loss += l
                val_acc += a
            # fixed learning rate 0.005 for every parameter
            for (s, p, g) in zip(net.param_specs(),
                                 net.param_values(), grads):
                opt.apply_with_lr(epoch, 0.005, g, p, str(s.name))
            info = 'training loss = %f, training accuracy = %f' % (l, a)
            utils.update_progress(b * 1.0 / num_train_batch, info)
        # put training status info into a shared queue
        info = dict(phase='train', step=epoch,
                    accuracy=acc / num_train_batch,
                    loss=loss / num_train_batch,
                    timestamp=time.time())
        agent.push(MsgType.kInfoMetric, info)
        info = 'training loss = %f, training accuracy = %f' \
            % (loss / num_train_batch, acc / num_train_batch)
        print(info)
        val_info = 'validation loss = %f, validation accuracy = %f' \
            % (val_loss / (int(num_train_batch / 2)),
               val_acc / (int(num_train_batch / 2)))
        print(val_info)
        # persist the last epoch's validation loss for downstream tooling
        if epoch == (max_epoch - 1):
            print('final val_loss: ', val_loss / (int(num_train_batch / 2)))
            np.savetxt(outputfolder + '/final_results.txt',
                       np.full((1), val_loss / (int(num_train_batch / 2))),
                       delimiter=",")
def train(data, max_epoch, hidden_size=100, seq_length=100, batch_size=16, num_stacks=1, dropout=0.5, model_path='model'): # SGD with L2 gradient normalization opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5)) cuda = device.create_cuda_gpu() rnn = layer.LSTM(name='lstm', hidden_size=hidden_size, num_stacks=num_stacks, dropout=dropout, input_sample_shape=(data.vocab_size, )) rnn.to_device(cuda) print 'created rnn' rnn_w = rnn.param_values()[0] rnn_w.uniform(-0.08, 0.08) # init all rnn parameters print 'rnn weight l1 = %f' % (rnn_w.l1()) dense = layer.Dense('dense', data.vocab_size, input_sample_shape=(hidden_size, )) dense.to_device(cuda) dense_w = dense.param_values()[0] dense_b = dense.param_values()[1] print 'dense w ', dense_w.shape print 'dense b ', dense_b.shape initializer.uniform(dense_w, dense_w.shape[0], 0) print 'dense weight l1 = %f' % (dense_w.l1()) dense_b.set_value(0) print 'dense b l1 = %f' % (dense_b.l1()) g_dense_w = tensor.Tensor(dense_w.shape, cuda) g_dense_b = tensor.Tensor(dense_b.shape, cuda) lossfun = loss.SoftmaxCrossEntropy() for epoch in range(max_epoch): train_loss = 0 for b in range(data.num_train_batch): batch = data.train_dat[b * batch_size:(b + 1) * batch_size] inputs, labels = convert(batch, batch_size, seq_length, data.vocab_size, cuda) inputs.append(tensor.Tensor()) inputs.append(tensor.Tensor()) outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2] grads = [] batch_loss = 0 g_dense_w.set_value(0.0) g_dense_b.set_value(0.0) for output, label in zip(outputs, labels): act = dense.forward(model_pb2.kTrain, output) lvalue = lossfun.forward(model_pb2.kTrain, act, label) batch_loss += lvalue.l1() grad = lossfun.backward() grad /= batch_size grad, gwb = dense.backward(model_pb2.kTrain, grad) grads.append(grad) g_dense_w += gwb[0] g_dense_b += gwb[1] # print output.l1(), act.l1() utils.update_progress( b * 1.0 / data.num_train_batch, 'training loss = %f' % (batch_loss / seq_length)) train_loss += batch_loss 
grads.append(tensor.Tensor()) grads.append(tensor.Tensor()) g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0] dense_w, dense_b = dense.param_values() opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw') opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w, 'dense_w') opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b, 'dense_b') print '\nEpoch %d, train loss is %f' % \ (epoch, train_loss / data.num_train_batch / seq_length) eval_loss = 0 for b in range(data.num_test_batch): batch = data.val_dat[b * batch_size:(b + 1) * batch_size] inputs, labels = convert(batch, batch_size, seq_length, data.vocab_size, cuda) inputs.append(tensor.Tensor()) inputs.append(tensor.Tensor()) outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2] for output, label in zip(outputs, labels): output = dense.forward(model_pb2.kEval, output) eval_loss += lossfun.forward(model_pb2.kEval, output, label).l1() print 'Epoch %d, evaluation loss is %f' % \ (epoch, eval_loss / data.num_test_batch / seq_length) if (epoch + 1) % 30 == 0: # checkpoint the file model with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd: print 'saving model to %s' % model_path d = {} for name, w in zip(['rnn_w', 'dense_w', 'dense_b'], [rnn_w, dense_w, dense_b]): w.to_host() d[name] = tensor.to_numpy(w) w.to_device(cuda) d['idx_to_char'] = data.idx_to_char d['char_to_idx'] = data.char_to_idx d['hidden_size'] = hidden_size d['num_stacks'] = num_stacks d['dropout'] = dropout pickle.dump(d, fd)
def train(data, max_epoch, hidden_size=100, seq_length=100, batch_size=16,
          num_stacks=1, dropout=0.5, model_path='model'):
    """Train a character-level LSTM language model (Python 2).

    `data` provides train_dat/val_dat, vocab_size, num_train_batch,
    num_test_batch and the char<->idx maps. Each batch is unrolled to
    `seq_length` steps through the LSTM; a shared dense layer + softmax
    cross-entropy is applied per step, dense gradients are accumulated over
    the steps, and RMSProp (L2 gradient-norm constraint 5) updates the LSTM
    and dense parameters once per batch. Checkpoints are pickled to
    '<model_path>_<epoch>.bin' every 30 epochs. Relies on module-level
    `convert` and `get_lr` -- presumably defined elsewhere in this file.
    """
    # SGD with L2 gradient normalization
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    cuda = device.create_cuda_gpu()
    rnn = layer.LSTM(
        name='lstm',
        hidden_size=hidden_size,
        num_stacks=num_stacks,
        dropout=dropout,
        input_sample_shape=(
            data.vocab_size, ))
    rnn.to_device(cuda)
    print 'created rnn'
    rnn_w = rnn.param_values()[0]
    rnn_w.uniform(-0.08, 0.08)  # init all rnn parameters
    print 'rnn weight l1 = %f' % (rnn_w.l1())
    dense = layer.Dense(
        'dense',
        data.vocab_size,
        input_sample_shape=(
            hidden_size, ))
    dense.to_device(cuda)
    dense_w = dense.param_values()[0]
    dense_b = dense.param_values()[1]
    print 'dense w ', dense_w.shape
    print 'dense b ', dense_b.shape
    initializer.uniform(dense_w, dense_w.shape[0], 0)
    print 'dense weight l1 = %f' % (dense_w.l1())
    dense_b.set_value(0)
    print 'dense b l1 = %f' % (dense_b.l1())
    # accumulators for dense-layer gradients, summed over the unrolled steps
    g_dense_w = tensor.Tensor(dense_w.shape, cuda)
    g_dense_b = tensor.Tensor(dense_b.shape, cuda)
    lossfun = loss.SoftmaxCrossEntropy()
    for epoch in range(max_epoch):
        train_loss = 0
        for b in range(data.num_train_batch):
            batch = data.train_dat[b * batch_size: (b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            # two empty tensors are the initial hidden/cell states (hx, cx)
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())
            # drop the trailing hx/cx outputs; keep the per-step outputs
            outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]
            grads = []
            batch_loss = 0
            g_dense_w.set_value(0.0)
            g_dense_b.set_value(0.0)
            # forward/backward the shared dense layer at every unrolled step
            for output, label in zip(outputs, labels):
                act = dense.forward(model_pb2.kTrain, output)
                lvalue = lossfun.forward(model_pb2.kTrain, act, label)
                batch_loss += lvalue.l1()
                grad = lossfun.backward()
                grad /= batch_size
                grad, gwb = dense.backward(model_pb2.kTrain, grad)
                grads.append(grad)
                g_dense_w += gwb[0]
                g_dense_b += gwb[1]
                # print output.l1(), act.l1()
            utils.update_progress(
                b * 1.0 / data.num_train_batch,
                'training loss = %f' % (batch_loss / seq_length))
            train_loss += batch_loss
            # empty grads for the hx/cx slots, then BPTT through the LSTM
            grads.append(tensor.Tensor())
            grads.append(tensor.Tensor())
            g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]
            dense_w, dense_b = dense.param_values()
            opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw')
            opt.apply_with_lr(
                epoch, get_lr(epoch), g_dense_w, dense_w, 'dense_w')
            opt.apply_with_lr(
                epoch, get_lr(epoch), g_dense_b, dense_b, 'dense_b')
        print '\nEpoch %d, train loss is %f' % \
            (epoch, train_loss / data.num_train_batch / seq_length)
        # evaluation on the validation batches (kEval mode, no updates)
        eval_loss = 0
        for b in range(data.num_test_batch):
            batch = data.val_dat[b * batch_size: (b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())
            outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2]
            for output, label in zip(outputs, labels):
                output = dense.forward(model_pb2.kEval, output)
                eval_loss += lossfun.forward(model_pb2.kEval, output,
                                             label).l1()
        print 'Epoch %d, evaluation loss is %f' % \
            (epoch, eval_loss / data.num_test_batch / seq_length)
        if (epoch + 1) % 30 == 0:
            # checkpoint the file model
            with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd:
                print 'saving model to %s' % model_path
                d = {}
                for name, w in zip(
                        ['rnn_w', 'dense_w', 'dense_b'],
                        [rnn_w, dense_w, dense_b]):
                    # snapshot on the host, then move back to the GPU
                    w.to_host()
                    d[name] = tensor.to_numpy(w)
                    w.to_device(cuda)
                d['idx_to_char'] = data.idx_to_char
                d['char_to_idx'] = data.char_to_idx
                d['hidden_size'] = hidden_size
                d['num_stacks'] = num_stacks
                d['dropout'] = dropout
                pickle.dump(d, fd)
def train(self, data, max_epoch, model_path='model'):
    """Train self.net on one-hot encoded sequences with SGD (Python 2).

    `data` unpacks into (train_x, train_y, test_x, test_y); inputs are
    converted per batch to (batch_size, maxlen, vocab_size) one-hot arrays
    via the module-level `convert_samples`. Parameters are initialized by
    name ('var' -> 1, 'gamma' -> uniform, 'weight' -> gaussian, else 0).
    Checkpoints are pickled to '<model_path>_<epoch>.bin' on odd epochs and
    the final epoch. Reads self.use_cpu, self.net, self.batch_size,
    self.maxlen, self.vocab_size; sets self.dev.
    """
    if self.use_cpu:
        print 'Using CPU'
        self.dev = device.get_default_device()
    else:
        print 'Using GPU'
        self.dev = device.create_cuda_gpu()

    self.net.to_device(self.dev)
    opt = optimizer.SGD(momentum=0.9, weight_decay=1e-4)
    # opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    # name-based parameter initialization
    for (p, n) in zip(self.net.param_values(), self.net.param_names()):
        if 'var' in n:
            p.set_value(1.0)
        elif 'gamma' in n:
            p.uniform(0, 1)
        elif 'weight' in n:
            p.gaussian(0, 0.01)
        else:
            p.set_value(0.0)
        print n, p.shape, p.l1()

    # device-resident buffers reused by every batch
    tx = tensor.Tensor((self.batch_size, self.maxlen, self.vocab_size),
                       self.dev)
    ty = tensor.Tensor((self.batch_size, ), self.dev, core_pb2.kInt)
    train_x, train_y, test_x, test_y = data
    num_train_batch = train_x.shape[0] / self.batch_size
    num_test_batch = test_x.shape[0] / self.batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)
    for epoch in range(max_epoch):
        np.random.shuffle(idx)
        loss, acc = 0.0, 0.0
        print '\nEpoch %d' % epoch
        start = time()
        for b in range(num_train_batch):
            batch_loss, batch_acc = 0.0, 0.0
            grads = []
            x = train_x[
                idx[b * self.batch_size:(b + 1) * self.batch_size]]
            # x.shape = (batch_size, maxlen)
            y = train_y[idx[b * self.batch_size:(b + 1) * self.batch_size]]
            # y.shape = (batch_size,)
            # for input as (batch_size, max_len, vocab_size)
            sam_arrs = convert_samples(x, x.shape[1], self.vocab_size,
                                       self.dev)
            tx.copy_from_numpy(sam_arrs)
            ty.copy_from_numpy(np.array(y, dtype='int32'))
            grads, (batch_loss, batch_acc) = self.net.train(tx, ty)
            for (s, p, g) in zip(self.net.param_names(),
                                 self.net.param_values(), grads):
                opt.apply_with_lr(epoch, get_lr(epoch), g, p, str(s), b)
            # update progress bar
            utils.update_progress(
                b * 1.0 / num_train_batch,
                'training loss = %f, accuracy = %f' % (batch_loss,
                                                       batch_acc))
            loss += batch_loss
            acc += batch_acc
        print "\ntraining time = ", time() - start
        info = 'training loss = %f, training accuracy = %f, lr = %f' \
            % (loss / num_train_batch, acc / num_train_batch,
               get_lr(epoch))
        print info
        # test pass over sequential batches
        loss, acc = 0.0, 0.0
        start = time()
        for b in range(num_test_batch):
            batch_loss, batch_acc = 0.0, 0.0
            x = test_x[b * self.batch_size:(b + 1) * self.batch_size]
            # x.shape = (batch_size, maxlen)
            y = test_y[b * self.batch_size:(b + 1) * self.batch_size]
            sam_arrs = convert_samples(x, x.shape[1], self.vocab_size,
                                       self.dev)
            tx.copy_from_numpy(sam_arrs)
            ty.copy_from_numpy(np.array(y, dtype='int32'))
            # NOTE(review): this calls net.train (train mode, computes
            # gradients) rather than net.evaluate; no update is applied, but
            # confirm whether evaluate() was intended here.
            grads, (batch_loss, batch_acc) = self.net.train(tx, ty)
            loss += batch_loss
            acc += batch_acc
        print "evaluation time = ", time() - start
        print 'test loss = %f, test accuracy = %f \n' \
            % (loss / num_test_batch, acc / num_test_batch)
        if (epoch % 2) == 1 or epoch + 1 == max_epoch:
            # checkpoint the file model
            with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd:
                print 'saving model to %s_%d.bin' % (model_path, epoch)
                d = {}
                for name, w in zip(self.net.param_names(),
                                   self.net.param_values()):
                    # snapshot on the host, then move back to the device
                    w.to_host()
                    d[name] = tensor.to_numpy(w)
                    w.to_device(self.dev)
                pickle.dump(d, fd)
def train(self, data_path, max_epoch, model_path='model'):
    """Train the embed->LSTM->dense binary classifier per-sample (Python 2).

    Each training sample (variable length, padded) is pushed individually
    through the embedding and an LSTM unrolled to self.seq_length steps; the
    dense layer reads the hidden state at the sample's true last step
    (`lens` from rm_padding). Gradients are accumulated over the batch and
    applied once per batch with RMSProp (L2 gradient-norm constraint 5).
    Evaluation runs whole batches through kEval mode. Checkpoints are
    pickled to '<model_path>_<epoch>.bin' every 2 epochs and at the end.
    Reads self.embed/lstm/dense/loss/sft, self.dev, self.batchsize,
    self.seq_length, self.vocab_size, self.embed_size, self.hidden_size.
    NOTE(review): `data_path` is unused -- load_sample() is hard-coded below.
    """
    # SGD with L2 gradient normalization
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    #opt = optimizer.SGD(momentum=0.9, weight_decay=5e-4)
    # initialize embedding layer
    embed_w = self.embed.param_values()[0]
    embed_b = self.embed.param_values()[1]
    #initializer.uniform(embed_w, 0, embed_w.shape[1])
    embed_w.uniform(-0.08, 0.08)
    embed_b.set_value(0)
    print 'embed weight l1 = %f' % (embed_w.l1())
    print 'embed b l1 = %f' % (embed_b.l1())
    # initialize lstm layer
    lstm_w = self.lstm.param_values()[0]
    lstm_w.uniform(-0.08, 0.08)  # init all lstm parameters
    print 'lstm weight l1 = %f' % (lstm_w.l1())
    # initialize dense layer
    dense_w = self.dense.param_values()[0]
    dense_b = self.dense.param_values()[1]
    dense_w.uniform(-0.1, 0.1)
    dense_b.set_value(0)
    print 'dense w ', dense_w.shape
    print 'dense b ', dense_b.shape
    print 'dense weight l1 = %f' % (dense_w.l1())
    print 'dense b l1 = %f' % (dense_b.l1())
    start = time()
    # sample loader is hard-coded; the full-corpus loader is disabled
    train_dat, train_label, val_dat, val_label = load_sample()
    #train_dat, train_label, val_dat, val_label = load_corpus(data_path)
    # labels become 2-class one-hot rows
    train_label = word2onehot(train_label, 2)
    val_label = word2onehot(val_label, 2)
    print 'loading time:', time() - start
    print "train data shape:", train_dat.shape, \
        "train label shape:", train_label.shape
    print "val data shape:", val_dat.shape, \
        "val label shape:", val_label.shape
    for epoch in range(max_epoch):
        train_loss = 0
        num_train_batch = train_dat.shape[0] / self.batchsize
        glb_acc = 0
        for b in range(num_train_batch):
            start = time()
            # load training data
            inputs_arr = train_dat[b * self.batchsize:
                                   (b + 1) * self.batchsize]
            labels = train_label[b * self.batchsize:
                                 (b + 1) * self.batchsize]
            # true (unpadded) length of every sample in the batch
            lens = rm_padding(inputs_arr)
            acc = 0
            batch_loss = 0.0
            # fresh zeroed gradient accumulators for this batch
            g_dense_w = tensor.Tensor(dense_w.shape, self.dev)
            g_dense_w.set_value(0)
            g_dense_b = tensor.Tensor(dense_b.shape, self.dev)
            g_dense_b.set_value(0)
            g_lstm_w = tensor.Tensor(lstm_w.shape, self.dev)
            g_lstm_w.set_value(0)
            g_embed_w = tensor.Tensor(embed_w.shape, self.dev)
            g_embed_w.set_value(0)
            # per-sample forward/backward; gradients summed into g_* buffers
            for idx_sam in range(len(inputs_arr)):
                sam_arr = inputs_arr[idx_sam]
                sam_arr = convert_sample(sam_arr, sam_arr.shape[0],
                                         self.vocab_size, self.dev)
                sample = tensor.from_numpy(sam_arr)
                sample.to_device(self.dev)
                #print sample.shape
                embed = self.embed.forward(model_pb2.kTrain, sample)
                #print embed.shape is (53, 128)
                # embed.shape[0] means the sequence length of the sample
                # slice the embedding row-by-row into per-step tensors;
                # steps past the sample's length get empty placeholders
                embeded = []
                for idx_seq in range(self.seq_length):
                    if idx_seq >= embed.shape[0]:
                        embeded.append(tensor.Tensor())
                    else:
                        seq = tensor.Tensor((1, embed.shape[1]), self.dev)
                        tensor.copy_data_to_from(seq, embed, embed.shape[1],
                                                 0, idx_seq * embed.shape[1])
                        embeded.append(seq)
                embeded.append(tensor.Tensor())  # hx
                embeded.append(tensor.Tensor())  # cx
                #print 'forward embedding time:', time() -start
                #print tensor.to_numpy(embeded[self.seq_length-1])
                # forward lstm layer
                hidden = self.lstm.forward(model_pb2.kTrain, embeded)
                # outputs are [y1, ..., yn, hx, cx], only need the last
                # output as the predicted latent vector
                #print len(hidden), hidden[embed.shape[0]-1]
                #print [hidden[i].l1() for i in range(len(hidden))]
                # forward dense and loss layer
                act = self.dense.forward(model_pb2.kTrain,
                                         hidden[lens[idx_sam] - 1])
                label = tensor.from_numpy(labels[idx_sam])
                label.to_device(self.dev)
                lvalue = self.loss.forward(model_pb2.kTrain, act, label)
                #print 'forward dense time:', time() - start
                # softmax (eval mode) only to derive the predicted class
                regularized_act = self.sft.forward(model_pb2.kEval, act)
                pred = tensor.to_numpy(regularized_act)
                gt = labels[idx_sam][1]
                if (gt and pred[0, 1] > pred[0, 0]) or \
                        (gt == 0 and pred[0, 1] <= pred[0, 0]):
                    acc += 1
                grads = []
                batch_loss += lvalue.l1() / self.batchsize
                #print batch_loss
                start = time()
                # backward loss and dense layer
                grad = self.loss.backward() / self.batchsize
                grad, gwb = self.dense.backward(model_pb2.kTrain, grad)
                g_dense_w += gwb[0]
                g_dense_b += gwb[1]
                #print 'dense_w l1 = %f' % (gwb[0].l1())
                # gradient flows into the LSTM only at the last real step;
                # every other step gets a zero tensor of the same shape
                for i in range(self.seq_length):
                    if i == lens[idx_sam] - 1:
                        grads.append(grad)
                    else:
                        emp = tensor.Tensor(grad.shape, self.dev)
                        emp.set_value(0)
                        grads.append(emp)
                grads.append(tensor.Tensor())
                grads.append(tensor.Tensor())
                # backward lstm layer
                lstm_input_grad, lstm_param_grad = self.lstm.backward(
                    model_pb2.kTrain, grads)
                g_lstm_w += lstm_param_grad[0]
                #print 'lstm_input l1 = %f' % (lstm_input_grad[0].l1())
                #print 'backward lstm'
                # re-pack the per-step input grads into one (seq, embed) blob
                embed_grad = tensor.Tensor(embed.shape, self.dev)
                for idx in range(len(lstm_input_grad) - 2):
                    tensor.copy_data_to_from(embed_grad,
                                             lstm_input_grad[idx],
                                             embed.shape[1],
                                             idx * embed.shape[1], 0)
                _, grad_w = self.embed.backward(model_pb2.kTrain, embed_grad)
                #print 'backward embedding time:', time() - start
                #print 'embed weight l1 = %f' % (grad_w[0].l1())
                g_embed_w += grad_w[0]
            train_loss += batch_loss
            glb_acc += acc
            utils.update_progress(
                b * 1.0 / num_train_batch,
                'training loss = %f, acc = %f' % (batch_loss,
                                                  acc * 1.0 / self.batchsize))
            # one parameter update per batch with the accumulated gradients
            opt.apply_with_lr(epoch, get_lr(epoch), g_lstm_w, lstm_w,
                              'lstm_w')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w,
                              'dense_w')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b,
                              'dense_b')
            opt.apply_with_lr(epoch, get_lr(epoch), g_embed_w, embed_w,
                              'embed_w')
            #opt.apply_with_lr(epoch, get_lr(epoch), grad_w[1], embed_b, 'embed_b')
        print '\nEpoch %d, train loss is %f, acc = %f' % \
            (epoch, train_loss / num_train_batch, glb_acc * 1.
             / (self.batchsize * num_train_batch))
        # evaluation
        eval_loss = 0
        val_acc = 0
        num_test_batch = min(5000, val_dat.shape[0] / self.batchsize)
        for b in range(num_test_batch):
            acc = 0
            val_arr = val_dat[b * self.batchsize: (b + 1) * self.batchsize]
            labels = val_label[b * self.batchsize:
                               (b + 1) * self.batchsize]
            lens = rm_padding(val_arr)
            # whole batch converted at once: (seq, batch, vocab) -> flattened
            val_arr = convert(val_arr, self.batchsize, self.seq_length,
                              self.vocab_size, self.dev)
            val_arr = np.swapaxes(val_arr, 0, 1).reshape((
                self.batchsize * self.seq_length, self.vocab_size))
            inputs = tensor.from_numpy(val_arr)
            inputs.to_device(self.dev)
            # shape (128*53, 33366)
            embed = self.embed.forward(model_pb2.kEval, inputs)
            embed.reshape((self.seq_length, self.batchsize,
                           self.embed_size))
            # split the embedding into per-step (batch, embed_size) tensors
            embeded = []
            for idx in range(self.seq_length):
                embed_seq = tensor.Tensor((self.batchsize, self.embed_size),
                                          self.dev)
                tensor.copy_data_to_from(
                    embed_seq, embed, self.batchsize * self.embed_size, 0,
                    idx * self.batchsize * self.embed_size)
                embeded.append(embed_seq)
            embeded.append(tensor.Tensor())  # hx
            embeded.append(tensor.Tensor())  # cx
            hidden = self.lstm.forward(model_pb2.kEval, embeded)
            # gather each sample's hidden state at its true last step
            hidden_batch = tensor.Tensor((self.batchsize, self.hidden_size),
                                         self.dev)
            for idx in range(self.batchsize):
                tensor.copy_data_to_from(hidden_batch, hidden[lens[idx] - 1],
                                         self.hidden_size,
                                         idx * self.hidden_size,
                                         idx * self.hidden_size)
            act = self.dense.forward(model_pb2.kEval, hidden_batch)
            labels = tensor.from_numpy(labels)
            labels.to_device(self.dev)
            eval_loss += self.loss.forward(model_pb2.kEval, act,
                                           labels).l1()
            regularized_act = self.sft.forward(model_pb2.kEval, act)
            pred = tensor.to_numpy(regularized_act)
            gt = tensor.to_numpy(labels)[:, 1]
            for i in range(self.batchsize):
                if (gt[i] and pred[i, 1] > pred[i, 0]) or \
                        (gt[i] == 0 and pred[i, 1] <= pred[i, 0]):
                    acc += 1
            #print 'acc = %f' % (acc * 1. / self.batchsize)
            val_acc += acc
        print 'Epoch %d, evaluation loss is %f, acc = %f' % \
            (epoch, eval_loss / num_test_batch, val_acc * 1.
             / (num_test_batch * self.batchsize))
        # model saving
        if (epoch + 1) % 2 == 0 or epoch + 1 == max_epoch:
            print 'dense weight l1 = %f' % (dense_w.l1())
            print 'dense bias l1 = %f' % (dense_b.l1())
            print 'lstm weight l1 = %f' % (lstm_w.l1())
            print 'embed weight l1 = %f' % (embed_w.l1())
            # checkpoint the file model
            with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd:
                print 'saving model to %s' % model_path
                d = {}
                for name, w in zip(
                        ['embed_w', 'embed_b', 'lstm_w', 'dense_w',
                         'dense_b'],
                        [embed_w, embed_b, lstm_w, dense_w, dense_b]):
                    # snapshot on the host, then move back to the device
                    w.to_host()
                    d[name] = tensor.to_numpy(w)
                    w.to_device(self.dev)
                '''d['idx_to_char'] = data.idx_to_char
                d['char_to_idx'] = data.char_to_idx
                d['hidden_size'] = hidden_size
                d['num_stacks'] = num_stacks
                d['dropout'] = dropout'''
                pickle.dump(d, fd)