def train(lmWithRNN, gateModel, args, trainData, validData):
    if args.gpu >= 0:
        lmWithRNN.lmNet.to_gpu()
        gateModel.gateModel.to_gpu()
    opt = O.Adam(alpha=0.001)
    opt.setup(gateModel.gateModel)
    opt.add_hook(chainer.optimizer.GradientClipping(args.maxGrad))
    if args.weightDecay > 0:
        opt.add_hook(chainer.optimizer.WeightDecay(args.weightDecay))
    bestperp = np.inf
    for epoch in range(args.epoch):
        epochStart = time.time()
        totalloss = 0
        finishnum = 0
        # decay the Adam step size with the square root of the epoch number
        lr_decay = np.sqrt(epoch + 1)
        opt.alpha = 0.001 / lr_decay
        print('Learning rate: %.6f' % (opt.alpha))
        if lmWithRNN.modelType == 'RHN':
            # RHN carries its recurrent state across minibatches within an epoch
            prevHidden = [
                chainer.Variable(
                    xp.zeros((args.batch, lmWithRNN.dim)).astype(np.float32))
                for _ in range(lmWithRNN.layerNum)
            ]
        else:
            prevHidden = None
        for current_words, next_words in make_batch(trainData, args.batch,
                                                    args.step):
            lmWithRNN.lmNet.cleargrads()
            gateModel.gateModel.cleargrads()
            loss, prevHidden = train_with_batch(current_words, next_words,
                                                lmWithRNN, gateModel, args,
                                                prevHidden)
            loss.backward()
            loss.unchain_backward()
            opt.update()
            totalloss += float(F.sum(loss).data) * current_words.shape[0]
            finishnum += current_words.shape[0] * current_words.shape[1]
            sys.stderr.write('\r Finished %s' % finishnum)
        sys.stderr.write('\n')
        epochEnd = time.time()
        # evaluate without building a computation graph and with dropout disabled
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            validloss, validperp = valid_with_batch(validData, lmWithRNN,
                                                    gateModel)
        sys.stderr.write('Train time is %s\tValid time is %s\n' %
                         (epochEnd - epochStart, time.time() - epochEnd))
        sys.stdout.write(
            'Epoch: %s\tTrain loss: %.6f\tValid loss: %.6f\tValid perplexity: %.6f\n'
            % (epoch, totalloss / finishnum, validloss, validperp))
        sys.stdout.flush()
        if validperp < bestperp:
            # keep only the gate parameters with the best validation perplexity so far
            gateOutputFile = args.output + '.bin'
            S.save_npz(gateOutputFile,
                       copy.deepcopy(gateModel.gateModel).to_cpu())
            bestperp = validperp
def train(lmWithRNN, args, trainData, validData):
    if args.gpu >= 0:
        lmWithRNN.lmNet.to_gpu()
    if args.WT:
        # weight tying: make the output layer share the embedding matrix
        lmWithRNN.lmNet.Output.W.data = lmWithRNN.lmNet.Embed.W.data
    opt = O.SGD(args.lr)
    opt.setup(lmWithRNN.lmNet)
    opt.add_hook(chainer.optimizer.GradientClipping(args.maxGrad))
    opt.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
    prevvalidperp = np.inf
    prevModel = None
    for epoch in range(args.epoch):
        epochStart = time.time()
        # keep the learning rate constant until decayEpoch, then decay it geometrically
        lr_decay = args.decay ** max(epoch + 1 - args.decayEpoch, 0.0)
        opt.lr = args.lr * lr_decay
        sys.stdout.write('Learning rate: %.6f\n' % (opt.lr))
        totalloss = 0
        finishnum = 0
        prevHiddenList = [
            chainer.Variable(
                xp.zeros((args.batch, args.dim)).astype(np.float32))
            for _ in range(lmWithRNN.layerNum)
        ]
        for current_words, next_words in make_batch(trainData, args.batch,
                                                    args.step):
            lmWithRNN.lmNet.cleargrads()
            loss, prevHiddenList = train_with_batch(current_words, next_words,
                                                    lmWithRNN, args,
                                                    prevHiddenList)
            loss.backward()
            loss.unchain_backward()
            opt.update()
            totalloss += float(F.sum(loss).data) * current_words.shape[0]
            finishnum += current_words.shape[0] * current_words.shape[1]
            sys.stderr.write('\r Finished %s' % finishnum)
        sys.stderr.write('\n')
        epochEnd = time.time()
        validloss, validperp = valid_with_batch(validData, lmWithRNN)
        sys.stdout.write('Train time is %s\tValid time is %s\n' %
                         (epochEnd - epochStart, time.time() - epochEnd))
        sys.stdout.write(
            'Epoch: %s\tTrain loss: %.6f\tValid loss: %.6f\tValid perplexity: %.6f\n'
            % (epoch, totalloss / finishnum, validloss, validperp))
        sys.stdout.flush()
        if prevvalidperp < validperp:
            # perplexity got worse: checkpoint the previous (better) model
            lmOutputFile = args.output + '.epoch%s' % (epoch) + '.bin'
            S.save_npz(lmOutputFile, prevModel)
        prevModel = copy.deepcopy(lmWithRNN.lmNet).to_cpu()
        prevvalidperp = validperp
    lmOutputFile = args.output + '.epoch%s_fin' % (epoch + 1) + '.bin'
    S.save_npz(lmOutputFile, prevModel)
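# Hedged usage sketch (not part of the original script): the train() functions
# above read an argparse-style namespace.  The attribute names below are the
# ones the functions actually access; the flag spellings and default values
# are assumptions for illustration, not the repository's canonical interface.
def build_args_sketch():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=-1)          # GPU id, -1 runs on CPU
    parser.add_argument('--batch', type=int, default=20)        # minibatch size
    parser.add_argument('--step', type=int, default=35)         # truncated BPTT length
    parser.add_argument('--epoch', type=int, default=40)        # number of training epochs
    parser.add_argument('--dim', type=int, default=650)         # hidden state size
    parser.add_argument('--lr', type=float, default=1.0)        # initial SGD learning rate
    parser.add_argument('--decay', type=float, default=0.85)    # per-epoch decay factor
    parser.add_argument('--decayEpoch', type=int, default=14)   # epoch at which decay starts
    parser.add_argument('--maxGrad', type=float, default=5.0)   # gradient clipping threshold
    parser.add_argument('--weight_decay', type=float, default=1e-7)
    parser.add_argument('--WT', action='store_true')            # tie embedding and output weights
    parser.add_argument('--output', default='model')            # prefix for saved .bin files
    return parser.parse_args()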
def valid_with_batch(validData, lmWithRNN):
    batchsize = 50
    prevHiddenList = [
        chainer.Variable(
            xp.zeros((batchsize, lmWithRNN.dim)).astype(np.float32))
        for _ in range(lmWithRNN.layerNum)
    ]
    loss = 0
    totalins = len(validData) - 1
    for current_words, next_words in make_batch(validData, batchsize, 1000000):
        for index in range(current_words.shape[1]):
            wordIndex = current_words[:, index]
            y, prevHiddenList = lmWithRNN.compute_forward(wordIndex,
                                                          prevHiddenList)
            loss += F.softmax_cross_entropy(y, next_words[:, index]) * batchsize
    loss = float(F.sum(loss).data) / totalins
    perp = np.exp(loss)
    return loss, perp
def valid_with_batch(validData, lmWithRNN, gateModel):
    batchsize = 64
    totalins = len(validData) - 1
    loss = 0
    if lmWithRNN.modelType == 'RHN':
        prevHidden = [
            chainer.Variable(
                xp.zeros((batchsize, lmWithRNN.dim)).astype(np.float32))
            for _ in range(lmWithRNN.layerNum)
        ]
    else:
        prevHidden = None
    for current_words, next_words in make_batch(validData, batchsize, 100000):
        for index in range(current_words.shape[1]):
            wordIndex = current_words[:, index]
            rnn_out, prevHidden = lmWithRNN.compute_forward(
                wordIndex, prevHidden)
            # modulate the RNN output with a gate computed from the current input word
            gate = gateModel.compute_gate(wordIndex)
            y = gate * rnn_out
            loss += F.softmax_cross_entropy(y, next_words[:, index]) * batchsize
    loss = float(F.sum(loss).data) / totalins
    perp = np.exp(loss)
    return loss, perp
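# Hedged illustration (not part of the original scripts): every loop above
# consumes (current_words, next_words) pairs from make_batch(), which is
# defined elsewhere in this repository.  The sketch below only shows the data
# layout those loops assume: both arrays hold int32 word ids with shape
# (batch, step), and next_words is current_words shifted by one token, so
# column `index` of next_words is the prediction target for column `index`
# of current_words.
def make_batch_sketch(data, batch, step):
    data = np.asarray(data, dtype=np.int32)
    length = (len(data) - 1) // batch                # tokens per parallel stream
    inputs = data[:batch * length].reshape(batch, length)
    targets = data[1:batch * length + 1].reshape(batch, length)
    for start in range(0, length, step):             # yield truncated-BPTT chunks
        yield inputs[:, start:start + step], targets[:, start:start + step]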