def main():
    """Train a character-level LSTM language model with truncated BPTT.

    Parses command-line arguments, loads the text corpus, then runs the
    training loop, periodically evaluating on a validation slice, logging
    losses (converted from nats to bits per character), and checkpointing
    the model to ``<out>/model``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--bproplen', '-l', type=int, default=200,
                        help='Number of words in each mini-batch '
                             '(= length of truncated BPTT)')
    parser.add_argument('--epoch', '-e', type=int, default=40,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--file', default="enwik8",
                        help='path to text file for training')
    parser.add_argument('--unit', '-u', type=int, default=2800,
                        help='Number of LSTM units')
    parser.add_argument('--embd', type=int, default=400,
                        help='Number of embedding units')
    parser.add_argument('--hdrop', type=float, default=0.2,
                        help='hidden state dropout (variational)')
    parser.add_argument('--edrop', type=float, default=0.5,
                        help='embedding dropout')
    args = parser.parse_args()

    nembd = args.embd
    # Number of training iterations per model save, log write, and
    # validation-set evaluation.
    interval = 100
    pdrop = args.hdrop
    pdrope = args.edrop
    # Initial learning rate.
    alpha0 = .001
    # Inverse of linear decay rate towards 0.
    dec_it = 12*9000
    # Minimum learning rate (floor of the linear decay).
    alpha_min = .00007
    # First ntrain symbols of the dataset are used for training.
    ntrain = 90000000
    seqlen = args.bproplen
    nbatch = args.batchsize
    filename = args.file

    # FIX: the --gpu flag advertises a CPU mode (negative id), but the
    # original loop used cupy unconditionally and crashed on CPU.  Pick
    # the array module once and guard every device transfer with it.
    use_gpu = args.gpu >= 0
    xp = cp if use_gpu else np

    text, mapping = get_char(filename)
    sequence = np.array(text).astype(np.int32)
    itrain = sequence[0:ntrain]
    ttrain = sequence[1:ntrain+1]   # targets = inputs shifted by one symbol
    fullseql = int(ntrain/nbatch)
    # Lay the stream out as nbatch parallel sequences for batched BPTT.
    itrain = itrain.reshape(nbatch, fullseql)
    ttrain = ttrain.reshape(nbatch, fullseql)
    # Doesn't use the full validation set.
    nval = 500000
    ival = sequence[ntrain:ntrain+nval]
    tval = sequence[ntrain+1:ntrain+nval+1]
    ival = ival.reshape(ival.shape[0]//1000, 1000)
    tval = tval.reshape(tval.shape[0]//1000, 1000)
    nvocab = max(sequence) + 1  # train is just an array of integers
    print('#vocab =', nvocab)

    # Prepare an RNNLM model.
    rnn = RNNForLM(nvocab, args.unit, args.embd)
    model = L.Classifier(rnn)
    model.compute_accuracy = False  # we only want the perplexity
    if use_gpu:
        chainer.cuda.get_device(args.gpu).use()  # make the GPU current
        model.to_gpu()

    # Set up an optimizer.
    optimizer = Adam(alpha=alpha0)
    optimizer.setup(model)

    resultdir = args.out
    print('starting')
    nepoch = args.epoch
    start = 0
    loss_sum = 0
    if not os.path.isdir(resultdir):
        os.mkdir(resultdir)

    logpath = os.path.join(resultdir, 'log')
    vloss = test(rnn, ival, tval)
    # 1.4427 ~= log2(e): converts nats to bits.
    vloss = 1.4427*vloss
    # FIX: use a context manager for the log file, and label the loss
    # bits/char — this is a character-level model (was "bits/word").
    with open(logpath, 'w') as f:
        f.write("Initial Validation loss (bits/char): " + str(vloss) + '\n')

    i = 0
    epoch_num = 0
    it_num = 0
    while True:
        # Take the next BPTT window; wrap around at the end of an epoch.
        fin = start+seqlen
        if fin > itrain.shape[1]:
            start = 0
            fin = start+seqlen
            epoch_num = epoch_num+1
            if epoch_num == nepoch:
                break
        inputs = itrain[:, start:fin]
        targets = ttrain[:, start:fin]
        start = fin
        inputs = Variable(inputs)
        targets = Variable(targets)
        if use_gpu:  # FIX: transfer only when a GPU is actually in use
            targets.to_gpu()
            inputs.to_gpu()
        it_num += 1
        loss = 0
        rnn.applyWN()
        # Make the hidden-state dropout mask (variational: one mask per
        # sequence, reused at every timestep), inverse-scaled so the
        # expected activation is unchanged.
        mask = xp.zeros((inputs.shape[0], args.unit), dtype=xp.float32)
        ind = xp.nonzero(xp.random.rand(inputs.shape[0], args.unit) > pdrop)
        mask[ind] = 1/(1-pdrop)
        # Make the embedding dropout mask the same way.
        mask2 = xp.zeros((inputs.shape[0], nembd), dtype=xp.float32)
        ind = xp.nonzero(xp.random.rand(inputs.shape[0], nembd) > pdrope)
        mask2[ind] = 1/(1-pdrope)
        # Forward pass over the truncated-BPTT window, accumulating the
        # per-timestep cross entropy.
        for j in range(seqlen):
            output = rnn(inputs[:, j], mask, mask2)
            loss = loss + F.softmax_cross_entropy(output, targets[:, j])
        loss = loss/seqlen
        # Zero all gradients before updating them.
        rnn.zerograds()
        loss_sum += loss.data
        # Calculate and update all gradients.
        loss.backward()
        # Use the optimizer to move all parameters of the network
        # to values which will reduce the loss.
        optimizer.update()
        # Decay the learning rate linearly, clamped at alpha_min.
        optimizer.alpha = alpha0*(dec_it-it_num)/float(dec_it)
        if optimizer.alpha < alpha_min:
            optimizer.alpha = alpha_min
        # Truncate the computation graph so BPTT stops at this window.
        loss.unchain_backward()
        if ((i+1) % interval) == 0:
            rnn.reset_state()
            vloss = test(rnn, ival, tval)
            # Convert accumulated losses from nats to bits.
            vloss = 1.4427*vloss
            loss_sum = 1.4427*loss_sum/interval
            serializers.save_npz(os.path.join(resultdir, 'model'), rnn)
            outstring = ("Training iteration: " + str(i+1) +
                         " Training loss (bits/char): " + str(loss_sum) +
                         " Validation loss (bits/char): " + str(vloss) + '\n')
            with open(logpath, 'a') as f:
                f.write(outstring)
            print("Training iteration: " + str(i+1))
            print('training loss: ' + str(loss_sum))
            print('validation loss: ' + str(vloss))
            loss_sum = 0
        i += 1
def train(source_bpe, target_bpe, source_glove, target_glove, chunk_length,
          batch_size, warmup_steps, save_decimation, num_steps, gpu_id, out,
          log_level):
    """Train a Transformer on a BPE-encoded parallel corpus.

    Progress is mirrored to stdout and ``<out>/training.log``.  A checkpoint
    is written every ``save_decimation`` steps and once more on exit, even if
    training is interrupted by an exception.
    """
    if not os.path.exists(out):
        os.makedirs(out)

    # Send every log record both to the console and to a file under `out`.
    level = getattr(logging, log_level)
    plain = logging.Formatter('%(message)s')
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(level)
    console_handler.setFormatter(plain)
    logfile_handler = logging.FileHandler(
        filename=os.path.join(out, 'training.log'), mode='a')
    logfile_handler.setLevel(level)
    logfile_handler.setFormatter(plain)
    logger.addHandler(console_handler)
    logger.addHandler(logfile_handler)
    logger.setLevel(level)

    # Normalise the device id: None means "no GPU requested".
    gpu_id = -1 if gpu_id is None else gpu_id
    device_name = f'@cupy:{gpu_id}' if gpu_id >= 0 else '@intel64'

    with chainer.using_device(device_name):
        source_vocab = make_vocab(source_glove)
        target_vocab = make_vocab(target_glove)
        output_model_dim = target_vocab.embedding_size

        dataset = make_dataset(source_bpe, target_bpe, source_vocab,
                               target_vocab, chunk_length)
        iterator = MultithreadIterator(dataset, batch_size)
        state = TrainingState()
        model = Transformer(source_vocab, target_vocab)
        model.to_gpu(gpu_id)
        optimizer = Adam(beta1=0.99, beta2=0.98, eps=1e-9).setup(model)
        # Resume from any checkpoint previously written to `out`.
        load_training(out, model, optimizer, state)

        try:
            for step_index, minibatch in enumerate(iterator):
                if step_index >= num_steps:
                    break
                if (step_index + 1) % save_decimation == 0:
                    save_training(out, model, optimizer, state)

                model.cleargrads()
                gc.collect()

                source, target = stack_nested(minibatch)
                source.token_ids.to_gpu(gpu_id)
                source.masks.to_gpu(gpu_id)
                target.token_ids.to_gpu(gpu_id)
                target.masks.to_gpu(gpu_id)

                output_probs = model.train_forward(
                    source.token_ids, target.token_ids,
                    input_masks=source.masks, output_masks=target.masks)

                # Per-token cross entropy over the flattened batch*time axis.
                per_token_loss = F.softmax_cross_entropy(
                    F.reshape(output_probs,
                              (output_probs.shape[0] * output_probs.shape[1],
                               output_probs.shape[2])),
                    F.reshape(target.token_ids,
                              (target.token_ids.shape[0] *
                               target.token_ids.shape[1], )),
                    reduce='no')

                # Weight 1 where target.masks is False, 0 where it is True.
                # NOTE(review): masks appear to flag padding positions —
                # confirm against stack_nested / the dataset builder.
                keep_weights = xp.reshape(
                    xp.logical_not(target.masks.array).astype(xp.float32),
                    (target.masks.shape[0] * target.masks.shape[1], ))
                loss = (F.sum(per_token_loss * keep_weights) /
                        F.sum(keep_weights))
                loss.backward()

                # Noam-style schedule: linear warmup for `warmup_steps`
                # steps, inverse-sqrt decay afterwards, scaled by the
                # model dimension.
                lr = (output_model_dim**-0.5) * min(
                    (state.step**-0.5), state.step * (warmup_steps**-1.5))
                optimizer.alpha = lr
                optimizer.update()

                logger.info(
                    f'time = {int(time.time())} | step = {state.step} | loss = {float(loss.array)} | lr = {lr}'
                )
                state.step += 1
        finally:
            # Always persist weights, optimizer state, and the step counter.
            save_training(out, model, optimizer, state)