def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros,
                                   batch_size=args.batch_size,
                                   ctx=context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            data, target = get_batch(train_data, i)
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [i.grad(context) for i in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt size to balance it.
            gluon.utils.clip_global_norm(grads,
                                         args.clip * args.bptt * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f' % (
                    epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {'learning_rate': args.lr,
                                            'momentum': 0,
                                            'wd': 0})
            model.collect_params().load(args.save, context)
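# A minimal sketch (not from the script above; values are hypothetical) of the
# arithmetic behind the clipping comment: backward() runs on the un-normalized
# loss vector, so the gradient is a sum over bptt * batch_size token positions
# and its norm is roughly that many times the per-token norm. Scaling max_norm
# by the same factor keeps the effective per-token threshold at args.clip.
bptt, batch_size, clip = 35, 32, 0.25
max_norm = clip * bptt * batch_size              # value passed to clip_global_norm
assert max_norm / (bptt * batch_size) == clip    # per-token threshold unchanged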
def train():
    best_val = None
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros,
                                   batch_size=args.batch_size,
                                   ctx=context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            data, target = get_batch(train_data, i)
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [i.grad(context) for i in model.collect_params().values()]
            # Here gradient is not divided by batch_size yet.
            # So we multiply max_norm by batch_size to balance it.
            gluon.utils.clip_global_norm(grads, args.clip * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.batch_size / args.bptt / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))
def train():
    total_loss = 0.0
    start_time = time.time()
    hidden = model.begin_state(func=mx.nd.zeros,
                               batch_size=args.batch_size,
                               ctx=context)
    for batch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
        data, target = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = detach(hidden)
        with autograd.record():
            output, hidden = model(data, hidden)
            loss = criterion(output, target)
            loss.backward()

        grads = [p.grad(context) for p in model.collect_params().values()]
        # grad clipping helps prevent the exploding gradient problem in RNNs / LSTMs.
        gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size)

        trainer.step(args.bptt * args.batch_size)
        total_loss += mx.nd.sum(loss).asscalar() / loss.shape[0]

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} '
                  '| ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch + 1, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss,
                      math.exp(cur_loss)))
            total_loss = 0.0
            start_time = time.time()
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros,
                                   batch_size=args.batch_size,
                                   ctx=context)
        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt size to balance it.
            gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            args.lr = args.lr*0.25
            trainer._init_optimizer('sgd', {'learning_rate': args.lr,
                                            'momentum': 0,
                                            'wd': 0})
            model.collect_params().load(args.save, context)
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros,
                                   batch_size=args.batch_size,
                                   ctx=context)
        # add time checkpoint for logging
        before_time = start_time

        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                # Here L is a vector of size batch_size * bptt size
                L = loss(output, target)
                L = L / (args.bptt * args.batch_size)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            gluon.utils.clip_global_norm(grads, args.clip)

            trainer.step(1)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                # log interval latency print
                batch_latency = time.time() - before_time
                before_time = time.time()
                cur_L = total_L / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f, batch_latency %.4f' % (
                    epoch, i, cur_L, math.exp(cur_L), batch_latency))
                total_L = 0.0

        if args.export_model:
            model.export('model')
            return

        val_L = eval(val_data)
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args.save)
            print('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer.set_learning_rate(args.lr)
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros,
                                   batch_size=args.batch_size,
                                   ctx=context)
        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                # Here L is a vector of size batch_size * bptt size
                L = loss(output, target)
                L = L / (args.bptt * args.batch_size)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            gluon.utils.clip_global_norm(grads, args.clip)

            trainer.step(1)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

        if args.export_model:
            model.export('model')
            return

        val_L = eval(val_data)
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            args.lr = args.lr*0.25
            trainer.set_learning_rate(args.lr)
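# This variant pre-divides L by bptt * batch_size, clips against args.clip
# directly, and calls trainer.step(1): gluon.Trainer.step(batch_size) rescales
# gradients by 1/batch_size before the optimizer update, so step(1) applies the
# already-normalized gradients unchanged, whereas the earlier variants
# backward() the summed loss and lean on step(args.batch_size) for that scaling.
# A minimal sketch of step()'s rescaling on a toy one-weight model; the Dense
# layer and numbers below are illustrative only, not part of the script.
import mxnet as mx
from mxnet import autograd, gluon

net = gluon.nn.Dense(1, use_bias=False, weight_initializer=mx.init.Constant(0.0))
net.initialize()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1.0})

x = mx.nd.ones((1, 1))
with autograd.record():
    y = net(x) * 4.0                      # y = 4 * w * x, so dy/dw = 4
y.backward()
trainer.step(4)                           # gradient rescaled by 1/4 before the update
print(net.weight.data().asscalar())       # -1.0 = -lr * (4 / 4)
# trainer.step(1) would have applied the raw gradient and moved w to -4.0 instead.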
###############################################################################
# Build the model
###############################################################################

ntokens = len(vocab)

model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)
if args.hybridize:
    model.hybridize(**hybridize_optional)
model.initialize(mx.init.Xavier(), ctx=context)

compression_params = None if args.gctype == 'none' else {
    'type': args.gctype,
    'threshold': args.gcthreshold
}
trainer = gluon.Trainer(model.collect_params(), 'sgd', {
    'learning_rate': args.lr,
    'momentum': 0,
    'wd': 0
}, compression_params=compression_params)
loss = gluon.loss.SoftmaxCrossEntropyLoss()
if args.hybridize:
    loss.hybridize(**hybridize_optional)

###############################################################################
# Training code
###############################################################################
def train(epochs, ctx):
    best_val = float("Inf")
    for epoch in range(epochs):
        total_L = 0.0
        cur_L = 0.0
        tic = time.time()
        hidden_states = [model.begin_state(func=mx.nd.zeros,
                                           batch_size=args.batch_size // len(ctx),
                                           ctx=ctx[i])
                         for i in range(len(ctx))]
        btic = time.time()
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            # get data batch from the training data
            data_batch, target_batch = get_batch(train_data, i)

            # For RNN we can do within batch multi-device parallelization
            data = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
            target = gluon.utils.split_and_load(target_batch, ctx_list=ctx, batch_axis=1)

            Ls = []
            for (d, t) in zip(data, target):
                # get corresponding hidden state then update hidden
                hidden = detach(hidden_states[d.context.device_id])
                with autograd.record():
                    output, hidden = model(d, hidden)
                    L = loss(output, t.reshape((-1,)))
                    L.backward()
                Ls.append(L)
                # write back to the record
                hidden_states[d.context.device_id] = hidden

            for c in ctx:
                grads = [i.grad(c) for i in model.collect_params().values()]
                # Here gradient is for the whole batch.
                # So we multiply max_norm by batch_size and bptt size to balance it.
                # Also this utility function needs to be applied within the same context
                gluon.utils.clip_global_norm(
                    grads, args.clip * args.bptt * args.batch_size / len(ctx))

            trainer.step(args.batch_size)

            for L in Ls:
                total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                logging.info(
                    '[Epoch %d Batch %d] Speed: %f samples/sec loss %.2f, ppl %.2f' %
                    (epoch, ibatch, args.batch_size / (time.time() - btic),
                     cur_L, math.exp(cur_L)))
                total_L = 0.0
            btic = time.time()

        logging.info('[Epoch %d] train loss %.2f, train ppl %.2f' %
                     (epoch, cur_L, math.exp(cur_L)))
        logging.info('[Epoch %d] time cost %.2f' % (epoch, time.time() - tic))

        val_L = eval(val_data, ctx)
        logging.info('[Epoch %d] valid loss %.2f, valid ppl %.2f' %
                     (epoch, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            # test_L = eval(test_data, ctx)
            model.collect_params().save('model.params')
            # logging.info('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {'learning_rate': args.lr,
                                            'momentum': 0,
                                            'wd': 0})
            model.collect_params().load('model.params', ctx)
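# The per-device bookkeeping above assumes split_and_load shards the
# (bptt, batch_size) batch along axis 1 and that each shard's
# context.device_id indexes hidden_states. A minimal CPU-only sketch of that
# splitting; the shapes are hypothetical, not the script's defaults.
import mxnet as mx
from mxnet import gluon

ctx = [mx.cpu(0), mx.cpu(1)]
batch = mx.nd.zeros((35, 32))                                    # (bptt, batch_size)
shards = gluon.utils.split_and_load(batch, ctx_list=ctx, batch_axis=1)
for d in shards:
    print(d.shape, d.context.device_id)                          # (35, 16) on devices 0 and 1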
    context = [mx.gpu(i) for i in range(args.gpus)]
else:
    context = [mx.cpu(0)]

corpus = data.Corpus(args.data)

args.batch_size *= max(1, args.gpus)
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, args.batch_size)
test_data = batchify(corpus.test, args.batch_size)

n_tokens = len(corpus.dictionary)
model = model.RNNModel(args.model, n_tokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)
trainer = gluon.Trainer(model.collect_params(), 'sgd', {
    'learning_rate': args.lr,
    'momentum': 0,
    'wd': 0
})
loss = gluon.loss.SoftmaxCrossEntropyLoss()

###############################################################################
# Train the model
###############################################################################

def train(epochs, ctx):
    best_val = float("Inf")
    return data


train_data = batchify(corpus.train, args.batch_size).as_in_context(context)
val_data = batchify(corpus.valid, args.batch_size).as_in_context(context)
test_data = batchify(corpus.test, args.batch_size).as_in_context(context)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

compression_params = None if args.gctype == 'none' else {'type': args.gctype,
                                                          'threshold': args.gcthreshold}
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': args.lr,
                         'momentum': 0,
                         'wd': 0},
                        compression_params=compression_params)
loss = gluon.loss.SoftmaxCrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

def get_batch(source, i):
    # Slice a (seq_len, batch_size) chunk of inputs and the targets shifted by one step.
    seq_len = min(args.bptt, source.shape[0] - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len]
    return data, target.reshape((-1,))
def train(epochs, ctx):
    best_val = float("Inf")
    for epoch in range(epochs):
        total_L = 0.0
        start_time = time.time()
        hidden_states = [model.begin_state(func=mx.nd.zeros,
                                           batch_size=args.batch_size // len(ctx),
                                           ctx=ctx[i])
                         for i in range(len(ctx))]
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            # get data batch from the training data
            data_batch, target_batch = get_batch(train_data, i)

            # For RNN we can do within batch multi-device parallelization
            data = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
            target = gluon.utils.split_and_load(target_batch, ctx_list=ctx, batch_axis=1)

            Ls = []
            for (d, t) in zip(data, target):
                # get corresponding hidden state then update hidden
                hidden = detach(hidden_states[d.context.device_id])
                with autograd.record():
                    output, hidden = model(d, hidden)
                    L = loss(output, t.reshape((-1,)))
                    L.backward()
                Ls.append(L)
                # write back to the record
                hidden_states[d.context.device_id] = hidden

            for c in ctx:
                grads = [i.grad(c) for i in model.collect_params().values()]
                # Here gradient is for the whole batch.
                # So we multiply max_norm by batch_size and bptt size to balance it.
                # Also this utility function needs to be applied within the same context
                gluon.utils.clip_global_norm(
                    grads, args.clip * args.bptt * args.batch_size / len(ctx))

            trainer.step(args.batch_size)

            for L in Ls:
                total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                logging.info('[Epoch %d Batch %d] loss %.2f, ppl %.2f' % (
                    epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data, ctx)

        logging.info('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data, ctx)
            model.collect_params().save('model.params')
            logging.info('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {'learning_rate': args.lr,
                                            'momentum': 0,
                                            'wd': 0})
            model.collect_params().load('model.params', ctx)
###############################################################################
# Build the model
###############################################################################

ntokens = len(vocab)

model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)
if args.hybridize:
    model.hybridize(**hybridize_optional)
model.initialize(mx.init.Xavier(), ctx=context)

compression_params = None if args.gctype == 'none' else {'type': args.gctype,
                                                          'threshold': args.gcthreshold}
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': args.lr,
                         'momentum': 0,
                         'wd': 0},
                        compression_params=compression_params)
loss = gluon.loss.SoftmaxCrossEntropyLoss()
if args.hybridize:
    loss.hybridize(**hybridize_optional)

###############################################################################
# Training code
###############################################################################

def detach(hidden):
    # Detach hidden states from the graph so truncated BPTT stops at batch boundaries.
    if isinstance(hidden, (tuple, list)):
        hidden = [i.detach() for i in hidden]
    else:
        hidden = hidden.detach()
    return hidden
model.hybridize()

if args.optimizer == 'SGD':
    trainer_params = {'learning_rate': args.lr,
                      'momentum': 0,
                      'wd': args.wdecay}
elif args.optimizer == 'Adam':
    trainer_params = {'learning_rate': args.lr,
                      'wd': args.wdecay,
                      'beta1': 0,
                      'beta2': 0.999,
                      'epsilon': 1e-9}

trainer = gluon.Trainer(model.collect_params(), args.optimizer, trainer_params)

load_best_loss = float("Inf")
if args.continue_exprm:
    load_model()
    load_best_loss, val_time = evaluate(val_data, eval_batch_size)
    load_best_ppl = math.exp(load_best_loss)
    logging.info("Loaded model: val_time {:5.2f}, valid loss {}, ppl {}".format(
        val_time, load_best_loss, load_best_ppl))

# At any point you can hit Ctrl + C to break out of training early.
# logging.info(model.summary(nd.zeros((args.bptt, m))))
try:
    if not args.predict_only:
    data = data.reshape((batch_size, nbatch)).T
    return data


train_data = batchify(corpus.train, args.batch_size).as_in_context(context)
val_data = batchify(corpus.valid, args.batch_size).as_in_context(context)
test_data = batchify(corpus.test, args.batch_size).as_in_context(context)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

compression_params = None if args.gctype == 'none' else {
    'type': args.gctype,
    'threshold': args.gcthreshold
}
trainer = gluon.Trainer(model.collect_params(), 'sgd', {
    'learning_rate': args.lr,
    'momentum': 0,
    'wd': 0
}, compression_params=compression_params)
loss = gluon.loss.SoftmaxCrossEntropyLoss()

###############################################################################
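# batchify above reshapes the token stream to (batch_size, nbatch) and
# transposes it, so each column of the result is a contiguous slice of the
# corpus and get_batch reads consecutive rows. A minimal sketch with a
# hypothetical ten-token corpus (not from the scripts above).
import mxnet as mx

tokens = mx.nd.arange(10)                        # hypothetical token ids 0..9
batch_size = 2
nbatch = tokens.shape[0] // batch_size           # 5
data = tokens[:nbatch * batch_size].reshape((batch_size, nbatch)).T
print(data.asnumpy())
# [[0. 5.]
#  [1. 6.]
#  [2. 7.]
#  [3. 8.]
#  [4. 9.]]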
else:
    context = [mx.cpu(0)]

corpus = data.Corpus(args.data)

args.batch_size *= max(1, args.gpus)
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, args.batch_size)
test_data = batchify(corpus.test, args.batch_size)

n_tokens = len(corpus.dictionary)
model = model.RNNModel(args.model, n_tokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': args.lr,
                         'momentum': 0,
                         'wd': 0})
loss = gluon.loss.SoftmaxCrossEntropyLoss()

###############################################################################
# Train the model
###############################################################################