def train(epochs, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(), ctx=ctx)

    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)

    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum})
    metric = mx.metric.Accuracy()
    # With mixup the training labels are soft (weighted one-hot vectors), so
    # we track RMSE against the soft labels instead of accuracy.
    train_metric = mx.metric.RMSE()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)
    train_history = TrainingHistory(['training-error', 'validation-error'])

    iteration = 0
    lr_decay_count = 0
    best_val_score = 0

    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        metric.reset()
        train_loss = 0
        num_batch = len(train_data)
        alpha = 1

        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate*lr_decay)
            lr_decay_count += 1

        for i, batch in enumerate(train_data):
            lam = np.random.beta(alpha, alpha)
            # Disable mixup for the last 20 epochs so the model settles on
            # the original, unmixed labels.
            if epoch >= epochs - 20:
                lam = 1

            data_1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label_1 = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

            # Mix each sample with the reversed batch, and mix the one-hot
            # labels with the same coefficient.
            data = [lam*X + (1-lam)*X[::-1] for X in data_1]
            label = []
            for Y in label_1:
                y1 = label_transform(Y, classes)
                y2 = label_transform(Y[::-1], classes)
                label.append(lam*y1 + (1-lam)*y2)

            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])

            output_softmax = [nd.SoftmaxActivation(out) for out in output]
            train_metric.update(label, output_softmax)
            name, acc = train_metric.get()
            iteration += 1

        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        name, val_acc = test(ctx, val_data)
        train_history.update([acc, 1-val_acc])
        train_history.plot(save_path='%s/%s_history.png'%(plot_name, model_name))

        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('%s/%.4f-cifar-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))

        logging.info('[Epoch %d] train=%f val=%f loss=%f time: %f' %
                     (epoch, acc, val_acc, train_loss, time.time()-tic))

        if save_period and save_dir and (epoch + 1) % save_period == 0:
            net.save_parameters('%s/cifar10-%s-%d.params'%(save_dir, model_name, epoch))

    if save_period and save_dir:
        net.save_parameters('%s/cifar10-%s-%d.params'%(save_dir, model_name, epochs-1))
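################################################################
# The mixup loop above calls a ``label_transform`` helper that is not shown
# in this snippet. A minimal sketch of what it is assumed to do: one-hot
# encode a batch of integer labels so they can be linearly mixed.

def label_transform(label, classes):
    # Turn (N,) integer class indices into (N, classes) one-hot vectors
    return nd.one_hot(label.astype('int32'), classes)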
        for l in loss:
            l.backward()

        # Optimize
        trainer.step(batch_size)

        # Update metrics
        train_loss += sum([l.sum().asscalar() for l in loss])
        train_metric.update(label, output)

    name, acc = train_metric.get()
    # Evaluate on validation data
    name, val_acc = test(ctx, val_data)

    # Update history and print metrics
    train_history.update([1-acc, 1-val_acc])
    print('[Epoch %d] train=%f val=%f loss=%f time: %f' %
          (epoch, acc, val_acc, train_loss, time.time()-tic))

# We can plot the metric scores with:

train_history.plot()

################################################################
# If you trained the model for 240 epochs, the plot may look like:
#
# |image-aug|
#
# We can better observe the process of model training with plots.
# For example, one may ask what will happen if there's no data augmentation:
#
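################################################################
# One simple way to run that experiment: feed the training split through the
# deterministic ``transform_test`` pipeline instead of ``transform_train``,
# so no random crops or flips are applied. (``train_data_noaug`` is a name
# introduced here for illustration.)

train_data_noaug = gluon.data.DataLoader(
    gluon.data.vision.CIFAR10(train=True).transform_first(transform_test),
    batch_size=batch_size, shuffle=True, num_workers=num_workers)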
        with ag.record():
            outputs = [tuned_net(X) for X in data]
            loss = [L(yhat, y) for yhat, y in zip(outputs, label)]

        # Backpropagation
        for l in loss:
            l.backward()

        # Optimize
        trainer.step(batch_size)

        train_loss += sum([l.sum().asscalar() for l in loss])
        train_metric.update(label, outputs)

    name, train_acc = train_metric.get()
    name, val_acc = test(tuned_net, val_data, ctx)
    train_history.update([1 - train_acc, 1 - val_acc])

    print('[Epoch %d] Train-acc: %.3f, loss: %.3f | Val-acc: %.3f | time: %.1f' %
          (epoch, train_acc, train_loss, val_acc, time.time() - tic))

    if (epoch + 1) % 10 == 0:
        print('Params saved on epoch #', epoch)
        tuned_net.save_parameters('rejector2_{:03d}__resnet50_v.params'.format(epoch))

train_history.plot()

name, test_acc = test(tuned_net, test_data, ctx)
print('[Finished] Test-acc: %.3f' % (test_acc))
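################################################################
# ``tuned_net``, ``L`` and ``trainer`` come from setup code outside this
# snippet. A minimal sketch of one plausible setup, assuming ``classes`` and
# ``ctx`` are defined as in the surrounding tutorial: a pretrained ResNet50
# backbone with its output layer replaced (the model name and hyperparameters
# below are illustrative, not taken from the original script).

from mxnet import init
from mxnet.gluon import nn
from gluoncv.model_zoo import get_model

tuned_net = get_model('ResNet50_v2', pretrained=True)
with tuned_net.name_scope():
    tuned_net.output = nn.Dense(classes)   # fresh classifier head
tuned_net.output.initialize(init.Xavier(), ctx=ctx)
tuned_net.collect_params().reset_ctx(ctx)
tuned_net.hybridize()

L = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(tuned_net.collect_params(), 'sgd',
                        {'learning_rate': 0.001, 'momentum': 0.9, 'wd': 1e-4})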
        output = []
        for _, X in enumerate(data):
            # Fold the segment axis into the batch axis:
            # (batch, segments, C, H, W) -> (batch*segments, C, H, W)
            X = X.reshape((-1,) + X.shape[2:])
            pred = net(X)
            output.append(pred)
        loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

        # Backpropagation
        for l in loss:
            l.backward()

        # Optimize
        trainer.step(batch_size)

        # Update metrics
        train_loss += sum([l.mean().asscalar() for l in loss])
        train_metric.update(label, output)

        # Only train on the first 100 batches to keep this demo fast
        if i == 100:
            break

    name, acc = train_metric.get()

    # Update history and print metrics
    train_history.update([acc])
    print('[Epoch %d] train=%f loss=%f time: %f' %
          (epoch, acc, train_loss / (i + 1), time.time() - tic))

# We can plot the metric scores with:

train_history.plot()
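################################################################
# To make the reshape above concrete: for a video batch laid out as
# (batch, segments, channels, height, width), folding the segment axis into
# the batch axis lets the 2D network score every frame in one forward pass.
# The shapes below are illustrative:

frames = nd.zeros((8, 3, 3, 224, 224))           # 8 clips, 3 segments each
flat = frames.reshape((-1,) + frames.shape[2:])
print(flat.shape)                                # (24, 3, 224, 224)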
            train_loss += sum([l.mean().asscalar() for l in loss])
            train_metric.update(label, output)

            if i % opt.log_interval == 0:
                name, acc = train_metric.get()
                logger.info('[Epoch %d] [%d | %d] train=%f loss=%f time: %f' %
                            (epoch, i, len(train_data), acc, train_loss / (i + 1), time.time() - btic))
                btic = time.time()

        name, acc = train_metric.get()

        # test
        acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data)

        # Update history and print metrics
        train_history.update([acc, acc_top1_val, acc_top5_val])
        logger.info('[Epoch %d] train=%f loss=%f time: %f' %
                    (epoch, acc, train_loss / (i + 1), time.time() - tic))
        logger.info('[Epoch %d] val top1=%f top5=%f val loss=%f, lr=%f' %
                    (epoch, acc_top1_val, acc_top5_val, loss_val, trainer.learning_rate))

        if acc_top1_val > best_val_score and epoch > 5:
            best_val_score = acc_top1_val
            net.save_parameters('%s/%.4f-%s-%s-%03d-best.params' %
                                (opt.save_dir, best_val_score, opt.dataset, opt.model, epoch))
            trainer.save_states('%s/%.4f-%s-%s-%03d-best.states' %
                                (opt.save_dir, best_val_score, opt.dataset, opt.model, epoch))

# We can plot the metric scores with:
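################################################################
# Saving ``trainer.save_states`` alongside the parameters makes the run
# resumable: optimizer state (e.g. momentum buffers) is restored together
# with the weights. A minimal sketch, with hypothetical file names standing
# in for the ``%s`` pattern above:

net.load_parameters('0.7123-kinetics400-i3d_resnet50_v1-042-best.params', ctx=ctx)
trainer.load_states('0.7123-kinetics400-i3d_resnet50_v1-042-best.states')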
def train(epochs, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(), ctx=ctx)

    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)

    trainer = gluon.Trainer(net.collect_params(), optimizer)
    metric = mx.metric.Accuracy()
    train_metric = mx.metric.Accuracy()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    train_history = TrainingHistory(['training-error', 'validation-error'])

    iteration = 0
    lr_decay_count = 0
    best_val_score = 0

    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        metric.reset()
        train_loss = 0
        num_batch = len(train_data)

        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
            lr_decay_count += 1

        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])

            train_metric.update(label, output)
            name, acc = train_metric.get()
            iteration += 1

        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        name, val_acc = test(ctx, val_data)
        train_history.update([1 - acc, 1 - val_acc])
        train_history.plot(save_path='%s/%s_history.png' % (plot_path, model_name))

        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                (save_dir, best_val_score, model_name, epoch))

        logging.info('[Epoch %d] train=%f val=%f loss=%f time: %f' %
                     (epoch, acc, val_acc, train_loss, time.time() - tic))

        if save_period and save_dir and (epoch + 1) % save_period == 0:
            net.save_parameters('%s/cifar10-%s-%d.params' % (save_dir, model_name, epoch))

    if save_period and save_dir:
        net.save_parameters('%s/cifar10-%s-%d.params' % (save_dir, model_name, epochs - 1))
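################################################################
# ``test(ctx, val_data)`` is a helper defined elsewhere in the tutorial. A
# minimal sketch of such a validation function, assuming it reuses the
# global ``net``:

def test(ctx, val_data):
    metric = mx.metric.Accuracy()
    for batch in val_data:
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)
    return metric.get()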