def test_learning_rate():
    o1 = mx.optimizer.Optimizer(learning_rate=0.01)
    o1.set_learning_rate(0.2)
    assert o1.learning_rate == 0.2

    lr_s = lr_scheduler.FactorScheduler(step=1)
    o2 = mx.optimizer.Optimizer(lr_scheduler=lr_s, learning_rate=0.3)
    assert o2.learning_rate == 0.3
    o2.lr_scheduler.base_lr = 0.4
    assert o2.learning_rate == 0.4

    lr_s = lr_scheduler.FactorScheduler(step=1, base_lr=1024)
    o3 = mx.optimizer.Optimizer(lr_scheduler=lr_s)
    assert o3.learning_rate == 1024
def model_compile(self):
    # Compile the model: loss, learning-rate schedule and trainer.
    self.loss = gloss.SoftmaxCrossEntropyLoss(axis=1)
    lr_sch = lr_scheduler.FactorScheduler(step=100, factor=0.9)
    optimizer_params = {
        'learning_rate': self.learning_rate,
        'lr_scheduler': lr_sch
    }
    self.trainer = Trainer(self.model.collect_params(),
                           optimizer='adam',
                           optimizer_params=optimizer_params)
def generate_lr_scheduler(ls_dict):
    scheduler_type = ls_dict['type']
    scheduler_param = ls_dict['lr_scheduler_config']
    factor = float(scheduler_param['factor'])
    if scheduler_type == 'Factor':
        step = int(scheduler_param['step'])
        stop_factor_lr = float(scheduler_param['stop_factor_lr'])
        return ls.FactorScheduler(step, factor, stop_factor_lr)
    elif scheduler_type == 'MultiFactor':
        steps = scheduler_param['steps']
        step_list = [int(step) for step in steps]
        return ls.MultiFactorScheduler(step=step_list, factor=factor)
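# Minimal usage sketch for generate_lr_scheduler above. The dict layout is
# inferred from the keys the function reads ('type', 'lr_scheduler_config',
# 'factor', 'step', 'stop_factor_lr', 'steps'); the concrete values are
# hypothetical, and the `ls` alias for mxnet.lr_scheduler is assumed to be in
# scope as in the snippet.
factor_cfg = {
    'type': 'Factor',
    'lr_scheduler_config': {'step': '1000', 'factor': '0.9',
                            'stop_factor_lr': '1e-6'}
}
multi_cfg = {
    'type': 'MultiFactor',
    'lr_scheduler_config': {'steps': ['3000', '6000', '9000'], 'factor': '0.5'}
}
factor_sched = generate_lr_scheduler(factor_cfg)  # FactorScheduler
multi_sched = generate_lr_scheduler(multi_cfg)    # MultiFactorScheduler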
def evaluate_mxnet_LeNet2(train_data, valid_data, test_data, rect):
    import mxnet
    from mxnet import gluon
    from mxnet.gluon import nn
    from mxnet import lr_scheduler
    import mxnet_utils as utils

    odim = train_data[0][1].shape[0]
    ctx = mxnet.gpu()
    batch_size = 1024
    train_loader = utils.as_dataloader(train_data, batch_size, rect)
    valid_loader = utils.as_dataloader(valid_data, batch_size, rect)
    test_loader = utils.as_dataloader(test_data, batch_size, rect)

    net = nn.Sequential()
    with net.name_scope():
        net.add(nn.Conv2D(channels=48, kernel_size=5, activation='relu'),
                nn.MaxPool2D(pool_size=2, strides=2),
                nn.Conv2D(channels=128, kernel_size=3, activation='relu'),
                nn.MaxPool2D(pool_size=2, strides=2),
                nn.Conv2D(channels=512, kernel_size=1, activation='relu'),
                nn.Flatten(),
                nn.Dense(1000, activation="relu"),
                nn.Dropout(0.5),
                nn.Dense(1000, activation="relu"),
                nn.Dense(odim))
    net.initialize(init='Xavier', ctx=ctx)

    floss = gluon.loss.SoftmaxCrossEntropyLoss()
    lr_sch = lr_scheduler.FactorScheduler(1024, 0.95)
    trainer = gluon.Trainer(
        net.collect_params(), 'sgd', {
            'learning_rate': 0.05,
            'momentum': 0.9,
            'lr_scheduler': lr_sch,
            'wd': 0.0030
        })
    utils.train(ctx, net, floss, trainer, train_loader, valid_loader,
                epochs=50, valid_interval=5)
    print('mxnet_cnn: %.2f%%' % utils.evaluate(ctx, net, test_loader))
def evaluate_mxnet_nn(train_data, valid_data, test_data):
    import mxnet
    from mxnet import gluon
    from mxnet import lr_scheduler
    from mxnet.gluon import nn
    import mxnet_utils as utils

    odim = train_data[0][1].shape[0]
    ctx = mxnet.gpu()
    batch_size = 1024
    train_loader = utils.as_dataloader(train_data, batch_size)
    valid_loader = utils.as_dataloader(valid_data, batch_size)
    test_loader = utils.as_dataloader(test_data, batch_size)

    net = nn.Sequential()
    with net.name_scope():
        net.add(nn.Dense(30, activation='relu'))
        net.add(nn.Dense(odim))
    net.initialize(init='Xavier', ctx=ctx)

    floss = gluon.loss.SoftmaxCrossEntropyLoss()
    lr_sch = lr_scheduler.FactorScheduler(1024, 0.99)
    trainer = gluon.Trainer(
        net.collect_params(), 'sgd', {
            'learning_rate': 0.5,
            'momentum': 0.9,
            'lr_scheduler': lr_sch,
            'wd': 0.0003
        })
    utils.train(ctx, net, floss, trainer, train_loader, valid_loader,
                epochs=50, valid_interval=5)
    print('mxnet_nn: %.2f%%' % utils.evaluate(ctx, net, test_loader))
def define_model():
    model = mix_net(vocab=4000, embed_size=300, num_hiddens=200,
                    num_layers=2, dense_layers=10)
    # Initialize parameters and set up the optimizer.
    lr = 0.0003
    lr_sch = lr_scheduler.FactorScheduler(step=50, factor=0.9)
    optimizer_params = {'learning_rate': lr, 'lr_scheduler': lr_sch}
    model.initialize(init.Xavier())
    ## trainer = gluon.Trainer(params=model.collect_params(), optimizer='sgd',
    ##                         optimizer_params={'learning_rate': lr})
    trainer = gluon.Trainer(params=model.collect_params(),
                            optimizer='adam',
                            optimizer_params=optimizer_params)
    loss = gloss.SoftmaxCrossEntropyLoss(sparse_label=True)
    ## loss = gloss.L2Loss()
    ## loss = gloss.SigmoidBinaryCrossEntropyLoss()
    accuracy = mx.metric.Accuracy()
    return model, trainer, loss
def train_fine_tuning(net, folder, learning_rate, freeze=True, batch_size=64,
                      num_epochs=5, scheduler=False, wd=None):
    training_dataset = mx.gluon.data.vision.ImageRecordDataset(
        os.path.join(folder, 'train_bi.rec'), transform=train_aug_transform)
    validation_dataset = mx.gluon.data.vision.ImageRecordDataset(
        os.path.join(folder, 'valid_bi.rec'), transform=valid_aug_transform)
    train_iter = mx.gluon.data.DataLoader(training_dataset,
                                          batch_size=batch_size, shuffle=True)
    test_iter = mx.gluon.data.DataLoader(validation_dataset,
                                         batch_size=batch_size)

    ctx = mx.gpu()
    net.collect_params().reset_ctx(ctx)
    net.hybridize()
    loss = gloss.SoftmaxCrossEntropyLoss()

    if freeze:
        params = net.output.collect_params()
    else:
        net.output.collect_params().setattr('lr_mult', 100)
        params = net.collect_params()
        learning_rate /= 100

    hyperparams = {'learning_rate': learning_rate}
    if scheduler:
        schedule = lr_scheduler.FactorScheduler(step=7, factor=0.7)
        hyperparams['lr_scheduler'] = schedule
    if wd is not None:
        hyperparams['wd'] = wd

    trainer = gluon.Trainer(params, 'adam', hyperparams)
    return train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)
def test_learning_rate_expect_user_warning():
    lr_s = lr_scheduler.FactorScheduler(step=1)
    o = mx.optimizer.Optimizer(lr_scheduler=lr_s, learning_rate=0.3)
    o.set_learning_rate(0.5)
def test_learning_rate_expect_user_warning():
    lr_s = lr_scheduler.FactorScheduler(step=1)
    o = mx.optimizer.Optimizer(lr_scheduler=lr_s, learning_rate=0.3)
    with pytest.raises(UserWarning):
        o.set_learning_rate(0.5)
def train_net(net, config, check_flag, logger, sig_state, sig_pgbar, sig_table):
    print(config)
    # config = Configs()
    # matplotlib.use('Agg')
    # import matplotlib.pyplot as plt
    sig_pgbar.emit(-1)
    mx.random.seed(1)
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    classes = 10
    num_epochs = config.train_cfg.epoch
    batch_size = config.train_cfg.batchsize
    optimizer = config.lr_cfg.optimizer
    lr = config.lr_cfg.lr
    num_gpus = config.train_cfg.gpu
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = config.data_cfg.worker
    warmup = config.lr_cfg.warmup

    # Build the learning-rate schedule (cosine or step decay) over the run.
    if config.lr_cfg.decay == 'cosine':
        lr_sch = lr_scheduler.CosineScheduler(
            (50000 // batch_size) * num_epochs,
            base_lr=lr,
            warmup_steps=warmup * (50000 // batch_size),
            final_lr=1e-5)
    else:
        lr_sch = lr_scheduler.FactorScheduler(
            (50000 // batch_size) * config.lr_cfg.factor_epoch,
            factor=config.lr_cfg.factor,
            base_lr=lr,
            warmup_steps=warmup * (50000 // batch_size))

    model_name = config.net_cfg.name
    if config.data_cfg.mixup:
        model_name += '_mixup'
    if config.train_cfg.amp:
        model_name += '_amp'
    base_dir = './' + model_name
    if os.path.exists(base_dir):
        base_dir = base_dir + '-' + \
            time.strftime("%m-%d-%H.%M.%S", time.localtime())
    makedirs(base_dir)

    if config.save_cfg.tensorboard:
        logdir = base_dir + '/tb/' + model_name
        if os.path.exists(logdir):
            logdir = logdir + '-' + \
                time.strftime("%m-%d-%H.%M.%S", time.localtime())
        sw = SummaryWriter(logdir=logdir, flush_secs=5, verbose=False)
        cmd_file = open(base_dir + '/tb.bat', mode='w')
        cmd_file.write('tensorboard --logdir=./')
        cmd_file.close()

    save_period = 10
    save_dir = base_dir + '/' + 'params'
    makedirs(save_dir)
    plot_name = base_dir + '/' + 'plot'
    makedirs(plot_name)
    stat_name = base_dir + '/' + 'stat.txt'
    csv_name = base_dir + '/' + 'data.csv'
    if os.path.exists(csv_name):
        csv_name = base_dir + '/' + 'data-' + \
            time.strftime("%m-%d-%H.%M.%S", time.localtime()) + '.csv'
    csv_file = open(csv_name, mode='w', newline='')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Epoch', 'train_loss', 'train_acc',
                         'valid_loss', 'valid_acc', 'lr', 'time'])

    logging_handlers = [logging.StreamHandler(), logger]
    logging_handlers.append(logging.FileHandler(
        '%s/train_cifar10_%s.log' % (model_name, model_name)))
    logging.basicConfig(level=logging.INFO, handlers=logging_handlers)
    logging.info(config)

    if config.train_cfg.amp:
        amp.init()
    if config.save_cfg.profiler:
        profiler.set_config(profile_all=True, aggregate_stats=True,
                            continuous_dump=True,
                            filename=base_dir + '/%s_profile.json' % model_name)
        is_profiler_run = False

    # Data augmentation / preprocessing pipelines.
    trans_list = []
    imgsize = config.data_cfg.size
    if config.data_cfg.crop:
        trans_list.append(gcv_transforms.RandomCrop(
            32, pad=config.data_cfg.crop_pad))
    if config.data_cfg.cutout:
        trans_list.append(CutOut(config.data_cfg.cutout_size))
    if config.data_cfg.flip:
        trans_list.append(transforms.RandomFlipLeftRight())
    if config.data_cfg.erase:
        trans_list.append(gcv_transforms.block.RandomErasing(s_max=0.25))
    trans_list.append(transforms.Resize(imgsize))
    trans_list.append(transforms.ToTensor())
    trans_list.append(transforms.Normalize([0.4914, 0.4822, 0.4465],
                                           [0.2023, 0.1994, 0.2010]))
    transform_train = transforms.Compose(trans_list)
    transform_test = transforms.Compose([
        transforms.Resize(imgsize),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    def label_transform(label, classes):
        # Convert integer class labels to one-hot vectors (needed for mixup).
        ind = label.astype('int')
        res = nd.zeros((ind.shape[0], classes), ctx=label.context)
        res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
        return res

    def test(ctx, val_data):
        metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        num_batch = len(val_data)
        test_loss = 0
        for i, batch in enumerate(val_data):
            data = gluon.utils.split_and_load(
                batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(
                batch[1], ctx_list=ctx, batch_axis=0)
            outputs = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(outputs, label)]
            metric.update(label, outputs)
            test_loss += sum([l.sum().asscalar() for l in loss])
        test_loss /= batch_size * num_batch
        name, val_acc = metric.get()
        return name, val_acc, test_loss

    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        if config.train_cfg.param_init:
            init_func = getattr(mx.init, config.train_cfg.init)
            net.initialize(init_func(), ctx=ctx, force_reinit=True)
        else:
            net.load_parameters(config.train_cfg.param_file, ctx=ctx)
        summary(net, stat_name, nd.uniform(
            shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
        # net = nn.HybridBlock()
        net.hybridize()

        root = config.dir_cfg.dataset
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True,
            last_batch='discard', num_workers=num_workers)
        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer_arg = {'learning_rate': config.lr_cfg.lr,
                       'wd': config.lr_cfg.wd, 'lr_scheduler': lr_sch}
        extra_arg = eval(config.lr_cfg.extra_arg)
        trainer_arg.update(extra_arg)
        trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
        if config.train_cfg.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if config.data_cfg.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0
        best_val_score = 0
        # print('start training')
        sig_state.emit(1)
        sig_pgbar.emit(0)
        # signal.emit('Training')

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1

            for i, batch in enumerate(train_data):
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    profiler.set_state('run')
                    is_profiler_run = True
                if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                    sw.add_graph(net)

                # Mixup: blend each batch with its reversed copy; disabled for
                # the last 20 epochs or when mixup is turned off.
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not config.data_cfg.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)
                if not config.data_cfg.mixup:
                    data = data_1
                    label = label_1
                else:
                    data = [lam * X + (1 - lam) * X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam * y1 + (1 - lam) * y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

                if config.train_cfg.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                            # scaled_loss.backward()
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)

                train_loss += sum([l.sum().asscalar() for l in loss])
                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()

                if config.save_cfg.tensorboard:
                    sw.add_scalar(tag='lr', value=trainer.learning_rate,
                                  global_step=iteration)
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    nd.waitall()
                    profiler.set_state('stop')
                    profiler.dump()
                iteration += 1
                sig_pgbar.emit(iteration)

                # Pause / stop requested from the GUI.
                if check_flag()[0]:
                    sig_state.emit(2)
                    while check_flag()[0] or check_flag()[1]:
                        if check_flag()[1]:
                            print('stop')
                            return
                        else:
                            time.sleep(5)
                            print('pausing')

            epoch_time = time.time() - tic
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, _ = test(ctx, val_data)

            # if config.data_cfg.mixup:
            #     train_history.update([acc, 1-val_acc])
            #     plt.cla()
            #     train_history.plot(save_path='%s/%s_history.png' %
            #                        (plot_name, model_name))
            # else:
            train_history.update([1 - train_acc, 1 - val_acc])
            plt.cla()
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            name, val_acc, val_loss = test(ctx, val_data)
            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n'
                         ' val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc,
                          val_acc, val_loss, current_lr, epoch_time))

            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name),
                              y_lim=(0, 2), legend_loc='best')

            if config.save_cfg.tensorboard:
                sw._add_scalars(tag='Acc',
                                scalar_dict={'train_acc': train_acc,
                                             'test_acc': val_acc},
                                global_step=epoch)
                sw._add_scalars(tag='Loss',
                                scalar_dict={'train_loss': train_loss,
                                             'test_loss': val_loss},
                                global_step=epoch)

            sig_table.emit([epoch, train_loss, train_acc,
                            val_loss, val_acc, current_lr, epoch_time])
            csv_writer.writerow([epoch, train_loss, train_acc,
                                 val_loss, val_acc, current_lr, epoch_time])
            csv_file.flush()

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))

        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs - 1))

    train(num_epochs, context)

    if config.save_cfg.tensorboard:
        sw.close()
    for ctx in context:
        ctx.empty_cache()
    csv_file.close()
    logging.shutdown()
    reload(logging)
    sig_state.emit(0)
def test_lr_scheduler():
    from mxnet import lr_scheduler, optimizer
    scheduler = lr_scheduler.FactorScheduler(base_lr=1, step=250, factor=0.5)
    optim = optimizer.SGD(learning_rate=0.1, lr_scheduler=scheduler)
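# A minimal sketch of how a FactorScheduler like the one above behaves: the
# scheduler object is callable with the global update count and returns the
# learning rate for that step, roughly base_lr * factor ** (num_update // step),
# never falling below stop_factor_lr. The update counts below are arbitrary
# illustration values, not taken from any of the snippets in this collection.
from mxnet import lr_scheduler

sched = lr_scheduler.FactorScheduler(step=250, factor=0.5, base_lr=1.0)
for num_update in (1, 300, 600, 1100):
    # Prints roughly 1.0, 0.5, 0.25, 0.0625 for these counts.
    print('update %4d -> lr %.4f' % (num_update, sched(num_update)))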