Example #1
import mxnet as mx
from mxnet import lr_scheduler


def test_learning_rate():
    o1 = mx.optimizer.Optimizer(learning_rate=0.01)
    o1.set_learning_rate(0.2)
    assert o1.learning_rate == 0.2

    lr_s = lr_scheduler.FactorScheduler(step=1)
    o2 = mx.optimizer.Optimizer(lr_scheduler=lr_s, learning_rate=0.3)
    assert o2.learning_rate == 0.3
    o2.lr_scheduler.base_lr = 0.4
    assert o2.learning_rate == 0.4

    lr_s = lr_scheduler.FactorScheduler(step=1, base_lr=1024)
    o3 = mx.optimizer.Optimizer(lr_scheduler=lr_s)
    assert o3.learning_rate == 1024
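
Note: FactorScheduler keeps a base_lr and multiplies it by factor each time the update count advances by another step updates, never going below stop_factor_lr, and a scheduler instance is callable with the current update count. A minimal standalone sketch of that behaviour (the values are illustrative, not taken from the examples on this page):

from mxnet import lr_scheduler

sched = lr_scheduler.FactorScheduler(step=100, factor=0.5, base_lr=1.0)
for num_update in (1, 150, 250, 1000):
    # the returned rate is halved roughly every 100 updates
    print(num_update, sched(num_update))
# prints 1.0, then 0.5, then 0.25, and finally a much smaller decayed rate
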
Example #2
def model_compile(self):
    # Compile the model: loss, learning-rate schedule and trainer
    self.loss = gloss.SoftmaxCrossEntropyLoss(axis=1)
    # multiply the learning rate by 0.9 every 100 updates
    lr_sch = lr_scheduler.FactorScheduler(step=100, factor=0.9)
    optimizer_params = {
        'learning_rate': self.learning_rate,
        'lr_scheduler': lr_sch
    }
    self.trainer = Trainer(self.model.collect_params(),
                           optimizer='adam',
                           optimizer_params=optimizer_params)
Example #3
def generate_lr_scheduler(ls_dict):
    scheduler_type = ls_dict['type']
    scheduler_param = ls_dict['lr_scheduler_config']
    factor = float(scheduler_param['factor'])
    if scheduler_type == 'Factor':
        step = int(scheduler_param['step'])
        stop_factor_lr = float(scheduler_param['stop_factor_lr'])
        return ls.FactorScheduler(step, factor, stop_factor_lr)
    elif scheduler_type == 'MultiFactor':
        steps = scheduler_param['steps']
        step_list = [int(step) for step in steps]
        return ls.MultiFactorScheduler(step=step_list, factor=factor)
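
The dispatcher in Example #3 expects a dict with a 'type' key and the scheduler arguments under 'lr_scheduler_config'. A hypothetical invocation, assuming ls is mxnet.lr_scheduler imported under that alias (the concrete values are made up):

from mxnet import lr_scheduler as ls

factor_cfg = {
    'type': 'Factor',
    'lr_scheduler_config': {'step': 1000, 'factor': 0.9, 'stop_factor_lr': 1e-6}
}
multi_cfg = {
    'type': 'MultiFactor',
    'lr_scheduler_config': {'steps': [3000, 6000, 9000], 'factor': 0.5}
}
print(generate_lr_scheduler(factor_cfg))   # -> FactorScheduler instance
print(generate_lr_scheduler(multi_cfg))    # -> MultiFactorScheduler instance
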
Example #4
def evaluate_mxnet_LeNet2(train_data, valid_data, test_data, rect):
    import mxnet
    from mxnet import gluon
    from mxnet.gluon import nn
    from mxnet import lr_scheduler
    import mxnet_utils as utils

    odim = train_data[0][1].shape[0]

    ctx = mxnet.gpu()
    batch_size = 1024

    train_loader = utils.as_dataloader(train_data, batch_size, rect)
    valid_loader = utils.as_dataloader(valid_data, batch_size, rect)
    test_loader = utils.as_dataloader(test_data, batch_size, rect)

    net = nn.Sequential()
    with net.name_scope():
        net.add(nn.Conv2D(channels=48, kernel_size=5, activation='relu'),
                nn.MaxPool2D(pool_size=2, strides=2),
                nn.Conv2D(channels=128, kernel_size=3, activation='relu'),
                nn.MaxPool2D(pool_size=2, strides=2),
                nn.Conv2D(channels=512, kernel_size=1, activation='relu'),
                nn.Flatten(), nn.Dense(1000, activation="relu"),
                nn.Dropout(0.5), nn.Dense(1000, activation="relu"),
                nn.Dense(odim))
    net.initialize(init='Xavier', ctx=ctx)

    floss = gluon.loss.SoftmaxCrossEntropyLoss()

    lr_sch = lr_scheduler.FactorScheduler(1024, 0.95)  # multiply the lr by 0.95 every 1024 updates

    trainer = gluon.Trainer(
        net.collect_params(), 'sgd', {
            'learning_rate': 0.05,
            'momentum': 0.9,
            'lr_scheduler': lr_sch,
            'wd': 0.0030
        })

    utils.train(ctx,
                net,
                floss,
                trainer,
                train_loader,
                valid_loader,
                epochs=50,
                valid_interval=5)

    print('mxnet_cnn: %.2f%%' % utils.evaluate(ctx, net, test_loader))
Example #5
def evaluate_mxnet_nn(train_data, valid_data, test_data):
    import mxnet
    from mxnet import gluon
    from mxnet import lr_scheduler
    from mxnet.gluon import nn
    import mxnet_utils as utils

    odim = train_data[0][1].shape[0]

    ctx = mxnet.gpu()
    batch_size = 1024

    train_loader = utils.as_dataloader(train_data, batch_size)
    valid_loader = utils.as_dataloader(valid_data, batch_size)
    test_loader = utils.as_dataloader(test_data, batch_size)

    net = nn.Sequential()
    with net.name_scope():
        net.add(nn.Dense(30, activation='relu'))
        net.add(nn.Dense(odim))
    net.initialize(init='Xavier', ctx=ctx)

    floss = gluon.loss.SoftmaxCrossEntropyLoss()

    lr_sch = lr_scheduler.FactorScheduler(1024, 0.99)  # multiply the lr by 0.99 every 1024 updates

    trainer = gluon.Trainer(
        net.collect_params(), 'sgd', {
            'learning_rate': 0.5,
            'momentum': 0.9,
            'lr_scheduler': lr_sch,
            'wd': 0.0003
        })

    utils.train(ctx,
                net,
                floss,
                trainer,
                train_loader,
                valid_loader,
                epochs=50,
                valid_interval=5)

    print('mxnet_nn: %.2f%%' % utils.evaluate(ctx, net, test_loader))
Example #6
def define_model():
    model = mix_net(vocab=4000,
                    embed_size=300,
                    num_hiddens=200,
                    num_layers=2,
                    dense_layers=10)
    # Initialize parameters and the learning-rate schedule
    lr = 0.0003
    lr_sch = lr_scheduler.FactorScheduler(step=50, factor=0.9)
    optimizer_params = {'learning_rate': lr, 'lr_scheduler': lr_sch}
    model.initialize(init.Xavier())
    ##    trainer = gluon.Trainer(params = model.collect_params(), optimizer = 'sgd', optimizer_params = {'learning_rate':lr})
    trainer = gluon.Trainer(params=model.collect_params(),
                            optimizer='adam',
                            optimizer_params=optimizer_params)
    loss = gloss.SoftmaxCrossEntropyLoss(sparse_label=True)
    ##    loss = gloss.L2Loss()
    ##    loss = gloss.SigmoidBinaryCrossEntropyLoss()
    accuracy = mx.metric.Accuracy()
    return model, trainer, loss
Example #7
def train_fine_tuning(net,
                      folder,
                      learning_rate,
                      freeze=True,
                      batch_size=64,
                      num_epochs=5,
                      scheduler=False,
                      wd=None):

    training_dataset = mx.gluon.data.vision.ImageRecordDataset(
        os.path.join(folder, 'train_bi.rec'), transform=train_aug_transform)
    validation_dataset = mx.gluon.data.vision.ImageRecordDataset(
        os.path.join(folder, 'valid_bi.rec'), transform=valid_aug_transform)

    train_iter = mx.gluon.data.DataLoader(training_dataset,
                                          batch_size=batch_size,
                                          shuffle=True)
    test_iter = mx.gluon.data.DataLoader(validation_dataset,
                                         batch_size=batch_size)

    ctx = mx.gpu()
    net.collect_params().reset_ctx(ctx)
    net.hybridize()
    loss = gloss.SoftmaxCrossEntropyLoss()

    if freeze:
        params = net.output.collect_params()
    else:
        net.output.collect_params().setattr('lr_mult', 100)
        params = net.collect_params()
        learning_rate /= 100

    hyperparams = {'learning_rate': learning_rate}
    if scheduler:
        schedule = lr_scheduler.FactorScheduler(step=7, factor=0.7)
        hyperparams['lr_scheduler'] = schedule
    if wd is not None:
        hyperparams['wd'] = wd

    trainer = gluon.Trainer(params, 'adam', hyperparams)
    return train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)
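
In the unfrozen branch of Example #7, the output layer's lr_mult of 100 combined with dividing learning_rate by 100 means the new head keeps roughly the caller's original rate while the pretrained backbone trains 100x slower. A small illustration with a made-up rate (0.001 is not from the original call site):

requested_lr = 0.001
base_lr = requested_lr / 100   # rate actually handed to the Trainer
head_lr = base_lr * 100        # output layer (lr_mult=100) -> back to 0.001
backbone_lr = base_lr          # pretrained layers -> 0.00001
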
Example #8
import mxnet as mx
from mxnet import lr_scheduler


def test_learning_rate_expect_user_warning():
    lr_s = lr_scheduler.FactorScheduler(step=1)
    o = mx.optimizer.Optimizer(lr_scheduler=lr_s, learning_rate=0.3)
    o.set_learning_rate(0.5)  # raises UserWarning because an lr_scheduler is attached
Example #9
import pytest
import mxnet as mx
from mxnet import lr_scheduler


def test_learning_rate_expect_user_warning():
    lr_s = lr_scheduler.FactorScheduler(step=1)
    o = mx.optimizer.Optimizer(lr_scheduler=lr_s, learning_rate=0.3)

    with pytest.raises(UserWarning):
        o.set_learning_rate(0.5)
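
As Examples #8 and #9 show, set_learning_rate refuses to override a rate that an lr_scheduler controls and raises UserWarning instead. To change the rate in that situation, adjust the scheduler itself, as Example #1 does; a minimal sketch:

import mxnet as mx
from mxnet import lr_scheduler

sched = lr_scheduler.FactorScheduler(step=1)
opt = mx.optimizer.Optimizer(lr_scheduler=sched, learning_rate=0.3)
opt.lr_scheduler.base_lr = 0.5   # update the schedule's base rate instead
assert opt.learning_rate == 0.5
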
Example #10
def train_net(net, config, check_flag, logger, sig_state, sig_pgbar, sig_table):
    print(config)
    # config = Configs()
    # matplotlib.use('Agg')
    # import matplotlib.pyplot as plt
    sig_pgbar.emit(-1)
    mx.random.seed(1)
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    classes = 10
    num_epochs = config.train_cfg.epoch
    batch_size = config.train_cfg.batchsize
    optimizer = config.lr_cfg.optimizer
    lr = config.lr_cfg.lr
    num_gpus = config.train_cfg.gpu
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i)
               for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = config.data_cfg.worker

    warmup = config.lr_cfg.warmup
    if config.lr_cfg.decay == 'cosine':
        lr_sch = lr_scheduler.CosineScheduler((50000//batch_size)*num_epochs,
                                              base_lr=lr,
                                              warmup_steps=warmup *
                                              (50000//batch_size),
                                              final_lr=1e-5)
    else:
        lr_sch = lr_scheduler.FactorScheduler((50000//batch_size)*config.lr_cfg.factor_epoch,
                                              factor=config.lr_cfg.factor,
                                              base_lr=lr,
                                              warmup_steps=warmup*(50000//batch_size))

    model_name = config.net_cfg.name

    if config.data_cfg.mixup:
        model_name += '_mixup'
    if config.train_cfg.amp:
        model_name += '_amp'

    base_dir = './'+model_name
    if os.path.exists(base_dir):
        base_dir = base_dir + '-' + \
            time.strftime("%m-%d-%H.%M.%S", time.localtime())
    makedirs(base_dir)

    if config.save_cfg.tensorboard:
        logdir = base_dir+'/tb/'+model_name
        if os.path.exists(logdir):
            logdir = logdir + '-' + \
                time.strftime("%m-%d-%H.%M.%S", time.localtime())
        sw = SummaryWriter(logdir=logdir, flush_secs=5, verbose=False)
        cmd_file = open(base_dir+'/tb.bat', mode='w')
        cmd_file.write('tensorboard --logdir=./')
        cmd_file.close()

    save_period = 10
    save_dir = base_dir+'/'+'params'
    makedirs(save_dir)

    plot_name = base_dir+'/'+'plot'
    makedirs(plot_name)

    stat_name = base_dir+'/'+'stat.txt'

    csv_name = base_dir+'/'+'data.csv'
    if os.path.exists(csv_name):
        csv_name = base_dir+'/'+'data-' + \
            time.strftime("%m-%d-%H.%M.%S", time.localtime())+'.csv'
    csv_file = open(csv_name, mode='w', newline='')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Epoch', 'train_loss', 'train_acc',
                         'valid_loss', 'valid_acc', 'lr', 'time'])

    logging_handlers = [logging.StreamHandler(), logger]
    logging_handlers.append(logging.FileHandler(
        '%s/train_cifar10_%s.log' % (base_dir, model_name)))

    logging.basicConfig(level=logging.INFO, handlers=logging_handlers)
    logging.info(config)

    if config.train_cfg.amp:
        amp.init()

    if config.save_cfg.profiler:
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename=base_dir+'/%s_profile.json' % model_name)
        is_profiler_run = False

    trans_list = []
    imgsize = config.data_cfg.size
    if config.data_cfg.crop:
        trans_list.append(gcv_transforms.RandomCrop(
            32, pad=config.data_cfg.crop_pad))
    if config.data_cfg.cutout:
        trans_list.append(CutOut(config.data_cfg.cutout_size))
    if config.data_cfg.flip:
        trans_list.append(transforms.RandomFlipLeftRight())
    if config.data_cfg.erase:
        trans_list.append(gcv_transforms.block.RandomErasing(s_max=0.25))
    trans_list.append(transforms.Resize(imgsize))
    trans_list.append(transforms.ToTensor())
    trans_list.append(transforms.Normalize([0.4914, 0.4822, 0.4465],
                                           [0.2023, 0.1994, 0.2010]))

    transform_train = transforms.Compose(trans_list)

    transform_test = transforms.Compose([
        transforms.Resize(imgsize),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    def label_transform(label, classes):
        ind = label.astype('int')
        res = nd.zeros((ind.shape[0], classes), ctx=label.context)
        res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
        return res

    def test(ctx, val_data):
        metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        num_batch = len(val_data)
        test_loss = 0
        for i, batch in enumerate(val_data):
            data = gluon.utils.split_and_load(
                batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(
                batch[1], ctx_list=ctx, batch_axis=0)
            outputs = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(outputs, label)]
            metric.update(label, outputs)
            test_loss += sum([l.sum().asscalar() for l in loss])
        test_loss /= batch_size * num_batch
        name, val_acc = metric.get()
        return name, val_acc, test_loss

    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if config.train_cfg.param_init:
            init_func = getattr(mx.init, config.train_cfg.init)
            net.initialize(init_func(), ctx=ctx, force_reinit=True)
        else:
            net.load_parameters(config.train_cfg.param_file, ctx=ctx)

        summary(net, stat_name, nd.uniform(
            shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
        # net = nn.HybridBlock()
        net.hybridize()

        root = config.dir_cfg.dataset
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer_arg = {'learning_rate': config.lr_cfg.lr,
                       'wd': config.lr_cfg.wd, 'lr_scheduler': lr_sch}
        extra_arg = eval(config.lr_cfg.extra_arg)
        trainer_arg.update(extra_arg)
        trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
        if config.train_cfg.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if config.data_cfg.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0

        best_val_score = 0

        # print('start training')
        sig_state.emit(1)
        sig_pgbar.emit(0)
        # signal.emit('Training')
        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1
            for i, batch in enumerate(train_data):
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    profiler.set_state('run')
                    is_profiler_run = True
                if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                    sw.add_graph(net)
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not config.data_cfg.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not config.data_cfg.mixup:
                    data = data_1
                    label = label_1
                else:
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if config.train_cfg.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                            # scaled_loss.backward()
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()
                if config.save_cfg.tensorboard:
                    sw.add_scalar(tag='lr', value=trainer.learning_rate,
                                  global_step=iteration)
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    nd.waitall()
                    profiler.set_state('stop')
                    profiler.dump()
                iteration += 1
                sig_pgbar.emit(iteration)
                if check_flag()[0]:
                    sig_state.emit(2)
                while check_flag()[0] or check_flag()[1]:
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')

            epoch_time = time.time() - tic
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, _ = test(ctx, val_data)
            # if config.data_cfg.mixup:
            #     train_history.update([acc, 1-val_acc])
            #     plt.cla()
            #     train_history.plot(save_path='%s/%s_history.png' %
            #                        (plot_name, model_name))
            # else:
            train_history.update([1-train_acc, 1-val_acc])
            plt.cla()
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            name, val_acc, val_loss = test(ctx, val_data)

            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, epoch_time))
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            if config.save_cfg.tensorboard:
                sw._add_scalars(tag='Acc',
                                scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
                sw._add_scalars(tag='Loss',
                                scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)

            sig_table.emit([epoch, train_loss, train_acc,
                            val_loss, val_acc, current_lr, epoch_time])
            csv_writer.writerow([epoch, train_loss, train_acc,
                                 val_loss, val_acc, current_lr, epoch_time])
            csv_file.flush()

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))

    train(num_epochs, context)
    if config.save_cfg.tensorboard:
        sw.close()

    for ctx in context:
        ctx.empty_cache()

    csv_file.close()
    logging.shutdown()
    reload(logging)
    sig_state.emit(0)
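
Example #10 switches between a cosine schedule and a factor schedule, both with a linear warmup whose length is measured in updates. A tiny standalone sketch of the warmup-plus-cosine behaviour (the numbers are illustrative only):

from mxnet import lr_scheduler

sched = lr_scheduler.CosineScheduler(max_update=1000, base_lr=0.1,
                                     final_lr=1e-5, warmup_steps=100)
for num_update in (0, 50, 100, 500, 1000):
    print(num_update, sched(num_update))
# ramps linearly from 0 up to 0.1 over the first 100 updates,
# then decays along a cosine curve towards 1e-5 at update 1000
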
Example #11
def test_lr_scheduler():
    from mxnet import lr_scheduler, optimizer
    scheduler = lr_scheduler.FactorScheduler(base_lr=1, step=250, factor=0.5)
    optim = optimizer.SGD(learning_rate=0.1, lr_scheduler=scheduler)
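
An optimizer built this way can be handed straight to a gluon Trainer, which then takes its rate from the scheduler on every step. A brief self-contained sketch (the one-layer net and dummy data are placeholders, not from the original test):

from mxnet import autograd, gluon, lr_scheduler, nd, optimizer
from mxnet.gluon import nn

sched = lr_scheduler.FactorScheduler(base_lr=1, step=250, factor=0.5)
optim = optimizer.SGD(learning_rate=0.1, lr_scheduler=sched)

net = nn.Dense(1)
net.initialize()
trainer = gluon.Trainer(net.collect_params(), optim)  # Trainer also accepts an Optimizer instance

x, y = nd.random.uniform(shape=(4, 2)), nd.zeros((4, 1))
with autograd.record():
    loss = gluon.loss.L2Loss()(net(x), y)
loss.backward()
trainer.step(4)
print(trainer.learning_rate)  # rate currently dictated by the FactorScheduler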