def train(ctx,
          loss,
          trainer,
          datasetName,
          modelName,
          net,
          train_iter,
          valid_iter,
          num_epochs,
          n_retrain_epoch=0):
    '''
    n_retrain_epoch: resume training starting from epoch n_retrain_epoch (epoch numbering is offset by this value).
    '''
    train_metric = metric.Accuracy()
    train_history = TrainingHistory(['training-error', 'validation-error'])
    best_val_score = 0
    modelDir, resultDir = get_result_dirs(datasetName)
    for epoch in range(num_epochs):
        train_l_batch, start = 0.0, time.time()  # start the epoch timer
        train_metric.reset()
        for X, y in train_iter:
            X = X.as_in_context(ctx)
            y = y.as_in_context(ctx).astype('float32')  # the model's outputs are float32
            with autograd.record():  # record operations for autograd
                outputs = net(X)  # model outputs
                l = loss(outputs, y).mean()  # mean loss over the batch
            l.backward()  # backpropagation
            trainer.step(1)
            train_l_batch += l.asscalar()  # accumulate the per-batch mean losses
            train_metric.update(y, outputs)  # update training accuracy
        _, train_acc = train_metric.get()
        time_s = "time {:.2f} sec".format(time.time() - start)  # stop the epoch timer
        valid_loss = evaluate_loss(valid_iter, net, ctx, loss)  # mean loss on the validation set
        _, val_acc = test(valid_iter, net, ctx)  # accuracy on the validation set
        epoch_s = (
            "epoch {:d}, train loss {:.5f}, valid loss {:.5f}, train acc {:.5f}, valid acc {:.5f}, "
            .format(n_retrain_epoch + epoch, train_l_batch, valid_loss,
                    train_acc, val_acc))
        print(epoch_s + time_s)
        train_history.update([1 - train_acc, 1 - val_acc])  # update the error curves
        train_history.plot(
            save_path=f'{resultDir}/{modelName}_history.png')  # refresh the plot after each epoch
        if val_acc > best_val_score:  # save the best model so far
            best_val_score = val_acc
            net.save_parameters('{}/{:.4f}-{}-{:d}-best.params'.format(
                modelDir, best_val_score, modelName, n_retrain_epoch + epoch))
    return train_history
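This example calls two helpers, evaluate_loss and test, that are defined elsewhere in the source project. Purely as an illustration, a minimal evaluate_loss consistent with how it is called above (assuming valid_iter yields (X, y) batches just like train_iter) could look like this:

def evaluate_loss(data_iter, net, ctx, loss):
    # Hypothetical helper (a sketch, not the original implementation):
    # returns the mean of the per-batch mean losses over data_iter.
    total_loss, n_batches = 0.0, 0
    for X, y in data_iter:
        X = X.as_in_context(ctx)
        y = y.as_in_context(ctx).astype('float32')
        total_loss += loss(net(X), y).mean().asscalar()
        n_batches += 1
    return total_loss / max(n_batches, 1)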
Example #2
# Epochs where learning rate decays
lr_decay_epoch = [30, 60, np.inf]

# Stochastic gradient descent
optimizer = 'sgd'
# Set parameters
optimizer_params = {'learning_rate': 0.001, 'wd': 0.0001, 'momentum': 0.9}

# Define our trainer for net
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

train_metric = mx.metric.Accuracy()

train_history = TrainingHistory(
    ['training-acc', 'val-top1-acc', 'val-top5-acc'])

epochs = 80
lr_decay_count = 0

acc_top1 = mx.metric.Accuracy()
acc_top5 = mx.metric.TopKAccuracy(5)


def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    num_test_iter = len(val_data)
    val_loss_epoch = 0
    for i, batch in enumerate(val_data):
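The excerpt stops at the first line of the validation loop. Based on the matching tail shown in Example #15, the body would typically continue roughly as follows (a sketch under that assumption; ctx is taken to be a list of contexts):

        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        outputs = [net(X) for X in data]
        val_loss_epoch += sum([L(yhat, y).mean().asscalar() for yhat, y in zip(outputs, label)])
        acc_top1.update(label, outputs)
        acc_top5.update(label, outputs)
    _, top1 = acc_top1.get()
    _, top5 = acc_top5.get()
    return (top1, top5, val_loss_epoch / num_test_iter)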
Example #3
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment
    ctx = [mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()]
    # retrieve the hyperparameters we set in notebook (with some defaults)

    #number of training examples utilized in one iteration.
    batch_size = args.batch_size
    #number of times an entire dataset is passed forward and backward through the neural network
    epochs = args.epochs
    #tuning parameter in an optimization algorithm that determines the step size at each iteration while moving toward a minimum of a loss function.
    learning_rate = args.learning_rate
    #Momentum remembers the update Δw at each iteration, and determines the next update as a linear combination of the gradient and the previous update
    momentum = args.momentum
    #Optimizers are algorithms or methods used to change the attributes of your neural network such as weights and learning rate in order to reduce the losses.
    optimizer = args.optimizer
    #after each update, the weights are multiplied by a factor slightly less than 1.
    wd = args.wd
    optimizer_params = {
        'learning_rate': learning_rate,
        'wd': wd,
        'momentum': momentum
    }
    log_interval = args.log_interval

    #In this example, we use the Inflated 3D model (I3D) with a ResNet50 backbone trained on the Kinetics400 dataset. We replace the last classification (dense) layer so that it matches the number of classes in the dataset.
    model_name = args.network
    #number of classes in the dataset
    nclass = 2
    #number of workers for the data loader
    num_workers = 1

    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    CHECKPOINTS_DIR = '/opt/ml/checkpoints'
    checkpoints_enabled = os.path.exists(CHECKPOINTS_DIR)

    data_dir = args.train
    segments = 'rawframes'
    train = 'annotations/{}_train_list_rawframes.txt'.format(args.task)

    #load the data with data loader
    train_data = load_data(data_dir, batch_size, num_workers, segments, train)
    # define the network
    net = define_network(ctx, model_name, nclass)
    #define the gluon trainer
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    #define loss function
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    #define training metric
    train_metric = mx.metric.Accuracy()
    train_history = TrainingHistory(['training-acc'])
    net.hybridize()
    #learning rate decay hyperparameters
    lr_decay_count = 0
    lr_decay = 0.1
    lr_decay_epoch = [40, 80, 100]
    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        train_loss = 0

        # Learning rate decay
        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
            lr_decay_count += 1

        # Loop through each batch of training data
        for i, batch in enumerate(train_data):
            # Extract data and label
            data = split_and_load(batch[0],
                                  ctx_list=ctx,
                                  batch_axis=0,
                                  even_split=False)
            label = split_and_load(batch[1],
                                   ctx_list=ctx,
                                   batch_axis=0,
                                   even_split=False)

            # AutoGrad
            with ag.record():
                output = []
                for _, X in enumerate(data):
                    X = X.reshape((-1, ) + X.shape[2:])
                    pred = net(X)
                    output.append(pred)
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

            # Backpropagation
            for l in loss:
                l.backward()

            # Optimize
            trainer.step(batch_size)

            # Update metrics
            train_loss += sum([l.mean().asscalar() for l in loss])
            train_metric.update(label, output)

            if i == 100:
                break

        name, acc = train_metric.get()

        # Update history and print metrics
        train_history.update([acc])
        print('[Epoch %d] train=%f loss=%f time: %f' %
              (epoch, acc, train_loss / (i + 1), time.time() - tic))

    print('saving the model')
    save(net, model_dir)
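A script like this is normally launched by an argparse entrypoint that pulls its defaults from the SageMaker container environment. The following is only a sketch of what that glue could look like; the SM_* environment variables are the standard SageMaker ones, and every default value here is an assumption, not taken from the original example:

if __name__ == '__main__':
    import argparse
    import os

    parser = argparse.ArgumentParser()
    # hyperparameters (illustrative defaults)
    parser.add_argument('--batch-size', type=int, default=8)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--learning-rate', type=float, default=0.001)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--optimizer', type=str, default='sgd')
    parser.add_argument('--wd', type=float, default=0.0001)
    parser.add_argument('--log-interval', type=int, default=10)
    parser.add_argument('--network', type=str, default='i3d_resnet50_v1_kinetics400')
    parser.add_argument('--task', type=str, default='ucf101')
    # SageMaker environment (assumed standard SM_* variables)
    parser.add_argument('--current-host', type=str,
                        default=os.environ.get('SM_CURRENT_HOST', 'localhost'))
    parser.add_argument('--hosts', type=str,
                        default=os.environ.get('SM_HOSTS', '["localhost"]'))
    parser.add_argument('--model-dir', type=str,
                        default=os.environ.get('SM_MODEL_DIR', '/opt/ml/model'))
    parser.add_argument('--train', type=str,
                        default=os.environ.get('SM_CHANNEL_TRAIN', '/opt/ml/input/data/train'))

    train(parser.parse_args())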
Example #4
    return data, label


# Learning rate decay factor
lr_decay = opt.lr_decay
# Epochs where learning rate decays
lr_decay_epoch = opt.lr_decay_epoch

# Define our trainer for net
#trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

train_metric = mx.metric.Accuracy()

train_history = TrainingHistory(
    ['training-acc', 'val-top1-acc', 'val-top5-acc'])

lr_decay_count = 0
best_val_score = 0

acc_top1 = mx.metric.Accuracy()
acc_top5 = mx.metric.TopKAccuracy(5)

#perclip_time = []


def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    num_test_iter = len(val_data)
Example #5
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

################################################################
# In order to optimize our model, we need a loss function.
# For classification tasks, we usually use softmax cross entropy as the
# loss function.

loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

################################################################
# For simplicity, we use accuracy as the metric to monitor our training
# process. Besides, we record metric values, and will print them at the
# end of training.

train_metric = mx.metric.Accuracy()
train_history = TrainingHistory(['training-acc'])

################################################################
# Training
# --------
#
# After all the preparations, we can finally start training!
# Following is the script.
#
# .. note::
#   In order to finish the tutorial quickly, we only fine tune for 3 epochs, and 100 iterations per epoch for UCF101.
#   In your experiments, you can set the hyper-parameters depending on your dataset.

epochs = 3
lr_decay_count = 0
Example #6
def takeT(X, T=0):
    #idx = nd.array(nd.arange(T,opt.new_length,2),ctx=ctx[0])
    idx = nd.array([2 * n + T for n in range(16)], ctx=ctx[0])
    return nd.take(X.astype(opt.dtype, copy=False), idx, axis=3)


if opt.use_take:
    print('==============================================', opt.new_length)
    print([2 * n + 0 for n in range(16)])  #nd.arange(0,opt.new_length,2))
    print([2 * n + opt.predict_T
           for n in range(16)])  #nd.arange(opt.predict_T,opt.new_length,2))
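takeT subsamples every second frame along the temporal axis, starting at offset T. A small, self-contained illustration of the same indexing (using a dummy tensor and hard-coded shapes, purely as an illustrative assumption):

dummy_clip = nd.arange(32).reshape((1, 1, 1, 32))     # pretend axis 3 holds 32 frames
idx_even = nd.array([2 * n + 0 for n in range(16)])   # frames 0, 2, ..., 30
idx_shift = nd.array([2 * n + 1 for n in range(16)])  # frames 1, 3, ..., 31 (T = 1)
even_frames = nd.take(dummy_clip, idx_even, axis=3)
shifted_frames = nd.take(dummy_clip, idx_shift, axis=3)
print(even_frames.shape, shifted_frames.shape)        # both (1, 1, 1, 16)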

train_metric = mx.metric.Accuracy()

train_history = TrainingHistory([
    'training-acc', 'val-top1-acc', 'val-top5-acc', 'training-loss',
    'cross-loss', 'mse-loss', 'pre-loss'
])

lr_decay_count = 0
best_val_score = 0

acc_top1 = mx.metric.Accuracy()
acc_top5 = mx.metric.TopKAccuracy(5)


def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    #L = gluon.loss.SoftmaxCrossEntropyLoss()
    #L2 = gluon.loss.L2Loss(weight=1.0)
    #L2.initialize()
Example #7
    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.Xavier(), ctx=ctx)

        # if opt.print_tensor_shape and rank == 0:
        #     print(net)

        train_dataset = gluon.data.vision.CIFAR100(train=True).transform_first(transform_train)

        train_data = gluon.data.DataLoader(
            train_dataset,
            sampler=SplitSampler(len(train_dataset), num_parts=num_workers, part_index=rank),
            batch_size=batch_size, last_batch='discard', num_workers=opt.num_workers)

        # val_dataset = gluon.data.vision.CIFAR100(train=False).transform_first(transform_test)
        # val_data = gluon.data.DataLoader(
        #     val_dataset,
        #     sampler=SplitSampler(len(val_dataset), num_parts=num_workers, part_index=rank),
        #     batch_size=batch_size, num_workers=opt.num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR100(train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=opt.num_workers)

        hvd.broadcast_parameters(net.collect_params(), root_rank=0)

        trainer = QSparseLocalSGDTrainerV1(
            net.collect_params(),  
            'nag', optimizer_params, 
            input_sparse_ratio=1./opt.input_sparse, 
            output_sparse_ratio=1./opt.output_sparse, 
            layer_sparse_ratio=1./opt.layer_sparse,
            local_sgd_interval=opt.local_sgd_interval)

        # trainer = gluon.Trainer(net.collect_params(), optimizer,
                                # {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum})
        
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        train_history = TrainingHistory(['training-error', 'validation-error'])

        iteration = 0
        lr_decay_count = 0

        best_val_score = 0

        lr = opt.lr

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1

            if epoch == lr_decay_epoch[lr_decay_count]:
                lr *= lr_decay
                trainer.set_learning_rate(lr)
                lr_decay_count += 1

            for i, batch in enumerate(train_data):
                data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
                label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

                for l in loss:
                    l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                train_metric.update(label, output)
                name, acc = train_metric.get()
                iteration += 1

            mx.nd.waitall()
            toc = time.time()
            
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            # name, val_acc = test(ctx, val_data)

            trainer.pre_test()
            name, val_acc = test(ctx, val_data)
            trainer.post_test()
            
            train_history.update([1-acc, 1-val_acc])
            # train_history.plot(save_path='%s/%s_history.png'%(plot_path, model_name))

            # allreduce the results
            allreduce_array_nd = mx.nd.array([train_loss, acc, val_acc])
            hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
            allreduce_array_np = allreduce_array_nd.asnumpy()
            train_loss = np.asscalar(allreduce_array_np[0])
            acc = np.asscalar(allreduce_array_np[1])
            val_acc = np.asscalar(allreduce_array_np[2])

            if val_acc > best_val_score:
                best_val_score = val_acc
                # net.save_parameters('%s/%.4f-cifar-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))

            if rank == 0:
                logging.info('[Epoch %d] train=%f val=%f loss=%f comm=%.2f time: %f' %
                    (epoch, acc, val_acc, train_loss, trainer._comm_counter/1e6, toc-tic))

                if save_period and save_dir and (epoch + 1) % save_period == 0:
                    net.save_parameters('%s/cifar10-%s-%d.params'%(save_dir, model_name, epoch))

            trainer._comm_counter = 0.

        if rank == 0:
            if save_period and save_dir:
                net.save_parameters('%s/cifar10-%s-%d.params'%(save_dir, model_name, epochs-1))
Example #8
    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if config.train_cfg.param_init:
            init_func = getattr(mx.init, config.train_cfg.init)
            net.initialize(init_func(), ctx=ctx, force_reinit=True)
        else:
            net.load_parameters(config.train_cfg.param_file, ctx=ctx)

        summary(net, stat_name, nd.uniform(
            shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
        # net = nn.HybridBlock()
        net.hybridize()

        root = config.dir_cfg.dataset
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer_arg = {'learning_rate': config.lr_cfg.lr,
                       'wd': config.lr_cfg.wd, 'lr_scheduler': lr_sch}
        extra_arg = eval(config.lr_cfg.extra_arg)
        trainer_arg.update(extra_arg)
        trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
        if config.train_cfg.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if config.data_cfg.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0

        best_val_score = 0

        # print('start training')
        sig_state.emit(1)
        sig_pgbar.emit(0)
        # signal.emit('Training')
        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1
            for i, batch in enumerate(train_data):
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    profiler.set_state('run')
                    is_profiler_run = True
                if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                    sw.add_graph(net)
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not config.data_cfg.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not config.data_cfg.mixup:
                    data = data_1
                    label = label_1
                else:
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if config.train_cfg.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                            # scaled_loss.backward()
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()
                if config.save_cfg.tensorboard:
                    sw.add_scalar(tag='lr', value=trainer.learning_rate,
                                  global_step=iteration)
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    nd.waitall()
                    profiler.set_state('stop')
                    profiler.dump()
                iteration += 1
                sig_pgbar.emit(iteration)
                if check_flag()[0]:
                    sig_state.emit(2)
                while(check_flag()[0] or check_flag()[1]):
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')

            epoch_time = time.time() - tic
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, _ = test(ctx, val_data)
            # if config.data_cfg.mixup:
            #     train_history.update([acc, 1-val_acc])
            #     plt.cla()
            #     train_history.plot(save_path='%s/%s_history.png' %
            #                        (plot_name, model_name))
            # else:
            train_history.update([1-train_acc, 1-val_acc])
            plt.cla()
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            name, val_acc, val_loss = test(ctx, val_data)

            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, epoch_time))
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            if config.save_cfg.tensorboard:
                sw._add_scalars(tag='Acc',
                                scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
                sw._add_scalars(tag='Loss',
                                scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)

            sig_table.emit([epoch, train_loss, train_acc,
                            val_loss, val_acc, current_lr, epoch_time])
            csv_writer.writerow([epoch, train_loss, train_acc,
                                 val_loss, val_acc, current_lr, epoch_time])
            csv_file.flush()

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))
Example #9
    def train(ctx, batch_size):
        #net.initialize(mx.init.Xavier(), ctx=ctx)
        train_data = DataLoader(ImageDataset(root=default.dataset_path, train=True), \
                                batch_size=batch_size,shuffle=True,num_workers=num_workers)
        val_data = DataLoader(ImageDataset(root=default.dataset_path, train=False), \
                              batch_size=batch_size, shuffle=True,num_workers=num_workers)

        # lr_epoch = [int(epoch) for epoch in args.lr_step.split(',')]
        net.collect_params().reset_ctx(ctx)
        lr = args.lr
        end_lr = args.end_lr
        lr_decay = args.lr_decay
        lr_decay_step = args.lr_decay_step
        all_step = len(train_data)
        schedule = mx.lr_scheduler.FactorScheduler(step=lr_decay_step *
                                                   all_step,
                                                   factor=lr_decay,
                                                   stop_factor_lr=end_lr)
        adam_optimizer = mx.optimizer.Adam(learning_rate=lr,
                                           lr_scheduler=schedule)
        trainer = gluon.Trainer(net.collect_params(), optimizer=adam_optimizer)

        train_metric = CtcMetrics()
        train_history = TrainingHistory(['training-error', 'validation-error'])

        iteration = 0
        best_val_score = 0

        save_period = args.save_period
        save_dir = args.save_dir
        model_name = args.prefix
        plot_path = args.save_dir
        epochs = args.end_epoch
        frequent = args.frequent
        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            train_loss = 0
            num_batch = 0
            tic_b = time.time()
            for datas, labels in train_data:
                data = gluon.utils.split_and_load(nd.array(datas),
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)
                label = gluon.utils.split_and_load(nd.array(labels),
                                                   ctx_list=ctx,
                                                   batch_axis=0,
                                                   even_split=False)
                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                train_metric.update(label, output)
                name, acc = train_metric.get()
                iteration += 1
                num_batch += 1
                if num_batch % frequent == 0:
                    train_loss_b = train_loss / (batch_size * num_batch)
                    logging.info(
                        '[Epoch %d] [num_batch %d] train_acc=%f loss=%f time/batch: %f'
                        % (epoch, num_batch, acc, train_loss_b,
                           (time.time() - tic_b) / num_batch))
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            name, val_acc = test(val_data, ctx)
            train_history.update([1 - acc, 1 - val_acc])
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_path, model_name))
            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters(
                    '%s/%.4f-crnn-%s-%d-best.params' %
                    (save_dir, best_val_score, model_name, epoch))
            logging.info('[Epoch %d] train=%f val=%f loss=%f time: %f' %
                         (epoch, acc, val_acc, train_loss, time.time() - tic))

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                symbol_file = os.path.join(save_dir, model_name)
                net.export(path=symbol_file, epoch=epoch)
                # net.save_parameters('%s/crnn-%s-%d.params' % (save_dir, model_name, epoch))

        if save_period and save_dir:
            symbol_file = os.path.join(save_dir, model_name)
            net.export(path=symbol_file, epoch=epoch - 1)
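Note that loss_fn is used in the loop above but never defined in this excerpt. For a CRNN evaluated with CtcMetrics, a plausible choice (an assumption, not confirmed by the source) would be Gluon's CTC loss:

# Assumed, not shown in the excerpt:
loss_fn = gluon.loss.CTCLoss(layout='NTC', label_layout='NT')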
Example #10
def train(net, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(initializer, ctx=ctx)

    val_dataloader = get_dataloader(DatasetSplit(train=False),
                                    batch_size=100,
                                    train=False)

    metric = mx.metric.Accuracy()
    train_metric = mx.metric.Accuracy()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    if use_pillars:
        plc_loss_fn = gluon.loss.L2Loss(weight=w1)
    if use_cpl:
        loss_fn_cpl = gluon.loss.L2Loss(weight=w2)
    train_history = TrainingHistory(['training-error', 'validation-error'])
    timestr = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    param_dir = os.path.join(save_dir, name, timestr)
    if not os.path.exists(param_dir):
        os.makedirs(param_dir)

    param_file_fmt = '%s/cifar10_%s_%d-%d-%d.params'
    training_record_fmt = '[Session %d, Epoch %d] train=%.4f val=%.4f loss=%.4f '
    if use_pillars:
        training_record_fmt += 'plc-loss=%.4f '
    training_record_fmt += 'time: %.2f'

    prev_dataloader, prev_dataset, prev_pillarset, pillarset = None, None, None, None
    record_acc = dict()

    for sess in range(sessions):

        record_acc[sess] = list()
        logging.info("[Session %d] begin training ..." % (sess + 1))
        if sess == 0 and opt.resume_s1:
            _, val_acc = test(net, ctx, val_dataloader)
            record_acc[sess].append(val_acc)
            logging.info('session 1 test acc : %.4f' % val_acc)
            prev_dataset = DatasetSplit(split_id=sess, train=True)
            prev_dataloader = get_dataloader(prev_dataset,
                                             batch_sizes[sess],
                                             train=True)
            continue

        train_dataset = DatasetSplit(split_id=sess, train=True)
        lr_decay_count, best_val_score = 0, 0

        if sess != 0:
            # Sampling data for continuous training
            logging.info(
                "[Session %d] sampling training data and pillars ..." %
                (sess + 1))
            dataloader = get_dataloader(train_dataset,
                                        batch_size=100,
                                        train=False)
            train_dataset = data_sampler.sample_dataset(train_dataset,
                                                        dataloader,
                                                        net,
                                                        loss_fn,
                                                        num_data_samples,
                                                        ctx=ctx)
            if cumulative:
                train_dataset = merge_datasets(prev_dataset, train_dataset)

        train_dataloader = get_dataloader(train_dataset,
                                          batch_sizes[sess],
                                          train=True)
        # Build trainer for net.
        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                optimizer_params[sess])

        for epoch in range(epochs[sess]):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss, train_plc_loss = 0, 0
            num_batch = len(train_dataloader)

            if epoch == lr_decay_epochs[sess][lr_decay_count]:
                trainer.set_learning_rate(trainer.learning_rate * lr_decay)
                lr_decay_count += 1

            for i, batch in enumerate(train_dataloader):
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
                label = gluon.utils.split_and_load(batch[1],
                                                   ctx_list=ctx,
                                                   batch_axis=0)
                all_loss = list()
                with ag.record():
                    output = [net(X)[1] for X in data]
                    output_feat = [net(X)[0] for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                    all_loss.extend(loss)
                    # Normalize each loss for the trainer with batch_size=1
                    all_loss = [nd.mean(l) for l in all_loss]

                ag.backward(all_loss)
                trainer.step(1, ignore_stale_grad=True)
                train_loss += sum([l.sum().asscalar() for l in loss])
                if sess > 0 and use_pillars:
                    train_plc_loss += sum(
                        [al.mean().asscalar() for al in plc_loss])

                train_metric.update(label, output)

            train_loss /= batch_sizes[sess] * num_batch
            _, acc = train_metric.get()
            _, val_acc = test(net, ctx, val_dataloader)
            train_history.update([1 - acc, 1 - val_acc])
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_path, model_name))
            if epoch >= epochs[sess] - 5:
                record_acc[sess].append(val_acc)

            training_record = [sess + 1, epoch, acc, val_acc, train_loss]
            if use_pillars:
                training_record += [train_plc_loss]
            training_record += [time.time() - tic]
            logging.info(training_record_fmt % tuple(training_record))

            net.save_parameters(
                param_file_fmt %
                (param_dir, model_name, sess, epochs[sess], epoch))
        prev_dataset = train_dataset
        prev_dataloader = train_dataloader
        prev_pillarset = pillarset
        if sess == 0 or sess == 1:
            save_data = get_dataloader(DatasetSplit(split_id=0, train=True),
                                       batch_size=10000,
                                       train=True)
            for i, batch in enumerate(save_data):
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
                label = gluon.utils.split_and_load(batch[1],
                                                   ctx_list=ctx,
                                                   batch_axis=0)
                outputs = net(data[0])[0]
                np.save('session{}_feats.npy'.format(sess), outputs.asnumpy())
                np.save('session{}_label.npy'.format(sess), label[0].asnumpy())

    for i in range(len(list(record_acc.keys()))):
        mean = np.mean(np.array(record_acc[i]))
        std = np.std(np.array(record_acc[i]))
        print('[Sess %d] Mean=%f Std=%f' % (i + 1, mean, std))
Example #11
def train(epochs, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(), ctx=ctx)

    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)

    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum})
    metric = mx.metric.Accuracy()
    train_metric = mx.metric.RMSE()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)
    train_history = TrainingHistory(['training-error', 'validation-error'])

    iteration = 0
    lr_decay_count = 0

    best_val_score = 0

    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        metric.reset()
        train_loss = 0
        num_batch = len(train_data)
        alpha = 1

        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate*lr_decay)
            lr_decay_count += 1

        for i, batch in enumerate(train_data):
            lam = np.random.beta(alpha, alpha)
            if epoch >= epochs - 50:
                lam = 1

            data_1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label_1 = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

            data = [lam*X + (1-lam)*X[::-1] for X in data_1]
            label = []
            for Y in label_1:
                y1 = label_transform(Y, classes)
                y2 = label_transform(Y[::-1], classes)
                label.append(lam*y1 + (1-lam)*y2)

            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])

            output_softmax = [nd.SoftmaxActivation(out) for out in output]
            train_metric.update(label, output_softmax)
            name, acc = train_metric.get()
            iteration += 1

        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        name, val_acc = test(ctx, val_data)
        train_history.update([acc, 1-val_acc])
        train_history.plot(save_path='%s/%s_history.png'%(plot_name, model_name))

        if val_acc > best_val_score and epoch > 200:
            best_val_score = val_acc
            net.save_params('%s/%.4f-imagenet-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))

        name, val_acc = test(ctx, val_data)
        logging.info('[Epoch %d] train=%f val=%f loss=%f time: %f' %
            (epoch, acc, val_acc, train_loss, time.time()-tic))

        if save_period and save_dir and (epoch + 1) % save_period == 0:
            net.save_params('%s/cifar10-%s-%d.params'%(save_dir, model_name, epoch))

    if save_period and save_dir:
        net.save_params('%s/cifar10-%s-%d.params'%(save_dir, model_name, epochs-1))
Example #12
    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        root = os.path.join('..', 'datasets', 'cifar-10')
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                {'learning_rate': opt.lr, 'wd': opt.wd,
                                 'momentum': opt.momentum, 'lr_scheduler': lr_sch})
        if opt.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if opt.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0

        best_val_score = 0

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1

            for i, batch in enumerate(train_data):
                if epoch == 0 and iteration == 1 and opt.profile_mode:
                    profiler.set_state('run')
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not opt.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not opt.mixup:
                    data = data_1
                    label = label_1
                else:
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if opt.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                            # scaled_loss.backward()
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()
                sw.add_scalar(tag='lr', value=trainer.learning_rate,
                              global_step=iteration)
                if epoch == 0 and iteration == 1 and opt.profile_mode:
                    nd.waitall()
                    profiler.set_state('stop')
                iteration += 1

            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, _ = test(ctx, val_data)
            if opt.mixup:
                train_history.update([acc, 1-val_acc])
                plt.cla()
                train_history.plot(save_path='%s/%s_history.png' %
                                   (plot_name, model_name))
            else:
                train_history.update([1-train_acc, 1-val_acc])
                plt.cla()
                train_history.plot(save_path='%s/%s_history.png' %
                                   (plot_name, model_name))
            # acc_history.update([train_acc, val_acc])
            # plt.cla()
            # acc_history.plot(save_path='%s/%s_acc.png' %
            #                  (plot_name, model_name), legend_loc='best')

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            name, val_acc, val_loss = test(ctx, val_data)
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, time.time()-tic))
            sw._add_scalars(tag='Acc',
                            scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
            sw._add_scalars(tag='Loss',
                            scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)
            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))
Example #13
clothes_dict = {"jeans": 0, "dress": 1, "shirt": 2}

context = mx.gpu()

args = parse_args()

net = mnet.fashion_net_2_branches(len(clothes_dict), len(color_dict), context)
if args.resume.strip():
    net.load_parameters(args.resume.strip())

net.hybridize()

train_metric_clothes = mx.metric.Accuracy()
train_metric_colors = mx.metric.Accuracy()
train_history = TrainingHistory([
    'train-clothes-error', 'train-color-error', 'train-error-avg',
    'test-clothes-error', 'test-color-error', 'test-error-avg'
])


def validate(ctx, val_loader):
    clothes_metric = mx.metric.Accuracy()
    color_metric = mx.metric.Accuracy()
    for i, batch in enumerate(val_loader):
        label_clothes = batch[2].as_in_context(ctx)
        label_color = batch[1].as_in_context(ctx)
        outputs = net(batch[0].as_in_context(ctx))
        clothes_metric.update(label_clothes, outputs[0])
        color_metric.update(label_color, outputs[1])
    _, clothes_acc = clothes_metric.get()
    _, color_acc = color_metric.get()
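The listing ends before validate returns; given the six error series declared for train_history above, the missing tail and the per-epoch bookkeeping would plausibly look like this (a sketch, not the original code):

    return clothes_acc, color_acc

# Hypothetical per-epoch update of the six declared series:
# _, train_clothes_acc = train_metric_clothes.get()
# _, train_color_acc = train_metric_colors.get()
# val_clothes_acc, val_color_acc = validate(context, val_loader)
# train_history.update([1 - train_clothes_acc, 1 - train_color_acc,
#                       1 - (train_clothes_acc + train_color_acc) / 2,
#                       1 - val_clothes_acc, 1 - val_color_acc,
#                       1 - (val_clothes_acc + val_color_acc) / 2])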
Example #14
def train(args):
    _ctx = mx.gpu(args.gpu)
    # get data
    train_loader, val_loader = get_data(args)
    # get net
    net = get_net(_ctx, args)

    # optimizer
    # lr_decay = 0.1
    # lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.strip().split(',')]
    # optimizer = 'sgd'
    # optimizer_params = {'learning_rate': args.lr, 'wd':args.wd, 'momentum': args.momentum}
    # trainer = mx.gluon.Trainer(net.collect_params(), optimizer=optimizer, optimizer_params=optimizer_params)
    if 'cos' in args.lr_type:
        lr_sch = mx.lr_scheduler.CosineScheduler(
            (args.epochs - 3) * len(train_loader), args.lr, 1e-6)
        trainer = mx.gluon.Trainer(
            net.collect_params(),
            optimizer='adam',
            optimizer_params={
                'learning_rate': args.lr,
                'wd': args.wd,
                'lr_scheduler': lr_sch
            },
        )
    else:
        trainer = mx.gluon.Trainer(
            net.collect_params(),
            optimizer='adam',
            optimizer_params={
                'learning_rate': args.lr,
                'wd': args.wd
            },
        )

    # train
    pitch_metric_loss = mx.metric.Loss()
    yaw_metric_loss = mx.metric.Loss()
    roll_metric_loss = mx.metric.Loss()
    train_history = TrainingHistory([
        'train-pitch', 'train-yaw', 'train-roll', 'val-pitch', 'val-yaw',
        'val-roll'
    ])
    mae_history = TrainingHistory([
        'train-pitch', 'train-yaw', 'train-roll', 'train-mae', 'val-pitch',
        'val-yaw', 'val-roll', 'val-mae'
    ])
    best_mae, best_epoch = np.inf, 0

    for epoch in range(args.epochs):
        tic = time.time()
        btic = time.time()
        pitch_metric_loss.reset()
        yaw_metric_loss.reset()
        roll_metric_loss.reset()
        total, pitch_mae, yaw_mae, roll_mae = 0, 0, 0, 0

        # if epoch in lr_decay_epoch:
        #     trainer.set_learning_rate(trainer.learning_rate*lr_decay)

        for i, batch in enumerate(train_loader):
            data = batch[0].as_in_context(_ctx)
            bin_label = batch[1].as_in_context(_ctx)
            cont_label = batch[2].as_in_context(_ctx)
            total += len(cont_label)
            with mx.autograd.record():
                outputs = net(data)
                loss_pyr, mae = cal_loss(outputs, bin_label, cont_label, _ctx,
                                         args)

            mx.autograd.backward([*loss_pyr])
            trainer.step(args.bs)
            pitch_metric_loss.update(0, loss_pyr[0])
            yaw_metric_loss.update(0, loss_pyr[1])
            roll_metric_loss.update(0, loss_pyr[2])
            pitch_mae += mae[0]
            yaw_mae += mae[1]
            roll_mae += mae[2]

            if not (i + 1) % args.log_interval:
                sp = args.bs * args.log_interval / (time.time() - btic)
                train_loss = (pitch_metric_loss.get()[1],
                              yaw_metric_loss.get()[1],
                              roll_metric_loss.get()[1])
                print(
                    'Epoch[%03d] Batch[%03d/%03d] Speed: %.2f samples/sec, Loss:(pitch, yaw, roll)/(%.3f, %.3f, %.3f)'
                    % (epoch, i, len(train_loader), sp, *train_loss))
                btic = time.time()

        train_loss = (pitch_metric_loss.get()[1], yaw_metric_loss.get()[1],
                      roll_metric_loss.get()[1])
        mae_ = (pitch_mae / total, yaw_mae / total, roll_mae / total)
        train_mae = (*mae_, sum([*mae_]) / 3)
        val_loss, val_mae = val(net, _ctx, val_loader, args)
        train_history.update([*train_loss, *val_loss])
        mae_history.update([*train_mae, *val_mae])
        print(
            'Epoch[%03d] train: MAE:(pitch, yaw, roll, mean)/(%.3f, %.3f, %.3f, %.3f), Cost=%d sec, lr=%f'
            % (epoch, *train_mae, time.time() - tic, trainer.learning_rate))
        print(
            'Epoch[%03d] val  : MAE:(pitch, yaw, roll, mean)/(%.3f, %.3f, %.3f, %.3f), Loss:(pitch, yaw, roll)/(%.3f, %.3f, %.3f)'
            % (epoch, *val_mae, *val_loss))

        # if (epoch+1)%2==0:
        #     print('save model!')
        #     net.export('%s/pose'%(save_root), epoch=epoch)
        if val_mae[3] < best_mae:
            print('Min val mean MAE! save model!')
            best_mae = val_mae[3]
            best_epoch = epoch
            net.export('%s/best_pose' % (save_root), epoch=0)

    print('\n' * 2 + 'Min mean MAE: %.3f, Epoch: %.3f' %
          (best_mae, best_epoch))
Example #15
        label   = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        outputs = [net(X) for X in data]
        acc_top1.update(label, outputs)
        acc_top5.update(label, outputs)
    _, top1 = acc_top1.get()
    _, top5 = acc_top5.get()
    return (top1, top5)


# In[ ]:


epochs = 120
lr_decay_count = 0
train_metric = mx.metric.Accuracy()
train_history = TrainingHistory(['training-error', 'validation-error'])
train_history2 = TrainingHistory(['training-acc', 'val-acc-top1', 'val-acc-top5'])

print("Training loop started for {} epochs:".format(epochs))
for epoch in range(epochs):
    tic = time.time()
    train_metric.reset()
    train_loss = 0

#     # Learning rate decay
#     if epoch == lr_decay_epoch[lr_decay_count]:
#         trainer.set_learning_rate(trainer.learning_rate*lr_decay)
#         lr_decay_count += 1

    # Loop through each batch of training data
    for i, batch in enumerate(train_data):
Example #16
    def _create_train_history(self):
        ems = list(self.metric_list)
        ths = []
        for i in ems:
            ths.append(TrainingHistory([i, i + "_val"]))
        return tuple(ths)
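A hedged usage sketch of the helper above, assuming self.metric_list holds metric names such as 'acc' and 'loss' (the names and values below are illustrative only):

# Standalone demo of the same pattern (hypothetical values):
metric_list = ['acc', 'loss']
histories = tuple(TrainingHistory([m, m + "_val"]) for m in metric_list)
acc_history, loss_history = histories
acc_history.update([0.91, 0.88])    # one epoch of 'acc' and 'acc_val'
loss_history.update([0.35, 0.42])   # one epoch of 'loss' and 'loss_val'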
Example #17
elif model_name.startswith('resnext'):
    kwargs['use_se'] = opt.use_se

optimizer = 'nag'
optimizer_params = {
    'learning_rate': opt.lr,
    'wd': opt.wd,
    'momentum': opt.momentum
}

net = get_model(model_name, **kwargs)

acc_top1 = mx.metric.Accuracy()
acc_top5 = mx.metric.TopKAccuracy(5)
train_history = TrainingHistory([
    'training-top1-err', 'training-top5-err', 'validation-top1-err',
    'validation-top5-err'
])

save_frequency = opt.save_frequency
if opt.save_dir and save_frequency:
    save_dir = opt.save_dir
    makedirs(save_dir)
else:
    save_dir = ''
    save_frequency = 0

plot_path = opt.save_plot_dir

normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
jitter_param = 0.0 if model_name.startswith('mobilenet') else 0.4
lighting_param = 0.0 if model_name.startswith('mobilenet') else 0.1
Example #18
#
# -  Metric is how we evaluate model performance. Each metric is related to a
#    specific task, but independent from the model training process.
# -  For classification, we usually only use one loss function to train
#    our model, but we can have several metrics for evaluating
#    performance.
# -  Loss function can be used as a metric, but sometimes its values are hard
#    to interpret. For instance, the concept "accuracy" is
#    easier to understand than "softmax cross entropy"
#
# For simplicity, we use accuracy as the metric to monitor our training
# process. Besides, we record metric values, and will print them at the
# end of training.

train_metric = mx.metric.Accuracy()
train_history = TrainingHistory(['training-error', 'validation-error'])

################################################################
# Validation
# ----------
#
# Validation dataset provides us a way of monitoring the training process.
# We have labels for validation data, but they are held out during training.
# Instead, we use them to evaluate the model's performance on unseen data
# and prevent overfitting.

def test(ctx, val_data):
    metric = mx.metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
Example #19
    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.Xavier(), ctx=ctx)

        if opt.summary:
            summary(net, mx.nd.zeros((1, 3, 32, 32), ctx=ctx[0]))
            sys.exit()

        if opt.dataset == 'cifar10':
            train_data = gluon.data.DataLoader(
                gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
                batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)
            val_data = gluon.data.DataLoader(
                gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
                batch_size=batch_size, shuffle=False, num_workers=num_workers)
        elif opt.dataset == 'cifar100':
            train_data = gluon.data.DataLoader(
                gluon.data.vision.CIFAR100(train=True).transform_first(transform_train),
                batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)
            val_data = gluon.data.DataLoader(
                gluon.data.vision.CIFAR100(train=False).transform_first(transform_test),
                batch_size=batch_size, shuffle=False, num_workers=num_workers)
        else:
            raise ValueError('Unknown Dataset')

        if opt.no_wd and opt.cosine:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

        if opt.label_smoothing or opt.mixup:
            sparse_label_loss = False
        else:
            sparse_label_loss = True

        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)
        train_history = TrainingHistory(['training-error', 'validation-error'])

        iteration = 0
        lr_decay_count = 0

        best_val_score = 0

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)

            if not opt.cosine:
                if epoch == lr_decay_epoch[lr_decay_count]:
                    trainer.set_learning_rate(trainer.learning_rate * lr_decay)
                    lr_decay_count += 1

            for i, batch in enumerate(train_data):
                data_1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

                if opt.mixup:
                    lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                    if (epoch >= epochs - opt.mixup_off_epoch) or not opt.mixup:
                        lam = 1

                    data = [lam * X + (1 - lam) * X[::-1] for X in data_1]

                    if opt.label_smoothing:
                        eta = 0.1
                    else:
                        eta = 0.0
                    label = mixup_transform(label_1, classes, lam, eta)

                elif opt.label_smoothing:
                    hard_label = label_1
                    data = data_1
                    label = smooth(label_1, classes)
                else:
                    # neither mixup nor label smoothing: use the raw batch
                    data = data_1
                    label = label_1

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                if opt.mixup:
                    output_softmax = [nd.SoftmaxActivation(out) for out in output]
                    train_metric.update(label, output_softmax)
                else:
                    if opt.label_smoothing:
                        train_metric.update(hard_label, output)
                    else:
                        train_metric.update(label, output)

                name, acc = train_metric.get()
                iteration += 1

            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            name, val_acc = test(ctx, val_data)
            train_history.update([acc, 1 - val_acc])
            train_history.plot(save_path='%s/%s_history.png' % (plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-%s-best.params' %
                                    (save_dir, best_val_score, model_name))

            name, val_acc = test(ctx, val_data)
            logging.info('[Epoch %d] train=%f val=%f loss=%f lr: %f time: %f' %
                         (epoch, acc, val_acc, train_loss, trainer.learning_rate,
                          time.time() - tic))

        host_name = socket.gethostname()
        with open(opt.dataset + '_' + host_name + '_GPU_' + opt.gpus + '_best_Acc.log', 'a') as f:
            f.write('best Acc: {:.4f}\n'.format(best_val_score))
        print("best_val_score: ", best_val_score)
Example #20
    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if opt.summary:
            net.summary(mx.nd.zeros((1, 3, 32, 32)))

        if opt.dataset == 'cifar10':
            # CIFAR10
            train_data = gluon.data.DataLoader(gluon.data.vision.CIFAR10(
                train=True).transform_first(transform_train),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               last_batch='discard',
                                               num_workers=num_workers)
            val_data = gluon.data.DataLoader(gluon.data.vision.CIFAR10(
                train=False).transform_first(transform_test),
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=num_workers)
        elif opt.dataset == 'cifar100':
            # CIFAR100
            train_data = gluon.data.DataLoader(gluon.data.vision.CIFAR100(
                train=True).transform_first(transform_train),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               last_batch='discard',
                                               num_workers=num_workers)
            val_data = gluon.data.DataLoader(gluon.data.vision.CIFAR100(
                train=False).transform_first(transform_test),
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=num_workers)
        else:
            raise ValueError('Unknown Dataset')

        if optimizer == 'nag':
            trainer = gluon.Trainer(net.collect_params(), optimizer, {
                'learning_rate': opt.lr,
                'wd': opt.wd,
                'momentum': opt.momentum
            })
        elif optimizer == 'adagrad':
            trainer = gluon.Trainer(net.collect_params(), optimizer, {
                'learning_rate': opt.lr,
                'wd': opt.wd
            })
        elif optimizer == 'adam':
            trainer = gluon.Trainer(net.collect_params(), optimizer, {
                'learning_rate': opt.lr,
                'wd': opt.wd
            })
        else:
            raise ValueError('Unknown optimizer')

        metric = mx.metric.Accuracy()
        train_metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        train_history = TrainingHistory(['training-error', 'validation-error'])
        host_name = socket.gethostname()

        iteration = 0
        lr_decay_count = 0

        best_val_score = 0

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1

            if epoch == lr_decay_epoch[lr_decay_count]:
                trainer.set_learning_rate(trainer.learning_rate * lr_decay)
                lr_decay_count += 1

            for i, batch in enumerate(train_data):
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
                label = gluon.utils.split_and_load(batch[1],
                                                   ctx_list=ctx,
                                                   batch_axis=0)
                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                train_metric.update(label, output)
                name, acc = train_metric.get()
                iteration += 1

            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            name, val_acc = test(ctx, val_data)
            train_history.update([1 - acc, 1 - val_acc])
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_path, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                # net.save_parameters('%s/%.4f-cifar-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))
                pass

            logging.info('[Epoch %d] train=%f val=%f loss=%f time: %f' %
                         (epoch, acc, val_acc, train_loss, time.time() - tic))

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                # net.save_parameters('%s/cifar10-%s-%d.params'%(save_dir, model_name, epoch))
                pass

            if epoch == epochs - 1:
                with open(
                        opt.dataset + '_' + host_name + '_GPU_' + opt.gpus +
                        '_best_Acc.log', 'a') as f:
                    f.write('best Acc: {:.4f}\n'.format(best_val_score))

        print("best_val_score: ", best_val_score)
        if save_period and save_dir:
            # net.save_parameters('%s/cifar10-%s-%d.params'%(save_dir, model_name, epochs-1))
            pass
Example #21
    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.Xavier(), ctx=ctx)

        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum})
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)
        train_history = TrainingHistory(['training-error', 'validation-error'])

        iteration = 0
        lr_decay_count = 0

        best_val_score = 0

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1

            if epoch == lr_decay_epoch[lr_decay_count]:
                trainer.set_learning_rate(trainer.learning_rate*lr_decay)
                lr_decay_count += 1

            for i, batch in enumerate(train_data):
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20:
                    lam = 1

                data_1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

                data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                label = []
                for Y in label_1:
                    y1 = label_transform(Y, classes)
                    y2 = label_transform(Y[::-1], classes)
                    label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                name, acc = train_metric.get()
                iteration += 1

            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            name, val_acc = test(ctx, val_data)
            train_history.update([acc, 1-val_acc])
            train_history.plot(save_path='%s/%s_history.png'%(plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))

            logging.info('[Epoch %d] train=%f val=%f loss=%f time: %f' %
                (epoch, acc, val_acc, train_loss, time.time()-tic))

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params'%(save_dir, model_name, epoch))

        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params'%(save_dir, model_name, epochs-1))
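This mixup variant relies on a label_transform helper that is not shown above. A minimal sketch of what it is assumed to do (integer labels to one-hot vectors, so they can be mixed as lam*y1 + (1-lam)*y2), following the GluonCV CIFAR-10 mixup demo:

from mxnet import nd

def label_transform(label, classes):
    # Assumed helper: one-hot encode integer class labels on the label's context.
    ind = label.astype('int')
    res = nd.zeros((ind.shape[0], classes), ctx=label.context)
    res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
    return res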
Example #22
lr_decay_epoch = args.lr_decay_epoch

# Optimizer (e.g. Nesterov accelerated SGD)
optimizer = args.optimizer
# Set parameters
optimizer_params = {
    'learning_rate': args.lr,
    'wd': args.wd,
    'momentum': args.momentum
}

# Define our trainer for net
trainer = gluon.Trainer(params, optimizer, optimizer_params)

train_metric = mx.metric.Accuracy()
train_history = TrainingHistory(['training-error', 'validation-error'])


def test(ctx, val_data):
    metric = mx.metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1],
                                           ctx_list=ctx,
                                           batch_axis=0)
        outputs = [net(X) for X in data]
        probabilities = [gmloss(output, None) for output in outputs]
        metric.update(label, probabilities)
    return metric.get()

Example #23
    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.Xavier(), ctx=ctx)

        train_data = gluon.data.DataLoader(gluon.data.vision.CIFAR10(
            train=True).transform_first(transform_train),
                                           batch_size=batch_size,
                                           shuffle=True,
                                           last_batch='discard',
                                           num_workers=num_workers)

        val_data = gluon.data.DataLoader(gluon.data.vision.CIFAR10(
            train=False).transform_first(transform_test),
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=num_workers)

        trainer = gluon.Trainer(net.collect_params(), optimizer)
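        # Note (assumption): no optimizer_params dict is passed here, so `optimizer` is
        # expected to be either a pre-built mx.optimizer.Optimizer instance or an
        # optimizer name string that falls back on MXNet's default hyperparameters.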
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        train_history = TrainingHistory(['training-error', 'validation-error'])

        iteration = 0
        lr_decay_count = 0

        best_val_score = 0

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)

            if epoch == lr_decay_epoch[lr_decay_count]:
                trainer.set_learning_rate(trainer.learning_rate * lr_decay)
                lr_decay_count += 1

            for i, batch in enumerate(train_data):
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
                label = gluon.utils.split_and_load(batch[1],
                                                   ctx_list=ctx,
                                                   batch_axis=0)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                train_metric.update(label, output)
                name, acc = train_metric.get()
                iteration += 1

            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            name, val_acc = test(ctx, val_data)
            train_history.update([1 - acc, 1 - val_acc])
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_path, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters(
                    '%s/%.4f-cifar-%s-%d-best.params' %
                    (save_dir, best_val_score, model_name, epoch))

            logging.info('[Epoch %d] train=%f val=%f loss=%f time: %f' %
                         (epoch, acc, val_acc, train_loss, time.time() - tic))

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))

        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs - 1))
Example #24
optimizer = 'nadam'
optimizer_params = {
    'learning_rate': 0.001,
    'beta1': 0.9,
    'beta2': 0.999,
    'epsilon': 1e-8,
    'schedule_decay': 0.004
}

# Define our trainer for net
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

train_metric = mx.metric.Accuracy()
train_history = TrainingHistory(['training-error', 'validation-error'])



def test(ctx, val_data):
    metric = mx.metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1],
                                           ctx_list=ctx,
                                           batch_axis=0)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)
    return metric.get()
Example #25
        log.write(sys.argv[arg] + '\n')
if resume_epoch > 0:
    params_f = os.path.join(
        params_path,
        '{}_{:03d}__{}.params'.format(net_name, resume_epoch, model_name))
    if not os.path.isfile(params_f):
        print('Check params path to finetune', params_f)
        raise FileNotFoundError(params_f)
    net.load_parameters(params_f)

optimizer_params = {'wd': wd, 'momentum': momentum, 'learning_rate': lr}
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
train_metric = mx.metric.Accuracy()
L = gluon.loss.SoftmaxCrossEntropyLoss()
# TODO: rewrite the plot-history utility so it can be used remotely without gluoncv
train_history = TrainingHistory(['training-error', 'validation-error'])
print("Batch size", batch_size)
print('Workon dataset_path: ', dataset_path)
print('Model Name: ', model_name)
print('Params saving in: ', params_path)
print('Start training loop')
best_val_acc = 0
save_best_val_acc = False
lr_decay_count = 0
if train:
    assert resume_epoch < epochs, 'Finetuning requires resume_epoch < epochs'
    for epoch in range(resume_epoch, epochs):
        if epoch % lr_decay_interval == 0 and epoch != 0:
            trainer.set_learning_rate(trainer.learning_rate*lr_decay)
            lr_decay_count += 1
Example #26
def make_noise(bs):
    return mx.nd.random_normal(0,
                               1,
                               shape=(bs, 512, 1, 1),
                               ctx=CTX,
                               dtype='float32')


# %% prepare training
logger.info("Prepare training")
if should_use_val:
    history_labels = ['gloss', 'gval_loss', 'dloss', 'dval_loss']
else:
    history_labels = ['gloss', 'dloss']
history = TrainingHistory(labels=history_labels)
loss = WasserSteinLoss()
# scheduler = mx.lr_scheduler.MultiFactorScheduler(step=[100, 150, 170, 200, 300, 310, 320], factor=0.5, base_lr=lr)
trainer_gen = gluon.Trainer(generator.collect_params(),
                            optimizer='rmsprop',
                            optimizer_params={
                                'learning_rate': lr,
                                'epsilon': 1e-13,
                            })
trainer_dis = gluon.Trainer(discriminator.collect_params(),
                            optimizer='rmsprop',
                            optimizer_params={
                                'learning_rate': lr,
                                'epsilon': 1e-11,
                                'clip_weights': 0.01
                            })
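WasserSteinLoss is a custom class that is not defined in this snippet. The following is only a rough sketch of the weight-clipped WGAN critic loss it is assumed to resemble; the class body and the +1/-1 label convention are assumptions, not code from this example:

from mxnet import gluon

class WasserSteinLoss(gluon.loss.Loss):
    # Assumed sketch: critic output multiplied by a +1/-1 label, so minimizing this loss
    # pushes D(real) up and D(fake) down; the Lipschitz constraint comes from weight
    # clipping (clip_weights=0.01 in the discriminator trainer above).
    def __init__(self, weight=1.0, batch_axis=0, **kwargs):
        super(WasserSteinLoss, self).__init__(weight, batch_axis, **kwargs)

    def hybrid_forward(self, F, pred, label):
        return F.mean(pred * label, axis=self._batch_axis, exclude=True)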
Example #27
    logger.info('trainer.patterns: %s.' % opt.train_patterns)
    logger.info('========\n %s' % net.collect_params(opt.train_patterns))
else:
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params, update_on_kvstore=False)

if opt.resume_states != '':
    trainer.load_states(opt.resume_states)

# Define our trainer for net
#trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

train_metric = mx.metric.Accuracy()

train_history = TrainingHistory(['training-acc', 'val-top1-acc', 'val-top5-acc'])

lr_decay_count = 0
best_val_score = 0

acc_top1 = mx.metric.Accuracy()
acc_top5 = mx.metric.TopKAccuracy(5)

def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    num_test_iter = len(val_data)
    val_loss_epoch = 0
    for i, batch in enumerate(val_data):
        data, label = batch_fn(batch, ctx)
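batch_fn is not defined in this fragment; it is assumed to be the usual split-and-load helper, roughly along these lines (a sketch, not the exact code behind this example):

def batch_fn(batch, ctx):
    # Assumed helper: split data and label across the available contexts.
    data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
    label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
    return data, label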
Example #28
    idx = nd.array([2 * n + T for n in range(16)], ctx=ctx[0])
    return nd.take(X, idx, axis=3)


if opt.use_take:
    print('==============================================', opt.new_length)
    print([2 * n + 0 for n in range(16)])  # nd.arange(0, opt.new_length, 2)
    for T in range(opt.predict_T + 1):
        print([2 * n + T for n in range(16)])  # nd.arange(opt.predict_T, opt.new_length, 2)

train_metric = mx.metric.Accuracy()
train_metric_aet = mx.metric.Accuracy()

train_history = TrainingHistory([
    'training-acc', 'val-top1-acc', 'val-top5-acc', 'training-loss',
    'loss_val', 'loss_aets', 'valaet'
])
# train_history.update([acc,acc_top1_val,acc_top5_val,train_loss/(i+1),loss_val,loss_aet,valaet])
lr_decay_count = 0
best_val_score = 0

acc_top1 = mx.metric.Accuracy()
acc_top5 = mx.metric.TopKAccuracy(5)
acc_valAET = mx.metric.Accuracy()


def test(ctx, val_data):
    acc_top1.reset()
    acc_top5.reset()
    acc_valAET.reset()
    L = gluon.loss.SoftmaxCrossEntropyLoss()