Example #1
norm = nd.array(norm, ctx=ctx)
g.ndata['norm'] = norm

########################################################################
# Define your own model here
########################################################################


class Model(gluon.Block):
    pass


model = Model()
model.initialize(ctx=ctx)
trainer = gluon.Trainer(model.collect_params(), 'adam', {
    'learning_rate': 0.01,
    'wd': 5e-4
})
loss_fcn = gluon.loss.SoftmaxCELoss()

feat = feat.as_in_context(ctx)
label = label.as_in_context(ctx)

for epoch in range(200):
    with autograd.record():
        logits = model(g, feat)
        loss = loss_fcn(logits[train_mask],
                        label[train_mask]).sum() / n_train_samples

    loss.backward()
    trainer.step(batch_size=1)
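# --------------------------------------------------------------------------
# A minimal sketch of one possible Model for the model(g, feat) call above.
# It assumes DGL's message-passing API (dgl.function.copy_src / fn.sum) and
# the g.ndata['norm'] values set up earlier; GCNLayer and TwoLayerGCN are
# hypothetical names, not part of the original example.
import dgl.function as fn
from mxnet import gluon, nd
from mxnet.gluon import nn


class GCNLayer(gluon.Block):
    def __init__(self, in_feats, out_feats, activation=None, **kwargs):
        super(GCNLayer, self).__init__(**kwargs)
        self.dense = nn.Dense(out_feats, in_units=in_feats)
        self.activation = activation

    def forward(self, g, h):
        # Symmetric GCN normalization: scale by deg^-1/2 before and after
        # aggregating neighbour features (D^-1/2 A D^-1/2 h W).
        norm = g.ndata['norm'].reshape((-1, 1))
        g.ndata['h'] = h * norm
        g.update_all(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='h'))
        h = g.ndata.pop('h') * norm
        h = self.dense(h)
        return self.activation(h) if self.activation else h


class TwoLayerGCN(gluon.Block):
    """Two stacked GCN layers; a drop-in for the Model stub above."""

    def __init__(self, in_feats, n_hidden, n_classes, **kwargs):
        super(TwoLayerGCN, self).__init__(**kwargs)
        self.layer1 = GCNLayer(in_feats, n_hidden, activation=nd.relu)
        self.layer2 = GCNLayer(n_hidden, n_classes)

    def forward(self, g, feat):
        h = self.layer1(g, feat)
        return self.layer2(g, h)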
Example #2
class Policy(gluon.Block):
    def __init__(self, **kwargs):
        super(Policy, self).__init__(**kwargs)
        with self.name_scope():
            self.dense = nn.Dense(16, in_units=4, activation='relu')
            self.action_pred = nn.Dense(2, in_units=16)
            self.value_pred = nn.Dense(1, in_units=16)

    def forward(self, x):
        x = self.dense(x)
        probs = self.action_pred(x)
        values = self.value_pred(x)
        return F.softmax(probs), values


net = Policy()
net.collect_params().initialize(mx.init.Uniform(0.02))
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 3e-2})
loss = gluon.loss.L1Loss()

running_reward = 10
for epoch in count(1):
    state = env.reset()
    rewards = []
    values = []
    heads = []
    actions = []
    with autograd.record():
        # Sample a sequence of actions
        for t in range(10000):
            state = mx.nd.array(np.expand_dims(state, 0))
            prob, value = net(state)
            action, logp = mx.nd.sample_multinomial(prob, get_prob=True)
Example #3
def train(metric):
    """Training function."""
    if not only_inference:
        logging.info('Now we are doing BERT classification training on %s!',
                     ctx)

    all_model_params = model.collect_params()
    optimizer_params = {'learning_rate': lr, 'epsilon': epsilon, 'wd': 0.01}
    try:
        trainer = gluon.Trainer(all_model_params,
                                args.optimizer,
                                optimizer_params,
                                update_on_kvstore=False)
    except ValueError as e:
        print(e)
        warnings.warn(
            'AdamW optimizer is not found. Please consider upgrading to '
            'mxnet>=1.5.0. Now the original Adam optimizer is used instead.')
        trainer = gluon.Trainer(all_model_params,
                                'adam',
                                optimizer_params,
                                update_on_kvstore=False)
    if args.dtype == 'float16':
        amp.init_trainer(trainer)

    step_size = batch_size * accumulate if accumulate else batch_size
    num_train_steps = int(num_train_examples / step_size * args.epochs)
    warmup_ratio = args.warmup_ratio
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    step_num = 0

    # Do not apply weight decay on LayerNorm and bias terms
    for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    # Collect differentiable parameters
    params = [p for p in all_model_params.values() if p.grad_req != 'null']

    # Set grad_req if gradient accumulation is required
    if accumulate and accumulate > 1:
        for p in params:
            p.grad_req = 'add'
    # track best eval score
    metric_history = []

    tic = time.time()
    for epoch_id in range(args.epochs):
        if not only_inference:
            metric.reset()
            step_loss = 0
            tic = time.time()
            all_model_params.zero_grad()

            for batch_id, seqs in enumerate(train_data):
                # learning rate schedule
                if step_num < num_warmup_steps:
                    new_lr = lr * step_num / num_warmup_steps
                else:
                    non_warmup_steps = step_num - num_warmup_steps
                    offset = non_warmup_steps / (num_train_steps -
                                                 num_warmup_steps)
                    new_lr = lr - offset * lr
                trainer.set_learning_rate(new_lr)

                # forward and backward
                with mx.autograd.record():
                    input_ids, valid_length, type_ids, label = seqs
                    out = model(
                        input_ids.as_in_context(ctx),
                        type_ids.as_in_context(ctx),
                        valid_length.astype('float32').as_in_context(ctx))
                    ls = loss_function(out, label.as_in_context(ctx)).mean()
                    if args.dtype == 'float16':
                        with amp.scale_loss(ls, trainer) as scaled_loss:
                            mx.autograd.backward(scaled_loss)
                    else:
                        ls.backward()

                # update
                if not accumulate or (batch_id + 1) % accumulate == 0:
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(params, 1)
                    trainer.update(accumulate if accumulate else 1)
                    step_num += 1
                    if accumulate and accumulate > 1:
                        # set grad to zero for gradient accumulation
                        all_model_params.zero_grad()

                step_loss += ls.asscalar()
                metric.update([label], [out])
                if (batch_id + 1) % (args.log_interval) == 0:
                    log_train(batch_id, len(train_data), metric, step_loss,
                              args.log_interval, epoch_id,
                              trainer.learning_rate)
                    step_loss = 0
            mx.nd.waitall()

        # inference on dev data
        for segment, dev_data in dev_data_list:
            metric_nm, metric_val = evaluate(dev_data, metric, segment)
            metric_history.append((epoch_id, metric_nm, metric_val))

        if not only_inference:
            # save params
            ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id)
            params_saved = os.path.join(output_dir, ckpt_name)

            nlp.utils.save_parameters(model, params_saved)
            logging.info('params saved in: %s', params_saved)
            toc = time.time()
            logging.info('Time cost=%.2fs', toc - tic)
            tic = toc

    if not only_inference:
        # we choose the best model based on metric[0],
        # assuming higher score stands for better model quality
        metric_history.sort(key=lambda x: x[2][0], reverse=True)
        epoch_id, metric_nm, metric_val = metric_history[0]
        ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id)
        params_saved = os.path.join(output_dir, ckpt_name)
        nlp.utils.load_parameters(model, params_saved)
        metric_str = 'Best model at epoch {}. Validation metrics:'.format(
            epoch_id)
        metric_str += ','.join([i + ':%.4f' for i in metric_nm])
        logging.info(metric_str, *metric_val)

    # inference on test data
    for segment, test_data in test_data_list:
        test(test_data, segment)
Example #4
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    kv = mx.kvstore.create(args.kv_store)
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            net.collect_train_params(
            ),  # fix batchnorm, fix first stage, etc...
            'sgd',
            {
                'learning_rate': args.lr,
                'wd': args.wd,
                'momentum': args.momentum
            })
    else:
        trainer = gluon.Trainer(
            net.collect_train_params(
            ),  # fix batchnorm, fix first stage, etc...
            'sgd',
            {
                'learning_rate': args.lr,
                'wd': args.wd,
                'momentum': args.momentum
            },
            update_on_kvstore=(False if args.amp else None),
            kvstore=kv)

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    # TODO(zhreshold) losses?
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    metrics = [
        mx.metric.Loss('RPN_Conf'),
        mx.metric.Loss('RPN_SmoothL1'),
        mx.metric.Loss('RCNN_CrossEntropy'),
        mx.metric.Loss('RCNN_SmoothL1'),
    ]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    metrics2 = [
        rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric
    ]

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        mix_ratio = 1.0
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        rcnn_task = ForwardBackwardTask(net,
                                        trainer,
                                        rpn_cls_loss,
                                        rpn_box_loss,
                                        rcnn_cls_loss,
                                        rcnn_box_loss,
                                        mix_ratio=1.0)
        executor = Parallel(1 if args.horovod else args.executor_threads,
                            rcnn_task)
        if args.mixup:
            # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise
            train_data._dataset._data.set_mixup(np.random.uniform, 0.5, 0.5)
            mix_ratio = 0.5
            if epoch >= args.epochs - args.no_mixup_epochs:
                train_data._dataset._data.set_mixup(None)
                mix_ratio = 1.0
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        base_lr = trainer.learning_rate
        rcnn_task.mix_ratio = mix_ratio
        for i, batch in enumerate(train_data):
            if epoch == 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info(
                            '[Epoch 0 Iteration {}] Set learning rate to {}'.
                            format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            batch_size = len(batch[0])
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            for data in zip(*batch):
                executor.put(data)
            for j in range(len(ctx)):
                result = executor.get()
                if (not args.horovod) or hvd.rank() == 0:
                    for k in range(len(metric_losses)):
                        metric_losses[k].append(result[k])
                    for k in range(len(add_losses)):
                        add_losses[k].append(result[len(metric_losses) + k])
            for metric, record in zip(metrics, metric_losses):
                metric.update(0, record)
            for metric, records in zip(metrics2, add_losses):
                for pred in records:
                    metric.update(pred[0], pred[1])
            trainer.step(batch_size)

            # update metrics
            if (not args.horovod or hvd.rank() == 0) and args.log_interval \
                    and not (i + 1) % args.log_interval:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metrics + metrics2
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(
                        epoch, i, args.log_interval * args.batch_size /
                        (time.time() - btic), msg))
                btic = time.time()

        if (not args.horovod) or hvd.rank() == 0:
            msg = ','.join(
                ['{}={:.3f}'.format(*metric.get()) for metric in metrics])
            logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
                epoch, (time.time() - tic), msg))
            if not (epoch + 1) % args.val_interval:
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric,
                                             args)
                val_msg = '\n'.join(
                    ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(
                    epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, logger, best_map, current_map, epoch,
                        args.save_interval, args.save_prefix)
        executor.__del__()
Example #5
def train_ResNeXt(net, lr, input_shape, batch_size, train_path, test_path,
                  epoch, ctx):
    train_data, val_data = prepare_data(train_path, test_path, input_shape,
                                        batch_size)

    lr_sched = mx.lr_scheduler.FactorScheduler(step=1000,
                                               factor=0.94,
                                               base_lr=1)
    optim = mx.optimizer.SGD(learning_rate=lr,
                             momentum=0.9,
                             wd=1e-3,
                             lr_scheduler=lr_sched)
    trainer = gluon.Trainer(net.collect_params(), optim)

    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

    train_acc_meter = mx.metric.Accuracy()
    train_loss_meter = mx.metric.CrossEntropy()

    hybridized = False

    with mxboard.SummaryWriter(logdir="./resnext_logs", flush_secs=30) as sw:
        for ep in range(1, epoch + 1):
            #train_data.reset()
            #val_data.reset()
            print("Current Learning Rate {}".format(trainer.learning_rate))
            epoch_start = timeit.default_timer()

            train_acc_meter.reset()
            train_loss_meter.reset()

            for it, (data, label) in enumerate(train_data):
                data = data.as_in_context(ctx)
                label = label.as_in_context(ctx)

                with autograd.record():
                    output = net(data)
                    loss_val = loss_fn(output, label)
                    loss_val.backward()
                trainer.step(data.shape[0])

                train_acc_meter.update(preds=[output], labels=[label])
                train_loss_meter.update(labels=[label],
                                        preds=[nd.softmax(output, axis=1)])

                if it % 10 == 0:
                    print(
                        "Epoch {}, batch {}, train loss {:.4f}, train acc {:.4f}"
                        .format(ep, it,
                                train_loss_meter.get()[1],
                                train_acc_meter.get()[1]))

            epoch_stop = timeit.default_timer()

            val_loss, val_acc = evaluate(val_data, net, ctx)
            print(
                "Epoch {}, Training time {}, validation loss {:.5f}, validation acc {:.5f}"
                .format(ep, epoch_stop - epoch_start, val_loss, val_acc))
            sw.add_scalar(tag="train_loss",
                          value=train_loss_meter.get()[1],
                          global_step=ep)
            sw.add_scalar(tag="train_acc",
                          value=train_acc_meter.get()[1],
                          global_step=ep)
            sw.add_scalar(tag="val_acc", value=val_acc, global_step=ep)
            sw.add_scalar(tag="val_loss", value=val_loss, global_step=ep)
            sw.add_scalar(tag="learning_rate",
                          value=trainer.learning_rate,
                          global_step=ep)
            if not hybridized:
                sw.add_graph(net)
                hybridized = True

            if ep % 1 == 0:
                net.export("resnext_models/resnext", ep)

    return net
Example #6
                step_epoch=lr_decay_epoch,
                step_factor=lr_decay,
                power=2)
])
optimizer_params['lr_scheduler'] = lr_scheduler

if opt.partial_bn:
    train_patterns = None
    if 'inceptionv3' in opt.model:
        train_patterns = '.*weight|.*bias|inception30_batchnorm0_gamma|inception30_batchnorm0_beta|inception30_batchnorm0_running_mean|inception30_batchnorm0_running_var'
    else:
        logger.info(
            'Current model does not support partial batch normalization.')

    trainer = gluon.Trainer(net.collect_params(train_patterns),
                            optimizer,
                            optimizer_params,
                            update_on_kvstore=False)
elif not opt.partial_bn and opt.use_train_patterns:
    logger.info('========\n %s' % net.collect_params())
    trainer = gluon.Trainer(net.collect_params(opt.train_patterns),
                            optimizer,
                            optimizer_params,
                            update_on_kvstore=False)
    logger.info('trainer.patterns: %s.' % opt.train_patterns)
    logger.info('========\n %s' % net.collect_params(opt.train_patterns))
elif opt.use_lateral and not opt.freeze_lateral:
    print("============== use_lateral")
    lst = list(net.collect_params().values()) + list(
        net1.collect_params().values())
    trainer = gluon.Trainer(lst,
                            optimizer,
Example #7
        if not autograd.is_training():
            x = nd.sigmoid(x)
        return x


net = LogisticRegression(in_features, out_features)
net.collect_params().initialize()

# %%
# Loss function: Binary Cross Entropy
loss_fn = gluon.loss.SigmoidBinaryCrossEntropyLoss()

# %%
# Optimizer: Stochastic Gradient Descent
optimizer = mx.optimizer.SGD(learning_rate=LR, wd=0.0, momentum=0.0)
trainer = gluon.Trainer(net.collect_params(), optimizer)

# %%
# Training loop
for epoch in range(EPOCHS):
    with autograd.record(train_mode=True):
        # Compute f(x) = Wx
        y_pred = net(X_train)
        # Compute loss
        loss = loss_fn(y_pred, y_train)
    # Compute dL/dW
    loss.backward()
    # Show intermediate values to screen
    if epoch % 10 == 0:
        log_info(net, loss)
    # Update weights, normalization of grads happens here, not in the loss function
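    # (The snippet is truncated here; a minimal sketch of the update it refers
    # to, assuming full-batch training so gradients are normalized over X_train:)
    trainer.step(batch_size=X_train.shape[0])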
Example #8
    print(X, y)
"""
""" 3.3.3定义模型 """
from mxnet.gluon import nn

net = nn.Sequential()

net.add(nn.Dense(1))
""" 3.3.4初始化模型函数 """
from mxnet import init

net.initialize(init.Normal(sigma=0.01))
""" 3.3.5定义损失函数 """
from mxnet.gluon import loss as gloss

loss = gloss.L2Loss()
""" 3.3.6定义优化算法 """
from mxnet import gluon

trainer = gluon.Trainer(net.collect_params(), "sgd", {"learning_rate": 0.03})
"""3.3.7训练模型 """
num_epochs = 10
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        with autograd.record():
            l = loss(net(X), y)
        l.backward()
        trainer.step(batch_size)
    l = loss(net(features), labels)
    print("epoch %d, loss:%f" % (epoch, l.mean().asnumpy()))
Example #9
def main(args):
    # load and preprocess dataset
    data = load_data(args)
    features = mx.nd.array(data.features)
    labels = mx.nd.array(data.labels)
    train_mask = mx.nd.array(data.train_mask)
    val_mask = mx.nd.array(data.val_mask)
    test_mask = mx.nd.array(data.test_mask)
    in_feats = features.shape[1]
    n_classes = data.num_labels
    n_edges = data.graph.number_of_edges()
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes,
              train_mask.sum().asscalar(),
              val_mask.sum().asscalar(),
              test_mask.sum().asscalar()))

    if args.gpu < 0:
        cuda = False
        ctx = mx.cpu(0)
    else:
        cuda = True
        ctx = mx.gpu(args.gpu)

    features = features.as_in_context(ctx)
    labels = labels.as_in_context(ctx)
    train_mask = train_mask.as_in_context(ctx)
    val_mask = val_mask.as_in_context(ctx)
    test_mask = test_mask.as_in_context(ctx)

    # create GCN model
    g = DGLGraph(data.graph)
    if args.self_loop:
        g.add_edges(g.nodes(), g.nodes())
    # normalization: symmetric GCN scaling, deg^-1/2 at each endpoint (D^-1/2 A D^-1/2)
    degs = g.in_degrees().astype('float32')
    norm = mx.nd.power(degs, -0.5)
    if cuda:
        norm = norm.as_in_context(ctx)
    g.ndata['norm'] = mx.nd.expand_dims(norm, 1)

    model = GCN(g,
                in_feats,
                args.n_hidden,
                n_classes,
                args.n_layers,
                mx.nd.relu,
                args.dropout)
    model.initialize(ctx=ctx)
    n_train_samples = train_mask.sum().asscalar()
    loss_fcn = gluon.loss.SoftmaxCELoss()

    # use optimizer
    print(model.collect_params())
    trainer = gluon.Trainer(model.collect_params(), 'adam',
            {'learning_rate': args.lr, 'wd': args.weight_decay})

    # training loop; dur tracks per-epoch time
    dur = []
    for epoch in range(args.n_epochs):
        if epoch >= 3:
            t0 = time.time()
        # forward
        with mx.autograd.record():
            pred = model(features)
            loss = loss_fcn(pred, labels, mx.nd.expand_dims(train_mask, 1))
            loss = loss.sum() / n_train_samples

        loss.backward()
        trainer.step(batch_size=1)

        if epoch >= 3:
            loss.asscalar()
            dur.append(time.time() - t0)
            acc = evaluate(model, features, labels, val_mask)
            print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
                  "ETputs(KTEPS) {:.2f}". format(
                epoch, np.mean(dur), loss.asscalar(), acc, n_edges / np.mean(dur) / 1000))

    # test set accuracy
    acc = evaluate(model, features, labels, test_mask)
    print("Test accuracy {:.2%}".format(acc))
Example #10
def batchnormalization():
    """
    Batch normalization uses the mean and standard deviation over a mini-batch to continually
    adjust the intermediate outputs of the network, which makes the intermediate output values
    of every layer more stable.
    :return:
    """
    def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
        # Use autograd to determine whether we are in training mode or prediction mode
        if not autograd.is_training():
            # In prediction mode, directly use the moving-average mean and variance passed in
            X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
        else:
            assert len(X.shape) in (2, 4)
            if len(X.shape) == 2:
                # Fully connected layer: compute the mean and variance along the feature dimension
                mean = X.mean(axis=0)
                var = ((X - mean)**2).mean(axis=0)
            else:
                # 2D convolutional layer: compute the mean and variance over the
                # channel dimension (axis=1), keeping the shape of X so that
                # broadcasting works later
                mean = X.mean(axis=(0, 2, 3), keepdims=True)
                var = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True)
            # In training mode, standardize with the current mini-batch mean and variance
            X_hat = (X - mean) / nd.sqrt(var + eps)
            # Update the moving averages of the mean and variance
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
        Y = gamma * X_hat + beta  # Scale and shift
        return Y, moving_mean, moving_var

    class BatchNorm(nn.Block):
        def __init__(self, num_features, num_dims, **kwargs):
            super(BatchNorm, self).__init__(**kwargs)
            if num_dims == 2:
                shape = (1, num_features)
            else:
                shape = (1, num_features, 1, 1)
            # Scale and shift parameters that take part in gradient updates, initialized to 1 and 0
            self.gamma = self.params.get('gamma', shape=shape, init=init.One())
            self.beta = self.params.get('beta', shape=shape, init=init.Zero())
            # Variables that do not take part in gradient updates, initialized to 0 in (CPU) memory
            self.moving_mean = nd.zeros(shape)
            self.moving_var = nd.zeros(shape)

        def forward(self, X):
            # If X is not in (CPU) memory, copy moving_mean and moving_var to the device where X lives
            if self.moving_mean.context != X.context:
                self.moving_mean = self.moving_mean.copyto(X.context)
                self.moving_var = self.moving_var.copyto(X.context)
            # Save the updated moving_mean and moving_var
            Y, self.moving_mean, self.moving_var = batch_norm(
                X,
                self.gamma.data(),
                self.beta.data(),
                self.moving_mean,
                self.moving_var,
                eps=1e-5,
                momentum=0.9)
            return Y

    net = nn.Sequential()
    net.add(nn.Conv2D(6, kernel_size=5), BatchNorm(6, num_dims=4),
            nn.Activation('sigmoid'), nn.MaxPool2D(pool_size=2, strides=2),
            nn.Conv2D(16, kernel_size=5), BatchNorm(16, num_dims=4),
            nn.Activation('sigmoid'), nn.MaxPool2D(pool_size=2, strides=2),
            nn.Dense(120), BatchNorm(120, num_dims=2),
            nn.Activation('sigmoid'), nn.Dense(84), BatchNorm(84, num_dims=2),
            nn.Activation('sigmoid'), nn.Dense(10))

    lr, num_epochs, batch_size, ctx = 1.0, 5, 256, d2l.try_gpu()
    net.initialize(ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)

    return net[1].gamma.data().reshape((-1, )), net[1].beta.data().reshape((-1, ))
Example #11
def resnet():
    """
    Residual blocks use cross-layer data paths to make it possible to train effective deep neural networks.
    :return:
    """
    class Residual(nn.Block):  # This class is saved in the d2lzh package for later use
        def __init__(self,
                     num_channels,
                     use_1x1conv=False,
                     strides=1,
                     **kwargs):
            super(Residual, self).__init__(**kwargs)
            self.conv1 = nn.Conv2D(num_channels,
                                   kernel_size=3,
                                   padding=1,
                                   strides=strides)
            self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1)
            if use_1x1conv:
                self.conv3 = nn.Conv2D(num_channels,
                                       kernel_size=1,
                                       strides=strides)
            else:
                self.conv3 = None
            self.bn1 = nn.BatchNorm()
            self.bn2 = nn.BatchNorm()

        def forward(self, X):
            Y = nd.relu(self.bn1(self.conv1(X)))
            Y = self.bn2(self.conv2(Y))
            if self.conv3:
                X = self.conv3(X)
            return nd.relu(Y + X)

    def resnet_block(num_channels, num_residuals, first_block=False):
        blk = nn.Sequential()
        for i in range(num_residuals):
            if i == 0 and not first_block:
                blk.add(Residual(num_channels, use_1x1conv=True, strides=2))
            else:
                blk.add(Residual(num_channels))
        return blk

    net = nn.Sequential()
    net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3), nn.BatchNorm(),
            nn.Activation('relu'),
            nn.MaxPool2D(pool_size=3, strides=2, padding=1))

    net.add(resnet_block(64, 2, first_block=True), resnet_block(128, 2),
            resnet_block(256, 2), resnet_block(512, 2))

    net.add(nn.GlobalAvgPool2D(), nn.Dense(10))

    X = nd.random.uniform(shape=(1, 1, 224, 224))
    net.initialize()
    for layer in net:
        X = layer(X)
        print(layer.name, 'output shape:\t', X.shape)

    lr, num_epochs, batch_size, ctx = 0.05, 5, 256, d2l.try_gpu()
    net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
Example #12
def googlenet():
    """
    GoogLeNet absorbs the network-in-network idea from NiN and improves on it substantially.
    In the years that followed, researchers refined GoogLeNet several times; this section
    introduces the first version of this model family.
    :return:
    """
    class Inception(nn.Block):
        # c1 - c4 are the numbers of output channels of the layers in each path
        def __init__(self, c1, c2, c3, c4, **kwargs):
            super(Inception, self).__init__(**kwargs)
            # Path 1: a single 1 x 1 convolutional layer
            self.p1_1 = nn.Conv2D(c1, kernel_size=1, activation='relu')
            # Path 2: a 1 x 1 convolutional layer followed by a 3 x 3 convolutional layer
            self.p2_1 = nn.Conv2D(c2[0], kernel_size=1, activation='relu')
            self.p2_2 = nn.Conv2D(c2[1],
                                  kernel_size=3,
                                  padding=1,
                                  activation='relu')
            # Path 3: a 1 x 1 convolutional layer followed by a 5 x 5 convolutional layer
            self.p3_1 = nn.Conv2D(c3[0], kernel_size=1, activation='relu')
            self.p3_2 = nn.Conv2D(c3[1],
                                  kernel_size=5,
                                  padding=2,
                                  activation='relu')
            # Path 4: a 3 x 3 max pooling layer followed by a 1 x 1 convolutional layer
            self.p4_1 = nn.MaxPool2D(pool_size=3, strides=1, padding=1)
            self.p4_2 = nn.Conv2D(c4, kernel_size=1, activation='relu')

        def forward(self, x):
            p1 = self.p1_1(x)
            p2 = self.p2_2(self.p2_1(x))
            p3 = self.p3_2(self.p3_1(x))
            p4 = self.p4_2(self.p4_1(x))
            return nd.concat(p1, p2, p3, p4, dim=1)  # Concatenate the outputs along the channel dimension

    b1 = nn.Sequential()
    b1.add(
        nn.Conv2D(64, kernel_size=7, strides=2, padding=3, activation='relu'),
        nn.MaxPool2D(pool_size=3, strides=2, padding=1))

    b2 = nn.Sequential()
    b2.add(nn.Conv2D(64, kernel_size=1, activation='relu'),
           nn.Conv2D(192, kernel_size=3, padding=1, activation='relu'),
           nn.MaxPool2D(pool_size=3, strides=2, padding=1))

    b3 = nn.Sequential()
    b3.add(Inception(64, (96, 128), (16, 32), 32),
           Inception(128, (128, 192), (32, 96), 64),
           nn.MaxPool2D(pool_size=3, strides=2, padding=1))

    b4 = nn.Sequential()
    b4.add(Inception(192, (96, 208), (16, 48), 64),
           Inception(160, (112, 224), (24, 64), 64),
           Inception(128, (128, 256), (24, 64), 64),
           Inception(112, (144, 288), (32, 64), 64),
           Inception(256, (160, 320), (32, 128), 128),
           nn.MaxPool2D(pool_size=3, strides=2, padding=1))

    b5 = nn.Sequential()
    b5.add(Inception(256, (160, 320), (32, 128), 128),
           Inception(384, (192, 384), (48, 128), 128), nn.GlobalAvgPool2D())

    net = nn.Sequential()
    net.add(b1, b2, b3, b4, b5, nn.Dense(10))

    X = nd.random.uniform(shape=(1, 1, 96, 96))
    net.initialize()
    for layer in net:
        X = layer(X)
        print(layer.name, 'output shape:\t', X.shape)

    lr, num_epochs, batch_size, ctx = 0.1, 5, 128, d2l.try_gpu()
    net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
Example #13
def alexnet():
    """
    AlexNet deep learning example.
    :return:
    """
    def load_data_fashion_mnist(batch_size,
                                resize=None,
                                root=os.path.join('~', '.mxnet', 'datasets',
                                                  'fashion-mnist')):
        root = os.path.expanduser(root)  # Expand the user path '~'
        transformer = []
        if resize:
            transformer += [gdata.vision.transforms.Resize(resize)]
        transformer += [gdata.vision.transforms.ToTensor()]
        transformer = gdata.vision.transforms.Compose(transformer)
        mnist_train = gdata.vision.FashionMNIST(root=root, train=True)
        mnist_test = gdata.vision.FashionMNIST(root=root, train=False)
        num_workers = 0 if sys.platform.startswith('win32') else 4
        train_iter = gdata.DataLoader(mnist_train.transform_first(transformer),
                                      batch_size,
                                      shuffle=True,
                                      num_workers=num_workers)
        test_iter = gdata.DataLoader(mnist_test.transform_first(transformer),
                                     batch_size,
                                     shuffle=False,
                                     num_workers=num_workers)
        return train_iter, test_iter

    net = nn.Sequential()
    # Use a larger 11 x 11 window to capture objects, and a stride of 4 to greatly
    # reduce the output height and width. The number of output channels is also
    # much larger than in LeNet
    net.add(
        nn.Conv2D(96, kernel_size=11, strides=4, activation='relu'),
        nn.MaxPool2D(pool_size=3, strides=2),
        # Use a smaller convolution window, padding of 2 to keep the input and output height and width equal, and more output channels
        nn.Conv2D(256, kernel_size=5, padding=2, activation='relu'),
        nn.MaxPool2D(pool_size=3, strides=2),
        # Three consecutive convolutional layers with even smaller windows. Except for the
        # final convolutional layer, the number of output channels keeps growing.
        # No pooling layers follow the first two of them, so the height and width are preserved
        nn.Conv2D(384, kernel_size=3, padding=1, activation='relu'),
        nn.Conv2D(384, kernel_size=3, padding=1, activation='relu'),
        nn.Conv2D(256, kernel_size=3, padding=1, activation='relu'),
        nn.MaxPool2D(pool_size=3, strides=2),
        # The fully connected layers have several times more outputs than in LeNet; dropout layers mitigate overfitting
        nn.Dense(4096, activation="relu"),
        nn.Dropout(0.5),
        nn.Dense(4096, activation="relu"),
        nn.Dropout(0.5),
        # Output layer. Since we use Fashion-MNIST, the number of classes is 10 rather than the 1000 in the paper
        nn.Dense(10))

    X = nd.random.uniform(shape=(1, 1, 224, 224))
    net.initialize()
    for layer in net:
        X = layer(X)
        print(layer.name, 'output shape:\t', X.shape)

    batch_size = 128
    # If an "out of memory" error occurs, reduce batch_size or resize
    train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=224)

    lr, num_epochs, ctx = 0.01, 5, d2l.try_gpu()
    net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
Example #14
import numpy as np

import mxnet as mx
from mxnet import autograd
from mxnet import gluon

# Generate synthetic data.
X = np.random.randn(10000, 2)
Y = 2 * X[:, 0] - 3.4 * X[:, 1] + 4.2 + .01 * np.random.normal(size=10000)

net = gluon.nn.Sequential()
# The output dimension is 1.
net.add(gluon.nn.Dense(1))
net.collect_params().initialize()
loss = gluon.loss.L2Loss()

# Initialize the learning rate as 0.1.
trainer = gluon.Trainer(net.collect_params(),
                        'sgd',
                        optimizer_params={'learning_rate': 0.1})
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24),
                                force_reinit=True)
train_data = mx.io.NDArrayIter(X, Y, batch_size=10, shuffle=True)

for epoch in range(5):
    train_data.reset()
    for i, batch in enumerate(train_data):
        data = batch.data[0]
        label = batch.label[0].reshape((-1, 1))
        with autograd.record():
            output = net(data)
            mse = loss(output, label)
        mse.backward()
        trainer.step(data.shape[0])
Example #15
def train(net, train_iter, valid_iter, num_epochs, lr, wd, ctx, lr_period,
          lr_decay):
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr, 'momentum': 0.9, 'wd': wd})
Example #16
    def fit(
        self,
        audio_path_label_pairs,
        model_dir_path,
        batch_size=64,
        epochs=20,
        test_size=0.2,
        random_state=42,
        input_shape=(1, 96, 1366),
        nb_classes=10,
        learning_rate=0.001,
        checkpoint_interval=10,
    ):

        config_file_path = Cifar10AudioClassifier.get_config_file_path(model_dir_path)

        self.input_shape = input_shape
        self.nb_classes = nb_classes

        self.config = dict()
        self.config["input_shape"] = input_shape
        self.config["nb_classes"] = nb_classes
        np.save(config_file_path, self.config)

        self.model = self.create_model(self.nb_classes)

        X, Y = self.unzip(audio_path_label_pairs)

        Xtrain, Xtest, Ytrain, Ytest = train_test_split(
            X, Y, test_size=test_size, random_state=random_state
        )

        train_gen = self.generate_batch(Xtrain, Ytrain, batch_size, shuffled=True)

        train_num_batches = len(Xtrain) // batch_size

        self.model.collect_params().initialize(
            mx.init.Xavier(magnitude=2.24), ctx=self.model_ctx
        )
        self.model.hybridize()
        trainer = gluon.Trainer(
            self.model.collect_params(),
            optimizer="adam",
            optimizer_params={"learning_rate": learning_rate},
        )

        softmax_loss = gluon.loss.SoftmaxCrossEntropyLoss()

        history = dict()
        loss_train = []
        loss_test = []
        acc_train = []
        acc_test = []

        for e in range(epochs):
            loss_avg = 0.0
            accuracy = mx.metric.Accuracy()
            for batch_index, (data, label) in enumerate(train_gen):
                data = data.as_in_context(self.model_ctx)
                label = label.as_in_context(self.model_ctx)
                with autograd.record():
                    output = self.model(data)
                    prediction = nd.argmax(output, axis=1)
                    accuracy.update(preds=prediction, labels=label)
                    loss = softmax_loss(output, label)
                loss.backward()
                trainer.step(data.shape[0])
                loss_avg = loss_avg * batch_index / (batch_index + 1) + nd.mean(
                    loss
                ).asscalar() / (batch_index + 1)
                print(
                    "Epoch %s / %s, Batch %s / %s. Loss: %s, Accuracy: %s"
                    % (
                        e + 1,
                        epochs,
                        batch_index + 1,
                        train_num_batches,
                        loss_avg,
                        accuracy.get()[1],
                    )
                )
                if batch_index + 1 == train_num_batches:
                    break
            train_acc = accuracy.get()[1]
            acc_train.append(train_acc)
            loss_train.append(loss_avg)

            test_acc, test_avg_loss = self._evaluate_accuracy(
                Xtest, Ytest, batch_size=batch_size
            )
            acc_test.append(test_acc)
            loss_test.append(test_avg_loss)

            print(
                "Epoch %s / %s. Loss: %s. Accuracy: %s. Test Accuracy: %s."
                % (e + 1, epochs, loss_avg, train_acc, test_acc)
            )

            if e % checkpoint_interval == 0:
                self.checkpoint(model_dir_path)

        self.checkpoint(model_dir_path)

        history["loss_train"] = loss_train
        history["loss_test"] = loss_test
        history["acc_train"] = acc_train
        history["acc_test"] = acc_test

        np.save(
            model_dir_path + "/" + Cifar10AudioClassifier.model_name + "-history.npy",
            history,
        )

        return history
Example #17
def graphsage_cv_train(g, ctx, args, n_classes, train_nid, test_nid,
                       n_test_samples, distributed):
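    """Train GraphSAGE with neighbor sampling and control-variate (history)
    aggregation: per-layer history tensors ('h_i', 'agg_h_i') live on the
    parent graph and are copied to/from each sampled NodeFlow, and a separate
    infer_model is synced from the trainer's kvstore for evaluation.
    """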
    n0_feats = g.nodes[0].data['features']
    num_nodes = g.number_of_nodes()
    in_feats = n0_feats.shape[1]
    g_ctx = n0_feats.context

    norm = mx.nd.expand_dims(1. / g.in_degrees().astype('float32'), 1)
    g.set_n_repr({'norm': norm.as_in_context(g_ctx)})
    degs = g.in_degrees().astype('float32').asnumpy()
    degs[degs > args.num_neighbors] = args.num_neighbors
    g.set_n_repr(
        {'subg_norm': mx.nd.expand_dims(mx.nd.array(1. / degs, ctx=g_ctx), 1)})
    n_layers = args.n_layers

    g.update_all(
        fn.copy_src(src='features',
                    out='m'), fn.sum(msg='m', out='preprocess'), lambda node:
        {'preprocess': node.data['preprocess'] * node.data['norm']})
    for i in range(n_layers):
        g.init_ndata('h_{}'.format(i), (num_nodes, args.n_hidden), 'float32')
        g.init_ndata('agg_h_{}'.format(i), (num_nodes, args.n_hidden),
                     'float32')

    model = GraphSAGETrain(in_feats,
                           args.n_hidden,
                           n_classes,
                           n_layers,
                           args.dropout,
                           prefix='GraphSAGE')

    model.initialize(ctx=ctx)

    loss_fcn = gluon.loss.SoftmaxCELoss()

    infer_model = GraphSAGEInfer(in_feats,
                                 args.n_hidden,
                                 n_classes,
                                 n_layers,
                                 prefix='GraphSAGE')

    infer_model.initialize(ctx=ctx)

    # use optimizer
    print(model.collect_params())
    kv_type = 'dist_sync' if distributed else 'local'
    trainer = gluon.Trainer(model.collect_params(),
                            'adam', {
                                'learning_rate': args.lr,
                                'wd': args.weight_decay
                            },
                            kvstore=mx.kv.create(kv_type))

    # training loop
    dur = []

    adj = g.adjacency_matrix(transpose=False).as_in_context(g_ctx)
    for epoch in range(args.n_epochs):
        start = time.time()
        if distributed:
            msg_head = "Worker {:d}, epoch {:d}".format(g.worker_id, epoch)
        else:
            msg_head = "epoch {:d}".format(epoch)
        for nf in dgl.contrib.sampling.NeighborSampler(g,
                                                       args.batch_size,
                                                       args.num_neighbors,
                                                       neighbor_type='in',
                                                       shuffle=True,
                                                       num_workers=32,
                                                       num_hops=n_layers,
                                                       add_self_loop=True,
                                                       seed_nodes=train_nid):
            for i in range(n_layers):
                agg_history_str = 'agg_h_{}'.format(i)
                dests = nf.layer_parent_nid(i + 1).as_in_context(g_ctx)
                # TODO we could use DGLGraph.pull to implement this, but the current
                # implementation of pull is very slow. Let's manually do it for now.
                agg = mx.nd.dot(mx.nd.take(adj, dests),
                                g.nodes[:].data['h_{}'.format(i)])
                g.set_n_repr({agg_history_str: agg}, dests)

            node_embed_names = [['preprocess', 'features', 'h_0']]
            for i in range(1, n_layers):
                node_embed_names.append([
                    'h_{}'.format(i), 'agg_h_{}'.format(i - 1), 'subg_norm',
                    'norm'
                ])
            node_embed_names.append(
                ['agg_h_{}'.format(n_layers - 1), 'subg_norm', 'norm'])

            nf.copy_from_parent(node_embed_names=node_embed_names, ctx=ctx)
            # forward
            with mx.autograd.record():
                pred = model(nf)
                batch_nids = nf.layer_parent_nid(-1)
                batch_labels = g.nodes[batch_nids].data[
                    'labels'].as_in_context(ctx)
                loss = loss_fcn(pred, batch_labels)
                if distributed:
                    loss = loss.sum() / (len(batch_nids) * g.num_workers)
                else:
                    loss = loss.sum() / (len(batch_nids))

            loss.backward()
            trainer.step(batch_size=1)

            node_embed_names = [['h_{}'.format(i)] for i in range(n_layers)]
            node_embed_names.append([])

            nf.copy_to_parent(node_embed_names=node_embed_names)
        mx.nd.waitall()
        print(msg_head + ': training takes ' + str(time.time() - start))

        infer_params = infer_model.collect_params()

        for key in infer_params:
            idx = trainer._param2idx[key]
            trainer._kvstore.pull(idx, out=infer_params[key].data())

        num_acc = 0.
        num_tests = 0

        if not distributed or g.worker_id == 0:
            for nf in dgl.contrib.sampling.NeighborSampler(
                    g,
                    args.test_batch_size,
                    g.number_of_nodes(),
                    neighbor_type='in',
                    num_hops=n_layers,
                    seed_nodes=test_nid,
                    add_self_loop=True):
                node_embed_names = [['preprocess', 'features']]
                for i in range(n_layers):
                    node_embed_names.append(['norm', 'subg_norm'])
                nf.copy_from_parent(node_embed_names=node_embed_names, ctx=ctx)

                pred = infer_model(nf)
                batch_nids = nf.layer_parent_nid(-1)
                batch_labels = g.nodes[batch_nids].data[
                    'labels'].as_in_context(ctx)
                num_acc += (pred.argmax(
                    axis=1) == batch_labels).sum().asscalar()
                num_tests += nf.layer_size(-1)
                if distributed:
                    g._sync_barrier()
                print(msg_head +
                      ": Test Accuracy {:.4f}".format(num_acc / num_tests))
                break
        elif distributed:
            g._sync_barrier()
Example #18
    edge_size = 256
    sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79],
             [0.88, 0.961]]
    ratios = [[1, 2, 0.5]] * 5
    num_anchors = len(sizes[0]) + len(ratios[0]) - 1
    ctx = mx.gpu(0)

    train_iter, val_iter = load_data_pikachu(batch_size, edge_size)
    train_iter.reshape(label_shape=(3, 5))

    net = TinySSD(num_classes=1)
    net.initialize(init=init.Xavier(), ctx=ctx)
    class_loss = gloss.SoftmaxCrossEntropyLoss()
    bbox_loss = gloss.L1Loss()
    trainer = gluon.Trainer(net.collect_params(), "sgd", {
        "learning_rate": lr,
        "wd": wd
    })

    for epoch in range(20):
        acc, mae = 0, 0
        train_iter.reset()
        start = time.time()
        for i, batch in enumerate(train_iter):
            X = batch.data[0].as_in_context(ctx)
            Y = batch.label[0].as_in_context(ctx)
            with autograd.record():
                anchors, class_preds, bbox_preds = net(X)
                bbox_labels, bbox_masks, class_labels = contrib.nd.MultiBoxTarget(
                    anchors, Y, class_preds.transpose((0, 2, 1)))
                l = calc_loss(class_preds, class_labels, bbox_preds,
                              bbox_labels, bbox_masks)
Example #19
def check_unroll(cell_type, num_states, layout):
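    """Check that cell.unroll and a TestRNNLayer wrapper (under several
    hybridization configs) produce matching outputs, states and parameter
    updates for the given cell type, state count and data layout.
    """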
    batch_size = 20
    input_size = 50
    hidden_size = 30
    seq_len = 10
    if layout == 'TNC':
        rnn_data = mx.nd.normal(loc=0,
                                scale=1,
                                shape=(seq_len, batch_size, input_size))
    elif layout == 'NTC':
        rnn_data = mx.nd.normal(loc=0,
                                scale=1,
                                shape=(batch_size, seq_len, input_size))
    else:
        print("Wrong layout")
        return
    valid_length = mx.nd.round(
        mx.nd.random.uniform(low=1, high=10, shape=(batch_size, )))
    state_shape = (batch_size, hidden_size)
    states = [
        mx.nd.normal(loc=0, scale=1, shape=state_shape)
        for i in range(num_states)
    ]

    cell = cell_type(hidden_size, prefix='rnn_')
    cell.initialize(ctx=default_context())
    if layout == 'TNC':
        cell(rnn_data[0], states)
    else:
        cell(rnn_data[:, 0, :], states)
    params1 = cell.collect_params()
    orig_params1 = copy.deepcopy(params1)

    trainer = gluon.Trainer(params1, 'sgd', {'learning_rate': 0.03})
    with mx.autograd.record():
        res1, states1 = cell.unroll(seq_len,
                                    rnn_data,
                                    states,
                                    valid_length=valid_length,
                                    layout=layout,
                                    merge_outputs=True)
    res1.backward()
    trainer.step(batch_size)

    configs = [
        lambda layer: None, lambda layer: layer.hybridize(),
        lambda layer: layer.hybridize({'inline_limit': 0}),
        lambda layer: layer.hybridize({'static_alloc': True}),
        lambda layer: layer.hybridize({
            'static_alloc': True,
            'static_shape': True
        })
    ]
    # We can't pass None to a hybrid block, but it accepts an empty list,
    # so we use an empty list to represent valid_length if it's None.
    if valid_length is None:
        valid_length = []
    for config in configs:
        layer = TestRNNLayer(cell_type, hidden_size, layout)
        layer.initialize(ctx=default_context())
        config(layer)
        res2, states2 = layer(rnn_data, states, valid_length)
        params2 = layer.collect_params()
        for key, val in orig_params1.items():
            params2[key].set_data(copy.deepcopy(val.data()))

        trainer = gluon.Trainer(params2, 'sgd', {'learning_rate': 0.03})
        with mx.autograd.record():
            res2, states2 = layer(rnn_data, states, valid_length)
        assert_almost_equal(res1, res2, rtol=0.001, atol=0.0001)
        assert len(states1) == len(states2)
        for i in range(len(states1)):
            assert_almost_equal(states1[i],
                                states2[i],
                                rtol=0.001,
                                atol=0.0001)
        res2.backward()
        trainer.step(batch_size)

        for key, val in params1.items():
            weight1 = val.data()
            weight2 = params2[key].data()
            assert_almost_equal(weight1, weight2, rtol=0.001, atol=0.0001)
Example #20
train_data = get_dataloader(net, dataset, 512, 16, 0)

#############################################################################################
# Try to use the GPU for training
try:
    a = mx.nd.zeros((1, ), ctx=mx.gpu(0))
    ctx = [mx.gpu(0)]
except:
    ctx = [mx.cpu()]

#############################################################################################
# Start training (fine-tuning)
net.collect_params().reset_ctx(ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {
    'learning_rate': 0.001,
    'wd': 0.0005,
    'momentum': 0.9
})

mbox_loss = gcv.loss.SSDMultiBoxLoss()
ce_metric = mx.metric.Loss('CrossEntropy')
smoothl1_metric = mx.metric.Loss('SmoothL1')

for epoch in range(0, 2):
    ce_metric.reset()
    smoothl1_metric.reset()
    tic = time.time()
    btic = time.time()
    net.hybridize(static_alloc=True, static_shape=True)
    for i, batch in enumerate(train_data):
        batch_size = batch[0].shape[0]
Example #21
    num_convs_in_dense_blocks = [4, 4, 4, 4]
    for i, num_convs in enumerate(num_convs_in_dense_blocks):
        densenet.add(denseblock(num_convs, growth_rate))
        num_channels += num_convs * growth_rate
        if i != len(num_convs_in_dense_blocks) - 1:
            num_channels //= 2
            densenet.add(transition_block(num_channels))

    densenet.add(nn.BatchNorm(), nn.Activation('relu'), nn.GlobalAvgPool2D(),
                 nn.Dense(10))
    '''
    X = nd.random.uniform(shape=(100, 1, 28, 28))
    densenet.initialize()
    for blk in densenet:
        X = blk(X)
        print(blk.name, 'output shape:\t', X.shape)
    exit()
    '''

    lr = 0.05
    num_epochs = int(sys.argv[1])
    densenet.initialize(force_reinit=True, init=init.Xavier(), ctx=ctx)
    trainer = gluon.Trainer(densenet.collect_params(), 'sgd', {'learning_rate': lr})
    test_acc_list = do_train(net=densenet, 
                        train_iter=train_data_batched, test_iter=test_data_batched, 
                        batch_size=batch_size, trainer=trainer, 
                        num_epochs=num_epochs, ctx=ctx)
    pkl_file = os.path.basename(__file__).split('.')[0] + '.pkl'
    with open(pkl_file, 'wb') as pkl_f:
        pickle.dump(test_acc_list, pkl_f)
    
Example #22
    def fit(self,
            data_train,
            data_eva,
            meta,
            model_dir_path,
            epochs=10,
            learning_rate=0.01):

        config = dict()
        config['input_mode_answer'] = self.input_mode_answer
        config['input_mode_question'] = self.input_mode_question
        config['nb_classes'] = self.nb_classes
        config['meta'] = meta
        self.meta = meta
        np.save(self.get_config_file_path(model_dir_path), config)

        loss = gluon.loss.SoftmaxCrossEntropyLoss()

        self.model = Net1(self.nb_classes)
        self.model.collect_params().initialize(init=mx.init.Xavier(),
                                               ctx=self.model_ctx)
        trainer = gluon.Trainer(self.model.collect_params(), 'sgd',
                                {'learning_rate': learning_rate})

        history = dict()
        history['train_acc'] = list()
        history['val_acc'] = list()

        moving_loss = 0.
        best_eva = 0
        for e in range(epochs):
            data_train.reset()
            for i, batch in enumerate(data_train):
                batch_size = batch.data[0].shape[0]

                data1 = batch.data[0].as_in_context(self.model_ctx)
                data2 = batch.data[1].as_in_context(self.model_ctx)
                data = [data1, data2]
                label = batch.label[0].as_in_context(self.model_ctx)
                with autograd.record():
                    output = self.model(data)
                    cross_entropy = loss(output, label)
                    cross_entropy.backward()
                trainer.step(batch_size)

                if i == 0:
                    moving_loss = np.mean(cross_entropy.asnumpy()[0])
                else:
                    moving_loss = .99 * moving_loss + .01 * np.mean(
                        cross_entropy.asnumpy()[0])
                if i % 200 == 0:
                    logging.debug("Epoch %s, batch %s. Moving avg of loss: %s",
                                  e, i, moving_loss)
            eva_accuracy = self.evaluate_accuracy(data_iterator=data_eva)
            train_accuracy = self.evaluate_accuracy(data_iterator=data_train)
            history['train_acc'].append(train_accuracy)
            history['val_acc'].append(eva_accuracy)
            print("Epoch %s. Loss: %s, Train_acc %s, Eval_acc %s" %
                  (e, moving_loss, train_accuracy, eva_accuracy))
            if eva_accuracy > best_eva:
                best_eva = eva_accuracy
                logging.info('Best validation acc found. Checkpointing...')
                self.checkpoint(model_dir_path)
            if e % 5 == 0:
                self.save_history(history, model_dir_path)

        self.save_history(history, model_dir_path)
        return history
Example #23
    def train(self):

        self.net.collect_params().reset_ctx(self.ctx)

        trainer = gluon.Trainer(
            params=self.net.collect_params(),
            optimizer='sgd',
            optimizer_params={
                'learning_rate': self.lr,
                'wd': self.wd,
                'momentum': self.momentum
            },
            update_on_kvstore=(False if self.use_amp else None))

        if self.use_amp:
            amp.init_trainer(trainer)

        lr_decay = self.lr_decay
        lr_steps = sorted(
            [float(ls) for ls in self.lr_decay_epoch.split(',') if ls.strip()])

        cls_criterion = FocalLoss(num_class=80)
        box_criterion = HuberLoss(rho=0.11)
        cls_metric = mx.metric.Loss('FocalLoss')
        box_metric = mx.metric.Loss('SmoothL1')

        logging.info('Start training from scratch...')

        for epoch in range(self.epoch):
            while lr_steps and epoch > lr_steps[0]:
                new_lr = trainer.learning_rate * lr_decay
                lr_steps.pop(0)
                trainer.set_learning_rate(new_lr)
                logging.info("Epoch {} Set learning rate to {}".format(
                    epoch, new_lr))
            cls_metric.reset()
            box_metric.reset()
            tic = time.time()
            btic = time.time()
            # reset the context in case saving the parameters changed it
            self.net.collect_params().reset_ctx(self.ctx)
            self.net.hybridize(static_alloc=True, static_shape=True)
            for i, batch in enumerate(self.train_data):
                data, box_targets, cls_targets = batch

                with autograd.record():
                    cls_preds = []
                    box_preds = []
                    for x in data:
                        cls_pred, box_pred, _ = self.net(x)
                        cls_preds.append(cls_pred)
                        box_preds.append(box_pred)
                    cls_loss = [
                        cls_criterion(cls_pred, cls_target)
                        for cls_pred, cls_target in zip(
                            cls_preds, cls_targets)
                    ]
                    box_loss = [
                        box_criterion(box_pred, box_target)
                        for box_pred, box_target in zip(
                            box_preds, box_targets)
                    ]
                    sum_loss = [(cl + bl)
                                for cl, bl in zip(cls_loss, box_loss)]

                    if self.use_amp:
                        with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(sum_loss)
                # since we have already normalized the loss, we don't want to normalize
                # by batch-size anymore
                trainer.step(1)
                cls_metric.update(0, [l * self.batch_size for l in cls_loss])
                box_metric.update(0, [l * self.batch_size for l in box_loss])
                if i > 0 and i % 50 == 0:
                    name1, loss1 = cls_metric.get()
                    name2, loss2 = box_metric.get()
                    logging.info('Epoch {} Batch {} Speed: {:.3f} samples/s, {}={:.5f}, {}={:.5f}'.\
                           format(epoch, i, self.batch_size/(time.time()-btic), name1, loss1, name2, loss2))

                btic = time.time()

            logging.info('[Epoch {}] Starting Validation.'.format(epoch))
            map_name, mean_ap = self.validation()
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logging.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            self.save_params(epoch)
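
The method above assumes AMP was initialized before the network and trainer were built (amp.init_trainer is only meaningful after amp.init). A minimal sketch of that assumed setup; the COCO SSD model name and single-GPU context are placeholders:

import mxnet as mx
from mxnet.contrib import amp
from gluoncv import model_zoo

# amp.init() patches MXNet for float16 training and must run before the
# network is constructed; the trainer is later wrapped with amp.init_trainer()
use_amp = True
if use_amp:
    amp.init()

ctx = [mx.gpu(0)]
net = model_zoo.get_model('ssd_512_resnet50_v1_coco', pretrained=True)
net.collect_params().reset_ctx(ctx)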
    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        if opt.resume_params == '':
            net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if opt.no_wd:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                optimizer_params)
        if opt.resume_states != '':
            trainer.load_states(opt.resume_states)

        if opt.label_smoothing or opt.mixup:
            sparse_label_loss = False
        else:
            sparse_label_loss = True
        if distillation:
            L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(
                temperature=opt.temperature,
                hard_weight=opt.hard_weight,
                sparse_label=sparse_label_loss)
        else:
            L = gluon.loss.SoftmaxCrossEntropyLoss(
                sparse_label=sparse_label_loss)

        best_val_score = 1

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            if opt.use_rec:
                train_data.reset()
            train_metric.reset()
            btic = time.time()

            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)

                if opt.mixup:
                    lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                    if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                        lam = 1
                    data = [lam * X + (1 - lam) * X[::-1] for X in data]

                    if opt.label_smoothing:
                        eta = 0.1
                    else:
                        eta = 0.0
                    label = mixup_transform(label, classes, lam, eta)

                elif opt.label_smoothing:
                    hard_label = label
                    label = smooth(label, classes)

                if distillation:
                    teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \
                                    for X in data]

                with ag.record():
                    outputs = [
                        net(X.astype(opt.dtype, copy=False)) for X in data
                    ]
                    if distillation:
                        loss = [
                            L(yhat.astype('float32', copy=False),
                              y.astype('float32', copy=False),
                              p.astype('float32', copy=False))
                            for yhat, y, p in zip(outputs, label, teacher_prob)
                        ]
                    else:
                        loss = [
                            L(yhat, y.astype(opt.dtype, copy=False))
                            for yhat, y in zip(outputs, label)
                        ]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)

                if opt.mixup:
                    output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                                    for out in outputs]
                    train_metric.update(label, output_softmax)
                else:
                    if opt.label_smoothing:
                        train_metric.update(hard_label, outputs)
                    else:
                        train_metric.update(label, outputs)

                if opt.log_interval and not (i + 1) % opt.log_interval:
                    train_metric_name, train_metric_score = train_metric.get()
                    logger.info(
                        'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f'
                        % (epoch, i, batch_size * opt.log_interval /
                           (time.time() - btic), train_metric_name,
                           train_metric_score, trainer.learning_rate))
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i / (time.time() - tic))

            err_top1_val, err_top5_val = test(ctx, val_data)

            logger.info('[Epoch %d] training: %s=%f' %
                        (epoch, train_metric_name, train_metric_score))
            logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' %
                        (epoch, throughput, time.time() - tic))
            logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f' %
                        (epoch, err_top1_val, err_top5_val))

            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                net.save_parameters(
                    '%s/%.4f-imagenet-%s-%d-best.params' %
                    (save_dir, best_val_score, model_name, epoch))
                trainer.save_states(
                    '%s/%.4f-imagenet-%s-%d-best.states' %
                    (save_dir, best_val_score, model_name, epoch))

            if save_frequency and save_dir and (epoch +
                                                1) % save_frequency == 0:
                net.save_parameters('%s/imagenet-%s-%d.params' %
                                    (save_dir, model_name, epoch))
                trainer.save_states('%s/imagenet-%s-%d.states' %
                                    (save_dir, model_name, epoch))

        if save_frequency and save_dir:
            net.save_parameters('%s/imagenet-%s-%d.params' %
                                (save_dir, model_name, opt.num_epochs - 1))
            trainer.save_states('%s/imagenet-%s-%d.states' %
                                (save_dir, model_name, opt.num_epochs - 1))
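
The mixup and label-smoothing branches above call mixup_transform and smooth, which live elsewhere in the script. A hedged sketch of what such helpers usually look like: soft one-hot targets, with the mixed batch reversed the same way as the inputs (lam * X + (1 - lam) * X[::-1]):

from mxnet import nd

def mixup_transform(label, classes, lam=1, eta=0.0):
    # hypothetical helper: blend the smoothed one-hot targets of a batch
    # with those of the reversed batch, matching the input mixing above
    if isinstance(label, nd.NDArray):
        label = [label]
    res = []
    for l in label:
        y1 = l.one_hot(classes, on_value=1 - eta + eta / classes,
                       off_value=eta / classes)
        y2 = l[::-1].one_hot(classes, on_value=1 - eta + eta / classes,
                             off_value=eta / classes)
        res.append(lam * y1 + (1 - lam) * y2)
    return res

def smooth(label, classes, eta=0.1):
    # hypothetical label-smoothing helper: soft one-hot targets with eta mass
    # spread uniformly over the off-classes
    if isinstance(label, nd.NDArray):
        label = [label]
    return [l.one_hot(classes, on_value=1 - eta + eta / classes,
                      off_value=eta / classes) for l in label]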
H = {
    "epoch": [],
    "train_loss": [],
    "train_acc": [],
    "val_loss": [],
    "val_acc": [],
    "chrono": []
}

scheduler = mx.lr_scheduler.FactorScheduler(base_lr=1e-3,
                                            factor=0.7,
                                            step=10 * len(train_data))
trainer = gluon.Trainer(net.collect_params(), "sgd", {
    "lr_scheduler": scheduler,
    "momentum": sgd_momentum,
    "wd": sgd_wd
})

train(0, transfer_epochs, H)

# %%
# -- Finetune last N blocks of the network

pretrained_features = pretrained_net.features
# Allow update of weights for the last N blocks
for param in pretrained_features[24:].collect_params().values():
    param.grad_req = 'write'

# DEBUG
for index, param in enumerate(net.collect_params().values()):
    # hypothetical debug body: report each parameter's update setting
    print(index, param.name, param.grad_req)
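
The unfreezing step above only makes sense if the pretrained feature weights were frozen earlier in the transfer phase. A hedged sketch of that assumed setup; the concrete backbone here is a placeholder:

from mxnet.gluon.model_zoo import vision

# assumed earlier setup: load a pretrained backbone and freeze all feature
# weights so only the new classifier head trains during the transfer phase;
# the loop above then re-enables gradients for the last blocks
pretrained_net = vision.mobilenet_v2_1_0(pretrained=True)
for param in pretrained_net.features.collect_params().values():
    param.grad_req = 'null'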
Exemple #26
0
def main(net, batch_size, epochs, opt, ctx):
    train_data, val_data = get_data_iters(batch_size)
    if opt.hybridize:
        net.hybridize()

    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum})
    #trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': opt.lr, 'wd': opt.wd})
    criterion1 = []
    for _ in range(8):
        criterion1.append(gluon.loss.SoftmaxCrossEntropyLoss())
    criterion2 = []
    if opt.triplet:
        for _ in range(3):
            criterion2.append(TripletLoss())

    lr = opt.lr
    minlr = lr*0.01
    dlr = (lr-minlr)/(epochs[0]-1)

    prev_time = datetime.datetime.now()
    for epoch in range(epochs[-1]):
        _loss = 0.
        if epoch<epochs[0]:
            lr = minlr + dlr*epoch
        else:
            if epoch in epochs[1:]:
                lr = lr * opt.lr_decay
        trainer.set_learning_rate(lr)

        for data, label in train_data:
            data_list = gluon.utils.split_and_load(data, ctx)
            label_list = gluon.utils.split_and_load(label, ctx)
            with autograd.record():
                losses = []
                for i in range(opt.num_gpus):
                    outputs, features = net(data_list[i])
                    temp_loss = []
                    num = len(outputs)
                    for j in range(len(outputs)):
                        temp_loss.append(criterion1[j](outputs[j], label_list[i]))
                    if opt.triplet:
                        num += len(features)
                        for j in range(len(features)):
                            temp_loss.append(criterion2[j](features[j], label_list[i]))
                    loss = sum(temp_loss) / num
                    losses.append(loss)

            for l in losses:
                l.backward()
            trainer.step(batch_size)
            _loss_list = [l.mean().asscalar() for l in losses]
            _loss += sum(_loss_list) / len(_loss_list)

        cur_time = datetime.datetime.now()
        h, remainder = divmod((cur_time - prev_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)
        __loss = _loss/len(train_data)

        if val_data is not None:
            val_loss, val_accuracy = validate(val_data, net, criterion1, criterion2, ctx)
            epoch_str = ("Epoch %d. Train loss: %f, Val loss %f, Val accuracy %f, " % (epoch, __loss, val_loss, val_accuracy))
        else:
            epoch_str = ("Epoch %d. Train loss: %f, " % (epoch, __loss))

        prev_time = cur_time
        print(epoch_str + time_str + ', lr ' + str(trainer.learning_rate))

    if not os.path.exists("params"):
        os.mkdir("params")
    net.save_parameters("params/resnet50.params")
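
validate above is defined outside this snippet. A hedged sketch of a compatible helper, evaluating on the first device only and ignoring the triplet criteria for scoring (criterion2 is accepted purely for signature compatibility):

import mxnet as mx

def validate(val_data, net, criterion1, criterion2, ctx):
    # hypothetical validation helper: mean softmax loss over the part
    # branches plus top-1 accuracy of the summed softmax scores
    metric = mx.metric.Accuracy()
    val_loss = 0.0
    for data, label in val_data:
        data = data.as_in_context(ctx[0])
        label = label.as_in_context(ctx[0])
        outputs, _ = net(data)
        losses = [criterion1[j](outputs[j], label) for j in range(len(outputs))]
        val_loss += (sum(losses) / len(losses)).mean().asscalar()
        score = mx.nd.add_n(*[mx.nd.softmax(o) for o in outputs])
        metric.update([label], [score])
    return val_loss / len(val_data), metric.get()[1]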
Exemple #27
0
def train():
    epochs = 100

    lr = 0.1
    lamda = 0.1

    lr_steps = [40, 70, np.inf]
    wd = 5e-4
    momentum = 0.9
    batch_size = 256

    plot_period = 5

    ctx = [mx.gpu(i) for i in range(2)]

    train_set = MNIST(train=True, transform=transform_train)
    train_data = gluon.data.DataLoader(train_set, batch_size, True, num_workers=4, last_batch='discard')
    val_set = MNIST(train=False, transform=transform_val)
    val_data = gluon.data.DataLoader(val_set, batch_size, shuffle=False, num_workers=4)

    net = MnistNet(embedding_size=2, weight_norm=True)
    net.initialize(init=mx.init.MSRAPrelu(), ctx=ctx)
    # net.load_parameters("./pretrained_mnist.params", ctx=ctx)
    net.hybridize()

    loss = RingLoss(lamda)
    loss.initialize(ctx=ctx)
    loss.hybridize()

    train_params = net.collect_params()
    train_params.update(loss.params)
    trainer = gluon.Trainer(train_params, 'sgd', {'learning_rate': lr, 'momentum': momentum, 'wd': wd})

    lr_counter = 0

    metric = mtc.Accuracy()
    num_batch = len(train_data)

    for epoch in range(epochs):
        if epoch == lr_steps[lr_counter]:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
            lr_counter += 1
        if (epoch % plot_period) == 0:
            plot = True
        else:
            plot = False
        train_loss = 0
        metric.reset()
        tic = time.time()
        ebs = []
        lbs = []
        print("Radius", loss.R.data(ctx=mx.gpu(0)).asscalar())

        for batch in train_data:
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
            labels = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)

            with ag.record():
                ots = [net(X) for X in data]
                embedds = [ot[0] for ot in ots]
                outputs = [ot[1] for ot in ots]
                losses = [loss(yhat, y, emb) for yhat, y, emb in zip(outputs, labels, embedds)]

            for l in losses:
                ag.backward(l)
            if plot:
                for es, ls in zip(embedds, labels):
                    assert len(es) == len(ls)
                    for idx in range(len(es)):
                        ebs.append(es[idx].asnumpy())
                        lbs.append(ls[idx].asscalar())

            trainer.step(batch_size)
            metric.update(labels, outputs)

            train_loss += sum([l.mean().asscalar() for l in losses]) / len(losses)

        _, train_acc = metric.get()
        train_loss /= num_batch

        val_acc, val_loss, val_ebs, val_lbs = validate(net, val_data, ctx, loss, plot)

        if plot:
            ebs = np.vstack(ebs)
            lbs = np.hstack(lbs)

            plot_result(ebs, lbs, os.path.join("./resources", "ringloss-train-epoch{}.png".format(epoch)))
            plot_result(val_ebs, val_lbs, os.path.join("./resources", "ringloss-val-epoch{}.png".format(epoch)))

        toc = time.time()
        print('[epoch % 3d] train accuracy: %.6f, train loss: %.6f | '
              'val accuracy: %.6f, val loss: %.6f, time: %.6f'
              % (epoch, train_acc, train_loss, val_acc, val_loss, toc - tic))
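
RingLoss above is imported from another module. A hedged sketch of a compatible block, following the standard ring-loss formulation: softmax cross-entropy plus lamda/2 times the squared gap between the embedding norm and a learnable radius R, so that loss.params and loss.R.data() behave as used above:

import mxnet as mx
from mxnet.gluon.loss import Loss, SoftmaxCrossEntropyLoss

class RingLoss(Loss):
    # hypothetical implementation sketch; not the module imported above
    def __init__(self, lamda, weight=1.0, batch_axis=0, **kwargs):
        super(RingLoss, self).__init__(weight, batch_axis, **kwargs)
        self._lamda = lamda
        self._ce = SoftmaxCrossEntropyLoss()
        self.R = self.params.get('R', shape=(1,), init=mx.init.Constant(1.0))

    def hybrid_forward(self, F, pred, label, embedding, R):
        ce = self._ce(pred, label)
        ring = F.square(F.broadcast_sub(F.norm(embedding, axis=1), R))
        return ce + 0.5 * self._lamda * ring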
Exemple #28
0
def train_model(model, train_data_loader, val_data_loader, embedding, ctx,
                args):
    """
    Train model and validate/save every epoch.
    """
    logger.info(vars(args))

    # Initialization
    model.hybridize()
    model.collect_params().initialize(mx.init.Normal(0.01), ctx=ctx)
    model.word_emb.weight.set_data(embedding.idx_to_vec)
    # Fix word embedding
    if args.fix_embedding:
        model.word_emb.weight.grad_req = 'null'

    loss_func = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(model.collect_params(), args.optimizer, {
        'learning_rate': args.lr,
        'wd': args.weight_decay,
        'clip_gradient': 5
    })

    checkpoints_dir = os.path.join(args.output_dir, 'checkpoints')
    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)

    best_val_acc = 0.
    for epoch_id in range(args.epochs):
        avg_loss = 0.
        avg_acc = 0.
        for batch_id, example in enumerate(train_data_loader):
            s1, s2, label = example
            s1 = s1.as_in_context(ctx)
            s2 = s2.as_in_context(ctx)
            label = label.as_in_context(ctx)

            with autograd.record():
                output = model(s1, s2)
                loss = loss_func(output, label).mean()
            loss.backward()
            trainer.step(1)
            avg_loss += loss.sum().asscalar()

            pred = output.argmax(axis=1)
            acc = (pred == label.astype(np.float32)).mean()
            avg_acc += acc.asscalar()

            if (batch_id + 1) % args.print_interval == 0:
                avg_loss /= args.print_interval
                avg_acc /= args.print_interval
                logger.info(
                    '[Epoch {} Batch {}/{}] loss={:.4f}, acc={:.4f}'.format(
                        epoch_id, batch_id + 1, len(train_data_loader),
                        avg_loss, avg_acc))
                avg_loss = 0.
                avg_acc = 0.

        # Validation
        val_loss, val_acc = test_model(model, val_data_loader, loss_func, ctx)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            checkpoint_path = os.path.join(args.output_dir, 'checkpoints',
                                           'valid_best.params')
            model.save_parameters(checkpoint_path)
        logger.info(
            '[Epoch {}] valid loss={:.4f}, valid acc={:.4f}, best valid acc={:.4f}'
            .format(epoch_id, val_loss, val_acc, best_val_acc))

        # Save checkpoint of last epoch
        checkpoint_path = os.path.join(args.output_dir, 'checkpoints',
                                       'last.params')
        model.save_parameters(checkpoint_path)
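
test_model above is not shown. A minimal sketch of a matching helper, mirroring the metrics computed in the training loop:

import numpy as np

def test_model(model, data_loader, loss_func, ctx):
    # hypothetical evaluation helper: average loss and accuracy of the
    # sentence-pair model over a validation DataLoader
    total_loss, total_acc = 0.0, 0.0
    for s1, s2, label in data_loader:
        s1 = s1.as_in_context(ctx)
        s2 = s2.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = model(s1, s2)
        total_loss += loss_func(output, label).mean().asscalar()
        pred = output.argmax(axis=1)
        total_acc += (pred == label.astype(np.float32)).mean().asscalar()
    n = len(data_loader)
    return total_loss / n, total_acc / n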
Exemple #29
0
            # chain all blocks together
            self.net = nn.HybridSequential()
            self.net.add(b1, b2, b3, b4, b5, b6)

    def forward(self, x):
        out = x
        for i, b in enumerate(self.net):
            out = b(out)
            if self.verbose:
                print('Block %d output: %s' % (i + 1, out.shape))
        return out


################################################################
# train

train_data, test_data = utils.load_data_fashion_mnist(batch_size=64, resize=96)

ctx = utils.try_gpu()
net = ResNet(10)
net.initialize(ctx=ctx, init=init.Xavier())

############### Graph ###############
import gluoncv
gluoncv.utils.viz.plot_network(net, save_prefix=False)
#####################################

loss = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.05})
utils.train(train_data, test_data, net, loss, trainer, ctx, num_epochs=1)
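
utils here is the book-style helper module; a hedged sketch of what its try_gpu() helper typically does:

import mxnet as mx

def try_gpu():
    # hypothetical stand-in for utils.try_gpu(): return the first GPU context
    # if it is usable, otherwise fall back to the CPU
    try:
        ctx = mx.gpu()
        _ = mx.nd.array([0], ctx=ctx)
    except mx.base.MXNetError:
        ctx = mx.cpu()
    return ctx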
Exemple #30
0
    def __init__(self, options, logger):
        # configuration setting
        self.opt = options
        self.logger = logger
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_zoo)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo
                                 and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        ######################### dataloader #########################
        datasets_dict = {
            "kitti": KITTIRAWDataset,
            "kitti_odom": KITTIOdomDataset
        }
        self.dataset = datasets_dict[self.opt.dataset]

        fpath = os.path.join(os.path.expanduser("~"), ".mxnet/datasets/kitti",
                             "splits", self.opt.split, "{}_files.txt")
        train_filenames = readlines(fpath.format("train"))
        val_filenames = readlines(fpath.format("val"))
        img_ext = '.png' if self.opt.png else '.jpg'

        train_dataset = self.dataset(self.opt.data_path,
                                     train_filenames,
                                     self.opt.height,
                                     self.opt.width,
                                     self.opt.frame_ids,
                                     num_scales=4,
                                     is_train=True,
                                     img_ext=img_ext)
        self.train_loader = gluon.data.DataLoader(
            train_dataset,
            batch_size=self.opt.batch_size,
            shuffle=True,
            batchify_fn=dict_batchify_fn,
            num_workers=self.opt.num_workers,
            pin_memory=True,
            last_batch='discard')

        val_dataset = self.dataset(self.opt.data_path,
                                   val_filenames,
                                   self.opt.height,
                                   self.opt.width,
                                   self.opt.frame_ids,
                                   num_scales=4,
                                   is_train=False,
                                   img_ext=img_ext)
        self.val_loader = gluon.data.DataLoader(
            val_dataset,
            batch_size=self.opt.batch_size,
            shuffle=False,
            batchify_fn=dict_batchify_fn,
            num_workers=self.opt.num_workers,
            pin_memory=True,
            last_batch='discard')

        ################### model initialization ###################
        # create depth network
        if self.opt.model_zoo is not None:
            self.model = get_model(self.opt.model_zoo,
                                   pretrained_base=self.opt.pretrained_base,
                                   scales=self.opt.scales,
                                   ctx=self.opt.ctx)
        else:
            raise ValueError("Must choose a model from model_zoo, "
                             "please provide the depth model_zoo using --model_zoo")
        self.logger.info(self.model)

        # resume checkpoint if needed
        if self.opt.resume_depth is not None:
            if os.path.isfile(self.opt.resume_depth):
                logger.info('Resume depth model: %s' % self.opt.resume_depth)
                self.model.load_parameters(self.opt.resume_depth,
                                           ctx=self.opt.ctx)
            else:
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    self.opt.resume_depth))

        if self.use_pose_net:
            # create pose network
            if self.opt.model_zoo_pose is not None:
                self.posenet = get_model(
                    self.opt.model_zoo_pose,
                    pretrained_base=self.opt.pretrained_base,
                    num_input_images=2,
                    num_input_features=1,
                    num_frames_to_predict_for=2,
                    ctx=self.opt.ctx)
            else:
                raise ValueError("Must choose a model from model_zoo, "
                                 "please provide the pose model_zoo_pose using --model_zoo_pose")
            self.logger.info(self.posenet)

            # resume checkpoint if needed
            if self.opt.resume_pose is not None:
                if os.path.isfile(self.opt.resume_pose):
                    logger.info('Resume pose model: %s' % self.opt.resume_pose)
                    self.posenet.load_parameters(self.opt.resume_pose,
                                                 ctx=self.opt.ctx)
                else:
                    raise RuntimeError("=> no checkpoint found at '{}'".format(
                        self.opt.resume_pose))

        if self.opt.hybridize:
            self.model.hybridize()
            if self.use_pose_net:
                self.posenet.hybridize()

        ################### optimization setting ###################
        self.lr_scheduler_depth = LRSequential([
            LRScheduler('step',
                        base_lr=self.opt.learning_rate,
                        nepochs=self.opt.num_epochs - self.opt.warmup_epochs,
                        iters_per_epoch=len(self.train_loader),
                        step_epoch=[
                            self.opt.scheduler_step_size -
                            self.opt.warmup_epochs
                        ])
        ])
        optimizer_params_depth = {
            'lr_scheduler': self.lr_scheduler_depth,
            'learning_rate': self.opt.learning_rate
        }

        self.depth_optimizer = gluon.Trainer(self.model.collect_params(),
                                             'adam', optimizer_params_depth)

        if self.use_pose_net:
            self.lr_scheduler_pose = LRSequential([
                LRScheduler(
                    'step',
                    base_lr=self.opt.learning_rate,
                    nepochs=self.opt.num_epochs - self.opt.warmup_epochs,
                    iters_per_epoch=len(self.train_loader),
                    step_epoch=[
                        self.opt.scheduler_step_size - self.opt.warmup_epochs
                    ])
            ])
            optimizer_params_pose = {
                'lr_scheduler': self.lr_scheduler_pose,
                'learning_rate': self.opt.learning_rate
            }
            self.pose_optimizer = gluon.Trainer(self.posenet.collect_params(),
                                                'adam', optimizer_params_pose)

        print("Training model named:\n  ", self.opt.model_zoo)
        print("Models are saved to:\n  ", self.opt.log_dir)
        print("Training is using:\n  ",
              "CPU" if self.opt.ctx[0] == mx.cpu() else "GPU")

        ################### loss function ###################
        if not self.opt.no_ssim:
            self.ssim = SSIM()

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w, ctx=self.opt.ctx[0])
            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)

        ################### metrics ###################
        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                len(train_dataset), len(val_dataset)))

        self.save_opts()

        # for save best model
        self.best_delta1 = 0
        self.best_model = self.model

        if self.use_pose_net:
            self.best_posenet = self.posenet