# global variables
    if opt.save_frequency is None:
        opt.save_frequency = get_default_save_frequency(opt.dataset)
    logger.info('Starting new image-classification task: %s', opt)
    mx.random.seed(opt.seed)
    batch_size, dataset, classes = opt.batch_size, opt.dataset, get_num_classes(
        opt.dataset)
    context = [mx.gpu(int(i)) for i in opt.gpus.split(',')
               ] if opt.gpus.strip() else [mx.cpu()]
    if opt.dry_run:
        context = [mx.cpu()]
    num_gpus = len(context)
    batch_size *= max(1, num_gpus)
    opt.batch_size = batch_size
    metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5)])

    net, arg_params, aux_params = get_model(opt, context)

    print(net)

    if opt.profile:
        import hotshot, hotshot.stats
        prof = hotshot.Profile('image-classifier-%s-%s.prof' %
                               (opt.model, opt.mode))
        prof.runcall(main)
        prof.close()
        stats = hotshot.stats.load('image-classifier-%s-%s.prof' %
                                   (opt.model, opt.mode))
        stats.strip_dirs()
        stats.sort_stats('cumtime', 'calls')
Example #2
def get_metrics():
    # `metrics` argument was split into `train_metrics` and `val_metrics` in mxnet 1.6.0:
    # https://github.com/apache/incubator-mxnet/pull/17048
    arg_name = "metrics" if is_mxnet_older_than_1_6_0() else "train_metrics"
    return {arg_name: Accuracy()}
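
A minimal sketch of how that version-dependent keyword can be unpacked into the Gluon estimator; the nn.Dense stand-in network and the estimator wiring are illustrative assumptions, not part of the snippet above:

from mxnet.gluon import nn
from mxnet.gluon.contrib.estimator import Estimator
from mxnet.gluon.loss import SoftmaxCrossEntropyLoss

net = nn.Dense(10)   # stand-in network, for illustration only
net.initialize()

# Unpack so Accuracy lands on whichever keyword this MXNet version expects
# ("metrics" before 1.6.0, "train_metrics" from 1.6.0 onwards).
est = Estimator(net=net, loss=SoftmaxCrossEntropyLoss(), **get_metrics())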
Example #3
def main(train_list,
         val_list,
         model,
         exp,
         saved_model,
         batch_size,
         optimizer,
         nb_epochs,
         augment,
         max_lr,
         min_lr,
         loss_function,
         train_all,
         nb_frames,
         eager,
         params=None,
         **kwargs):

    print("Unused arguments:", kwargs)

    setname = train_list.split(os.sep)[0]
    # Timestamp to name experiment folder
    xptime = strftime("%Y-%m-%d_%Hh%Mm%Ss", gmtime())
    xp_folder = "experiments/%s-%s-%s_%s" % (setname, model, exp, xptime)
    # Make folder
    mkdir_p(xp_folder)
    mkdir_p(os.path.join(xp_folder, 'checkpoints'))
    mkdir_p(os.path.join(xp_folder, 'tb'))
    print("\nSaving experiment data to:", xp_folder)

    # Save command (as well as possible)
    with open(os.path.join(xp_folder, 'command.sh'), "w") as f:
        command = " ".join(sys.argv[:]) + "\n"
        f.write(command)

    # Save employed parameters for future reference
    if params is not None:
        write_params(os.path.join(xp_folder, 'params.json'), params)

    #############
    # Callbacks #
    #############

    # Helper: Save the model.
    ckpt_fmt = os.path.join(
        xp_folder, 'checkpoints', model + '-' + exp +
        '.{epoch:03d}-loss{val_loss:.3f}-acc{val_acc:.3f}.hdf5')
    checkpointer = ModelCheckpoint(filepath=ckpt_fmt,
                                   verbose=1,
                                   save_best_only=True,
                                   monitor='val_acc')

    # Helper: TensorBoard
    tb = HistoryKeeper(logdir=os.path.join(xp_folder),
                       keys=['val_acc', 'val_loss', 'train_time', 'val_time'])

    # Helper: Stop when we stop learning.
    # early_stopper = EarlyStopper(patience=15)

    # Helper: Terminate when finding a NaN loss
    nan_term = TerminateOnNaN()

    callbacks = [tb, checkpointer, nan_term]
    #############

    #############
    #  Loading  #
    #############
    if augment:
        augmenter = default_augmenter(strip_size=4)
    else:
        augmenter = None

    # Apply the augmenter only when one was built
    if augmenter is not None:
        transform = lambda data, label: (augmenter(preprocess(data)), label)
    else:
        transform = lambda data, label: (preprocess(data), label)

    # Dataset classes
    train_data = ImageFolderDataset(train_list, transform=transform)
    val_data = ImageFolderDataset(val_list)
    img_shape = train_data[0][0].shape

    # Train loader
    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=10)
    nb_samples = len(train_data)  # loader should provide the number of samples

    # Validation loader
    val_loader = DataLoader(val_data,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=10)
    nb_validation = len(val_data)  # loader should provide the number of samples

    # Compute number of steps
    steps_per_epoch = math.ceil(nb_samples / batch_size)
    validation_steps = math.ceil(nb_validation / batch_size)

    # The model
    net = ResearchModels(8,
                         model,
                         saved_model,
                         input_shape=img_shape,
                         train_all=train_all).model

    # A little more verbosity
    print("************************************")
    if train_all:
        print("Train all layers.")
    print("Max lr:", max_lr, " Min lr:", min_lr)
    print("Batch size:", batch_size)
    print(nb_samples, "training samples,", steps_per_epoch, "steps per epoch")
    print(nb_validation, "validation samples,", validation_steps,
          "validation steps")
    print("Optimizer:", optimizer)
    if augment:
        print("Using data augmentation")
    else:
        print("WARNING: Not using data augmentation")
    print("************************************")

    ############################
    #   Loss and Optimization  #
    ############################

    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': max_lr})

    if loss_function == 'categorical_crossentropy':
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        loss_fn.hybridize()
    else:
        raise ValueError("Unsupported loss function: %s" % loss_function)

    ############
    # Training #
    ############
    progress_desc = "Epoch %03d - acc %.3f - loss %.3f  "
    acc = Accuracy()
    loss = Loss()
    start_time = time()

    for epoch in range(1, nb_epochs + 1):
        nb_batches = 0
        tic = time()
        acc.reset()
        loss.reset()

        train_time = 0
        t = tqdm(train_loader, unit='batch')
        for data, label in t:
            size = data.shape[0]
            # print(data.shape)

            data = data.copyto(mx.gpu(0))
            label = label.copyto(mx.gpu(0))

            start = time()
            with autograd.record():
                output = net(data)
                l = loss_fn(output, label)
            l.backward()
            end = time()
            train_time += end - start

            # update parameters
            trainer.step(size)

            acc.update(preds=output, labels=label)
            loss.update(preds=l, _=None)

            nb_batches += 1

            t.set_description(progress_desc %
                              (epoch, acc.get()[1], loss.get()[1]))

        train_loss = loss.get()[1]
        train_acc = acc.get()[1]

        acc.reset()
        loss.reset()
        val_time = 0
        # calculate validation accuracy and loss
        tval = tqdm(val_loader,
                    leave=False,
                    desc='Running validation',
                    unit='batch')
        for data, label in tval:
            data = data.copyto(mx.gpu(0))
            label = label.copyto(mx.gpu(0))

            # Compute outputs
            start = time()
            output = net(data)
            l = loss_fn(output, label)
            end = time()
            val_time += end - start

            # Compute metrics
            loss.update(preds=l, _=None)
            acc.update(preds=output, labels=label)

        val_loss = loss.get()[1]
        val_acc = acc.get()[1]

        print(
            "Epoch %d: loss %.3f, acc %.3f, val_loss %.3f, val_acc %.3f, in %.1f sec"
            % (epoch, train_loss, train_acc, val_loss, val_acc, time() - tic))
        print(
            "--------------------------------------------------------------------------------"
        )

        stop = False
        train_info = {
            'epoch': epoch,
            'loss': train_loss,
            'acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'train_time': train_time,
            'val_time': val_time
        }
        for cb in callbacks:
            if cb(net, train_info):
                stop = True

        if stop:
            break
        print()

    hours, rem = divmod(time() - start_time, 3600)
    days, hours = divmod(hours, 24)
    minutes, seconds = divmod(rem, 60)

    print("%d training epochs in %dd, %dh%dm%.2fs." %
          (epoch, int(days), int(hours), int(minutes), seconds))
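
The callback dispatch at the end of each epoch calls cb(net, train_info) and treats a truthy return as a request to stop, as the commented-out EarlyStopper suggests. A hedged sketch of a callback compatible with that contract (PlateauStopper is a made-up name, not part of the original code):

class PlateauStopper:
    """Ask the loop above to stop when val_loss has not improved for `patience` epochs."""

    def __init__(self, patience=15):
        self.patience = patience
        self.best = float("inf")
        self.bad_epochs = 0

    def __call__(self, net, train_info):
        if train_info['val_loss'] < self.best:
            self.best = train_info['val_loss']
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
        return self.bad_epochs >= self.patience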
Example #4
 def __init__(self):
     is_pair = True
     class_labels = ['0', '1']
     metric = Accuracy()
     super(WNLITask, self).__init__(class_labels, metric, is_pair)
Example #5
 def __init__(self):
     is_pair = True
     class_labels = ['neutral', 'entailment', 'contradiction']
     metric = Accuracy()
     super(MNLITask, self).__init__(class_labels, metric, is_pair)
Example #6
 def get_metric(cls):
     """Get metrics Accuracy and F1"""
     metric = CompositeEvalMetric()
     for child_metric in [Accuracy(), F1(average='micro')]:
         metric.add(child_metric)
     return metric
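
A small, self-contained sketch of feeding such a composite metric; the labels and predictions below are made up purely for illustration:

import mxnet as mx
from mxnet.metric import Accuracy, CompositeEvalMetric, F1

metric = CompositeEvalMetric()
for child_metric in [Accuracy(), F1(average='micro')]:
    metric.add(child_metric)

labels = [mx.nd.array([0, 1, 1])]                            # true classes
preds = [mx.nd.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])]  # class probabilities
metric.update(labels, preds)
print(metric.get())  # (['accuracy', 'f1'], [<accuracy>, <f1 score>])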
Example #7
def _suggest_metric_for_loss(loss):
    if isinstance(loss, SoftmaxCrossEntropyLoss):
        return Accuracy()
    return None
Example #8
model_name = opt.model
dataset_classes = {
    'mnist': 10,
    'cifar10': 10,
    'imagenet': 1000,
    'dummy': 1000,
    'sampleimgnet': 200
}
batch_size, dataset, classes = opt.batch_size, opt.dataset, dataset_classes[
    opt.dataset]
context = [mx.gpu(int(i))
           for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()]
num_gpus = len(context)
batch_size *= max(1, num_gpus)
lr_steps = [int(x) for x in opt.lr_steps.split(',') if x.strip()]
metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5), CrossEntropy()])
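
The truncated snippet never shows how lr_steps is consumed; one common pattern, sketched here as an assumption rather than the original author's code, is to hand it to a MultiFactorScheduler (the 0.1 decay factor and 0.01 base learning rate are placeholders):

lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(step=lr_steps, factor=0.1)
optimizer_params = {'learning_rate': 0.01, 'lr_scheduler': lr_scheduler}
# e.g. trainer = gluon.Trainer(net.collect_params(), 'sgd', optimizer_params)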


def get_model(model, ctx, opt):
    """Model initialization."""
    kwargs = {'ctx': ctx, 'pretrained': opt.use_pretrained, 'classes': classes}
    if model.startswith('resnet'):
        kwargs['thumbnail'] = opt.use_thumbnail
    elif model.startswith('vgg'):
        kwargs['batch_norm'] = opt.batch_norm

    net = models.get_model(model, **kwargs)
    if opt.resume:
        net.load_params(opt.resume)
    elif not opt.use_pretrained:
        if model in ['alexnet']:
Example #9
out = mx.sym.SoftmaxOutput(data=fc2, name='softmax')
model = mx.mod.Module(out, context=ctx)
model.bind(data_shapes=train_data.provide_data,
           label_shapes=train_data.provide_label)

# initialize parameters
model.init_params(initializer=mx.init.Xavier(magnitude=2.))
opt_params = {
    'learning_rate': 0.001,
    'beta1': 0.9,
    'beta2': 0.999,
    'epsilon': 1e-08
}
opt = mx.optimizer.create('adam', **opt_params)
model.init_optimizer(kvstore='device', optimizer=opt)
metric = Accuracy()
# train
start = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    train_data.reset()
    for i, batch in enumerate(train_data):
        if i == 0:
            tick_0 = time.time()
        model.forward(batch, is_train=True)
        model.backward()
        model.update()
        model.update_metric(metric, batch.label)
    str1 = 'Epoch [{}], Accuracy {:.4f}'.format(epoch, metric.get()[1])
    str2 = '~Samples/Sec {:.4f}'.format(BATCH_SIZE * (i + 1) /
                                        (time.time() - tick_0))
    print('%s  %s' % (str1, str2))
Example #10
 def get_metric():
     """Get metrics Accuracy"""
     return Accuracy()
Example #11
 def get_metric():
     """Get metrics Accuracy and F1"""
     metric = CompositeEvalMetric()
     for child_metric in [Accuracy(), F1()]:
         metric.add(child_metric)
     return metric
Example #12
def main():
    epoches = 32
    gpu_id = 7
    ctx_list = [mx.gpu(x) for x in [7, 8]]
    log_interval = 100
    batch_size = 32
    start_epoch = 0
    # trainer_resume = resume + ".states" if resume is not None else None
    trainer_resume = None

    resume = None
    from mxnet.gluon.data.vision import transforms
    transform_fn = transforms.Compose([
        LeftTopPad(dest_shape=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/train2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_train2017.json",
        transforms=transform_fn,
        feature_hdf5="output/train2017.h5")
    val_dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/val2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_val2017.json",
        words2index=dataset.words2index,
        index2words=dataset.index2words,
        transforms=transform_fn,
        feature_hdf5="output/val2017.h5")
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True,
                            last_batch="discard")
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True)

    num_words = dataset.words_count

    # set up logger
    save_prefix = "output/res50_"
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

    net = EncoderDecoder(num_words=num_words,
                         test_max_len=val_dataset.max_len).cuda()
    for name, p in net.named_parameters():
        if "bias" in name:
            p.data.zero_()
        else:
            p.data.normal_(0, 0.01)
        print(name)
    net = torch.nn.DataParallel(net)
    if resume is not None:
        net.collect_params().load(resume,
                                  allow_missing=True,
                                  ignore_extra=True)
        logger.info("Resumed form checkpoint {}.".format(resume))

    trainer = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                             net.parameters()),
                               lr=4e-4)
    criterion = Criterion()
    accu_top3_metric = TopKAccuracy(top_k=3)
    accu_top1_metric = Accuracy(name="batch_accu")
    ctc_loss_metric = Loss(name="ctc_loss")
    alpha_metric = Loss(name="alpha_loss")
    batch_bleu = BleuMetric(name="batch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    epoch_bleu = BleuMetric(name="epoch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    btic = time.time()
    logger.info(batch_size)
    logger.info(num_words)
    logger.info(len(dataset.words2index))
    logger.info(len(dataset.index2words))
    logger.info(dataset.words2index["<PAD>"])
    logger.info(val_dataset.words2index["<PAD>"])
    logger.info(len(val_dataset.words2index))
    for nepoch in range(start_epoch, epoches):
        if nepoch > 15:
            trainer.set_learning_rate(4e-5)
        logger.info("Current lr: {}".format(trainer.param_groups[0]["lr"]))
        accu_top1_metric.reset()
        accu_top3_metric.reset()
        ctc_loss_metric.reset()
        alpha_metric.reset()
        epoch_bleu.reset()
        batch_bleu.reset()
        for nbatch, batch in enumerate(tqdm.tqdm(dataloader)):
            batch = [
                Variable(torch.from_numpy(x.asnumpy()).cuda()) for x in batch
            ]
            data, label, label_len = batch
            label = label.long()
            label_len = label_len.long()
            max_len = label_len.max().data.cpu().numpy()
            net.train()
            outputs = net(data, label, max_len)
            predictions, alphas = outputs
            ctc_loss = criterion(predictions, label, label_len)
            loss2 = 1.0 * ((1. - alphas.sum(dim=1))**2).mean()
            ((ctc_loss + loss2) / batch_size).backward()
            for group in trainer.param_groups:
                for param in group['params']:
                    if param.grad is not None:
                        param.grad.data.clamp_(-5, 5)

            trainer.step()
            if nbatch % 10 == 0:
                for n, l in enumerate(label_len):
                    l = int(l.data.cpu().numpy())
                    la = label[n, 1:l].data.cpu().numpy()
                    pred = predictions[n, :(l - 1)].data.cpu().numpy()
                    accu_top3_metric.update(mx.nd.array(la), mx.nd.array(pred))
                    accu_top1_metric.update(mx.nd.array(la), mx.nd.array(pred))
                    epoch_bleu.update(la, predictions[n, :].data.cpu().numpy())
                    batch_bleu.update(la, predictions[n, :].data.cpu().numpy())
                ctc_loss_metric.update(
                    None,
                    preds=mx.nd.array([ctc_loss.data.cpu().numpy()]) /
                    batch_size)
                alpha_metric.update(None,
                                    preds=mx.nd.array(
                                        [loss2.data.cpu().numpy()]))
                if nbatch % log_interval == 0 and nbatch > 0:
                    msg = ','.join([
                        '{}={:.3f}'.format(*metric.get()) for metric in [
                            epoch_bleu, batch_bleu, accu_top1_metric,
                            accu_top3_metric, ctc_loss_metric, alpha_metric
                        ]
                    ])
                    logger.info(
                        '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                        format(
                            nepoch, nbatch,
                            log_interval * batch_size / (time.time() - btic),
                            msg))
                    btic = time.time()
                    batch_bleu.reset()
                    accu_top1_metric.reset()
                    accu_top3_metric.reset()
                    ctc_loss_metric.reset()
                    alpha_metric.reset()
        net.eval()
        bleu, acc_top1 = validate(net,
                                  gpu_id=gpu_id,
                                  val_loader=val_loader,
                                  train_index2words=dataset.index2words,
                                  val_index2words=val_dataset.index2words)
        save_path = save_prefix + "_weights-%d-bleu-%.4f-%.4f.params" % (
            nepoch, bleu, acc_top1)
        torch.save(net.module.state_dict(), save_path)
        torch.save(trainer.state_dict(), save_path + ".states")
        logger.info("Saved checkpoint to {}.".format(save_path))
Example #13
 def fit(self, itr, ctx, epochs, batch_size, callbacks=None):
     # ADAM optimizer
     #opt_params={'learning_rate':0.001, 'beta1':0.9, 'beta2':0.999, 'epsilon':1e-08}
     opt = mx.optimizer.create('adam')
     # SGD optimizer
     #opt = mx.optimizer.create('sgd')
     # AdaDelta optimizer
     #opt = mx.optimizer.create('adadelta')
     # initialize parameters
     # MXNet initializes the weight matrices uniformly by drawing from [−0.07,0.07], bias parameters are all set to 0
     # 'Xavier': initializer is designed to keep the scale of gradients roughly the same in all layers
     self._net.initialize(mx.init.Xavier(magnitude=2.3),
                          ctx=ctx,
                          force_reinit=True)
     # fetch and broadcast parameters
     params = self._net.collect_params()
     # trainer
     trainer = Trainer(params=params, optimizer=opt, kvstore='device')
     # loss function
     loss_fn = SoftmaxCrossEntropyLoss()
     # use accuracy as the evaluation metric
     metric = Accuracy()
     # train
     for e in range(epochs):
         if callbacks is not None:
             for cb in callbacks:
                 cb.before_epoch(e)
         # reset evaluation result to initial state
         metric.reset()
         # reset the train data iterator.
         itr.reset()
         # loop over the train data iterator
         for i, batch in enumerate(itr):
             # splits train data into multiple slices along batch_axis
             # copy each slice into a context
             data = split_and_load(batch.data[0],
                                   ctx_list=ctx,
                                   batch_axis=0,
                                   even_split=False)
             # splits train label into multiple slices along batch_axis
             # copy each slice into a context
             label = split_and_load(batch.label[0],
                                    ctx_list=ctx,
                                    batch_axis=0,
                                    even_split=False)
             outputs = []
             losses = []
             # inside training scope
             with ag.record():
                 for x, y in zip(data, label):
                     z = self._net(x)
                     # computes softmax cross entropy loss
                     l = loss_fn(z, y)
                     outputs.append(z)
                     losses.append(l)
             # backpropagate the error for one iteration
             for l in losses:
                 l.backward()
             # make one step of parameter update.
             # trainer needs to know the batch size of data
             # to normalize the gradient by 1/batch_size
             trainer.step(batch_size)
             # updates internal evaluation
             metric.update(label, outputs)
             # invoke callbacks after batch
             if callbacks is not None:
                 for cb in callbacks:
                     cb.after_batch(e, i, batch_size, metric)
         # invoke callbacks after epoch
         if callbacks is not None:
             for cb in callbacks:
                 cb.after_epoch(e, i, batch_size, metric)
     return metric
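
The per-batch comments above describe the split_and_load pattern; a standalone sketch of just that step, using two CPU contexts as stand-ins for a GPU list:

import mxnet as mx
from mxnet.gluon.utils import split_and_load

ctx = [mx.cpu(0), mx.cpu(1)]               # stand-in contexts; GPUs would be mx.gpu(i)
batch = mx.nd.arange(24).reshape((6, 4))   # toy batch of 6 samples
slices = split_and_load(batch, ctx_list=ctx, batch_axis=0, even_split=False)
print([s.shape for s in slices])           # two slices of 3 samples each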
Example #14
opt_params = {
    'beta2': 0.999,
    'epsilon': 1e-08
}
opt = mx.optimizer.create('adam', **opt_params)
# initialize parameters
model.initialize(force_reinit=True, ctx=ctx)
# fetch and broadcast parameters
params = model.collect_params()
if params is not None:
    hvd.broadcast_parameters(params, root_rank=0)
# create DistributedTrainer, a subclass of gluon.Trainer
trainer = hvd.DistributedTrainer(params, opt)
# loss function
loss_fn = SoftmaxCrossEntropyLoss()
# use accuracy as the evaluation metric
metric = Accuracy()
# train
start = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    # Reset the train data iterator.
    train_data.reset()
    for i, batch in enumerate(train_data):
        if i == 0:
            tick_0 = time.time()
        data = batch.data[0].as_in_context(ctx)
        label = batch.label[0].as_in_context(ctx)
        with ag.record():
            output = model(data.astype('float32', copy=False))
            loss = loss_fn(output, label)
        loss.backward()
        trainer.step(BATCH_SIZE)
Example #15
 def __init__(self):
     is_pair = True
     class_labels = ['not_entailment', 'entailment']
     metric = Accuracy()
     super(QNLITask, self).__init__(class_labels, metric, is_pair)
Example #16
batch_size = 256
train_data = gluon.data.DataLoader(mnist_train,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=4)

mnist_valid = gluon.data.vision.FashionMNIST(train=False)
valid_data = gluon.data.DataLoader(mnist_valid.transform_first(transformer),
                                   batch_size=batch_size,
                                   num_workers=4)

# Only hybrid based networks can be exported
net = HybridSequential()
net.add(Conv2D(channels=6, kernel_size=5, activation="relu"),
        MaxPool2D(pool_size=2, strides=2),
        Conv2D(channels=16, kernel_size=3, activation="relu"),
        MaxPool2D(pool_size=2, strides=2), Flatten(),
        Dense(120, activation="relu"), Dense(84, activation="relu"), Dense(10))
net.initialize(init=init.Xavier())
# Only after hybridization a model can be exported with architecture included
net.hybridize()

trainer = Trainer(net.collect_params(), "sgd", {"learning_rate": 0.1})

est = estimator.Estimator(net=net,
                          loss=SoftmaxCrossEntropyLoss(),
                          metrics=Accuracy(),
                          trainer=trainer)
est.fit(train_data=train_data, epochs=2, val_data=valid_data)
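
Following up on the two comments above: because the network is a hybridized HybridBlock and has already run forward passes during est.fit, its architecture and parameters can now be written out with export(); the file prefix below is an arbitrary illustrative choice:

net.export("lenet_fashion_mnist", epoch=2)
# writes lenet_fashion_mnist-symbol.json and lenet_fashion_mnist-0002.params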
Example #17
 def __init__(self):
     is_pair = False
     class_labels = ['0', '1']
     metric = Accuracy()
     super(SSTTask, self).__init__(class_labels, metric, is_pair)
Example #18

def load_net(param_file="net.params", ctx=cpu(0)):
    net = SimpleNet()
    net.load_parameters(param_file, ctx=ctx)
    return net


def get_val_data(transformer, batch_size=128):
    mnist_valid = gluon.data.vision.FashionMNIST(train=False)
    valid_data = gluon.data.DataLoader(
        mnist_valid.transform_first(transformer),
        batch_size=batch_size,
        num_workers=4)
    return valid_data


if __name__ == "__main__":
    ctx = gpu(0) if context.num_gpus() else cpu(0)
    net = load_net("net.params", ctx=ctx)
    valid_data = get_val_data(transformer)

    val_acc = Accuracy()
    for data, label in valid_data:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.predict_mode():
            out = net(data)
            val_acc.update(label, out)
    print("Accuray: ", val_acc.get()[1])