Example #1
def handler(context):
    dataset_alias = context.datasets
    trainval_2007_dataset_id = dataset_alias['trainval2007']
    trainval_2012_dataset_id = dataset_alias['trainval2012']
    test_2007_dataset_id = dataset_alias['test2007']

    trainval_2007_dataset = list(
        load_dataset_from_api(trainval_2007_dataset_id))
    trainval_2012_dataset = list(
        load_dataset_from_api(trainval_2012_dataset_id))
    test_2007_dataset = list(load_dataset_from_api(test_2007_dataset_id))

    if network_model == 'ssd300':
        model = SSD300(n_fg_class=len(voc_bbox_label_names),
                       pretrained_model='imagenet')
    elif network_model == 'ssd512':
        model = SSD512(n_fg_class=len(voc_bbox_label_names),
                       pretrained_model='imagenet')
    else:
        raise ValueError(
            "network_model must be 'ssd300' or 'ssd512', got {}".format(
                network_model))

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    if USE_GPU >= 0:
        chainer.cuda.get_device_from_id(USE_GPU).use()
        model.to_gpu()

    trainval_2007 = DetectionDatasetFromAPI(trainval_2007_dataset)
    trainval_2012 = DetectionDatasetFromAPI(trainval_2012_dataset)
    test_2007 = DetectionDatasetFromAPI(test_2007_dataset,
                                        use_difficult=True,
                                        return_difficult=True)

    train = TransformDataset(ConcatenatedDataset(trainval_2007, trainval_2012),
                             Transform(model.coder, model.insize, model.mean))
    train_iter = chainer.iterators.SerialIterator(train, BATCHSIZE)

    test_iter = chainer.iterators.SerialIterator(test_2007,
                                                 BATCHSIZE,
                                                 repeat=False,
                                                 shuffle=False)

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=USE_GPU)
    trainer = training.Trainer(updater, (nb_iterations, 'iteration'),
                               out=ABEJA_TRAINING_RESULT_DIR)
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=1e-3),
                   trigger=triggers.ManualScheduleTrigger([80000, 100000],
                                                          'iteration'))

    trainer.extend(DetectionVOCEvaluator(test_iter,
                                         model,
                                         use_07_metric=True,
                                         label_names=voc_bbox_label_names),
                   trigger=(10000, 'iteration'))

    log_interval = 100, 'iteration'
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)

    print_entries = [
        'iteration', 'main/loss', 'main/loss/loc', 'main/loss/conf',
        'validation/main/map'
    ]
    report_entries = [
        'epoch', 'iteration', 'lr', 'main/loss', 'main/loss/loc',
        'main/loss/conf', 'validation/main/map'
    ]

    trainer.extend(Statistics(report_entries,
                              nb_iterations,
                              obs_key='iteration'),
                   trigger=log_interval)
    trainer.extend(Tensorboard(report_entries, out_dir=log_path))
    trainer.extend(extensions.PrintReport(print_entries), trigger=log_interval)

    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=(nb_iterations, 'iteration'))

    trainer.run()
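
The handler above depends on module-level settings (network_model, USE_GPU, BATCHSIZE, nb_iterations, ABEJA_TRAINING_RESULT_DIR, log_path) that are defined outside the snippet. A minimal sketch of how they might be supplied, assuming they arrive as environment variables as is common for ABEJA Platform training jobs (the variable names and defaults below are illustrative, not taken from the original source):

import os

# Hypothetical module-level configuration assumed by the handler above.
USE_GPU = int(os.environ.get('USE_GPU', '-1'))  # -1 means CPU
BATCHSIZE = int(os.environ.get('BATCHSIZE', '32'))
nb_iterations = int(os.environ.get('NB_ITERATIONS', '120000'))
network_model = os.environ.get('NETWORK_MODEL', 'ssd300')  # 'ssd300' or 'ssd512'
ABEJA_TRAINING_RESULT_DIR = os.environ.get('ABEJA_TRAINING_RESULT_DIR', '.')
log_path = os.path.join(ABEJA_TRAINING_RESULT_DIR, 'logs')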
Example #2
def handler(context):
    dataset_alias = context.datasets

    trainval_dataset_id = dataset_alias['trainval']
    test_dataset_id = dataset_alias['test']

    trainval_dataset = list(load_dataset_from_api(trainval_dataset_id))
    test_dataset = list(load_dataset_from_api(test_dataset_id))

    trainval = DetectionDatasetFromAPI(trainval_dataset,
                                       transform=SSDAugmentation(
                                           min_dim, MEANS))
    test = DetectionDatasetFromAPI(test_dataset,
                                   transform=SSDAugmentation(min_dim, MEANS))
    train_dataset = trainval
    test_dataset = test

    priorbox = PriorBox(min_dim, PARAMS)
    with torch.no_grad():
        priors = priorbox.forward().to(device)

    ssd_net = build_ssd('train', priors, min_dim, num_classes)
    ssd_net = ssd_net.to(device)

    url = 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth'
    weight_file = os.path.join(ABEJA_TRAINING_RESULT_DIR,
                               'vgg16_reducedfc.pth')
    download(url, weight_file)

    vgg_weights = torch.load(weight_file)
    print('Loading base network...')
    ssd_net.vgg.load_state_dict(vgg_weights)

    optimizer = optim.SGD(ssd_net.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    criterion = MultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False,
                             PARAMS['variance'], device)

    # learning-rate step counter
    step_index = 0

    trainloader = data.DataLoader(train_dataset,
                                  batch_size,
                                  num_workers=0,
                                  shuffle=True,
                                  collate_fn=tools.detection_collate,
                                  pin_memory=True)

    testloader = data.DataLoader(test_dataset,
                                 batch_size,
                                 num_workers=0,
                                 shuffle=False,
                                 collate_fn=tools.detection_collate,
                                 pin_memory=True)

    # training loop
    iteration = 1
    while iteration <= max_iter:
        ssd_net.train()
        for images, targets in trainloader:
            if iteration > max_iter:
                break

            if iteration in lr_steps:
                step_index += 1
                adjust_learning_rate(optimizer, 0.1, step_index)

            # load train data
            images = images.to(device)
            targets = [ann.to(device) for ann in targets]

            # forward
            out = ssd_net(images)

            # backprop
            optimizer.zero_grad()
            loss_l, loss_c = criterion(out, targets)
            loss = loss_l + loss_c
            loss.backward()
            optimizer.step()

            if iteration % 100 == 0:
                print('[Train] iter {}, loss: {:.4f}'.format(
                    iteration, loss.item()))
                statistics(iteration, loss.item(), None, None, None)
                writer.add_scalar('main/loss', loss.item(), iteration)
                writer.add_scalar('main/loc_loss', loss_l.item(), iteration)
                writer.add_scalar('main/conf_loss', loss_c.item(), iteration)

            if iteration % 10000 == 0:
                eval(testloader, ssd_net, criterion, iteration)
                ssd_net.train()

            iteration += 1
    torch.save(ssd_net.state_dict(),
               os.path.join(ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
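
This example also relies on helpers defined elsewhere: eval() here is a project-defined validation routine (not the Python built-in), statistics and writer report metrics, and adjust_learning_rate() applies the stepped LR schedule at the iterations listed in lr_steps. A minimal sketch of that LR helper, modeled on the ssd.pytorch training script and assuming lr is the module-level base learning rate:

def adjust_learning_rate(optimizer, gamma, step):
    # Decay the base learning rate by gamma for each LR step reached so far.
    new_lr = lr * (gamma ** step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr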
Example #3
def handler(context):
    class_labels = 10

    dataset_alias = context.datasets
    train_dataset_id = dataset_alias['train']
    test_dataset_id = dataset_alias['test']

    train_data = list(load_dataset_from_api(train_dataset_id))
    test_data = list(load_dataset_from_api(test_dataset_id))

    train = ImageDatasetFromAPI(train_data, train=True)
    test = ImageDatasetFromAPI(test_data)

    net = utils.VGG.VGG(class_labels)
    model = L.Classifier(net)

    if USE_GPU >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(USE_GPU).use()
        model.to_gpu()  # Copy the model to the GPU

    optimizer = chainer.optimizers.MomentumSGD(learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    stop_trigger = (epochs, 'epoch')
    # Early stopping option
    if early_stopping:
        stop_trigger = triggers.EarlyStoppingTrigger(monitor=early_stopping,
                                                     verbose=True,
                                                     max_trigger=(epochs,
                                                                  'epoch'))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=USE_GPU)
    trainer = training.Trainer(updater,
                               stop_trigger,
                               out=ABEJA_TRAINING_RESULT_DIR)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=USE_GPU))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5),
                   trigger=(25, 'epoch'))

    # Take a snapshot at each epoch
    trainer.extend(extensions.snapshot_object(net, 'net.model'),
                   trigger=(epochs, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.

    report_entries = [
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy'
    ]

    trainer.extend(Statistics(report_entries, epochs), trigger=(1, 'epoch'))
    trainer.extend(Tensorboard(report_entries, out_dir=log_path))
    trainer.extend(extensions.PrintReport(report_entries))

    trainer.run()
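
snapshot_object above saves only the wrapped network (net) in NPZ format under ABEJA_TRAINING_RESULT_DIR. A minimal sketch of restoring that snapshot later for inference, assuming the same utils.VGG definition and class count used during training:

import os
import chainer
import utils

net = utils.VGG.VGG(10)  # must match the architecture and class count used in training
chainer.serializers.load_npz(
    os.path.join(ABEJA_TRAINING_RESULT_DIR, 'net.model'), net)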
Example #4
def handler(context):
    print(
        f'start training with parameters : {Parameters.as_dict()}, context : {context}'
    )

    try:
        dataset_alias = context.datasets  # for older version
    except AttributeError:
        dataset_alias = context['datasets']

    train_dataset_id, val_dataset_id = get_dataset_ids(dataset_alias)

    id2index, _ = set_categories(list(dataset_alias.values()))
    num_classes = len(id2index)
    num_classes += 1  # add for background class

    print(f'number of classes : {num_classes}')

    print("Start downloading datasets.")
    dataset_items = list(
        load_dataset_from_api(train_dataset_id, max_num=Parameters.MAX_ITEMS))
    print("Finish downloading datasets.")

    random.shuffle(dataset_items)
    if val_dataset_id is not None:
        val_dataset_items = list(
            load_dataset_from_api(val_dataset_id,
                                  max_num=Parameters.MAX_ITEMS))
        random.shuffle(val_dataset_items)
        train_dataset_items = dataset_items
    else:
        test_size = int(len(dataset_items) * Parameters.TEST_SIZE)
        train_dataset_items, val_dataset_items = dataset_items[
            test_size:], dataset_items[:test_size]

    train_dataset = ABEJAPlatformDataset(train_dataset_items,
                                         phase="train",
                                         transform=DataTransform(
                                             Parameters.IMG_SIZE,
                                             Parameters.MEANS))

    val_dataset = ABEJAPlatformDataset(val_dataset_items,
                                       phase="val",
                                       transform=DataTransform(
                                           Parameters.IMG_SIZE,
                                           Parameters.MEANS))

    print(f'train dataset : {len(train_dataset)}')
    print(f'val dataset : {len(val_dataset)}')

    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=Parameters.BATCH_SIZE,
                                       shuffle=Parameters.SHUFFLE,
                                       collate_fn=od_collate_fn)

    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=Parameters.BATCH_SIZE,
                                     shuffle=False,
                                     collate_fn=od_collate_fn)

    dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}
    print(f'data loaders : {dataloaders_dict}')

    ssd_cfg = {
        'num_classes':
        num_classes,  # number of classes including background class
        'input_size': Parameters.IMG_SIZE,
        'bbox_aspect_num': Parameters.BBOX_ASPECT_NUM,
        'feature_maps': Parameters.FEATURE_MAPS,
        'steps': Parameters.STEPS,
        'min_sizes': Parameters.MIN_SIZES,
        'max_sizes': Parameters.MAX_SIZES,
        'aspect_ratios': Parameters.ASPECT_RATIOS,
        'conf_thresh': Parameters.CONF_THRESHOLD,
        'top_k': Parameters.TOP_K,
        'nms_thresh': Parameters.NMS_THRESHOLD
    }
    net = SSD(phase="train", cfg=ssd_cfg)

    # TODO: better to host this file by ourselves
    # https://github.com/amdegroot/ssd.pytorch#training-ssd
    url = 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth'
    weight_file = os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR,
                               'vgg16_reducedfc.pth')
    download(url, weight_file)

    vgg_weights = torch.load(weight_file)
    print('finish loading base network...')
    net.vgg.load_state_dict(vgg_weights)

    def weights_init(m):
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight.data)
            if m.bias is not None:  # in case of bias
                nn.init.constant_(m.bias, 0.0)

    # apply initial values of He
    net.extras.apply(weights_init)
    net.loc.apply(weights_init)
    net.conf.apply(weights_init)

    # configure loss function
    criterion = MultiBoxLoss(jaccard_thresh=Parameters.OVERLAP_THRESHOLD,
                             neg_pos=Parameters.NEG_POS,
                             device=device)

    # configure optimizer
    optimizer = optim.SGD(net.parameters(),
                          lr=Parameters.LR,
                          momentum=Parameters.MOMENTUM,
                          dampening=Parameters.DAMPENING,
                          weight_decay=Parameters.WEIGHT_DECAY,
                          nesterov=Parameters.NESTEROV)

    # move network to device
    net.to(device)

    # NOTE: This flag enables the built-in cudnn auto-tuner
    # to find the best algorithm to use for your hardware.
    # cf. https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/2
    torch.backends.cudnn.benchmark = True

    iteration = 1
    epoch_train_loss = 0.0
    epoch_val_loss = 0.0
    latest_epoch_train_loss = epoch_train_loss
    latest_epoch_val_loss = epoch_val_loss

    for epoch in range(Parameters.EPOCHS):

        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch + 1, Parameters.EPOCHS))
        print('-------------')

        # loop of train and validation for each epoch
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
                print('(train)')
            else:
                if (epoch + 1) % 10 == 0:
                    net.eval()
                    print('-------------')
                    print('(val)')
                else:
                    # validation runs only once every ten epochs; skip otherwise
                    continue

            # loop each mini-batch from data loader
            for images, targets in dataloaders_dict[phase]:

                images = images.to(device)
                targets = [ann.to(device) for ann in targets]

                # initialize optimizer
                optimizer.zero_grad()

                # calculate forward
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)

                    # calculate loss
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    if phase == 'train':
                        # back propagate when training
                        loss.backward()  # calculate gradient

                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=Parameters.CLIP_VALUE)

                        optimizer.step()  # update parameters

                        if iteration % 10 == 0:  # display loss once every ten iterations
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print(
                                'iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'
                                .format(iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1

                    else:
                        epoch_val_loss += loss.item()

        # loss and accuracy rate of each phase of epoch
        t_epoch_finish = time.time()

        # keep latest epoch loss
        if epoch_train_loss != 0.0:
            num_total = len(dataloaders_dict['train'])
            latest_epoch_train_loss = epoch_train_loss / num_total
        if epoch_val_loss != 0.0:
            num_total = len(dataloaders_dict['val'])
            latest_epoch_val_loss = epoch_val_loss / num_total

        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} || Epoch_VAL_Loss:{:.4f}'.
              format(epoch + 1, latest_epoch_train_loss,
                     latest_epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
        t_epoch_start = time.time()

        statistics(epoch + 1, latest_epoch_train_loss, None,
                   latest_epoch_val_loss, None)

        writer.add_scalar('main/loss', latest_epoch_train_loss, epoch + 1)
        if (epoch + 1) % 10 == 0:
            writer.add_scalar('test/loss', latest_epoch_val_loss, epoch + 1)

            model_path = os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR,
                                      f'ssd300_{str(epoch + 1)}.pth')
            torch.save(net.state_dict(), model_path)

        writer.flush()
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

    torch.save(net.state_dict(),
               os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
    writer.close()
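
The DataLoaders above use od_collate_fn because the images in a batch can be stacked into a single tensor while the number of ground-truth boxes varies per image. A minimal sketch of such a collate function, assuming each dataset item is an (image_tensor, annotation_array) pair with one row per box:

import torch

def od_collate_fn(batch):
    # Stack images into one tensor; keep annotations as a list of per-image
    # tensors because the number of boxes differs between images.
    images = []
    targets = []
    for image, annotation in batch:
        images.append(image)
        targets.append(torch.FloatTensor(annotation))
    return torch.stack(images, dim=0), targets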
Example #5
def handler(context):
    dataset_alias = context.datasets
    data = list(load_dataset_from_api(dataset_alias['train']))

    np.random.seed(0)
    data = np.random.permutation(data)
    nb_data = len(data)
    nb_train = int(7 * nb_data // 10)
    train_data_raw = data[:nb_train]
    test_data_raw = data[nb_train:]

    premodel = SSD300(n_fg_class=20, pretrained_model='voc0712')
    model = SSD300(n_fg_class=1)

    copy_ssd(model, premodel)

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    if USE_GPU >= 0:
        chainer.cuda.get_device_from_id(USE_GPU).use()
        model.to_gpu()

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    fix_ssd(train_chain)

    train_data = DetectionDatasetFromAPI(train_data_raw)
    test_data = DetectionDatasetFromAPI(test_data_raw,
                                        use_difficult=True,
                                        return_difficult=True)

    train_data = TransformDataset(
        train_data, Transform(model.coder, model.insize, model.mean))
    train_iter = chainer.iterators.SerialIterator(train_data, BATCHSIZE)

    test_iter = chainer.iterators.SerialIterator(test_data,
                                                 BATCHSIZE,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=USE_GPU)
    trainer = training.Trainer(updater, (nb_epochs, 'epoch'),
                               out=ABEJA_TRAINING_RESULT_DIR)
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=1e-3),
                   trigger=triggers.ManualScheduleTrigger([1200, 1600],
                                                          'epoch'))

    trainer.extend(DetectionVOCEvaluator(test_iter,
                                         model,
                                         use_07_metric=True,
                                         label_names=['cup']),
                   trigger=(1, 'epoch'))

    log_interval = 1, 'epoch'
    trainer.extend(extensions.LogReport(trigger=log_interval))

    print_entries = [
        'epoch', 'main/loss', 'main/loss/loc', 'main/loss/conf',
        'validation/main/map'
    ]
    report_entries = [
        'epoch', 'lr', 'main/loss', 'main/loss/loc', 'main/loss/conf',
        'validation/main/map'
    ]

    trainer.extend(Statistics(report_entries, nb_epochs), trigger=log_interval)
    trainer.extend(Tensorboard(report_entries, out_dir=log_path))
    trainer.extend(extensions.PrintReport(print_entries), trigger=log_interval)

    trainer.extend(extensions.snapshot_object(model,
                                              'model_epoch_{.updater.epoch}'),
                   trigger=(nb_epochs, 'epoch'))

    trainer.run()
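
copy_ssd and fix_ssd are project-specific helpers: the first transfers weights from the pretrained 20-class VOC model into the new single-class model, and the second freezes part of the network so fine-tuning updates only the remaining layers. A minimal sketch of the freezing step, assuming the VGG feature extractor is the part kept fixed (which layers the original actually freezes is not shown here):

def fix_ssd(train_chain):
    # Disable updates for the feature extractor; only the multibox heads train.
    for param in train_chain.model.extractor.params():
        param.update_rule.enabled = False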
Example #6
def handler(context):
    dataset_alias = context.datasets
    train_dataset_id = dataset_alias['train']
    test_dataset_id = dataset_alias['test']

    train_data = list(load_dataset_from_api(train_dataset_id))
    test_data = list(load_dataset_from_api(test_dataset_id))

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    #trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
    trainset = ImageDatasetFromAPI(train_data, transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=128,
                                              shuffle=True,
                                              num_workers=2)

    #testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
    testset = ImageDatasetFromAPI(test_data, transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=100,
                                             shuffle=False,
                                             num_workers=2)

    # Model
    print('==> Building model..')
    if model == 'VGG19':
        net = VGG('VGG19')
    elif model == 'ResNet18':
        net = ResNet18()
    elif model == 'PreActResNet18':
        net = PreActResNet18()
    elif model == 'GoogLeNet':
        net = GoogLeNet()
    elif model == 'DenseNet121':
        net = DenseNet121()
    elif model == 'ResNeXt29_2x64d':
        net = ResNeXt29_2x64d()
    elif model == 'MobileNet':
        net = MobileNet()
    elif model == 'MobileNetV2':
        net = MobileNetV2()
    elif model == 'DPN92':
        net = DPN92()
    elif model == 'ShuffleNetG2':
        net = ShuffleNetG2()
    elif model == 'SENet18':
        net = SENet18()
    elif model == 'ShuffleNetV2':
        net = ShuffleNetV2(1)
    else:
        raise ValueError('unknown model: {}'.format(model))

    net = net.to(device)

    #if device == 'cuda':
    #    net = torch.nn.DataParallel(net)
    #    cudnn.benchmark = True

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[150, 250],
                                         gamma=0.1)

    statistics = Statistics(epochs)

    for epoch in range(epochs):
        train_loss, train_acc = train(net, optimizer, trainloader, criterion,
                                      epoch)
        test_loss, test_acc = test(net, testloader, criterion, epoch)

        # Step the LR schedule after the epoch's updates; calling
        # scheduler.step() before the optimizer steps skips the first LR value
        # in PyTorch >= 1.1.
        scheduler.step()

        # Reporting
        print(
            '[{:d}] main/loss: {:.3f} main/acc: {:.3f}, main/validation/loss: {:.3f}, main/validation/acc: {:.3f}'
            .format(epoch + 1, train_loss, train_acc, test_loss, test_acc))

        statistics(epoch + 1, train_loss, train_acc, test_loss, test_acc)
        writer.add_scalar('main/loss', train_loss, epoch + 1)
        writer.add_scalar('main/acc', train_acc, epoch + 1)
        writer.add_scalar('main/validation/loss', test_loss, epoch + 1)
        writer.add_scalar('main/validation/acc', test_acc, epoch + 1)

    torch.save(net.state_dict(),
               os.path.join(ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
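
The train and test helpers called in the epoch loop are not shown in this example. A minimal sketch of what they are assumed to do (return average loss and accuracy over the epoch), using the module-level device:

def train(net, optimizer, trainloader, criterion, epoch):
    net.train()
    total_loss, correct, total = 0.0, 0, 0
    for inputs, labels in trainloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * labels.size(0)
        correct += (outputs.argmax(dim=1) == labels).sum().item()
        total += labels.size(0)
    return total_loss / total, correct / total


def test(net, testloader, criterion, epoch):
    net.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * labels.size(0)
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)
    return total_loss / total, correct / total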