def download_coco(path, overwrite=False):
    _DOWNLOAD_URLS = [
        ('http://images.cocodataset.org/zips/train2017.zip',
         '10ad623668ab00c62c096f0ed636d6aff41faca5'),
        ('http://images.cocodataset.org/zips/val2017.zip',
         '4950dc9d00dbe1c933ee0170f5797584351d2a41'),
        ('http://images.cocodataset.org/annotations/annotations_trainval2017.zip',
         '8551ee4bb5860311e79dace7e79cb91e432e78b3'),
        ('https://hangzh.s3.amazonaws.com/encoding/data/coco/train_ids.pth',
         '12cd266f97c8d9ea86e15a11f11bcb5faba700b6'),
        ('https://hangzh.s3.amazonaws.com/encoding/data/coco/val_ids.pth',
         '4ce037ac33cbf3712fd93280a1c5e92dae3136bb'),
    ]
    mkdir(path)
    for url, checksum in _DOWNLOAD_URLS:
        filename = download(url, path=path, overwrite=overwrite, sha1_hash=checksum)
        # extract
        if os.path.splitext(filename)[1] == '.zip':
            with zipfile.ZipFile(filename) as zf:
                zf.extractall(path=path)
        else:
            shutil.move(filename, os.path.join(path, 'annotations', os.path.basename(filename)))
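The `mkdir` and `download` helpers used above come from the project's utility module and are not shown here. As a rough idea of what they do, a minimal stand-in using only the standard library might look like the following sketch (names and behaviour are assumed from the calls above, not the project's actual implementation):

import hashlib
import os
import urllib.request


def mkdir(path):
    # Create the directory (and any parents) if it does not already exist.
    os.makedirs(os.path.expanduser(path), exist_ok=True)


def download(url, path, overwrite=False, sha1_hash=None):
    # Fetch `url` into `path`, skipping the download if the file is already
    # present, and verify the SHA-1 checksum when one is given.
    filename = os.path.join(path, url.split('/')[-1])
    if overwrite or not os.path.exists(filename):
        urllib.request.urlretrieve(url, filename)
    if sha1_hash is not None:
        sha1 = hashlib.sha1()
        with open(filename, 'rb') as f:
            for chunk in iter(lambda: f.read(1 << 20), b''):
                sha1.update(chunk)
        if sha1.hexdigest() != sha1_hash:
            raise RuntimeError('SHA-1 mismatch for %s' % filename)
    return filename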
Example #2
def train_gluon():
    if args.save_dir:
        save_dir = args.save_dir
        save_dir = os.path.expanduser(save_dir)
        mkdir(save_dir)
    else:
        save_dir = './'
        args.save_frequency = 0  # no save_dir given, so disable checkpoint saving

    def evaluate(epoch):
        acc_top1 = mx.metric.Accuracy()
        acc_top5 = mx.metric.TopKAccuracy(5)
        for _, batch in enumerate(val_data):
            data, label = val_batch_fn(batch, context)
            output = net(data.astype(args.dtype, copy=False))
            acc_top1.update([label], [output])
            acc_top5.update([label], [output])

        top1_name, top1_acc = acc_top1.get()
        top5_name, top5_acc = acc_top5.get()
        if MPI is not None:
            comm = MPI.COMM_WORLD
            res1 = comm.gather(top1_acc, root=0)
            res2 = comm.gather(top5_acc, root=0)
        if rank == 0:
            if MPI is not None:
                #logging.info('MPI gather res1: {}'.format(res1))
                top1_acc = sum(res1) / len(res1)
                top5_acc = sum(res2) / len(res2)
            logging.info(
                'Epoch[%d] Rank[%d]\tValidation-%s=%f\tValidation-%s=%f',
                epoch, rank, top1_name, top1_acc, top5_name, top5_acc)

    # Hybridize and initialize model
    net.hybridize()
    if args.resume_params != '':
        net.load_parameters(args.resume_params, ctx=context)
    else:
        net.initialize(initializer, ctx=context)

    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    # Horovod: fetch and broadcast parameters
    params = net.collect_params()
    if params is not None:
        hvd.broadcast_parameters(params, root_rank=0)

    # Create optimizer
    optimizer = 'nag'
    optimizer_params = {
        'wd': args.wd,
        'momentum': args.momentum,
        'lr_scheduler': lr_sched
    }
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    opt = mx.optimizer.create(optimizer, **optimizer_params)

    # Horovod: create DistributedTrainer, a subclass of gluon.Trainer
    trainer = hvd.DistributedTrainer(params, opt)
    if args.resume_states != '':
        trainer.load_states(args.resume_states)

    # Create loss function and train metric
    if args.label_smoothing or args.mixup:
        sparse_label_loss = False
    else:
        sparse_label_loss = True

    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
        sparse_label=sparse_label_loss)
    if args.mixup:
        train_metric = mx.metric.RMSE()
    else:
        train_metric = mx.metric.Accuracy()

    def mixup_transform(label, classes, lam=1, eta=0.0):
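        # Mix each one-hot label with the label of the reversed batch, mirroring
        # the `lam * X + (1 - lam) * X[::-1]` data mixing in the training loop.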
        if isinstance(label, mx.nd.NDArray):
            label = [label]
        res = []
        for l in label:
            y1 = l.one_hot(classes,
                           on_value=1 - eta + eta / classes,
                           off_value=eta / classes)
            y2 = l[::-1].one_hot(classes,
                                 on_value=1 - eta + eta / classes,
                                 off_value=eta / classes)
            res.append(lam * y1 + (1 - lam) * y2)
        return res

    def smooth(label, classes, eta=0.1):
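        # Label smoothing: spread `eta` probability mass uniformly over all
        # classes instead of using hard one-hot targets.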
        if isinstance(label, mx.nd.NDArray):
            label = [label]
        smoothed = []
        for l in label:
            res = l.one_hot(classes,
                            on_value=1 - eta + eta / classes,
                            off_value=eta / classes)
            smoothed.append(res)
        return smoothed

    # Train model
    for epoch in range(args.resume_epoch, args.num_epochs):
        drop_scheduler(epoch)
        tic = time.time()
        train_metric.reset()

        btic = time.time()
        for nbatch, batch in enumerate(train_data, start=1):
            data, label = train_batch_fn(batch, context)
            data, label = [data], [label]
            if args.mixup:
                lam = np.random.beta(args.mixup_alpha, args.mixup_alpha)
                if epoch >= args.num_epochs - args.mixup_off_epoch:
                    lam = 1
                data = [lam * X + (1 - lam) * X[::-1] for X in data]

                if args.label_smoothing:
                    eta = 0.1
                else:
                    eta = 0.0
                label = mixup_transform(label, num_classes, lam, eta)

            elif args.label_smoothing:
                hard_label = label
                label = smooth(label, num_classes)

            with autograd.record():
                outputs = [net(X.astype(args.dtype, copy=False)) for X in data]
                loss = [
                    loss_fn(yhat, y.astype(args.dtype, copy=False))
                    for yhat, y in zip(outputs, label)
                ]
            for l in loss:
                l.backward()
            trainer.step(batch_size)

            if args.mixup:
                output_softmax = [mx.nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                                  for out in outputs]
                train_metric.update(label, output_softmax)
            else:
                if args.label_smoothing:
                    train_metric.update(hard_label, outputs)
                else:
                    train_metric.update(label, outputs)

            if args.log_interval and nbatch % args.log_interval == 0:
                if rank == 0:
                    logging.info('Epoch[%d] Batch[%d] Loss[%.3f]', epoch,
                                 nbatch, loss[0].mean().asnumpy()[0])

                    train_metric_name, train_metric_score = train_metric.get()
                    logging.info('Epoch[%d] Rank[%d] Batch[%d]\t%s=%f\tlr=%f',
                                 epoch, rank, nbatch, train_metric_name,
                                 train_metric_score, trainer.learning_rate)
                btic = time.time()

        # Report metrics
        elapsed = time.time() - tic
        _, acc = train_metric.get()
        if rank == 0:
            logging.info(
                'Epoch[%d] Rank[%d] Batch[%d]\tTime cost=%.2f\tTrain-metric=%f',
                epoch, rank, nbatch, elapsed, acc)
            epoch_speed = num_workers * batch_size * nbatch / elapsed
            logging.info('Epoch[%d]\tSpeed: %.2f samples/sec', epoch,
                         epoch_speed)

        # Evaluate performance
        if args.eval_frequency and (epoch + 1) % args.eval_frequency == 0:
            evaluate(epoch)

        # Save model
        if args.save_frequency and (epoch + 1) % args.save_frequency == 0:
            net.save_parameters('%s/imagenet-%s-%d.params' %
                                (save_dir, args.model, epoch))
            trainer.save_states('%s/imagenet-%s-%d.states' %
                                (save_dir, args.model, epoch))

    # Evaluate performance at the end of training
    evaluate(epoch)

    net.save_parameters('%s/imagenet-%s-%d.params' %
                        (save_dir, args.model, args.num_epochs - 1))
    trainer.save_states('%s/imagenet-%s-%d.states' %
                        (save_dir, args.model, args.num_epochs - 1))
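
`train_batch_fn` and `val_batch_fn` are helpers from the surrounding script that move one batch from the DataLoader onto the worker's device. A minimal sketch, assuming a Gluon DataLoader that yields `(data, label)` pairs, could be:

def train_batch_fn(batch, ctx):
    # Move one (data, label) pair onto this worker's device.
    data = batch[0].as_in_context(ctx)
    label = batch[1].as_in_context(ctx)
    return data, label


# Validation batches are handled the same way in this sketch.
val_batch_fn = train_batch_fn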
Example #3
def main_worker(gpu, ngpus_per_node, args, cfg):
    args.gpu = gpu
    args.rank = args.rank * ngpus_per_node + gpu
    logger.info(f'rank: {args.rank} / {args.world_size}')
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)
    torch.cuda.set_device(args.gpu)
    if args.gpu == 0:
        mkdir(args.outdir)
        fh = logging.FileHandler(os.path.join(args.outdir, 'log.txt'))
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)
        logger.info(args)

    # init the global
    global best_pred, acclist_train, acclist_val

    # seed
    torch.manual_seed(cfg.SEED)
    torch.cuda.manual_seed(cfg.SEED)

    # init dataloader
    transform_train, transform_val = get_transform(cfg.DATA.DATASET)(
        cfg.DATA.BASE_SIZE, cfg.DATA.CROP_SIZE, cfg.DATA.RAND_AUG)
    trainset = get_dataset(cfg.DATA.DATASET)(root=cfg.DATA.ROOT,
                                             transform=transform_train,
                                             train=True,
                                             download=True)
    valset = get_dataset(cfg.DATA.DATASET)(root=cfg.DATA.ROOT,
                                           transform=transform_val,
                                           train=False,
                                           download=True)

    train_sampler = torch.utils.data.distributed.DistributedSampler(trainset)
    train_loader = torch.utils.data.DataLoader(
        trainset,
        batch_size=cfg.TRAINING.BATCH_SIZE,
        shuffle=False,
        num_workers=cfg.TRAINING.WORKERS,
        pin_memory=True,
        sampler=train_sampler)

    val_sampler = torch.utils.data.distributed.DistributedSampler(
        valset, shuffle=False)
    val_loader = torch.utils.data.DataLoader(
        valset,
        batch_size=cfg.TRAINING.TEST_BATCH_SIZE,
        shuffle=False,
        num_workers=cfg.TRAINING.WORKERS,
        pin_memory=True,
        sampler=val_sampler)

    # init the model
    model_kwargs = {}
    if cfg.MODEL.FINAL_DROP > 0.0:
        model_kwargs['final_drop'] = cfg.MODEL.FINAL_DROP

    if cfg.TRAINING.LAST_GAMMA:
        model_kwargs['last_gamma'] = True

    model = get_model(cfg.MODEL.NAME)(**model_kwargs)

    if args.gpu == 0:
        logger.info(model)

    criterion, train_loader = get_criterion(cfg, train_loader, args.gpu)

    model.cuda(args.gpu)
    criterion.cuda(args.gpu)
    model = DistributedDataParallel(model, device_ids=[args.gpu])

    # criterion and optimizer
    if cfg.OPTIMIZER.DISABLE_BN_WD:
        parameters = model.named_parameters()
        param_dict = {}
        for k, v in parameters:
            param_dict[k] = v
        bn_params = [
            v for n, v in param_dict.items() if ('bn' in n or 'bias' in n)
        ]
        rest_params = [
            v for n, v in param_dict.items() if not ('bn' in n or 'bias' in n)
        ]
        if args.gpu == 0:
            logger.info(" Weight decay NOT applied to BN parameters ")
            logger.info(
                f'len(parameters): {len(list(model.parameters()))} = {len(bn_params)} + {len(rest_params)}'
            )
        optimizer = torch.optim.SGD([{
            'params': bn_params,
            'weight_decay': 0
        }, {
            'params': rest_params,
            'weight_decay': cfg.OPTIMIZER.WEIGHT_DECAY
        }],
                                    lr=cfg.OPTIMIZER.LR,
                                    momentum=cfg.OPTIMIZER.MOMENTUM,
                                    weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=cfg.OPTIMIZER.LR,
                                    momentum=cfg.OPTIMIZER.MOMENTUM,
                                    weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)
    # check point
    if args.resume is not None:
        if os.path.isfile(args.resume):
            if args.gpu == 0:
                logger.info(f"=> loading checkpoint '{args.resume}'")
            checkpoint = torch.load(args.resume)
            cfg.TRAINING.START_EPOCHS = checkpoint['epoch'] + 1 if cfg.TRAINING.START_EPOCHS == 0 \
                    else cfg.TRAINING.START_EPOCHS
            best_pred = checkpoint['best_pred']
            acclist_train = checkpoint['acclist_train']
            acclist_val = checkpoint['acclist_val']
            model.module.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.gpu == 0:
                logger.info(
                    f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']})"
                )
        else:
            raise RuntimeError(
                f"=> no resume checkpoint found at '{args.resume}'")

    scheduler = LR_Scheduler(cfg.OPTIMIZER.LR_SCHEDULER,
                             base_lr=cfg.OPTIMIZER.LR,
                             num_epochs=cfg.TRAINING.EPOCHS,
                             iters_per_epoch=len(train_loader),
                             warmup_epochs=cfg.OPTIMIZER.WARMUP_EPOCHS)

    def train(epoch):
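        # One training epoch over this rank's shard of the DistributedSampler.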
        train_sampler.set_epoch(epoch)
        model.train()
        losses = AverageMeter()
        top1 = AverageMeter()
        global best_pred, acclist_train
        for batch_idx, (data, target) in enumerate(train_loader):
            scheduler(optimizer, batch_idx, epoch, best_pred)
            if not cfg.DATA.MIXUP:
                data, target = data.cuda(args.gpu), target.cuda(args.gpu)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if not cfg.DATA.MIXUP:
                acc1 = accuracy(output, target, topk=(1, ))
                top1.update(acc1[0], data.size(0))

            losses.update(loss.item(), data.size(0))
            if batch_idx % 100 == 0 and args.gpu == 0:
                if cfg.DATA.MIXUP:
                    logger.info('Batch: %d| Loss: %.3f' %
                                (batch_idx, losses.avg))
                else:
                    logger.info('Batch: %d| Loss: %.3f | Top1: %.3f' %
                                (batch_idx, losses.avg, top1.avg))

        acclist_train += [top1.avg]

    def validate(epoch):
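        # Each rank evaluates its shard; per-rank sums/counts are combined below.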
        model.eval()
        top1 = AverageMeter()
        top5 = AverageMeter()
        global best_pred, acclist_train, acclist_val
        is_best = False
        for batch_idx, (data, target) in enumerate(val_loader):
            data, target = data.cuda(args.gpu), target.cuda(args.gpu)
            with torch.no_grad():
                output = model(data)
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                top1.update(acc1[0], data.size(0))
                top5.update(acc5[0], data.size(0))

        # sum all
        sum1, cnt1, sum5, cnt5 = torch_dist_sum(args.gpu, top1.sum, top1.count,
                                                top5.sum, top5.count)

        if args.gpu == 0:
            top1_acc = sum(sum1) / sum(cnt1)
            top5_acc = sum(sum5) / sum(cnt5)
            logger.info('Validation: Top1: %.3f | Top5: %.3f' %
                        (top1_acc, top5_acc))
            if args.eval_only:
                return

            # save checkpoint
            acclist_val += [top1_acc]
            if top1_acc > best_pred:
                best_pred = top1_acc
                is_best = True
            save_checkpoint(
                {
                    'epoch': epoch,
                    'state_dict': model.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_pred': best_pred,
                    'acclist_train': acclist_train,
                    'acclist_val': acclist_val,
                },
                directory=args.outdir,
                is_best=is_best,
                filename=f'checkpoint_{epoch}.pth')

    if args.export:
        if args.gpu == 0:
            torch.save(model.module.state_dict(), args.export + '.pth')
        return

    if args.eval_only:
        validate(cfg.TRAINING.START_EPOCHS)
        return

    for epoch in range(cfg.TRAINING.START_EPOCHS, cfg.TRAINING.EPOCHS):
        tic = time.time()
        train(epoch)
        if epoch % 10 == 0 or epoch == cfg.TRAINING.EPOCHS - 1:
            validate(epoch)
        elapsed = time.time() - tic
        if args.gpu == 0:
            logger.info(f'Epoch: {epoch}, Time cost: {elapsed}')

    if args.gpu == 0:
        save_checkpoint(
            {
                'epoch': cfg.TRAINING.EPOCHS - 1,
                'state_dict': model.module.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_pred': best_pred,
                'acclist_train': acclist_train,
                'acclist_val': acclist_val,
            },
            directory=args.outdir,
            is_best=False,
            filename='model_final.pth')
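
`torch_dist_sum` is a helper from the surrounding codebase; the validation code above only needs it to gather each statistic from every rank so that rank 0 can aggregate them. A hypothetical sketch using torch.distributed:

import torch
import torch.distributed as dist


def torch_dist_sum(gpu, *values):
    # Gather every per-rank statistic into one tensor per value, so the
    # caller can aggregate them, e.g. sum(sum1) / sum(cnt1) above.
    world_size = dist.get_world_size()
    gathered = []
    for value in values:
        tensor = torch.as_tensor(value, dtype=torch.float32).cuda(gpu)
        buf = [torch.zeros_like(tensor) for _ in range(world_size)]
        dist.all_gather(buf, tensor)
        gathered.append(torch.stack(buf))
    return gathered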


def install_coco_api():
    repo_url = "https://github.com/cocodataset/cocoapi"
    os.system("git clone " + repo_url)
    os.system("cd cocoapi/PythonAPI/ && python setup.py install")
    shutil.rmtree('cocoapi')
    try:
        import pycocotools
    except ImportError:
        print("Installing COCO API failed, please install it manually from %s" % repo_url)


if __name__ == '__main__':
    args = parse_args()
    mkdir(os.path.expanduser('~/.encoding/data'))
    if args.download_dir is not None:
        if os.path.isdir(_TARGET_DIR):
            os.remove(_TARGET_DIR)
        # make symlink
        os.symlink(args.download_dir, _TARGET_DIR)
    else:
        download_coco(_TARGET_DIR, overwrite=False)
    install_coco_api()
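
For reference, a hypothetical standalone use of the helpers above (the target path here is an illustrative example, not the script's actual `_TARGET_DIR`):

coco_root = os.path.expanduser('~/coco')  # example location only
mkdir(coco_root)
download_coco(coco_root, overwrite=False)
install_coco_api()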