def download_coco(path, overwrite=False):
    """Download MS COCO images and annotations into ``path`` and extract them."""
    _DOWNLOAD_URLS = [
        ('http://images.cocodataset.org/zips/train2017.zip',
         '10ad623668ab00c62c096f0ed636d6aff41faca5'),
        ('http://images.cocodataset.org/zips/val2017.zip',
         '4950dc9d00dbe1c933ee0170f5797584351d2a41'),
        ('http://images.cocodataset.org/annotations/annotations_trainval2017.zip',
         '8551ee4bb5860311e79dace7e79cb91e432e78b3'),
        ('https://hangzh.s3.amazonaws.com/encoding/data/coco/train_ids.pth',
         '12cd266f97c8d9ea86e15a11f11bcb5faba700b6'),
        ('https://hangzh.s3.amazonaws.com/encoding/data/coco/val_ids.pth',
         '4ce037ac33cbf3712fd93280a1c5e92dae3136bb'),
    ]
    mkdir(path)
    for url, checksum in _DOWNLOAD_URLS:
        filename = download(url, path=path, overwrite=overwrite, sha1_hash=checksum)
        # Extract zip archives in place; move the .pth index files into annotations/.
        if os.path.splitext(filename)[1] == '.zip':
            with zipfile.ZipFile(filename) as zf:
                zf.extractall(path=path)
        else:
            shutil.move(filename,
                        os.path.join(path, 'annotations/' + os.path.basename(filename)))
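# For context, a minimal sketch of the SHA1 verification that the `download`
# helper above is assumed to perform against each checksum (a hypothetical
# standalone version, not the repo's actual implementation):
import hashlib

def sha1_matches(filename, sha1_hash):
    """Return True if the file's SHA1 hex digest equals ``sha1_hash``."""
    sha1 = hashlib.sha1()
    with open(filename, 'rb') as f:
        # Hash in 1 MB chunks so multi-GB archives never sit fully in memory.
        for chunk in iter(lambda: f.read(1 << 20), b''):
            sha1.update(chunk)
    return sha1.hexdigest() == sha1_hash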
def train_gluon():
    if args.save_dir:
        save_dir = os.path.expanduser(args.save_dir)
        mkdir(save_dir)
        save_frequency = args.save_frequency
    else:
        save_dir = './'
        save_frequency = 0  # no save_dir given: disable checkpointing

    def evaluate(epoch):
        acc_top1 = mx.metric.Accuracy()
        acc_top5 = mx.metric.TopKAccuracy(5)
        for _, batch in enumerate(val_data):
            data, label = val_batch_fn(batch, context)
            output = net(data.astype(args.dtype, copy=False))
            acc_top1.update([label], [output])
            acc_top5.update([label], [output])

        top1_name, top1_acc = acc_top1.get()
        top5_name, top5_acc = acc_top5.get()
        if MPI is not None:
            comm = MPI.COMM_WORLD
            res1 = comm.gather(top1_acc, root=0)
            res2 = comm.gather(top5_acc, root=0)
        if rank == 0:
            if MPI is not None:
                # Average the per-rank accuracies gathered on the root.
                top1_acc = sum(res1) / len(res1)
                top5_acc = sum(res2) / len(res2)
            logging.info('Epoch[%d] Rank[%d]\tValidation-%s=%f\tValidation-%s=%f',
                         epoch, rank, top1_name, top1_acc, top5_name, top5_acc)

    # Hybridize and initialize model
    net.hybridize()
    if args.resume_params != '':
        net.load_parameters(args.resume_params, ctx=context)
    else:
        net.initialize(initializer, ctx=context)

    if args.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    # Horovod: fetch and broadcast parameters
    params = net.collect_params()
    if params is not None:
        hvd.broadcast_parameters(params, root_rank=0)

    # Create optimizer
    optimizer = 'nag'
    optimizer_params = {'wd': args.wd,
                        'momentum': args.momentum,
                        'lr_scheduler': lr_sched}
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    opt = mx.optimizer.create(optimizer, **optimizer_params)

    # Horovod: create DistributedTrainer, a subclass of gluon.Trainer
    trainer = hvd.DistributedTrainer(params, opt)
    if args.resume_states != '':
        trainer.load_states(args.resume_states)

    # Create loss function and train metric
    if args.label_smoothing or args.mixup:
        sparse_label_loss = False
    else:
        sparse_label_loss = True
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)
    if args.mixup:
        train_metric = mx.metric.RMSE()
    else:
        train_metric = mx.metric.Accuracy()

    def mixup_transform(label, classes, lam=1, eta=0.0):
        if isinstance(label, mx.nd.NDArray):
            label = [label]
        res = []
        for l in label:
            y1 = l.one_hot(classes, on_value=1 - eta + eta / classes,
                           off_value=eta / classes)
            y2 = l[::-1].one_hot(classes, on_value=1 - eta + eta / classes,
                                 off_value=eta / classes)
            res.append(lam * y1 + (1 - lam) * y2)
        return res

    def smooth(label, classes, eta=0.1):
        if isinstance(label, mx.nd.NDArray):
            label = [label]
        smoothed = []
        for l in label:
            res = l.one_hot(classes, on_value=1 - eta + eta / classes,
                            off_value=eta / classes)
            smoothed.append(res)
        return smoothed

    # Train model
    for epoch in range(args.resume_epoch, args.num_epochs):
        drop_scheduler(epoch)
        tic = time.time()
        train_metric.reset()
        btic = time.time()
        for nbatch, batch in enumerate(train_data, start=1):
            data, label = train_batch_fn(batch, context)
            data, label = [data], [label]
            if args.mixup:
                lam = np.random.beta(args.mixup_alpha, args.mixup_alpha)
                if epoch >= args.num_epochs - args.mixup_off_epoch:
                    lam = 1
                # Blend each batch with its reversed copy.
                data = [lam * X + (1 - lam) * X[::-1] for X in data]
                if args.label_smoothing:
                    eta = 0.1
                else:
                    eta = 0.0
                label = mixup_transform(label, num_classes, lam, eta)
            elif args.label_smoothing:
                hard_label = label
                label = smooth(label, num_classes)

            with autograd.record():
                outputs = [net(X.astype(args.dtype, copy=False)) for X in data]
                loss = [loss_fn(yhat, y.astype(args.dtype, copy=False))
                        for yhat, y in zip(outputs, label)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)

            if args.mixup:
                output_softmax = [mx.nd.SoftmaxActivation(out.astype('float32', copy=False))
                                  for out in outputs]
                train_metric.update(label, output_softmax)
            else:
                if args.label_smoothing:
                    train_metric.update(hard_label, outputs)
                else:
                    train_metric.update(label, outputs)

            if args.log_interval and nbatch % args.log_interval == 0:
                if rank == 0:
                    logging.info('Epoch[%d] Batch[%d] Loss[%.3f]',
                                 epoch, nbatch, loss[0].mean().asnumpy()[0])
                train_metric_name, train_metric_score = train_metric.get()
                logging.info('Epoch[%d] Rank[%d] Batch[%d]\t%s=%f\tlr=%f',
                             epoch, rank, nbatch, train_metric_name,
                             train_metric_score, trainer.learning_rate)
                btic = time.time()

        # Report metrics
        elapsed = time.time() - tic
        _, acc = train_metric.get()
        if rank == 0:
            logging.info('Epoch[%d] Rank[%d] Batch[%d]\tTime cost=%.2f\tTrain-metric=%f',
                         epoch, rank, nbatch, elapsed, acc)
            epoch_speed = num_workers * batch_size * nbatch / elapsed
            logging.info('Epoch[%d]\tSpeed: %.2f samples/sec', epoch, epoch_speed)

        # Evaluate performance
        if args.eval_frequency and (epoch + 1) % args.eval_frequency == 0:
            evaluate(epoch)

        # Save model
        if save_frequency and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/imagenet-%s-%d.params' % (save_dir, args.model, epoch))
            trainer.save_states('%s/imagenet-%s-%d.states' % (save_dir, args.model, epoch))

    # Evaluate performance at the end of training
    evaluate(epoch)
    net.save_parameters('%s/imagenet-%s-%d.params' %
                        (save_dir, args.model, args.num_epochs - 1))
    trainer.save_states('%s/imagenet-%s-%d.states' %
                        (save_dir, args.model, args.num_epochs - 1))
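# To make the target construction above concrete, here is a NumPy-only sketch
# of the math implemented by `smooth` and `mixup_transform` (an illustration
# under the same on/off-value convention, not the Gluon code path):
import numpy as np

def smooth_np(labels, classes, eta=0.1):
    # One-hot targets with probability mass eta spread uniformly over classes.
    on, off = 1 - eta + eta / classes, eta / classes
    out = np.full((len(labels), classes), off)
    out[np.arange(len(labels)), labels] = on
    return out

# mixup blends each target with the target of the reversed batch, exactly the
# `lam * y1 + (1 - lam) * y2` line above:
y = smooth_np(np.array([0, 2]), classes=3, eta=0.0)   # [[1,0,0],[0,0,1]]
lam = 0.7
mixed = lam * y + (1 - lam) * y[::-1]                 # [[0.7,0,0.3],[0.3,0,0.7]]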
def main_worker(gpu, ngpus_per_node, args, cfg):
    args.gpu = gpu
    args.rank = args.rank * ngpus_per_node + gpu
    logger.info(f'rank: {args.rank} / {args.world_size}')
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)
    torch.cuda.set_device(args.gpu)

    if args.gpu == 0:
        mkdir(args.outdir)
        fh = logging.FileHandler(os.path.join(args.outdir, 'log.txt'))
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)
        logger.info(args)

    # init the global
    global best_pred, acclist_train, acclist_val

    # seed
    torch.manual_seed(cfg.SEED)
    torch.cuda.manual_seed(cfg.SEED)

    # init dataloader
    transform_train, transform_val = get_transform(cfg.DATA.DATASET)(
        cfg.DATA.BASE_SIZE, cfg.DATA.CROP_SIZE, cfg.DATA.RAND_AUG)
    trainset = get_dataset(cfg.DATA.DATASET)(root=cfg.DATA.ROOT,
                                             transform=transform_train,
                                             train=True,
                                             download=True)
    valset = get_dataset(cfg.DATA.DATASET)(root=cfg.DATA.ROOT,
                                           transform=transform_val,
                                           train=False,
                                           download=True)
    train_sampler = torch.utils.data.distributed.DistributedSampler(trainset)
    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=cfg.TRAINING.BATCH_SIZE,
                                               shuffle=False,
                                               num_workers=cfg.TRAINING.WORKERS,
                                               pin_memory=True,
                                               sampler=train_sampler)
    val_sampler = torch.utils.data.distributed.DistributedSampler(valset, shuffle=False)
    val_loader = torch.utils.data.DataLoader(valset,
                                             batch_size=cfg.TRAINING.TEST_BATCH_SIZE,
                                             shuffle=False,
                                             num_workers=cfg.TRAINING.WORKERS,
                                             pin_memory=True,
                                             sampler=val_sampler)

    # init the model
    model_kwargs = {}
    if cfg.MODEL.FINAL_DROP > 0.0:
        model_kwargs['final_drop'] = cfg.MODEL.FINAL_DROP
    if cfg.TRAINING.LAST_GAMMA:
        model_kwargs['last_gamma'] = True
    model = get_model(cfg.MODEL.NAME)(**model_kwargs)
    if args.gpu == 0:
        logger.info(model)

    criterion, train_loader = get_criterion(cfg, train_loader, args.gpu)
    model.cuda(args.gpu)
    criterion.cuda(args.gpu)
    model = DistributedDataParallel(model, device_ids=[args.gpu])

    # criterion and optimizer
    if cfg.OPTIMIZER.DISABLE_BN_WD:
        parameters = model.named_parameters()
        param_dict = {}
        for k, v in parameters:
            param_dict[k] = v
        bn_params = [v for n, v in param_dict.items() if ('bn' in n or 'bias' in n)]
        rest_params = [v for n, v in param_dict.items() if not ('bn' in n or 'bias' in n)]
        if args.gpu == 0:
            logger.info(" Weight decay NOT applied to BN parameters ")
            logger.info(f'len(parameters): {len(list(model.parameters()))} = '
                        f'{len(bn_params)} + {len(rest_params)}')
        optimizer = torch.optim.SGD([{'params': bn_params,
                                      'weight_decay': 0},
                                     {'params': rest_params,
                                      'weight_decay': cfg.OPTIMIZER.WEIGHT_DECAY}],
                                    lr=cfg.OPTIMIZER.LR,
                                    momentum=cfg.OPTIMIZER.MOMENTUM,
                                    weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=cfg.OPTIMIZER.LR,
                                    momentum=cfg.OPTIMIZER.MOMENTUM,
                                    weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)

    # check point
    if args.resume is not None:
        if os.path.isfile(args.resume):
            if args.gpu == 0:
                logger.info(f"=> loading checkpoint '{args.resume}'")
            checkpoint = torch.load(args.resume)
            cfg.TRAINING.START_EPOCHS = checkpoint['epoch'] + 1 \
                if cfg.TRAINING.START_EPOCHS == 0 else cfg.TRAINING.START_EPOCHS
            best_pred = checkpoint['best_pred']
            acclist_train = checkpoint['acclist_train']
            acclist_val = checkpoint['acclist_val']
            model.module.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.gpu == 0:
                logger.info(f"=> loaded checkpoint '{args.resume}' "
                            f"(epoch {checkpoint['epoch']})")
        else:
            raise RuntimeError(f"=> no resume checkpoint found at '{args.resume}'")

    scheduler = LR_Scheduler(cfg.OPTIMIZER.LR_SCHEDULER,
                             base_lr=cfg.OPTIMIZER.LR,
                             num_epochs=cfg.TRAINING.EPOCHS,
                             iters_per_epoch=len(train_loader),
                             warmup_epochs=cfg.OPTIMIZER.WARMUP_EPOCHS)

    def train(epoch):
        train_sampler.set_epoch(epoch)
        model.train()
        losses = AverageMeter()
        top1 = AverageMeter()
        global best_pred, acclist_train
        for batch_idx, (data, target) in enumerate(train_loader):
            scheduler(optimizer, batch_idx, epoch, best_pred)
            if not cfg.DATA.MIXUP:
                data, target = data.cuda(args.gpu), target.cuda(args.gpu)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if not cfg.DATA.MIXUP:
                acc1 = accuracy(output, target, topk=(1,))
                top1.update(acc1[0], data.size(0))
            losses.update(loss.item(), data.size(0))
            if batch_idx % 100 == 0 and args.gpu == 0:
                if cfg.DATA.MIXUP:
                    logger.info('Batch: %d| Loss: %.3f' % (batch_idx, losses.avg))
                else:
                    logger.info('Batch: %d| Loss: %.3f | Top1: %.3f' %
                                (batch_idx, losses.avg, top1.avg))
        acclist_train += [top1.avg]

    def validate(epoch):
        model.eval()
        top1 = AverageMeter()
        top5 = AverageMeter()
        global best_pred, acclist_train, acclist_val
        is_best = False
        for batch_idx, (data, target) in enumerate(val_loader):
            data, target = data.cuda(args.gpu), target.cuda(args.gpu)
            with torch.no_grad():
                output = model(data)
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                top1.update(acc1[0], data.size(0))
                top5.update(acc5[0], data.size(0))

        # sum all
        sum1, cnt1, sum5, cnt5 = torch_dist_sum(args.gpu, top1.sum, top1.count,
                                                top5.sum, top5.count)
        if args.gpu == 0:
            top1_acc = sum(sum1) / sum(cnt1)
            top5_acc = sum(sum5) / sum(cnt5)
            logger.info('Validation: Top1: %.3f | Top5: %.3f' % (top1_acc, top5_acc))
            if args.eval_only:
                return
            # save checkpoint
            acclist_val += [top1_acc]
            if top1_acc > best_pred:
                best_pred = top1_acc
                is_best = True
            save_checkpoint({'epoch': epoch,
                             'state_dict': model.module.state_dict(),
                             'optimizer': optimizer.state_dict(),
                             'best_pred': best_pred,
                             'acclist_train': acclist_train,
                             'acclist_val': acclist_val},
                            directory=args.outdir,
                            is_best=False,
                            filename=f'checkpoint_{epoch}.pth')

    if args.export:
        if args.gpu == 0:
            torch.save(model.module.state_dict(), args.export + '.pth')
        return

    if args.eval_only:
        validate(cfg.TRAINING.START_EPOCHS)
        return

    for epoch in range(cfg.TRAINING.START_EPOCHS, cfg.TRAINING.EPOCHS):
        tic = time.time()
        train(epoch)
        if epoch % 10 == 0 or epoch == cfg.TRAINING.EPOCHS - 1:
            validate(epoch)
        elapsed = time.time() - tic
        if args.gpu == 0:
            logger.info(f'Epoch: {epoch}, Time cost: {elapsed}')

    if args.gpu == 0:
        save_checkpoint({'epoch': cfg.TRAINING.EPOCHS - 1,
                         'state_dict': model.module.state_dict(),
                         'optimizer': optimizer.state_dict(),
                         'best_pred': best_pred,
                         'acclist_train': acclist_train,
                         'acclist_val': acclist_val},
                        directory=args.outdir,
                        is_best=False,
                        filename='model_final.pth')
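# `torch_dist_sum` above is assumed to sum each per-rank meter value across all
# processes so rank 0 can compute global accuracy. A minimal sketch of that
# behavior (an assumption about the helper's semantics, not its actual source):
import torch
import torch.distributed as dist

def torch_dist_sum_sketch(gpu, *values):
    # Wrap each scalar in a one-element CUDA tensor and all-reduce with SUM;
    # after the collective, every rank holds the global totals.
    tensors = [torch.tensor([float(v)], device=f'cuda:{gpu}') for v in values]
    for t in tensors:
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
    return tensors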
def install_coco_api():
    repo_url = "https://github.com/cocodataset/cocoapi"
    os.system("git clone " + repo_url)
    os.system("cd cocoapi/PythonAPI/ && python setup.py install")
    shutil.rmtree('cocoapi')
    try:
        import pycocotools
    except Exception:
        print("Installing COCO API failed, please install it manually from %s" % repo_url)


if __name__ == '__main__':
    args = parse_args()
    mkdir(os.path.expanduser('~/.encoding/data'))
    if args.download_dir is not None:
        # Replace any existing link so _TARGET_DIR points at the user's copy.
        if os.path.isdir(_TARGET_DIR):
            os.remove(_TARGET_DIR)
        # make symlink
        os.symlink(args.download_dir, _TARGET_DIR)
    else:
        download_coco(_TARGET_DIR, overwrite=False)
    install_coco_api()
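# The `parse_args` used in `__main__` is assumed to expose at least a download
# directory option. A hypothetical minimal version (the flag name and help text
# are illustrative, not the script's actual definition):
import argparse

def parse_args_sketch():
    parser = argparse.ArgumentParser(description='Initialize MS COCO dataset.')
    parser.add_argument('--download-dir', type=str, default=None,
                        help='existing COCO directory to symlink into '
                             '~/.encoding/data instead of downloading')
    return parser.parse_args()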