##### dataset
if cfg.dataset == 'scannetv2':
    if data_name == 'scannet':
        from data.scannetv2_inst import ScannetDatast
        dataset = ScannetDatast(cfg)
    else:
        print("Error: no data loader - " + data_name)
        exit(0)

dataset.trainLoader()
logger.info('Training samples: {}'.format(len(dataset.train_file_names)))
dataset.valLoader()
logger.info('Validation samples: {}'.format(len(dataset.val_file_names)))

##### resume
start_epoch, f = utils.checkpoint_restore(
    model, cfg.exp_path, cfg.config.split('/')[-1][:-5]
)  # resume from the latest epoch, or specify the epoch to restore
logger.info('Restore from {}'.format(f) if len(f) > 0 else 'Start from epoch {}'.format(start_epoch))

##### train and val
for epoch in range(start_epoch, cfg.epochs + 1):
    train_epoch(dataset.train_data_loader, model, model_fn, optimizer, epoch)

    if utils.is_multiple(epoch, cfg.save_freq) or utils.is_power2(epoch):
        eval_epoch(dataset.val_data_loader, model, model_fn, epoch)

##### delete SA
# if cfg.cache:
#     if cfg.dataset == 'scannetv2':
#         utils.delete_shared_memory(train_file_names, wlabel=True)
#         utils.delete_shared_memory(val_file_names, wlabel=True)
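##### helper sketch (illustration, not from this file)
# The eval gating above relies on utils.is_multiple and utils.is_power2.
# A minimal sketch of what these helpers are assumed to do, so the epoch
# schedule is readable in isolation; the actual utils implementations may differ:
def is_multiple(num, multiple):
    # True when num is a non-zero multiple of `multiple` (e.g. every save_freq epochs)
    return num != 0 and num % multiple == 0

def is_power2(num):
    # True when num is a power of two (1, 2, 4, 8, ...), giving denser evals early in training
    return num != 0 and (num & (num - 1)) == 0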
else: print("Error: no model version " + model_name) exit(0) model = Network(cfg) use_cuda = torch.cuda.is_available() logger.info('cuda available: {}'.format(use_cuda)) assert use_cuda model = model.cuda() # logger.info(model) logger.info('#classifier parameters (model): {}'.format( sum([x.nelement() for x in model.parameters()]))) ##### model_fn (criterion) model_fn = model_fn_decorator(test=True) ##### load model utils.checkpoint_restore( model, cfg.exp_path, cfg.config.split('/')[-1][:-5], use_cuda, cfg.test_epoch, dist=False, f=cfg.pretrain ) # resume from the latest epoch, or specify the epoch to restore ##### evaluate test(model, model_fn, data_name, cfg.test_epoch)
def main(gpu, cfgs):
    setproctitle.setproctitle('python')

    ##### config
    global cfg
    cfg = cfgs
    cfg.local_rank = gpu

    ##### logger & summary writer
    if cfg.local_rank == 0:
        # logger
        global logger
        from util.log import get_logger
        logger = get_logger(cfg)

        # summary writer
        global writer
        writer = SummaryWriter(cfg.exp_path)

    ##### distributed training setting
    if cfg.dist:
        cfg.rank = cfg.node_rank * cfg.ngpu_per_node + gpu
        print('[PID {}] rank: {} world_size: {}'.format(
            os.getpid(), cfg.rank, cfg.world_size))
        dist.init_process_group(backend='nccl',
                                init_method='tcp://127.0.0.1:%d' % cfg.tcp_port,
                                world_size=cfg.world_size,
                                rank=cfg.rank)
        torch.cuda.set_device(gpu)

        assert cfg.batch_size % cfg.world_size == 0, 'Batch size should be matched with GPUS: (%d, %d)' % (
            cfg.batch_size, cfg.world_size)
        cfg.batch_size = cfg.batch_size // cfg.world_size

    if cfg.local_rank == 0:
        logger.info(cfg)

    ##### get model version and data version
    exp_name = cfg.config.split('/')[-1][:-5]
    model_name = exp_name.split('_')[0]
    data_name = exp_name.split('_')[-1]

    ##### model
    if cfg.local_rank == 0:
        logger.info('=> creating model ...')

    Network = model_class(cfg.model_name)
    model = Network(cfg)

    use_cuda = torch.cuda.is_available()
    if cfg.local_rank == 0:
        logger.info('cuda available: {}'.format(use_cuda))
    assert use_cuda
    model = model.to(gpu)

    if cfg.dist:
        if cfg.sync_bn:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(gpu)
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[gpu], find_unused_parameters=True)

    if cfg.local_rank == 0:
        # logger.info(model)
        logger.info('#classifier parameters: {}'.format(
            sum([x.nelement() for x in model.parameters()])))

    ##### optimizer
    if cfg.optim == 'Adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                               lr=cfg.lr)
    elif cfg.optim == 'SGD':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                              lr=cfg.lr, momentum=cfg.momentum,
                              weight_decay=cfg.weight_decay)

    ##### model_fn (criterion)
    model_fn = model_fn_decorator(cfg)

    ##### dataset
    if cfg.dataset == 'scannetv2':
        if data_name == 'scannet':
            import data.scannetv2_inst
            dataset = data.scannetv2_inst.ScannetDatast(cfg)
        else:
            print("Error: no data loader - " + data_name)
            exit(0)

    dataset.trainLoader()
    dataset.valLoader()
    if cfg.local_rank == 0:
        logger.info('Training samples: {}'.format(len(dataset.train_file_names)))
        logger.info('Validation samples: {}'.format(len(dataset.val_file_names)))

    # f = utils.checkpoint_save(model, cfg.exp_path, cfg.config.split('/')[-1][:-5] + '_%d' % os.getpid(), 0, cfg.save_freq)

    ##### resume
    start_epoch, f = utils.checkpoint_restore(
        model, cfg.exp_path, cfg.config.split('/')[-1][:-5],
        dist=cfg.dist, f=cfg.pretrain, gpu=gpu
    )  # resume from the latest epoch, or specify the epoch to restore
    if cfg.local_rank == 0:
        logger.info('Restore from {}'.format(f) if len(f) > 0 else 'Start from epoch {}'.format(start_epoch))

    ##### train and val
    for epoch in range(start_epoch, cfg.epochs + 1):
        if cfg.dist:
            dataset.train_sampler.set_epoch(epoch)
        train_epoch(dataset.train_data_loader, model, model_fn, optimizer, epoch)

        if cfg.validation:
            if utils.is_multiple(epoch, cfg.save_freq) or utils.is_power2(epoch):
                if cfg.dist:
                    dataset.val_sampler.set_epoch(epoch)
                eval_epoch(dataset.val_data_loader, model, model_fn, epoch)
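##### launch sketch (assumption, not from this file)
# main(gpu, cfgs) is written as a per-GPU worker, so it is presumably launched
# with torch.multiprocessing.spawn when cfg.dist is set and called directly
# otherwise. A minimal sketch under that assumption; the config entry point and
# attribute names (num_nodes, ngpu_per_node) may differ in the actual repo:
if __name__ == '__main__':
    import torch.multiprocessing as mp
    from util.config import cfg  # assumed config entry point
    cfg.world_size = cfg.num_nodes * cfg.ngpu_per_node
    if cfg.dist:
        # one process per local GPU; spawn passes the local rank as the `gpu` argument
        mp.spawn(main, nprocs=cfg.ngpu_per_node, args=(cfg,))
    else:
        main(0, cfg)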