def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
    """Train a model with distributed data parallel.

    Args:
        model: model to train; moved to GPU and wrapped in DDP here.
        dataset: a Dataset or a list/tuple of Datasets (one per workflow).
        cfg: training config (data, optimizer, hooks, schedules).
        logger: optional logger forwarded to the runner.
        timestamp: run timestamp used to align .log/.log.json filenames.
        meta: optional metadata dict stored on the runner.
    """
    # Normalize to a list so single-dataset configs share one code path.
    datasets_ = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    loaders = []
    for d in datasets_:
        loaders.append(
            build_dataloader(
                d,
                cfg.data.imgs_per_gpu,
                cfg.data.workers_per_gpu,
                dist=True,
                shuffle=True,
                replace=getattr(cfg.data, 'sampling_replace', False),
                seed=cfg.seed,
                drop_last=getattr(cfg.data, 'drop_last', False)))

    # Wrap the model for distributed training on the current device.
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)

    # Build the optimizer and the epoch-based runner.
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model,
        batch_processor,
        optimizer,
        cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # Standard training hooks plus the sampler-seed hook so shuffling
    # differs per epoch in distributed mode.
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())

    # Custom hooks; DeepClusterHook additionally receives the data loaders.
    for hook in cfg.get('custom_hooks', ()):
        hook_args = (dict(dist_mode=True, data_loaders=loaders)
                     if hook.type == 'DeepClusterHook'
                     else dict(dist_mode=True))
        runner.register_hook(build_hook(hook, hook_args))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(loaders, cfg.workflow, cfg.total_epochs)
def __init__(self, dataset, dist_mode=True, initial=True, interval=1,
             **eval_kwargs):
    """Build the evaluation dataset/loader (and optionally a BN-refresh loader).

    Args:
        dataset: a Dataset instance or a dataset config dict.
        dist_mode: whether loaders are built for distributed evaluation.
        initial: presumably whether to evaluate before training starts.
        interval: evaluation interval.
        **eval_kwargs: must contain 'imgs_per_gpu' and 'workers_per_gpu';
            may contain 'update_bn' and 'train_dataset'; the full dict is
            kept on self.eval_kwargs for later use.

    Raises:
        TypeError: if `dataset` is neither a Dataset nor a dict.
    """
    from openselfsup import datasets

    if isinstance(dataset, Dataset):
        self.dataset = dataset
    elif isinstance(dataset, dict):
        self.dataset = datasets.build_dataset(dataset)
    else:
        raise TypeError(
            'dataset must be a Dataset object or a dict, not {}'.format(
                type(dataset)))

    imgs_per_gpu = eval_kwargs['imgs_per_gpu']
    workers_per_gpu = eval_kwargs['workers_per_gpu']
    self.data_loader = datasets.build_dataloader(
        self.dataset,
        imgs_per_gpu,
        workers_per_gpu,
        dist=dist_mode,
        shuffle=False)

    self.dist_mode = dist_mode
    self.initial = initial
    self.interval = interval
    self.eval_kwargs = eval_kwargs

    # my
    # Optionally build a non-shuffled loader over the training set,
    # presumably so BN running statistics can be refreshed before
    # evaluation — confirm against the hook that consumes it.
    self.update_bn = eval_kwargs.get('update_bn', False)
    if self.update_bn:
        self.train_dataset = datasets.build_dataset(
            eval_kwargs['train_dataset'])
        self.train_data_loader = datasets.build_dataloader(
            self.train_dataset,
            imgs_per_gpu,
            workers_per_gpu,
            dist=dist_mode,
            shuffle=False)
def __init__(self, dataset, imgs_per_gpu, workers_per_gpu, dist_mode=False):
    """Build a non-shuffled dataloader over `dataset` and a 1x1 average pool.

    Args:
        dataset: a Dataset instance or a dataset config dict.
        imgs_per_gpu: batch size per GPU for the loader.
        workers_per_gpu: dataloader worker count per GPU.
        dist_mode: whether the loader is built for distributed runs.

    Raises:
        TypeError: if `dataset` is neither a Dataset nor a dict.
    """
    from openselfsup import datasets

    if isinstance(dataset, Dataset):
        resolved = dataset
    elif isinstance(dataset, dict):
        resolved = datasets.build_dataset(dataset)
    else:
        raise TypeError(
            'dataset must be a Dataset object or a dict, not {}'.format(
                type(dataset)))

    self.dataset = resolved
    self.data_loader = datasets.build_dataloader(
        resolved,
        imgs_per_gpu,
        workers_per_gpu,
        dist=dist_mode,
        shuffle=False)
    self.dist_mode = dist_mode
    # Pools each feature map down to 1x1 before any flattening downstream.
    self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
def main():
    """Extract backbone features and save them as per-split .npy files.

    The model comes from either --checkpoint or cfg.model.pretrained (the
    two are mutually exclusive).  Features are pooled by ExtractProcess
    and, on rank 0 only, sliced at dataset_cfg.split_at into
    dataset_cfg.split_name chunks before saving.
    """
    args = parse_args()
    cfg = mmcv.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    layer_ind = [int(idx) for idx in args.layer_ind.split(',')]
    cfg.model.backbone.out_indices = layer_ind

    # checkpoint and pretrained are exclusive
    assert cfg.model.pretrained == "random" or args.checkpoint is None, \
        "Checkpoint and pretrained are exclusive."

    # Disable memcached everywhere when the 'mc' package is unavailable.
    if importlib.util.find_spec('mc') is None:
        for field in ['train', 'val', 'test']:
            if hasattr(cfg.data, field):
                getattr(cfg.data, field).data_source.memcached = False

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))

    # logger
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, 'extract_{}.log'.format(timestamp))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # build the dataloader
    dataset_cfg = mmcv.Config.fromfile(args.dataset_config)
    dataset = build_dataset(dataset_cfg.data.extract)
    data_loader = build_dataloader(
        dataset,
        imgs_per_gpu=dataset_cfg.data.imgs_per_gpu,
        workers_per_gpu=dataset_cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # build the model and load checkpoint
    model = build_model(cfg.model)
    if args.checkpoint is not None:
        logger.info("Use checkpoint: {} to extract features".format(
            args.checkpoint))
        load_checkpoint(model, args.checkpoint, map_location='cpu')
    elif args.pretrained != "random":
        logger.info('Use pretrained model: {} to extract features'.format(
            args.pretrained))
    else:
        # BUGFIX: message typo "is give" -> "is given".
        logger.info('No checkpoint or pretrained is given, use random init.')

    if not distributed:
        model = MMDataParallel(model, device_ids=[0])
    else:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)

    # build extraction processor
    extractor = ExtractProcess(
        pool_type='specified', backbone='resnet50', layer_indices=layer_ind)

    # run
    outputs = extractor.extract(model, data_loader, distributed=distributed)
    rank, _ = get_dist_info()
    mmcv.mkdir_or_exist("{}/features/".format(args.work_dir))
    if rank == 0:
        # Hoisted out of the per-key loop: the split points are the same
        # for every feature key.
        split_num = len(dataset_cfg.split_name)
        split_at = dataset_cfg.split_at
        for key, val in outputs.items():
            for ss in range(split_num):
                output_file = "{}/features/{}_{}.npy".format(
                    args.work_dir, dataset_cfg.split_name[ss], key)
                if ss == 0:
                    np.save(output_file, val[:split_at[0]])
                elif ss == split_num - 1:
                    np.save(output_file, val[split_at[-1]:])
                else:
                    np.save(output_file, val[split_at[ss - 1]:split_at[ss]])
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
    """Distributed training entry with optional fp16 and a search runner.

    Args:
        model: model to train; moved to GPU (by apex or explicitly) and
            wrapped in DDP here.
        dataset: a Dataset or a list/tuple of Datasets.
        cfg: training config; honors use_fp16, custom_hooks (with optional
            per-hook 'priority'), and a 'search' workflow stage.
        logger, timestamp, meta: forwarded to / stored on the runner.
    """
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            dist=True,
            shuffle=True,
            replace=getattr(cfg.data, 'sampling_replace', False),
            seed=cfg.seed,
            drop_last=getattr(cfg.data, 'drop_last', False),
            # my
            repeat=getattr(cfg.data, 'repeat', 1)) for ds in dataset
    ]
    optimizer = build_optimizer(model, cfg.optimizer)
    # FIX: idiomatic truthiness instead of '== True'.
    if 'use_fp16' in cfg and cfg.use_fp16:
        model, optimizer = apex.amp.initialize(
            model.cuda(), optimizer, opt_level="O1")
        print_log('**** Initializing mixed precision done. ****')
    # put model on gpus
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        # my
        find_unused_parameters=True)

    # build runner
    # my
    # NOTE(review): this rebinds the module-level Runner for the remainder
    # of the process, not just this call — consider a local alias instead.
    global Runner
    # FIX: compare strings with '==', not 'is'. Identity of equal string
    # literals is implementation-dependent and raises SyntaxWarning on
    # modern CPython.
    if cfg.workflow[0][0] == 'search':
        Runner = SearchRunner
    runner = Runner(
        model,
        batch_processor,
        optimizer,
        cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register custom hooks; 'priority' is popped so it is not forwarded
    # to the hook constructor.
    for hook in cfg.get('custom_hooks', ()):
        priority = hook.pop('priority', None)
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=True, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=True)
        if priority is None:
            runner.register_hook(build_hook(hook, common_params))
        else:
            runner.register_hook(build_hook(hook, common_params), priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def main():
    """Run validation while dumping layer activations via forward hooks.

    Registers a forward hook on every module whose exact type matches
    args.layer_type, runs single-GPU testing, then saves the collected
    activations to <work_dir>/model_acts.npz.  Distributed runs are
    rejected because forward hooks are not propagated under DDP.
    """
    args = parse_args()
    cfg = mmcv.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    cfg.model.pretrained = None  # ensure to use checkpoint rather than pretraining

    # check memcached package exists
    if importlib.util.find_spec('mc') is None:
        traverse_replace(cfg, 'memcached', False)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        if args.launcher == 'slurm':
            cfg.dist_params['port'] = args.port
        init_dist(args.launcher, **cfg.dist_params)

    # logger
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, 'test_{}.log'.format(timestamp))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # build the dataloader
    dataset = build_dataset(cfg.data.val)
    data_loader = build_dataloader(
        dataset,
        imgs_per_gpu=cfg.data.imgs_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # build the model and load checkpoint
    model = build_model(cfg.model)
    activations = defaultdict(list)
    # idea from gist.github.com/Tushar-N/680633ec18f5cb4b47933da7d10902af
    if args.layer_type == nn.Linear:  # can save all activations
        def save_activation(name, mod, inp, out):
            activations[name].append(out.cpu())
    else:
        # Other layer types keep only the most recent activation to bound
        # memory use.
        def save_activation(name, mod, inp, out):
            activations[name] = [out.cpu()]

    load_checkpoint(model, args.checkpoint, map_location='cpu')
    for name, m in model.named_modules():
        # FIX: 'is' for exact class identity instead of '=='; subclasses
        # are deliberately not hooked.
        if type(m) is args.layer_type:
            m.register_forward_hook(partial(save_activation, name))

    if not distributed:
        model = MMDataParallel(model, device_ids=[0])
        single_gpu_test(model, data_loader)
    else:
        # FIX: removed the unreachable DDP wrap + multi_gpu_test call that
        # followed this raise in the original.
        raise NotImplementedError(
            "Distributed Data Parallel does not register hooks.")

    # dict{key: np.ndarray}
    activations = {
        name: torch.cat(acts, 0)
        for name, acts in activations.items()
    }
    act_file = osp.join(cfg.work_dir, "model_acts")
    np.savez(act_file, **activations)
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
    """Distributed multi-stage training with optional apex fp16.

    Builds one dataloader per dataset, optionally initializes apex O1
    mixed precision, wraps the model in DDP, drives training through a
    MultiStageRunner with multipath_batch_processor, and makes the
    optimizer reachable from the wrapped module before running.
    """
    # Always work with a list of datasets.
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]
    loader_kwargs = dict(
        dist=True,
        shuffle=True,
        replace=getattr(cfg.data, 'sampling_replace', False),
        seed=cfg.seed,
        drop_last=getattr(cfg.data, 'drop_last', False),
        prefetch=cfg.prefetch,
        img_norm_cfg=cfg.img_norm_cfg)
    data_loaders = [
        build_dataloader(ds, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, **loader_kwargs)
        for ds in dataset
    ]

    optimizer = build_optimizer(model, cfg.optimizer)
    if 'use_fp16' in cfg and cfg.use_fp16:
        model, optimizer = apex.amp.initialize(
            model.cuda(), optimizer, opt_level="O1")
        model.use_fp16 = True
        print_log('**** Initializing mixed precision done. ****')

    # Skip the extra .cuda() when apex has already moved the model.
    if not next(model.parameters()).is_cuda:
        model = model.cuda()
    model = MMDistributedDataParallel(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=True)

    runner = MultiStageRunner(
        model=model,
        batch_processor=multipath_batch_processor,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta,
        num_stages=model.module.num_block,
        max_epochs=cfg.total_epochs)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    runner.register_training_hooks(cfg.lr_config,
                                   DistOptimizerHook(**cfg.optimizer_config),
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    for hook in cfg.get('custom_hooks', ()):
        hook_params = dict(dist_mode=True)
        if hook.type == 'DeepClusterHook':
            hook_params['data_loaders'] = data_loaders
        runner.register_hook(build_hook(hook, hook_params))

    if cfg.resume_from:
        resume_opt = 'resume_optimizer' in cfg and cfg.resume_optimizer
        runner.resume(cfg.resume_from, resume_optimizer=resume_opt)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    # Attach the optimizer to the underlying module (or the model itself
    # when it is not wrapped).
    target = model.module if is_module_wrapper(model) else model
    target.optimizer = optimizer

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    """Single-machine (non-distributed) training.

    Args:
        model: model to train; wrapped in MMDataParallel over cfg.gpus.
        dataset: a Dataset or a list/tuple of Datasets.
        cfg: training config; cfg.by_iter selects iteration-based training.
        validate: accepted for API symmetry; not used in this body.
        logger, timestamp, meta: forwarded to / stored on the runner.

    Raises:
        NotImplementedError: if cfg.use_fp16 is set (apex needs dist).
    """
    # use batch size instead of per-gpu batch size
    if getattr(cfg.data, 'batch_size', False):
        cfg.data.imgs_per_gpu = int(cfg.data.batch_size)
        print_log(
            f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}")

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            shuffle=True,
            replace=getattr(cfg.data, 'sampling_replace', False),
            seed=cfg.seed,
            drop_last=getattr(cfg.data, 'drop_last', False),
            prefetch=cfg.prefetch,
            img_norm_cfg=cfg.img_norm_cfg) for ds in dataset
    ]
    # FIX: idiomatic truthiness instead of '== True'.
    if 'use_fp16' in cfg and cfg.use_fp16:
        raise NotImplementedError('apex do not support non_dist_train!')

    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    optimizer_config = NonDistOptimizerHook(**cfg.optimizer_config)
    # Epoch-based runner by default; iteration-based when cfg.by_iter is
    # set (checked once here instead of three times).
    by_iter = cfg.get('by_iter', False)
    runner_cls = IterBasedRunner if by_iter else Runner
    runner = runner_cls(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register custom hooks
    for hook in cfg.get('custom_hooks', ()):
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=False, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=False)
        runner.register_hook(build_hook(hook, common_params))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if by_iter:
        runner.run(data_loaders, cfg.workflow, cfg.total_iters)
    else:
        runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
    """Distributed training supporting epoch- or iteration-based runs.

    Builds one dataloader per dataset (optionally deriving the per-GPU
    batch size from cfg.data.batch_size), optionally enables apex O1
    mixed precision, wraps the model in DDP, then trains with Runner
    (epochs) or IterBasedRunner (iterations) depending on cfg.by_iter.
    """
    # prepare data loaders
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]

    # use batch size instead of per-gpu batch size
    if getattr(cfg.data, 'batch_size', False):
        num_gpus = torch.cuda.device_count()
        cfg.data.imgs_per_gpu = int(cfg.data.batch_size // num_gpus)
        print_log(
            f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}")

    data_loaders = []
    for ds in dataset:
        data_loaders.append(
            build_dataloader(
                ds,
                cfg.data.imgs_per_gpu,
                cfg.data.workers_per_gpu,
                dist=True,
                shuffle=True,
                replace=getattr(cfg.data, 'sampling_replace', False),
                seed=cfg.seed,
                drop_last=getattr(cfg.data, 'drop_last', False),
                prefetch=cfg.prefetch,
                img_norm_cfg=cfg.img_norm_cfg))

    optimizer = build_optimizer(model, cfg.optimizer)
    if 'use_fp16' in cfg and cfg.use_fp16:
        model, optimizer = apex.amp.initialize(
            model.cuda(), optimizer, opt_level="O1")
        print_log('**** Initializing mixed precision done. ****')

    # put model on gpus (skip the extra .cuda() if apex already moved it)
    if not next(model.parameters()).is_cuda:
        model = model.cuda()
    model = MMDistributedDataParallel(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)

    # build runner
    by_iter = cfg.get('by_iter', False)
    runner_cls = IterBasedRunner if by_iter else Runner
    runner = runner_cls(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # register hooks; the sampler-seed hook only applies to epoch-based runs
    runner.register_training_hooks(cfg.lr_config,
                                   DistOptimizerHook(**cfg.optimizer_config),
                                   cfg.checkpoint_config, cfg.log_config)
    if not by_iter:
        runner.register_hook(DistSamplerSeedHook())
    for hook in cfg.get('custom_hooks', ()):
        params = dict(dist_mode=True)
        if hook.type == 'DeepClusterHook':
            params['data_loaders'] = data_loaders
        runner.register_hook(build_hook(hook, params))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if by_iter:
        runner.run(data_loaders, cfg.workflow, cfg.total_iters)
    else:
        runner.run(data_loaders, cfg.workflow, cfg.total_epochs)