def run(config_path, ir_checkpoint_dir=None, pre_checkpoint_dir=None,
        cluster_checkpoint_dir=None):
    config = process_config(config_path)
    AgentClass = globals()[config.agent]
    agent = AgentClass(config)
    if ir_checkpoint_dir is not None:
        # Warm-start from an IR checkpoint: load weights and memory bank,
        # but keep a fresh optimizer and epoch counter.
        agent.load_checkpoint('checkpoint.pth.tar', ir_checkpoint_dir,
                              load_memory_bank=True, load_model=True,
                              load_optim=False, load_epoch=False,
                              cluster_label_dir=cluster_checkpoint_dir)
    if pre_checkpoint_dir is not None:
        # Resume a previous run in full: weights, memory bank,
        # optimizer state, and epoch counter.
        agent.load_checkpoint('checkpoint.pth.tar', pre_checkpoint_dir,
                              load_memory_bank=True, load_model=True,
                              load_optim=True, load_epoch=True,
                              cluster_label_dir=cluster_checkpoint_dir)
    try:
        agent.run()
        agent.finalise()
    except KeyboardInterrupt:
        pass

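# A minimal sketch of the load_checkpoint contract used above, assuming a
# torch.save'd dict with 'model', 'optimizer', 'epoch', and 'memory_bank'
# keys. The function name and attribute names are illustrative assumptions;
# the agents' actual implementation may differ.
import os
import torch


def load_checkpoint_sketch(agent, filename, checkpoint_dir,
                           load_memory_bank=False, load_model=False,
                           load_optim=False, load_epoch=False):
    checkpoint = torch.load(os.path.join(checkpoint_dir, filename),
                            map_location='cpu')
    if load_model:
        agent.model.load_state_dict(checkpoint['model'])
    if load_optim:
        agent.optim.load_state_dict(checkpoint['optimizer'])
    if load_epoch:
        agent.current_epoch = checkpoint['epoch']
    if load_memory_bank:
        agent.memory_bank = checkpoint['memory_bank']
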
def run(config_path):
    config = process_config(config_path)
    AgentClass = globals()[config.agent]
    agent = AgentClass(config)
    try:
        agent.run()
        agent.finalise()
    except KeyboardInterrupt:
        pass

def run(args, gpu_device=None):
    '''Run the Lightning system.

    Args:
        args:
            args.config: str, filepath to the config file
        gpu_device: str or None, specifies the GPU device as follows:
            None: CPU (specified as null in config)
            'cpu': CPU
            '-1': all available GPUs
            '0': GPU 0
            '4': GPU 4
            '0,3': GPUs 0 and 3
        See: https://pytorch-lightning.readthedocs.io/en/latest/multi_gpu.html
    '''
    if gpu_device == 'cpu' or not gpu_device:
        gpu_device = None
    config = process_config(args.config)
    # Only override if specified.
    if gpu_device:
        config.gpu_device = gpu_device
    if args.quick:
        config.quick = args.quick
    if args.num_workers is not None:
        config.data_loader_workers = args.num_workers
    seed_everything(config.seed)
    SystemClass = SYSTEM[config.system]
    system = SystemClass(config)
    ckpt_callback = pl.callbacks.ModelCheckpoint(
        os.path.join(config.exp_dir, 'checkpoints'),
        save_top_k=-1,
        period=1,
    )
    wandb.init(project='sensor', entity='viewmaker', name=config.exp_name,
               config=config, sync_tensorboard=True)
    trainer = pl.Trainer(
        default_root_dir=config.exp_dir,
        gpus=gpu_device,
        distributed_backend=config.distributed_backend or 'dp',
        max_epochs=config.num_epochs,
        min_epochs=config.num_epochs,
        checkpoint_callback=ckpt_callback,
        resume_from_checkpoint=args.ckpt or config.continue_from_checkpoint,
        profiler=args.profiler,
        precision=config.optim_params.precision or 32,
        callbacks=None,
        val_check_interval=config.val_check_interval or 1.0,
        limit_val_batches=config.limit_val_batches or 1.0,
    )
    trainer.fit(system)

def main(config_path):
    config = process_config(config_path)
    # Create the Agent and run it with the given configuration.
    AgentClass = globals()[config.agent]
    agent = AgentClass(config)
    try:
        agent.run()
        agent.finalise()
    except KeyboardInterrupt:
        pass

def run(config_path, ir_checkpoint_dir=None):
    config = process_config(config_path)
    AgentClass = globals()[config.agent]
    agent = AgentClass(config)
    if ir_checkpoint_dir is not None:
        # This will load both the weights and the memory bank.
        agent.load_checkpoint('final.pth.tar', ir_checkpoint_dir,
                              load_memory_bank=True, load_model=True,
                              load_optim=False, load_epoch=False)
    try:
        agent.run()
        agent.finalise()
    except KeyboardInterrupt:
        pass

def random_search(config_path, num_exps):
    gpu = get_free_gpu()
    print("=============== Acquired GPU: {} ===============".format(gpu))
    for n in range(num_exps):
        # Sample a flat dict of hyperparameters and nest it so it can
        # override the base config.
        params = random_search_params()
        nested_dict = flat_to_nested_dict(params)
        curr_config = process_config(config_path,
                                     override_dotmap=DotMap(nested_dict),
                                     exp_base=EXP_BASE)
        exp_dir = run_agent(globals()[curr_config.agent], curr_config, gpu)
        print('======> Finished: ', exp_dir)
    print('================================================================')
    print('* COMPLETED MASS EXPERIMENTS *')
    print('================================================================')

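# A minimal sketch of the flat_to_nested_dict helper used above, assuming
# random_search_params() returns a flat dict with dot-separated keys such as
# {'optim_params.learning_rate': 0.03}. The name is suffixed with _sketch
# because the project's actual helper may differ.
def flat_to_nested_dict_sketch(flat):
    nested = {}
    for dotted_key, value in flat.items():
        node = nested
        *parents, leaf = dotted_key.split('.')
        for key in parents:
            node = node.setdefault(key, {})
        node[leaf] = value
    return nested

# Example:
#   flat_to_nested_dict_sketch({'optim_params.learning_rate': 0.03})
#   -> {'optim_params': {'learning_rate': 0.03}}
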
def run(config_path):
    config = process_config(config_path)
    AgentClass = globals()[config.agent]
    agent = AgentClass(config)
    if config.continue_exp_dir is not None:
        agent.logger.info("Found existing model... Continuing training!")
        agent.load_checkpoint(
            'checkpoint.pth.tar',
            checkpoint_dir=os.path.join(config.continue_exp_dir, 'checkpoints'),
            load_model=True,
            load_optim=True,
            load_epoch=True,
        )
    try:
        agent.run()
        agent.finalise()
    except KeyboardInterrupt:
        pass

def run(config_path, gpu_device=-1):
    config = process_config(config_path)
    if gpu_device >= 0:
        config.gpu_device = gpu_device
    seed_everything(config.seed)
    SystemClass = SYSTEM[config.system]
    system = SystemClass(config)
    ckpt_callback = pl.callbacks.ModelCheckpoint(
        os.path.join(config.exp_dir, 'checkpoints'),
        save_top_k=-1,
        period=1,
    )
    trainer = pl.Trainer(
        default_save_path=config.exp_dir,
        gpus=[config.gpu_device],
        max_epochs=config.num_epochs,
        min_epochs=config.num_epochs,
        checkpoint_callback=ckpt_callback,
        val_percent_check=0.1,
        resume_from_checkpoint=config.continue_from_checkpoint,
    )
    trainer.fit(system)

def run(config_path, gpu_device=None):
    if gpu_device == 'cpu' or not gpu_device:
        gpu_device = None
    config = process_config(config_path)
    if gpu_device:
        config.gpu_device = gpu_device
    seed_everything(config.seed, use_cuda=config.cuda)
    SystemClass = SYSTEM[config.system]
    system = SystemClass(config)
    if config.optim_params.scheduler:
        # MoCo-style scheduler: step-decay the LR at 60% and 80% of training.
        lr_callback = globals()[config.optim_params.scheduler](
            initial_lr=config.optim_params.learning_rate,
            max_epochs=config.num_epochs,
            schedule=(
                int(0.6 * config.num_epochs),
                int(0.8 * config.num_epochs),
            ),
        )
        callbacks = [lr_callback]
    else:
        callbacks = []
    ckpt_callback = pl.callbacks.ModelCheckpoint(
        os.path.join(config.exp_dir, 'checkpoints'),
        save_top_k=-1,
        period=1,
    )
    trainer = pl.Trainer(
        default_root_dir=config.exp_dir,
        gpus=gpu_device,
        max_epochs=config.num_epochs,
        min_epochs=config.num_epochs,
        checkpoint_callback=ckpt_callback,
        # Pass the scheduler callback so it actually runs (it was built
        # but never handed to the Trainer before).
        callbacks=callbacks,
        val_percent_check=0.1,
        resume_from_checkpoint=config.continue_from_checkpoint,
    )
    trainer.fit(system)

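# A minimal sketch of a step-decay scheduler callback matching the
# constructor used above (initial_lr, max_epochs, schedule), written against
# the older pytorch-lightning Callback API these scripts target. The class
# name and the 0.1 decay factor are assumptions; the project's actual
# scheduler may differ.
import pytorch_lightning as pl


class MoCoLRSchedulerSketch(pl.Callback):

    def __init__(self, initial_lr, max_epochs, schedule, decay=0.1):
        self.initial_lr = initial_lr
        self.max_epochs = max_epochs
        self.schedule = schedule  # epochs at which to decay, e.g. (60, 80)
        self.decay = decay

    def on_epoch_start(self, trainer, pl_module):
        # Multiply the base LR by `decay` once for each milestone passed.
        epoch = trainer.current_epoch
        lr = self.initial_lr * (
            self.decay ** sum(epoch >= m for m in self.schedule))
        for optimizer in trainer.optimizers:
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
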
def run(config_path, gpu_device=-1):
    config = process_config(config_path)
    if gpu_device >= 0:
        config.gpu_device = [gpu_device]
    AgentClass = globals()[config.agent]
    agent = AgentClass(config)
    if config.continue_exp_dir is not None:
        agent.logger.info("Found existing model... Continuing training!")
        checkpoint_dir = os.path.join(config.continue_exp_dir, 'checkpoints')
        agent.load_checkpoint(
            config.continue_exp_name,
            checkpoint_dir=checkpoint_dir,
            load_memory_bank=True,
            load_model=True,
            load_optim=True,
            load_epoch=True,
        )
    try:
        agent.run()
        agent.finalise()
    except KeyboardInterrupt:
        pass

                    choices=['easy', 'hard'])
parser.add_argument('--batch-size', type=int, default=128)
parser.add_argument('--gpu-device', type=int, default=0)
parser.add_argument('--cuda', action='store_true', default=False)
parser.add_argument('--seed', type=int, default=42)
args = parser.parse_args()

OUT_DIR = f"/mnt/fs5/wumike/reference/pretrain/{args.dataset}/ir_imagenet"
if not os.path.isdir(OUT_DIR):
    os.makedirs(OUT_DIR)

config_path = os.path.join(MODEL_DIR, 'config.json')
checkpoint_dir = os.path.join(MODEL_DIR, 'checkpoints')
assert os.path.isfile(os.path.join(checkpoint_dir, 'model_best.pth.tar'))

config = process_config(config_path,
                        override_dotmap={'gpu_device': args.gpu_device})
AgentClass = globals()[config.agent]
localagg = AgentClass(config)
localagg.load_checkpoint(
    'model_best.pth.tar',
    checkpoint_dir=checkpoint_dir,
    load_memory_bank=True,
    load_model=True,
)
localagg._set_models_to_eval()
gpu_device = localagg.config.gpu_device[0]

resnet = copy.deepcopy(localagg.model)
resnet.load_state_dict(localagg.model.state_dict())
# resnet = nn.Sequential(*list(resnet.children())[:-2])
resnet = resnet.eval()
for param in resnet.parameters():
    param.requires_grad = False  # freeze the encoder for feature extraction

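# Hypothetical continuation: with the encoder frozen, features can be dumped
# to OUT_DIR for downstream experiments. The `loader` variable, the assumed
# (image, label) batch format, and the 'features.pth' filename are
# illustrative assumptions, not part of the original script.
features = []
with torch.no_grad():
    for images, _ in loader:  # assumed (image, label) batches
        images = images.cuda(gpu_device)
        features.append(resnet(images).cpu())
torch.save(torch.cat(features), os.path.join(OUT_DIR, 'features.pth'))
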
def run(args, gpu_device=None):
    '''Run the Lightning system.

    Args:
        args:
            args.config: str, filepath to the config file
        gpu_device: str or None, specifies the GPU device as follows:
            None: CPU (specified as null in config)
            'cpu': CPU
            '-1': all available GPUs
            '0': GPU 0
            '4': GPU 4
            '0,3': GPUs 0 and 3
        See the following for more options:
        https://pytorch-lightning.readthedocs.io/en/latest/multi_gpu.html
    '''
    if gpu_device == 'cpu' or not gpu_device:
        gpu_device = None
    config = process_config(args.config)
    # Only override if specified.
    if gpu_device:
        config.gpu_device = gpu_device
    if args.num_workers:
        config.data_loader_workers = args.num_workers
    seed_everything(config.seed)
    SystemClass = SYSTEM[config.system]
    system = SystemClass(config)
    if config.optim_params.scheduler:
        lr_callback = globals()[config.optim_params.scheduler](
            initial_lr=config.optim_params.learning_rate,
            max_epochs=config.num_epochs,
            schedule=(
                int(0.6 * config.num_epochs),
                int(0.8 * config.num_epochs),
            ),
        )
        callbacks = [lr_callback]
    else:
        callbacks = []
    # TODO: adjust period for saving checkpoints.
    ckpt_callback = pl.callbacks.ModelCheckpoint(
        os.path.join(config.exp_dir, 'checkpoints'),
        save_top_k=-1,
        period=1,
    )
    wandb.init(project='image', entity='viewmaker', name=config.exp_name,
               config=config, sync_tensorboard=True)
    trainer = pl.Trainer(
        default_root_dir=config.exp_dir,
        gpus=gpu_device,
        # 'ddp' is usually faster, but we use 'dp' so the negative samples
        # for the whole batch are used for the SimCLR loss
        # distributed_backend=config.distributed_backend or 'dp',
        max_epochs=config.num_epochs,
        min_epochs=config.num_epochs,
        checkpoint_callback=ckpt_callback,
        resume_from_checkpoint=args.ckpt or config.continue_from_checkpoint,
        profiler=args.profiler,
        precision=config.optim_params.precision or 32,
        callbacks=callbacks,
        val_check_interval=config.val_check_interval or 1.0,
        limit_val_batches=config.limit_val_batches or 1.0,
    )
    trainer.fit(system)

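# A minimal CLI entry point for the run() above; a sketch only, assuming
# flag names (--gpu-device, --num-workers, --ckpt, --profiler) that may
# differ from the project's actual launcher script.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('config', type=str, help='path to the config file')
    parser.add_argument('--gpu-device', type=str, default=None)
    parser.add_argument('--num-workers', type=int, default=None)
    parser.add_argument('--ckpt', type=str, default=None)
    parser.add_argument('--profiler', action='store_true', default=False)
    args = parser.parse_args()
    run(args, gpu_device=args.gpu_device)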