def archive_backup(self): """Archiving the source folder, the training script and environment info. The training script is saved with the prefix '0-' to distinguish from regular scripts. Some of the information saved in the env info is: PyTorch version: 0.4.0 Is debug build: No CUDA used to build PyTorch: 8.0.61 OS: CentOS Linux release 7.3.1611 (Core) GCC version: (GCC) 5.2.0 CMake version: Could not collect Python version: 3.6 Is CUDA available: Yes CUDA runtime version: 8.0.44 GPU models and configuration: GPU 0: GeForce GTX 980 Ti GPU 1: GeForce GTX 980 Ti . """ # Archiving the Training script shutil.copyfile(self.script_path, self.save_path + '/0-' + os.path.basename(self.script_path)) os.chmod(self.save_path + '/0-' + os.path.basename(self.script_path), 0o755) # Archiving the src folder pkg_path = os.path.dirname(arch_src) backup_path = os.path.join(self.save_path, 'src_backup') shutil.make_archive(backup_path, 'gztar', pkg_path) # Archiving the Environment Info env_info = collect_env.get_pretty_env_info() with open(self.save_path + '/env_info.txt', 'w') as f: f.write(env_info)
def collect_env_info():
    """Return environment info as a string.

    The report is torch's ``collect_env`` output with the installed
    Pillow version appended on a new line.
    """
    from torch.utils.collect_env import get_pretty_env_info
    pillow_line = '\n Pillow ({})'.format(PIL.__version__)
    return get_pretty_env_info() + pillow_line
def dump_system_info(file_path: str):
    """Write a plain-text system/environment diagnostic report to ``file_path``.

    Sections (delimited by ``headline(...)``): torch's collect_env report,
    platform/python info, per-GPU details from GPUtil, CUDA/cuDNN version
    probes via shell commands, and the output of ``pip freeze``.
    Any existing file at ``file_path`` is overwritten.
    """
    # Start from a clean file.
    if os.path.isfile(file_path):
        os.remove(file_path)
    with open(file_path, 'w+') as o:
        o.write(headline('torch collect_env'))
        o.write(collect_env.get_pretty_env_info())
        o.write(headline('system info'))
        o.write('platform: %s\n' % platform.platform())
        o.write('python: %s\n' % platform.python_version())
        o.write(headline('gpus'))
        try:
            # One entry per visible GPU with a few key attributes.
            for i, gpu in enumerate(GPUtil.getGPUs()):
                o.write('gpu %d\n' % i)
                for k in ['id', 'driver', 'name', 'memoryTotal']:
                    o.write('\t%s=%s\n' % (k, gpu.__dict__[k]))
        except ValueError as e:
            # Record the GPU-query failure instead of aborting the whole dump
            # (presumably raised when no NVIDIA tooling is present — confirm).
            o.write("%s" % repr(e))
        o.write(headline('cuda / cudnn'))
        # Probe the CUDA toolkit through several channels; any one may be absent.
        o.write('cuda via cat: %s\n' % get_command_result('cat /usr/local/cuda/version.txt'))
        o.write('cuda via dpkg: %s\n' % get_command_result('dpkg -l | grep cuda-toolkit'))
        o.write('cuda via nvcc: %s\n' % get_command_result('nvcc --version'))
        o.write('cudnn version: %s\n' % cudnn.version())
        # o.write('\nnvidia-smi:\n%s\n' % get_command_result('nvidia-smi'))
        o.write(headline('pip freeze'))
        # Installed packages (local environment only).
        for r in freeze(local_only=True):
            o.write('%s\n' % r)
def collect_env_info():
    """Return env info: torch collect_env report, PIL version, and
    (when a git checkout is detected) the revision and modified files."""
    report = get_pretty_env_info()
    report += get_PIL_version()
    if git_available():
        report += '\nGit revision number: {}'.format(get_git_rev())
        report += '\nGit Modified\n{}'.format(get_git_modifed())
        # env_str += '\nGit Untrakced\n {}'.format(get_git_untracked())
    return report
def collect_env_info():
    """Return runtime environment information as a string.

    (Docstring translated from Chinese: "environment information".)

    :return: the torch ``collect_env`` report with the OpenCV version appended.
    """
    env_str = get_pretty_env_info()
    env_str += "\n OpenCV ({})".format(cv2.__version__)
    return env_str
def collect_env_info():
    r"""Return env info as a string: torch's collect_env report plus the
    Pillow version.

    Code source: github.com/facebookresearch/maskrcnn-benchmark
    """
    from torch.utils.collect_env import get_pretty_env_info
    parts = [get_pretty_env_info(), '\n Pillow ({})'.format(PIL.__version__)]
    return ''.join(parts)
def main():
    """Distributed inference entry point for the "sampling-free" project.

    Reads rank/world size from launcher-provided env vars (WORLD_SIZE,
    LOCAL_RANK), merges the YAML config with command-line overrides, builds
    the model (wrapped in DDP when more than one GPU is used), restores a
    checkpoint and runs evaluation.
    """
    parser = argparse.ArgumentParser(description="sampling-free")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    # The launcher must provide these; KeyError here means it was run bare.
    args.num_gpus = int(os.environ["WORLD_SIZE"])
    args.device_id = int(os.environ["LOCAL_RANK"])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.enabled = True
    if args.num_gpus > 1:
        torch.cuda.set_device(args.device_id)
        torch.distributed.init_process_group(backend="nccl")
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    # Inference outputs live beside the config: .../config.yaml -> .../inference
    output_dir = args.config_file.replace("config.yaml", "inference")
    if output_dir:
        mkdir(output_dir)
    logger = setup_logger("sampling-free", output_dir, args.device_id)
    logger.info("Using {} GPUs".format(args.num_gpus))
    logger.info(args)
    logger.info("Collecting env info (might take some time)")
    from torch.utils.collect_env import get_pretty_env_info
    logger.info("\n" + get_pretty_env_info())
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
    logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))
    model = build_model(cfg).cuda(args.device_id)
    if args.num_gpus > 1:
        logger.info("Use PyTorch DDP inference")
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.device_id])
    # Checkpointer loads weights as a side effect of construction.
    _ = Checkpointer(cfg, model)
    eval_checkpoint(cfg, model, output_dir, args.num_gpus)
def main():
    """Entry point: train, evaluate and profile a RecLib recommendation model.

    Merges the YAML config, optionally fixes the random seeds, builds the
    train/valid/test dataloaders and the model, trains it, then reports
    AUC/log-loss and MACs/parameter counts.
    """
    parser = argparse.ArgumentParser(description='PyTorch RecLib')
    parser.add_argument('--config-file', default='Configs/default.yaml', metavar='FILE',
                        help='path to configuration file', type=str)
    args = parser.parse_args()
    cfg.merge_from_file(args.config_file)
    cfg.freeze()
    # # create output dir
    # experiment_dir = get_unique_temp_folder(cfg.OUTPUT_DIR)
    print("Collecting env info (may take some time)\n")
    print(get_pretty_env_info())
    print("Loading configuration file from {}".format(args.config_file))
    print('Running with configuration: \n')
    print(cfg)
    # set random seed for pytorch and numpy; SEED == 0 means "no fixed seed"
    if cfg.SEED != 0:
        print("Using manual seed: {}".format(cfg.SEED))
        torch.manual_seed(cfg.SEED)
        torch.cuda.manual_seed(cfg.SEED)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(cfg.SEED)
    else:
        print("Using random seed")
        torch.backends.cudnn.benchmark = True
    # create dataloader
    train_loader, field_info = make_dataloader(cfg, split='train')
    valid_loader, _ = make_dataloader(cfg, split='valid')
    test_loader, _ = make_dataloader(cfg, split='test')
    # create model
    model = get_model(cfg, field_info)
    best_model = train(cfg, model, train_loader, valid_loader, save=False)
    auc, log_loss = test(cfg, best_model, test_loader, device=cfg.DEVICE)
    print("*" * 20)
    print("* Test AUC: {:.5f} *".format(auc))
    print("* Test Log Loss: {:.5f} *".format(log_loss))
    print("*" * 20)
    model.eval()
    macs, params = profile_model(model, test_loader, device=cfg.DEVICE)
    print("*" * 20)
    print("* MACs (M): {} *".format(macs / 10**6))
    print("* #Params (M): {} *".format(params / 10**6))
    # torch.float64 by default
    # NOTE(review): the *8 assumes 8 bytes/param (float64), but torch's
    # default dtype is float32 (4 bytes) — confirm the intended size math.
    print("* Model Size (MB): {} *".format(params * 8 / 10**6))
    print('*' * 20)
def collect_torch_env():
    """Return a human-readable description of the torch build/environment."""
    try:
        # Newer torch exposes the build configuration directly.
        from torch import __config__ as torch_config
        return torch_config.show()
    except ImportError:
        # Fall back to the generic collect_env report.
        from torch.utils.collect_env import get_pretty_env_info
        return get_pretty_env_info()
def collect_torch_env():
    """Return torch's build configuration, or the generic env report on
    older torch versions that lack ``torch.__config__``."""
    try:
        from torch import __config__ as torch_config
    except ImportError:
        # compatible with older versions of pytorch
        from torch.utils.collect_env import get_pretty_env_info
        return get_pretty_env_info()
    return torch_config.show()
def main():
    """Distributed training entry point for the "pymv" project.

    Initializes the (optional) NCCL process group, merges the YAML config
    with command-line overrides, sets up per-model output/logging, saves the
    effective config and launches training.
    """
    parser = argparse.ArgumentParser(description="pymv")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    # WORLD_SIZE is only set by a distributed launcher; default to 1 GPU.
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        synchronize()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    # Outputs are grouped by the config file's base name.
    model_name = os.path.splitext(os.path.basename(args.config_file))[0]
    output_dir = os.path.join("outputs", model_name)
    if output_dir:
        mkdir(output_dir)
    logger = setup_logger("pymv", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)
    logger.info("Collecting env info (might take some time)")
    from torch.utils.collect_env import get_pretty_env_info
    logger.info("\n" + get_pretty_env_info())
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
    logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))
    output_config_path = os.path.join(output_dir, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)
    model = train(cfg, args.local_rank, args.distributed, output_dir)
def main():
    """Distributed training entry point for the "sampling-free" project.

    Reads rank/world size from launcher-provided env vars (WORLD_SIZE,
    LOCAL_RANK), merges the YAML config with command-line overrides, creates
    the mirrored output directory, saves the effective config and trains.
    """
    parser = argparse.ArgumentParser(description="sampling-free")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    # The launcher must provide these; KeyError here means it was run bare.
    args.num_gpus = int(os.environ["WORLD_SIZE"])
    args.device_id = int(os.environ["LOCAL_RANK"])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.enabled = True
    if args.num_gpus > 1:
        torch.cuda.set_device(args.device_id)
        torch.distributed.init_process_group(backend="nccl")
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    # BUG FIX: the original used .strip('.yaml'), but str.strip removes the
    # *character set* {'.','y','a','m','l'} from both ends — e.g.
    # "outputs/yolo_small.yaml" would become "outputs/yolo_s" — instead of
    # removing the extension. splitext drops only the trailing extension.
    output_dir = os.path.splitext(args.config_file.replace("configs", "outputs"))[0]
    if output_dir:
        mkdir(output_dir)
    logger = setup_logger("sampling-free", output_dir, args.device_id)
    logger.info("Using {} GPUs".format(args.num_gpus))
    logger.info(args)
    logger.info("Collecting env info (might take some time)")
    from torch.utils.collect_env import get_pretty_env_info
    logger.info("\n" + get_pretty_env_info())
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
    logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))
    output_config_path = os.path.join(output_dir, 'config.yaml')
    logger.info("Saving config into: {}".format(output_config_path))
    save_config(cfg, output_config_path)
    model = train(cfg, args.device_id, args.num_gpus, output_dir, logger)
def test_expect(self): info_output = get_pretty_env_info() ci_build_envs = [ 'pytorch-linux-trusty-py2.7', 'pytorch-linux-xenial-cuda9-cudnn7-py3', 'pytorch-macos-10.13-py3', 'pytorch-win-ws2016-cuda9-cudnn7-py3' ] build_env = os.environ['BUILD_ENVIRONMENT'] if build_env not in ci_build_envs: return self.assertExpectedOutput(info_output, build_env)
def collect_torch_env():
    """
    If torch is available, print the torch config.
    """
    try:
        from torch import __config__ as torch_config
    except ImportError:
        # compatible with older versions of pytorch
        from torch.utils.collect_env import get_pretty_env_info
        return get_pretty_env_info()
    return torch_config.show()
def test_expect(self): info_output = get_pretty_env_info() ci_build_envs = [ 'pytorch-linux-trusty-py2.7', 'pytorch-linux-xenial-cuda9-cudnn7-py3', 'pytorch-macos-10.13-py3', 'pytorch-win-ws2016-cuda9-cudnn7-py3' ] build_env = os.environ['BUILD_ENVIRONMENT'] if build_env not in ci_build_envs: return self.assertExpectedOutput(info_output, build_env)
def get_diagnostic_info():
    """Assemble a diagnostic report string: timestamp, run UUID, argv,
    git branch/commit, and the torch collect_env report between rules."""
    rule = "-" * 50 + "\n"
    sections = [
        f"Log Time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}\n",
        f"UUID: {uuid.uuid1()}\n",
        f"Argv: {' '.join(sys.argv)}\n",
        f"Git Branch: {get_branch_name()}\n",
        f"Git Commit ID: {get_last_commit_id()}\n\n",
        "More Diagnostic Info: \n",
        rule,
        get_pretty_env_info() + "\n",
        rule,
    ]
    return "".join(sections)
def main():
    """Distributed test entry point for classification models.

    Merges config + CLI overrides, optionally pins CUDA_VISIBLE_DEVICES,
    initializes NCCL when launched with multiple processes, then runs the
    test pass and saves/prints the accuracy dict.
    """
    parser = argparse.ArgumentParser(
        description="PyTorch Classification Training.")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    # BUG FIX: the original used `is not ""`, an *identity* comparison with a
    # string literal (SyntaxWarning on CPython 3.8+, result is
    # implementation-dependent). Equality is the intended check.
    if cfg.MODEL.DEVICE == "cuda" and cfg.CUDA_VISIBLE_DEVICES != "":
        os.environ["CUDA_VISIBLE_DEVICES"] = cfg.CUDA_VISIBLE_DEVICES
    # WORLD_SIZE is only set by a distributed launcher; default to 1 GPU.
    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()
    logger = setup_logger("Classification", "", get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)
    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())
    acc = run_test(cfg, args.local_rank, distributed)
    save_dict_data(acc, os.path.join(cfg.OUTPUT_DIR, "acc.txt"))
    print_dict_data(acc)
def archive_backup(self):
    """Archiving the source folder, the training script and environment info.

    The training script is saved with the prefix "0-" to distinguish
    from regular scripts. Environment information equivalent to the output
    of ``python -m torch.utils.collect_env`` is saved in a file named
    "env_info.txt".
    """
    # Training script, copied with the "0-" prefix and made executable.
    script_dst = self.save_path + '/0-' + os.path.basename(self.script_path)
    shutil.copyfile(self.script_path, script_dst)
    os.chmod(script_dst, 0o755)
    # Source tree -> gzipped tarball "src_backup.tar.gz".
    src_root = os.path.dirname(arch_src)
    archive_base = os.path.join(self.save_path, 'src_backup')
    shutil.make_archive(archive_base, 'gztar', src_root)
    # Environment report, as produced by torch.utils.collect_env.
    report = collect_env.get_pretty_env_info()
    with open(self.save_path + '/env_info.txt', 'w') as f:
        f.write(report)
def get_env_info():
    """Gets the environment information, prefixed with a newline for logging."""
    return '\n{}'.format(get_pretty_env_info())
def main():
    """Training entry point for imbalanced metric learning.

    Merges the YAML config, prepares output dirs and logging, optionally
    fixes random seeds, builds dataloaders and the model, trains (saving
    checkpoints), then reloads the best validation checkpoint and tests it.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Imbalanced Metric Learning')
    parser.add_argument('--config-file', default='../configs/default.yaml',
                        metavar='FILE', help='path to configuration file', type=str)
    args = parser.parse_args()
    cfg.merge_from_file(args.config_file)
    cfg.freeze()
    # create output dir (plus a per-training-dataset subdirectory)
    if not os.path.exists(cfg.OUTPUT_DIR):
        os.makedirs(cfg.OUTPUT_DIR)
    if not os.path.exists(os.path.join(cfg.OUTPUT_DIR, cfg.DATASET.TRAIN)):
        os.makedirs(os.path.join(cfg.OUTPUT_DIR, cfg.DATASET.TRAIN))
    # set up logger
    logger = setup_logger('Imbalanced', cfg.OUTPUT_DIR)
    logger.info("Collecting env info (may take some time)\n")
    logger.info(get_pretty_env_info())
    logger.info("Loading configuration file from {}".format(args.config_file))
    with open(args.config_file) as f:
        config_str = f.read()
    config_str = '\n' + config_str.strip()
    logger.info(config_str)
    logger.info('Running with configuration: \n')
    logger.info(cfg)
    # set random seed for pytorch and numpy; SEED == 0 means "random"
    if cfg.SEED != 0:
        logger.info("Using manual seed: {}".format(cfg.SEED))
        torch.manual_seed(cfg.SEED)
        torch.cuda.manual_seed(cfg.SEED)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(cfg.SEED)
    else:
        logger.info("Using random seed")
        torch.backends.cudnn.benchmark = True
    train_loader = make_data_loader(cfg, is_train=True, is_valid=False)
    valid_loader = make_data_loader(cfg, is_train=False, is_valid=True)
    test_loader = make_data_loader(cfg, is_train=False, is_valid=False)
    model = build_model(cfg).to(cfg.DEVICE)
    logger.info(model)
    logger.info("Start Training ...")
    best_epoch = train(cfg, model, train_loader, valid_loader, save=True)
    logger.info("Done.")
    # load best model on validation dataset
    # please specify the best model, by default, it is the one with best G-Mean
    best_name = 'Epoch_{}'.format(best_epoch)
    best_model = torch.load('{}/{}/{}'.format(cfg.OUTPUT_DIR, cfg.DATASET.TRAIN,
                                              '{}.pth'.format(best_name)))
    logger.info("Best Model Name: {} - Start Evaluation ...".format(best_name))
    test(cfg, best_model, test_loader)
    logger.info("Done.")
# NOTE(review): this chunk begins mid-script (`parser` is created earlier,
# outside this view) and ends mid-loop — the remainder of the evaluation
# loop is not visible here.
parser.add_argument("--bs", type=int, default=4, help='test batch size.')
parser.add_argument("--weights", default="res50_1x.pth", help="path to the trained model")
parser.add_argument("--show_env", action='store_true', default=False,
                    help="Whether to show the env information.")
args = parser.parse_args()
cfg = update_config(args)
cfg.print_cfg()
# Optionally print the torch environment report before evaluating.
if cfg.show_env:
    from torch.utils.collect_env import get_pretty_env_info
    print(get_pretty_env_info())
val_loader = make_data_loader(cfg, is_train=False)
model = GeneralizedRCNN(cfg).cuda()
model.eval()
# Load weights on CPU first, then push into the CUDA model.
checkpoint = torch.load(cfg.weights, map_location=torch.device("cpu"))
model.load_state_dict(checkpoint)
predictions = {}
with torch.no_grad():
    for _, (images, targets, image_ids) in enumerate(tqdm(val_loader)):
        output = model(images.to(torch.device('cuda')))
        output = output[0].to(torch.device('cpu'))
def collect_env_info(): env_str = get_pretty_env_info() # 获取运行环境信息 env_str += "\n OpenCV ({})".format(cv2.__version__) return env_str
def main_worker(local_rank: int, ngpus_per_node: int, args: Args, conf: ConfigTree):
    """Per-process training/evaluation worker.

    Sets up the device/logger/tensorboard writer, optionally joins a
    distributed process group, builds the model/data/criterion/optimizer/
    scheduler from the config tree, optionally resumes from a checkpoint,
    then either evaluates once ("only_evaluate") or runs the epoch loop
    with per-epoch checkpointing and best-metric logging.
    """
    device = set_proper_device(local_rank)
    # Global rank across all nodes.
    rank = args.node_rank * ngpus_per_node + local_rank
    init_logger(rank=rank, filenmae=args.output_dir / "default.log")
    # Only the master process writes tensorboard summaries.
    writer = SummaryWriter(args.output_dir) if is_master() else DummyClass()
    # log some diagnostic messages
    if not conf.get_bool("only_evaluate"):
        _logger.info("Collect envs from system:\n" + get_pretty_env_info())
        _logger.info("Args:\n" + pprint.pformat(dataclasses.asdict(args)))
    # init distributed
    if args.world_size > 1:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=rank)
    # init model, optimizer, scheduler
    model: nn.Module = MODEL.build_from(conf.get("model"))
    if conf.get("model.load_from", None) is not None:
        model.load_state_dict(
            torch.load(conf.get("model.load_from"), map_location="cpu"))
    if is_dist_avail_and_init() and conf.get_bool("sync_batchnorm"):
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    image_size = conf.get_int('data.image_size')
    _logger.info(
        f"Model details: n_params={compute_nparam(model)/1e6:.2f}M, "
        f"flops={compute_flops(model,(1,3, image_size, image_size))/1e6:.2f}M."
    )
    train_loader, val_loader = DATA.build_from(conf.get("data"))
    criterion = CRITERION.build_from(conf.get("criterion"))
    optimizer = OPTIMIZER.build_from(conf.get("optimizer"),
                                     dict(params=model.parameters()))
    scheduler = SCHEDULER.build_from(conf.get("scheduler"),
                                     dict(optimizer=optimizer))
    if torch.cuda.is_available():
        model = model.to(device=device)
        criterion = criterion.to(device=device)
    # restore metrics, model, optimizer and scheduler state of the checkpoint
    metrics = MetricsList()
    # (name kept as-is; "minitor" is an existing typo shared with ModelSaver)
    minitor_metric = "val/top1_acc"
    states = dict(model=unwarp_module(model),
                  optimizer=optimizer,
                  scheduler=scheduler)
    saver = ModelSaver(args.output_dir)
    if conf.get_bool("auto_resume"):
        saver.restore(metrics, states, device=device)
        # Resume point is inferred from how many metric entries were restored.
        start_epoch = len(metrics[minitor_metric])
        if start_epoch != 0:
            _logger.info(f"Load chckpoint from epoch={start_epoch}.")
    else:
        start_epoch = 0
    if is_dist_avail_and_init():
        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[local_rank])
    if conf.get_bool("only_evaluate"):
        val_metrics = evaluate(epoch=0,
                               model=model,
                               loader=val_loader,
                               criterion=criterion,
                               device=device,
                               log_interval=conf.get_int("log_interval"))
        _logger.info(
            f"EVAL complete, top1-acc={val_metrics['val/top1_acc']*100:.2f}%, " +
            f"top5-acc={val_metrics['val/top5_acc']*100:.2f}%")
    else:
        ETA = EstimatedTimeArrival(conf.get_int("max_epochs"))
        for epoch in range(start_epoch + 1, conf.get_int("max_epochs") + 1):
            metrics += train(epoch=epoch,
                             model=model,
                             loader=train_loader,
                             criterion=criterion,
                             optimizer=optimizer,
                             scheduler=scheduler,
                             use_amp=conf.get_bool("use_amp"),
                             accmulated_steps=conf.get_int("accmulated_steps"),
                             device=device,
                             log_interval=conf.get_int("log_interval"))
            metrics += evaluate(epoch=epoch,
                                model=model,
                                loader=val_loader,
                                criterion=criterion,
                                device=device,
                                log_interval=conf.get_int("log_interval"))
            # record metric in tensorboard
            for name, metric_values in metrics.items():
                writer.add_scalar(name, metric_values[-1], epoch)
            # save checkpoint
            saver.save(minitor=minitor_metric,
                       metrics=metrics.as_plain_dict(),
                       states=states)
            ETA.step()
            # log the best metric
            best_epoch_index, _ = find_best_metric(metrics[minitor_metric])
            best_top1acc = metrics["val/top1_acc"][best_epoch_index]
            best_top5acc = metrics["val/top5_acc"][best_epoch_index]
            _logger.info(
                f"Epoch={epoch:04d} complete, best val top1-acc={best_top1acc*100:.2f}%, "
                f"top5-acc={best_top5acc*100:.2f}% (epoch={best_epoch_index+1}), {ETA}"
            )
def collect_env_info():
    """Return the torch collect_env report with the PIL version appended."""
    return get_pretty_env_info() + get_pil_version()
def test_smoke(self):
    """Smoke test: the env report should span a non-trivial number of lines."""
    newline_count = get_pretty_env_info().count('\n')
    self.assertTrue(newline_count >= 17)
def main():
    """Distributed inference entry point for spatio-temporal action detection.

    Initializes the (optional) NCCL process group, merges the config, builds
    the detection model, loads weights via the checkpointer, then runs
    inference over every configured test dataset.
    """
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    # WORLD_SIZE is only set by a distributed launcher; default to 1 GPU.
    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
    # Merge config file.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    # Print experimental infos.
    save_dir = ""
    logger = setup_logger("alphaction", save_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)
    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())
    # Build the model.
    model = build_detection_model(cfg)
    model.to("cuda")
    # load weight.
    output_dir = cfg.OUTPUT_DIR
    checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir)
    checkpointer.load(cfg.MODEL.WEIGHT)
    # One output folder per test dataset (only when OUTPUT_DIR is set).
    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    mem_active = has_memory(cfg.MODEL.IA_STRUCTURE)
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            os.makedirs(output_folder, exist_ok=True)
            output_folders[idx] = output_folder
    # Do inference.
    data_loaders_test = make_data_loader(cfg, is_train=False,
                                         is_distributed=distributed)
    for output_folder, dataset_name, data_loader_test in zip(
            output_folders, dataset_names, data_loaders_test):
        inference(
            model,
            data_loader_test,
            dataset_name,
            mem_active=mem_active,
            output_folder=output_folder,
        )
        synchronize()
def train(self):
    """
    Perform a training run.

    Builds the SSL model/criterion/optimizer/scheduler, optionally resumes
    from the latest checkpoint, wraps the model in DataParallel when several
    GPUs are available (scaling batch sizes accordingly), then runs the
    epoch loop with periodic checkpointing and optional evaluation.
    """
    print_cfg()
    logger.info("System config:\n{}".format(get_pretty_env_info()))
    model = BaseImageSSLModel()
    criterion = get_criterion()
    optimizer = get_optimizer(model)
    scheduler = get_scheduler(optimizer)
    logger.info(model)
    start_epoch = 0
    # Optionally resume from the most recent checkpoint.
    if cfg.TRAINER.AUTO_RESUME and checkpoint.has_checkpoint():
        last_checkpoint = checkpoint.get_last_checkpoint()
        checkpoint_epoch = checkpoint.load_checkpoint(
            last_checkpoint, model, optimizer, scheduler
        )
        logger.info("Loaded checkpoint from: {}".format(last_checkpoint))
        if not cfg.TRAINER.RESET_START_EPOCH:
            start_epoch = checkpoint_epoch + 1
    if torch.cuda.is_available():
        # Multi-GPU: explicit GPU_IDS list, or all visible GPUs when unset.
        if len(cfg.GPU_IDS) > 1 or (
            len(cfg.GPU_IDS) == 0 and torch.cuda.device_count() > 1
        ):
            num_gpus = (
                len(cfg.GPU_IDS) if cfg.GPU_IDS else torch.cuda.device_count()
            )
            model = nn.DataParallel(
                model, device_ids=(cfg.GPU_IDS if cfg.GPU_IDS else None)
            )
            # Scale batch sizes linearly with the number of GPUs.
            cfg.TRAIN.BATCH_SIZE = cfg.TRAIN.BATCH_SIZE * num_gpus
            cfg.TEST.BATCH_SIZE = cfg.TEST.BATCH_SIZE * num_gpus
        elif len(cfg.GPU_IDS) == 1:
            torch.cuda.set_device(cfg.GPU_IDS[0])
        print('use cuda')
        model.cuda()
    train_dataset = GenericSSLDataset("TRAIN")
    train_loader = DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE,
        shuffle=True,
        num_workers=cfg.TRAINER.NUM_WORKERS,
        drop_last=True,
    )
    if cfg.TRAINER.EVAL_MODEL:
        val_dataset = GenericSSLDataset("TEST")
        val_loader = DataLoader(
            val_dataset,
            batch_size=cfg.TEST.BATCH_SIZE,
            shuffle=False,
            num_workers=cfg.TRAINER.NUM_WORKERS,
            drop_last=True,
        )
    train_timer = Timer()
    test_timer = Timer()
    logger.info("=> Training model...")
    for i_epoch in range(start_epoch, cfg.TRAINER.MAX_EPOCHS):
        train_timer.tic()
        self.train_loop(
            train_loader, model, criterion, optimizer, scheduler, i_epoch
        )
        train_timer.toc()
        if checkpoint.is_checkpoint_epoch(i_epoch):
            checkpoint.save_checkpoint(model, optimizer, scheduler, i_epoch)
        if cfg.TRAINER.EVAL_MODEL and is_eval_epoch(i_epoch):
            test_timer.tic()
            self.eval_loop(val_loader, model, i_epoch)
            test_timer.toc()
        # NOTE(review): source formatting was collapsed; timer-stat logging is
        # assumed to run once per epoch (not only on eval epochs) — confirm
        # against the original file.
        log_post_epoch_timer_stats(train_timer, test_timer, i_epoch)
def test():
    """Inference entry point for photo-realistic style transfer.

    Modes:
      * ``--mode 0`` with segmentation masks: single content/style pair with
        content/style masks.
      * ``--mode 0`` without masks: single content/style pair.
      * otherwise: batch inference over ``--contentDir`` / ``--styleDir``.
    Optionally resizes inputs to fit memory limits (``--resize``).
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Photo-Realistic Style Transfer Library')
    parser.add_argument('--config-file', type=str, default='',
                        help='path to configuration file')
    parser.add_argument('--outputDir', type=str, default='Demo',
                        help='name of output folder')
    parser.add_argument('--saveOrig', default=False, action='store_true')
    parser.add_argument('--contentDir', type=str, default='',
                        help='path to directory of content images')
    parser.add_argument('--styleDir', type=str, default='',
                        help='path to directory of style images')
    parser.add_argument('--content', type=str, default='',
                        help='path to content image')
    parser.add_argument('--style', type=str, default='',
                        help='path to style image')
    parser.add_argument(
        '--mode', type=int, default=0,
        help='Inference mode: 0 - Single Content; 1 - Multiple Content (Stored in a directory)'
    )
    # advanced options
    parser.add_argument('--content-seg', default='', type=str,
                        help='path to content mask image')
    parser.add_argument('--style-seg', default='', type=str,
                        help='path to style mask image')
    parser.add_argument('--resize', default=False, action='store_true',
                        help='resize original image to accelerate computing')
    args = parser.parse_args()
    # update configuration
    cfg.merge_from_file(args.config_file)
    cfg.freeze()
    test_transform = build_transform(cfg, train=False,
                                     interpolation=Image.BICUBIC,
                                     normalize=True)
    test_seg_transform = build_transform(cfg, train=False,
                                         interpolation=Image.NEAREST,
                                         normalize=False)
    # Masked inference is enabled when either segmentation path is given.
    if args.content_seg or args.style_seg:
        mask_on = True
    else:
        mask_on = False
    # create output dir
    if cfg.OUTPUT_DIR:
        os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    # create logger
    logger = setup_logger(cfg.MODEL.NAME, save_dir=cfg.OUTPUT_DIR,
                          filename=cfg.MODEL.NAME + '.txt')
    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())
    logger.info('Loaded configuration file {}'.format(args.config_file))
    logger.info("Running with config:\n{}".format(cfg))
    # create output dir
    output_dir = os.path.join(cfg.OUTPUT_DIR, args.outputDir)
    os.makedirs(output_dir, exist_ok=True)
    logger.info('Output Dir Created: {}'.format(output_dir))
    # create model
    model = model_factory[cfg.MODEL.NAME](cfg)
    logger.info(model)
    # inference
    if args.mode == 0:
        if mask_on:
            # 1-content | N-style | 1-mask, process single content image
            assert args.content, 'Path to the content image should be non-empty'
            assert args.style, 'Paths to the style images should be non-empty'
            assert args.content_seg, 'Path to the content segment image should be non-empty'
            assert args.style_seg, 'Path to the style segment image should be non-empty'
            content_img_path = os.path.join(cfg.INPUT_DIR, args.content)
            style_img_path = os.path.join(cfg.INPUT_DIR, args.style)
            content_seg_path = os.path.join(
                cfg.INPUT_DIR,
                args.content_seg) if args.content_seg else args.content_seg
            style_seg_path = os.path.join(
                cfg.INPUT_DIR,
                args.style_seg) if args.style_seg else args.style_seg
            # Output name = content file name without its extension.
            name = content_img_path.split('/')[-1]
            name = name[:name.rindex('.')]
            # load image
            content_img = default_loader(content_img_path)
            style_img = default_loader(style_img_path)
            content_copy = content_img.copy()
            cw, ch = content_copy.width, content_copy.height
            sw, sh = style_img.width, style_img.height
            if args.resize:
                # new size after resizing content image
                new_cw, new_ch = memory_limit_image_size(content_img,
                                                         cfg.INPUT.MIN_SIZE,
                                                         cfg.INPUT.MAX_SIZE,
                                                         logger=logger)
                # new size after resizing style image
                new_sw, new_sh = memory_limit_image_size(style_img,
                                                         cfg.INPUT.MIN_SIZE,
                                                         cfg.INPUT.MAX_SIZE,
                                                         logger=logger)
            else:
                new_cw, new_ch = cw, ch
                new_sw, new_sh = sw, sh
            content_img = test_transform(content_img).unsqueeze(0)
            style_img = test_transform(style_img).unsqueeze(0)
            cont_seg = Image.open(content_seg_path)
            styl_seg = Image.open(style_seg_path)
            # resize segmentation image the same size as corresponding images
            cont_seg = cont_seg.resize((new_cw, new_ch), Image.NEAREST)
            styl_seg = styl_seg.resize((new_sw, new_sh), Image.NEAREST)
            cont_seg = test_seg_transform(cont_seg)
            styl_seg = test_seg_transform(styl_seg)
            with torch.no_grad():
                infer_image(cfg, name, model, content_img, style_img, logger,
                            output_dir, ch, cw,
                            save_orig=args.saveOrig,
                            content_seg_img=cont_seg,
                            style_seg_img=styl_seg,
                            orig_content=content_copy,
                            test_transform=test_transform)
        elif args.content and args.style:
            # 1-content | 1-style, process single pair of images
            content_img_path = os.path.join(cfg.INPUT_DIR, args.content)
            style_img_path = os.path.join(cfg.INPUT_DIR, args.style)
            name = content_img_path.split('/')[-1]
            name = name[:name.rindex('.')]
            content_img = default_loader(content_img_path)
            style_img = default_loader(style_img_path)
            # NOTE(review): here `ch` gets width and `cw` gets height —
            # opposite of the masked branch above; looks like a swap bug,
            # confirm against infer_image's expectations.
            ch, cw = content_img.width, content_img.height
            content_copy = content_img.copy()
            if args.resize:
                # new size after resizing content image
                new_cw, new_ch = memory_limit_image_size(content_img,
                                                         cfg.INPUT.MIN_SIZE,
                                                         cfg.INPUT.MAX_SIZE,
                                                         logger=logger)
                # new size after resizing style image
                new_sw, new_sh = memory_limit_image_size(style_img,
                                                         cfg.INPUT.MIN_SIZE,
                                                         cfg.INPUT.MAX_SIZE,
                                                         logger=logger)
            else:
                new_cw, new_ch = cw, ch
            content_img = test_transform(content_img).unsqueeze(0)
            style_img = test_transform(style_img).unsqueeze(0)
            with torch.no_grad():
                infer_image(cfg, name, model, content_img, style_img, logger,
                            output_dir, ch, cw,
                            save_orig=args.saveOrig,
                            orig_content=content_copy,
                            test_transform=test_transform)
        else:
            raise RuntimeError('Invalid Argument Setting')
    else:
        if args.contentDir and args.styleDir:
            # 1-vs-1, but process multiple images in the directory
            content_img, style_img, names = prepare_loading(
                cfg,
                os.path.join(cfg.INPUT_DIR, args.contentDir),
                os.path.join(cfg.INPUT_DIR, args.styleDir),
            )
            iterator = tqdm(range(len(content_img)))
            for i in iterator:
                c_img, s_img = content_img[i], style_img[i]
                cw, ch = c_img.width, c_img.height
                c_copy = c_img.copy()
                if args.resize:
                    # new size after resizing content image
                    new_cw, new_ch = memory_limit_image_size(
                        c_img, cfg.INPUT.MIN_SIZE, cfg.INPUT.MAX_SIZE,
                        logger=logger)
                    # new size after resizing style image
                    new_sw, new_sh = memory_limit_image_size(
                        s_img, cfg.INPUT.MIN_SIZE, cfg.INPUT.MAX_SIZE,
                        logger=logger)
                else:
                    new_cw, new_ch = cw, ch
                c_img = test_transform(c_img).unsqueeze(0)
                s_img = test_transform(s_img).unsqueeze(0)
                name = names[i]
                with torch.no_grad():
                    infer_image(cfg, name, model, c_img, s_img, logger,
                                output_dir, ch, cw,
                                save_orig=args.saveOrig,
                                orig_content=c_copy,
                                test_transform=test_transform)
                iterator.set_description(desc='Test Case {}'.format(i))
        else:
            raise RuntimeError('Invalid Argument Setting')
    logger.info('Done!')
def main():
    """Entry point for PyTorch classification training.

    Parses CLI arguments, merges them into the global ``cfg``, configures
    (optionally distributed) training, sets up logging and TensorBoard,
    runs training and — unless ``--skip-test`` is given — final evaluation.
    """
    parser = argparse.ArgumentParser(
        description="PyTorch Classification Training.")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # BUG FIX: the original used `is not ""`, which tests object identity
    # rather than equality (and raises a SyntaxWarning on Python >= 3.8).
    if cfg.MODEL.DEVICE == "cuda" and cfg.CUDA_VISIBLE_DEVICES != "":
        os.environ["CUDA_VISIBLE_DEVICES"] = cfg.CUDA_VISIBLE_DEVICES

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    # Create the TensorBoard writer on the master process only.
    # BUG FIX: `tb_writer` was previously left unbound on non-zero ranks,
    # which caused a NameError at the `run_train(...)` call below.
    output_dir = cfg.OUTPUT_DIR
    tb_dir = os.path.join(output_dir, 'tb_log')
    tb_writer = None
    if get_rank() == 0 and output_dir:
        mkdir(output_dir)
        tb_writer = SummaryWriter(tb_dir)

    logger = setup_logger("Classification", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # Persist the effective (merged + frozen) config next to the outputs.
    output_config_path = os.path.join(cfg.OUTPUT_DIR, "config.yaml")
    logger.info("Saving config into: {}".format(output_config_path))
    save_config(cfg, output_config_path)

    model = run_train(cfg, args.local_rank, distributed, tb_writer)

    if not args.skip_test:
        acc = run_test(cfg, args.local_rank, distributed, model)
        save_dict_data(acc, os.path.join(cfg.OUTPUT_DIR, "acc.txt"))
        print_dict_data(acc)
def train_lst():
    """Train the LinearStyleTransfer transformation layer.

    Reads a YACS config file from ``--config-file``, builds the model and
    two iteration-based data loaders (content images and style images),
    then optimizes only ``model.trans_layer`` for ``cfg.OPTIMIZER.MAX_ITER``
    iterations, logging losses to TensorBoard and periodically saving
    sample images and checkpoints into ``cfg.OUTPUT_DIR``.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Style Transfer -- LinearStyleTransfer')
    parser.add_argument('--config-file', type=str, default='',
                        help='path to configuration file')
    args = parser.parse_args()
    cfg.merge_from_file(args.config_file)
    cfg.freeze()
    # create output dir
    if cfg.OUTPUT_DIR:
        os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    # create logger (writes to <OUTPUT_DIR>/<MODEL.NAME>.txt)
    logger = setup_logger(cfg.MODEL.NAME,
                          save_dir=cfg.OUTPUT_DIR,
                          filename=cfg.MODEL.NAME + '.txt')
    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())
    logger.info('Loaded configuration file {}'.format(args.config_file))
    logger.info("Running with config:\n{}".format(cfg))
    # create model
    model = get_model(cfg.MODEL.NAME, cfg)
    # push model to device
    model.to(cfg.DEVICE)
    logger.info(model)
    # create dataloader
    train_path_content, train_path_style = get_data(cfg, dtype='train')
    content_dataset = DatasetNoSeg(cfg, train_path_content, train=True)
    style_dataset = DatasetNoSeg(cfg, train_path_style, train=True)
    # content loader — IterationBasedBatchSampler keeps yielding batches
    # until MAX_ITER iterations have been produced (re-cycling the dataset)
    sampler = torch.utils.data.sampler.RandomSampler(content_dataset)
    batch_sampler = torch.utils.data.sampler.BatchSampler(
        sampler, cfg.DATALOADER.BATCH_SIZE, drop_last=False)
    content_loader = DataLoader(content_dataset,
                                batch_sampler=IterationBasedBatchSampler(
                                    batch_sampler,
                                    cfg.OPTIMIZER.MAX_ITER,
                                    start_iter=0),
                                num_workers=cfg.DATALOADER.NUM_WORKERS)
    logger.info('Content Loader Created!')
    # style loader — built the same way as the content loader
    sampler = torch.utils.data.sampler.RandomSampler(style_dataset)
    batch_sampler = torch.utils.data.sampler.BatchSampler(
        sampler, cfg.DATALOADER.BATCH_SIZE, drop_last=False)
    style_loader = DataLoader(style_dataset,
                              batch_sampler=IterationBasedBatchSampler(
                                  batch_sampler,
                                  cfg.OPTIMIZER.MAX_ITER,
                                  start_iter=0),
                              num_workers=cfg.DATALOADER.NUM_WORKERS)
    logger.info('Style Loader Created!')
    # consume both loaders as plain iterators, one batch per iteration
    content_loader = iter(content_loader)
    style_loader = iter(style_loader)
    # only the transformation layer is optimized; the rest of the model
    # (e.g. the encoder/decoder) is left untouched by the optimizer
    optimizer = build_optimizer(cfg, model.trans_layer)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)
    logger.info("Using Optimizer: ")
    logger.info(optimizer)
    logger.info("Using LR Scheduler: {}".format(
        cfg.OPTIMIZER.LR_SCHEDULER.NAME))
    iterator = tqdm(range(cfg.OPTIMIZER.MAX_ITER))
    writer = SummaryWriter(log_dir=cfg.OUTPUT_DIR)
    # start training
    for i in iterator:
        content_img = next(content_loader).to(cfg.DEVICE)
        style_img = next(style_loader).to(cfg.DEVICE)
        # skip iterations where the two loaders yield unequal batch sizes
        # (possible because drop_last=False leaves ragged final batches)
        if content_img.shape[0] != style_img.shape[0]:
            continue
        g_t = model.forward_with_trans(content_img, style_img)
        loss, style_loss, content_loss = model.cal_trans_loss(
            g_t, content_img, style_img)
        # update info
        iterator.set_description(
            desc=
            'Iteration: {} -- Loss: {:.3f} -- Content Loss: {:.3f} -- Style Loss: {:.3f}'
            .format(i + 1, loss.item(), content_loss.item(),
                    style_loss.item()))
        writer.add_scalar('loss_content', content_loss.item(), i + 1)
        writer.add_scalar('loss_style', style_loss.item(), i + 1)
        # update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # update lr
        lr_scheduler.step()
        # save image (every 1000 iterations; rows: content, style, result)
        if i % 1000 == 0:
            n = content_img.shape[0]
            all_imgs = torch.cat((content_img, style_img, g_t), dim=0)
            save_image(all_imgs,
                       os.path.join(cfg.OUTPUT_DIR, '{}.jpg'.format(i)),
                       nrow=n)
        # periodic checkpoint of the transformation layer only
        if i % 10000 == 0:
            torch.save(model.trans_layer.state_dict(),
                       os.path.join(cfg.OUTPUT_DIR, '{}_lst.pth'.format(i)))
    # final checkpoint after the loop completes
    torch.save(model.trans_layer.state_dict(),
               os.path.join(cfg.OUTPUT_DIR, 'final_lst.pth'))
    writer.close()
def test_smoke(self): info_output = get_pretty_env_info() self.assertTrue(info_output.count('\n') >= 17)
def main() -> None:
    """Main train and eval function.

    Initializes the distributed process group, builds CIFAR data loaders,
    the model (wrapped in DistributedDataParallel), optimizer, preconditioner
    and schedulers, resumes from the most recent checkpoint if one exists,
    then runs the epoch train/eval loop with periodic checkpointing on rank 0.
    """
    args = parse_args()

    torch.distributed.init_process_group(
        backend=args.backend,
        init_method='env://',
    )

    if args.cuda:
        torch.cuda.set_device(args.local_rank)
        torch.cuda.manual_seed(args.seed)
        # torch.backends.cudnn.benchmark = False
        # torch.backends.cudnn.deterministic = True

    # Linearly scale the base LR with the effective global batch size.
    args.base_lr = (
        args.base_lr * dist.get_world_size() * args.batches_per_allreduce
    )
    args.verbose = dist.get_rank() == 0

    if args.verbose:
        print('Collecting env info...')
        print(collect_env.get_pretty_env_info())
        print()

    # Announce initialization one rank at a time to keep output readable.
    for r in range(torch.distributed.get_world_size()):
        if r == torch.distributed.get_rank():
            print(
                f'Global rank {torch.distributed.get_rank()} initialized: '
                f'local_rank = {args.local_rank}, '
                f'world_size = {torch.distributed.get_world_size()}',
            )
        torch.distributed.barrier()

    train_sampler, train_loader, _, val_loader = datasets.get_cifar(args)
    model = models.get_model(args.model)

    device = 'cpu' if not args.cuda else 'cuda'
    model.to(device)

    if args.verbose:
        summary(model, (args.batch_size, 3, 32, 32), device=device)

    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[args.local_rank],
    )

    os.makedirs(args.log_dir, exist_ok=True)
    args.checkpoint_format = os.path.join(args.log_dir, args.checkpoint_format)
    args.log_writer = SummaryWriter(args.log_dir) if args.verbose else None

    # Resume from the latest checkpoint on disk, if any.
    args.resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            args.resume_from_epoch = try_epoch
            break

    scaler = None
    if args.fp16:
        if not TORCH_FP16:
            # BUG FIX: torch.cuda.amp (GradScaler) was introduced in
            # torch 1.6 — the message previously said ">= 1.16".
            raise ValueError(
                'The installed version of torch does not '
                'support torch.cuda.amp fp16 training. This '
                'requires torch version >= 1.6',
            )
        scaler = GradScaler()
    args.grad_scaler = scaler

    (
        optimizer,
        preconditioner,
        (lr_scheduler, kfac_scheduler),
    ) = optimizers.get_optimizer(
        model,
        args,
    )
    if args.verbose:
        print(preconditioner)
    loss_func = torch.nn.CrossEntropyLoss()

    if args.resume_from_epoch > 0:
        filepath = args.checkpoint_format.format(epoch=args.resume_from_epoch)
        # Remap tensors saved from cuda:0 onto this process's device.
        map_location = {'cuda:0': f'cuda:{args.local_rank}'}
        checkpoint = torch.load(filepath, map_location=map_location)
        model.module.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if checkpoint['lr_scheduler'] is not None:
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        if (
            checkpoint['preconditioner'] is not None
            and preconditioner is not None
        ):
            preconditioner.load_state_dict(checkpoint['preconditioner'])

    start = time.time()

    for epoch in range(args.resume_from_epoch + 1, args.epochs + 1):
        engine.train(
            epoch,
            model,
            optimizer,
            preconditioner,
            loss_func,
            train_sampler,
            train_loader,
            args,
        )
        engine.test(epoch, model, loss_func, val_loader, args)
        lr_scheduler.step()
        if kfac_scheduler is not None:
            kfac_scheduler.step(step=epoch)
        if (
            epoch > 0
            and epoch % args.checkpoint_freq == 0
            and dist.get_rank() == 0
        ):
            # Note: save model.module b/c model may be Distributed wrapper
            # so saving the underlying model is more generic
            save_checkpoint(
                model.module,
                optimizer,
                preconditioner,
                lr_scheduler,
                args.checkpoint_format.format(epoch=epoch),
            )

    if args.verbose:
        print(
            '\nTraining time: {}'.format(
                datetime.timedelta(seconds=time.time() - start),
            ),
        )