def test_register_record_function(tmpdir):
    """Check the events recorded when a model runs under ``RegisterRecordFunction``.

    A ``BoringModel`` subclass whose ``layer`` is a three-module ``Sequential``
    is called once inside ``pytorch_profiler.profile("a")``. The test then
    asserts that the profiler's function events contain an entry named after
    the container and after each of its three children.
    """
    use_cuda = torch.cuda.is_available()
    pytorch_profiler = PyTorchProfiler(
        export_to_chrome=False,
        use_cuda=use_cuda,
        dirpath=tmpdir,
        filename="profiler",
        schedule=None,
        on_trace_ready=None,
    )

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Sequential(torch.nn.Linear(1, 1), torch.nn.ReLU(), torch.nn.Linear(1, 1))

    model = TestModel()
    # Named ``sample`` rather than ``input`` so the builtin is not shadowed.
    sample = torch.rand((1, 1))

    if use_cuda:
        model = model.cuda()
        sample = sample.cuda()

    with pytorch_profiler.profile("a"):
        with RegisterRecordFunction(model):
            model(sample)

    pytorch_profiler.describe()
    event_names = [e.name for e in pytorch_profiler.function_events]

    expected_names = (
        "[pl][module]torch.nn.modules.container.Sequential: layer",
        "[pl][module]torch.nn.modules.linear.Linear: layer.0",
        "[pl][module]torch.nn.modules.activation.ReLU: layer.1",
        "[pl][module]torch.nn.modules.linear.Linear: layer.2",
    )
    for expected_name in expected_names:
        assert expected_name in event_names
def test_pytorch_profiler_trainer_ddp(tmpdir, use_output_filename):
    """Fit a ``BoringModel`` under DDP on two GPUs with a ``PyTorchProfiler`` attached.

    When ``use_output_filename`` is truthy the profiler is given a
    ``profiler.txt`` path inside ``tmpdir``; otherwise it is given ``None``.
    """
    output_filename = os.path.join(tmpdir, "profiler.txt") if use_output_filename else None
    profiler = PyTorchProfiler(output_filename=output_filename)

    trainer = Trainer(
        fast_dev_run=True,
        profiler=profiler,
        accelerator="ddp",
        gpus=2,
    )
    trainer.fit(BoringModel())

    # Without an output file, results are only expected on local rank 0.
    if use_output_filename or profiler.local_rank == 0:
        assert len(profiler.summary()) > 0
        recorded = set(profiler.profiled_actions)
        assert recorded == {'training_step_and_backward', 'validation_step'}
    else:
        assert profiler.summary() is None
        assert set(profiler.profiled_actions) == set()

    # todo (tchaton) add support for all ranks
    if use_output_filename and os.getenv("LOCAL_RANK") == "0":
        report = Path(profiler.output_fname).read_text()
        assert len(report) > 0
def test_pytorch_profiler_trainer_ddp(tmpdir):
    """Fit a ``BoringModel`` under DDP on two GPUs and inspect the profiler output.

    Checks that the summary is non-empty, that the profiled actions are the
    training and validation steps, and that this rank's
    ``fit-profiler-<rank>.txt`` exists in the profiler's ``dirpath`` with content.
    """
    pytorch_profiler = PyTorchProfiler(dirpath=None, filename="profiler")
    trainer_kwargs = dict(
        max_epochs=1,
        default_root_dir=tmpdir,
        limit_train_batches=2,
        limit_val_batches=2,
        profiler=pytorch_profiler,
        accelerator="ddp",
        gpus=2,
    )
    Trainer(**trainer_kwargs).fit(BoringModel())

    assert len(pytorch_profiler.summary()) > 0
    assert set(pytorch_profiler.profiled_actions) == {'training_step_and_backward', 'validation_step'}

    fit_files = sorted(name for name in os.listdir(pytorch_profiler.dirpath) if "fit" in name)
    rank = int(os.getenv("LOCAL_RANK", "0"))
    expected = f"fit-profiler-{rank}.txt"
    # The sorted listing is indexed by rank, so rank N's file must sit at position N.
    assert files_match(fit_files, rank, expected) if False else fit_files[rank] == expected

    report_path = Path(os.path.join(pytorch_profiler.dirpath, expected))
    report = report_path.read_text("utf-8")
    assert len(report) > 0
def test_pytorch_profiler_nested_emit_nvtx(tmpdir):
    """Run a fast-dev fit on one GPU with a profiler created with ``emit_nvtx=True``.

    The test makes no explicit assertions; it passes when ``fit`` completes
    without raising.
    """
    nvtx_profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)
    trainer = Trainer(fast_dev_run=True, profiler=nvtx_profiler, gpus=1)
    trainer.fit(BoringModel())
def test_pytorch_profiler_trainer_test(tmpdir):
    """Ensure ``trainer.test`` records ``test_step`` events and writes a report.

    After testing a ``BoringModel`` for two batches, the test asserts that:

    * at least one function event is named ``test_step``;
    * ``test-<filename>.txt`` in the profiler's ``dirpath`` is non-empty;
    * when ``_KINETO_AVAILABLE`` is set, a ``.json`` file whose name contains
      ``test-<filename>`` exists in ``tmpdir``.
    """
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profile", schedule=None)
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_test_batches=2,
        profiler=pytorch_profiler,
    )
    trainer.test(model)

    # ``any`` states the intent (at least one match) directly instead of a truthy sum.
    assert any(e.name == 'test_step' for e in pytorch_profiler.function_events)

    # The text report is checked once; the original repeated this identical
    # assertion a second time after the JSON check.
    path = pytorch_profiler.dirpath / f"test-{pytorch_profiler.filename}.txt"
    assert path.read_text("utf-8")

    if _KINETO_AVAILABLE:
        files = sorted(file for file in os.listdir(tmpdir) if file.endswith('.json'))
        assert any(f'test-{pytorch_profiler.filename}' in f for f in files)
def test_pytorch_profiler_trainer(fn, step_name, boring_model_cls, tmpdir):
    """Ensure a trainer entry point records its step events and writes a report.

    Args:
        fn: name of the ``Trainer`` method to invoke (looked up with ``getattr``).
        step_name: prefix of the expected event name, ``f"{step_name}_step"``.
        boring_model_cls: callable returning the model under test.
        tmpdir: directory used as the profiler ``dirpath`` and trainer root dir.

    The test asserts that at least one matching event was recorded, that
    ``<fn>-<filename>.txt`` is non-empty, and, when ``_KINETO_AVAILABLE`` is
    set, that a ``.json`` file whose name contains ``<fn>-<filename>`` exists
    in ``tmpdir``.
    """
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profile", schedule=None)
    model = boring_model_cls()
    # Reuse the training dataloader so the predict entry point has data to consume.
    model.predict_dataloader = model.train_dataloader
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_test_batches=2, profiler=pytorch_profiler)
    getattr(trainer, fn)(model)

    # ``any`` states the intent (at least one match) directly instead of a truthy sum.
    assert any(e.name == f"{step_name}_step" for e in pytorch_profiler.function_events)

    # The text report is checked once; the original repeated this identical
    # assertion a second time after the JSON check.
    path = pytorch_profiler.dirpath / f"{fn}-{pytorch_profiler.filename}.txt"
    assert path.read_text("utf-8")

    if _KINETO_AVAILABLE:
        files = sorted(file for file in os.listdir(tmpdir) if file.endswith(".json"))
        assert any(f"{fn}-{pytorch_profiler.filename}" in f for f in files)
def test_pytorch_profiler_nested(tmpdir):
    """Check the operations recorded for nested ``profile`` contexts.

    Context ``"a"`` encloses the sibling contexts ``"b"`` and ``"c"``; the
    names recorded under each action are compared with a per-torch-version
    expectation.
    """
    report_file = os.path.join(tmpdir, "profiler.txt")
    pytorch_profiler = PyTorchProfiler(
        profiled_functions=["a", "b", "c"],
        use_cuda=False,
        output_filename=report_file,
    )

    with pytorch_profiler.profile("a"):
        a = torch.ones(42)
        with pytorch_profiler.profile("b"):
            b = torch.zeros(42)
        with pytorch_profiler.profile("c"):
            _ = a + b

    pa = pytorch_profiler.profiled_actions
    torch_version = LooseVersion(torch.__version__)

    if torch_version >= LooseVersion("1.8.0"):
        # From PyTorch 1.8.0 on, fewer operations are traced.
        expected_ = {
            'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'add'],
            'b': ['zeros', 'empty', 'zero_'],
            'c': ['add'],
        }
    elif torch_version >= LooseVersion("1.6.0"):
        # From PyTorch 1.6.0 on, more operations are traced.
        expected_ = {
            'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'fill_', 'add', 'empty'],
            'b': ['zeros', 'empty', 'zero_', 'fill_'],
            'c': ['add', 'empty'],
        }
    else:
        expected_ = {
            'a': ['add'],
            'b': [],
            'c': ['add'],
        }

    strip_namespace = torch_version >= LooseVersion("1.7.1")
    for action in ('a', 'b', 'c'):
        recorded = [event.name for event in pa[action]]
        if strip_namespace:
            # Drop the "aten::" prefix so the names line up with the expectations above.
            recorded = [name.replace("aten::", "") for name in recorded]
        pa[action] = recorded
        assert pa[action] == expected_[action]
def test_pytorch_profiler_nested(tmpdir):
    """Check the set of event names recorded for nested ``profile`` contexts.

    Context ``"a"`` encloses the sibling contexts ``"b"`` and ``"c"``. The
    expected names are the three context labels plus the tensor operations,
    which carry an ``aten::`` prefix when ``_TORCH_GREATER_EQUAL_1_7`` is set.
    """
    pytorch_profiler = PyTorchProfiler(
        record_functions={"a", "b", "c"},
        use_cuda=False,
        dirpath=tmpdir,
        filename="profiler",
        schedule=None,
    )

    with pytorch_profiler.profile("a"):
        a = torch.ones(42)
        with pytorch_profiler.profile("b"):
            b = torch.zeros(42)
        with pytorch_profiler.profile("c"):
            _ = a + b

    pytorch_profiler.describe()
    events_name = {event.name for event in pytorch_profiler.function_events}

    op_prefix = "aten::" if _TORCH_GREATER_EQUAL_1_7 else ""
    op_names = {op_prefix + op for op in ("add", "empty", "fill_", "ones", "zero_", "zeros")}
    expected = {"a", "b", "c"} | op_names

    assert events_name == expected, (events_name, torch.__version__, platform.system())
def build_profiler(name):
    """Return the profiler instance selected by ``name``.

    Args:
        name: ``'inference'``, ``'pytorch'`` or ``None``.

    Returns:
        An ``InferenceProfiler`` for ``'inference'``, a ``PyTorchProfiler``
        for ``'pytorch'``, or a ``PassThroughProfiler`` for ``None``.

    Raises:
        ValueError: if ``name`` is none of the accepted values.
    """
    if name == 'inference':
        return InferenceProfiler()

    if name == 'pytorch':
        # Imported only when this profiler is actually requested.
        from pytorch_lightning.profiler import PyTorchProfiler

        return PyTorchProfiler(use_cuda=True, profile_memory=True, row_limit=100)

    if name is None:
        return PassThroughProfiler()

    raise ValueError(f'Invalid profiler: {name}')
def build_profiler(name):
    """Return the profiler instance selected by ``name``.

    Args:
        name: ``'inference'``, ``'pytorch'`` or ``None``.

    Returns:
        An ``InferenceProfiler`` for ``'inference'``, a ``PyTorchProfiler``
        for ``'pytorch'``, or a ``PassThroughProfiler`` for ``None``.

    Raises:
        ValueError: if ``name`` is none of the accepted values.
    """
    if name == 'inference':
        return InferenceProfiler()

    if name == 'pytorch':
        # TODO: this profiler will be introduced after upgrading pl dependency to 1.3.0 @zehong
        from pytorch_lightning.profiler import PyTorchProfiler

        return PyTorchProfiler(use_cuda=True, profile_memory=True, row_limit=100)

    if name is None:
        return PassThroughProfiler()

    raise ValueError(f'Invalid profiler: {name}')
def test_pytorch_profiler_deepcopy(tmpdir):
    """Check that a profiler that has been started and described can be deep-copied."""
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profiler", schedule=None)

    pytorch_profiler.start("on_train_start")
    torch.tensor(1)
    pytorch_profiler.describe()

    clone = deepcopy(pytorch_profiler)
    assert clone
def test_pytorch_profiler_trainer_validate(tmpdir):
    """Check that ``trainer.validate`` records ``validation_step`` events and writes a report."""
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profile", schedule=None)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_val_batches=2,
        profiler=pytorch_profiler,
    )
    trainer.validate(BoringModel())

    matching_events = sum(event.name == "validation_step" for event in pytorch_profiler.function_events)
    assert matching_events

    report_name = f"validate-{pytorch_profiler.filename}.txt"
    report_path = pytorch_profiler.dirpath / report_name
    assert report_path.read_text("utf-8")
def test_pytorch_profiler_trainer_fit(fast_dev_run, boring_model_cls, tmpdir):
    """Ensure ``trainer.fit`` records ``validation_step`` events and writes a report.

    Args:
        fast_dev_run: forwarded to ``Trainer(fast_dev_run=...)``.
        boring_model_cls: callable returning the model under test.
        tmpdir: directory used as the profiler ``dirpath`` and trainer root dir.

    The test asserts that at least one ``validation_step`` event was recorded,
    that ``fit-<filename>.txt`` is non-empty, and, when ``_KINETO_AVAILABLE``
    is set, that a ``.json`` file whose name contains ``fit-<filename>``
    exists in ``tmpdir``.
    """
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profile")
    model = boring_model_cls()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, fast_dev_run=fast_dev_run, profiler=pytorch_profiler)
    trainer.fit(model)

    # ``any`` states the intent (at least one match) directly instead of a truthy sum.
    assert any(e.name == "validation_step" for e in pytorch_profiler.function_events)

    # The text report is checked once; the original repeated this identical
    # assertion a second time after the JSON check.
    path = pytorch_profiler.dirpath / f"fit-{pytorch_profiler.filename}.txt"
    assert path.read_text("utf-8")

    if _KINETO_AVAILABLE:
        files = sorted(file for file in os.listdir(tmpdir) if file.endswith(".json"))
        assert any(f"fit-{pytorch_profiler.filename}" in f for f in files)
def test_pytorch_profiler_trainer_predict(tmpdir):
    """Check that ``trainer.predict`` records ``predict_step`` events and writes a report."""
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profile", schedule=None)

    model = BoringModel()
    # Prediction reuses the training dataloader.
    model.predict_dataloader = model.train_dataloader

    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_predict_batches=2,
        profiler=pytorch_profiler,
    )
    Trainer(**trainer_kwargs).predict(model)

    matching_events = sum(event.name == 'predict_step' for event in pytorch_profiler.function_events)
    assert matching_events

    report_name = f"predict-{pytorch_profiler.filename}.txt"
    report_path = pytorch_profiler.dirpath / report_name
    assert report_path.read_text("utf-8")
def test_pytorch_profiler_nested(tmpdir):
    """Check the set of event names recorded for nested ``profile`` contexts.

    Context ``"a"`` encloses the sibling contexts ``"b"`` and ``"c"``. The
    expected names depend on the torch version and, for versions older than
    1.6.0, on whether the platform is Windows.
    """
    pytorch_profiler = PyTorchProfiler(
        record_functions={"a", "b", "c"},
        use_cuda=False,
        dirpath=tmpdir,
        filename="profiler",
        schedule=None,
    )

    with pytorch_profiler.profile("a"):
        a = torch.ones(42)
        with pytorch_profiler.profile("b"):
            b = torch.zeros(42)
        with pytorch_profiler.profile("c"):
            _ = a + b

    pytorch_profiler.describe()
    events_name = {event.name for event in pytorch_profiler.function_events}

    # Newest version first: each later expectation supersedes the older ones.
    torch_version = Version(torch.__version__)
    if torch_version >= Version("1.7.0"):
        expected = {
            'aten::zeros', 'aten::add', 'aten::zero_', 'c', 'b', 'a', 'aten::fill_', 'aten::empty', 'aten::ones'
        }
    elif torch_version >= Version("1.6.0"):
        expected = {'add', 'zeros', 'ones', 'zero_', 'b', 'fill_', 'c', 'a', 'empty'}
    elif platform.system() == "Windows":
        expected = {'a', 'add', 'b', 'c', 'profiler::_record_function_enter', 'profiler::_record_function_exit'}
    else:
        expected = {
            'signed char', 'add', 'profiler::_record_function_exit', 'bool', 'char', 'profiler::_record_function_enter'
        }

    assert events_name == expected, (events_name, torch.__version__, platform.system())
def test_profile_callbacks(tmpdir):
    """Check that two callbacks of the same type each get their own profiler event.

    Two ``EarlyStopping`` callbacks with different monitors are attached, and
    an ``on_validation_start`` event is expected for each of them.
    """
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profiler")
    callbacks = [EarlyStopping("val_loss"), EarlyStopping("train_loss")]
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=1,
        profiler=pytorch_profiler,
        callbacks=callbacks,
    )
    trainer.fit(BoringModel())

    val_loss_event = "[pl][profile][Callback]EarlyStopping{'monitor': 'val_loss', 'mode': 'min'}.on_validation_start"
    train_loss_event = (
        "[pl][profile][Callback]EarlyStopping{'monitor': 'train_loss', 'mode': 'min'}.on_validation_start"
    )

    val_loss_hits = sum(event.name == val_loss_event for event in pytorch_profiler.function_events)
    assert val_loss_hits

    train_loss_hits = sum(event.name == train_loss_event for event in pytorch_profiler.function_events)
    assert train_loss_hits
def process_args(args=None, return_io=False):
    """
    Process arguments for running training

    Args:
        args: an ``argparse.Namespace``; any other value is handed to
            ``parse_args`` to produce one.
        return_io: when true, ``io`` is inserted into the returned tuple
            before the data module.

    Returns:
        ``(model, args, targs, data_mod)``, or
        ``(model, args, targs, io, data_mod)`` when ``return_io`` is true.
        ``targs`` is the dict of keyword arguments assembled here
        (presumably for the Lightning ``Trainer`` -- confirm against caller).

    Raises:
        ValueError: if more than one GPU is requested without ``--lsf`` or
            ``--slurm``, or if a manifold loss is combined with a features
            checkpoint.

    Side effects:
        Rebinds the module-level ``RANK`` and ``SIZE`` when a cluster
        environment is selected, mutates ``args`` in place (several attributes
        are added, changed or deleted), and may call
        ``torch.cuda.set_device``.
    """
    global RANK
    global SIZE

    if not isinstance(args, argparse.Namespace):
        args = parse_args(args)

    args.loader_kwargs = dict()

    targs = dict(max_epochs=args.epochs)
    targs['accumulate_grad_batches'] = args.accumulate

    env = None

    if args.ipu:
        targs['accelerator'] = 'ipu'
        targs['devices'] = process_gpus(args.gpus)
    else:
        targs['gpus'] = process_gpus(args.gpus)
        targs['num_nodes'] = args.num_nodes

        if args.lsf:
            ##########################################################################################
            # Currently coding against pytorch-lightning 1.4.3
            ##########################################################################################
            if args.num_workers > 4:
                print0("num_workers (-k) > 4 can lead to hanging on Summit -- setting to 4", file=sys.stderr)
                args.num_workers = 4
            args.loader_kwargs['num_workers'] = 1  # Set as a default. This will get overridden elsewhere
            args.loader_kwargs['multiprocessing_context'] = 'spawn'
            env = LSFEnvironment()
        elif args.slurm:
            env = SLURMEnvironment()

        if env is not None:
            try:
                RANK = env.global_rank()
                SIZE = env.world_size()
            except Exception:
                # Best effort: fall back to single-process values, but let
                # KeyboardInterrupt/SystemExit propagate (a bare ``except``
                # would have swallowed those too).
                print(">>> Could not get global rank -- setting RANK to 0 and SIZE to 1", file=sys.stderr)
                RANK = 0
                SIZE = 1

        # 'gpus' only exists in ``targs`` on this (non-IPU) path, so the
        # accelerator selection stays inside this branch.
        if targs['gpus'] is not None:
            targs['accelerator'] = 'gpu'
            if targs['gpus'] == 1:
                targs['devices'] = 1
            else:
                if env is None:
                    raise ValueError('Please specify environment (--lsf or --slurm) if using more than one GPU')
                torch.cuda.set_device(env.local_rank())
                targs['devices'] = targs['gpus']
                targs['strategy'] = DDPStrategy(
                    find_unused_parameters=False,
                    cluster_environment=env,
                )
                print("---- Rank %s  -  Using GPUAccelerator with DDPStrategy" % env.global_rank(), file=sys.stderr)
        else:
            targs['accelerator'] = 'cpu'

    del args.gpus

    if args.sanity:
        # A string value is an explicit batch limit; any other truthy value
        # selects the default of 4000.
        if isinstance(args.sanity, str):
            args.sanity = int(args.sanity)
        else:
            args.sanity = 4000
        targs['limit_train_batches'] = args.sanity
        targs['limit_val_batches'] = args.sanity // 4

    if args.lr_find:
        targs['auto_lr_find'] = True
    del args.lr_find

    if args.checkpoint is not None:
        if os.path.exists(args.checkpoint):
            targs['resume_from_checkpoint'] = args.checkpoint
        else:
            # f-string so the offending path actually appears in the warning;
            # the original literal printed "{args.checkpoint}" verbatim.
            warnings.warn(f"Ignoring -c/--checkpoint argument because {args.checkpoint} does not exist.")
            args.checkpoint = None

    if args.cuda_profile:
        # Zero-pad the rank to the width of SIZE so per-rank filenames sort.
        targs['profiler'] = PyTorchProfiler(filename=f'pytorch_prof.{RANK:0{len(str(SIZE))}}', emit_nvtx=True)

    targs['replace_sampler_ddp'] = False

    # NOTE(review): this reset discards the 'num_workers' and
    # 'multiprocessing_context' entries set in the --lsf branch above --
    # confirm that is intended.
    args.loader_kwargs = dict()

    # make sure we are classifying if we are using adding classifier layers
    # to a resnet features model
    if args.features_checkpoint is not None:
        if args.manifold:
            raise ValueError('Cannot use manifold loss (i.e. -M) if adding classifier (i.e. -F)')
        args.classify = True

    data_mod = DeepIndexDataModule(args, keep_open=True, seed=args.seed + RANK, rank=RANK, size=SIZE)

    # if classification problem, use the number of taxa as the number of outputs
    if args.classify:
        args.n_outputs = data_mod.dataset.n_outputs

    args.input_nc = 136 if args.tnf else len(data_mod.dataset.vocab)

    model = process_model(args, taxa_table=data_mod.dataset.difile.taxa_table)

    if args.num_workers > 0:
        data_mod.dataset.close()

    ret = [model, args, targs]
    if return_io:
        # NOTE(review): ``io`` is never assigned in this function, so this
        # appends whatever module-level ``io`` is in scope -- confirm.
        ret.append(io)

    ret.append(data_mod)

    return tuple(ret)
def test_pytorch_profiler_raises(pytorch_profiler):
    """Check that passing both ``profiled_functions`` and ``record_functions`` is rejected."""
    expected_message = "profiled_functions` and `PyTorchProfiler.record"
    conflicting_kwargs = dict(profiled_functions=["a"], record_functions=["b"])

    with pytest.raises(MisconfigurationException, match=expected_message):
        PyTorchProfiler(**conflicting_kwargs)
def pytorch_profiler(tmpdir):
    """Create a ``PyTorchProfiler`` with ``dirpath=tmpdir`` and ``filename="profiler"``."""
    profiler = PyTorchProfiler(dirpath=tmpdir, filename="profiler")
    return profiler
def test_v1_5_0_legacy_profiler_argument():
    """Check that the legacy ``profiled_functions`` argument emits a deprecation warning."""
    deprecation_pattern = "renamed to `record_functions` in v1.3"
    with pytest.deprecated_call(match=deprecation_pattern):
        PyTorchProfiler(profiled_functions=[])
def cli_main():
    """Train ``UTWRS`` with 3-fold cross-validation on the BBC and/or OVSD data.

    Command-line arguments are parsed here. ``--datasets`` and
    ``--base_folders`` are matched by position: the folder for a dataset is
    the ``--base_folders`` entry at the index where that dataset appears in
    ``--datasets``.

    For each fold a data module, model, Neptune logger, checkpoint callback
    and profiler are created, the model is fitted, and the best checkpoints
    and the best score are logged to Neptune.

    Raises:
        UnsupportedOperation: if neither ``BBC`` nor ``OVSD`` is listed in
            ``--datasets``.
    """
    pl.seed_everything(1234)

    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument('--base_folders', nargs='+', default=[], required=True)
    parser.add_argument('--datasets', nargs='+', default=[], required=True)
    parser.add_argument('--shuffle', action="store_true", default=False)
    parser.add_argument('--use_tpu', action="store_true", default=False)
    parser.add_argument('--memory_profile', action="store_true", default=False)
    parser.add_argument('--tags', nargs='*', default=[])

    parser = UTWRS.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # ------------
    # data path
    # ------------
    file_paths = []
    max_seq_length = 0
    max_summary_length = 0

    # Same order as before (BBC, then OVSD); both datasets are handled identically.
    for dataset_name in ("BBC", "OVSD"):
        if dataset_name not in args.datasets:
            continue
        base_folder = args.base_folders[args.datasets.index(dataset_name)]
        file_paths.append(get_file_paths(base_folder))
        max_seq_length = max(get_max_seq_len(base_folder), max_seq_length)
        max_summary_length = max(get_max_summary_len(base_folder), max_summary_length)

    if not file_paths:
        # The flag is spelled ``--datasets``; the message now matches it.
        raise UnsupportedOperation("--datasets only supports BBC or OVSD.")

    # ------------
    # data args
    # ------------
    # Add <START> and <END> token
    args.enc_seq_len = max_seq_length + 2
    args.dec_seq_len = max_summary_length + 2

    # ------------
    # Split train/test
    # ------------
    print(f"\nTotal number of videos: {sum(len(paths) for paths in file_paths)}")
    print(f"Max length of videos: {max_seq_length}")
    print(f"Max length of summary: {max_summary_length}\n")

    train_paths = []
    test_paths = []  # held-out paths; collected but not used further in this function
    for dataset in file_paths:
        np.random.shuffle(dataset)
        # The last two shuffled entries of every dataset are held out.
        train_paths.extend(dataset[:-2])
        test_paths.extend(dataset[-2:])

    # ------------
    # K-fold
    # ------------
    kfold = StratifiedKFold(n_splits=3, shuffle=False)

    # Generate data index for kfold: X is a placeholder, Y labels every
    # training path with the index of the dataset it came from.
    X = [0] * len(train_paths)
    Y = []
    for i, dataset in enumerate(file_paths):
        Y += [i] * (len(dataset) - 2)

    train_paths = np.array(train_paths)

    for fold, (train, val) in enumerate(tqdm(kfold.split(X, Y), total=kfold.get_n_splits())):
        # A stray literal "f" inside these f-strings used to print "f[...]".
        print(f"Training data: {train_paths[train]}")
        print(f"Validation data: {train_paths[val]}")

        # ------------
        # data loader
        # ------------
        data_loader = OVSDBBCDataModule(max_seq_length,
                                        max_summary_length,
                                        args.d_model,
                                        train_paths[train],
                                        train_paths[val],
                                        shuffle=args.shuffle,
                                        use_tpu=args.use_tpu)

        # ------------
        # model
        # ------------
        model = UTWRS(args, SRC_PAD_TOKEN, TRG_PAD_TOKEN)

        # ------------
        # neptune logger
        # ------------
        neptune_logger = NeptuneLogger(project_name="guyleaf/UTWRS",
                                       params=vars(args),
                                       experiment_name=f"{fold+1}-fold_logger",
                                       tags=args.tags)
        neptune_logger.experiment.log_text("training_data", ','.join(train_paths[train]))
        neptune_logger.experiment.log_text("validation_data", ','.join(train_paths[val]))

        # ------------
        # checkpoint
        # ------------
        model_checkpoint = ModelCheckpoint(dirpath="checkpoints",
                                           filename='{epoch:02d}_{test_loss:.2f}',
                                           save_top_k=3,
                                           monitor='test_loss',
                                           mode='min')

        # ------------
        # profiler
        # ------------
        profiler = PyTorchProfiler(output_filename=f"profiles/{fold}-fold_profiler",
                                   profile_memory=True,
                                   sort_by_key="cuda_memory_usage",
                                   row_limit=50,
                                   enabled=args.memory_profile)

        # ------------
        # training
        # ------------
        trainer = pl.Trainer.from_argparse_args(args,
                                                logger=neptune_logger,
                                                profiler=profiler,
                                                checkpoint_callback=model_checkpoint,
                                                track_grad_norm=2,
                                                log_every_n_steps=100)
        trainer.fit(model, data_loader)

        # Log model checkpoint to Neptune. The loop variable is named
        # distinctly so it no longer overwrites the fold index.
        for ckpt_path in model_checkpoint.best_k_models:
            model_name = 'checkpoints/' + ckpt_path.split('/')[-1]
            neptune_logger.experiment.log_artifact(ckpt_path, model_name)

        # Log score of the best model checkpoint.
        neptune_logger.experiment.set_property('best_model_loss', model_checkpoint.best_model_score.tolist())

        # NOTE(review): the profiler above is enabled by ``args.memory_profile``
        # but this upload is gated on ``args.profiler`` -- confirm which flag is intended.
        if args.profiler:
            neptune_logger.experiment.log_artifact('profiles')
def pytorch_profiler(tmpdir):
    """Create a ``PyTorchProfiler`` whose output file is ``profiler.txt`` inside ``tmpdir``, with ``local_rank=0``."""
    output_path = os.path.join(tmpdir, "profiler.txt")
    return PyTorchProfiler(output_filename=output_path, local_rank=0)