def test_simple_profiler_iterable_durations(tmpdir, action: str, expected: list):
    """Ensure the reported durations are reasonably accurate."""

    def _sleep_generator(durations):
        """the profile_iterable method needs an iterable in which we can ensure
        that we're properly timing how long it takes to call __next__"""
        for duration in durations:
            time.sleep(duration)
            yield duration

    def _get_python_cprofile_total_duration(profile):
        return sum(x.inlinetime for x in profile.getstats())

    simple_profiler = SimpleProfiler()
    iterable = _sleep_generator(expected)

    with pytest.deprecated_call(
        match="`SimpleProfiler.profile_iterable` is deprecated in v1.6 and will be removed in v1.8."
    ):
        for _ in simple_profiler.profile_iterable(iterable, action):
            pass

    # we exclude the last item in the recorded durations since that's when StopIteration is raised
    np.testing.assert_allclose(simple_profiler.recorded_durations[action][:-1], expected, rtol=0.2)

    advanced_profiler = AdvancedProfiler(dirpath=tmpdir, filename="profiler")
    iterable = _sleep_generator(expected)

    with pytest.deprecated_call(
        match="`AdvancedProfiler.profile_iterable` is deprecated in v1.6 and will be removed in v1.8."
    ):
        for _ in advanced_profiler.profile_iterable(iterable, action):
            pass

    recorded_total_duration = _get_python_cprofile_total_duration(
        advanced_profiler.profiled_actions[action])
    expected_total_duration = np.sum(expected)
    np.testing.assert_allclose(recorded_total_duration, expected_total_duration, rtol=0.2)
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    setup_seed(cfg.random_seed)
    model = LightningModel(cfg)
    checkpoint_callback = ModelCheckpoint(
        filepath=f"{cfg.checkpoint_path}/{cfg.name}/{cfg.version}/"
        f"{cfg.name}_{cfg.version}_{{epoch}}_{{avg_val_loss:.3f}}_{{ade:.3f}}_{{fde:.3f}}_{{fiou:.3f}}",
        save_last=True,
        save_top_k=8,
        verbose=True,
        monitor='fiou',
        mode='max',
        prefix='')
    lr_logger_callback = LearningRateLogger(logging_interval='step')
    logger = TensorBoardLogger(save_dir=cfg.log_path, name=cfg.name, version=cfg.version)
    logger.log_hyperparams(model.hparams)
    profiler = SimpleProfiler() if cfg.simple_profiler else AdvancedProfiler()
    check_val_every_n_epoch = cfg.check_val_every_n_epoch if hasattr(
        cfg, 'check_val_every_n_epoch') else 1
    trainer = pl.Trainer(
        gpus=cfg.num_gpus,
        max_epochs=cfg.max_epochs,
        logger=logger,
        profiler=profiler,  # this line won't work in a multi-GPU setting.
        weights_summary="top",
        gradient_clip_val=cfg.gradient_clip_val,
        callbacks=[lr_logger_callback],
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        accumulate_grad_batches=cfg.batch_size_times,
        check_val_every_n_epoch=check_val_every_n_epoch)
    if (not (args.train or args.test)) or args.train:
        shutil.copy(
            args.config,
            os.path.join(cfg.log_path, cfg.name, cfg.version,
                         args.config.split('/')[-1]))
        if cfg.load_from_checkpoint is not None:
            model_ckpt = partial_state_dict(model, cfg.load_from_checkpoint)
            model.load_state_dict(model_ckpt)
        trainer.fit(model)
    if args.test:
        if cfg.test_checkpoint is not None:
            model_ckpt = partial_state_dict(model, cfg.test_checkpoint)
            model.load_state_dict(model_ckpt)
        trainer.test(model)
def main(args):
    print(args)
    if args.load_from_checkpoint is None:
        raise ValueError('`load-from-checkpoint` should be specified.')
    model = TripletVAE(args.load_from_checkpoint,
                       n_hidden=args.n_hidden,
                       n_layers=args.n_layers,
                       learning_rate=args.learning_rate,
                       vae_learning_rate=args.vae_lr,
                       scheduler=args.scheduler)
    print(model)
    if args.profile:
        profiler = AdvancedProfiler()
    else:
        profiler = None
    dm = TripletDataModule(args.train_biom,
                           args.test_biom,
                           args.val_biom,
                           metadata=args.sample_metadata,
                           batch_category=args.batch_category,
                           class_category=args.class_category,
                           segment_triples=args.segment_triples,
                           batch_size=args.batch_size,
                           num_workers=args.num_workers)
    ckpt_path = os.path.join(args.output_directory, "checkpoints")
    checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path,
                                          period=1,
                                          monitor='val/triplet_loss',
                                          mode='min',
                                          verbose=True)
    os.mkdir(args.output_directory)
    tb_logger = pl_loggers.TensorBoardLogger(f'{args.output_directory}/logs/')
    # save hyper-parameters to yaml file
    with open(f'{args.output_directory}/hparams.yaml', 'w') as outfile:
        yaml.dump(model._hparams, outfile, default_flow_style=False)
    trainer = Trainer(max_epochs=args.epochs,
                      gpus=args.gpus,
                      check_val_every_n_epoch=10,
                      gradient_clip_val=args.grad_clip,
                      profiler=profiler,
                      logger=tb_logger,
                      callbacks=[checkpoint_callback])
    trainer.fit(model, dm)
    ckpt_path = args.output_directory + '/last_ckpt.pt'
    trainer.save_checkpoint(ckpt_path)

    # Perform KNN classification
    batch = next(iter(dm.test_dataloader()))
    res = model.test_step(batch, 0)['test/knn_results']
    with open(f'{args.output_directory}/cross_validation.csv', 'w') as f:
        f.write(res)
def test_advanced_profiler():
    def _get_duration(profile):
        return sum([x.totaltime for x in profile.getstats()])

    p = AdvancedProfiler()

    with p.profile("a"):
        time.sleep(3)
    with p.profile("a"):
        time.sleep(1)
    with p.profile("b"):
        time.sleep(2)
    with p.profile("c"):
        time.sleep(1)

    # different environments have different precision when it comes to time.sleep()
    # see: https://github.com/PyTorchLightning/pytorch-lightning/issues/796
    a_duration = _get_duration(p.profiled_actions["a"])
    np.testing.assert_allclose(a_duration, [4], rtol=0.2)
    b_duration = _get_duration(p.profiled_actions["b"])
    np.testing.assert_allclose(b_duration, [2], rtol=0.2)
    c_duration = _get_duration(p.profiled_actions["c"])
    np.testing.assert_allclose(c_duration, [1], rtol=0.2)
def train(omegaConf: DictConfig) -> LightningModule:
    # Misc part
    if omegaConf['runner']['verbose'] is True:
        print(OmegaConf.to_yaml(omegaConf))
    pl.seed_everything(omegaConf['runner']['seed'])

    # Runner part
    runner = make_runner(omegaConf['runner'])

    if "auto_lr_find" in omegaConf['trainer'] and omegaConf['trainer']['auto_lr_find'] is True:
        runner = custom_lr_finder(runner, omegaConf)

    # By this point omegaConf has already been validated by OmegaConf,
    # so we can extract primitives to use with other libs
    config = OmegaConf.to_container(omegaConf)
    assert isinstance(config, dict)

    config['trainer']['default_root_dir'] = check_default_root_dir(config)
    config['trainer']['checkpoint_callback'] = build_checkpoint_callback(config)
    if 'logger' in config['trainer']:
        config['trainer']['logger'] = build_logger(config)
    if 'deterministic' in config['trainer']:
        config['trainer']['deterministic'] = True
    if 'profiler' in config['trainer'] and config['trainer']['profiler'] is True:
        config['trainer']['profiler'] = AdvancedProfiler()
    if 'scheduler' in config['runner'] and config['runner']['scheduler'] is not None:
        lr_monitor = LearningRateLogger(logging_interval='step')
        config['trainer']['callbacks'] = [lr_monitor]

    # ###
    # Early stopping
    # It is breaking neptune logging somehow; it seems to override the current timestep by 1
    # ###
    # early_stop_callback = EarlyStopping(
    #     monitor='val_accuracy', min_delta=0.00, patience=10, verbose=False, mode='max'
    # )
    # config['trainer']['early_stop_callback'] = early_stop_callback

    trainer = pl.Trainer(**config['trainer'])
    trainer.fit(runner)
    return runner
def create_profiler(profiler_params, checkpoint_path):
    if profiler_params is None:
        return None
    else:
        if profiler_params.save_profile:
            output_filename = checkpoint_path / 'profile.log'
        else:
            output_filename = None
        if profiler_params.name == 'simple':
            return SimpleProfiler(output_filename)
        elif profiler_params.name == 'advanced':
            return AdvancedProfiler(output_filename)
        else:
            raise ValueError(
                'Given type of profiler is not supported. Use `simple` or `advanced`'
            )
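# A minimal usage sketch for the factory above; `SimpleNamespace` stands in for
# the hypothetical `profiler_params` object (assumed to expose `.name` and
# `.save_profile`), and `checkpoint_path` is assumed to be a pathlib.Path so
# that the `/` join works as written.
from pathlib import Path
from types import SimpleNamespace

params = SimpleNamespace(name='advanced', save_profile=True)
# returns an AdvancedProfiler whose report goes to ./checkpoints/profile.log
profiler = create_profiler(params, Path('./checkpoints'))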
def main(args):
    seed_everything(args.seed)
    model = NNCF(args)
    checkpoint_callback = ModelCheckpoint(
        filepath='./checkpoints/nncf_{step}-{val_loss:.3f}',
        save_top_k=-1,
        verbose=True,
        monitor='val_acc',
        mode='max',
        prefix='',
        period=1,
    )
    tb_logger = TensorBoardLogger(save_dir=os.getcwd(),
                                  version=1,
                                  name='lightning_logs')
    trainer = pl.Trainer(
        fast_dev_run=True if args.dev_run else False,
        weights_summary=args.weights_summary,
        num_sanity_val_steps=args.num_val_sanity,
        gpus=args.gpus,
        distributed_backend='dp',
        benchmark=True,
        amp_level='O1',
        precision=16 if args.fp16 else 32,
        deterministic=False,
        accumulate_grad_batches=args.accum_batches,
        auto_lr_find=True if args.auto_lr else False,
        checkpoint_callback=checkpoint_callback,
        # early_stop_callback=early_stop,
        # callbacks=callbacks,
        gradient_clip_val=args.grad_clip_val,
        limit_val_batches=args.limit_val_batches,
        # max_steps=args.num_steps,
        max_epochs=args.num_epochs,
        val_check_interval=args.val_check_interval,
        profiler=AdvancedProfiler(output_filename='profile_report.txt') if args.profile else None,
        track_grad_norm=2 if args.track_grads else -1,
        logger=tb_logger)
    trainer.fit(model)
def process(args):
    seed_everything(2299)
    dict_args = vars(args)
    if args.pretrained_model is not None:
        pm_name = Path(args.pretrained_model).parents[1].name
        version = f"{args.name}_{args.exp}_{pm_name}_fold-{args.fold}"
    else:
        version = f"{args.name}_{args.exp}_fold-{args.fold}"

    # Data
    dm = CTRPDataModule.from_argparse_args(args)

    # Model
    if args.exp == 'vanilla':
        model = StandardNetwork(**dict_args)
    elif args.exp == 'transformer':
        model = TransformerNetwork(**dict_args)
    else:
        model = ConditionalNetwork(**dict_args)

    # Callbacks
    logger = TensorBoardLogger(save_dir=args.default_root_dir,
                               version=version,
                               name='lightning_logs')
    early_stop = EarlyStopping(monitor='val_loss',
                               min_delta=0.001,
                               patience=20,
                               verbose=False,
                               mode='min')
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min')
    profiler = AdvancedProfiler(filename='profile')
    # NOTE: the line below overrides the AdvancedProfiler instance above,
    # so the Trainer actually runs with the built-in 'simple' profiler.
    profiler = 'simple'

    # Trainer
    start = datetime.now()
    trainer = Trainer.from_argparse_args(
        args,
        default_root_dir=logger.log_dir,
        logger=logger,
        callbacks=[early_stop, checkpoint_callback],
        profiler=profiler)
    trainer.fit(model, dm)
    print("Completed fold {} in {}".format(args.fold, str(datetime.now() - start)))
    return
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    setup_seed(cfg.random_seed)
    model = LightningTransformer(cfg)
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(
            cfg.checkpoint_path, cfg.name, cfg.version,
            "{}_{}_{{epoch}}_{{val_loss_per_word}}".format(cfg.name, cfg.version)),
        save_last=True,
        save_top_k=8,
        verbose=True,
        monitor='val_loss_per_word',
        mode='min',
        prefix='')
    lr_logger_callback = LearningRateLogger(logging_interval='step')
    logger = TensorBoardLogger(save_dir=cfg.log_path, name=cfg.name, version=cfg.version)
    logger.log_hyperparams(model.hparams)
    profiler = SimpleProfiler() if cfg.simple_profiler else AdvancedProfiler()
    trainer = pl.Trainer(gpus=cfg.num_gpus,
                         max_epochs=cfg.max_epochs,
                         logger=logger,
                         profiler=profiler,
                         weights_summary="top",
                         callbacks=[lr_logger_callback],
                         checkpoint_callback=checkpoint_callback,
                         resume_from_checkpoint=cfg.resume_from_checkpoint,
                         accumulate_grad_batches=cfg.batch_size_times)
    if cfg.load_from_checkpoint is not None:
        ckpt = torch.load(cfg.load_from_checkpoint,
                          map_location=lambda storage, loc: storage)
        model.load_state_dict(ckpt['state_dict'])
    trainer.fit(model)
def _create_pl_profiler(self):
    # Only if an experiment directory exists
    if self.exp_main_dir:
        prof_out_file = os.path.join(self.cfg["setup_cfg"]["exp_main_dir"],
                                     "runtime_profiling.txt")
    else:
        return None

    # NOTE: the original mixed the keys "training_cfg" and "train_cfg" across
    # the branches below; they are normalized here to a single lookup.
    which_profiler = self.cfg["training_cfg"]["pl_which_profiler"].lower()
    if which_profiler == "simple":
        return SimpleProfiler(output_filename=prof_out_file, extended=True)
    elif which_profiler == "advanced":
        return AdvancedProfiler(output_filename=prof_out_file,
                                line_count_restriction=1.0)
    elif which_profiler in ["none", ""]:
        return None
    else:
        raise NotImplementedError
def on_trainer_init(self, profiler: Union[BaseProfiler, bool, str]):
    if profiler and not isinstance(profiler, (bool, str, BaseProfiler)):
        # TODO: Update exception on removal of bool
        raise MisconfigurationException(
            "Only None, bool, str and subclasses of `BaseProfiler`"
            " are valid values for `Trainer`'s `profiler` parameter."
            f" Received {profiler} which is of type {type(profiler)}.")

    if isinstance(profiler, bool):
        rank_zero_warn(
            "Passing a bool value as a `profiler` argument to `Trainer` is deprecated"
            " and will be removed in v1.3. Use str ('simple' or 'advanced') instead.",
            DeprecationWarning)
        if profiler:
            profiler = SimpleProfiler()
    elif isinstance(profiler, str):
        profiler = profiler.lower()
        if profiler == "simple":
            profiler = SimpleProfiler()
        elif profiler == "advanced":
            profiler = AdvancedProfiler()
        else:
            raise ValueError("When passing string value for the `profiler` parameter of"
                             " `Trainer`, it can only be 'simple' or 'advanced'")
    self.trainer.profiler = profiler or PassThroughProfiler()
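# Hedged illustration of the resolution logic above (assuming a Trainer from
# the same Lightning version): strings are lowercased and mapped to profiler
# classes, a bool emits the deprecation warning before falling back to
# SimpleProfiler, and falsy values end up as a PassThroughProfiler.
trainer = Trainer(profiler="Simple")    # trainer.profiler -> SimpleProfiler
trainer = Trainer(profiler="advanced")  # trainer.profiler -> AdvancedProfiler
trainer = Trainer(profiler=None)        # trainer.profiler -> PassThroughProfiler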
def test_advanced_profiler():
    def get_duration(profile):
        return sum([x.totaltime for x in profile.getstats()])

    p = AdvancedProfiler()

    with p.profile("a"):
        time.sleep(3)
    with p.profile("a"):
        time.sleep(1)
    with p.profile("b"):
        time.sleep(2)
    with p.profile("c"):
        time.sleep(1)

    a_duration = get_duration(p.profiled_actions["a"])
    np.testing.assert_almost_equal(a_duration, [4], decimal=1)
    b_duration = get_duration(p.profiled_actions["b"])
    np.testing.assert_almost_equal(b_duration, [2], decimal=1)
    c_duration = get_duration(p.profiled_actions["c"])
    np.testing.assert_almost_equal(c_duration, [1], decimal=1)
def main(gpus, nodes, fast_dev_run, mixed_precision, project_config, hparams):
    torch.manual_seed(0)
    np.random.seed(0)

    # init module
    model = MonoSemiSupDepth_Packnet(hparams)

    # tags associated to the run
    def shape_format(shape):
        # shape = [Height, Width]
        return f"{shape[1]}x{shape[0]}"

    list_of_tags = [
        hparams.model.depth_net.name,
        hparams.model.pose_net.name,
        hparams.optimizer.name,
        hparams.scheduler.name,
        {1: 'gray', 3: 'rgb'}[hparams.input_channels],
        f"train-{shape_format(hparams.datasets.train.data_transform_options.image_shape)}",
        f"val-{shape_format(hparams.datasets.val.data_transform_options.image_shape)}",
        f"test-{shape_format(hparams.datasets.test.data_transform_options.image_shape)}",
    ]
    if mixed_precision:
        # the original used `+=` with a bare string, which extends the list
        # character by character; `append` adds the tag as one element
        list_of_tags.append('mixed_precision')

    base_output_dir = Path(project_config.output_dir)
    experiment_output_dir = base_output_dir / project_config.project_name / project_config.experiment_name
    experiment_output_dir.mkdir(parents=True, exist_ok=True)
    wandb_output_dir = str(experiment_output_dir)

    wandb_logger = WandbLogger(
        project=project_config.project_name,
        save_dir=wandb_output_dir,  # the path to a directory where artifacts will be written
        log_model=True,
        tags=list_of_tags
    )
    # wandb_logger.watch(model, log='all', log_freq=5000)  # watch model's gradients and params

    run_output_dir = experiment_output_dir / f'{wandb_logger.experiment.id}'
    run_output_dir.mkdir(parents=True, exist_ok=True)
    run_output_dir = str(run_output_dir)

    checkpoint_callback = ModelCheckpoint(
        filepath=run_output_dir + '/{epoch:04d}-{val-abs_rel:.5f}',  # saves a file like: my/path/epoch=2-abs_rel=0.0115.ckpt
        save_top_k=3,
        verbose=True,
        monitor='val-abs_rel',
        mode='min',
    )

    lr_logger = LearningRateLogger()

    if mixed_precision:
        amp_level = 'O1'  # the original had '01' (zero-one), a typo for 'O1'
        precision = 16

    if gpus > 1:
        distributed_backend = 'ddp'
    else:
        distributed_backend = None

    profiler = False
    if fast_dev_run:
        from pytorch_lightning.profiler import AdvancedProfiler
        profiler = AdvancedProfiler(output_filename='./profiler.log')

    trainer = Trainer(
        gpus=gpus,
        distributed_backend=distributed_backend,
        nb_gpu_nodes=nodes,
        checkpoint_callback=checkpoint_callback,
        callbacks=[lr_logger],
        logger=wandb_logger,
        fast_dev_run=fast_dev_run,
        profiler=profiler,
        early_stop_callback=False,
        # amp_level='O1',
        # precision=16,
        **hparams.trainer
    )
    trainer.fit(model)
    trainer.test(model)
def advanced_profiler(tmpdir):
    return AdvancedProfiler(dirpath=tmpdir, filename="profiler")
def main(hparams):
    # init data
    dm = TripleEmbeddingDataModule(hparams)

    # identify input_size
    hparams.model.input_size = dm.dim

    # model
    model = Distiller(hparams)

    # early stop

    # logger
    # log_dir = str(root_dir / "lightning_logs")
    # tt_logger = loggers.TestTubeLogger("tb_logs", name=hparams.experiment_name)
    # tb_logger = loggers.TensorBoardLogger("tb_logs")

    # init logger
    source_files_path = str(Path(hydra.utils.get_original_cwd()) / "**/*.py")
    tags = generate_tags(hparams)
    log_params = flatten_params(hparams)
    close_after_fit = not hparams.train.upload_checkpoints
    neptune_logger = NeptuneLogger(
        experiment_name="_".join(tags),
        project_name=f"kjang0517/{hparams.dataset.name}",
        params=log_params,  # Optional,
        tags=tags,  # Optional,
        close_after_fit=close_after_fit,
        upload_source_files=[source_files_path],
    )

    # class SaveHparamsCallback(pl.Callback):
    #     def on_sanity_check_start(self, trainer, pl_module):
    #         # save hparams
    #         hparams_str = trainer.model.hparams.pretty()
    #         hparams_path = Path(trainer.ckpt_path) / "hparams.yaml"
    #         with hparams_path.open("w", encoding="utf-8") as f:
    #             f.write(hparams_str)
    #         trainer.logger.experiment.log_artifact(str(hparams_path))

    # Callbacks
    callbacks = [PostTrainCallback(), LearningRateLogger()]

    # Callbacks: Early stop
    early_stop_callback = True
    # if hparams.train.use_early_stop:
    #     patience = hparams.train.early_stop_patience
    #     early_stop_callback = EarlyStopping(
    #         monitor="val_early_stop_on", patience=patience, verbose=True, mode="min"
    #     )
    # else:
    #     early_stop_callback = None

    # use profiler
    profiler = AdvancedProfiler() if hparams.train.profile else None

    # train
    # trainer = Trainer.from_argparse_args(hparams)
    trainer = Trainer(
        # default_root_dir=root_dir,
        max_epochs=hparams.train.max_epochs,
        gpus=hparams.train.gpus,
        distributed_backend=hparams.train.distributed_backend,
        fast_dev_run=hparams.train.fast_dev_run,
        amp_level=hparams.train.amp_level,
        precision=hparams.train.precision,
        train_percent_check=hparams.train.train_percent_check,
        val_percent_check=hparams.train.val_percent_check,
        # **hparams.train,
        benchmark=True,
        profiler=profiler,
        logger=neptune_logger,
        early_stop_callback=early_stop_callback,
        callbacks=callbacks,
        # deterministic=True,
    )

    # train
    trainer.fit(model, dm)
def advanced_profiler(tmpdir):
    profiler = AdvancedProfiler(
        output_filename=os.path.join(tmpdir, "profiler.txt"))
    return profiler
def advanced_profiler():
    profiler = AdvancedProfiler()
    return profiler
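# Hedged sketch of how fixtures like the ones above are typically declared and
# consumed; the `@pytest.fixture` decorator is assumed (it is not shown in the
# snippets), and `test_describe` is a hypothetical consumer.
import pytest

@pytest.fixture
def advanced_profiler_fixture():
    return AdvancedProfiler()

def test_describe(advanced_profiler_fixture):
    with advanced_profiler_fixture.profile("action"):
        pass
    advanced_profiler_fixture.describe()  # summarizes the recorded cProfile stats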
def main():
    torch.set_printoptions(threshold=100, edgeitems=50, precision=8, sci_mode=False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=str,
        default=None,
        help="Path to (.yml) config file if running a new experiment.")
    parser.add_argument(
        "--log-checkpoint",
        type=str,
        default=None,
        help="Training log path with the config and checkpoints to resume the experiment.",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        default="model_last.ckpt",
        help="Resume training from the latest checkpoint by default.",
    )
    parser.add_argument(
        "--run-name",
        type=str,
        default="default",
        help="Name of the training log run")
    parser.add_argument(
        "--gpus",
        type=int,
        default=1,
        help="Number of GPUs that should be used (in most cases leave at 1)",
    )
    parser.add_argument(
        "--precision",
        type=int,
        default=32,
        help="Full precision (32) by default; half precision (16) on newer devices speeds up training.",
    )
    parser.add_argument(
        "--deterministic",
        action="store_true",
        default=False,
        help="Run deterministic training, useful for experimenting")
    parser.add_argument(
        "--use-profiler",
        action="store_true",
        default=False,
        help="Run profiler for the training set")
    config_args = parser.parse_args()

    # Log path
    path_parser = PathParser()
    cfg, logger = path_parser.parse(config_args.config,
                                    config_args.log_checkpoint,
                                    config_args.run_name,
                                    config_args.checkpoint,
                                    create_logger=True)

    # # (Optional:) enable this to track autograd issues when debugging
    # torch.autograd.set_detect_anomaly(True)

    if config_args.deterministic:
        seed_everything(cfg.experiment.randomseed)

    # Create model
    model = getattr(models, cfg.experiment.model)(cfg)

    # Model checkpoint generator
    checkpoint_callback = ModelCheckpoint(filepath=path_parser.checkpoint_dir,
                                          save_top_k=3,
                                          save_last=True,
                                          verbose=True,
                                          monitor="val_loss",
                                          mode="min",
                                          prefix="model_")

    # Trainer callbacks
    logger_callback = LoggerCallback(cfg)

    # Optional profiler
    profiler = None
    if config_args.use_profiler:
        profiler = AdvancedProfiler(output_filename="report.txt",
                                    line_count_restriction=.4)

    trainer = Trainer(weights_summary=None,
                      resume_from_checkpoint=path_parser.checkpoint_path,
                      gpus=config_args.gpus,
                      default_root_dir=path_parser.log_dir,
                      logger=logger,
                      num_sanity_val_steps=0,
                      checkpoint_callback=checkpoint_callback,
                      row_log_interval=1,
                      log_gpu_memory=None,
                      precision=config_args.precision,
                      profiler=profiler,
                      fast_dev_run=False,
                      deterministic=config_args.deterministic,
                      progress_bar_refresh_rate=0,
                      accumulate_grad_batches=1,
                      callbacks=[logger_callback])

    if config_args.log_checkpoint is not None:
        # Add log props
        logger.experiment.add_text("description", cfg.experiment.description, 0)
        logger.experiment.add_text("config",
                                   f"\t{cfg.dump()}".replace("\n", "\n\t"), 0)
        logger.experiment.add_text(
            "params",
            f"\t{ModelSummary(model, mode='full')}".replace("\n", "\n\t"), 0)

    trainer.fit(model)
    print("Done!")
def on_epoch_start(self):
    print(">>>>>>>>>>>>>>>>>>>>> on_epoch_start")
    self.start_time = time.time()
    self.nepochs += 1

def on_epoch_end(self):
    print(">>>>>>>>>>>>>>>>>>>>> on_epoch_end1")
    epoch_time = time.time() - self.start_time
    self.total_time += epoch_time
    # print(">>>>>>>>>>>>>>>>>>>>> on_epoch_end2", self.nepochs)
    print("Epoch Time taken: ", epoch_time, self.total_time / self.nepochs)

model = grammarTransformer()
profiler = AdvancedProfiler()
start_time = time.time()
if config.GPUS == 1:
    trainer = pl.Trainer(max_epochs=config.MAX_EPOCHS,
                         gpus=config.GPUS,
                         precision=config.PRECISION)
    # trainer = pl.Trainer(max_epochs=config.MAX_EPOCHS, gpus=config.GPUS, profiler=profiler)
    # trainer = pl.Trainer(max_epochs=config.MAX_EPOCHS, gpus=config.GPUS, profiler=True)
elif config.GPUS == 0:
    trainer = pl.Trainer(max_epochs=config.MAX_EPOCHS,
                         precision=config.PRECISION)
else:
    trainer = pl.Trainer(max_epochs=config.MAX_EPOCHS,
                         gpus=config.GPUS,
def main(args):
    if args.load_from_checkpoint is not None:
        model = MultVAE.load_from_checkpoint(args.load_from_checkpoint)
    else:
        n_input = load_table(args.val_biom).shape[0]
        model = MultVAE(n_input,
                        n_latent=args.n_latent,
                        n_hidden=args.n_hidden,
                        basis=args.basis,
                        dropout=args.dropout,
                        bias=args.bias,
                        tss=args.tss,
                        distribution=args.distribution,
                        batch_norm=args.batch_norm,
                        encoder_depth=args.encoder_depth,
                        learning_rate=args.learning_rate,
                        scheduler=args.scheduler,
                        transform=args.transform,
                        overdispersion=args.overdispersion,
                        grassmannian=args.grassmannian)
    print(args)
    print(model)
    if args.eigvectors is not None and args.eigvalues is not None:
        eigvectors = np.loadtxt(args.eigvectors)
        eigvalues = np.loadtxt(args.eigvalues)
        model.set_eigs(eigvectors, eigvalues)
    if args.profile:
        profiler = AdvancedProfiler()
    else:
        profiler = None
    dm = BiomDataModule(args.train_biom,
                        args.test_biom,
                        args.val_biom,
                        metadata=args.sample_metadata,
                        batch_category=args.batch_category,
                        batch_size=args.batch_size,
                        num_workers=args.num_workers)
    ckpt_path = os.path.join(args.output_directory, "checkpoints")
    checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path,
                                          period=1,
                                          monitor='val_loss',
                                          mode='min',
                                          verbose=True)
    os.mkdir(args.output_directory)
    tb_logger = pl_loggers.TensorBoardLogger(f'{args.output_directory}/logs/')
    # save hyper-parameters to yaml file
    with open(f'{args.output_directory}/hparams.yaml', 'w') as outfile:
        yaml.dump(model._hparams, outfile, default_flow_style=False)
    # save tree to file if specified
    if os.path.exists(args.basis):
        tree = TreeNode.read(args.basis)
        tree.write(f'{args.output_directory}/tree.nwk')
    trainer = Trainer(max_epochs=args.epochs,
                      gpus=args.gpus,
                      stochastic_weight_avg=False,
                      auto_scale_batch_size='binsearch',
                      check_val_every_n_epoch=10,
                      gradient_clip_val=args.grad_clip,
                      profiler=profiler,
                      logger=tb_logger,
                      callbacks=[checkpoint_callback])
    trainer.fit(model, dm)
    trainer.save_checkpoint(args.output_directory + '/last_ckpt.pt')
def main(project_config, hparams):
    torch.manual_seed(0)
    np.random.seed(0)

    # init module
    model = MonocularSemiSupDepth(hparams)

    # tags associated to the run
    def shape_format(shape):
        # shape = [Height, Width]
        return f"{shape[1]}x{shape[0]}"

    # assert hparams.metrics.use_gt_scale != hparams.datasets.train.load_pose, "Either velocity or gt scaled"

    base_output_dir = Path(project_config.output_dir) / project_config.project_name
    logs_dir = base_output_dir / 'logs'
    logs_dir.mkdir(parents=True, exist_ok=True)
    experiment_output_dir = base_output_dir / 'outputs' / project_config.experiment_name

    assert hparams.logger in ['wandb', 'tensorboard']
    if hparams.logger == 'tensorboard':
        experiment_logger = TensorBoardLogger(
            save_dir=logs_dir, name=project_config.experiment_name)
        run_output_dir = experiment_output_dir / f'version_{experiment_logger.version}'
    elif hparams.logger == 'wandb':
        list_of_tags = [
            f"{hparams.model.depth_net.name} DepthNet",
            f"{hparams.model.pose_net.name} PoseNet",
            hparams.optimizer.name,
            hparams.scheduler.name,
            {1: 'gray', 3: 'rgb'}[hparams.input_channels],
            f"train-{shape_format(hparams.datasets.train.data_transform_options.image_shape)}",
            f"val-{shape_format(hparams.datasets.val.data_transform_options.image_shape)}",
            f"test-{shape_format(hparams.datasets.test.data_transform_options.image_shape)}",
        ]
        if project_config.mixed_precision:
            list_of_tags.append('mixed_precision')
        losses = list(hparams.losses.keys())
        if 'supervised_loss_weight' in losses:
            losses.remove('supervised_loss_weight')
        list_of_tags += losses
        experiment_logger = WandbLogger(
            project=project_config.project_name,
            save_dir=logs_dir,  # the path to a directory where artifacts will be written
            log_model=True,
            tags=list_of_tags)
        # wandb_logger.watch(model, log='all', log_freq=5000)  # watch model's gradients and params
        run_output_dir = experiment_output_dir / f'version_{experiment_logger.experiment.id}'
    else:
        run_output_dir = experiment_output_dir / 'no_version_system'

    run_output_dir.mkdir(parents=True, exist_ok=True)
    run_output_dir = str(run_output_dir)

    checkpoint_callback = ModelCheckpoint(
        filepath=run_output_dir + '/{epoch:04d}-{val-rmse_log:.5f}',  # saves a file like: my/path/epoch=2-abs_rel=0.0115.ckpt
        save_top_k=3,
        verbose=True,
        monitor='val-rmse_log',
        mode='min',
    )

    lr_logger = LearningRateLogger()

    if project_config.mixed_precision:
        amp_level = 'O1'  # the original had '01' (zero-one), a typo for 'O1'
        precision = 16

    if project_config.gpus > 1:
        distributed_backend = 'ddp'
    else:
        distributed_backend = None

    profiler = False
    if project_config.fast_dev_run:
        from pytorch_lightning.profiler import AdvancedProfiler
        profiler = AdvancedProfiler(output_filename='./profiler.log')

    trainer = Trainer(
        gpus=project_config.gpus,
        distributed_backend=distributed_backend,
        num_nodes=project_config.nodes,
        checkpoint_callback=checkpoint_callback,
        callbacks=[lr_logger],
        logger=experiment_logger,
        fast_dev_run=project_config.fast_dev_run,
        profiler=profiler,
        early_stop_callback=False,
        # amp_level='O1',
        # precision=16,
        **hparams.trainer)
    trainer.fit(model)
    trainer.test(model)
def main(args):
    if args.load_from_checkpoint is not None:
        model = MultBatchVAE.load_from_checkpoint(args.load_from_checkpoint)
    else:
        table = load_table(args.train_biom)
        n_input = table.shape[0]
        sample_metadata = pd.read_table(args.sample_metadata, dtype=str)
        sample_metadata = sample_metadata.set_index(sample_metadata.columns[0])
        sample_metadata = sample_metadata.loc[table.ids()]
        n_batches = len(sample_metadata[args.batch_category].value_counts())
        model = MultBatchVAE(n_input,
                             args.batch_prior,
                             n_batches,
                             n_latent=args.n_latent,
                             n_hidden=args.n_hidden,
                             basis=args.basis,
                             dropout=args.dropout,
                             bias=args.bias,
                             batch_norm=args.batch_norm,
                             encoder_depth=args.encoder_depth,
                             learning_rate=args.learning_rate,
                             vae_lr=args.vae_lr,
                             scheduler=args.scheduler,
                             transform=args.transform,
                             grassmannian=args.grassmannian)
    if args.load_vae_weights is not None:
        # initialize encoder/decoder weights with pretrained VAE
        other_model = MultVAE.load_from_checkpoint(args.load_vae_weights)
        model.vae.encoder = other_model.vae.encoder
        model.vae.decoder = other_model.vae.decoder
        model.vae.log_sigma_sq = other_model.vae.log_sigma_sq
        model.vae.variational_logvars = other_model.vae.variational_logvars
        # Note that input_embed isn't handled here.
    print(args)
    print(model)
    if args.eigvectors is not None and args.eigvalues is not None:
        eigvectors = np.loadtxt(args.eigvectors)
        eigvalues = np.loadtxt(args.eigvalues)
        model.set_eigs(eigvectors, eigvalues)
    if args.profile:
        profiler = AdvancedProfiler()
    else:
        profiler = None
    dm = BiomDataModule(args.train_biom,
                        args.test_biom,
                        args.val_biom,
                        metadata=args.sample_metadata,
                        batch_category=args.batch_category,
                        batch_size=args.batch_size,
                        num_workers=args.num_workers)
    ckpt_path = os.path.join(args.output_directory, "checkpoints")
    checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path,
                                          period=1,
                                          monitor='val_loss',
                                          mode='min',
                                          verbose=True)
    os.mkdir(args.output_directory)
    tb_logger = pl_loggers.TensorBoardLogger(f'{args.output_directory}/logs/')
    # save hyper-parameters to yaml file
    with open(f'{args.output_directory}/hparams.yaml', 'w') as outfile:
        yaml.dump(model._hparams, outfile, default_flow_style=False)
    # save batch class mappings
    dm.batch_categories.to_csv(f'{args.output_directory}/batch_categories.txt',
                               sep='\t', header=None)
    # save tree to file if specified
    if os.path.exists(args.basis):
        tree = TreeNode.read(args.basis)
        tree.write(f'{args.output_directory}/tree.nwk')
    trainer = Trainer(max_epochs=args.epochs,
                      gpus=args.gpus,
                      check_val_every_n_epoch=1,
                      gradient_clip_val=args.grad_clip,
                      profiler=profiler,
                      logger=tb_logger,
                      callbacks=[checkpoint_callback])
    trainer.fit(model, dm)
    trainer.save_checkpoint(args.output_directory + '/last_ckpt.pt')
def setup_profiler(cfg):
    profiler = None
    if cfg.training.trainer.profiler:
        profiler = AdvancedProfiler(filename="profile.txt")
    return profiler
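# Hedged usage sketch for the helper above: `cfg` is assumed to be an
# attribute-style config (e.g. OmegaConf) with a boolean
# `training.trainer.profiler` flag; the result (profiler or None) is handed
# straight to the Trainer.
from omegaconf import OmegaConf

cfg = OmegaConf.create({'training': {'trainer': {'profiler': True}}})
profiler = setup_profiler(cfg)  # -> AdvancedProfiler(filename="profile.txt")
trainer = Trainer(profiler=profiler)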
def train_task(init, close, exp_cfg_path, env_cfg_path, task_nr, logger_pass=None):
    seed_everything(42)
    local_rank = int(os.environ.get('LOCAL_RANK', 0))

    if local_rank != 0 or not init:
        print(init, local_rank)
        rm = exp_cfg_path.find('cfg/exp/') + len('cfg/exp/')
        exp_cfg_path = os.path.join(exp_cfg_path[:rm], 'tmp/', exp_cfg_path[rm:])

    exp = load_yaml(exp_cfg_path)
    env = load_yaml(env_cfg_path)

    if local_rank == 0 and init:
        # Set in name the correct model path
        if exp.get('timestamp', True):
            timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
            model_path = os.path.join(env['base'], exp['name'])
            p = model_path.split('/')
            model_path = os.path.join('/', *p[:-1], str(timestamp) + '_' + p[-1])
        else:
            model_path = os.path.join(env['base'], exp['name'])
            try:
                shutil.rmtree(model_path)
            except:
                pass
        # Create the directory
        if not os.path.exists(model_path):
            try:
                os.makedirs(model_path)
            except:
                print("Failed generating network run folder")
        else:
            print("Network run folder already exists")

        # Only copy config files for the main ddp-task
        exp_cfg_fn = os.path.split(exp_cfg_path)[-1]
        env_cfg_fn = os.path.split(env_cfg_path)[-1]
        print(f'Copy {env_cfg_path} to {model_path}/{exp_cfg_fn}')
        shutil.copy(exp_cfg_path, f'{model_path}/{exp_cfg_fn}')
        shutil.copy(env_cfg_path, f'{model_path}/{env_cfg_fn}')
        exp['name'] = model_path
    else:
        # the correct model path has already been written to the yaml file.
        model_path = os.path.join(exp['name'], f'rank_{local_rank}_{task_nr}')
        # Create the directory
        if not os.path.exists(model_path):
            try:
                os.makedirs(model_path)
            except:
                pass

    # if local_rank == 0 and env['workstation'] == False:
    #     cm = open(os.path.join(model_path, f'info{local_rank}_{task_nr}.log'), 'w')
    # else:
    #     cm = nullcontext()
    # with cm as f:
    #     if local_rank == 0 and env['workstation'] == False:
    #         cm2 = redirect_stdout(f)
    #     else:
    #         cm2 = nullcontext()
    #     with cm2:
    #         # Setup logger for each ddp-task
    #         logging.getLogger("lightning").setLevel(logging.DEBUG)
    #         logger = logging.getLogger("lightning")
    #         fh = logging.FileHandler( , 'a')
    #         logger.addHandler(fh)

    # Copy Dataset from Scratch to Nodes SSD
    if env['workstation'] == False:
        # use proxy hack for neptune.ai !!!
        NeptuneLogger._create_or_get_experiment = _create_or_get_experiment2

        # move data to ssd
        if exp['move_datasets'][0]['env_var'] != 'none':
            for dataset in exp['move_datasets']:
                scratchdir = os.getenv('TMPDIR')
                env_var = dataset['env_var']
                tar = os.path.join(env[env_var], f'{env_var}.tar')
                name = (tar.split('/')[-1]).split('.')[0]
                if not os.path.exists(os.path.join(scratchdir, dataset['env_var'])):
                    try:
                        cmd = f"tar -xvf {tar} -C $TMPDIR >/dev/null 2>&1"
                        st = time.time()
                        print(f'Start moving dataset-{env_var}: {cmd}')
                        os.system(cmd)
                        env[env_var] = str(os.path.join(scratchdir, name))
                        print(f'Finished moving dataset-{env_var} in {time.time()-st}s')
                    except:
                        rank_zero_warn('ENV Var' + env_var)
                        env[env_var] = str(os.path.join(scratchdir, name))
                        rank_zero_warn('Copying data failed')
                else:
                    env[env_var] = str(os.path.join(scratchdir, name))
        else:
            env['mlhypersim'] = str(os.path.join(env['mlhypersim'], 'mlhypersim'))

    if (exp['trainer']).get('gpus', -1):
        nr = torch.cuda.device_count()
        exp['trainer']['gpus'] = nr
        print(f'Set GPU Count for Trainer to {nr}!')

    model = Network(exp=exp, env=env)

    lr_monitor = LearningRateMonitor(**exp['lr_monitor']['cfg'])

    if exp['cb_early_stopping']['active']:
        early_stop_callback = EarlyStopping(**exp['cb_early_stopping']['cfg'])
        cb_ls = [early_stop_callback, lr_monitor]
    else:
        cb_ls = [lr_monitor]

    tses = TaskSpecificEarlyStopping(
        nr_tasks=exp['task_generator']['total_tasks'],
        **exp['task_specific_early_stopping'])
    cb_ls.append(tses)

    if local_rank == 0:
        for i in range(exp['task_generator']['total_tasks']):
            if i == task_nr:
                m = '/'.join([a for a in model_path.split('/') if a.find('rank') == -1])
                dic = copy.deepcopy(exp['cb_checkpoint']['cfg'])
                # try:
                #     if len(exp['cb_checkpoint'].get('nameing', [])) > 0:
                #         # filepath += '-{task_name:10s}'
                #         for m in exp['cb_checkpoint']['nameing']:
                #             filepath += '-{' + m + ':.2f}'
                # except:
                #     pass
                # dic['monitor'] += str(i)
                checkpoint_callback = ModelCheckpoint(
                    dirpath=m,
                    filename='task' + str(i) + '-{epoch:02d}--{step:06d}',
                    **dic)
                cb_ls.append(checkpoint_callback)

    params = log_important_params(exp)

    if env['workstation']:
        t1 = 'workstation'
    else:
        t1 = 'leonhard'

    # if local_rank == 0:
    cwd = os.getcwd()
    files = [
        str(p).replace(cwd + '/', '') for p in Path(cwd).rglob('*.py')
        if str(p).find('vscode') == -1
    ]
    files.append(exp_cfg_path)
    files.append(env_cfg_path)

    if not exp.get('offline_mode', False):
        # if exp.get('experiment_id', -1) == -1:  # create new experiment_id and write back
        if logger_pass is None:
            logger = NeptuneLogger(
                api_key=os.environ["NEPTUNE_API_TOKEN"],
                project_name="jonasfrey96/asl",
                experiment_name=exp['name'].split('/')[-2] + "_" + exp['name'].split('/')[-1],  # Optional,
                params=params,  # Optional,
                tags=[t1, exp['name'].split('/')[-2], exp['name'].split('/')[-1]] + exp["tag_list"],  # Optional,
                close_after_fit=False,
                offline_mode=exp.get('offline_mode', False),
                upload_source_files=files,
                upload_stdout=False,
                upload_stderr=False)
            exp['experiment_id'] = logger.experiment.id
            print('created experiment id' + str(exp['experiment_id']))
        else:
            logger = logger_pass
        # else:
        #     print('loaded experiment id' + str(exp['experiment_id']))
        #     # TODO
        #     logger = NeptuneLogger(
        #         api_key=os.environ["NEPTUNE_API_TOKEN"],
        #         project_name="jonasfrey96/asl",
        #         experiment_name=exp['name'].split('/')[-2] + "_" + exp['name'].split('/')[-1],  # Optional,
        #         params=params,  # Optional,
        #         tags=[t1, exp['name'].split('/')[-2], exp['name'].split('/')[-1]] + exp["tag_list"],  # Optional,
        #         close_after_fit=False,
        #         offline_mode=exp.get('offline_mode', False),
        #         upload_source_files=files,
        #         upload_stdout=False,
        #         upload_stderr=False
        #     )
        # logger = NeptuneLogger(
        #     api_key=os.environ["NEPTUNE_API_TOKEN"],
        #     project_name="jonasfrey96/asl",
        #     experiment_id=exp.get('experiment_id', -1),
        #     close_after_fit=False,
        # )
        print('Neptune Experiment ID: ' + str(logger.experiment.id) + " TASK NR " + str(task_nr))
    else:
        logger = TensorBoardLogger(
            save_dir=model_path,
            name='tensorboard',  # Optional,
            default_hp_metric=params,  # Optional,
        )
    # else:
    #     logger = TensorBoardLogger(
    #         save_dir=model_path + '/rank/' + str(local_rank),
    #         name=exp['name'].split('/')[-2] + "_" + exp['name'].split('/')[-1],  # Optional,
    #     )

    weight_restore = exp.get('weights_restore', False)
    checkpoint_load = exp['checkpoint_load']

    if local_rank == 0 and init:
        # write back the exp file with the correct name set to the model_path!
        # other ddp-tasks don't need to care about timestamps
        # also storing the path to the latest.ckpt so that downstream tasks can restore the model state
        exp['weights_restore_2'] = False
        exp['checkpoint_restore_2'] = True
        exp['checkpoint_load_2'] = os.path.join(model_path, 'last.ckpt')

        rm = exp_cfg_path.find('cfg/exp/') + len('cfg/exp/')
        exp_cfg_path = os.path.join(exp_cfg_path[:rm], 'tmp/', exp_cfg_path[rm:])
        Path(exp_cfg_path).parent.mkdir(parents=True, exist_ok=True)
        with open(exp_cfg_path, 'w+') as f:
            yaml.dump(exp, f, default_flow_style=False, sort_keys=False)

    if not init:
        # restore model state from previous task.
        exp['checkpoint_restore'] = exp['checkpoint_restore_2']
        exp['checkpoint_load'] = exp['checkpoint_load_2']
        exp['weights_restore'] = exp['weights_restore_2']

    # Always use advanced profiler
    if exp['trainer'].get('profiler', False):
        exp['trainer']['profiler'] = AdvancedProfiler(
            output_filename=os.path.join(model_path, 'profile.out'))
    else:
        exp['trainer']['profiler'] = False

    # print(exp['trainer'])
    # print(os.environ.get('GLOBAL_RANK'))
    if exp.get('checkpoint_restore', False):
        p = os.path.join(env['base'], exp['checkpoint_load'])
        trainer = Trainer(**exp['trainer'],
                          default_root_dir=model_path,
                          callbacks=cb_ls,
                          resume_from_checkpoint=p,
                          logger=logger)
    else:
        trainer = Trainer(**exp['trainer'],
                          default_root_dir=model_path,
                          callbacks=cb_ls,
                          logger=logger)

    if exp['weights_restore']:
        # it is not strict since the latent replay buffer is not always available
        p = os.path.join(env['base'], exp['checkpoint_load'])
        if os.path.isfile(p):
            res = model.load_state_dict(
                torch.load(p, map_location=lambda storage, loc: storage)['state_dict'],
                strict=False)
            print('Restoring weights: ' + str(res))
        else:
            raise Exception('Checkpoint not a file')

    main_visu = MainVisualizer(p_visu=os.path.join(model_path, 'main_visu'),
                               logger=logger,
                               epoch=0,
                               store=True,
                               num_classes=22)

    tc = TaskCreator(**exp['task_generator'], output_size=exp['model']['input_size'])
    print(tc)

    _task_start_training = time.time()
    _task_start_time = time.time()

    for idx, out in enumerate(tc):
        if idx == task_nr:
            break

    if True:  # for idx, out in enumerate(tc):
        task, eval_lists = out
        main_visu.epoch = idx
        # New Logger
        print(f'<<<<<<<<<<<< TASK IDX {idx} TASK NAME : ' + task.name + ' >>>>>>>>>>>>>')

        model._task_name = task.name
        model._task_count = idx
        dataloader_train, dataloader_buffer = get_dataloader_train(
            d_train=task.dataset_train_cfg, env=env, exp=exp)
        print(str(dataloader_train.dataset))
        print(str(dataloader_buffer.dataset))
        dataloader_list_test = eval_lists_into_dataloaders(eval_lists, env=env, exp=exp)
        print(f'<<<<<<<<<<<< All Datasets are loaded and set up >>>>>>>>>>>>>')

        # Training the model
        trainer.should_stop = False
print("GLOBAL STEP ", model.global_step) for d in dataloader_list_test: print(str(d.dataset)) if idx < exp['start_at_task']: # trainer.limit_val_batches = 1.0 trainer.limit_train_batches = 1 trainer.max_epochs = 1 trainer.check_val_every_n_epoch = 1 train_res = trainer.fit(model=model, train_dataloader=dataloader_train, val_dataloaders=dataloader_list_test) trainer.max_epochs = exp['trainer']['max_epochs'] trainer.check_val_every_n_epoch = exp['trainer'][ 'check_val_every_n_epoch'] trainer.limit_val_batches = exp['trainer']['limit_val_batches'] trainer.limit_train_batches = exp['trainer']['limit_train_batches'] else: print('Train', dataloader_train) print('Val', dataloader_list_test) train_res = trainer.fit(model=model, train_dataloader=dataloader_train, val_dataloaders=dataloader_list_test) res = trainer.logger_connector.callback_metrics res_store = {} for k in res.keys(): try: res_store[k] = float(res[k]) except: pass base_path = '/'.join( [a for a in model_path.split('/') if a.find('rank') == -1]) with open(f"{base_path}/res{task_nr}.pkl", "wb") as f: pickle.dump(res_store, f) print(f'<<<<<<<<<<<< TASK IDX {idx} TASK NAME : ' + task.name + ' Trained >>>>>>>>>>>>>') if exp.get('buffer', {}).get('fill_after_fit', False): print(f'<<<<<<<<<<<< Performance Test to Get Buffer >>>>>>>>>>>>>') trainer.test(model=model, test_dataloaders=dataloader_buffer) if local_rank == 0: checkpoint_callback.save_checkpoint(trainer, model) print(f'<<<<<<<<<<<< Performance Test DONE >>>>>>>>>>>>>') number_validation_dataloaders = len(dataloader_list_test) if model._rssb_active: # visualize rssb bins, valids = model._rssb.get() fill_status = (bins != 0).sum(axis=1) main_visu.plot_bar(fill_status, x_label='Bin', y_label='Filled', title='Fill Status per Bin', sort=False, reverse=False, tag='Buffer_Fill_Status') plot_from_pkl(main_visu, base_path, task_nr) try: if close: logger.experiment.stop() except: pass
def test_pytorch_profiler_deepcopy(tmpdir):
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profiler", schedule=None)
    pytorch_profiler.start("on_train_start")
    torch.tensor(1)
    pytorch_profiler.describe()
    assert deepcopy(pytorch_profiler)


@pytest.mark.parametrize(
    ["profiler", "expected"],
    [
        (None, PassThroughProfiler),
        (SimpleProfiler(), SimpleProfiler),
        (AdvancedProfiler(), AdvancedProfiler),
        ("simple", SimpleProfiler),
        ("Simple", SimpleProfiler),
        ("advanced", AdvancedProfiler),
        ("pytorch", PyTorchProfiler),
    ],
)
def test_trainer_profiler_correct_args(profiler, expected):
    kwargs = {"profiler": profiler} if profiler is not None else {}
    trainer = Trainer(**kwargs)
    assert isinstance(trainer.profiler, expected)


def test_trainer_profiler_incorrect_str_arg():
    with pytest.raises(
        MisconfigurationException,
    )

    if trial is not None and args.opt_prune:
        trainer_kwargs['early_stop_callback'] = PyTorchLightningPruningCallback(
            trial, monitor=args.monitor_metric)

    # enable debug mode
    if args.debug_mode:
        print("\n**** DEBUG MODE ON! ****\n")
        trainer_kwargs["track_grad_norm"] = 2
        trainer_kwargs["log_gpu_memory"] = True
        trainer_kwargs['print_nan_grads'] = False
        if not args.no_save:
            profile_path = os.path.join(results_path, "profiler.log")
            print("Profiling to ", profile_path)
            trainer_kwargs["profiler"] = AdvancedProfiler(output_filename=profile_path)
        else:
            trainer_kwargs["profiler"] = AdvancedProfiler()

    # set GPU availability
    if not torch.cuda.is_available():
        trainer_kwargs['gpus'] = 0

    trainer = pl.Trainer(**trainer_kwargs)

    return trainer, trainer_kwargs, results_path


def train_model(args, trial=None):
    '''
    Train a single model whose hyperparameters are specified in the run config