Example #1
def test_simple_profiler_iterable_durations(tmpdir, action: str,
                                            expected: list):
    """Ensure the reported durations are reasonably accurate."""
    def _sleep_generator(durations):
        """the profile_iterable method needs an iterable in which we can ensure that we're properly timing how long
        it takes to call __next__"""
        for duration in durations:
            time.sleep(duration)
            yield duration

    def _get_python_cprofile_total_duration(profile):
        return sum(x.inlinetime for x in profile.getstats())

    simple_profiler = SimpleProfiler()
    iterable = _sleep_generator(expected)

    with pytest.deprecated_call(
            match=
            "`SimpleProfiler.profile_iterable` is deprecated in v1.6 and will be removed in v1.8."
    ):
        for _ in simple_profiler.profile_iterable(iterable, action):
            pass

    # we exclude the last item in the recorded durations since that's when StopIteration is raised
    np.testing.assert_allclose(simple_profiler.recorded_durations[action][:-1],
                               expected,
                               rtol=0.2)

    advanced_profiler = AdvancedProfiler(dirpath=tmpdir, filename="profiler")

    iterable = _sleep_generator(expected)

    with pytest.deprecated_call(
            match=
            "`AdvancedProfiler.profile_iterable` is deprecated in v1.6 and will be removed in v1.8."
    ):
        for _ in advanced_profiler.profile_iterable(iterable, action):
            pass

    recorded_total_duration = _get_python_cprofile_total_duration(
        advanced_profiler.profiled_actions[action])
    expected_total_duration = np.sum(expected)
    np.testing.assert_allclose(recorded_total_duration,
                               expected_total_duration,
                               rtol=0.2)
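Since profile_iterable is deprecated (v1.6, removal in v1.8), the same per-__next__ timing can be reproduced with the plain profile() context manager used in the later examples. A minimal, self-contained sketch, assuming only SimpleProfiler from pytorch_lightning.profiler; the action name "fetch_next" and the sleep durations are illustrative:

import time

from pytorch_lightning.profiler import SimpleProfiler


def sleep_generator(durations):
    for duration in durations:
        time.sleep(duration)
        yield duration


profiler = SimpleProfiler()
iterator = sleep_generator([0.1, 0.2, 0.4])
while True:
    try:
        # time only the __next__ call, mirroring what profile_iterable did
        with profiler.profile("fetch_next"):
            next(iterator)
    except StopIteration:
        break

# the last recorded entry is the near-zero call that raised StopIteration
print(profiler.recorded_durations["fetch_next"][:-1])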
Example #2
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    setup_seed(cfg.random_seed)

    model = LightningModel(cfg)

    checkpoint_callback = ModelCheckpoint(
        filepath=f"{cfg.checkpoint_path}/{cfg.name}/{cfg.version}/"
        f"{cfg.name}_{cfg.version}_{{epoch}}_{{avg_val_loss:.3f}}_{{ade:.3f}}_{{fde:.3f}}_{{fiou:.3f}}",
        save_last=True,
        save_top_k=8,
        verbose=True,
        monitor='fiou',
        mode='max',
        prefix='')

    lr_logger_callback = LearningRateLogger(logging_interval='step')

    logger = TensorBoardLogger(save_dir=cfg.log_path,
                               name=cfg.name,
                               version=cfg.version)
    logger.log_hyperparams(model.hparams)

    profiler = SimpleProfiler() if cfg.simple_profiler else AdvancedProfiler()
    check_val_every_n_epoch = getattr(cfg, 'check_val_every_n_epoch', 1)

    trainer = pl.Trainer(
        gpus=cfg.num_gpus,
        max_epochs=cfg.max_epochs,
        logger=logger,
        profiler=profiler,  # the profiler will not work in a multi-GPU setting
        weights_summary="top",
        gradient_clip_val=cfg.gradient_clip_val,
        callbacks=[lr_logger_callback],
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        accumulate_grad_batches=cfg.batch_size_times,
        check_val_every_n_epoch=check_val_every_n_epoch)

    # train by default when neither --train nor --test is passed
    if args.train or not args.test:
        shutil.copy(
            args.config,
            os.path.join(cfg.log_path, cfg.name, cfg.version,
                         args.config.split('/')[-1]))

        if cfg.load_from_checkpoint is not None:
            model_ckpt = partial_state_dict(model, cfg.load_from_checkpoint)
            model.load_state_dict(model_ckpt)
        trainer.fit(model)

    if args.test:
        if cfg.test_checkpoint is not None:
            model_ckpt = partial_state_dict(model, cfg.test_checkpoint)
            model.load_state_dict(model_ckpt)
        trainer.test(model)
Example #3
def main(args):
    print(args)
    if args.load_from_checkpoint is None:
        raise ValueError('`load-from-checkpoint` should be specified.')

    model = TripletVAE(args.load_from_checkpoint,
                       n_hidden=args.n_hidden,
                       n_layers=args.n_layers,
                       learning_rate=args.learning_rate,
                       vae_learning_rate=args.vae_lr,
                       scheduler=args.scheduler)

    print(model)
    if args.profile:
        profiler = AdvancedProfiler()
    else:
        profiler = None

    dm = TripletDataModule(args.train_biom,
                           args.test_biom,
                           args.val_biom,
                           metadata=args.sample_metadata,
                           batch_category=args.batch_category,
                           class_category=args.class_category,
                           segment_triples=args.segment_triples,
                           batch_size=args.batch_size,
                           num_workers=args.num_workers)
    ckpt_path = os.path.join(args.output_directory, "checkpoints")
    checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path,
                                          period=1,
                                          monitor='val/triplet_loss',
                                          mode='min',
                                          verbose=True)

    os.mkdir(args.output_directory)
    tb_logger = pl_loggers.TensorBoardLogger(f'{args.output_directory}/logs/')
    # save hyper-parameters to yaml file
    with open(f'{args.output_directory}/hparams.yaml', 'w') as outfile:
        yaml.dump(model._hparams, outfile, default_flow_style=False)

    trainer = Trainer(max_epochs=args.epochs,
                      gpus=args.gpus,
                      check_val_every_n_epoch=10,
                      gradient_clip_val=args.grad_clip,
                      profiler=profiler,
                      logger=tb_logger,
                      callbacks=[checkpoint_callback])

    trainer.fit(model, dm)
    ckpt_path = args.output_directory + '/last_ckpt.pt'
    trainer.save_checkpoint(ckpt_path)

    # Perform KNN classification
    batch = next(iter(dm.test_dataloader()))
    res = model.test_step(batch, 0)['test/knn_results']
    with open(f'{args.output_directory}/cross_validation.csv', 'w') as outfile:
        outfile.write(res)
Example #4
def test_advanced_profiler():
    def _get_duration(profile):
        return sum([x.totaltime for x in profile.getstats()])

    p = AdvancedProfiler()

    with p.profile("a"):
        time.sleep(3)

    with p.profile("a"):
        time.sleep(1)

    with p.profile("b"):
        time.sleep(2)

    with p.profile("c"):
        time.sleep(1)

    # different environments have different precision when it comes to time.sleep()
    # see: https://github.com/PyTorchLightning/pytorch-lightning/issues/796
    a_duration = _get_duration(p.profiled_actions["a"])
    np.testing.assert_allclose(a_duration, [4], rtol=0.2)
    b_duration = _get_duration(p.profiled_actions["b"])
    np.testing.assert_allclose(b_duration, [2], rtol=0.2)
    c_duration = _get_duration(p.profiled_actions["c"])
    np.testing.assert_allclose(c_duration, [1], rtol=0.2)
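The profiled_actions dict maps each action name to a cProfile.Profile object, so the raw stats can be summarised directly. A small sketch reusing p from the test above and only the getstats()/totaltime calls already shown:

for action, prof in p.profiled_actions.items():
    entries = prof.getstats()
    total = sum(entry.totaltime for entry in entries)
    print(f"{action}: {total:.3f}s across {len(entries)} profiled calls")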
Example #5
def train(omegaConf: DictConfig) -> LightningModule:
    # Misc part
    if omegaConf['runner']['verbose'] is True:
        print(OmegaConf.to_yaml(omegaConf))

    pl.seed_everything(omegaConf['runner']['seed'])

    # Runner part
    runner = make_runner(omegaConf['runner'])

    if "auto_lr_find" in omegaConf['trainer'] and omegaConf['trainer']['auto_lr_find'] is True:
        runner = custom_lr_finder(runner, omegaConf)

    # When we are here, the omegaConf has already been checked by OmegaConf
    # so we can extract primitives to use with other libs
    config = OmegaConf.to_container(omegaConf)
    assert isinstance(config, dict)

    config['trainer']['default_root_dir'] = check_default_root_dir(config)

    config['trainer']['checkpoint_callback'] = build_checkpoint_callback(config)

    if 'logger' in config['trainer']:
        config['trainer']['logger'] = build_logger(config)

    if 'deterministic' in config['trainer']:
        config['trainer']['deterministic'] = True

    if 'profiler' in config['trainer'] and config['trainer']['profiler'] is True:
        config['trainer']['profiler'] = AdvancedProfiler()

    if 'scheduler' in config['runner'] and config['runner']['scheduler'] is not None:
        lr_monitor = LearningRateLogger(logging_interval='step')
        config['trainer']['callbacks'] = [lr_monitor]

    # ###
    # # Early stopping
    # # It is breaking neptune logging somehow, it seems that it overrides by 1 the current timestep
    # ###
    # early_stop_callback = EarlyStopping(
    #     monitor='val_accuracy', min_delta=0.00, patience=10, verbose=False, mode='max'
    # )
    # config['trainer']['early_stop_callback'] = early_stop_callback

    trainer = pl.Trainer(**config['trainer'])
    trainer.fit(runner)

    return runner
Example #6
def create_profiler(profiler_params, checkpoint_path):
    if profiler_params is None:
        return None
    else:
        if profiler_params.save_profile:
            output_filename = checkpoint_path / 'profile.log'
        else:
            output_filename = None

        if profiler_params.name == 'simple':
            return SimpleProfiler(output_filename)
        elif profiler_params.name == 'advanced':
            return AdvancedProfiler(output_filename)
        else:
            raise ValueError(
                'Given type of profiler is not supported. Use `simple` or `advanced`'
            )
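A hypothetical call site for this factory; SimpleNamespace and the checkpoint path are illustrative stand-ins for whatever config objects the project actually uses:

from pathlib import Path
from types import SimpleNamespace

profiler_params = SimpleNamespace(name='advanced', save_profile=True)
profiler = create_profiler(profiler_params, Path('./checkpoints'))
# trainer = Trainer(profiler=profiler, ...)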
Example #7
def main(args):
    seed_everything(args.seed)
    model = NNCF(args)

    checkpoint_callback = ModelCheckpoint(
        filepath='./checkpoints/nncf_{step}-{val_loss:.3f}',
        save_top_k=-1,
        verbose=True,
        monitor='val_acc',
        mode='max',
        prefix='',
        period=1,
    )

    tb_logger = TensorBoardLogger(save_dir=os.getcwd(),
                                  version=1,
                                  name='lightning_logs')

    trainer = pl.Trainer(
        fast_dev_run=True if args.dev_run else False,
        weights_summary=args.weights_summary,
        num_sanity_val_steps=args.num_val_sanity,
        gpus=args.gpus,
        distributed_backend='dp',
        benchmark=True,
        amp_level='O1',
        precision=16 if args.fp16 else 32,
        deterministic=False,
        accumulate_grad_batches=args.accum_batches,
        auto_lr_find=True if args.auto_lr else False,
        checkpoint_callback=checkpoint_callback,
        # early_stop_callback=early_stop,
        # callbacks=callbacks,
        gradient_clip_val=args.grad_clip_val,
        limit_val_batches=args.limit_val_batches,
        # max_steps=args.num_steps,
        max_epochs=args.num_epochs,
        val_check_interval=args.val_check_interval,
        profiler=AdvancedProfiler(
            output_filename='profile_report.txt') if args.profile else None,
        track_grad_norm=2 if args.track_grads else -1,
        logger=tb_logger)

    trainer.fit(model)
Example #8
def process(args):
    seed_everything(2299)
    dict_args = vars(args)
    if args.pretrained_model is not None:
        pm_name = Path(args.pretrained_model).parents[1].name
        version = f"{args.name}_{args.exp}_{pm_name}_fold-{args.fold}"
    else:
        version = f"{args.name}_{args.exp}_fold-{args.fold}"
    # Data
    dm = CTRPDataModule.from_argparse_args(args)
    # Model
    if args.exp == 'vanilla':
        model = StandardNetwork(**dict_args)
    elif args.exp == 'transformer':
        model = TransformerNetwork(**dict_args)
    else:
        model = ConditionalNetwork(**dict_args)
    # Callbacks
    logger = TensorBoardLogger(save_dir=args.default_root_dir,
                               version=version,
                               name='lightning_logs')
    early_stop = EarlyStopping(monitor='val_loss',
                               min_delta=0.001,
                               patience=20,
                               verbose=False,
                               mode='min')
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min')
    profiler = AdvancedProfiler(filename='profile')
    # NOTE: immediately overridden below; passing the string 'simple' makes the
    # Trainer build its own SimpleProfiler, so the AdvancedProfiler is unused
    profiler = 'simple'
    # Trainer
    start = datetime.now()
    trainer = Trainer.from_argparse_args(
        args,
        default_root_dir=logger.log_dir,
        logger=logger,
        callbacks=[early_stop, checkpoint_callback],
        profiler=profiler)
    trainer.fit(model, dm)
    print("Completed fold {} in {}".format(args.fold,
                                           str(datetime.now() - start)))

    return
Example #9
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    setup_seed(cfg.random_seed)

    model = LightningTransformer(cfg)

    checkpoint_callback = ModelCheckpoint(filepath=os.path.join(
        cfg.checkpoint_path, cfg.name, cfg.version,
        "{}_{}_{{epoch}}_{{val_loss_per_word}}".format(cfg.name, cfg.version)),
                                          save_last=True,
                                          save_top_k=8,
                                          verbose=True,
                                          monitor='val_loss_per_word',
                                          mode='min',
                                          prefix='')

    lr_logger_callback = LearningRateLogger(logging_interval='step')

    logger = TensorBoardLogger(save_dir=cfg.log_path,
                               name=cfg.name,
                               version=cfg.version)
    logger.log_hyperparams(model.hparams)

    profiler = SimpleProfiler() if cfg.simple_profiler else AdvancedProfiler()

    trainer = pl.Trainer(gpus=cfg.num_gpus,
                         max_epochs=cfg.max_epochs,
                         logger=logger,
                         profiler=profiler,
                         weights_summary="top",
                         callbacks=[lr_logger_callback],
                         checkpoint_callback=checkpoint_callback,
                         resume_from_checkpoint=cfg.resume_from_checkpoint,
                         accumulate_grad_batches=cfg.batch_size_times)

    if cfg.load_from_checkpoint is not None:
        ckpt = torch.load(cfg.load_from_checkpoint,
                          map_location=lambda storage, loc: storage)
        model.load_state_dict(ckpt['state_dict'])
    trainer.fit(model)
Example #10
    def _create_pl_profiler(self):
        # Only if an experiment directory exists
        if self.exp_main_dir:
            prof_out_file = os.path.join(self.cfg["setup_cfg"]["exp_main_dir"], "runtime_profiling.txt")
        else:
            return None

        if self.cfg["training_cfg"]["pl_which_profiler"].lower() == "simple":
            return SimpleProfiler(
                output_filename=prof_out_file,
                extended=True
            )
        elif self.cfg["train_cfg"]["pl_which_profiler"].lower() == "advanced":
            return AdvancedProfiler(
                output_filename=prof_out_file,
                line_count_restriction=1.0
            )
        elif self.cfg["train_cfg"]["pl_which_profiler"].lower() in ["none", ""]:
            return None
        else:
            raise NotImplementedError 
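Note that output_filename, extended and line_count_restriction belong to the older profiler constructors; later releases split the output location into dirpath/filename, as the fixtures further down do. A minimal sketch of the newer form, with prof_dir as an illustrative output directory:

from pytorch_lightning.profiler import AdvancedProfiler, SimpleProfiler

prof_dir = "./profiling"  # illustrative; any existing directory works
simple_profiler = SimpleProfiler(dirpath=prof_dir, filename="runtime_profiling")
advanced_profiler = AdvancedProfiler(dirpath=prof_dir, filename="runtime_profiling")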
Example #11
    def on_trainer_init(self, profiler: Union[BaseProfiler, bool, str]):

        if profiler and not isinstance(profiler, (bool, str, BaseProfiler)):
            # TODO: Update exception on removal of bool
            raise MisconfigurationException("Only None, bool, str and subclasses of `BaseProfiler` "
                                            "are valid values for `Trainer`'s `profiler` parameter. "
                                            f"Received {profiler} which is of type {type(profiler)}.")

        if isinstance(profiler, bool):
            rank_zero_warn("Passing a bool value as a `profiler` argument to `Trainer` is deprecated"
                           " and will be removed in v1.3. Use str ('simple' or 'advanced') instead.",
                           DeprecationWarning)
            if profiler:
                profiler = SimpleProfiler()
        elif isinstance(profiler, str):
            profiler = profiler.lower()
            if profiler == "simple":
                profiler = SimpleProfiler()
            elif profiler == "advanced":
                profiler = AdvancedProfiler()
            else:
                raise ValueError("When passing string value for the `profiler` parameter of"
                                 " `Trainer`, it can only be 'simple' or 'advanced'")
        self.trainer.profiler = profiler or PassThroughProfiler()
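For reference, the profiler arguments this resolution logic accepts (matching the parametrized test near the end of these examples); a sketch assuming default Trainer construction:

from pytorch_lightning import Trainer
from pytorch_lightning.profiler import AdvancedProfiler

Trainer(profiler="simple")            # str -> SimpleProfiler
Trainer(profiler="advanced")          # str -> AdvancedProfiler
Trainer(profiler=AdvancedProfiler())  # any BaseProfiler instance is used as-is
Trainer()                             # no profiler -> PassThroughProfiler
Trainer(profiler=True)                # bool still works here but is deprecated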
Example #12
def test_advanced_profiler():
    def get_duration(profile):
        return sum([x.totaltime for x in profile.getstats()])

    p = AdvancedProfiler()

    with p.profile("a"):
        time.sleep(3)

    with p.profile("a"):
        time.sleep(1)

    with p.profile("b"):
        time.sleep(2)

    with p.profile("c"):
        time.sleep(1)

    a_duration = get_duration(p.profiled_actions["a"])
    np.testing.assert_almost_equal(a_duration, [4], decimal=1)
    b_duration = get_duration(p.profiled_actions["b"])
    np.testing.assert_almost_equal(b_duration, [2], decimal=1)
    c_duration = get_duration(p.profiled_actions["c"])
    np.testing.assert_almost_equal(c_duration, [1], decimal=1)
Example #13
def main(gpus, nodes, fast_dev_run, mixed_precision, project_config, hparams):
    torch.manual_seed(0)
    np.random.seed(0)

    # init module
    model = MonoSemiSupDepth_Packnet(hparams)

    # tags associated to the run
    def shape_format(shape):
        # shape = [Height, Width]
        return f"{shape[1]}x{shape[0]}"

    list_of_tags = [
        hparams.model.depth_net.name,
        hparams.model.pose_net.name,
        hparams.optimizer.name,
        hparams.scheduler.name,
        {1: 'gray', 3: 'rgb'}[hparams.input_channels],
        f"train-{shape_format(hparams.datasets.train.data_transform_options.image_shape)}",
        f"val-{shape_format(hparams.datasets.val.data_transform_options.image_shape)}",
        f"test-{shape_format(hparams.datasets.test.data_transform_options.image_shape)}",
    ]
    if mixed_precision:
        # += with a string would extend the list one character at a time
        list_of_tags.append('mixed_precision')

    base_output_dir = Path(project_config.output_dir)
    experiment_output_dir = base_output_dir / project_config.project_name / project_config.experiment_name
    experiment_output_dir.mkdir(parents=True, exist_ok=True)

    wandb_output_dir = str(experiment_output_dir)
    wandb_logger = WandbLogger(
        project = project_config.project_name,
        save_dir=wandb_output_dir, # the path to a directory where artifacts will be written
        log_model=True,
        tags=list_of_tags
    )
    #wandb_logger.watch(model, log='all', log_freq=5000) # watch model's gradients and params

    run_output_dir = experiment_output_dir / f'{wandb_logger.experiment.id}'
    run_output_dir.mkdir(parents=True, exist_ok=True)
    run_output_dir = str(run_output_dir)

    checkpoint_callback = ModelCheckpoint(
        filepath=run_output_dir + '/{epoch:04d}-{val-abs_rel:.5f}', # saves a file like: my/path/epoch=2-abs_rel=0.0115.ckpt
        save_top_k=3,
        verbose=True,
        monitor='val-abs_rel',
        mode='min',
    )

    lr_logger = LearningRateLogger()


    if mixed_precision:
        amp_level = 'O1'  # apex opt level: letter O, not zero (unused while the amp args below stay commented out)
        precision = 16

    if gpus > 1:
        distributed_backend = 'ddp'
    else:
        distributed_backend = None

    profiler = False
    if fast_dev_run:
        from pytorch_lightning.profiler import AdvancedProfiler
        profiler = AdvancedProfiler(output_filename='./profiler.log')

    trainer = Trainer(
        gpus=gpus,
        distributed_backend=distributed_backend,
        nb_gpu_nodes=nodes,
        checkpoint_callback=checkpoint_callback,
        callbacks=[lr_logger],
        logger=wandb_logger,
        fast_dev_run=fast_dev_run,
        profiler=profiler,
        early_stop_callback=False,
        #amp_level='O1',
        #precision=16,
        **hparams.trainer
    )
    trainer.fit(model)
    trainer.test(model)
Example #14
def advanced_profiler(tmpdir):
    return AdvancedProfiler(dirpath=tmpdir, filename="profiler")
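A sketch of how a test might consume the fixture above; the action name is illustrative and the assertion reuses the profiled_actions attribute seen in the earlier tests:

def test_profile_records_action(advanced_profiler):
    with advanced_profiler.profile("my_action"):
        pass
    assert "my_action" in advanced_profiler.profiled_actions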
Example #15
def main(hparams):
    # init data
    dm = TripleEmbeddingDataModule(hparams)
    ## identify input_size
    hparams.model.input_size = dm.dim

    # model
    model = Distiller(hparams)

    # early stop

    # logger
    # log_dir = str(root_dir / "lightning_logs")/
    # tt_logger = loggers.TestTubeLogger("tb_logs", name=hparams.experiment_name)
    # tb_logger = loggers.TensorBoardLogger("tb_logs")

    # init logger
    source_files_path = str(Path(hydra.utils.get_original_cwd()) / "**/*.py")
    tags = generate_tags(hparams)
    log_params = flatten_params(hparams)
    close_after_fit = not hparams.train.upload_checkpoints
    neptune_logger = NeptuneLogger(
        experiment_name="_".join(tags),
        project_name=f"kjang0517/{hparams.dataset.name}",
        params=log_params,  # Optional,
        tags=tags,  # Optional,
        close_after_fit=close_after_fit,
        upload_source_files=[source_files_path],
    )

    # class SaveHparamsCallback(pl.Callback):
    #     def on_sanity_check_start(self, trainer, pl_module):
    #         # save hparams
    #         hparams_str = trainer.model.hparams.pretty()
    #         hparams_path = Path(trainer.ckpt_path) / "hparams.yaml"
    #         with hparams_path.open("w", encoding="utf-8") as f:
    #             f.write(hparams_str)
    #         trainer.logger.experiment.log_artifact(str(hparams_path))

    # Callbacks
    callbacks = [PostTrainCallback(), LearningRateLogger()]

    # Callbacks: Early stop
    early_stop_callback = True
    # if hparams.train.use_early_stop:
    #     patience = hparams.train.early_stop_patience
    #     early_stop_callback = EarlyStopping(
    #         monitor="val_early_stop_on", patience=patience, verbose=True, mode="min"
    #     )
    # else:
    #     early_stop_callback = None

    # use profiler
    profiler = AdvancedProfiler() if hparams.train.profile else None

    # train
    # trainer = Trainer.from_argparse_args(hparams)
    trainer = Trainer(
        # default_root_dir=root_dir,
        max_epochs=hparams.train.max_epochs,
        gpus=hparams.train.gpus,
        distributed_backend=hparams.train.distributed_backend,
        fast_dev_run=hparams.train.fast_dev_run,
        amp_level=hparams.train.amp_level,
        precision=hparams.train.precision,
        train_percent_check=hparams.train.train_percent_check,
        val_percent_check=hparams.train.val_percent_check,
        # **hparams.train,
        benchmark=True,
        profiler=profiler,
        logger=neptune_logger,
        early_stop_callback=early_stop_callback,
        callbacks=callbacks,
        # deterministic=True,
    )

    # train
    trainer.fit(model, dm)
Example #16
def advanced_profiler(tmpdir):
    profiler = AdvancedProfiler(
        output_filename=os.path.join(tmpdir, "profiler.txt"))
    return profiler
Example #17
def advanced_profiler():
    profiler = AdvancedProfiler()
    return profiler
Example #18
def main():
    torch.set_printoptions(threshold=100,
                           edgeitems=50,
                           precision=8,
                           sci_mode=False)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=str,
        default=None,
        help="Path to (.yml) config file if running new experiment.")
    parser.add_argument(
        "--log-checkpoint",
        type=str,
        default=None,
        help=
        "Training log path with the config and checkpoints to resume the experiment.",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        default="model_last.ckpt",
        help="Resume training from the latest checkpoint by default.",
    )
    parser.add_argument("--run-name",
                        type=str,
                        default="default",
                        help="Name of the training log run")
    parser.add_argument(
        "--gpus",
        type=int,
        default=1,
        help="Amount of Gpus that should be used(In most cases leave at 1)",
    )
    parser.add_argument(
        "--precision",
        type=int,
        default=32,
        help=
        "Full precision (32) by default; half precision (16) speeds up training on newer devices.",
    )
    parser.add_argument(
        "--deterministic",
        action="store_true",
        default=False,
        help="Run deterministic training, useful for experimenting")
    parser.add_argument("--use-profiler",
                        action="store_true",
                        default=False,
                        help="Run profiler for the training set")
    config_args = parser.parse_args()

    # Log path
    path_parser = PathParser()
    cfg, logger = path_parser.parse(config_args.config,
                                    config_args.log_checkpoint,
                                    config_args.run_name,
                                    config_args.checkpoint,
                                    create_logger=True)

    # # (Optional:) enable this to track autograd issues when debugging
    # torch.autograd.set_detect_anomaly(True)
    if config_args.deterministic:
        seed_everything(cfg.experiment.randomseed)

    # Create model
    model = getattr(models, cfg.experiment.model)(cfg)

    # Model checkpoint generator
    checkpoint_callback = ModelCheckpoint(filepath=path_parser.checkpoint_dir,
                                          save_top_k=3,
                                          save_last=True,
                                          verbose=True,
                                          monitor="val_loss",
                                          mode="min",
                                          prefix="model_")

    # Trainer callbacks
    logger_callback = LoggerCallback(cfg)

    # Optional profiler
    profiler = None
    if config_args.use_profiler:
        profiler = AdvancedProfiler(output_filename="report.txt",
                                    line_count_restriction=.4)

    trainer = Trainer(weights_summary=None,
                      resume_from_checkpoint=path_parser.checkpoint_path,
                      gpus=config_args.gpus,
                      default_root_dir=path_parser.log_dir,
                      logger=logger,
                      num_sanity_val_steps=0,
                      checkpoint_callback=checkpoint_callback,
                      row_log_interval=1,
                      log_gpu_memory=None,
                      precision=config_args.precision,
                      profiler=profiler,
                      fast_dev_run=False,
                      deterministic=config_args.deterministic,
                      progress_bar_refresh_rate=0,
                      accumulate_grad_batches=1,
                      callbacks=[logger_callback])

    if config_args.log_checkpoint is not None:
        # Add log props
        logger.experiment.add_text("description", cfg.experiment.description,
                                   0)
        logger.experiment.add_text("config",
                                   f"\t{cfg.dump()}".replace("\n", "\n\t"), 0)
        logger.experiment.add_text(
            "params",
            f"\t{ModelSummary(model, mode='full')}".replace("\n", "\n\t"), 0)

    trainer.fit(model)

    print("Done!")
Example #19
    def on_epoch_start(self):
        print(">>>>>>>>>>>>>>>>>>>>> on_epoch_start")
        self.start_time = time.time()
        self.nepochs += 1

    def on_epoch_end(self):
        print(">>>>>>>>>>>>>>>>>>>>> on_epoch_end1")
        epoch_time = time.time() - self.start_time

        self.total_time += epoch_time
        # print(">>>>>>>>>>>>>>>>>>>>> on_epoch_end2", self.nepochs)
        print("Epoch Time taken: ", epoch_time, self.total_time / self.nepochs)


model = grammarTransformer()
profiler = AdvancedProfiler()

start_time = time.time()

if config.GPUS == 1:
    trainer = pl.Trainer(max_epochs=config.MAX_EPOCHS,
                         gpus=config.GPUS,
                         precision=config.PRECISION)
    # trainer = pl.Trainer(max_epochs=config.MAX_EPOCHS, gpus=config.GPUS, profiler=profiler)
    # trainer = pl.Trainer(max_epochs=config.MAX_EPOCHS, gpus=config.GPUS, profiler=True)
elif config.GPUS == 0:
    trainer = pl.Trainer(max_epochs=config.MAX_EPOCHS,
                         precision=config.PRECISION)
else:
    trainer = pl.Trainer(max_epochs=config.MAX_EPOCHS,
                         gpus=config.GPUS,
Example #20
def main(args):
    if args.load_from_checkpoint is not None:
        model = MultVAE.load_from_checkpoint(args.load_from_checkpoint)
    else:
        n_input = load_table(args.val_biom).shape[0]
        model = MultVAE(n_input,
                        n_latent=args.n_latent,
                        n_hidden=args.n_hidden,
                        basis=args.basis,
                        dropout=args.dropout,
                        bias=args.bias,
                        tss=args.tss,
                        distribution=args.distribution,
                        batch_norm=args.batch_norm,
                        encoder_depth=args.encoder_depth,
                        learning_rate=args.learning_rate,
                        scheduler=args.scheduler,
                        transform=args.transform,
                        overdispersion=args.overdispersion,
                        grassmannian=args.grassmannian)

    print(args)
    print(model)
    if args.eigvectors is not None and args.eigvalues is not None:
        eigvectors = np.loadtxt(args.eigvectors)
        eigvalues = np.loadtxt(args.eigvalues)
        model.set_eigs(eigvectors, eigvalues)
    if args.profile:
        profiler = AdvancedProfiler()
    else:
        profiler = None
    dm = BiomDataModule(args.train_biom,
                        args.test_biom,
                        args.val_biom,
                        metadata=args.sample_metadata,
                        batch_category=args.batch_category,
                        batch_size=args.batch_size,
                        num_workers=args.num_workers)

    ckpt_path = os.path.join(args.output_directory, "checkpoints")
    checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path,
                                          period=1,
                                          monitor='val_loss',
                                          mode='min',
                                          verbose=True)

    os.mkdir(args.output_directory)
    tb_logger = pl_loggers.TensorBoardLogger(f'{args.output_directory}/logs/')
    # save hyper-parameters to yaml file
    with open(f'{args.output_directory}/hparams.yaml', 'w') as outfile:
        yaml.dump(model._hparams, outfile, default_flow_style=False)
    # save tree to file if specified
    if os.path.exists(args.basis):
        tree = TreeNode.read(args.basis)
        tree.write(f'{args.output_directory}/tree.nwk')

    trainer = Trainer(max_epochs=args.epochs,
                      gpus=args.gpus,
                      stochastic_weight_avg=False,
                      auto_scale_batch_size='binsearch',
                      check_val_every_n_epoch=10,
                      gradient_clip_val=args.grad_clip,
                      profiler=profiler,
                      logger=tb_logger,
                      callbacks=[checkpoint_callback])

    trainer.fit(model, dm)
    trainer.save_checkpoint(args.output_directory + '/last_ckpt.pt')
Example #21
def main(project_config, hparams):
    torch.manual_seed(0)
    np.random.seed(0)

    # init module
    model = MonocularSemiSupDepth(hparams)

    # tags associated to the run
    def shape_format(shape):
        # shape = [Height, Width]
        return f"{shape[1]}x{shape[0]}"

    #assert hparams.metrics.use_gt_scale != hparams.datasets.train.load_pose, f"Either velocity or gt scaled"

    base_output_dir = Path(
        project_config.output_dir) / project_config.project_name

    logs_dir = base_output_dir / 'logs'
    logs_dir.mkdir(parents=True, exist_ok=True)

    experiment_output_dir = base_output_dir / 'outputs' / project_config.experiment_name

    assert hparams.logger in ['wandb', 'tensorboard']

    if hparams.logger == 'tensorboard':
        experiment_logger = TensorBoardLogger(
            save_dir=logs_dir, name=project_config.experiment_name)

        run_output_dir = experiment_output_dir / f'version_{experiment_logger.version}'

    elif hparams.logger == 'wandb':

        list_of_tags = [
            f"{hparams.model.depth_net.name} DepthNet",
            f"{hparams.model.pose_net.name} PoseNet",
            hparams.optimizer.name,
            hparams.scheduler.name,
            {
                1: 'gray',
                3: 'rgb'
            }[hparams.input_channels],
            f"train-{shape_format(hparams.datasets.train.data_transform_options.image_shape)}",
            f"val-{shape_format(hparams.datasets.val.data_transform_options.image_shape)}",
            f"test-{shape_format(hparams.datasets.test.data_transform_options.image_shape)}",
        ]
        if project_config.mixed_precision:
            list_of_tags.append('mixed_precision')

        losses = list(hparams.losses.keys())
        if 'supervised_loss_weight' in losses:
            losses.remove('supervised_loss_weight')
        list_of_tags += losses

        experiment_logger = WandbLogger(
            project=project_config.project_name,
            save_dir=
            logs_dir,  # the path to a directory where artifacts will be written
            log_model=True,
            tags=list_of_tags)
        #wandb_logger.watch(model, log='all', log_freq=5000) # watch model's gradients and params

        run_output_dir = experiment_output_dir / f'version_{experiment_logger.experiment.id}'

    else:
        run_output_dir = experiment_output_dir / 'no_version_system'

    run_output_dir.mkdir(parents=True, exist_ok=True)
    run_output_dir = str(run_output_dir)

    checkpoint_callback = ModelCheckpoint(
        filepath=run_output_dir +
        '/{epoch:04d}-{val-rmse_log:.5f}',  # saves a file like: my/path/epoch=2-abs_rel=0.0115.ckpt
        save_top_k=3,
        verbose=True,
        monitor='val-rmse_log',
        mode='min',
    )

    lr_logger = LearningRateLogger()

    if project_config.mixed_precision:
        amp_level = 'O1'  # apex opt level: letter O, not zero (unused while the amp args below stay commented out)
        precision = 16

    if project_config.gpus > 1:
        distributed_backend = 'ddp'
    else:
        distributed_backend = None

    profiler = False
    if project_config.fast_dev_run:
        from pytorch_lightning.profiler import AdvancedProfiler
        profiler = AdvancedProfiler(output_filename='./profiler.log')

    trainer = Trainer(
        gpus=project_config.gpus,
        distributed_backend=distributed_backend,
        num_nodes=project_config.nodes,
        checkpoint_callback=checkpoint_callback,
        callbacks=[lr_logger],
        logger=experiment_logger,
        fast_dev_run=project_config.fast_dev_run,
        profiler=profiler,
        early_stop_callback=False,
        #amp_level='O1',
        #precision=16,
        **hparams.trainer)
    trainer.fit(model)
    trainer.test(model)
Example #22
def main(args):
    if args.load_from_checkpoint is not None:
        model = MultBatchVAE.load_from_checkpoint(args.load_from_checkpoint)
    else:
        table = load_table(args.train_biom)
        n_input = table.shape[0]
        sample_metadata = pd.read_table(args.sample_metadata, dtype=str)
        sample_metadata = sample_metadata.set_index(sample_metadata.columns[0])
        sample_metadata = sample_metadata.loc[table.ids()]
        n_batches = len(sample_metadata[args.batch_category].value_counts())
        model = MultBatchVAE(n_input,
                             args.batch_prior,
                             n_batches,
                             n_latent=args.n_latent,
                             n_hidden=args.n_hidden,
                             basis=args.basis,
                             dropout=args.dropout,
                             bias=args.bias,
                             batch_norm=args.batch_norm,
                             encoder_depth=args.encoder_depth,
                             learning_rate=args.learning_rate,
                             vae_lr=args.vae_lr,
                             scheduler=args.scheduler,
                             transform=args.transform,
                             grassmannian=args.grassmannian)
        if args.load_vae_weights is not None:
            # initialize encoder/decoder weights with pretrained VAE
            other_model = MultVAE.load_from_checkpoint(args.load_vae_weights)
            model.vae.encoder = other_model.vae.encoder
            model.vae.decoder = other_model.vae.decoder
            model.vae.log_sigma_sq = other_model.vae.log_sigma_sq
            model.vae.variational_logvars = other_model.vae.variational_logvars
            # Note that input_embed isn't handled here.

    print(args)
    print(model)
    if args.eigvectors is not None and args.eigvalues is not None:
        eigvectors = np.loadtxt(args.eigvectors)
        eigvalues = np.loadtxt(args.eigvalues)
        model.set_eigs(eigvectors, eigvalues)
    if args.profile:
        profiler = AdvancedProfiler()
    else:
        profiler = None
    dm = BiomDataModule(args.train_biom,
                        args.test_biom,
                        args.val_biom,
                        metadata=args.sample_metadata,
                        batch_category=args.batch_category,
                        batch_size=args.batch_size,
                        num_workers=args.num_workers)

    ckpt_path = os.path.join(args.output_directory, "checkpoints")
    checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path,
                                          period=1,
                                          monitor='val_loss',
                                          mode='min',
                                          verbose=True)

    os.mkdir(args.output_directory)
    tb_logger = pl_loggers.TensorBoardLogger(f'{args.output_directory}/logs/')
    # save hyper-parameters to yaml file
    with open(f'{args.output_directory}/hparams.yaml', 'w') as outfile:
        yaml.dump(model._hparams, outfile, default_flow_style=False)
    # save batch class mappings
    dm.batch_categories.to_csv(f'{args.output_directory}/batch_categories.txt',
                               sep='\t',
                               header=None)
    # save tree to file if specified
    if os.path.exists(args.basis):
        tree = TreeNode.read(args.basis)
        tree.write(f'{args.output_directory}/tree.nwk')

    trainer = Trainer(max_epochs=args.epochs,
                      gpus=args.gpus,
                      check_val_every_n_epoch=1,
                      gradient_clip_val=args.grad_clip,
                      profiler=profiler,
                      logger=tb_logger,
                      callbacks=[checkpoint_callback])

    trainer.fit(model, dm)
    trainer.save_checkpoint(args.output_directory + '/last_ckpt.pt')
Example #23
def setup_profiler(cfg):
    profiler = None
    if cfg.training.trainer.profiler:
        profiler = AdvancedProfiler(filename="profile.txt")
    return profiler
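A hypothetical call, assuming an OmegaConf-style config whose attribute path matches the one used above:

from omegaconf import OmegaConf

cfg = OmegaConf.create({"training": {"trainer": {"profiler": True}}})
profiler = setup_profiler(cfg)  # returns AdvancedProfiler(filename="profile.txt")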
Example #24
def train_task(init,
               close,
               exp_cfg_path,
               env_cfg_path,
               task_nr,
               logger_pass=None):
    seed_everything(42)
    local_rank = int(os.environ.get('LOCAL_RANK', 0))
    if local_rank != 0 or not init:
        print(init, local_rank)
        rm = exp_cfg_path.find('cfg/exp/') + len('cfg/exp/')
        exp_cfg_path = os.path.join(exp_cfg_path[:rm], 'tmp/',
                                    exp_cfg_path[rm:])

    exp = load_yaml(exp_cfg_path)
    env = load_yaml(env_cfg_path)

    if local_rank == 0 and init:
        # Set in name the correct model path
        if exp.get('timestamp', True):
            timestamp = datetime.datetime.now().replace(
                microsecond=0).isoformat()

            model_path = os.path.join(env['base'], exp['name'])
            p = model_path.split('/')
            model_path = os.path.join('/', *p[:-1],
                                      str(timestamp) + '_' + p[-1])
        else:
            model_path = os.path.join(env['base'], exp['name'])
            try:
                shutil.rmtree(model_path)
            except:
                pass
        # Create the directory
        if not os.path.exists(model_path):
            try:
                os.makedirs(model_path)
            except:
                print("Failed generating network run folder")
        else:
            print("Network run folder already exits")

        # Only copy config files for the main ddp-task
        exp_cfg_fn = os.path.split(exp_cfg_path)[-1]
        env_cfg_fn = os.path.split(env_cfg_path)[-1]
        print(f'Copy {env_cfg_path} to {model_path}/{exp_cfg_fn}')
        shutil.copy(exp_cfg_path, f'{model_path}/{exp_cfg_fn}')
        shutil.copy(env_cfg_path, f'{model_path}/{env_cfg_fn}')
        exp['name'] = model_path
    else:
        # the correct model path has already been written to the yaml file.
        model_path = os.path.join(exp['name'], f'rank_{local_rank}_{task_nr}')
        # Create the directory
        if not os.path.exists(model_path):
            try:
                os.makedirs(model_path)
            except:
                pass

    # if local_rank == 0 and env['workstation'] == False:
    #     cm = open(os.path.join(model_path, f'info{local_rank}_{task_nr}.log'), 'w')
    # else:
    #     cm = nullcontext()
    # with cm as f:
    #   if local_rank == 0 and env['workstation'] == False:
    #     cm2 = redirect_stdout(f)
    #   else:
    #     cm2 = nullcontext()
    #   with cm2:
    # # Setup logger for each ddp-task
    # logging.getLogger("lightning").setLevel(logging.DEBUG)
    # logger = logging.getLogger("lightning")
    # fh = logging.FileHandler( , 'a')
    # logger.addHandler(fh)

    # Copy Dataset from Scratch to Nodes SSD

    if env['workstation'] == False:
        # use proxy hack for neptunai !!!
        NeptuneLogger._create_or_get_experiment = _create_or_get_experiment2

        # move data to ssd
        if exp['move_datasets'][0]['env_var'] != 'none':
            for dataset in exp['move_datasets']:
                scratchdir = os.getenv('TMPDIR')
                env_var = dataset['env_var']
                tar = os.path.join(env[env_var], f'{env_var}.tar')
                name = (tar.split('/')[-1]).split('.')[0]

                if not os.path.exists(
                        os.path.join(scratchdir, dataset['env_var'])):

                    try:
                        cmd = f"tar -xvf {tar} -C $TMPDIR >/dev/null 2>&1"
                        st = time.time()
                        print(f'Start moving dataset-{env_var}: {cmd}')
                        os.system(cmd)
                        env[env_var] = str(os.path.join(scratchdir, name))
                        print(
                            f'Finished moving dataset-{env_var} in {time.time()-st}s'
                        )

                    except:
                        rank_zero_warn('ENV Var' + env_var)
                        env[env_var] = str(os.path.join(scratchdir, name))
                        rank_zero_warn('Copying data failed')
                else:
                    env[env_var] = str(os.path.join(scratchdir, name))
        else:
            env['mlhypersim'] = str(
                os.path.join(env['mlhypersim'], 'mlhypersim'))

    if (exp['trainer']).get('gpus', -1):
        nr = torch.cuda.device_count()
        exp['trainer']['gpus'] = nr
        print(f'Set GPU Count for Trainer to {nr}!')

    model = Network(exp=exp, env=env)

    lr_monitor = LearningRateMonitor(**exp['lr_monitor']['cfg'])

    if exp['cb_early_stopping']['active']:
        early_stop_callback = EarlyStopping(**exp['cb_early_stopping']['cfg'])
        cb_ls = [early_stop_callback, lr_monitor]
    else:
        cb_ls = [lr_monitor]

    tses = TaskSpecificEarlyStopping(
        nr_tasks=exp['task_generator']['total_tasks'],
        **exp['task_specific_early_stopping'])
    cb_ls.append(tses)
    if local_rank == 0:
        for i in range(exp['task_generator']['total_tasks']):
            if i == task_nr:
                m = '/'.join(
                    [a for a in model_path.split('/') if a.find('rank') == -1])

                dic = copy.deepcopy(exp['cb_checkpoint']['cfg'])
                # try:
                #   if len(exp['cb_checkpoint'].get('nameing',[])) > 0:
                #     #filepath += '-{task_name:10s}'
                #     for m in exp['cb_checkpoint']['nameing']:
                #       filepath += '-{'+ m + ':.2f}'
                # except:
                #   pass
                # dic['monitor'] += str(i)
                checkpoint_callback = ModelCheckpoint(
                    dirpath=m,
                    filename='task' + str(i) + '-{epoch:02d}--{step:06d}',
                    **dic)

                cb_ls.append(checkpoint_callback)

    params = log_important_params(exp)

    if env['workstation']:
        t1 = 'workstation'
    else:
        t1 = 'leonhard'

    # if local_rank == 0:
    cwd = os.getcwd()
    files = [
        str(p).replace(cwd + '/', '') for p in Path(cwd).rglob('*.py')
        if str(p).find('vscode') == -1
    ]
    files.append(exp_cfg_path)
    files.append(env_cfg_path)

    if not exp.get('offline_mode', False):
        # if exp.get('experiment_id',-1) == -1:
        #create new experiment_id and write back
        if logger_pass is None:
            logger = NeptuneLogger(
                api_key=os.environ["NEPTUNE_API_TOKEN"],
                project_name="jonasfrey96/asl",
                experiment_name=exp['name'].split('/')[-2] + "_" +
                exp['name'].split('/')[-1],  # Optional,
                params=params,  # Optional,
                tags=[
                    t1, exp['name'].split('/')[-2], exp['name'].split('/')[-1]
                ] + exp["tag_list"],  # Optional,
                close_after_fit=False,
                offline_mode=exp.get('offline_mode', False),
                upload_source_files=files,
                upload_stdout=False,
                upload_stderr=False)
            exp['experiment_id'] = logger.experiment.id
            print('created experiment id' + str(exp['experiment_id']))
        else:
            logger = logger_pass

        # else:
        # print('loaded experiment id' +  str( exp['experiment_id']))
        # TODO
        # logger = NeptuneLogger(
        #   api_key=os.environ["NEPTUNE_API_TOKEN"],
        #   project_name="jonasfrey96/asl",
        #   experiment_name= exp['name'].split('/')[-2] +"_"+ exp['name'].split('/')[-1], # Optional,
        #   params=params, # Optional,
        #   tags=[t1, exp['name'].split('/')[-2], exp['name'].split('/')[-1]] + exp["tag_list"], # Optional,
        #   close_after_fit = False,
        #   offline_mode = exp.get('offline_mode', False),
        #   upload_source_files=files,
        #   upload_stdout=False,
        #   upload_stderr=False
        # )

        # logger = NeptuneLogger(
        #   api_key=os.environ["NEPTUNE_API_TOKEN"],
        #   project_name="jonasfrey96/asl",
        #   experiment_id=exp.get('experiment_id',-1),
        #   close_after_fit = False,
        # )
        print('Neptune Experiment ID: ' + str(logger.experiment.id) +
              " TASK NR " + str(task_nr))
    else:
        logger = TensorBoardLogger(
            save_dir=model_path,
            name='tensorboard',  # Optional,
            default_hp_metric=params,  # Optional,
        )
    # else:
    #   logger = TensorBoardLogger(
    #       save_dir=model_path+'/rank/'+str(local_rank),
    #       name= exp['name'].split('/')[-2] +"_"+ exp['name'].split('/')[-1], # Optional,
    #   )

    weight_restore = exp.get('weights_restore', False)
    checkpoint_load = exp['checkpoint_load']

    if local_rank == 0 and init:
        # write back the exp file with the correct name set to the model_path!
        # other ddp-tasks don't need to care about timestamps
        # also store the path to last.ckpt so that downstream tasks can restore the model state
        exp['weights_restore_2'] = False
        exp['checkpoint_restore_2'] = True
        exp['checkpoint_load_2'] = os.path.join(model_path, 'last.ckpt')

        rm = exp_cfg_path.find('cfg/exp/') + len('cfg/exp/')
        exp_cfg_path = os.path.join(exp_cfg_path[:rm], 'tmp/',
                                    exp_cfg_path[rm:])
        Path(exp_cfg_path).parent.mkdir(parents=True, exist_ok=True)
        with open(exp_cfg_path, 'w+') as f:
            yaml.dump(exp, f, default_flow_style=False, sort_keys=False)

    if not init:
        # restore model state from previous task.
        exp['checkpoint_restore'] = exp['checkpoint_restore_2']
        exp['checkpoint_load'] = exp['checkpoint_load_2']
        exp['weights_restore'] = exp['weights_restore_2']

    # Use the AdvancedProfiler when profiling is enabled in the trainer config
    if exp['trainer'].get('profiler', False):
        exp['trainer']['profiler'] = AdvancedProfiler(
            output_filename=os.path.join(model_path, 'profile.out'))
    else:
        exp['trainer']['profiler'] = False

    # print( exp['trainer'] )
    # print(os.environ.get('GLOBAL_RANK'))
    if exp.get('checkpoint_restore', False):
        p = os.path.join(env['base'], exp['checkpoint_load'])
        trainer = Trainer(**exp['trainer'],
                          default_root_dir=model_path,
                          callbacks=cb_ls,
                          resume_from_checkpoint=p,
                          logger=logger)
    else:
        trainer = Trainer(**exp['trainer'],
                          default_root_dir=model_path,
                          callbacks=cb_ls,
                          logger=logger)

    if exp['weights_restore']:
        # it is not strict since the latent replay buffer is not always available
        p = os.path.join(env['base'], exp['checkpoint_load'])
        if os.path.isfile(p):
            res = model.load_state_dict(torch.load(
                p, map_location=lambda storage, loc: storage)['state_dict'],
                                        strict=False)
            print('Restoring weights: ' + str(res))
        else:
            raise Exception('Checkpoint not a file')

    main_visu = MainVisualizer(p_visu=os.path.join(model_path, 'main_visu'),
                               logger=logger,
                               epoch=0,
                               store=True,
                               num_classes=22)

    tc = TaskCreator(**exp['task_generator'],
                     output_size=exp['model']['input_size'])
    print(tc)
    _task_start_training = time.time()
    _task_start_time = time.time()

    for idx, out in enumerate(tc):
        if idx == task_nr:
            break

    if True:
        #for idx, out in enumerate(tc):
        task, eval_lists = out
        main_visu.epoch = idx
        # New Logger
        print(f'<<<<<<<<<<<< TASK IDX {idx} TASK NAME : ' + task.name +
              ' >>>>>>>>>>>>>')

        model._task_name = task.name
        model._task_count = idx
        dataloader_train, dataloader_buffer = get_dataloader_train(
            d_train=task.dataset_train_cfg, env=env, exp=exp)
        print(str(dataloader_train.dataset))
        print(str(dataloader_buffer.dataset))
        dataloader_list_test = eval_lists_into_dataloaders(eval_lists,
                                                           env=env,
                                                           exp=exp)
        print(f'<<<<<<<<<<<< All Datasets are loaded and set up >>>>>>>>>>>>>')
        #Training the model
        trainer.should_stop = False
        # print("GLOBAL STEP ", model.global_step)
        for d in dataloader_list_test:
            print(str(d.dataset))

        if idx < exp['start_at_task']:
            # trainer.limit_val_batches = 1.0
            trainer.limit_train_batches = 1
            trainer.max_epochs = 1
            trainer.check_val_every_n_epoch = 1
            train_res = trainer.fit(model=model,
                                    train_dataloader=dataloader_train,
                                    val_dataloaders=dataloader_list_test)

            trainer.max_epochs = exp['trainer']['max_epochs']
            trainer.check_val_every_n_epoch = exp['trainer'][
                'check_val_every_n_epoch']
            trainer.limit_val_batches = exp['trainer']['limit_val_batches']
            trainer.limit_train_batches = exp['trainer']['limit_train_batches']
        else:
            print('Train', dataloader_train)
            print('Val', dataloader_list_test)
            train_res = trainer.fit(model=model,
                                    train_dataloader=dataloader_train,
                                    val_dataloaders=dataloader_list_test)
        res = trainer.logger_connector.callback_metrics
        res_store = {}
        for k in res.keys():
            try:
                res_store[k] = float(res[k])
            except:
                pass
        base_path = '/'.join(
            [a for a in model_path.split('/') if a.find('rank') == -1])
        with open(f"{base_path}/res{task_nr}.pkl", "wb") as f:
            pickle.dump(res_store, f)

        print(f'<<<<<<<<<<<< TASK IDX {idx} TASK NAME : ' + task.name +
              ' Trained >>>>>>>>>>>>>')

        if exp.get('buffer', {}).get('fill_after_fit', False):
            print(f'<<<<<<<<<<<< Performance Test to Get Buffer >>>>>>>>>>>>>')

            trainer.test(model=model, test_dataloaders=dataloader_buffer)

            if local_rank == 0:
                checkpoint_callback.save_checkpoint(trainer, model)
            print(f'<<<<<<<<<<<< Performance Test DONE >>>>>>>>>>>>>')

        number_validation_dataloaders = len(dataloader_list_test)

        if model._rssb_active:
            # visualize rssb
            bins, valids = model._rssb.get()
            fill_status = (bins != 0).sum(axis=1)
            main_visu.plot_bar(fill_status,
                               x_label='Bin',
                               y_label='Filled',
                               title='Fill Status per Bin',
                               sort=False,
                               reverse=False,
                               tag='Buffer_Fill_Status')

        plot_from_pkl(main_visu, base_path, task_nr)

    try:
        if close:
            logger.experiment.stop()
    except:
        pass
Example #25
def test_pytorch_profiler_deepcopy(tmpdir):
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir,
                                       filename="profiler",
                                       schedule=None)
    pytorch_profiler.start("on_train_start")
    torch.tensor(1)
    pytorch_profiler.describe()
    assert deepcopy(pytorch_profiler)


@pytest.mark.parametrize(
    ["profiler", "expected"],
    [
        (None, PassThroughProfiler),
        (SimpleProfiler(), SimpleProfiler),
        (AdvancedProfiler(), AdvancedProfiler),
        ("simple", SimpleProfiler),
        ("Simple", SimpleProfiler),
        ("advanced", AdvancedProfiler),
        ("pytorch", PyTorchProfiler),
    ],
)
def test_trainer_profiler_correct_args(profiler, expected):
    kwargs = {"profiler": profiler} if profiler is not None else {}
    trainer = Trainer(**kwargs)
    assert isinstance(trainer.profiler, expected)


def test_trainer_profiler_incorrect_str_arg():
    with pytest.raises(
            MisconfigurationException,
Example #26
                    )

        if trial is not None and args.opt_prune:
            trainer_kwargs['early_stop_callback'] = PyTorchLightningPruningCallback(trial, monitor=args.monitor_metric)

    # enable debug mode 
    if args.debug_mode:
        print("\n**** DEBUG MODE ON! ****\n")
        trainer_kwargs["track_grad_norm"] = 2
        trainer_kwargs["log_gpu_memory"] = True
        trainer_kwargs['print_nan_grads'] = False

        if not args.no_save:
            profile_path = os.path.join(results_path, "profiler.log")
            print("Profiling to ", profile_path)
            trainer_kwargs["profiler"] = AdvancedProfiler(output_filename=profile_path)
        else:
            trainer_kwargs["profiler"] = AdvancedProfiler()

    # set GPU availability
    if not torch.cuda.is_available():
        trainer_kwargs['gpus'] = 0

    trainer = pl.Trainer(**trainer_kwargs)
    
    return trainer, trainer_kwargs, results_path  

def train_model(args, trial = None):
    '''
    Train a single model whose hyperparameters are specified in the run config