Code Example #1
File: run.py Project: cjber/georelations
def run(
    dataset,
    pl_model: pl.LightningModule,
    name: str,
    path: Union[Path, str],
    test_path: Union[Path, str],
    seed: int,
    args=args,
) -> None:
    seed_everything(seed, workers=True)

    datamodule: pl.LightningDataModule = DataModule(
        dataset=dataset,
        path=path,
        test_path=test_path,
        num_workers=8,
        batch_size=args.batch_size,
        seed=seed,
    )
    model: pl.LightningModule = pl_model()
    callbacks: list[Callback] = build_callbacks()
    csv_logger = CSVLogger(
        save_dir="csv_logs",
        name="seed_" + str(seed),
        version=name,
    )

    if args.fast_dev_run:
        trainer_kwargs = {"gpus": None, "auto_select_gpus": False}
    else:
        trainer_kwargs = {"gpus": -1, "auto_select_gpus": True, "precision": 16}

    trainer: pl.Trainer = pl.Trainer.from_argparse_args(
        args,
        **trainer_kwargs,
        deterministic=True,  # ensure reproducible results
        default_root_dir="ckpts",
        logger=[csv_logger],
        log_every_n_steps=10,
        callbacks=callbacks,
        max_epochs=35,
    )

    trainer.tune(model=model, datamodule=datamodule)
    trainer.fit(model=model, datamodule=datamodule)

    if not args.fast_dev_run:
        test = trainer.test(model=model,
                            ckpt_path="best",
                            datamodule=datamodule)
        pd.DataFrame(test).to_csv("csv_logs/seed_" + str(seed) + "_" + name +
                                  "_test.csv")
        csv_logger.save()

    if args.save_to_hub:
        model.model.push_to_hub(f"cjber/{args.save_to_hub}")  # type: ignore
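Note that this example targets the Lightning 1.x Trainer API: gpus, auto_select_gpus, and Trainer.from_argparse_args were removed in Lightning 2.0. A rough 2.x equivalent of the Trainer construction, as a sketch rather than a drop-in replacement, would be:

trainer = pl.Trainer(
    accelerator="auto",        # replaces gpus=-1 / gpus=None
    devices="auto",            # replaces auto_select_gpus=True
    precision="16-mixed",      # 2.x spelling of precision=16
    deterministic=True,
    default_root_dir="ckpts",
    logger=[csv_logger],
    log_every_n_steps=10,
    callbacks=callbacks,
    max_epochs=35,
)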
Code Example #2
def test_v1_8_0_deprecated_agg_and_log_metrics_override(tmpdir):
    class AggregationOverrideLogger(CSVLogger):
        @rank_zero_only
        def agg_and_log_metrics(self, metrics, step):
            self.log_metrics(metrics=metrics, step=step)

    logger = AggregationOverrideLogger(tmpdir)
    logger2 = CSVLogger(tmpdir)
    logger3 = CSVLogger(tmpdir)

    # Test single loggers
    with pytest.deprecated_call(
        match="`Logger.agg_and_log_metrics` is deprecated in v1.6 and will be removed"
        " in v1.8. `Trainer` will directly call `Logger.log_metrics` so custom"
        " loggers should not implement `Logger.agg_and_log_metrics`."
    ):
        Trainer(logger=logger)
    # Should have no deprecation warning
    Trainer(logger=logger2)

    # Test multiple loggers
    with pytest.deprecated_call(
        match="`Logger.agg_and_log_metrics` is deprecated in v1.6 and will be removed"
        " in v1.8. `Trainer` will directly call `Logger.log_metrics` so custom"
        " loggers should not implement `Logger.agg_and_log_metrics`."
    ):
        Trainer(logger=[logger, logger3])
    # Should have no deprecation warning
    Trainer(logger=[logger2, logger3])
Code Example #3
def test_file_logger_no_name(tmpdir, name):
    """Verify that None or empty name works."""
    logger = CSVLogger(save_dir=tmpdir, name=name)
    logger.save()
    assert os.path.normpath(logger.root_dir) == tmpdir  # use os.path.normpath to handle trailing /
    assert os.listdir(tmpdir / "version_0")
Code Example #4
def test_version(tmpdir):
    """Verify versions of loggers are concatenated properly."""
    logger1 = CSVLogger(tmpdir, version=0)
    logger2 = CSVLogger(tmpdir, version=2)
    logger3 = CSVLogger(tmpdir, version=1)
    logger4 = CSVLogger(tmpdir, version=0)
    loggers = [logger1, logger2, logger3, logger4]
    version = _version([])
    assert version == ""
    version = _version([logger3])
    assert version == 1
    version = _version(loggers)
    assert version == "0_2_1"
    version = _version(loggers, "-")
    assert version == "0-2-1"
Code Example #5
def test_name(tmpdir):
    """Verify names of loggers are concatenated properly."""
    logger1 = CSVLogger(tmpdir, name="foo")
    logger2 = CSVLogger(tmpdir, name="bar")
    logger3 = CSVLogger(tmpdir, name="foo")
    logger4 = CSVLogger(tmpdir, name="baz")
    loggers = [logger1, logger2, logger3, logger4]
    name = _name([])
    assert name == ""
    name = _name([logger3])
    assert name == "foo"
    name = _name(loggers)
    assert name == "foo_bar_baz"
    name = _name(loggers, "-")
    assert name == "foo-bar-baz"
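These two tests pin down the aggregation behavior of the _version and _name helpers: an empty logger list yields "", a single logger's value is returned unchanged, and duplicates are dropped (keeping first-occurrence order) before joining with the separator. A rough re-implementation consistent with the assertions above, not the actual Lightning source, might look like:

def _aggregate(values, separator="_"):
    unique = list(dict.fromkeys(values))  # de-duplicate, keep first-occurrence order
    if len(unique) == 1:
        return unique[0]  # a single value is returned as-is (so 1 stays an int)
    return separator.join(str(v) for v in unique)  # "" for an empty list

def _version(loggers, separator="_"):
    return _aggregate([logger.version for logger in loggers], separator)

def _name(loggers, separator="_"):
    return _aggregate([logger.name for logger in loggers], separator)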
Code Example #6
def test_gpu_stats_monitor(tmpdir):
    """Test GPU stats are logged using a logger."""
    model = BoringModel()
    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
        gpu_stats = GPUStatsMonitor(intra_step_time=True)
    logger = CSVLogger(tmpdir)
    log_every_n_steps = 2

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=log_every_n_steps,
        gpus=1,
        callbacks=[gpu_stats],
        logger=logger,
    )

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv, delimiter=",", names=True, deletechars="", replace_space=" ")

    batch_time_data = met_data["batch_time/intra_step (ms)"]
    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps

    fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"]

    for f in fields:
        assert any(f in h for h in met_data.dtype.names)
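The np.genfromtxt call above relies on names=True to expose the CSV header through dtype.names, while deletechars="" and replace_space=" " disable NumPy's usual field-name sanitizing so headers such as "batch_time/intra_step (ms)" survive intact. A self-contained illustration of that parsing behavior:

import io
import numpy as np

csv = io.StringIO("utilization.gpu (%),memory.used (MB)\n10,2048\n20,4096\n")
data = np.genfromtxt(csv, delimiter=",", names=True, deletechars="", replace_space=" ")
assert "memory.used (MB)" in data.dtype.names  # punctuation and spaces are preserved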
Code Example #7
def train(config_file="pipeline_config.yaml"):

    logging.info(headline("Step 1: Running metric learning training"))

    with open(config_file) as file:
        all_configs = yaml.load(file, Loader=yaml.FullLoader)

    common_configs = all_configs["common_configs"]
    metric_learning_configs = all_configs["metric_learning_configs"]

    logging.info(headline("a) Initialising model"))

    model = LayerlessEmbedding(metric_learning_configs)

    logging.info(headline("b) Running training"))

    save_directory = os.path.join(common_configs["artifact_directory"],
                                  "metric_learning")
    logger = CSVLogger(save_directory, name=common_configs["experiment_name"])

    trainer = Trainer(accelerator='gpu' if torch.cuda.is_available() else None,
                      gpus=common_configs["gpus"],
                      max_epochs=metric_learning_configs["max_epochs"],
                      logger=logger)

    trainer.fit(model)

    logging.info(headline("c) Saving model"))

    os.makedirs(save_directory, exist_ok=True)
    trainer.save_checkpoint(
        os.path.join(save_directory,
                     common_configs["experiment_name"] + ".ckpt"))

    return trainer, model
Code Example #8
def train(config_file="pipeline_config.yaml"):

    logging.info(headline(" Step 3: Running GNN training "))

    with open(config_file) as file:
        all_configs = yaml.load(file, Loader=yaml.FullLoader)

    common_configs = all_configs["common_configs"]
    gnn_configs = all_configs["gnn_configs"]

    logging.info(headline("a) Initialising model"))

    model = InteractionGNN(gnn_configs)

    logging.info(headline("b) Running training"))

    save_directory = os.path.join(common_configs["artifact_directory"], "gnn")
    logger = CSVLogger(save_directory, name=common_configs["experiment_name"])

    trainer = Trainer(gpus=common_configs["gpus"],
                      max_epochs=gnn_configs["max_epochs"],
                      logger=logger)

    trainer.fit(model)

    logging.info(headline("c) Saving model"))

    os.makedirs(save_directory, exist_ok=True)
    trainer.save_checkpoint(
        os.path.join(save_directory,
                     common_configs["experiment_name"] + ".ckpt"))

    return trainer, model
Code Example #9
def test_gpu_stats_monitor(tmpdir):
    """
    Test GPU stats are logged using a logger.
    """
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor()
    logger = CSVLogger(tmpdir)

    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=1,
                      gpus=1,
                      callbacks=[gpu_stats],
                      logger=logger)

    results = trainer.fit(model)
    assert results

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    with open(path_csv, 'r') as fp:
        lines = fp.readlines()

    header = lines[0].split()

    fields = [
        'utilization.gpu', 'memory.used', 'memory.free', 'utilization.memory'
    ]

    for f in fields:
        assert any(f in h for h in header)
Code Example #10
def test_v1_8_0_logger_collection(tmpdir):
    logger1 = CSVLogger(tmpdir)
    logger2 = CSVLogger(tmpdir)

    trainer1 = Trainer(logger=logger1)
    trainer2 = Trainer(logger=[logger1, logger2])

    # Should have no deprecation warning
    trainer1.logger
    trainer1.loggers
    trainer2.loggers

    with pytest.deprecated_call(match="logger` will return the first logger"):
        _ = trainer2.logger
    with pytest.deprecated_call(match="`LoggerCollection` is deprecated in v1.6"):
        _ = LoggerCollection([logger1, logger2])
Code Example #11
def test_xla_stats_monitor(tmpdir):
    """Test XLA stats are logged using a logger."""

    model = BoringModel()
    xla_stats = XLAStatsMonitor()
    logger = CSVLogger(tmpdir)

    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=2,
                      limit_train_batches=5,
                      tpu_cores=8,
                      callbacks=[xla_stats],
                      logger=logger)

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv,
                             delimiter=',',
                             names=True,
                             deletechars='',
                             replace_space=' ')

    fields = ['avg. free memory (MB)', 'avg. peak memory (MB)']

    for f in fields:
        assert any(f in h for h in met_data.dtype.names)
Code Example #12
def main(params):
    exp_name = f'{params.model}-{params.img_size}-fold-{params.fold}'
    checkpoint = ModelCheckpoint(
        dirpath=f'logs/{exp_name}',
        filename='{epoch}-{valid_loss_epoch:.3f}',
        save_top_k=-1,
        verbose=False,
    )
    printer = MyPrintingCallback()
    logger = CSVLogger(save_dir=f"logs/{exp_name}", name="text_logs")
    wandb_logger = WandbLogger(name=exp_name, project='all-data')
    bar = LitProgressBar()
    lr_monitor = LearningRateMonitor(logging_interval='step')
    model = HPALit(params)
    trainer = pl.Trainer(
        progress_bar_refresh_rate=1,
        max_epochs=params.epochs,
        callbacks=[checkpoint, printer, bar, lr_monitor],
        logger=[logger, wandb_logger],
        gpus=1,
        num_sanity_val_steps=0,
        auto_lr_find=True,
    )
    trainer.tune(model)
    trainer.fit(model)
Code Example #13
def main(args, model=None) -> SummarizationModule:
    Path(args.output_dir).mkdir(exist_ok=True)
    check_output_dir(args, expected_items=3)
    if model is None:
        if "summarization" in args.task:
            model: SummarizationModule = SummarizationModule(args)
        else:
            model: SummarizationModule = TranslationModule(args)
    dataset = Path(args.data_dir).name
    if (
        args.logger_name == "default"
        or args.fast_dev_run
        or str(args.output_dir).startswith("/tmp")
        or str(args.output_dir).startswith("/var")
    ):
        from pytorch_lightning.loggers import CSVLogger

        logger = CSVLogger("chen_logs", name="SCHWEIGEN")  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        project = os.environ.get("WANDB_PROJECT", dataset)
        logger = WandbLogger(name=model.output_dir.name, project=project)

    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger

        logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}")

    if args.early_stopping_patience >= 0:
        es_callback = get_early_stopping_callback(model.val_metric, args.early_stopping_patience)
    else:
        es_callback = False

    lower_is_better = args.val_metric == "loss"
    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(
            args.output_dir, model.val_metric, args.save_top_k, lower_is_better
        ),
        early_stopping_callback=es_callback,
        logger=logger,
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")
    if not args.do_predict:
        return model

    model.hparams.test_checkpoint = ""
    checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True)))
    if checkpoints:
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]
    trainer.logger.log_hyperparams(model.hparams)

    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model
Code Example #14
def train_mult(config, checkpoint_dir=None):
    hyp_params.attn_dropout = config["attn_dropout"]
    hyp_params.attn_dropout_a = config["attn_dropout_a"]
    hyp_params.attn_dropout_v = config["attn_dropout_v"]
    hyp_params.embed_dropout = config["embed_dropout"]
    hyp_params.out_dropout = config["out_dropout"]
    hyp_params.relu_dropout = config["relu_dropout"]
    hyp_params.res_dropout = config["res_dropout"]

    # hyp_params.layers = int(config["layers"])
    # hyp_params.num_heads = int(config["num_heads"])
    # hyp_params.project_dim = int(config["num_heads"]) * int(config["head_dim"])
    hyp_params.lr = config["lr"]
    hyp_params.weight_decay = config["weight_decay"]

    comet_logger = CometLogger(
        api_key="cgss7piePhyFPXRw1J2uUEjkQ",
        workspace="transformer",
        project_name=hyp_params.project_name,
        save_dir="logs/comet_ml",
    )
    experiment_key = comet_logger.experiment.get_key()
    csv_logger = CSVLogger("logs/csv", name=experiment_key)
    early_stopping = EarlyStopping(
        monitor="valid_1mae", patience=10, verbose=True, mode="max"
    )
    checkpoint = ModelCheckpoint(save_top_k=1, monitor="valid_1mae", mode="max")
    # tune_reporter = TuneReportCallback(["valid_loss", "valid_1mae"])
    tune_checkpoint_reporter = TuneReportCheckpointCallback(
        metrics=["valid_loss", "valid_1mae"]
    )

    model = MULTModelWarpedAll(hyp_params, early_stopping=early_stopping)
    trainer = pl.Trainer(
        gpus=1,
        max_epochs=hyp_params.num_epochs,
        log_every_n_steps=1,
        callbacks=[early_stopping, checkpoint, tune_checkpoint_reporter],
        logger=[csv_logger, comet_logger],
        limit_train_batches=hyp_params.limit,
        limit_val_batches=hyp_params.limit,
        weights_summary="full",
        weights_save_path="logs/weights",
        progress_bar_refresh_rate=0,
    )

    if checkpoint_dir is not None:
        ck = th.load(os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(ck["state_dict"])
        trainer.current_epoch = ck["epoch"]

    trainer.fit(model)

    ck = th.load(checkpoint.best_model_path)
    model.load_state_dict(ck["state_dict"])

    trainer.test(model)
Code Example #15
def test_file_logger_automatic_versioning(tmpdir):
    """Verify that automatic versioning works"""

    root_dir = tmpdir.mkdir("exp")
    root_dir.mkdir("version_0")
    root_dir.mkdir("version_1")

    logger = CSVLogger(save_dir=tmpdir, name="exp")

    assert logger.version == 2
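The automatic version is derived by scanning the experiment directory for existing version_* folders and taking the next integer. A minimal sketch of that discovery logic (an illustration, not the Lightning implementation):

import os

def next_version(root_dir: str) -> int:
    # Collect the integer suffixes of existing "version_<n>" directories.
    versions = [
        int(d.split("_")[1])
        for d in os.listdir(root_dir)
        if d.startswith("version_") and d.split("_")[1].isdigit()
    ]
    # With "version_0" and "version_1" present, the next version is 2.
    return max(versions) + 1 if versions else 0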
Code Example #16
def train_model(model, train, test, exp_name, epochs=100):
    pl.seed_everything(42)
    logger = CSVLogger("logs", name=exp_name)

    trainer = pl.Trainer(gpus=1,
                         max_epochs=epochs,
                         deterministic=True,
                         logger=logger)
    trainer.fit(model, train, test)
    return trainer
Code Example #17
def main():
    args = get_args()
    pl.seed_everything(args.seed, workers=True)

    # initialize data module
    data = DataModule(args)
    data.prepare_data()
    data.setup("fit")

    prior = None
    if args.prior_model:
        assert hasattr(priors, args.prior_model), (
            f"Unknown prior model {args['prior_model']}. "
            f"Available models are {', '.join(priors.__all__)}"
        )
        # initialize the prior model
        prior = getattr(priors, args.prior_model)(dataset=data.dataset)
        args.prior_args = prior.get_init_args()

    # initialize lightning module
    model = LNNP(args, prior_model=prior, mean=data.mean, std=data.std)

    checkpoint_callback = ModelCheckpoint(
        dirpath=args.log_dir,
        monitor="val_loss",
        save_top_k=10,  # -1 to save all
        every_n_epochs=args.save_interval,
        filename="{epoch}-{val_loss:.4f}-{test_loss:.4f}",
    )
    early_stopping = EarlyStopping("val_loss", patience=args.early_stopping_patience)

    tb_logger = pl.loggers.TensorBoardLogger(
        args.log_dir, name="tensorbord", version="", default_hp_metric=False
    )
    csv_logger = CSVLogger(args.log_dir, name="", version="")

    trainer = pl.Trainer(
        strategy=DDPStrategy(find_unused_parameters=False),
        max_epochs=args.num_epochs,
        gpus=args.ngpus,
        num_nodes=args.num_nodes,
        default_root_dir=args.log_dir,
        auto_lr_find=False,
        resume_from_checkpoint=None if args.reset_trainer else args.load_model,
        callbacks=[early_stopping, checkpoint_callback],
        logger=[tb_logger, csv_logger],
        precision=args.precision,
    )

    trainer.fit(model, data)

    # run test set after completing the fit
    model = LNNP.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
    trainer = pl.Trainer(logger=[tb_logger, csv_logger])
    trainer.test(model, data)
Code Example #18
def test_file_logger_manual_versioning(tmpdir):
    """Verify that manual versioning works"""

    root_dir = tmpdir.mkdir("exp")
    root_dir.mkdir("version_0")
    root_dir.mkdir("version_1")
    root_dir.mkdir("version_2")

    logger = CSVLogger(save_dir=tmpdir, name="exp", version=1)

    assert logger.version == 1
Code Example #19
    def _add_default_loggers(self) -> List[LightningLoggerBase]:
        """Adds optional default loggers and returns the extended list

        Added loggers: CSV, TensorBoard, WandB
        """
        loggers = self._trainer_config.logger
        if loggers is True:
            loggers = []
        elif isinstance(loggers, LightningLoggerBase):
            loggers = [loggers]

        def get_loggers_of_type(logger_type) -> List[LightningLoggerBase]:
            return [logger for logger in loggers if isinstance(logger, logger_type)]

        # csv
        if self._trainer_config.add_csv_logger and not get_loggers_of_type(CSVLogger):
            loggers.append(
                CSVLogger(
                    save_dir=self._trainer_config.default_root_dir or os.getcwd(),
                    name="csv",
                )
            )

        # tensorboard
        if self._trainer_config.add_tensorboard_logger and not get_loggers_of_type(
            TensorBoardLogger
        ):
            loggers.append(
                TensorBoardLogger(
                    save_dir=self._trainer_config.default_root_dir,
                    name="tensorboard",
                )
            )

        # wandb
        if (
            self._trainer_config.add_wandb_logger
            and _HAS_WANDB
            and not get_loggers_of_type(WandbLogger)
        ):
            self._wandb_logger = WandbLogger(
                save_dir=self._trainer_config.default_root_dir,
                project=os.environ.get("WANDB_PROJECT", "biome"),
            )
            loggers.append(self._wandb_logger)
        elif get_loggers_of_type(WandbLogger):
            self._wandb_logger = get_loggers_of_type(WandbLogger)[0]
        # Somehow the wandb dir does not get created; I think this is a bug on the PL side, have to check it out.
        if self._wandb_logger is not None and not os.path.isdir(
            os.path.join(self._wandb_logger.save_dir, "wandb")
        ):
            os.makedirs(os.path.join(self._wandb_logger.save_dir, "wandb"))

        return loggers
Code Example #20
def test_fit_csv_logger(tmpdir):
    dm = ClassifDataModule()
    model = ClassificationModel()
    logger = CSVLogger(save_dir=tmpdir)
    trainer = Trainer(default_root_dir=tmpdir,
                      max_steps=10,
                      logger=logger,
                      log_every_n_steps=1)
    trainer.fit(model, datamodule=dm)
    metrics_file = os.path.join(logger.log_dir,
                                ExperimentWriter.NAME_METRICS_FILE)
    assert os.path.isfile(metrics_file)
Code Example #21
def test_logdir_multiple_loggers(tmpdir):
    """Tests that the logdir equals the default_root_dir when trainer has multiple loggers."""
    default_root_dir = tmpdir / "default_root_dir"
    save_dir = tmpdir / "save_dir"
    model = TestModel(default_root_dir)
    trainer = Trainer(
        default_root_dir=default_root_dir,
        max_steps=2,
        logger=[TensorBoardLogger(save_dir=save_dir, name="custom_logs"), CSVLogger(tmpdir)],
    )
    assert trainer.log_dir == default_root_dir

    trainer.fit(model)
    assert trainer.log_dir == default_root_dir
Code Example #22
def test_flush_n_steps(tmpdir):
    logger = CSVLogger(tmpdir, flush_logs_every_n_steps=2)
    metrics = {
        "float": 0.3,
        "int": 1,
        "FloatTensor": torch.tensor(0.1),
        "IntTensor": torch.tensor(1)
    }
    logger.save = MagicMock()
    logger.log_metrics(metrics, step=0)

    logger.save.assert_not_called()
    logger.log_metrics(metrics, step=1)
    logger.save.assert_called_once()
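The flush behavior mocked here amounts to buffering rows and calling save() only once flush_logs_every_n_steps log calls have accumulated. A minimal sketch of the pattern, keyed off the 0-based step index for simplicity (illustrative, not the Lightning implementation):

class BufferedWriter:
    def __init__(self, flush_every_n: int = 2):
        self.flush_every_n = flush_every_n
        self.rows = []

    def log_metrics(self, metrics: dict, step: int) -> None:
        self.rows.append({"step": step, **metrics})
        # Matches the test above: no save at step=0, save at step=1.
        if (step + 1) % self.flush_every_n == 0:
            self.save()

    def save(self) -> None:
        self.rows.clear()  # a real writer would persist the buffered rows here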
Code Example #23
File: cmds.py Project: avilay/kaggle-projects
def train(cfg):
    """
    Trains the classifier.
    """
    if cfg.name == "auto":
        cfg.name = Haikunator().haikunate()
    train_csv = Path(cfg.dataroot) / cfg.train_csv
    logger.info(f"Starting run {cfg.name}")
    model = HiggsClassifier(hp=cfg.hparams.model)
    data = HiggsDataModule(
        trainfile=train_csv,
        trainset_prop=cfg.train_val_split_frac,
        hp=cfg.hparams.trainer,
    )
    data.prepare()
    logger.info(
        f"Train set size: {data.trainsize}, Validation set size: {data.valsize}"
    )
    os.makedirs(cfg.runroot, exist_ok=True)

    if cfg.logger == "wandb":
        ml_logger = WandbLogger(
            project="higgs",
            name=cfg.name,
            save_dir=cfg.runroot,
            log_model="all",
            id=cfg.name,
        )
        ml_logger.watch(model, log="all")
    elif cfg.logger == "csv":
        ml_logger = CSVLogger(save_dir=cfg.runroot, name="higgs", version=cfg.name)

    checkpoint = ModelCheckpoint(monitor="val_loss", mode="min")
    start = datetime.now()
    trainer = Trainer(
        default_root_dir=cfg.runroot,
        max_epochs=cfg.hparams.trainer.n_epochs,
        logger=ml_logger,
        callbacks=[checkpoint],
    )
    trainer.fit(model, data)
    end = datetime.now()
    logger.info(f"Took {end - start} to finish training.")
Code Example #24
def main(seed=None) -> None:
    """ Entry-point for CLI tool """
    args = _get_args()
    print(str(args).replace("Namespace", "Arguments used = "))

    if seed is not None:
        seed_everything(seed)

    data = TreeDataModule(
        args.trees,
        batch_size=args.batch_size,
        split_part=args.split_part,
        split_seed=args.split_seed,
    )

    kwargs = {
        "fp_size": args.fp_size,
        "lstm_size": args.lstm_size,
        "dropout_prob": args.dropout,
        "learning_rate": args.lr,
        "weight_decay": args.weight_decay,
    }
    model = RouteDistanceModel(**kwargs)

    gpus = int(torch.cuda.is_available())
    tb_logger = TensorBoardLogger("tb_logs", name="route-dist")
    csv_logger = CSVLogger("csv_logs", name="route-dist")
    checkpoint = ModelCheckpoint(monitor="val_monitor", save_last=True)
    trainer = Trainer(
        gpus=gpus,
        logger=[tb_logger, csv_logger],
        callbacks=[checkpoint],
        max_epochs=args.epochs,
        deterministic=seed is not None,
    )
    trainer.fit(model, datamodule=data)

    ret = trainer.test(datamodule=data)
    print("=== Test results === ")
    accum = accumulate_stats(ret)
    for key, value in accum.items():
        print(f"{key}: {value:0.4f}")
Code Example #25
def main(args):
    train_students = json.load(
        open(os.path.join(DATA_DIR, "train_students_ids.json")))
    test_students = json.load(
        open(os.path.join(DATA_DIR, "test_students_ids.json")))

    students_ids = train_students + test_students

    for student_id in students_ids:

        config, train_ds, val_ds = setup(args, student_id)
        train_dl = create_dataloader(train_ds, config.dataloader)
        val_dl = create_dataloader(val_ds, config.dataloader, shuffle=False)

        tags = tags_from_args(args)
        tags = [str(t) for t in tags]
        name = ':'.join(tags)
        config.pprint()

        AgentClass = globals()[config.agent]
        agent = AgentClass(config)

        save_dir = os.path.join(OUT_DIR, get_date())
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)

        csv_logger = CSVLogger(
            save_dir=os.path.join(OUT_DIR, "leave_one_out_cv", student_id))
        experiment = csv_logger.experiment
        experiment.log = experiment.log_metrics  # create a function alias, so we don't have to change other things

        trainer = pl.Trainer(
            logger=csv_logger,
            weights_summary='full',
            max_epochs=config.epochs,
            gpus=[int(args.gpu_device)],
        )
        trainer.fit(
            agent,
            train_dataloader=train_dl,
            val_dataloaders=val_dl,
        )
Code Example #26
def test_pytorch_profiler_multiple_loggers(tmpdir):
    """Tests whether the PyTorch profiler is able to write its trace locally when the Trainer is configured with
    multiple loggers.

    See issue #8157.
    """

    def look_for_trace(trace_dir):
        """Determines if a directory contains a PyTorch trace."""
        return any("trace.json" in filename for filename in os.listdir(trace_dir))

    # Sanity check
    assert not look_for_trace(tmpdir)

    model = BoringModel()
    loggers = [TensorBoardLogger(save_dir=tmpdir), CSVLogger(tmpdir)]
    trainer = Trainer(default_root_dir=tmpdir, profiler="pytorch", logger=loggers, limit_train_batches=5, max_epochs=1)
    assert len(trainer.loggers) == 2
    trainer.fit(model)
    assert look_for_trace(tmpdir)
Code Example #27
def test_file_logger_named_version(tmpdir):
    """Verify that manual versioning works for string versions, e.g. '2020-02-05-162402'"""

    exp_name = "exp"
    tmpdir.mkdir(exp_name)
    expected_version = "2020-02-05-162402"

    logger = CSVLogger(save_dir=tmpdir,
                       name=exp_name,
                       version=expected_version)
    logger.log_hyperparams({"a": 1, "b": 2})
    logger.save()
    assert logger.version == expected_version
    assert os.listdir(tmpdir / exp_name) == [expected_version]
    assert os.listdir(tmpdir / exp_name / expected_version)
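As the listdir assertion shows, a string version is used verbatim as the directory name (integer versions are rendered as "version_<n>"), so logs land in <save_dir>/<name>/<version>. For illustration:

import os
from pytorch_lightning.loggers import CSVLogger

logger = CSVLogger(save_dir="logs", name="exp", version="2020-02-05-162402")
# No "version_" prefix is added for string versions.
assert logger.log_dir == os.path.join("logs", "exp", "2020-02-05-162402")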
Code Example #28
def test_logger_default_name(tmpdir):
    """Test that the default logger name is lightning_logs."""

    # CSV
    logger = CSVLogger(save_dir=tmpdir)
    assert logger.name == "lightning_logs"

    # TensorBoard
    with mock.patch("pytorch_lightning.loggers.tensorboard.SummaryWriter"):
        logger = _instantiate_logger(TensorBoardLogger, save_dir=tmpdir)
        assert logger.name == "lightning_logs"

    # MLflow
    with mock.patch("pytorch_lightning.loggers.mlflow.mlflow"), mock.patch(
            "pytorch_lightning.loggers.mlflow.MlflowClient") as mlflow_client:
        mlflow_client().get_experiment_by_name.return_value = None
        logger = _instantiate_logger(MLFlowLogger, save_dir=tmpdir)

        _ = logger.experiment
        logger._mlflow_client.create_experiment.assert_called_with(
            name="lightning_logs", artifact_location=ANY)
Code Example #29
def test_file_logger_log_metrics(tmpdir, step_idx):
    logger = CSVLogger(tmpdir)
    metrics = {
        "float": 0.3,
        "int": 1,
        "FloatTensor": torch.tensor(0.1),
        "IntTensor": torch.tensor(1)
    }
    logger.log_metrics(metrics, step_idx)
    logger.save()

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    with open(path_csv) as fp:
        lines = fp.readlines()
    assert len(lines) == 2
    assert all(n in lines[0] for n in metrics)
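Because ExperimentWriter emits a plain CSV (recording the step alongside each metric column), the file is easy to inspect after the fact. A quick read-back sketch, assuming pandas is available:

import pandas as pd

df = pd.read_csv(path_csv)  # one row per log_metrics() call
print(df[["step", "float", "int"]])  # tensor values are written as plain numbers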
Code Example #30
def test_gpu_stats_monitor(tmpdir):
    """
    Test GPU stats are logged using a logger.
    """
    model = BoringModel()
    gpu_stats = GPUStatsMonitor(intra_step_time=True)
    logger = CSVLogger(tmpdir)
    log_every_n_steps = 2

    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=2,
                      limit_train_batches=7,
                      log_every_n_steps=log_every_n_steps,
                      gpus=1,
                      callbacks=[gpu_stats],
                      logger=logger)

    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv,
                             delimiter=',',
                             names=True,
                             deletechars='',
                             replace_space=' ')

    batch_time_data = met_data['batch_time/intra_step (ms)']
    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps

    fields = [
        'utilization.gpu',
        'memory.used',
        'memory.free',
        'utilization.memory',
    ]

    for f in fields:
        assert any(f in h for h in met_data.dtype.names)