Code Example #1
def get_save_handler(config):
    if exp_tracking.has_clearml:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver

        return ClearMLSaver(dirname=config.output_path.as_posix())

    return DiskSaver(config.output_path.as_posix())
Code Example #2
def test_clearml_disk_saver_integration_no_logger():
    model = torch.nn.Module()
    to_save_serializable = {"model": model}

    with pytest.warns(
            UserWarning,
            match="ClearMLSaver created a temporary checkpoints directory"):
        clearml.Task.current_task = Mock(return_value=object())
        clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock()
        clearml_saver = ClearMLSaver()
        checkpoint = Checkpoint(to_save=to_save_serializable,
                                save_handler=clearml_saver,
                                n_saved=1)

    trainer = Engine(lambda e, b: None)
    trainer.state = State(epoch=0, iteration=0)
    checkpoint(trainer)
    trainer.state.iteration = 1
    checkpoint(trainer)

    if clearml_saver._atomic:
        assert clearml.binding.frameworks.WeightsFileHandler.create_output_model.call_count == 2
    else:
        saved_files = list(os.listdir(clearml_saver.dirname))
        assert len(saved_files) == 1
        assert saved_files[0] == "model_1.pt"
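Running this test standalone requires roughly the following imports (inferred from the snippet rather than taken from the original file, so treat the exact module paths as an assumption):

import os
from unittest.mock import MagicMock, Mock

import clearml
import pytest
import torch

from ignite.contrib.handlers.clearml_logger import ClearMLSaver
from ignite.engine import Engine, State
from ignite.handlers import Checkpoint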
Code Example #3
File: main.py  Project: isabella232/ignite-2
def get_save_handler(config):
    if config["with_clearml"]:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver

        return ClearMLSaver(dirname=config["output_path"])

    return DiskSaver(config["output_path"], require_empty=False)
Code Example #4
def get_save_handler(output_path, with_clearml):
    if with_clearml:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver

        return ClearMLSaver(dirname=output_path)

    return DiskSaver(output_path)
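For orientation, a minimal sketch of how a handler returned by get_save_handler is typically consumed: it becomes the save_handler of an ignite Checkpoint, which is then attached to an engine event. The model, trainer, and output directory below are illustrative, not part of the snippet above.

import torch

from ignite.engine import Engine, Events
from ignite.handlers import Checkpoint


model = torch.nn.Linear(2, 2)                 # placeholder model for illustration
trainer = Engine(lambda engine, batch: None)  # no-op update function for illustration
to_save = {"model": model}

# Assumes "/tmp/checkpoints" is a fresh/empty directory; with_clearml=False selects DiskSaver
save_handler = get_save_handler("/tmp/checkpoints", with_clearml=False)
checkpoint = Checkpoint(to_save, save_handler=save_handler, n_saved=2)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)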
Code Example #5
    def create_callbacks(self):

        ## SETUP CALLBACKS
        print('[INFO] Creating callback functions for training loop...',
              end='')
        # Early Stopping - stops training if the validation loss does not decrease for EARLY_STOPPING_PATIENCE consecutive epochs
        handler = EarlyStopping(patience=self.config.EARLY_STOPPING_PATIENCE,
                                score_function=score_function_loss,
                                trainer=self.train_engine)
        self.evaluator.add_event_handler(Events.COMPLETED, handler)
        print('Early Stopping ({} epochs)...'.format(
            self.config.EARLY_STOPPING_PATIENCE),
              end='')

        val_checkpointer = Checkpoint(
            {"model": self.model},
            ClearMLSaver(),
            n_saved=1,
            score_function=score_function_acc,
            score_name="val_acc",
            filename_prefix='cub200_{}_ignite_best'.format(
                self.config.MODEL.MODEL_NAME),
            global_step_transform=global_step_from_engine(self.train_engine),
        )
        self.evaluator.add_event_handler(Events.EPOCH_COMPLETED,
                                         val_checkpointer)
        print('Model Checkpointing...', end='')
        print('Done')
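This method references score_function_loss and score_function_acc without defining them. A plausible sketch is below; the definitions and metric names ("loss", "accuracy") are assumptions based on ignite's convention that EarlyStopping and Checkpoint treat higher scores as better, so a loss must be negated.

def score_function_loss(engine):
    # EarlyStopping expects a score that increases as the model improves,
    # so return the negated validation loss (metric name assumed to be "loss").
    return -engine.state.metrics["loss"]


def score_function_acc(engine):
    # Checkpoint keeps the checkpoints with the highest score; here that is
    # validation accuracy (metric name assumed to be "accuracy").
    return engine.state.metrics["accuracy"]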
Code Example #6
def _test_save_model_optimizer_lr_scheduler_with_state_dict(
        device, on_zero_rank=False):

    if idist.get_rank() == 0:
        clearml.Task.current_task = Mock(return_value=object())
        clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock()

    torch.manual_seed(23)

    model = DummyModel().to(device)

    optim = torch.optim.SGD(model.parameters(), lr=0.1)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5)

    def update_fn(engine, batch):
        x = torch.rand((4, 2)).to(device)
        optim.zero_grad()
        y = model(x)
        # Below code raises: RuntimeError: torch_xla/csrc/tensor_impl.cpp:144 : XLA tensors do not have storage
        # Probably related to https://github.com/pytorch/xla/issues/2576
        # loss = y.pow(2.0).sum()
        loss = y.sum()
        loss.backward()
        if idist.has_xla_support:
            import torch_xla.core.xla_model as xm

            xm.optimizer_step(optim, barrier=True)
        else:
            optim.step()
        lr_scheduler.step()

    engine = Engine(update_fn)

    to_save = {
        "model": model,
        "optimizer": optim,
        "lr_scheduler": lr_scheduler
    }

    with pytest.warns(
            UserWarning,
            match=r"ClearMLSaver created a temporary checkpoints directory"):
        clearml_saver = ClearMLSaver()

    if (not on_zero_rank) or (on_zero_rank and idist.get_rank() == 0):
        checkpoint = Checkpoint(to_save=to_save,
                                save_handler=clearml_saver,
                                n_saved=1)
        engine.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)

    engine.run([0], max_epochs=4)

    idist.barrier()

    saved_objects = sorted(os.listdir(clearml_saver.dirname))
    # saved object is ['PREFIX_checkpoint_3.pt', ]
    saved_checkpoint = os.path.join(clearml_saver.dirname, saved_objects[0])

    if idist.has_xla_support:
        device = "cpu"

    loaded_obj = torch.load(saved_checkpoint, map_location=device)
    for f in ["model", "optimizer", "lr_scheduler"]:
        assert f in loaded_obj
    loaded_model_state_dict = loaded_obj["model"]
    loaded_optimizer_state_dict = loaded_obj["optimizer"]
    loaded_lr_scheduler_state_dict = loaded_obj["lr_scheduler"]

    assert isinstance(loaded_model_state_dict, dict)
    assert isinstance(loaded_optimizer_state_dict, dict)
    assert isinstance(loaded_lr_scheduler_state_dict, dict)

    # Specifically move device to CPU first
    model_state_dict = model.cpu().state_dict()
    for key in model_state_dict.keys():
        assert key in loaded_model_state_dict
        model_value = model_state_dict[key]
        loaded_model_value = loaded_model_state_dict[key]
        assert (model_value.cpu().numpy() == loaded_model_value.cpu().numpy()
                ).all()

    optim_state_dict = optim.state_dict()
    for key in optim_state_dict.keys():
        assert key in loaded_optimizer_state_dict
        optim_value = optim_state_dict[key]
        loaded_optim_value = loaded_optimizer_state_dict[key]
        if idist.get_rank() == 0:
            assert optim_value == loaded_optim_value

    lr_scheduler_state_dict = lr_scheduler.state_dict()
    for key in lr_scheduler_state_dict.keys():
        assert key in loaded_lr_scheduler_state_dict
        lr_scheduler_value = lr_scheduler_state_dict[key]
        loaded_lr_scheduler_value = loaded_lr_scheduler_state_dict[key]
        assert lr_scheduler_value == loaded_lr_scheduler_value
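A minimal way to exercise the helper above on CPU (the wrapper name is illustrative; distributed and XLA variants would pass a different device and possibly on_zero_rank=True):

def test_save_model_optimizer_lr_scheduler_with_state_dict_cpu():
    _test_save_model_optimizer_lr_scheduler_with_state_dict("cpu")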
Code Example #7
def test_clearml_saver_callbacks():
    mock_task = MagicMock(spec=clearml.Task)
    mock_task.name = "check-task"

    mock_model = MagicMock(spec=clearml.OutputModel)

    model_info = WeightsFileHandler.ModelInfo(
        model=mock_model,
        upload_filename="test.pt",
        local_model_path="",
        local_model_id="",
        framework=Framework.pytorch,
        task=mock_task,
    )

    mock_model_info = MagicMock(spec_set=model_info)

    # Simulate 4 calls to save model and 2 to remove (n_saved=2)
    filenames = [
        "best_model_5_val_acc=0.123.pt",
        "best_model_6_val_acc=0.234.pt",
        "best_model_7_val_acc=0.356.pt",
        "best_model_8_val_acc=0.456.pt",
    ]
    metadata_list = [
        {"basename": "best_model", "score_name": "val_acc", "priority": 0.123},
        {"basename": "best_model", "score_name": "val_acc", "priority": 0.234},
        {"basename": "best_model", "score_name": "val_acc", "priority": 0.345},
        {"basename": "best_model", "score_name": "val_acc", "priority": 0.456},
    ]
    dirname = "/tmp/test"

    _checkpoint_slots = defaultdict(list)

    n_saved = 2

    for i, (filename, metadata) in enumerate(zip(filenames, metadata_list)):

        mock_model_info.upload_filename = filename

        if i >= n_saved:
            # Remove
            filename_to_remove = filenames[i % n_saved]
            for slots in _checkpoint_slots.values():
                try:
                    slots[slots.index(filename_to_remove)] = None
                except ValueError:
                    pass
                else:
                    i = i % n_saved
                    break

        basename = metadata["basename"]
        checkpoint_key = (dirname, basename)

        context = ClearMLSaver._CallbacksContext(
            callback_type=WeightsFileHandler.CallbackType,
            slots=_checkpoint_slots[checkpoint_key],
            checkpoint_key=str(checkpoint_key),
            filename=filename,
            basename=basename,
            metadata=metadata,
        )

        output_model_info = context.pre_callback(
            str(WeightsFileHandler.CallbackType.save), mock_model_info)
        assert (hasattr(output_model_info, "upload_filename")
                and f"{basename}_{i}.pt" in output_model_info.upload_filename)
        assert hasattr(output_model_info, "local_model_id") and str(
            checkpoint_key) in output_model_info.local_model_id

        output_model_info = context.post_callback(
            str(WeightsFileHandler.CallbackType.save), mock_model_info)
        assert hasattr(output_model_info, "model") and hasattr(
            output_model_info.model, "name")
        assert hasattr(output_model_info, "model") and hasattr(
            output_model_info.model, "comment")
        assert isinstance(output_model_info.model.name,
                          str) and filename in output_model_info.model.name
        assert (isinstance(output_model_info.model.comment, str)
                and metadata["basename"] in output_model_info.model.comment
                and metadata["score_name"] in output_model_info.model.comment)
Code Example #8
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    clearml_logger = ClearMLLogger(project_name="examples", task_name="ignite")

    clearml_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training metrics", train_evaluator),
                           ("validation metrics", validation_evaluator)]:
        clearml_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    clearml_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    clearml_logger.attach(trainer,
                          log_handler=WeightsScalarHandler(model),
                          event_name=Events.ITERATION_COMPLETED(every=100))

    clearml_logger.attach(trainer,
                          log_handler=WeightsHistHandler(model),
                          event_name=Events.EPOCH_COMPLETED(every=100))

    clearml_logger.attach(trainer,
                          log_handler=GradsScalarHandler(model),
                          event_name=Events.ITERATION_COMPLETED(every=100))

    clearml_logger.attach(trainer,
                          log_handler=GradsHistHandler(model),
                          event_name=Events.EPOCH_COMPLETED(every=100))

    handler = Checkpoint(
        {"model": model},
        ClearMLSaver(),
        n_saved=1,
        score_function=lambda e: e.state.metrics["accuracy"],
        score_name="val_acc",
        filename_prefix="best",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    clearml_logger.close()
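For completeness, a sketch of how run() might be invoked; the hyperparameter values are illustrative and the original script presumably parses them from the command line.

if __name__ == "__main__":
    run(train_batch_size=64, val_batch_size=1000, epochs=10, lr=0.01, momentum=0.5)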