Example #1
0
def test_pytorch(script_mode, use_loss_module):
    """Train through the simulator and verify the default-configured hook
    captured every tensor in the "losses" collection."""
    smd.del_hook()

    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        helper_torch_train(
            sim=sim, script_mode=script_mode, use_loss_module=use_loss_module
        )
        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")
        # Training must not have altered the default hook configuration.
        assert hook.has_default_hook_configuration()

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")

        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0

        # Every loss tensor the hook tracked must be visible via the trial.
        recorded_names = trial.tensor_names()
        assert all(name in recorded_names for name in losses_tensors)
Example #2
0
def test_tensorboard_dir_sagemaker():
    """Verify a SageMaker-style run reads tensorboard_dir from the
    separate JSON configuration file."""
    with SagemakerSimulator() as simulator:
        smd.del_hook()
        created_hook = smd.get_hook(create_if_not_exists=True)
        assert created_hook.out_dir == simulator.out_dir
        assert created_hook.tensorboard_dir == simulator.tensorboard_dir
def create_net_and_train(out_dir, n_steps, use_loss_module=False, use_loss_functional=False):
    """Build a small Net, attach an smdebug hook, and train for ``n_steps``.

    Exactly one of ``use_loss_module`` / ``use_loss_functional`` must be
    True: it selects whether the loss comes from an ``nn.Module`` criterion
    (registered with the hook) or from the functional API (recorded
    manually via ``record_tensor_value``).

    Args:
        out_dir: directory the hook writes debug artifacts to.
        n_steps: number of optimizer steps to run.
        use_loss_module: compute loss via a registered ``nn.CrossEntropyLoss``.
        use_loss_functional: compute loss via ``F.cross_entropy``.

    Raises:
        ValueError: if not exactly one of the two loss flags is set.
    """
    # Raise explicitly instead of using `assert`: asserts are stripped under
    # `python -O`, which would let both-False/both-True slip through and leave
    # `loss` unbound in the training loop below (NameError).
    if use_loss_module == use_loss_functional:
        raise ValueError(
            "Exactly one of `use_loss_module` and `use_loss_functional` must be true."
        )

    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    # save_interval=1 records every step so the Trials API sees all of them.
    hook = smd.Hook(out_dir=out_dir, save_config=smd.SaveConfig(save_interval=1))
    hook.register_module(net)
    if use_loss_module:
        hook.register_loss(criterion)

    batch_size = 1
    # Use the same data at each step to test loss decreasing
    inputs, labels = torch.rand(batch_size, 3, 32, 32), torch.zeros(batch_size).long()
    for _ in range(n_steps):
        optimizer.zero_grad()
        outputs = net(inputs)
        if use_loss_module:
            loss = criterion(outputs, labels)
        if use_loss_functional:
            loss = F.cross_entropy(outputs, labels)
            hook.record_tensor_value("nll_loss", tensor_value=loss)
        loss.backward()
        optimizer.step()

    # Users can call this method to immediately use the Trials API.
    hook.close()
    smd.del_hook()
Example #4
0
def test_pytorch(script_mode, use_loss_module):
    """Train a small Net for 500 mini-batches inside the simulator and
    verify the hook captured every tensor in the "losses" collection."""
    smd.del_hook()

    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        trainloader, testloader = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        # In script mode the hook is created and wired up explicitly.
        if script_mode:
            hook = smd.Hook(out_dir=sim.out_dir)
            hook.register_module(net)
            hook.register_loss(criterion)

        for i, (inputs, labels) in enumerate(trainloader):
            optimizer.zero_grad()

            # Forward pass, then compute the loss via the chosen API.
            outputs = net(inputs)
            if use_loss_module:
                loss = criterion(outputs, labels)
            else:
                loss = F.cross_entropy(outputs, labels)
                if script_mode:
                    hook.record_tensor_value(tensor_name="loss", tensor_value=loss)

            loss.backward()
            optimizer.step()

            # Stop after 500 mini-batches (i runs 0..499).
            if i == 499:
                break

        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")

        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0

        # Each tracked loss tensor must be retrievable through the trial.
        recorded_names = trial.tensor_names()
        assert all(name in recorded_names for name in losses_tensors)
Example #5
0
def test_pytorch_with_unsupported_version(use_loss_module=False):
    """On an unsupported framework version, training must not create a hook."""
    smd.del_hook()
    helper_torch_train(script_mode=False, use_loss_module=use_loss_module)
    print("Finished Training")
    assert smd.get_hook() is None