def test_pytorch(script_mode, use_loss_module):
    """End-to-end training via helper_torch_train, then verify the default
    hook configuration and that every tensor in the 'losses' collection
    landed in the trial output.

    NOTE(review): a second `def test_pytorch` later in this file shadows this
    definition, so pytest will only collect the later one — confirm whether
    this test was meant to have a distinct name.
    """
    # Start from a clean slate so no hook from a previous test leaks in.
    smd.del_hook()

    simulator_cls = ScriptSimulator if script_mode else SagemakerSimulator
    with simulator_cls() as sim:
        helper_torch_train(sim=sim, script_mode=script_mode, use_loss_module=use_loss_module)
        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")
        # Check if the hook was executed with the default hook configuration.
        assert hook.has_default_hook_configuration()

        # Read back what the hook wrote via the Trials API.
        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")
        print(f"collection_manager = {hook.collection_manager}")

        recorded_loss_names = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {recorded_loss_names}")
        assert len(recorded_loss_names) > 0

        # Every loss tensor registered on the hook must be retrievable
        # from the saved trial data.
        saved_names = trial.tensor_names()
        assert all(name in saved_names for name in recorded_loss_names)
def test_tensorboard_dir_sagemaker():
    """
    In Sagemaker, we read the tensorboard_dir from a separate JSON config file.
    """
    with SagemakerSimulator() as sim:
        # Drop any stale hook, then force-create a fresh one that should
        # pick up its directories from the simulated SageMaker config.
        smd.del_hook()
        hook = smd.get_hook(create_if_not_exists=True)

        assert hook.out_dir == sim.out_dir
        assert hook.tensorboard_dir == sim.tensorboard_dir
def create_net_and_train(out_dir, n_steps, use_loss_module=False, use_loss_functional=False):
    """Train a small Net for `n_steps` with an smdebug Hook attached.

    Exactly one of `use_loss_module` / `use_loss_functional` selects how the
    loss is computed and recorded: either through a registered
    nn.CrossEntropyLoss module, or through F.cross_entropy plus an explicit
    record_tensor_value call.
    """
    assert (
        use_loss_module != use_loss_functional
    ), "Exactly one of `use_loss_module` and `use_loss_functional` must be true."

    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    # Save every step so short runs still produce trial data.
    hook = smd.Hook(out_dir=out_dir, save_config=smd.SaveConfig(save_interval=1))
    hook.register_module(net)
    if use_loss_module:
        hook.register_loss(criterion)

    batch_size = 1
    # Use the same data at each step to test loss decreasing
    inputs, labels = torch.rand(batch_size, 3, 32, 32), torch.zeros(batch_size).long()
    for _ in range(n_steps):
        optimizer.zero_grad()
        outputs = net(inputs)
        if use_loss_module:
            loss = criterion(outputs, labels)
        if use_loss_functional:
            loss = F.cross_entropy(outputs, labels)
            # Functional losses are not seen by the hook automatically,
            # so record the value explicitly.
            hook.record_tensor_value("nll_loss", tensor_value=loss)
        loss.backward()
        optimizer.step()

    # Users can call this method to immediately use the Trials API.
    hook.close()
    smd.del_hook()
def test_pytorch(script_mode, use_loss_module):
    """Run a short inline training loop (500 mini-batches) under a simulator
    and check the 'losses' collection is saved and readable via the Trials API.

    NOTE(review): this redefines `test_pytorch` declared earlier in the file;
    pytest will collect only this later definition — verify that shadowing is
    intentional.
    """
    # Ensure no hook from a previous test is still registered.
    smd.del_hook()

    simulator_cls = ScriptSimulator if script_mode else SagemakerSimulator
    with simulator_cls() as sim:
        trainloader, testloader = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        # In script mode the hook is wired up manually; otherwise the
        # simulated SageMaker environment is expected to create it.
        if script_mode:
            hook = smd.Hook(out_dir=sim.out_dir)
            hook.register_module(net)
            hook.register_loss(criterion)

        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            if use_loss_module:
                loss = criterion(outputs, labels)
            else:
                loss = F.cross_entropy(outputs, labels)
                if script_mode:
                    hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
            loss.backward()
            optimizer.step()

            # Stop after 500 mini-batches to keep the test fast.
            if i == 499:
                break

        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")
        print(f"collection_manager = {hook.collection_manager}")

        loss_collection_names = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {loss_collection_names}")
        assert len(loss_collection_names) > 0

        # Every tensor the hook grouped under 'losses' must be present in
        # the saved trial output.
        saved_names = trial.tensor_names()
        assert all(name in saved_names for name in loss_collection_names)
def test_pytorch_with_unsupported_version(use_loss_module=False):
    """On an unsupported framework version, training must complete but no
    hook should ever be created."""
    smd.del_hook()
    helper_torch_train(script_mode=False, use_loss_module=use_loss_module)
    print("Finished Training")

    # No hook should have been registered during training.
    assert smd.get_hook() is None