def test_tensorboard_dir_script_specify_tensorboard_dir():
    """
    In script mode, passing `export_tensorboard` and `tensorboard_dir` works.
    """
    with ScriptSimulator(tensorboard_dir="/tmp/tensorboard_dir") as sim:
        hook = smd.Hook(
            out_dir=sim.out_dir,
            export_tensorboard=True,
            tensorboard_dir=sim.tensorboard_dir,
        )
        assert hook.tensorboard_dir == sim.tensorboard_dir
def helper_torch_train(sim=None, script_mode=False, use_loss_module=False):
    trainloader, testloader = get_dataloaders()
    net = Net()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    if script_mode:
        hook = smd.Hook(out_dir=sim.out_dir)
        hook.register_module(net)
        hook.register_loss(criterion)

    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        if use_loss_module:
            loss = criterion(outputs, labels)
        else:
            loss = F.cross_entropy(outputs, labels)
        if script_mode:
            hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
        loss.backward()
        optimizer.step()
        if i == 499:
            break
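# For reference, a plausible call site for this helper, assuming the
# `ScriptSimulator` context manager used elsewhere in these tests
# (the invocation itself is illustrative, not part of the original file):
def example_invoke_helper():
    with ScriptSimulator() as sim:
        helper_torch_train(sim=sim, script_mode=True, use_loss_module=True)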
def test_data_parallel():
    shutil.rmtree(out_dir, ignore_errors=True)

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers="one",
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Net().to(device)
    if device == "cuda":
        model = DataParallel(model)

    hook.register_module(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device(device), optimizer, num_steps=10)

    trial = create_trial(out_dir)
    assert trial.steps() == [0, 1, 5]
    if device == "cpu":
        assert len(trial.tensor_names()) == 38
    else:
        assert len(trial.tensor_names()) > 37

    shutil.rmtree(out_dir, ignore_errors=True)
def create_net_and_train(out_dir, n_steps, use_loss_module=False, use_loss_functional=False):
    assert (
        use_loss_module != use_loss_functional
    ), "Exactly one of `use_loss_module` and `use_loss_functional` must be true."

    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    hook = smd.Hook(out_dir=out_dir, save_config=smd.SaveConfig(save_interval=1))
    hook.register_module(net)
    if use_loss_module:
        hook.register_loss(criterion)

    batch_size = 1
    # Use the same data at each step to test loss decreasing
    inputs, labels = torch.rand(batch_size, 3, 32, 32), torch.zeros(batch_size).long()
    for _ in range(n_steps):
        optimizer.zero_grad()
        outputs = net(inputs)
        if use_loss_module:
            loss = criterion(outputs, labels)
        if use_loss_functional:
            loss = F.cross_entropy(outputs, labels)
            hook.record_tensor_value("nll_loss", tensor_value=loss)
        loss.backward()
        optimizer.step()

    # Users can call this method to immediately use the Trials API.
    hook.close()
    smd.del_hook()
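# A hedged sketch of inspecting the saved loss with the Trials API right after
# `hook.close()`. The out_dir below is hypothetical, and this assumes the
# functional-loss path so the tensor is recorded under the name "nll_loss".
def example_check_loss_decreases():
    from smdebug.trials import create_trial

    example_out_dir = "/tmp/smdebug_loss_example"  # hypothetical directory
    create_net_and_train(out_dir=example_out_dir, n_steps=10, use_loss_functional=True)

    trial = create_trial(example_out_dir)
    # `value(step)` returns the tensor saved at that step as a numpy array.
    losses = [trial.tensor("nll_loss").value(step) for step in trial.steps()]
    # The same inputs are used at every step, so the loss should trend downward.
    assert losses[-1] < losses[0]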
def test_pytorch(script_mode, use_loss_module):
    smd.del_hook()

    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        trainloader, testloader = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        if script_mode:
            hook = smd.Hook(out_dir=sim.out_dir)
            hook.register_module(net)
            hook.register_loss(criterion)

        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            if use_loss_module:
                loss = criterion(outputs, labels)
            else:
                loss = F.cross_entropy(outputs, labels)
            if script_mode:
                hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
            loss.backward()
            optimizer.step()
            if i == 499:  # stop after 500 mini-batches
                break

        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")
        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0

        assert all(
            [
                name in trial.tensor_names()
                for name in hook.collection_manager.get("losses").tensor_names
            ]
        )
def run(rank, size, include_workers="one", num_epochs=10, batch_size=128, num_batches=10):
    """Distributed training loop; each rank registers its own smdebug hook."""
    torch.manual_seed(1234)
    device = torch.device("cpu")
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=1)

    shutil.rmtree(out_dir, ignore_errors=True)
    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers=include_workers,
    )
    hook.register_module(model)

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for _ in range(num_batches):
            optimizer.zero_grad()
            data, target = dataset(batch_size)
            output = model(data)
            loss = F.mse_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()
        # print(f"Rank {dist.get_rank()}, epoch {epoch}: {epoch_loss / num_batches}")

    assert hook._get_worker_name() == f"worker_{dist.get_rank()}"

    # Race condition here where both workers attempt to move
    # /tmp/{out_dir}/END_OF_JOB.ts to {out_dir}/END_OF_JOB.ts
    try:
        hook._cleanup()
    except FileNotFoundError:
        pass
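# For context, `run` is typically launched once per rank. A minimal sketch in
# the style of the PyTorch distributed tutorial, assuming a CPU "gloo" backend;
# `init_process`, the address/port values, and the launcher below are
# illustrative, not part of the original test.
def init_process(rank, size, fn, backend="gloo"):
    import os

    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)


def example_launch_two_workers():
    import torch.multiprocessing as mp

    size = 2
    processes = []
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()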
def test_run_net_single_process(out_dir):
    """Runs a single linear layer."""
    device = torch.device("cpu")
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    shutil.rmtree(out_dir, ignore_errors=True)
    hook = smd.Hook(
        out_dir=out_dir, save_config=smd.SaveConfig(save_steps=[0, 1, 5]), save_all=True
    )
    hook.register_module(model)
    train(model=model, device=device, optimizer=optimizer)
    hook._cleanup()

    assert hook._get_worker_name() == "worker_0"

    trial = create_trial(path=out_dir)
    assert len(trial.workers()) == 1, f"trial.workers() = {trial.workers()}"
    assert len(trial.steps()) == 3, f"trial.steps() = {trial.steps()}"

    shutil.rmtree(out_dir, ignore_errors=True)
def test_no_name_clash():
    out_dir = TemporaryDirectory().name
    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers="one",
    )

    model = Net()
    hook.register_module(model)
    device = "cpu"
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device(device), optimizer, num_steps=10)

    trial = create_trial(out_dir)
    assert trial.steps() == [0, 1, 5]
    assert len(trial.tensor_names(regex="relu.*")) == 6

    shutil.rmtree(out_dir, ignore_errors=True)
def start_training(model, trainloader, testloader, model_ext):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

    # Register the smdebug hook for this job
    job_name = model_ext
    hook = smd.Hook(
        out_dir=f"./smdebug/{job_name}",
        save_config=smd.SaveConfig(save_interval=100),
        include_collections=["weights", "gradients", "biases"],
    )
    hook.register_module(model)
    hook.register_loss(criterion)

    for epoch in range(5):
        train(model, trainloader, epoch, model_ext, criterion, optimizer, hook)
        test(model, testloader, epoch, criterion, model_ext)
        scheduler.step()
def train_model(out_dir="/tmp/smdebug", training_steps=5):
    rnn = RNN(50, 20, 10)

    save_config = smd.SaveConfig(save_interval=500)
    hook = smd.Hook(out_dir=out_dir, save_all=True, save_config=save_config)

    loss_fn = nn.MSELoss()
    hook.register_module(rnn)
    hook.register_module(loss_fn)

    batch_size = 10
    TIMESTEPS = training_steps

    # Create some fake data
    batch = torch.randn(batch_size, 50)
    hidden = torch.zeros(batch_size, 20)
    target = torch.zeros(batch_size, 10)

    loss = 0
    for t in range(TIMESTEPS):
        hidden, output = rnn(batch, hidden)
        loss += loss_fn(output, target)
    loss.backward()
    hook.close()
def test_tensorboard_dir_non_sagemaker_forgot_export_tensorboard():
    """
    In script mode, passing `tensorboard_dir` without `export_tensorboard` still works.
    """
    with ScriptSimulator(tensorboard_dir="/tmp/tensorboard_dir") as sim:
        hook = smd.Hook(out_dir=sim.out_dir, tensorboard_dir=sim.tensorboard_dir)
        assert hook.tensorboard_dir == sim.tensorboard_dir
def test_tensorboard_dir_script_export_tensorboard():
    """
    In script mode, passing `export_tensorboard=True` alone defaults
    `tensorboard_dir` to `<out_dir>/tensorboard`.
    """
    with ScriptSimulator() as sim:
        hook = smd.Hook(out_dir=sim.out_dir, export_tensorboard=True)
        assert hook.tensorboard_dir == os.path.join(hook.out_dir, "tensorboard")
def test_tensorboard_dir_script_default():
    """
    In script mode, we default to no tensorboard.
    """
    with ScriptSimulator() as sim:
        hook = smd.Hook(out_dir=sim.out_dir)
        assert hook.tensorboard_dir is None
class RNN(nn.Module):
    def __init__(self, data_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        input_size = data_size + hidden_size
        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, data, last_hidden):
        input = torch.cat((data, last_hidden), 1)
        hidden = self.i2h(input)
        output = self.h2o(hidden)
        return hidden, output


rnn = RNN(50, 20, 10)

save_config = smd.SaveConfig(save_interval=500)
hook = smd.Hook(out_dir="/tmp/smdebug", save_all=True, save_config=save_config)

loss_fn = nn.MSELoss()
hook.register_module(rnn)
# hook.register_module(loss_fn)

batch_size = 10
TIMESTEPS = 5

# Create some fake data
batch = torch.randn(batch_size, 50)
hidden = torch.zeros(batch_size, 20)
target = torch.zeros(batch_size, 10)

loss = 0
def create_smdebug_hook():
    # On SageMaker, the hook can instead be built from the configuration you
    # pass to the SageMaker Python SDK:
    # hook = smd.Hook.create_from_json_file()
    hook = smd.Hook("/tmp/tensors", include_regex=".*")
    return hook
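# For context, a minimal sketch of wiring this hook into training, reusing the
# `register_module` / `register_loss` pattern from the snippets above
# (`Net` as defined elsewhere in this file):
def example_register_hook():
    hook = create_smdebug_hook()
    model = Net()
    criterion = nn.CrossEntropyLoss()
    hook.register_module(model)  # saves tensors matching include_regex=".*"
    hook.register_loss(criterion)  # adds the loss to the "losses" collection
    return model, criterion, hook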