def test_data_parallel():
    shutil.rmtree(out_dir, ignore_errors=True)

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers="one",
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Net().to(device)
    if device == "cuda":
        model = DataParallel(model)

    hook.register_module(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device(device), optimizer, num_steps=10)

    trial = create_trial(out_dir)
    assert trial.steps() == [0, 1, 5]
    if device == "cpu":
        assert len(trial.tensor_names()) == 38
    else:
        assert len(trial.tensor_names()) > 37

    shutil.rmtree(out_dir, ignore_errors=True)

def model_fn(model_dir):
    global hook

    # create model
    model = models.resnet18()

    # traffic sign dataset has 43 classes
    nfeatures = model.fc.in_features
    model.fc = nn.Linear(nfeatures, 43)

    # load model weights
    weights = torch.load(model_dir + '/model/model.pt', map_location=lambda storage, loc: storage)
    model.load_state_dict(weights)
    model.eval()
    model.cpu()

    # hook configuration: save every prediction step
    save_config = smd.SaveConfig(mode_save_configs={
        smd.modes.PREDICT: smd.SaveConfigMode(save_interval=1)
    })
    boto_session = boto3.Session()
    sagemaker_session = sagemaker.Session(boto_session=boto_session)
    hook = CustomHook("s3://" + sagemaker_session.default_bucket() + "/endpoint/tensors",
                      save_config=save_config,
                      include_regex='.*bn|.*bias|.*downsample|.*ResNet_input|.*image|.*fc_output')

    # register hook
    hook.register_module(model)

    # set mode
    hook.set_mode(modes.PREDICT)
    return model

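# CustomHook is constructed above (and again in train_model further below) but
# never defined in these snippets. A minimal sketch of one possible
# implementation, assuming all it adds over smd.Hook is capturing gradients of
# the input image during the backward pass; `image_gradients` matches the call
# site in train_model, but the body here is an assumption, not the real code.
class CustomHook(smd.Hook):
    def image_gradients(self, image):
        # Tensor.register_hook fires when gradients reach `image` during
        # backward(); stashing them on the hook object is just one possible
        # way to persist them.
        image.register_hook(lambda grad: setattr(self, "last_image_gradients", grad))
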
def create_net_and_train(out_dir, n_steps, use_loss_module=False, use_loss_functional=False):
    assert (
        use_loss_module != use_loss_functional
    ), "Exactly one of `use_loss_module` and `use_loss_functional` must be true."

    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    hook = smd.Hook(out_dir=out_dir, save_config=smd.SaveConfig(save_interval=1))
    hook.register_module(net)
    if use_loss_module:
        hook.register_loss(criterion)

    batch_size = 1
    # Use the same data at each step to test loss decreasing
    inputs, labels = torch.rand(batch_size, 3, 32, 32), torch.zeros(batch_size).long()
    for _ in range(n_steps):
        optimizer.zero_grad()
        outputs = net(inputs)
        if use_loss_module:
            loss = criterion(outputs, labels)
        if use_loss_functional:
            loss = F.cross_entropy(outputs, labels)
            hook.record_tensor_value("nll_loss", tensor_value=loss)
        loss.backward()
        optimizer.step()

    # Users can call this method to immediately use the Trials API.
    hook.close()
    smd.del_hook()

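# A hedged usage sketch: once create_net_and_train has written tensors, the
# Trials API already used by the tests in this file can read the recorded
# loss back. "/tmp/loss_demo" is a hypothetical output directory.
create_net_and_train(out_dir="/tmp/loss_demo", n_steps=10, use_loss_module=True)
trial = create_trial("/tmp/loss_demo")
loss_name = trial.tensor_names(regex=".*loss.*")[0]
print([trial.tensor(loss_name).value(s) for s in trial.steps()])
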
def setDebuggerSaveConfig():
    # Save every step in all four modes.
    return smd.SaveConfig(
        mode_save_configs={
            smd.modes.TRAIN: smd.SaveConfigMode(save_interval=1),
            smd.modes.EVAL: smd.SaveConfigMode(save_interval=1),
            smd.modes.PREDICT: smd.SaveConfigMode(save_interval=1),
            smd.modes.GLOBAL: smd.SaveConfigMode(save_interval=1),
        }
    )

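# A minimal usage sketch ("/tmp/smdebug_all_modes" is a hypothetical output
# directory): the returned SaveConfig saves every step in all four modes once
# it is handed to a hook.
hook = smd.Hook(out_dir="/tmp/smdebug_all_modes", save_config=setDebuggerSaveConfig())
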
def run(rank, size, include_workers="one", num_epochs=10, batch_size=128, num_batches=10):
    """Distributed training loop executed by each worker."""
    torch.manual_seed(1234)
    device = torch.device("cpu")
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=1)

    shutil.rmtree(out_dir, ignore_errors=True)
    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers=include_workers,
    )
    hook.register_module(model)

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for _ in range(num_batches):
            optimizer.zero_grad()
            data, target = dataset(batch_size)
            output = model(data)
            loss = F.mse_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()
        # print(f"Rank {dist.get_rank()}, epoch {epoch}: {epoch_loss / num_batches}")

    assert hook._get_worker_name() == f"worker_{dist.get_rank()}"

    # Race condition here where both workers attempt to move
    # /tmp/{out_dir}/END_OF_JOB.ts to {out_dir}/END_OF_JOB.ts
    try:
        hook._cleanup()
    except FileNotFoundError:
        pass

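# `average_gradients` and the process launch are referenced above but not
# shown. A sketch under common assumptions (gloo backend, localhost
# rendezvous), in the style of the PyTorch distributed tutorial this loop
# follows:
import os
import torch.distributed as dist

def average_gradients(model):
    # All-reduce each parameter's gradient and divide by the world size.
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= world_size

def init_processes(rank, size, fn, backend="gloo"):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)
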
def test_run_net_single_process(out_dir):
    """Runs a single linear layer."""
    device = torch.device("cpu")
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    shutil.rmtree(out_dir, ignore_errors=True)
    hook = smd.Hook(out_dir=out_dir, save_config=smd.SaveConfig(save_steps=[0, 1, 5]), save_all=True)
    hook.register_module(model)
    train(model=model, device=device, optimizer=optimizer)
    hook._cleanup()

    assert hook._get_worker_name() == "worker_0"

    trial = create_trial(path=out_dir)
    assert len(trial.workers()) == 1, f"trial.workers() = {trial.workers()}"
    assert len(trial.steps()) == 3, f"trial.steps() = {trial.steps()}"

    shutil.rmtree(out_dir, ignore_errors=True)

def test_no_name_clash():
    out_dir = TemporaryDirectory().name
    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers="one",
    )

    model = Net()
    hook.register_module(model)

    device = "cpu"
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device(device), optimizer, num_steps=10)

    trial = create_trial(out_dir)
    assert trial.steps() == [0, 1, 5]
    assert len(trial.tensor_names(regex="relu.*")) == 6

    shutil.rmtree(out_dir, ignore_errors=True)

def model_fn(model_dir: str) -> ModelWithDebugHook:
    # create model
    model = models.resnet18()

    # traffic sign dataset has 43 classes
    nfeatures = model.fc.in_features
    model.fc = nn.Linear(nfeatures, 43)

    # load model weights
    weights = torch.load(f'{model_dir}/model/model.pt', map_location=lambda storage, loc: storage)
    model.load_state_dict(weights)
    model.eval()
    model.cpu()

    # hook configuration
    tensors_output_s3uri = os.environ.get('tensors_output')
    if tensors_output_s3uri is None:
        logger.warning(
            'Skipping hook configuration as no tensors_output env var provided. '
            'Tensors will not be exported'
        )
        hook = None
    else:
        save_config = smd.SaveConfig(mode_save_configs={
            smd.modes.PREDICT: smd.SaveConfigMode(save_interval=1),
        })
        hook = CustomHook(
            tensors_output_s3uri,
            save_config=save_config,
            include_regex='.*bn|.*bias|.*downsample|.*ResNet_input|.*image|.*fc_output',
        )

        # register hook
        hook.register_module(model)

        # set mode
        hook.set_mode(modes.PREDICT)

    return ModelWithDebugHook(model, hook)

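# ModelWithDebugHook is returned above but not defined in these snippets; a
# minimal container sketch (the attribute names are assumptions):
class ModelWithDebugHook:
    def __init__(self, model, hook):
        self.model = model
        self.hook = hook  # may be None when no tensors_output env var is set
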
def start_training(model, trainloader, testloader, model_ext):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

    # register the smdebug hook for this job
    job_name = model_ext
    hook = smd.Hook(out_dir=f'./smdebug/{job_name}',
                    save_config=smd.SaveConfig(save_interval=100),
                    include_collections=['weights', 'gradients', 'biases'])
    hook.register_module(model)
    hook.register_loss(criterion)

    for epoch in range(0, 5):
        train(model, trainloader, epoch, model_ext, criterion, optimizer, hook)
        test(model, testloader, epoch, criterion, model_ext)
        scheduler.step()

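# `train` and `test` are called above but not defined in this snippet. A
# minimal sketch of `train`, with the signature taken from the call site; the
# body is an assumption that mirrors the training loop in train_model below
# (`epoch` and `model_ext` would presumably be used for logging).
def train(model, trainloader, epoch, model_ext, criterion, optimizer, hook):
    hook.set_mode(modes.TRAIN)
    model.train()
    for inputs, labels in trainloader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
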
def train_model(out_dir="/tmp/smdebug", training_steps=5):
    rnn = RNN(50, 20, 10)

    save_config = smd.SaveConfig(save_interval=500)
    hook = smd.Hook(out_dir=out_dir, save_all=True, save_config=save_config)

    loss_fn = nn.MSELoss()
    hook.register_module(rnn)
    hook.register_module(loss_fn)

    batch_size = 10
    TIMESTEPS = training_steps

    # Create some fake data
    batch = torch.randn(batch_size, 50)
    hidden = torch.zeros(batch_size, 20)
    target = torch.zeros(batch_size, 10)

    loss = 0
    for t in range(TIMESTEPS):
        hidden, output = rnn(batch, hidden)
        loss += loss_fn(output, target)
    loss.backward()
    hook.close()

class RNN(nn.Module):
    def __init__(self, data_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        input_size = data_size + hidden_size
        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, data, last_hidden):
        input = torch.cat((data, last_hidden), 1)
        hidden = self.i2h(input)
        output = self.h2o(hidden)
        return hidden, output


rnn = RNN(50, 20, 10)

save_config = smd.SaveConfig(save_interval=500)
hook = smd.Hook(out_dir="/tmp/smdebug", save_all=True, save_config=save_config)

loss_fn = nn.MSELoss()
hook.register_module(rnn)
# hook.register_module(loss_fn)

batch_size = 10
TIMESTEPS = 5

# Create some fake data
batch = torch.randn(batch_size, 50)
hidden = torch.zeros(batch_size, 20)
target = torch.zeros(batch_size, 10)

def train_model(epochs, batch_size_train, batch_size_val):
    # check if GPU is available and set context
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # get pretrained ResNet model
    model = models.resnet18(pretrained=True)

    # replace inplace operators
    relu_inplace(model)

    nfeatures = model.fc.in_features

    # traffic sign dataset has 43 classes
    model.fc = nn.Linear(nfeatures, 43)

    # copy model to GPU or CPU
    model = model.to(device)

    # loss for multi-class classification
    loss_function = nn.CrossEntropyLoss()

    # optimizer
    optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum)

    # configure smdebug hook:
    # save all iterations from the validation phase,
    # save only the first iteration from the training phase
    save_config = smd.SaveConfig(mode_save_configs={
        smd.modes.TRAIN: smd.SaveConfigMode(save_steps=[0]),
        smd.modes.EVAL: smd.SaveConfigMode(save_interval=1)
    })

    # create custom hook that has a customized forward function, so that we can get gradients of outputs
    hook = CustomHook(args.smdebug_dir,
                      save_config=save_config,
                      include_regex='.*bn|.*bias|.*downsample|.*ResNet_input|.*image|.*fc_output')

    # register hook
    hook.register_module(model)

    # get the dataloaders for train and validation data
    train_loader, val_loader = get_dataloaders(batch_size_train, batch_size_val)

    # training loop
    for epoch in range(epochs):
        epoch_loss = 0
        epoch_acc = 0

        # set hook to training phase
        hook.set_mode(modes.TRAIN)
        model.train()

        for inputs, labels in train_loader:
            inputs = inputs.to(device).requires_grad_()
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass
            outputs = model(inputs)

            # get predictions
            _, preds = torch.max(outputs, 1)

            # compute loss
            loss = loss_function(outputs, labels)

            # backward pass
            loss.backward()

            # optimize parameters
            optimizer.step()

            # statistics
            epoch_loss += loss.item()
            epoch_acc += torch.sum(preds == labels.data)

        # set hook to validation phase
        hook.set_mode(modes.EVAL)
        model.eval()

        for inputs, labels in val_loader:
            inputs = inputs.to(device).requires_grad_()
            hook.image_gradients(inputs)

            # forward pass
            outputs = model(inputs)

            # get predictions
            predicted_class = outputs.data.max(1, keepdim=True)[1]

            # sum the scores of the predicted classes so a single backward
            # pass yields their gradients with respect to the input images
            agg = 0
            for i in range(outputs.shape[0]):
                agg += outputs[i, predicted_class[i]]

            model.zero_grad()

            # compute gradients with respect to outputs
            agg.backward()

        print('Epoch {}/{} Loss: {:.4f} Acc: {:.4f}'.format(
            epoch, epochs - 1, epoch_loss, epoch_acc))

    return model

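# A hedged read-back sketch: after training, the tensors the custom hook saved
# (including the input-image gradients captured above) can be inspected with
# the Trials API used elsewhere in these snippets.
trial = create_trial(args.smdebug_dir)
print(trial.tensor_names(regex=".*image.*"))
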