def __init__(self):
    """Build a small LeNet-style CNN and register one ReLU with smdebug.

    Layer registration order matters for named-module iteration, so the
    modules are added in the same sequence the forward pass uses them.
    """
    super(Net, self).__init__()
    # Convolutional feature extractor: conv -> relu -> pool, twice.
    self.add_module("conv1", nn.Conv2d(1, 20, 5, 1))
    self.add_module("relu0", nn.ReLU())
    self.add_module("max_pool", nn.MaxPool2d(2, stride=2))
    self.add_module("conv2", nn.Conv2d(20, 50, 5, 1))
    tracked_relu = nn.ReLU()
    self.add_module("relu1", tracked_relu)
    # Record this ReLU's output tensors in the "relu_activations" collection.
    get_hook().get_collection("relu_activations").add_module_tensors(tracked_relu)
    self.add_module("max_pool2", nn.MaxPool2d(2, stride=2))
    # Fully-connected classifier head (input is the flattened 4x4x50 feature map).
    self.add_module("fc1", nn.Linear(4 * 4 * 50, 500))
    self.add_module("relu2", nn.ReLU())
    self.add_module("fc2", nn.Linear(500, 10))
def test_pytorch(script_mode, use_loss_module):
    """Run a short training job and verify the losses collection was saved."""
    smd.del_hook()
    simulator_cls = ScriptSimulator if script_mode else SagemakerSimulator
    with simulator_cls() as sim:
        helper_torch_train(sim=sim, script_mode=script_mode, use_loss_module=use_loss_module)
        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")

        # Check if the hook was executed with the default hook configuration.
        assert hook.has_default_hook_configuration()

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")
        print(f"collection_manager = {hook.collection_manager}")

        loss_names = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {loss_names}")
        assert len(loss_names) > 0
        # Every loss tensor the hook recorded must be retrievable from the trial.
        saved_names = trial.tensor_names()
        assert all(name in saved_names for name in loss_names)
def test_tensorboard_dir_sagemaker():
    """In SageMaker, the tensorboard_dir is read from a separate JSON config file.

    The hook created inside the simulator must pick up both the output
    directory and the tensorboard directory the simulator configured.
    """
    with SagemakerSimulator() as sim:
        smd.del_hook()
        created_hook = smd.get_hook(create_if_not_exists=True)
        assert created_hook.out_dir == sim.out_dir
        assert created_hook.tensorboard_dir == sim.tensorboard_dir
def test_pytorch(script_mode, use_loss_module):
    """Train Net for 500 mini-batches and check that loss tensors were saved."""
    smd.del_hook()
    simulator_cls = ScriptSimulator if script_mode else SagemakerSimulator
    with simulator_cls() as sim:
        trainloader, testloader = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        if script_mode:
            # Outside SageMaker the hook must be created and wired up manually.
            hook = smd.Hook(out_dir=sim.out_dir)
            hook.register_module(net)
            hook.register_loss(criterion)

        for step, batch in enumerate(trainloader, 0):
            # batch is a list of [inputs, labels]
            inputs, labels = batch
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = (
                criterion(outputs, labels)
                if use_loss_module
                else F.cross_entropy(outputs, labels)
            )
            if script_mode:
                hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
            loss.backward()
            optimizer.step()
            if step == 499:  # stop after 500 mini-batches
                break

        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")
        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0
        # Every recorded loss tensor must show up in the trial's saved tensors.
        saved_names = trial.tensor_names()
        assert all(name in saved_names for name in losses_tensors)
def train(model, device, optimizer, num_steps=500, save_steps=None):
    """Train `model` on random synthetic data for `num_steps` steps.

    Records the loss through the smdebug hook every step and, on the steps
    listed in `save_steps`, snapshots the fc1/fc2/fc3 weight and bias
    gradients into `model.saved` for later comparison.

    :param model: module with fc1/fc2/fc3 layers and a `saved` dict of dicts
                  keyed "gradient/Net_<layer>.<param>" -> {step: ndarray}.
    :param device: torch device to run on.
    :param optimizer: optimizer over model.parameters().
    :param num_steps: number of training steps (default 500).
    :param save_steps: steps at which to snapshot gradients (default: none).
    """
    # BUG FIX: the default was a mutable list (`save_steps=[]`), shared
    # across calls; use None as the sentinel instead.
    if save_steps is None:
        save_steps = []
    model.train()
    for i in range(num_steps):
        batch_size = 32
        data, target = torch.rand(batch_size, 20), torch.rand(batch_size).long()
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        # NOTE(review): Variable is a deprecated no-op wrapper in modern torch;
        # kept for compatibility with the original script.
        output = model(Variable(data, requires_grad=True))
        loss = F.nll_loss(output, target)
        smd.get_hook().record_tensor_value(tensor_name="my_loss", tensor_value=loss)
        loss.backward()
        if i in save_steps:
            # Snapshot gradients (copied, since .grad is reused next step).
            for layer in ("fc1", "fc2", "fc3"):
                module = getattr(model, layer)
                model.saved["gradient/Net_%s.weight" % layer][i] = (
                    module.weight.grad.data.numpy().copy()
                )
                model.saved["gradient/Net_%s.bias" % layer][i] = (
                    module.bias.grad.data.numpy().copy()
                )
        optimizer.step()
def main():
    """Entry point: configure NCCL distributed training and run the benchmark."""
    parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    parser.add_argument("--batch_size", type=int, default=1024)
    parser.add_argument("--epoch", type=int, default=5)
    parser.add_argument("--local_rank", type=int)
    opt = parser.parse_args()

    # Pin this process to its GPU before joining the process group.
    torch.cuda.set_device(opt.local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")

    # Init hook
    hook = get_hook()

    # Build the model and move it onto the GPU.
    model = models.__dict__["resnext101_32x8d"](pretrained=False)
    device = torch.device("cuda")
    model.to(device)

    # Start the training and report the median per-epoch wall time.
    median_time = train(opt.batch_size, opt.epoch, model, hook, device, opt.local_rank)
    print("Median training time per Epoch=%.1f sec" % median_time)
def main():
    """Entry point: configure Horovod and run the training benchmark."""
    parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    parser.add_argument("--batch_size", type=int, default=1024)
    parser.add_argument(
        "--epochs",
        type=int,
        default=5,
        metavar="N",
        help="number of epochs to train (default: 5)",
    )
    parser.add_argument("--use_only_cpu", type=str2bool, default=False)
    parser.add_argument("--model", type=str, default="resnet50")
    args = parser.parse_args()
    args.cuda = not args.use_only_cpu and torch.cuda.is_available()

    batch_size = args.batch_size
    seed = 42

    # Horovod: initialize library and seed for reproducibility.
    hvd.init()
    torch.manual_seed(seed)
    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(seed)
    local_rank = hvd.local_rank()

    # create model
    net = models.__dict__[args.model](pretrained=False)
    if args.cuda:
        net.cuda()

    # Init hook
    hook = get_hook()

    # Start the training.
    median_time = train(batch_size, args.epochs, net, hook, args, local_rank)
    print("Median training time per Epoch=%.1f sec" % median_time)
def train(args, net, device):
    """Train and validate `net` on CIFAR-10 and return the median epoch time.

    Registers the loss with the smdebug hook (if one exists) and switches the
    hook between TRAIN and EVAL modes around the respective phases.

    :param args: namespace with batch_size, epoch, workers, pin_memory.
    :param net: model to train (already on `device`).
    :param device: torch device for inputs/targets.
    :return: 50th percentile of per-epoch wall-clock times.
    """
    hook = get_hook(create_if_not_exists=True)
    batch_size = args.batch_size
    epoch = args.epoch

    # Standard CIFAR-10 augmentation/normalization, plus an artificial
    # per-sample delay (transform_delay) used by the profiling tests.
    transform_train = transforms.Compose(
        [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            transforms.Lambda(transform_delay),
        ]
    )
    transform_valid = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            transforms.Lambda(transform_delay),
        ]
    )
    trainset = torchvision.datasets.CIFAR10(
        root="./data", train=True, download=True, transform=transform_train
    )
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=args.pin_memory,
    )
    validset = torchvision.datasets.CIFAR10(
        root="./data", train=False, download=True, transform=transform_valid
    )
    validloader = torch.utils.data.DataLoader(
        validset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=args.pin_memory,
    )

    loss_optim = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=1.0, momentum=0.9)
    epoch_times = []
    if hook:
        hook.register_loss(loss_optim)

    # train the model
    for i in range(epoch):
        print("START TRAINING")
        if hook:
            hook.set_mode(modes.TRAIN)
        start = time.time()
        net.train()
        train_loss = 0
        for _, (inputs, targets) in enumerate(trainloader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = loss_optim(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        print("START VALIDATING")
        if hook:
            hook.set_mode(modes.EVAL)
        net.eval()
        val_loss = 0
        with torch.no_grad():
            for _, (inputs, targets) in enumerate(validloader):
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net(inputs)
                loss = loss_optim(outputs, targets)
                val_loss += loss.item()
        epoch_time = time.time() - start
        epoch_times.append(epoch_time)
        # BUG FIX: the format string was broken across two physical lines
        # (a raw newline inside the literal), which is a syntax error.
        print(
            "Epoch %d: train loss %.3f, val loss %.3f, in %.1f sec"
            % (i, train_loss, val_loss, epoch_time)
        )

    # calculate training time after all epoch
    p50 = np.percentile(epoch_times, 50)
    return p50
def test_pytorch_with_unsupported_version(use_loss_module=False):
    """On an unsupported framework version, training must create no hook."""
    smd.del_hook()
    helper_torch_train(script_mode=False, use_loss_module=use_loss_module)
    print("Finished Training")
    # The hook factory should have refused to create a hook.
    assert smd.get_hook() is None