Example #1
0
 def __init__(self):
     """Build the network: two conv+pool stages followed by two linear layers.

     The second ReLU's output tensors are additionally registered with the
     debugger hook's "relu_activations" collection.
     """
     super(Net, self).__init__()
     # Feature extractor: first conv/relu/pool stage plus second conv.
     for layer_name, layer in (
         ("conv1", nn.Conv2d(1, 20, 5, 1)),
         ("relu0", nn.ReLU()),
         ("max_pool", nn.MaxPool2d(2, stride=2)),
         ("conv2", nn.Conv2d(20, 50, 5, 1)),
     ):
         self.add_module(layer_name, layer)
     # This ReLU is kept in a local so its tensors can be registered with
     # the hook's "relu_activations" collection right after it is added.
     tracked_relu = nn.ReLU()
     self.add_module("relu1", tracked_relu)
     get_hook().get_collection("relu_activations").add_module_tensors(tracked_relu)
     # Remaining pool and classifier head.
     for layer_name, layer in (
         ("max_pool2", nn.MaxPool2d(2, stride=2)),
         ("fc1", nn.Linear(4 * 4 * 50, 500)),
         ("relu2", nn.ReLU()),
         ("fc2", nn.Linear(500, 10)),
     ):
         self.add_module(layer_name, layer)
Example #2
0
def test_pytorch(script_mode, use_loss_module):
    """Run a short training session and verify the hook captured losses.

    Parameters
    ----------
    script_mode : bool
        True to simulate manual (script-mode) hook setup, False for the
        zero-code-change SageMaker setup.
    use_loss_module : bool
        Forwarded to the training helper to pick module vs. functional loss.
    """
    smd.del_hook()

    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        helper_torch_train(sim=sim,
                           script_mode=script_mode,
                           use_loss_module=use_loss_module)

        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")
        # Check if the hook was executed with the default
        # hook configuration
        assert hook.has_default_hook_configuration()

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")

        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0

        # Every loss tensor registered on the hook must appear in the trial
        # output.  Reuse the names fetched above (the original re-queried the
        # collection) and test membership against a set instead of scanning
        # the list once per name.
        trial_names = set(trial.tensor_names())
        assert all(name in trial_names for name in losses_tensors)
Example #3
0
def test_tensorboard_dir_sagemaker():
    """ In Sagemaker, we read the tensorboard_dir from a separate JSON config file. """
    with SagemakerSimulator() as sim:
        # Start from a clean slate, then let get_hook construct a fresh hook
        # from the simulator's configuration files.
        smd.del_hook()
        created_hook = smd.get_hook(create_if_not_exists=True)
        # Both output locations must match what the simulator configured.
        assert created_hook.out_dir == sim.out_dir
        assert created_hook.tensorboard_dir == sim.tensorboard_dir
Example #4
0
def test_pytorch(script_mode, use_loss_module):
    """Train a small Net for 500 mini-batches and verify loss capture.

    `script_mode` chooses manual hook wiring vs. the zero-code-change path;
    `use_loss_module` chooses a loss module vs. the functional loss API.
    """
    smd.del_hook()

    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        trainloader, testloader = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        if script_mode:
            # Script mode: create the hook explicitly and register both the
            # model and the loss module with it.
            hook = smd.Hook(out_dir=sim.out_dir)
            hook.register_module(net)
            hook.register_loss(criterion)

        for batch_idx, batch in enumerate(trainloader, 0):
            # Each batch is a list of [inputs, labels].
            inputs, labels = batch

            optimizer.zero_grad()

            # Forward, backward, then an optimizer step.
            outputs = net(inputs)
            if use_loss_module:
                loss = criterion(outputs, labels)
            else:
                loss = F.cross_entropy(outputs, labels)
                if script_mode:
                    # Functional losses are not registered modules, so the
                    # value has to be recorded on the hook by hand.
                    hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
            loss.backward()
            optimizer.step()

            # Stop after 500 mini-batches to keep the test quick.
            if batch_idx == 499:
                break

        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")

        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0

        # Every loss tensor known to the hook must show up in the trial.
        assert all(
            name in trial.tensor_names()
            for name in hook.collection_manager.get("losses").tensor_names
        )
Example #5
0
def train(model, device, optimizer, num_steps=500, save_steps=None):
    """Train *model* on synthetic data, snapshotting fc-layer gradients.

    Parameters
    ----------
    model : module exposing ``fc1``/``fc2``/``fc3`` and a ``saved`` mapping
        keyed by tensor name, each holding per-step numpy snapshots.
    device : torch device to run on.
    optimizer : optimizer stepping the model's parameters.
    num_steps : number of mini-batches to run (default 500).
    save_steps : iterable of step indices at which gradients of the fully
        connected layers are copied into ``model.saved``; defaults to none.
    """
    # The original signature used ``save_steps=[]`` — a mutable default is a
    # single list object shared across all calls.  Normalize None instead.
    if save_steps is None:
        save_steps = []
    model.train()
    for i in range(num_steps):
        batch_size = 32
        # Synthetic batch: random features, random integer targets.
        data, target = torch.rand(batch_size, 20), torch.rand(batch_size).long()
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(Variable(data, requires_grad=True))
        loss = F.nll_loss(output, target)
        # Record the loss with smdebug under an explicit name.
        smd.get_hook().record_tensor_value(tensor_name="my_loss", tensor_value=loss)
        loss.backward()
        if i in save_steps:
            # Snapshot weight and bias gradients of each fc layer so they
            # can be compared against what the hook recorded.  (Replaces
            # six copy-pasted assignment lines; keys are unchanged.)
            for layer_name in ("fc1", "fc2", "fc3"):
                layer = getattr(model, layer_name)
                key_prefix = "gradient/Net_" + layer_name + "."
                model.saved[key_prefix + "weight"][i] = layer.weight.grad.data.numpy().copy()
                model.saved[key_prefix + "bias"][i] = layer.bias.grad.data.numpy().copy()
        optimizer.step()
def main():
    """Entry point: parse CLI args, set up NCCL distributed training, train."""
    arg_parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    arg_parser.add_argument("--batch_size", type=int, default=1024)
    arg_parser.add_argument("--epoch", type=int, default=5)
    arg_parser.add_argument("--local_rank", type=int)
    cli = arg_parser.parse_args()

    # Pin this process to its GPU before bringing up the NCCL process group.
    torch.cuda.set_device(cli.local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")

    # Obtain the debugger hook to pass into the training loop.
    hook = get_hook()

    # Build the model and move it onto the GPU.
    net = models.__dict__["resnext101_32x8d"](pretrained=False)
    device = torch.device("cuda")
    net.to(device)

    # Run training and report the median per-epoch wall time.
    median_time = train(cli.batch_size, cli.epoch, net, hook, device, cli.local_rank)
    print("Median training time per Epoch=%.1f sec" % median_time)
def main():
    """Entry point: parse CLI args, initialize Horovod, and run training."""
    arg_parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    arg_parser.add_argument("--batch_size", type=int, default=1024)
    arg_parser.add_argument("--epochs",
                            type=int,
                            default=5,
                            metavar="N",
                            help="number of epochs to train (default: 5)")
    arg_parser.add_argument("--use_only_cpu", type=str2bool, default=False)
    arg_parser.add_argument("--model", type=str, default="resnet50")
    args = arg_parser.parse_args()

    # Use CUDA only when allowed and actually available.
    args.cuda = not args.use_only_cpu and torch.cuda.is_available()

    batch_size = args.batch_size
    seed = 42

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(seed)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(seed)

    local_rank = hvd.local_rank()
    # Build the requested torchvision model from scratch.
    net = models.__dict__[args.model](pretrained=False)
    if args.cuda:
        net.cuda()
    # Obtain the debugger hook to pass into the training loop.
    hook = get_hook()

    # Run training and report the median per-epoch wall time.
    median_time = train(batch_size, args.epochs, net, hook, args, local_rank)
    print("Median training time per Epoch=%.1f sec" % median_time)
def train(args, net, device):
    """Train *net* on CIFAR-10 and return the median per-epoch wall time.

    Parameters
    ----------
    args : namespace providing ``batch_size``, ``epoch``, ``workers`` and
        ``pin_memory``.
    net : the model to train (already placed on *device* by the caller —
        TODO confirm; this function never moves it).
    device : torch device used for input/target tensors.

    Returns
    -------
    float
        50th percentile of the per-epoch durations in seconds.
    """
    hook = get_hook(create_if_not_exists=True)
    batch_size = args.batch_size
    epoch = args.epoch
    transform_train = transforms.Compose(
        [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            transforms.Lambda(transform_delay),
        ]
    )

    transform_valid = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            transforms.Lambda(transform_delay),
        ]
    )

    trainset = torchvision.datasets.CIFAR10(
        root="./data", train=True, download=True, transform=transform_train
    )
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=args.pin_memory,
    )

    validset = torchvision.datasets.CIFAR10(
        root="./data", train=False, download=True, transform=transform_valid
    )
    validloader = torch.utils.data.DataLoader(
        validset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=args.pin_memory,
    )

    loss_optim = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=1.0, momentum=0.9)

    epoch_times = []

    if hook:
        # Register the loss module so smdebug can capture its outputs.
        hook.register_loss(loss_optim)
    # train the model

    for i in range(epoch):
        print("START TRAINING")
        if hook:
            hook.set_mode(modes.TRAIN)
        start = time.time()
        net.train()
        train_loss = 0
        # The original wrapped these loops in enumerate() and discarded the
        # index; iterate the loaders directly instead.
        for inputs, targets in trainloader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = loss_optim(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        print("START VALIDATING")
        if hook:
            hook.set_mode(modes.EVAL)
        net.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, targets in validloader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net(inputs)
                loss = loss_optim(outputs, targets)
                val_loss += loss.item()

        epoch_time = time.time() - start
        epoch_times.append(epoch_time)
        print(
            "Epoch %d: train loss %.3f, val loss %.3f, in %.1f sec"
            % (i, train_loss, val_loss, epoch_time)
        )

    # calculate training time after all epoch
    p50 = np.percentile(epoch_times, 50)
    return p50
Example #9
0
def test_pytorch_with_unsupported_version(use_loss_module=False):
    """On an unsupported framework version, no hook should ever be created."""
    smd.del_hook()
    helper_torch_train(script_mode=False, use_loss_module=use_loss_module)
    print("Finished Training")
    # Training completed, but smdebug must not have installed a hook.
    created_hook = smd.get_hook()
    assert created_hook is None