コード例 #1
0
def test_tensorboard_dir_script_specify_tensorboard_dir():
    """In script mode, `export_tensorboard` together with an explicit `tensorboard_dir` is honored."""
    with ScriptSimulator(tensorboard_dir="/tmp/tensorboard_dir") as sim:
        hook = smd.Hook(
            out_dir=sim.out_dir,
            export_tensorboard=True,
            tensorboard_dir=sim.tensorboard_dir,
        )
        assert hook.tensorboard_dir == sim.tensorboard_dir
コード例 #2
0
def helper_torch_train(sim=None, script_mode=False, use_loss_module=False):
    """Run a short CNN training loop, optionally attaching an smdebug hook.

    Args:
        sim: simulator providing `out_dir` (only read when `script_mode`).
        script_mode: if True, create a Hook and register the module and loss.
        use_loss_module: compute loss via the `nn.CrossEntropyLoss` module
            instead of `F.cross_entropy`.
    """
    trainloader, _testloader = get_dataloaders()
    net = Net()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    if script_mode:
        hook = smd.Hook(out_dir=sim.out_dir)
        hook.register_module(net)
        hook.register_loss(criterion)

    for batch_idx, (inputs, labels) in enumerate(trainloader):
        optimizer.zero_grad()

        # Forward, loss, backward, step — one standard training iteration.
        outputs = net(inputs)
        if use_loss_module:
            loss = criterion(outputs, labels)
        else:
            loss = F.cross_entropy(outputs, labels)
            if script_mode:
                hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
        loss.backward()
        optimizer.step()

        # Keep the run short: 500 mini-batches is enough for the hook to fire.
        if batch_idx == 499:
            break
コード例 #3
0
def test_data_parallel():
    """DataParallel training saves exactly the configured steps for one worker."""
    shutil.rmtree(out_dir, ignore_errors=True)

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers="one",
    )

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    model = Net().to(device)
    if use_cuda:
        model = DataParallel(model)

    hook.register_module(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device(device), optimizer, num_steps=10)

    trial = create_trial(out_dir)
    assert trial.steps() == [0, 1, 5]
    # CPU runs produce a deterministic tensor count; CUDA/DataParallel may add
    # per-replica tensors, so only a lower bound is checked there.
    if device == "cpu":
        assert len(trial.tensor_names()) == 38
    else:
        assert len(trial.tensor_names()) > 37

    shutil.rmtree(out_dir, ignore_errors=True)
コード例 #4
0
def create_net_and_train(out_dir, n_steps, use_loss_module=False, use_loss_functional=False):
    """Train a small CNN for `n_steps` on one fixed batch, recording via smdebug.

    Exactly one of `use_loss_module` / `use_loss_functional` selects how the
    loss is computed and captured by the hook.
    """
    assert (
        use_loss_module != use_loss_functional
    ), "Exactly one of `use_loss_module` and `use_loss_functional` must be true."

    net = Net()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.9)

    hook = smd.Hook(out_dir=out_dir, save_config=smd.SaveConfig(save_interval=1))
    hook.register_module(net)
    if use_loss_module:
        hook.register_loss(criterion)

    # A single fixed batch is reused at every step so the loss trend is testable.
    inputs = torch.rand(1, 3, 32, 32)
    labels = torch.zeros(1).long()
    for _ in range(n_steps):
        optimizer.zero_grad()
        outputs = net(inputs)
        if use_loss_module:
            loss = criterion(outputs, labels)
        if use_loss_functional:
            loss = F.cross_entropy(outputs, labels)
            hook.record_tensor_value("nll_loss", tensor_value=loss)
        loss.backward()
        optimizer.step()

    # Closing the hook flushes data so the Trials API can be used immediately.
    hook.close()
    smd.del_hook()
コード例 #5
0
def test_pytorch(script_mode, use_loss_module):
    """End-to-end training run; verifies the 'losses' collection reaches the trial."""
    smd.del_hook()

    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        trainloader, testloader = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        if script_mode:
            # In script mode the user wires up the hook manually.
            hook = smd.Hook(out_dir=sim.out_dir)
            hook.register_module(net)
            hook.register_loss(criterion)

        for step, (inputs, labels) in enumerate(trainloader):
            optimizer.zero_grad()

            # Forward, loss, backward, step.
            outputs = net(inputs)
            if use_loss_module:
                loss = criterion(outputs, labels)
            else:
                loss = F.cross_entropy(outputs, labels)
                if script_mode:
                    hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
            loss.backward()
            optimizer.step()

            # 500 mini-batches is enough training for the hook to save data.
            if step == 499:
                break

        print("Finished Training")

        # Re-fetch the active hook (in SageMaker mode it was created implicitly).
        hook = smd.get_hook()
        print(f"hook = {hook}")

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")

        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0

        # Every tensor registered in the losses collection must be visible in the trial.
        assert all(name in trial.tensor_names() for name in losses_tensors)
コード例 #6
0
def run(rank, size, include_workers="one", num_epochs=10, batch_size=128, num_batches=10):
    """Distributed function to be implemented later."""
    torch.manual_seed(1234)
    device = torch.device("cpu")
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=1)

    shutil.rmtree(out_dir, ignore_errors=True)

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers=include_workers,
    )
    hook.register_module(model)

    for _epoch in range(num_epochs):
        running_loss = 0.0
        for _batch in range(num_batches):
            optimizer.zero_grad()
            data, target = dataset(batch_size)
            output = model(data)
            loss = F.mse_loss(output, target)
            running_loss += loss.item()
            loss.backward()
            # Manually average gradients across ranks before stepping.
            average_gradients(model)
            optimizer.step()
        # print(f"Rank {dist.get_rank()}, epoch {_epoch}: {running_loss / num_batches}")

    assert hook._get_worker_name() == f"worker_{dist.get_rank()}"
    # Both workers race to move /tmp/{out_dir}/END_OF_JOB.ts into {out_dir};
    # the loser sees FileNotFoundError, which is benign here.
    try:
        hook._cleanup()
    except FileNotFoundError:
        pass
コード例 #7
0
def test_run_net_single_process(out_dir):
    """Runs a single linear layer."""
    shutil.rmtree(out_dir, ignore_errors=True)

    device = torch.device("cpu")
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
    )
    hook.register_module(model)
    train(model=model, device=device, optimizer=optimizer)
    hook._cleanup()

    # Single-process runs are always worker_0.
    assert hook._get_worker_name() == "worker_0"

    trial = create_trial(path=out_dir)
    assert len(trial.workers()) == 1, f"trial.workers() = {trial.workers()}"
    assert len(trial.steps()) == 3, f"trial.steps() = {trial.steps()}"
    shutil.rmtree(out_dir, ignore_errors=True)
コード例 #8
0
def test_no_name_clash():
    """Verify that tensors from the model's distinct relu call sites do not collide.

    Trains a small CNN for a few steps with `save_all=True` and checks that
    the trial reports 6 separate `relu.*` tensor names.
    """
    import tempfile

    # The original `TemporaryDirectory().name` relied on CPython's immediate
    # refcount GC to delete the directory and leave a unique, non-existent
    # path. Make that intent explicit and interpreter-independent:
    out_dir = tempfile.mkdtemp()
    shutil.rmtree(out_dir)  # keep only the unique name; the hook creates the dir

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers="one",
    )
    model = Net()
    hook.register_module(model)
    device = "cpu"
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device(device), optimizer, num_steps=10)

    trial = create_trial(out_dir)
    assert trial.steps() == [0, 1, 5]

    # Each relu call site must produce its own tensor name — no clashes.
    assert len(trial.tensor_names(regex="relu.*")) == 6
    shutil.rmtree(out_dir, ignore_errors=True)
コード例 #9
0
def start_training(model, trainloader, testloader, model_ext):
    """Train `model` for 5 epochs with SGD + cosine LR schedule, logging via smdebug.

    Args:
        model: the network to train.
        trainloader: training data loader.
        testloader: evaluation data loader.
        model_ext: model name; also used as the smdebug job/output directory name.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        model.parameters(), lr=0.05, momentum=0, weight_decay=5e-4
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

    # Register the debugging job under the model's name.
    job_name = model_ext
    hook = smd.Hook(
        out_dir=f'./smdebug/{job_name}',
        save_config=smd.SaveConfig(save_interval=100),
        include_collections=['weights', 'gradients', 'biases'],
    )
    hook.register_module(model)
    hook.register_loss(criterion)

    for epoch in range(5):
        train(model, trainloader, epoch, model_ext, criterion, optimizer, hook)
        test(model, testloader, epoch, criterion, model_ext)
        scheduler.step()
コード例 #10
0
def train_model(out_dir="/tmp/smdebug", training_steps=5):
    """Unroll an RNN over `training_steps` timesteps on fake data, saving via smdebug.

    Args:
        out_dir: directory the hook writes tensors to.
        training_steps: number of RNN timesteps to unroll; the loss is summed
            over all of them and backpropagated once.
    """
    rnn = RNN(50, 20, 10)
    save_config = smd.SaveConfig(save_interval=500)
    hook = smd.Hook(out_dir=out_dir, save_all=True, save_config=save_config)

    loss_fn = nn.MSELoss()

    hook.register_module(rnn)
    # `register_loss` (not `register_module`) is smdebug's API for loss
    # modules; it records loss values consistently with how the criterion is
    # registered in the other examples in this file.
    hook.register_loss(loss_fn)

    batch_size = 10

    # Fixed fake data: one batch reused at every timestep.
    batch = torch.randn(batch_size, 50)
    hidden = torch.zeros(batch_size, 20)
    target = torch.zeros(batch_size, 10)

    # Accumulate the loss across timesteps, then backprop once through the
    # whole unrolled graph.
    loss = 0
    for _ in range(training_steps):
        hidden, output = rnn(batch, hidden)
        loss += loss_fn(output, target)
    loss.backward()
    hook.close()
コード例 #11
0
def test_tensorboard_dir_non_sagemaker_forgot_export_tensorboard():
    """In script mode, `tensorboard_dir` alone (without export_tensorboard) still takes effect."""
    with ScriptSimulator(tensorboard_dir="/tmp/tensorboard_dir") as sim:
        hook = smd.Hook(out_dir=sim.out_dir, tensorboard_dir=sim.tensorboard_dir)
        assert hook.tensorboard_dir == sim.tensorboard_dir
コード例 #12
0
def test_tensorboard_dir_script_export_tensorboard():
    """In script mode, `export_tensorboard=True` defaults tensorboard_dir to out_dir/tensorboard."""
    with ScriptSimulator() as sim:
        hook = smd.Hook(out_dir=sim.out_dir, export_tensorboard=True)
        expected = os.path.join(hook.out_dir, "tensorboard")
        assert hook.tensorboard_dir == expected
コード例 #13
0
def test_tensorboard_dir_script_default():
    """In script mode, tensorboard export is off unless explicitly requested."""
    with ScriptSimulator() as sim:
        hook = smd.Hook(out_dir=sim.out_dir)
        assert hook.tensorboard_dir is None
コード例 #14
0
        self.hidden_size = hidden_size
        input_size = data_size + hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, data, last_hidden):
        """One RNN step: map (input, previous hidden) to (new hidden, output)."""
        combined = torch.cat((data, last_hidden), 1)
        new_hidden = self.i2h(combined)
        output = self.h2o(new_hidden)
        return new_hidden, output


# Script-level demo: build an RNN and attach an smdebug hook before the
# training loop (the loop that consumes `loss` follows outside this excerpt).
rnn = RNN(50, 20, 10)  # presumably (data_size, hidden_size, output_size) — matches forward() above
save_config = smd.SaveConfig(save_interval=500)
hook = smd.Hook(out_dir="/tmp/smdebug", save_all=True, save_config=save_config)

loss_fn = nn.MSELoss()

hook.register_module(rnn)
# hook.register_module(loss_fn)  # left disabled in this example

batch_size = 10
TIMESTEPS = 5  # number of RNN steps to unroll

# Create some fake data (one batch reused at every timestep).
batch = torch.randn(batch_size, 50)
hidden = torch.zeros(batch_size, 20)
target = torch.zeros(batch_size, 10)

# Loss accumulator; summed over timesteps by the loop that follows.
loss = 0
コード例 #15
0
ファイル: train.py プロジェクト: larroy/tornasole_misc
def create_smdebug_hook():
    """Return an smdebug hook that saves every tensor (regex `.*`) under /tmp/tensors."""
    # Alternatively, build the hook from the SageMaker pySDK json config:
    # hook = smd.Hook.create_from_json_file()
    return smd.Hook("/tmp/tensors", include_regex=".*")