Example #1
def none_skip(pipeline_style):
    if pipeline_style == MultiProcessPipe.AsyncSchedule:
        pytest.skip("Skip tensors NYI for AsyncSchedule")

    @skippable(stash=["none"])
    class Stash(nn.Module):
        def forward(self, input):
            yield stash("none", None)
            return input

    @skippable(pop=["none"])
    class Pop(nn.Module):
        def forward(self, input):
            none = yield pop("none")
            assert none is None
            return input

    model = nn.Sequential(Stash(), Pop())
    model = MultiProcessPipe(
        model,
        [1, 1],
        style=pipeline_style,
        worker_map=get_worker_map(),
        input_device=torch.cuda.current_device(),
        chunks=5,
    ).cuda()

    input = torch.rand(10, requires_grad=True).cuda()
    input.retain_grad()
    output = model(input)

    def assert_grad_fn_is_not_portal(grad_fn, visited=set()):
        if grad_fn in visited or grad_fn is None:
            return

        assert not isinstance(grad_fn, PortalBlue._backward_cls)
        assert not isinstance(grad_fn, PortalCopy._backward_cls)
        assert not isinstance(grad_fn, PortalOrange._backward_cls)

        visited.add(grad_fn)
        for next_grad_fn, _ in grad_fn.next_functions:
            assert_grad_fn_is_not_portal(next_grad_fn, visited)

    if model.group.rank() == 1:
        assert_grad_fn_is_not_portal(output.grad_fn)

        output.sum().backward()
    else:
        model.back_helper(output)
        assert input.grad.mean().item() == 1
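The stash/pop pattern above carries a None placeholder between partitions; its more typical use is carrying a real skip connection across stages. The sketch below shows that use with the same skippable API as the tests above; the class names are illustrative and the fairscale import path is an assumption, not taken from the original test file.

import torch.nn as nn

# Import path assumed; the tests above use the same torchgpipe-style skip API.
from fairscale.nn.pipe.skip import pop, skippable, stash


@skippable(stash=["residual"])
class StashResidual(nn.Module):
    def forward(self, x):
        yield stash("residual", x)  # save the activation for a later stage
        return x


@skippable(pop=["residual"])
class PopResidual(nn.Module):
    def forward(self, x):
        residual = yield pop("residual")  # retrieve the stashed activation
        return x + residual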
Example #2
def tuple_wait(cuda_sleep, pipeline_style):
    # In v0.0.3, Wait was applied only to the first tensor of a micro-batch.
    # Under that behavior, if checkpointing was disabled, gradient
    # accumulation on the other tensors could fail to synchronize properly
    # with the copy stream.
    class Sleep(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            return x.detach()

        @staticmethod
        def backward(ctx, grad):
            with torch.cuda.device(grad.device):
                cuda_sleep(0.05)
            return grad

    class Layer1(nn.Module):
        def forward(self, pair):
            a, b = pair
            return a * 1, b * 2, b * 3

    class Layer2(nn.Module):
        def forward(self, triple):
            a, b, c = triple
            b = Sleep.apply(b)
            return a + b + c

    model = nn.Sequential(Layer1(), Layer2())
    model = MultiProcessPipe(
        model,
        [1, 1],
        style=pipeline_style,
        worker_map=get_worker_map(),
        input_device=torch.cuda.current_device(),
        chunks=32,
        checkpoint="never",
    ).cuda()

    a = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)
    b = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)

    y = model((a, b))
    if model.group.rank() == 1:
        y.norm().backward()
    else:
        model.back_helper(y)

    if model.group.rank() == 0:
        assert torch.isclose(b.grad.norm().cpu(), torch.tensor(5.000))
Example #3
def partitions(pipe_class):
    a = nn.Linear(1, 1)
    b = nn.Linear(1, 1)

    model = nn.Sequential(a, b)
    model = pipe_class(model, [1, 1], worker_map=get_worker_map())

    assert isinstance(model.partitions, list)
    assert len(model) == 1
    assert isinstance(model.partitions[0].module, nn.Sequential)

    if model.group.rank() == 0:
        assert "0.0.weight" in model.state_dict()
    else:
        assert "0.1.weight" in model.state_dict()
Example #4
def async_event_loop():

    model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10),
                          nn.ReLU())
    pipe = MultiProcessPipe(model, [1, 1, 1, 1],
                            style=MultiProcessPipe.AsyncSchedule,
                            worker_map=get_worker_map(),
                            chunks=10)

    inputs = torch.rand(100, 10)

    output = pipe(inputs)
    if pipe.final_stage:
        loss = output.mean()
        loss.backward()
Example #5
def checkpoint_mode_invalid(pipeline_style):
    model = nn.Sequential(nn.Linear(1, 1))

    with pytest.raises(
            ValueError,
            match="checkpoint is not one of 'always', 'except_last', or 'never'"
    ):
        MultiProcessPipe(
            model,
            balance=[1],
            style=pipeline_style,
            worker_map=get_worker_map(),
            chunks=2,
            checkpoint="INVALID_CHECKPOINT",
        )
Example #6
def checkpoint_mode_when_chunks_1(pipeline_style):
    model = nn.Sequential(nn.Linear(1, 1))

    # All checkpoint modes are fine.
    MultiProcessPipe(
        model,
        balance=[1],
        style=pipeline_style,
        worker_map=get_worker_map(),
        chunks=1,
        checkpoint="except_last",
    )
    MultiProcessPipe(model,
                     balance=[1],
                     style=pipeline_style,
                     worker_map=get_worker_map(),
                     chunks=1,
                     checkpoint="always")
    MultiProcessPipe(model,
                     balance=[1],
                     style=pipeline_style,
                     worker_map=get_worker_map(),
                     chunks=1,
                     checkpoint="never")
Example #7
def async_event_loop_interleave_hard():
    model = nn.Sequential(nn.Linear(10, 10), nn.Linear(10, 10), nn.Linear(10, 10), nn.Linear(10, 10))
    pipe = AMPnetPipe(
        module=model,
        balance=[1, 1, 1, 1],
        worker_map=get_worker_map(),
        chunks=10,
        checkpoint="never",
    )
    fake_dataset = FakeDataset()
    fake_dataloader = DataLoader(fake_dataset, batch_size=4, shuffle=True, num_workers=0)
    loss = nn.MSELoss()
    opt = MySGD(model.parameters(), lr=0.01)
    transform_and_log = AMPnetDelegate()
    pipe.interleave(fake_dataloader, loss, opt, transform_and_log)
Example #8
def checkpoint_mode(pipe_class):
    def count_grad_fn(grad_fn, name, visited=set()):
        if grad_fn in visited:
            return 0
        visited.add(grad_fn)

        if grad_fn is None:
            return 0
        if grad_fn.__class__.__name__ == name:
            return 1

        counter = 0
        for next_grad_fn, _ in grad_fn.next_functions:
            counter += count_grad_fn(next_grad_fn, name, visited=visited)
        return counter

    model = nn.Sequential(nn.Linear(1, 1))
    input = torch.rand(2, 1)

    always = pipe_class(
        model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="always", pipelined_backward=False,
    )
    except_last = pipe_class(
        model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="except_last", pipelined_backward=False,
    )
    never = pipe_class(
        model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="never", pipelined_backward=False,
    )

    always_output = always(input)
    except_last_output = except_last(input)
    never_output = never(input)

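    # With chunks=2: "always" checkpoints both micro-batches, "except_last"
    # checkpoints all but the last one, and "never" checkpoints none.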
    assert count_grad_fn(always_output.grad_fn, "CheckpointBackward") == 2
    assert count_grad_fn(except_last_output.grad_fn, "CheckpointBackward") == 1
    assert count_grad_fn(never_output.grad_fn, "CheckpointBackward") == 0
Example #9
def simple_linears(pipe_class):
    def sum_grad(parameters):
        return sum([p.grad.sum() for p in parameters if p.grad is not None])

    def zero_grad(parameters):
        for p in parameters:
            p.grad = None

    set_random_seed(12345)
    inputs = torch.rand(8, 1)
    model = nn.Sequential(
        nn.Linear(1, 2),
        nn.Linear(2, 4),
        nn.Linear(4, 2),
        nn.Linear(2, 1),
    )

    # Without MultiProcessPipe
    outputs = model(inputs)
    loss = outputs.mean()
    loss.backward()

    grad_without_pipe = [
        sum_grad([*model[0].parameters(), *model[1].parameters()]),
        sum_grad([*model[2].parameters(), *model[3].parameters()]),
    ]

    ref_without_pipe = [p.grad for p in model.parameters()]

    zero_grad(model.parameters())

    model = pipe_class(model, [2, 2], worker_map=get_worker_map(), chunks=4)

    outputs = model(inputs)
    if model.group.rank() == 1:
        loss = outputs.mean()
        loss.backward()
        grad_with_pipe = sum_grad(model.partition.parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[1])
    else:
        model.back_helper(outputs)
        grad_with_pipe = sum_grad(model.partition.parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[0])
    torch.distributed.barrier()
Example #10
def verify_module_duplicate_parameters_on_distinct_partitions(pipe_class):
    class Surrogate(nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

    conv = nn.Conv2d(3, 3, 1)
    model = nn.Sequential(Surrogate(conv), Surrogate(conv))

    # FIXME(tom) can't have duplicate params with separate processes
    with pytest.raises(
            ValueError,
            match=
            "module with duplicate parameters on distinct devices is not supported"
    ):
        pipe_class(model, [1, 1], worker_map=get_worker_map())
Example #11
def exception(pipe_class):
    class ExpectedException(Exception):
        pass

    class Raise(nn.Module):
        def forward(self, *_):
            raise ExpectedException()

    model = nn.Sequential(Raise())
    model = pipe_class(model,
                       balance=[1],
                       worker_map=get_worker_map(),
                       chunks=1)

    with pytest.raises(ExpectedException):
        model(torch.rand(1))
Example #12
def non_tensor(pipe_class):
    class NonTensor(nn.Module):
        def forward(self, _):
            return "hello"

    model = nn.Sequential(NonTensor())
    model = pipe_class(model, balance=[1], worker_map=get_worker_map())
    x = torch.rand(1)

    # TypeError: expected Tensor as element 0 in argument 0, but got str
    with pytest.raises(TypeError):
        model(x)

    # TypeError: expected Tensor to scatter, but got str
    with pytest.raises(TypeError):
        model("hello")
Example #13
def public_attrs(pipe_class):
    model = nn.Sequential(nn.Linear(1, 1))

    pipe = pipe_class(
        model,
        balance=(1, ),
        worker_map=get_worker_map(),
        chunks=42,
        checkpoint="always",
    )

    assert pipe.balance == [1]
    assert pipe.chunks == 42
    assert isinstance(pipe.chunks, int)
    assert pipe.checkpoint == "always"
    assert isinstance(pipe.checkpoint, str)
Example #14
def non_tensor_tuple(pipe_class):
    class NonTensorTuple(nn.Module):
        def forward(self, x):
            return (x, "hello")

    model = nn.Sequential(NonTensorTuple())
    model = pipe_class(model, balance=[1], worker_map=get_worker_map())
    x = torch.rand(1)

    # TypeError: CheckpointBackward.forward: expected Variable (got str) for return value 1
    with pytest.raises(TypeError):
        model(x)

    # TypeError: expected Tensor to scatter, but got str
    with pytest.raises(TypeError):
        model((x, "hello"))
Example #15
def public_attrs(pipe_class):
    class MyString:
        def __init__(self, value):
            self.value = value

        def __str__(self):
            return self.value

    model = nn.Sequential(nn.Linear(1, 1))

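    # A float chunks value and a str-like checkpoint object should be
    # normalized to plain int / str (verified by the isinstance asserts below).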
    pipe = pipe_class(model, balance=(1,), worker_map=get_worker_map(), chunks=42.000, checkpoint=MyString("always"),)

    assert pipe.balance == [1]
    assert pipe.chunks == 42
    assert isinstance(pipe.chunks, int)
    assert pipe.checkpoint == "always"
    assert isinstance(pipe.checkpoint, str)
Example #16
def named_children(pipe_class):
    a = nn.Linear(1, 1)
    b = nn.Linear(1, 1)

    model = nn.Sequential(OrderedDict([("a", a), ("b", b)]))
    model = pipe_class(model, [1, 1], worker_map=get_worker_map())

    names = set(n for n, _ in model.named_modules())
    if model.group.rank() == 0:
        assert "0.a" in names
    else:
        assert "0.b" in names

    # MultiProcessPipe doesn't support __getattr__: unlike nn.Sequential, it
    # needs several methods of its own in that namespace, so child modules are
    # not exposed as attributes.
    with pytest.raises(AttributeError):
        model.a
Example #17
def inplace_on_not_requires_grad(pipe_class):
    # An in-place operation on a tensor that doesn't require grad doesn't
    # raise a RuntimeError. Currently, we cannot detect this case.
    model = nn.Sequential(nn.ReLU(inplace=True))
    model = pipe_class(model, [1],
                       worker_map=get_worker_map(),
                       checkpoint="always")

    x = torch.rand(1)
    y = model(x)
    del model

    message = r"a leaf Variable that requires grad .* used in an in-place operation."
    with pytest.raises(RuntimeError, match=message):
        y.backward()

    torch.distributed.barrier()
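The comment in the test above refers to plain PyTorch behaviour: an in-place operation on a leaf tensor that requires grad fails immediately, while the same operation on a tensor that does not require grad is silent, so the pipe cannot flag it up front. A minimal torch-only sketch (no pipe involved):

import torch

a = torch.rand(1, requires_grad=True)
# torch.relu_(a)  # would raise: a leaf Variable that requires grad is being
#                 # used in an in-place operation.

b = torch.rand(1)   # requires_grad=False
torch.relu_(b)      # silently succeeds; nothing flags the in-place op here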
Example #18
def check_pipe_against_reference(balance,
                                 model_constructor,
                                 checkpoint="except_last",
                                 custom_inputs=None):
    model = model_constructor()
    reference_model = model_constructor()
    for src, dst in zip(model, reference_model):
        dst.load_state_dict(copy.deepcopy(src.state_dict()))

    reference_model = nn.Sequential(*reference_model).cuda()

    pipe = PipeRPCWrapper(
        model,
        balance,
        input_device=torch.cuda.current_device(),
        worker_map=get_worker_map(),
        checkpoint=checkpoint,
    )

    pipe.foreach_worker(register_optimizer, include_self=True)
    register_optimizer(None, reference_model)

    inputs = torch.rand(10).cuda()
    target = torch.rand(10).cuda()
    cloned = inputs.clone()
    output = pipe(inputs)
    ref_out = reference_model(inputs)

    assert torch.equal(ref_out.cpu(), output.cpu())

    for out in output, ref_out:
        target = target.to(out.device)
        loss = nn.MSELoss()(out, target)
        loss.backward()

    pipe.foreach_worker(step_optimizer, include_self=True)
    step_optimizer(None, reference_model.cuda())

    pipe.eval()
    reference_model.eval()

    final_output = pipe(inputs)
    final_ref = reference_model(inputs.cuda())

    assert torch.equal(final_output.cpu(), final_ref.cpu())
Example #19
def async_event_loop_interleave_simple():
    model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(inplace=False),
                          nn.Linear(10, 10), nn.ReLU(inplace=False))
    pipe = AMPnetPipe(
        module=model,
        balance=[2, 2],
        worker_map=get_worker_map(),
        chunks=10,
        checkpoint="never",
    )
    fake_dataset = FakeDataset()
    fake_dataloader = DataLoader(fake_dataset,
                                 batch_size=4,
                                 shuffle=True,
                                 num_workers=0)
    loss = nn.MSELoss()
    opt = MySGD(model.parameters(), lr=0.01)
    pipe.interleave(fake_dataloader, loss, opt, 0)
Example #20
def deferred_batch_norm(checkpoint, lazy, pipe_class):
    bn = nn.BatchNorm2d(3)
    pipe_bn = deepcopy(bn)
    pipe_fn = lambda: pipe_bn  # noqa: E731
    if lazy:
        model = [LazyModule(pipe_fn)]
    else:
        model = nn.Sequential(pipe_bn)
    pipe = pipe_class(
        model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint=checkpoint, deferred_batch_norm=True,
    )

    x = torch.rand(4, 3, 10, 10)
    pipe(x).mean().backward()
    bn(x).mean().backward()

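    # With deferred_batch_norm=True, running statistics are accumulated across
    # micro-batches, so they should match a plain BatchNorm2d fed the full batch.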
    assert torch.allclose(pipe[0].running_mean, bn.running_mean, atol=1e-4)
    assert torch.allclose(pipe[0].running_var, bn.running_var, atol=1e-4)
Example #21
def rpc_reuse_in_final_stage():

    # 'reused' and 'reused2' are located on stage 2, so the backward pass for
    # the final stage will need to first send gradients to stage 2, then receive
    # gradients from stage 2. This tests custom logic to handle reuse of layers
    # in the final stage of the pipeline.

    reused = nn.Linear(10, 10)
    reused2 = nn.Linear(10, 10)
    model = [
        nn.Linear(10, 10),
        nn.ReLU(),
        nn.Linear(10, 10),
        reused2,
        nn.ReLU(),
        reused,
        nn.ReLU(),
        reused,
        reused2,
        nn.ReLU(),
        reused,
        nn.ReLU(),
    ]
    balance = [2, 3, 4]

    init_rpc()

    if torch.distributed.get_rank() != 0:
        rpc.shutdown()
        torch.distributed.barrier()
        return

    pipe = PipeRPCWrapper(model, balance, worker_map=get_worker_map())

    inputs = torch.rand(10).cuda()
    target = torch.rand(10).cuda()
    output = pipe(inputs)
    nn.MSELoss()(output, target).backward()
    output = pipe(inputs)
    nn.MSELoss()(output, target).backward()
    rpc.shutdown()
    torch.distributed.barrier()
Example #22
def deny_moving(pipe_class):
    a = nn.Linear(1, 1)
    b = nn.Linear(1, 1)

    model = nn.Sequential(a, b)
    model = pipe_class(model, [1, 1], worker_map=get_worker_map())

    model.cuda()
    model.cpu()
    model.to(torch.device("cuda"))
    model.to(0)
    model.to("cuda")
    model.to(device=0)
    model.to(torch.rand(1))
    model.to(tensor=torch.rand(1))

    # Casting is allowed.
    model.half()
    model.to(torch.double)
    model.to(dtype=torch.float)
Example #23
def inplace_on_requires_grad(pipe_class):
    model = nn.Sequential(nn.Linear(1, 1), nn.ReLU(inplace=True))
    model = pipe_class(model, [1, 1],
                       worker_map=get_worker_map(),
                       checkpoint="always")

    x = torch.rand(1)

    if pipe_class == AsyncPipe and model.group.rank() == 0:
        # With AsyncPipe, the model would wait forever for gradients if it
        # were not in eval mode.
        model.eval()

    y = model(x)

    message = r"a leaf Variable that requires grad .* used in an in-place operation."
    if model.group.rank() == 1:
        with pytest.raises(RuntimeError, match=message):
            y.backward()

    torch.distributed.barrier()
Example #24
def no_grad(pipe_class):
    model = nn.Sequential(nn.Linear(1, 1))
    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2)
    input = torch.rand(2, 1)

    latent = None

    def hook(module, input, output):
        _ = module
        _ = input

        nonlocal latent
        latent = output

    partition = model.partitions[0]
    partition.module.register_forward_hook(hook)

    with torch.no_grad():
        model(input)

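    # Under torch.no_grad() the partition output captured by the hook has no
    # grad_fn, i.e. no autograd graph was recorded.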
    assert latent.grad_fn is None
Example #25
def input_singleton(pipe_class):
    class One(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(1, 1)

        def forward(self, only_a):
            (a,) = only_a
            return (self.fc(a),)

    model = nn.Sequential(One())
    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2, pipelined_backward=False,)

    a = torch.rand(10, 1, requires_grad=True)

    (a_out,) = model((a,))
    loss = a_out.mean()
    loss.backward()

    assert all(p.grad is not None for p in model.parameters())
    assert a.grad is not None
Example #26
def deferred_batch_norm_params(checkpoint, lazy, pipe_class):
    bn = nn.BatchNorm2d(3)
    pipe_bn = deepcopy(bn)
    pipe_fn = lambda: pipe_bn  # noqa: E731
    if lazy:
        model = [LazyModule(pipe_fn)]
    else:
        model = nn.Sequential(pipe_bn)
    pipe = pipe_class(
        model, balance=[1], worker_map=get_worker_map(), chunks=1, checkpoint=checkpoint, deferred_batch_norm=True,
    )

    x = torch.rand(4, 3, 10, 10)
    pipe(x).mean().backward()
    bn(x).mean().backward()

    assert pipe[0].weight.grad is not None
    assert pipe[0].bias.grad is not None

    assert torch.allclose(pipe[0].weight.grad, bn.weight.grad, atol=1e-4)
    assert torch.allclose(pipe[0].bias.grad, bn.bias.grad, atol=1e-4)
Example #27
def python_autograd_function(pipe_class):
    # FIXME deadlock with AsyncPipe?
    # A Python autograd function might fail with this error:
    #
    #   RuntimeError: Returning Variables sharing storage with other Variables
    #   that require grad is not supported in Python functions. Please submit a
    #   feature request if you hit this error.
    #
    # It doesn't look like an essential restriction, but it happens on the
    # current PyTorch version. To avoid it, identity autograd functions such
    # as Wait, Fork, and Join should detach the tensor before returning it.

    torch.manual_seed(0)

    class Identity(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input):
            return input

        @staticmethod
        def backward(ctx, grad):
            return grad

    class M(nn.Module):
        def forward(self, input):
            return Identity.apply(input)

    model = nn.Sequential(M(), M())
    model = pipe_class(model, [1, 1],
                       worker_map=get_worker_map(),
                       checkpoint="always").cuda()
    model.eval()

    x = torch.rand(42)
    y = model(x)
    if model.group.rank() == 1:
        assert torch.allclose(x, y)

    torch.distributed.rpc.shutdown()
    torch.distributed.barrier()
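A minimal sketch of the workaround mentioned in the comment above: an identity-style autograd function that detaches its input before returning, so the output no longer shares storage with a Variable that requires grad. The class name is illustrative; Wait, Fork, and Join are the functions the comment refers to.

import torch


class DetachingIdentity(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        # Detach so the returned tensor doesn't share storage with a Variable
        # that requires grad.
        return input.detach()

    @staticmethod
    def backward(ctx, grad):
        return grad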
Example #28
def sequential_like(balance, pipeline_style):
    a = nn.Linear(1, 1)
    b = nn.Linear(1, 1)

    model = nn.Sequential(a, b)
    model = MultiProcessPipe(model,
                             balance,
                             style=pipeline_style,
                             worker_map=get_worker_map())

    if balance == [2]:
        if torch.distributed.get_rank() == 0:
            assert len(model) == 2
            assert list(model) == [a, b]

            assert model[0] is a
            assert model[1] is b
            with pytest.raises(IndexError):
                _ = model[2]

            assert model[-1] is b
            assert model[-2] is a
        else:
            assert len(model) == 0
            assert list(model) == []
    else:
        assert len(model) == 1
        if torch.distributed.get_rank() == 0:
            assert list(model) == [a]
            assert model[0] is a
            assert model[-1] is a
        else:
            assert list(model) == [b]
            assert model[0] is b
            assert model[-1] is b

        with pytest.raises(IndexError):
            _ = model[1]
Example #29
def input_pair(pipe_class):
    class Two(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc_a = nn.Linear(1, 1)
            self.fc_b = nn.Linear(1, 1)

        def forward(self, a_and_b):
            a, b = a_and_b
            return (self.fc_a(a), self.fc_b(b))

    model = nn.Sequential(Two())
    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2, pipelined_backward=False,)

    a = torch.rand(10, 1, requires_grad=True)
    b = torch.rand(10, 1, requires_grad=True)

    a_out, b_out = model((a, b))
    loss = (a_out + b_out).mean()
    loss.backward()

    assert a.grad is not None
    assert b.grad is not None
Example #30
def checkpoint_non_float_input(pipe_class):
    class ForkNonFloat(nn.Module):
        def forward(self, input):
            return (input * 2, torch.tensor([False]))

    class JoinNonFloat(nn.Module):
        def forward(self, input):
            return input[0] * 2

    model = nn.Sequential(ForkNonFloat(), JoinNonFloat())
    model = pipe_class(
        model, balance=[1, 1], worker_map=get_worker_map(), chunks=1, checkpoint="always", pipelined_backward=False,
    )

    input = torch.rand(1, requires_grad=True)
    output = model(input)
    if model.group.rank() == 1:
        # with torch.autograd.detect_anomaly():
        output.backward()
    elif pipe_class == MultiProcessPipe:
        model.back_helper(output)

    torch.distributed.barrier()