Code Example #1
def training_step(dace_model,
                  pt_model,
                  train_batch,
                  sdfg_name,
                  gpu,
                  train_criterion=None):

    # copy over the weights
    dace_model.load_state_dict(pt_model.state_dict())
    for pt_value, dace_value in zip(pt_model.state_dict().values(),
                                    dace_model.state_dict().values()):
        assert np.allclose(pt_value, dace_value)

    dace_model = DaceModule(dace_model,
                            backward=True,
                            sdfg_name=sdfg_name,
                            cuda=gpu)

    x, y = train_batch
    train_criterion = train_criterion or nn.NLLLoss()

    pt_loss = train_criterion(pt_model(x), y)

    dace_output = dace_model(x)
    dace_loss = train_criterion(dace_output, y)

    diff = abs(pt_loss.item() - dace_loss.item()) / pt_loss.item()
    assert diff < 1e-5

    pt_loss.backward()
    dace_loss.backward()

    # gradients after backward() should match
    for (pt_name, pt_param), (dace_name,
                              dace_param) in zip(pt_model.named_parameters(),
                                                 dace_model.named_parameters()):
        assert 'model.' + pt_name == dace_name
        torch_tensors_close(pt_name, pt_param.grad, dace_param.grad)

    optimizer = optim.SGD(pt_model.parameters(), lr=0.001)
    dace_optimizer = optim.SGD(dace_model.parameters(), lr=0.001)
    optimizer.step()
    dace_optimizer.step()

    # parameters after one optimizer step should still match
    for (pt_name, pt_param), (dace_name,
                              dace_param) in zip(pt_model.named_parameters(),
                                                 dace_model.named_parameters()):
        assert 'model.' + pt_name == dace_name
        torch_tensors_close(pt_name, pt_param.detach(), dace_param.detach())
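
These snippets omit their imports; they assume the usual daceml test preamble (numpy as np, torch, torch.nn as nn, torch.optim as optim, and DaceModule / dace_module from daceml.pytorch). The torch_tensors_close helper used above is likewise not shown; a minimal sketch of what it plausibly does, where the tolerances and the use of name are assumptions rather than the real daceml utility:

def torch_tensors_close(name, torch_value, dace_value, rtol=1e-6, atol=1e-4):
    # Hypothetical helper: compare two tensors, naming the offender on mismatch.
    if not torch.allclose(torch_value, dace_value, rtol=rtol, atol=atol):
        raise AssertionError(f"tensor '{name}' does not match the reference")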
Code Example #2
File: test_conv2d.py Project: manuelburger/daceml
def test_conv2d(default_implementation, sdfg_name):
    class Model(nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            self.conv1 = nn.Conv2d(1, 4, 3)
            self.conv2 = nn.Conv2d(4, 4, 3)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            return F.relu(self.conv2(x))

    ptmodel = Model()
    x = torch.rand(1, 1, 8, 8)

    @dace_module
    class TestDecorator(Model):
        pass

    dace_model = DaceModule(ptmodel, sdfg_name=sdfg_name)
    dace_output = dace_model(x)

    dace_model_decorated = TestDecorator()
    dace_model_decorated(x)

    torch_output = ptmodel(x)

    assert np.allclose(torch_output.detach().numpy(), dace_output, atol=1e-06)
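
This example exercises both conversion entry points: wrapping an existing instance with DaceModule(ptmodel, ...) and decorating a subclass with @dace_module. Note that only the wrapped model's output is compared against the PyTorch reference; the decorated model is merely run to check that it compiles and executes.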
Code Example #3
def test_bert_encoder_backward(gpu, default_implementation, sdfg_name):
    batch_size = 2
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])
    ptmodel = BertLayer(BertConfig(hidden_act="relu")).eval()

    dace_model = DaceModule(ptmodel,
                            cuda=gpu,
                            train=False,
                            backward=True,
                            sdfg_name=sdfg_name,
                            apply_strict=True)

    ptinput = torch.clone(input)
    ptinput.requires_grad = True
    ptmodel(ptinput)[0].sum().backward()

    dace_input = torch.clone(input)
    dace_input.requires_grad = True
    dace_model(dace_input).sum().backward()

    diff = np.abs(dace_input.grad.detach().numpy() -
                  ptinput.grad.detach().numpy())

    assert np.max(diff) < 1e-4
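
Here correctness is checked on the backward pass rather than on the forward outputs: both models reduce their output with sum() and call backward(), and the resulting input gradients are compared elementwise.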
Code Example #4
def test_bert_cf(sdfg_name):
    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])

    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel,
                            train=False,
                            sdfg_name=sdfg_name,
                            dummy_inputs=(input.clone(), ),
                            auto_optimize=False)

    # run again with constant folding
    dace_model.reset_sdfg()
    dace_model.prepend_post_onnx_hook(
        "cf", lambda onnx_model: onnx_model.sdfg.apply_transformations_repeated(
            [ConstantFolding, RedundantSecondArray],
            validate_all=True,
            strict=True))
    dace_outputs1 = dace_model(input.clone())

    diff = np.abs(dace_outputs1.detach().numpy() -
                  pt_outputs[0].detach().numpy())

    assert np.max(diff) < 1e-5
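
The post-ONNX hook above is registered as a lambda; it can equivalently be written as a named function, which reads more clearly (a sketch using the same API as the snippet):

def constant_fold(onnx_model):
    # Fold constants and remove redundant arrays before the SDFG is compiled.
    onnx_model.sdfg.apply_transformations_repeated(
        [ConstantFolding, RedundantSecondArray],
        validate_all=True,
        strict=True)

dace_model.prepend_post_onnx_hook("cf", constant_fold)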
Code Example #5
def test_bert_encoder(gpu, default_implementation, sdfg_name):
    if not gpu and default_implementation == 'onnxruntime':
        pytest.skip("combination is tested below")

    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])

    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel,
                            cuda=gpu,
                            train=False,
                            sdfg_name=sdfg_name,
                            apply_strict=True,
                            dummy_inputs=(input.clone(), ))

    if gpu:
        for name, _ in dace_model.model.named_parameters():
            parameter_to_transient(dace_model, name)

    dace_outputs0 = dace_model(input.clone())

    diff = np.abs(dace_outputs0.detach().numpy() -
                  pt_outputs[0].detach().numpy())

    assert np.max(diff) < 1e-5

    if default_implementation == "pure":
        ort_nodes = [
            n for n, _ in dace_model.sdfg.all_nodes_recursive()
            if hasattr(n, "environments") and any("onnx" in e.lower()
                                                  for e in n.environments)
        ]
        assert len(ort_nodes) == 0, \
            f"expected pure graph, found ORT nodes: {ort_nodes}"

        # check that cuBLAS is being used
        if gpu:
            assert any(
                (hasattr(n, "environments") and "cuBLAS" in n.environments)
                or (hasattr(n, "implementation") and n.implementation == "cuBLAS")
                for n, _ in dace_model.sdfg.all_nodes_recursive())
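
Beyond numerical agreement, this test inspects the compiled SDFG itself: with the "pure" implementation it walks all nodes to confirm that no ONNX Runtime (ORT) operators remain, and on GPU it additionally checks that at least one node is backed by cuBLAS.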
Code Example #6
File: test_pytorch.py Project: manuelburger/daceml
def run_pytorch_module(module,
                       sdfg_name,
                       gpu,
                       shape=None,
                       use_max=False,
                       auto_optimize=True):
    shape = shape or (3, 5)

    input_value = torch.rand(*shape, dtype=torch.float32)

    pytorch_input = torch.empty(*shape,
                                dtype=torch.float32,
                                requires_grad=False)
    pytorch_input.copy_(input_value)
    pytorch_input.requires_grad = True

    dace_input = torch.empty(*shape, dtype=torch.float32, requires_grad=False)
    dace_input.copy_(input_value)
    dace_input.requires_grad = True

    if use_max:
        pytorch_s = module(pytorch_input).max()
    else:
        pytorch_s = module(pytorch_input).sum()
    pytorch_s.backward()

    print("Pytorch output:")
    print(pytorch_input.grad)

    dace_module = DaceModule(module,
                             backward=True,
                             cuda=gpu,
                             sdfg_name=sdfg_name,
                             auto_optimize=auto_optimize)

    if use_max:
        dace_s = dace_module(dace_input).max()
    else:
        dace_s = dace_module(dace_input).sum()
    dace_s.backward()
    print("Dace output:")
    print(dace_input.grad)
    assert torch.allclose(pytorch_input.grad,
                          dace_input.grad,
                          rtol=1e-6,
                          atol=1e-4)
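
A hypothetical call site for this helper (the module and test name are illustrative, not taken from the source):

def test_relu(sdfg_name, gpu):
    run_pytorch_module(nn.ReLU(), sdfg_name, gpu)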
Code Example #7
def test_pytorch_from_dlpack():
    class Module(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc1 = nn.Linear(10, 3)

        def forward(self, x):
            return self.fc1(x)

    pt_module = Module()
    dace_module = Module()
    dace_module.load_state_dict(pt_module.state_dict())

    input = torch.rand(2, 10)
    assert torch.allclose(pt_module(input), dace_module(input))

    dace_module = DaceModule(dace_module, cuda=True)

    assert torch.allclose(dace_module(input), pt_module(input))
    parameter_to_transient(dace_module, "fc1.weight")
    assert torch.allclose(dace_module(input), pt_module(input))
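
parameter_to_transient appears to move the named parameter into the SDFG as a persistent device-side array, so it no longer has to be passed in on every call; the test asserts that the outputs are unchanged before and after the conversion.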
Code Example #8
File: test_bert_encoder.py Project: am-ivanov/daceml
def test_bert_encoder(gpu, apply_strict):
    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])

    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel, cuda=gpu, train=False)
    dace_outputs0 = dace_model(input.clone())

    dace_model.dace_model.sdfg.apply_transformations_repeated(
        [ConstantFolding, RedundantSecondArray], validate_all=True)

    dace_outputs1 = dace_model(input.clone())

    diff = np.abs(dace_outputs0 - pt_outputs[0].detach().numpy())

    assert np.max(diff) < 1e-5
    assert np.allclose(dace_outputs1, dace_outputs0)
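
Unlike Code Example #4, this (older) variant applies ConstantFolding and RedundantSecondArray directly to the already-compiled SDFG via dace_model.dace_model.sdfg, then reruns the model and checks that the transformed outputs match both PyTorch and the pre-transformation outputs.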
Code Example #9
File: test_attn.py Project: am-ivanov/daceml
def test_attn(gpu):
    B = 2
    H = 16
    P = 64
    N = P * H
    SM, SN = 512, 512
    K, Q, V = [
        torch.randn([SM, B, N]),
        torch.randn([SN, B, N]),
        torch.randn([SM, B, N])
    ]
    ptmodel = torch.nn.MultiheadAttention(N, H, bias=False)

    pt_outputs = ptmodel(Q, K, V)

    dace_model = DaceModule(ptmodel, cuda=gpu)
    dace_outputs = dace_model(Q, K, V)

    assert np.allclose(pt_outputs[0].detach().numpy(),
                       dace_outputs[0],
                       atol=1e-06)
    assert np.allclose(pt_outputs[1].detach().numpy(),
                       dace_outputs[1],
                       atol=1e-06)
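
The inputs follow torch.nn.MultiheadAttention's default (sequence, batch, embedding) layout, with the embedding width N = P * H split across H heads of size P; both returned values, the attention output and the head-averaged attention weights, are compared against PyTorch.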