import numpy as np
import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# NOTE: the import paths for the daceml helpers below are assumptions based on
# the daceml package layout; adjust them to match the surrounding test package.
from daceml.pytorch import DaceModule, dace_module
from daceml.transformation import ConstantFolding
from daceml.util import parameter_to_transient
from daceml.testing import torch_tensors_close
from dace.transformation.dataflow import RedundantSecondArray

# BertLayer is not re-exported at the top level in recent transformers
# releases; the path below may need adjusting for older versions.
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertLayer


def training_step(dace_model,
                  pt_model,
                  train_batch,
                  sdfg_name,
                  gpu,
                  train_criterion=None):
    # copy over the weights so both models start from identical parameters
    dace_model.load_state_dict(pt_model.state_dict())
    for pt_value, dace_value in zip(pt_model.state_dict().values(),
                                    dace_model.state_dict().values()):
        assert np.allclose(pt_value, dace_value)

    dace_model = DaceModule(dace_model,
                            backward=True,
                            sdfg_name=sdfg_name,
                            cuda=gpu)

    x, y = train_batch
    train_criterion = train_criterion or nn.NLLLoss()

    # forward pass: the losses must match up to a small relative error
    pt_loss = train_criterion(pt_model(x), y)
    dace_output = dace_model(x)
    dace_loss = train_criterion(dace_output, y)
    diff = abs(pt_loss.item() - dace_loss.item()) / pt_loss.item()
    assert diff < 1e-5

    # backward pass: compare gradients parameter by parameter.
    # DaceModule prefixes parameter names with 'model.'.
    pt_loss.backward()
    dace_loss.backward()
    for (pt_name, pt_param), (dace_name, dace_param) in zip(
            pt_model.named_parameters(), dace_model.named_parameters()):
        assert 'model.' + pt_name == dace_name
        torch_tensors_close(pt_name, pt_param.grad, dace_param.grad)

    # one optimizer step on each model must leave the parameters in sync
    optimizer = optim.SGD(pt_model.parameters(), lr=0.001)
    dace_optimizer = optim.SGD(dace_model.parameters(), lr=0.001)
    optimizer.step()
    dace_optimizer.step()
    for (pt_name, pt_param), (dace_name, dace_param) in zip(
            pt_model.named_parameters(), dace_model.named_parameters()):
        assert 'model.' + pt_name == dace_name
        torch_tensors_close(pt_name, pt_param.detach(), dace_param.detach())
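# Hedged usage sketch (not part of the original suite): how `training_step`
# might be driven for a toy classifier. The model, batch shape, and argument
# values below are illustrative assumptions.
def example_training_step_usage():
    # NLLLoss (the helper's default criterion) expects log-probabilities,
    # hence the LogSoftmax output layer.
    pt_model = nn.Sequential(nn.Linear(8, 4), nn.LogSoftmax(dim=1))
    dace_model = nn.Sequential(nn.Linear(8, 4), nn.LogSoftmax(dim=1))

    x = torch.rand(16, 8)
    y = torch.randint(0, 4, (16, ))

    training_step(dace_model,
                  pt_model, (x, y),
                  sdfg_name="toy_training_step",
                  gpu=False)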
def test_conv2d(default_implementation, sdfg_name):
    class Model(nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            self.conv1 = nn.Conv2d(1, 4, 3)
            self.conv2 = nn.Conv2d(4, 4, 3)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            return F.relu(self.conv2(x))

    ptmodel = Model()
    x = torch.rand(1, 1, 8, 8)

    @dace_module
    class TestDecorator(Model):
        pass

    dace_model = DaceModule(ptmodel, sdfg_name=sdfg_name)
    dace_output = dace_model(x)

    dace_model_decorated = TestDecorator()
    dace_model_decorated(x)

    torch_output = ptmodel(x)

    assert np.allclose(torch_output.detach().numpy(), dace_output, atol=1e-06)
def test_bert_encoder_backward(gpu, default_implementation, sdfg_name):
    batch_size = 2
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])

    ptmodel = BertLayer(BertConfig(hidden_act="relu")).eval()
    dace_model = DaceModule(ptmodel,
                            cuda=gpu,
                            train=False,
                            backward=True,
                            sdfg_name=sdfg_name,
                            apply_strict=True)

    ptinput = torch.clone(input)
    ptinput.requires_grad = True
    ptmodel(ptinput)[0].sum().backward()

    dace_input = torch.clone(input)
    dace_input.requires_grad = True
    dace_model(dace_input).sum().backward()

    diff = np.abs(dace_input.grad.detach().numpy() -
                  ptinput.grad.detach().numpy())
    assert np.max(diff) < 1e-4
def test_bert_cf(sdfg_name):
    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])

    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel,
                            train=False,
                            sdfg_name=sdfg_name,
                            dummy_inputs=(input.clone(), ),
                            auto_optimize=False)

    # run again with constant folding
    dace_model.reset_sdfg()
    dace_model.prepend_post_onnx_hook(
        "cf", lambda onnx_model: onnx_model.sdfg.
        apply_transformations_repeated(
            [ConstantFolding, RedundantSecondArray],
            validate_all=True,
            strict=True))

    dace_outputs1 = dace_model(input.clone())

    diff = np.abs(dace_outputs1.detach().numpy() -
                  pt_outputs[0].detach().numpy())
    assert np.max(diff) < 1e-5
def test_bert_encoder(gpu, default_implementation, sdfg_name):
    if not gpu and default_implementation == 'onnxruntime':
        pytest.skip("combination is tested below")

    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])

    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel,
                            cuda=gpu,
                            train=False,
                            sdfg_name=sdfg_name,
                            apply_strict=True,
                            dummy_inputs=(input.clone(), ))

    if gpu:
        for name, _ in dace_model.model.named_parameters():
            parameter_to_transient(dace_model, name)

    dace_outputs0 = dace_model(input.clone())

    diff = np.abs(dace_outputs0.detach().numpy() -
                  pt_outputs[0].detach().numpy())
    assert np.max(diff) < 1e-5

    if default_implementation == "pure":
        ort_nodes = [
            n for n, _ in dace_model.sdfg.all_nodes_recursive()
            if hasattr(n, "environments") and any("onnx" in e.lower()
                                                  for e in n.environments)
        ]
        assert not ort_nodes, f"expected pure graph, found ORT nodes: {ort_nodes}"

    # check that cuBLAS is being used
    if gpu:
        assert any((hasattr(n, "environments") and "cuBLAS" in n.environments
                    or hasattr(n, "implementation")
                    and n.implementation == "cuBLAS")
                   for n, _ in dace_model.sdfg.all_nodes_recursive())
def run_pytorch_module(module,
                       sdfg_name,
                       gpu,
                       shape=None,
                       use_max=False,
                       auto_optimize=True):
    shape = shape or (3, 5)

    input_value = torch.rand(*shape, dtype=torch.float32)

    # two leaf tensors with identical values, so the gradients are comparable
    pytorch_input = torch.empty(*shape,
                                dtype=torch.float32,
                                requires_grad=False)
    pytorch_input.copy_(input_value)
    pytorch_input.requires_grad = True

    dace_input = torch.empty(*shape, dtype=torch.float32, requires_grad=False)
    dace_input.copy_(input_value)
    dace_input.requires_grad = True

    if use_max:
        pytorch_s = module(pytorch_input).max()
    else:
        pytorch_s = module(pytorch_input).sum()
    pytorch_s.backward()
    print("PyTorch input gradient:")
    print(pytorch_input.grad)

    dace_model = DaceModule(module,
                            backward=True,
                            cuda=gpu,
                            sdfg_name=sdfg_name,
                            auto_optimize=auto_optimize)
    if use_max:
        dace_s = dace_model(dace_input).max()
    else:
        dace_s = dace_model(dace_input).sum()
    dace_s.backward()
    print("DaCe input gradient:")
    print(dace_input.grad)

    assert torch.allclose(pytorch_input.grad,
                          dace_input.grad,
                          rtol=1e-6,
                          atol=1e-4)
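# Hedged usage sketch (not part of the original suite): `run_pytorch_module`
# drives the forward/backward comparison for any single-input module. The
# softmax module, shape, and sdfg_name below are illustrative assumptions.
def example_run_pytorch_module_usage():
    class SoftmaxModule(nn.Module):
        def forward(self, x):
            return F.softmax(x, dim=-1)

    run_pytorch_module(SoftmaxModule(),
                       sdfg_name="toy_softmax",
                       gpu=False,
                       shape=(4, 7),
                       use_max=True)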
def test_pytorch_from_dlpack():
    class Module(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc1 = nn.Linear(10, 3)

        def forward(self, x):
            return self.fc1(x)

    pt_module = Module()
    dace_model = Module()
    dace_model.load_state_dict(pt_module.state_dict())

    input = torch.rand(2, 10)
    assert torch.allclose(pt_module(input), dace_model(input))

    dace_model = DaceModule(dace_model, cuda=True)
    assert torch.allclose(dace_model(input), pt_module(input))

    # moving the parameter into the SDFG must not change the result
    parameter_to_transient(dace_model, "fc1.weight")
    assert torch.allclose(dace_model(input), pt_module(input))
def test_bert_encoder_transformations(gpu, apply_strict):
    batch_size = 8
    seq_len = 512
    hidden_size = 768

    input = torch.randn([batch_size, seq_len, hidden_size])

    ptmodel = BertLayer(BertConfig()).eval()
    pt_outputs = ptmodel(input.clone())

    dace_model = DaceModule(ptmodel, cuda=gpu, train=False)
    dace_outputs0 = dace_model(input.clone())

    dace_model.dace_model.sdfg.apply_transformations_repeated(
        [ConstantFolding, RedundantSecondArray], validate_all=True)

    dace_outputs1 = dace_model(input.clone())

    diff = np.abs(dace_outputs0 - pt_outputs[0].detach().numpy())

    assert np.max(diff) < 1e-5
    assert np.allclose(dace_outputs1, dace_outputs0)
def test_attn(gpu):
    B = 2
    H = 16
    P = 64
    N = P * H
    SM, SN = 512, 512

    K, Q, V = [
        torch.randn([SM, B, N]),
        torch.randn([SN, B, N]),
        torch.randn([SM, B, N])
    ]

    ptmodel = torch.nn.MultiheadAttention(N, H, bias=False)
    pt_outputs = ptmodel(Q, K, V)

    dace_model = DaceModule(ptmodel, cuda=gpu)
    dace_outputs = dace_model(Q, K, V)

    assert np.allclose(pt_outputs[0].detach().numpy(),
                       dace_outputs[0],
                       atol=1e-06)
    assert np.allclose(pt_outputs[1].detach().numpy(),
                       dace_outputs[1],
                       atol=1e-06)