        b = torch.empty(5, 5, device='msnpu')
        self.assertEqual(msnpu_extension.get_test_int(), 0)

        c = a + b
        self.assertEqual(msnpu_extension.get_test_int(), 1)

    def test_conv_backend_override(self):
        # To simplify tests, we use 4d input here to avoid doing view4d
        # (which needs more overrides) in _convolution.
        input = torch.empty(2, 4, 10, 2, device='msnpu', requires_grad=True)
        weight = torch.empty(6, 4, 2, 2, device='msnpu', requires_grad=True)
        bias = torch.empty(6, device='msnpu')

        # Make sure forward is overridden.
        out = torch.nn.functional.conv1d(input, weight, bias, 2, 0, 1, 1)
        self.assertEqual(msnpu_extension.get_test_int(), 2)
        self.assertEqual(out.shape[0], input.shape[0])
        self.assertEqual(out.shape[1], weight.shape[0])

        # Make sure backward is overridden.
        # Double backward is dispatched to _convolution_double_backward.
        # It is not tested here as it involves more computation/overrides.
        grad = torch.autograd.grad(out, input, out, create_graph=True)
        self.assertEqual(msnpu_extension.get_test_int(), 3)
        self.assertEqual(grad[0].shape, input.shape)


if __name__ == "__main__":
    common.run_tests()
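# --- Illustration (not part of the test above) ---
# A loosely analogous, pure-Python sketch of the "count how many calls were
# intercepted" idea that get_test_int() exposes from the C++ msnpu extension.
# It uses a __torch_function__ Tensor subclass instead of a real out-of-tree
# device backend, so it is an assumed analogy, not the extension's code.
import torch

class CountingTensor(torch.Tensor):
    calls = 0  # incremented every time a torch function is intercepted

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}
        cls.calls += 1
        # Fall back to the default implementation to run the real op.
        return super().__torch_function__(func, types, args, kwargs)

a = torch.ones(5, 5).as_subclass(CountingTensor)
b = torch.ones(5, 5).as_subclass(CountingTensor)
c = a + b  # the addition is intercepted, so CountingTensor.calls increases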
        return ref_forward_output_my_rank, ref_weight_out

    @skip_if_lt_x_gpu(2)
    def test_one_iteration(self):
        """Test FSDP with an uneven division of parameter shards."""
        model = Linear(3, 3, bias=False)
        input = torch.rand(8, 3)
        my_lr = 0.1

        ref_forward_output_my_rank, ref_weight_out = self._get_ref_results(
            model, input, my_lr
        )

        model.to(self.rank)
        model = FSDP(model)
        optim = SGD(model.parameters(), lr=my_lr)
        self.assertTrue(len(input) >= self.world_size)
        in_data = torch.Tensor(input[self.rank]).to(self.rank)
        out = model(in_data)
        out.float().sum().backward()
        optim.step()
        optim.zero_grad()

        with model.summon_full_params(model):
            weight_out = model.module.weight.T.clone()
            self.assertEqual(ref_forward_output_my_rank, out)
            self.assertEqual(ref_weight_out, weight_out)


if __name__ == "__main__":
    run_tests()
class TransformerUtilsTest(NcclDistributedTestBase):
    def test_split_tensor_along_last_dim(self):
        for tensor_model_parallel_world_size in range(1, self.world_size + 1):
            if self.world_size % tensor_model_parallel_world_size > 0:
                continue
            with self.subTest(
                tensor_model_parallel_world_size=tensor_model_parallel_world_size
            ):
                parallel_state.initialize_model_parallel(
                    tensor_model_parallel_size_=tensor_model_parallel_world_size
                )
                device = "cpu"
                input_tensor = torch.randn((100, 100, 100), device=device)
                splits = utils.split_tensor_along_last_dim(input_tensor, 10)
                last_dim_shapes = torch.tensor(
                    [int(split.size()[-1]) for split in splits]
                )
                self.assertTrue(
                    torch.equal(last_dim_shapes, torch.full((10,), 10))
                )
                parallel_state.destroy_model_parallel()


if __name__ == "__main__":
    common_utils.run_tests()
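# --- Illustration (not part of the test above) ---
# A minimal sketch of the behavior split_tensor_along_last_dim is checked for:
# splitting the last dimension into equal contiguous chunks. This is an assumed
# reimplementation using torch.split, not the actual apex source.
import torch

def split_tensor_along_last_dim_sketch(tensor, num_partitions):
    last_dim = tensor.dim() - 1
    last_dim_size = tensor.size()[last_dim] // num_partitions
    # torch.split returns views over the original tensor.
    return torch.split(tensor, last_dim_size, dim=last_dim)

# A (100, 100, 100) tensor split into 10 partitions gives 10 chunks of shape
# (100, 100, 10), which is what the assertion on last_dim_shapes verifies.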