def test_float_vector(self):
    # test FloatVector
    kp_vector = kaldi.FloatVector(5)
    np_array = kp_vector.numpy()
    self.assertIsInstance(np_array, np.ndarray)

    np_array[2:] = 2.0
    gold = np.array([0, 0, 2, 2, 2])
    # `np_array` shares memory with `kp_vector`, so comparing it against
    # `gold` checks the underlying kaldi vector as well
    self.assertTrue((np_array == gold).all())

def test_vector_to_pytorch_cpu_tensor(self):
    dim = 2
    v = kaldi.FloatVector(size=dim)
    v[0] = 10
    v[1] = 20

    v_reference_count = sys.getrefcount(v)

    # memory is shared between kaldi::Vector and the PyTorch tensor
    tensor = from_dlpack(v.to_dlpack())
    self.assertEqual(v_reference_count + 1, sys.getrefcount(v))

    self.assertFalse(tensor.is_cuda)
    self.assertEqual(tensor[0], 10)
    self.assertEqual(tensor[1], 20)

    tensor[0] = 100  # also changes `v`
    tensor[1] = 200
    self.assertEqual(v[0], 100)
    self.assertEqual(v[1], 200)

    v[0] = 9  # also changes `tensor`
    self.assertEqual(tensor[0], 9)

    del tensor
    gc.collect()
    self.assertEqual(v_reference_count, sys.getrefcount(v))

    self.assertEqual(v[0], 9)  # v is still alive
    self.assertEqual(v[1], 200)

    # one more time
    tensor = from_dlpack(v.to_dlpack())
    self.assertEqual(v_reference_count + 1, sys.getrefcount(v))
    self.assertFalse(tensor.is_cuda)

    tensor[0] = 8
    tensor[1] = 10
    self.assertEqual(v[0], 8)
    self.assertEqual(v[1], 10)

    del tensor
    self.assertEqual(v_reference_count, sys.getrefcount(v))
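
# A minimal sketch of the pattern exercised above, assuming only the
# `kaldi.FloatVector.to_dlpack` and `torch.utils.dlpack.from_dlpack`
# calls already used in this file: the tensor is a view, not a copy.
# It keeps `v` alive through the DLPack capsule (hence the +1 in the
# reference count) and writes through to the underlying kaldi::Vector.
#
#   v = kaldi.FloatVector(size=4)
#   t = from_dlpack(v.to_dlpack())  # zero copy; refcount of v goes up
#   t.fill_(3.0)                    # v[0]..v[3] are now 3 as well
#   del t                           # refcount of v drops back down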

def test_to_numpy(self):
    # first, build a kaldi vector
    dim = 8
    v = kaldi.FloatVector(size=dim)
    self.assertEqual(v.Dim(), dim)
    for i in range(dim):
        self.assertEqual(v[i], 0)

    # now to numpy; memory is shared
    d = v.numpy()
    d += 10
    for i in range(dim):
        self.assertEqual(d[i], v[i])
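
# The sharing in test_to_numpy works in both directions: since
# `v.numpy()` wraps the kaldi buffer without copying, a write on either
# side is visible on the other, e.g.
#
#   d = v.numpy()
#   v[0] = 42  # d[0] is now 42.0 as well; no copy is made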

def test_cu_vector_to_pytorch_cpu_tensor(self):
    if kaldi.CudaCompiled():
        print('This test is for constructing a CPU tensor from a CuVector')
        print('Kaldi is compiled with GPU, skip it')
        return

    dim = 2
    cpu_v = kaldi.FloatVector(size=dim)
    cpu_v[0] = 10
    cpu_v[1] = 20

    v = kaldi.FloatCuVector(cpu_v)
    self.assertEqual(v[0], 10)
    self.assertEqual(v[1], 20)

    v_reference_count = sys.getrefcount(v)

    # memory is shared between `v` and `tensor`
    tensor = from_dlpack(v.to_dlpack())
    self.assertEqual(v_reference_count + 1, sys.getrefcount(v))

    self.assertFalse(tensor.is_cuda)
    self.assertEqual(tensor[0], 10)
    self.assertEqual(tensor[1], 20)

    tensor[0] = 6  # also changes `v`
    tensor[1] = 8
    self.assertEqual(v[0], 6)
    self.assertEqual(v[1], 8)

    v.Add(2)  # also changes `tensor`
    self.assertEqual(tensor[0], 8)
    self.assertEqual(tensor[1], 10)

    del tensor
    gc.collect()
    self.assertEqual(v_reference_count, sys.getrefcount(v))

    self.assertEqual(v[0], 8)  # `v` is still alive
    self.assertEqual(v[1], 10)
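
# Note (an inference from the skip logic above, not asserted by Kaldi
# itself): when Kaldi is built without CUDA, the only configuration in
# which test_cu_vector_to_pytorch_cpu_tensor runs, CuVector keeps its
# data in host memory, which is why `v.to_dlpack()` can be consumed as
# a CPU tensor (`assertFalse(tensor.is_cuda)`).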

def test_pytorch_and_kaldi_gpu_tensor_zero_copy(self):
    # (fangjun): we put all tests in this function to avoid
    # invoking SelectGpuDevice() twice
    if not torch.cuda.is_available():
        print('No GPU detected! Skip it')
        return

    if not kaldi.CudaCompiled():
        print('Kaldi is not compiled with CUDA! Skip it')
        return

    device_id = 0

    # Kaldi and PyTorch will use the same GPU
    kaldi.SelectGpuDevice(device_id=device_id)

    device = torch.device('cuda', device_id)

    tensor = torch.arange(3).float()
    tensor = tensor.to(device)

    # make sure the tensor from PyTorch is indeed on GPU
    self.assertTrue(tensor.is_cuda)

    # GPU data is shared between kaldi::CuSubVector and the PyTorch GPU
    # tensor; no data is copied
    v = kaldi.CuSubVectorFromDLPack(to_dlpack(tensor))
    self.assertIsInstance(v, kaldi.FloatCuSubVector)

    v.Add(value=10)
    self.assertEqual(tensor[0], 10)
    self.assertEqual(tensor[1], 11)
    self.assertEqual(tensor[2], 12)

    v.Scale(value=6)
    self.assertEqual(tensor[0], 60)
    self.assertEqual(tensor[1], 66)
    self.assertEqual(tensor[2], 72)

    v.SetZero()
    self.assertEqual(tensor[0], 0)
    self.assertEqual(tensor[1], 0)
    self.assertEqual(tensor[2], 0)

    # Now for CuSubMatrix
    tensor = torch.arange(3).reshape(1, 3).float()
    tensor = tensor.to(device)

    # make sure the tensor from PyTorch is indeed on GPU
    self.assertTrue(tensor.is_cuda)

    m = kaldi.CuSubMatrixFromDLPack(to_dlpack(tensor))
    m.ApplyExp()

    self.assertAlmostEqual(tensor[0, 0], math.exp(0), places=7)
    self.assertAlmostEqual(tensor[0, 1], math.exp(1), places=7)
    self.assertAlmostEqual(tensor[0, 2], math.exp(2), places=7)

    m.SetZero()
    self.assertEqual(tensor[0, 0], 0)
    self.assertEqual(tensor[0, 1], 0)
    self.assertEqual(tensor[0, 2], 0)

    # now from Kaldi to PyTorch
    dim = 2
    cpu_v = kaldi.FloatVector(size=dim)
    cpu_v[0] = 10
    cpu_v[1] = 20

    gpu_v = kaldi.FloatCuVector(cpu_v)
    self.assertEqual(gpu_v[0], 10)
    self.assertEqual(gpu_v[1], 20)

    gpu_v_reference_count = sys.getrefcount(gpu_v)

    # memory is shared between `gpu_v` and `tensor`
    tensor = from_dlpack(gpu_v.to_dlpack())

    # `gpu_v.to_dlpack()` increases the reference count of `gpu_v`
    self.assertEqual(gpu_v_reference_count + 1, sys.getrefcount(gpu_v))

    self.assertTrue(tensor.is_cuda)
    self.assertEqual(tensor.device.index, device_id)

    self.assertEqual(tensor[0], 10)
    self.assertEqual(tensor[1], 20)

    tensor[0] = 1  # also changes `gpu_v`
    tensor[1] = 2
    self.assertEqual(gpu_v[0], 1)
    self.assertEqual(gpu_v[1], 2)

    gpu_v.Add(10)  # also changes `tensor`
    self.assertEqual(tensor[0], 11)
    self.assertEqual(tensor[1], 12)

    del tensor
    gc.collect()

    # now the reference count for gpu_v is decreased by one
    self.assertEqual(gpu_v_reference_count, sys.getrefcount(gpu_v))

    self.assertEqual(gpu_v[0], 11)  # gpu_v is still alive
    self.assertEqual(gpu_v[1], 12)

    # now for CuMatrix
    num_rows = 1
    num_cols = 2
    cpu_m = kaldi.FloatMatrix(row=num_rows, col=num_cols)
    cpu_m[0, 0] = 1
    cpu_m[0, 1] = 2

    gpu_m = kaldi.FloatCuMatrix(cpu_m)
    self.assertEqual(gpu_m[0, 0], 1)
    self.assertEqual(gpu_m[0, 1], 2)

    gpu_m_reference_count = sys.getrefcount(gpu_m)

    # memory is shared between `gpu_m` and `tensor`
    tensor = from_dlpack(gpu_m.to_dlpack())
    self.assertEqual(gpu_m_reference_count + 1, sys.getrefcount(gpu_m))

    self.assertTrue(tensor.is_cuda)
    self.assertEqual(tensor.device.index, device_id)

    self.assertEqual(tensor[0, 0], 1)
    self.assertEqual(tensor[0, 1], 2)

    tensor[0, 0] = 6  # also changes `gpu_m`
    tensor[0, 1] = 8
    self.assertEqual(gpu_m[0, 0], 6)
    self.assertEqual(gpu_m[0, 1], 8)

    gpu_m.Add(2)  # also changes `tensor`
    self.assertEqual(tensor[0, 0], 8)
    self.assertEqual(tensor[0, 1], 10)

    del tensor
    gc.collect()
    self.assertEqual(gpu_m_reference_count, sys.getrefcount(gpu_m))

    self.assertEqual(gpu_m[0, 0], 8)  # `gpu_m` is still alive
    self.assertEqual(gpu_m[0, 1], 10)

    # now for CuVector from_dlpack
    tensor = torch.tensor([1, 2]).float()
    tensor = tensor.to(device)

    # memory is shared between `tensor` and `v`
    v = kaldi.DLPackFloatCuSubVector.from_dlpack(to_dlpack(tensor))
    self.assertEqual(v[0], 1)

    v.Add(1)  # also changes `tensor`
    self.assertEqual(tensor[0], 2)
    self.assertEqual(tensor[1], 3)

    del v
    del tensor

    # now for CuMatrix from_dlpack
    tensor = torch.tensor([1, 2]).reshape(1, 2).float()
    tensor = tensor.to(device)

    # memory is shared between `tensor` and `m`
    m = kaldi.DLPackFloatCuSubMatrix.from_dlpack(to_dlpack(tensor))
    self.assertEqual(m[0, 0], 1)

    m.Add(100)  # also changes `tensor`
    self.assertEqual(tensor[0, 0], 101)

    del m
    del tensor
    gc.collect()

    # now test the issue: https://github.com/pytorch/pytorch/issues/9261
    # it will not consume all GPU memory
    for i in range(100):
        b = torch.randn(1024 * 1024 * 1024 // 4, 1, device=device)  # 1 GB
        a = kaldi.CuSubMatrixFromDLPack(to_dlpack(b))
        gc.collect()

    torch.cuda.empty_cache()

    for i in range(100 * 4):
        b = kaldi.FloatCuMatrix(1024 * 1024, 64)  # 256 MB
        a = from_dlpack(b.to_dlpack())
        gc.collect()
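
# This excerpt assumes the usual module prologue (not shown here):
# `import gc, math, sys, unittest`, `import numpy as np`, `import torch`,
# `from torch.utils.dlpack import from_dlpack, to_dlpack`, the `kaldi`
# pybind module, and a `unittest.TestCase` subclass that holds the
# methods above. Under that assumption, the file ends with the standard
# entry point:

if __name__ == '__main__':
    unittest.main()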