Esempio n. 1
0
    def test_pytorch_cpu_tensor_to_cu_subvector(self):
        """Check that a CuSubVector built from a CPU tensor shares its memory.

        The test only runs when Kaldi is built without CUDA; otherwise it
        is reported as skipped instead of silently passing.
        """
        if kaldi.CudaCompiled():
            self.skipTest('This test is for constructing a CuSubVector from '
                          'a CPU tensor; Kaldi is compiled with GPU, skip it')

        tensor = torch.tensor([10, 20]).float()
        v = kaldi.CuSubVectorFromDLPack(to_dlpack(tensor))

        v.SetZero()  # also changes tensor, since memory is shared
        self.assertEqual(tensor[0], 0)
        self.assertEqual(tensor[1], 0)

        v.Add(8)  # adds 8 to every element
        self.assertEqual(tensor[0], 8)
        self.assertEqual(tensor[1], 8)

        del v

        # memory is shared between `v` and `tensor`
        v = kaldi.DLPackFloatCuSubVector.from_dlpack(to_dlpack(tensor))

        v.Add(100)
        self.assertEqual(tensor[0], 108)
        self.assertEqual(tensor[1], 108)
Esempio n. 2
0
    def test_pytorch_and_kaldi_gpu_tensor_zero_copy(self):
        """Check zero-copy GPU memory sharing between PyTorch and Kaldi.

        Both directions go through DLPack: a PyTorch GPU tensor viewed as a
        Kaldi CuSubVector/CuSubMatrix, and a Kaldi CuVector/CuMatrix viewed
        as a PyTorch tensor. The test is reported as skipped when no GPU is
        detected or when Kaldi is not compiled with CUDA.
        """
        # (fangjun): we put all tests in this function to avoid
        # invoking SelectGpuDevice() twice

        if not torch.cuda.is_available():
            self.skipTest('No GPU detected! Skip it')

        if not kaldi.CudaCompiled():
            self.skipTest('Kaldi is not compiled with CUDA! Skip it')

        device_id = 0

        # Kaldi and PyTorch will use the same GPU
        kaldi.SelectGpuDevice(device_id=device_id)

        device = torch.device('cuda', device_id)

        tensor = torch.arange(3).float()
        tensor = tensor.to(device)

        # make sure the tensor from PyTorch is indeed on GPU
        self.assertTrue(tensor.is_cuda)

        # GPU data is shared between kaldi::CuSubVector and PyTorch GPU tensor
        # no data is copied
        v = kaldi.CuSubVectorFromDLPack(to_dlpack(tensor))
        self.assertIsInstance(v, kaldi.FloatCuSubVector)

        v.Add(value=10)
        self.assertEqual(tensor[0], 10)
        self.assertEqual(tensor[1], 11)
        self.assertEqual(tensor[2], 12)

        v.Scale(value=6)
        self.assertEqual(tensor[0], 60)
        self.assertEqual(tensor[1], 66)
        self.assertEqual(tensor[2], 72)

        v.SetZero()
        self.assertEqual(tensor[0], 0)
        self.assertEqual(tensor[1], 0)
        self.assertEqual(tensor[2], 0)

        # Now for CuSubMatrix
        tensor = torch.arange(3).reshape(1, 3).float()
        tensor = tensor.to(device)

        # make sure the tensor from PyTorch is indeed on GPU
        self.assertTrue(tensor.is_cuda)

        m = kaldi.CuSubMatrixFromDLPack(to_dlpack(tensor))
        m.ApplyExp()

        self.assertAlmostEqual(tensor[0, 0], math.exp(0), places=7)
        self.assertAlmostEqual(tensor[0, 1], math.exp(1), places=7)
        self.assertAlmostEqual(tensor[0, 2], math.exp(2), places=7)

        m.SetZero()
        self.assertEqual(tensor[0, 0], 0)
        self.assertEqual(tensor[0, 1], 0)
        self.assertEqual(tensor[0, 2], 0)

        # now from Kaldi to PyTorch

        dim = 2
        cpu_v = kaldi.FloatVector(size=dim)
        cpu_v[0] = 10
        cpu_v[1] = 20

        gpu_v = kaldi.FloatCuVector(cpu_v)
        self.assertEqual(gpu_v[0], 10)
        self.assertEqual(gpu_v[1], 20)

        gpu_v_reference_count = sys.getrefcount(gpu_v)

        # memory is shared between `gpu_v` and `tensor`
        tensor = from_dlpack(gpu_v.to_dlpack())

        # `gpu_v.to_dlpack()` increases the reference count of `gpu_v`
        self.assertEqual(gpu_v_reference_count + 1, sys.getrefcount(gpu_v))

        self.assertTrue(tensor.is_cuda)
        self.assertEqual(tensor.device.index, device_id)

        # assertEqual, not assertTrue: assertTrue(x, y) treats `y` as the
        # failure message and only checks that `x` is truthy
        self.assertEqual(tensor[0], 10)
        self.assertEqual(tensor[1], 20)

        tensor[0] = 1  # also changes `gpu_v`
        tensor[1] = 2

        self.assertEqual(gpu_v[0], 1)
        self.assertEqual(gpu_v[1], 2)

        gpu_v.Add(10)  # also changes `tensor`

        self.assertEqual(tensor[0], 11)
        self.assertEqual(tensor[1], 12)

        del tensor
        gc.collect()

        # now the reference count for gpu_v is decreased by one
        self.assertEqual(gpu_v_reference_count, sys.getrefcount(gpu_v))

        self.assertEqual(gpu_v[0], 11)  # gpu_v is still alive
        self.assertEqual(gpu_v[1], 12)

        # now for CuMatrix
        num_rows = 1
        num_cols = 2

        cpu_m = kaldi.FloatMatrix(row=num_rows, col=num_cols)
        cpu_m[0, 0] = 1
        cpu_m[0, 1] = 2

        gpu_m = kaldi.FloatCuMatrix(cpu_m)
        self.assertEqual(gpu_m[0, 0], 1)
        self.assertEqual(gpu_m[0, 1], 2)

        gpu_m_reference_count = sys.getrefcount(gpu_m)

        # memory is shared between `gpu_m` and `tensor`
        tensor = from_dlpack(gpu_m.to_dlpack())

        self.assertEqual(gpu_m_reference_count + 1, sys.getrefcount(gpu_m))

        self.assertTrue(tensor.is_cuda)
        self.assertEqual(tensor.device.index, device_id)

        self.assertEqual(tensor[0, 0], 1)
        self.assertEqual(tensor[0, 1], 2)

        tensor[0, 0] = 6  # also changes `gpu_m`
        tensor[0, 1] = 8

        self.assertEqual(gpu_m[0, 0], 6)
        self.assertEqual(gpu_m[0, 1], 8)

        gpu_m.Add(2)  # also changes `tensor`
        self.assertEqual(tensor[0, 0], 8)
        self.assertEqual(tensor[0, 1], 10)

        del tensor
        gc.collect()

        self.assertEqual(gpu_m_reference_count, sys.getrefcount(gpu_m))

        self.assertEqual(gpu_m[0, 0], 8)  # `gpu_m` is still alive
        self.assertEqual(gpu_m[0, 1], 10)

        # now for CuVector from_dlpack
        tensor = torch.tensor([1, 2]).float()
        tensor = tensor.to(device)

        # memory is shared between `tensor` and `v`
        v = kaldi.DLPackFloatCuSubVector.from_dlpack(to_dlpack(tensor))
        self.assertEqual(v[0], 1)

        v.Add(1)  # also changes `tensor`
        self.assertEqual(tensor[0], 2)
        self.assertEqual(tensor[1], 3)

        del v
        del tensor

        # now for CuMatrix from_dlpack
        tensor = torch.tensor([1, 2]).reshape(1, 2).float()
        tensor = tensor.to(device)

        # memory is shared between `tensor` and `m`
        m = kaldi.DLPackFloatCuSubMatrix.from_dlpack(to_dlpack(tensor))
        self.assertEqual(m[0, 0], 1)

        m.Add(100)  # also changes `tensor`
        self.assertEqual(tensor[0, 0], 101)

        del m
        del tensor
        gc.collect()

        # now test the issue: https://github.com/pytorch/pytorch/issues/9261
        # it will not consume all GPU memory
        # (`a` and `b` are rebound on every iteration, which drops the
        # references to the previous iteration's objects)
        for _ in range(100):
            b = torch.randn(1024 * 1024 * 1024 // 4, 1, device=device)  # 1G
            a = kaldi.CuSubMatrixFromDLPack(to_dlpack(b))
            gc.collect()
        torch.cuda.empty_cache()

        for _ in range(100 * 4):
            b = kaldi.FloatCuMatrix(1024 * 1024, 64)  # 256 MB
            a = from_dlpack(b.to_dlpack())
            gc.collect()
Esempio n. 3
0
    def test_case4(self):
        """Compute the GPU CTC loss for a padded minibatch of three examples.

        The per-example costs written to `costs` are compared with the
        expected values asserted at the end of the test.
        """
        device = torch.device('cuda', device_id)
        # combine case1 to case3 to a minibatch
        # the first example (a): input_length: 1, label_length: 1
        # the second example (c, c): input_length: 3, label_length: 2
        # the third example (b, c): input_length: 3, label_length: 2
        label_lengths_tensor = torch.tensor([1, 2, 2], dtype=torch.int32)
        input_lengths_tensor = torch.tensor([1, 3, 3], dtype=torch.int32)

        alphabet_size = 5
        minibatch = 3
        info = ctc.CtcOptions()
        info.loc = ctc.CtcComputeLocation.CTC_GPU
        info.blank_label = 0

        label_lengths = kaldi.IntSubVectorFromDLPack(
            to_dlpack(label_lengths_tensor))

        input_lengths = kaldi.IntSubVectorFromDLPack(
            to_dlpack(input_lengths_tensor))

        status, size_in_bytes = ctc.GetWorkspaceSize(
            label_lengths=label_lengths,
            input_lengths=input_lengths,
            alphabet_size=alphabet_size,
            minibatch=minibatch,
            info=info)
        self.assertEqual(status, ctc.CtcStatus.CTC_STATUS_SUCCESS)
        # round up so the float32 buffer holds at least `size_in_bytes` bytes
        num_floats = size_in_bytes // 4 + 1
        workspace_tensor = torch.empty(
            num_floats, dtype=torch.float32).contiguous().to(device)

        ex1 = torch.tensor([[0.2, 0.2, 0.2, 0.2, 0.2]], dtype=torch.float32)

        ex2 = torch.tensor(
            [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
            dtype=torch.float32)

        ex3 = torch.tensor([[-5, -4, -3, -2, -1], [-10, -9, -8, -7, -6],
                            [-15, -14, -13, -12, -11]],
                           dtype=torch.float32)

        # pad the shorter example to the longest one, then flatten
        activations_tensor = pad_sequence([ex1, ex2, ex3], batch_first=False)

        activations_tensor = activations_tensor.contiguous().view(-1).to(device)
        gradients_tensor = torch.empty_like(activations_tensor)

        # labels are: (a), (c, c) (b, c)
        # which are:  (1), (3, 3), (2, 3)
        flat_labels_tensor = torch.tensor([1, 3, 3, 2, 3], dtype=torch.int32)
        # costs are kept in a CPU tensor
        costs_tensor = torch.empty(minibatch, dtype=torch.float32)

        activations = kaldi.CuSubVectorFromDLPack(to_dlpack(activations_tensor))
        gradients = kaldi.CuSubVectorFromDLPack(to_dlpack(gradients_tensor))
        flat_labels = kaldi.IntSubVectorFromDLPack(
            to_dlpack(flat_labels_tensor))
        costs = kaldi.FloatSubVectorFromDLPack(to_dlpack(costs_tensor))
        workspace = kaldi.CuSubVectorFromDLPack(to_dlpack(workspace_tensor))

        status = ctc.ComputeCtcLossGpu(activations=activations,
                                       gradients=gradients,
                                       flat_labels=flat_labels,
                                       label_lengths=label_lengths,
                                       input_lengths=input_lengths,
                                       alphabet_size=alphabet_size,
                                       minibatch=minibatch,
                                       costs=costs,
                                       workspace=workspace,
                                       options=info)
        # check the returned status before looking at the costs, so a
        # failed computation is reported as such
        self.assertEqual(status, ctc.CtcStatus.CTC_STATUS_SUCCESS)

        self.assertAlmostEqual(costs[0], 1.6094379425049)
        self.assertAlmostEqual(costs[1], 7.355742931366)
        self.assertAlmostEqual(costs[2], 4.938850402832, places=6)
Esempio n. 4
0
    def test_case1(self):
        """Compute the GPU CTC loss for one single-frame sequence.

        The expected cost is taken from the warp-ctc tutorial linked below.
        """
        device = torch.device('cuda', device_id)

        # refer to https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
        # this is the simplest case
        # we have one sequence with probability: [0.2, 0.2, 0.2, 0.2, 0.2]
        label_lengths_tensor = torch.tensor([1], dtype=torch.int32)
        input_lengths_tensor = torch.tensor([1], dtype=torch.int32)
        alphabet_size = 5
        minibatch = 1
        info = ctc.CtcOptions()
        info.loc = ctc.CtcComputeLocation.CTC_GPU
        info.blank_label = 0

        label_lengths = kaldi.IntSubVectorFromDLPack(
            to_dlpack(label_lengths_tensor))

        input_lengths = kaldi.IntSubVectorFromDLPack(
            to_dlpack(input_lengths_tensor))

        status, size_in_bytes = ctc.GetWorkspaceSize(
            label_lengths=label_lengths,
            input_lengths=input_lengths,
            alphabet_size=alphabet_size,
            minibatch=minibatch,
            info=info)
        self.assertEqual(status, ctc.CtcStatus.CTC_STATUS_SUCCESS)
        # round up so the float32 buffer holds at least `size_in_bytes` bytes
        num_floats = size_in_bytes // 4 + 1
        workspace_tensor = torch.empty(
            num_floats, dtype=torch.float32).contiguous().to(device)

        activations_tensor = torch.tensor(
            [0.2, 0.2, 0.2, 0.2, 0.2],
            dtype=torch.float32).contiguous().to(device)
        gradients_tensor = torch.empty_like(activations_tensor)
        flat_labels_tensor = torch.tensor([1], dtype=torch.int32)
        # costs are kept in a CPU tensor
        costs_tensor = torch.empty(minibatch, dtype=torch.float32)

        activations = kaldi.CuSubVectorFromDLPack(to_dlpack(activations_tensor))
        gradients = kaldi.CuSubVectorFromDLPack(to_dlpack(gradients_tensor))
        flat_labels = kaldi.IntSubVectorFromDLPack(
            to_dlpack(flat_labels_tensor))
        costs = kaldi.FloatSubVectorFromDLPack(to_dlpack(costs_tensor))
        workspace = kaldi.CuSubVectorFromDLPack(to_dlpack(workspace_tensor))

        stream = torch.cuda.default_stream(device)
        with torch.cuda.stream(stream):
            status = ctc.ComputeCtcLossGpu(activations=activations,
                                           gradients=gradients,
                                           flat_labels=flat_labels,
                                           label_lengths=label_lengths,
                                           input_lengths=input_lengths,
                                           alphabet_size=alphabet_size,
                                           minibatch=minibatch,
                                           costs=costs,
                                           workspace=workspace,
                                           options=info)
        # check the returned status before looking at the costs, so a
        # failed computation is reported as such
        self.assertEqual(status, ctc.CtcStatus.CTC_STATUS_SUCCESS)

        # 1.6094379425049 is copied from
        # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
        self.assertAlmostEqual(costs[0], 1.6094379425049)
Esempio n. 5
0
    def test_case3(self):
        """Compute the GPU CTC loss for one three-frame sequence.

        The expected cost is taken from the warp-ctc tutorial linked below.
        """
        device = torch.device('cuda', device_id)
        # this is the third case
        # we have one sequence of 3 frames with activations:
        # [-5, -4, -3, -2, -1]
        # [-10, -9, -8, -7, -6]
        # [-15, -14, -13, -12, -11]
        label_lengths_tensor = torch.tensor([2], dtype=torch.int32)
        input_lengths_tensor = torch.tensor([3], dtype=torch.int32)
        alphabet_size = 5
        minibatch = 1
        info = ctc.CtcOptions()
        info.loc = ctc.CtcComputeLocation.CTC_GPU
        info.blank_label = 0

        label_lengths = kaldi.IntSubVectorFromDLPack(
            to_dlpack(label_lengths_tensor))

        input_lengths = kaldi.IntSubVectorFromDLPack(
            to_dlpack(input_lengths_tensor))

        status, size_in_bytes = ctc.GetWorkspaceSize(
            label_lengths=label_lengths,
            input_lengths=input_lengths,
            alphabet_size=alphabet_size,
            minibatch=minibatch,
            info=info)
        self.assertEqual(status, ctc.CtcStatus.CTC_STATUS_SUCCESS)
        # round up so the float32 buffer holds at least `size_in_bytes` bytes
        num_floats = size_in_bytes // 4 + 1
        workspace_tensor = torch.empty(
            num_floats, dtype=torch.float32).contiguous().to(device)

        activations_tensor = torch.tensor(
            [[-5, -4, -3, -2, -1], [-10, -9, -8, -7, -6],
             [-15, -14, -13, -12, -11]],
            dtype=torch.float32).contiguous().view(-1).to(device)
        gradients_tensor = torch.empty_like(activations_tensor)
        # the target sequence is b c, which is 2 3
        flat_labels_tensor = torch.tensor([2, 3], dtype=torch.int32)
        # costs are kept in a CPU tensor
        costs_tensor = torch.empty(minibatch, dtype=torch.float32)

        activations = kaldi.CuSubVectorFromDLPack(to_dlpack(activations_tensor))
        gradients = kaldi.CuSubVectorFromDLPack(to_dlpack(gradients_tensor))
        flat_labels = kaldi.IntSubVectorFromDLPack(
            to_dlpack(flat_labels_tensor))
        costs = kaldi.FloatSubVectorFromDLPack(to_dlpack(costs_tensor))
        workspace = kaldi.CuSubVectorFromDLPack(to_dlpack(workspace_tensor))

        status = ctc.ComputeCtcLossGpu(activations=activations,
                                       gradients=gradients,
                                       flat_labels=flat_labels,
                                       label_lengths=label_lengths,
                                       input_lengths=input_lengths,
                                       alphabet_size=alphabet_size,
                                       minibatch=minibatch,
                                       costs=costs,
                                       workspace=workspace,
                                       options=info)
        # check the returned status before looking at the costs, so a
        # failed computation is reported as such
        self.assertEqual(status, ctc.CtcStatus.CTC_STATUS_SUCCESS)

        # 4.938850402832 is copied from
        # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
        self.assertAlmostEqual(costs[0], 4.938850402832, places=6)