Example #1
0
def test_paddle_iterator_feed_ndarray():
    """Check that the Paddle plugin's ``feed_ndarray`` copies a DALI GPU
    tensor into a ``fluid`` LoDTensor correctly, both with DALI's internal
    stream (``cuda_stream=None``) and with the default CUDA stream
    (``cuda_stream=0``).
    """
    # NOTE: the original also imported DALIGenericIterator but never used it.
    from nvidia.dali.plugin.paddle import feed_ndarray
    from paddle import fluid

    num_gpus = 1
    batch_size = 100
    pipes, _ = create_pipeline(lambda gpu: CustomPipe(batch_size=batch_size, num_threads=4, device_id=gpu, num_gpus=num_gpus,
                                                      data_paths=image_data_set), batch_size, num_gpus)
    for gpu_id in range(num_gpus):
        pipe = pipes[gpu_id]
        pipe.build()
        outs = pipe.run()
        out_data = outs[0].as_tensor()
        # Host-side reference copy of the pipeline output, computed once.
        expected = outs[0].as_cpu().as_array()
        gpu_place = fluid.CUDAPlace(gpu_id)

        # Both stream variants must yield data identical to the reference.
        for stream in (None, 0):
            lod_tensor = fluid.core.LoDTensor()
            lod_tensor._set_dims(out_data.shape())
            # _mutable_data allocates the device buffer and returns the raw
            # pointer feed_ndarray writes into.
            ptr = lod_tensor._mutable_data(gpu_place, fluid.core.VarDesc.VarType.FP32)
            feed_ndarray(out_data, ptr, cuda_stream=stream)
            np.testing.assert_equal(np.array(lod_tensor), expected)
Example #2
0
    def __iter__(self):
        """Iterate over the DALI pipeline, yielding one batch per step.

        Yields ``(data, pyt_targets)`` when ``self.train`` is true, otherwise
        ``(data, (pyt_targets, img_ids, ratios))``, where:

        * ``data`` — CUDA float tensor holding the batch of images.
        * ``pyt_targets`` — ``[batch, max_detections, 5]`` CUDA tensor; the
          first 4 columns are boxes rescaled to pixel coordinates, the 5th is
          the class label; unused rows stay at -1.
        * ``img_ids`` — image ids (evaluation only).
        * ``ratios`` — per-image resize ratios to map predictions back to the
          original image scale.
        """
        for _ in range(self.__len__()):
            # ``data`` is immediately reassigned below; only ``ratios``
            # actually accumulates values in this loop.
            data, ratios = [], []
            dali_data, dali_boxes, dali_labels, dali_before_pad, dali_img_ids, dali_attr = self.pipe.run(
            )

            # convert images from dali tensors to pytorch
            # feed_ndarray copies the DALI tensor into the supplied torch
            # buffer and returns that buffer.
            data = feed_ndarray(
                dali_data.as_tensor(),
                torch.zeros(dali_data.as_tensor().shape(),
                            dtype=torch.float,
                            device=torch.device("cuda")),
            )

            # Widest per-image box count in the batch; the trailing 1 keeps
            # the target tensor non-empty even when no image has detections.
            max_detections = max(
                *(dali_boxes[i].shape()[0] for i in range(len(dali_boxes))), 1)
            # -1 marks unused (padded) detection slots.
            pyt_targets = -1 * torch.ones([len(dali_boxes), max_detections, 5])

            # get image ids. only needed for evaluation
            img_ids = torch.tensor(dali_img_ids.as_array())

            # Original (pre-resize) image sizes, pulled to the host.
            prior_size = dali_attr.as_cpu().as_array()

            # target has different size for each image so need to treat them separately
            for batch in range(self.batch_size):

                # Calculate image resize ratio to rescale boxes
                # Size after resize but before padding; assumes layout is
                # (H, W, ...) — TODO confirm against the pipeline definition.
                resized_size = dali_before_pad[batch].shape()[:2]

                # in this formulation to get true bbox you need to **multiply** prediction by ratio
                ratios.append(max(prior_size[batch]) / max(resized_size))

                # Rescale boxes
                pyt_bbox = feed_ndarray(dali_boxes[batch],
                                        torch.zeros(dali_boxes[batch].shape()))
                num_dets = pyt_bbox.size(0)
                if num_dets > 0:
                    # Boxes come out normalized; scale x-coords by width,
                    # y-coords by height to get pixel coordinates.
                    pyt_bbox[:, 0::2] *= float(resized_size[1])
                    pyt_bbox[:, 1::2] *= float(resized_size[0])
                    pyt_targets[batch, :num_dets, :4] = pyt_bbox

                # Arrange labels in target tensor
                np_label = feed_ndarray(
                    dali_labels[batch],
                    torch.empty(dali_labels[batch].shape(), dtype=torch.int32))
                # DALI CocoReader maps existing 90 classes to 80 unique classes. Need to map to 90 again
                # this is done by indexing numpy array of new (90) labels with old (80) labels
                pyt_label = torch.tensor(
                    COCO_80_TO_90_ARR[np_label.squeeze().numpy()])
                if num_dets > 0:
                    pyt_label -= 1  # [0, 90] => [-1, 89]. Removes background
                    pyt_targets[batch, :num_dets, 4] = pyt_label

            # Move targets to the GPU; non_blocking overlaps with compute.
            pyt_targets = pyt_targets.cuda(non_blocking=True)
            ratios = torch.tensor(ratios)

            if self.train:
                yield data, pyt_targets
            else:
                yield data, (pyt_targets, img_ids, ratios)
Example #3
0
    def __call__(self, input):
        """Run the DALI pipeline on ``input`` and return the resulting image
        batch as an NCHW ``uint8`` torch tensor on ``self._device``."""
        # Feed the pipeline and execute one iteration.
        self._pipe.set_data(input)
        pipe_output = self._pipe.run()

        # The first pipeline output holds the image batch on the GPU.
        dali_images: nvidia.dali.backend_impl.TensorGPU = pipe_output[0].as_tensor()

        # Allocate a torch destination of matching shape, then copy the DALI
        # buffer into it.
        torch_images = torch.empty(
            dali_images.shape(), dtype=torch.uint8, device=self._device)
        to_pytorch.feed_ndarray(dali_images, torch_images)

        # NHWC -> NCHW for downstream torch models.
        return torch_images.permute([0, 3, 1, 2])
Example #4
0
def test_pytorch_iterator_feed_ndarray():
    """Check that the PyTorch plugin's ``feed_ndarray`` copies a DALI GPU
    tensor into a pre-allocated CUDA torch tensor on the current stream.
    """
    # NOTE: the original also imported DALIGenericIterator but never used it.
    from nvidia.dali.plugin.pytorch import feed_ndarray
    import torch

    num_gpus = 1
    batch_size = 100
    pipes, _ = create_pipeline(lambda gpu: CustomPipe(batch_size=batch_size, num_threads=4, device_id=gpu, num_gpus=num_gpus,
                                                      data_paths=image_data_set), batch_size, num_gpus)
    for gpu_id in range(num_gpus):
        pipe = pipes[gpu_id]
        pipe.build()
        outs = pipe.run()
        out_data = outs[0].as_tensor()
        device = torch.device('cuda', gpu_id)
        # Destination buffer on the same device as the pipeline output.
        arr = torch.zeros(out_data.shape(), dtype=torch.float32, device=device)
        feed_ndarray(out_data, arr, cuda_stream=torch.cuda.current_stream(device=device))
        # The copied data must match the host-side reference.
        np.testing.assert_equal(arr.cpu().numpy(), outs[0].as_cpu().as_array())
Example #5
0
    def preprocess(self, pipe_out):
        """Turn one DALI pipeline output into a list of ``(image, target)``
        pairs on the GPU, with images scaled to [0, 1] in CHW layout."""
        images, all_boxes, all_labels, all_ids = pipe_out

        samples = []
        for idx in range(len(images)):
            # Copy the DALI GPU tensor into a fresh torch buffer on our
            # stream, then wait for the copy before touching the data.
            image = torch.empty(images[idx].shape(),
                                device="cuda",
                                dtype=torch.uint8)
            feed_ndarray(images[idx], image, self.stream)
            self.stream.synchronize()
            # HWC -> CHW, bytes -> floats in [0, 1].
            image = image.permute(2, 0, 1) / 255.

            image_id = torch.from_numpy(all_ids.at(idx)).cuda()
            bboxes = torch.from_numpy(all_boxes.at(idx)).cuda()
            # Labels are shifted down by one before upload.
            lbls = torch.from_numpy(all_labels.at(idx) - 1).squeeze(1).cuda()
            target = {"image_id": image_id, "boxes": bboxes, "labels": lbls.long()}

            samples.append((image, target))
        return samples
    def _run_one_step(self, pipeline, data_batches, current_data_batch):
        """Copy one pipeline iteration's outputs into torch tensors.

        Destination tensors are allocated lazily on the first call for each
        batch slot and cached in ``data_batches[current_data_batch]`` so
        later iterations reuse them.

        Returns the list of torch tensors holding this iteration's outputs.
        """
        p = pipeline
        outputs = p.share_outputs()
        device_id = p.device_id

        tensors = [out.as_tensor() for out in outputs]
        shapes = [t.shape() for t in tensors]

        if data_batches[current_data_batch] is None:
            # First time we see this slot: allocate destination tensors.
            # Import hoisted out of the per-output loop (the original
            # re-executed it on every iteration).
            from nvidia.dali.backend import TensorGPU
            torch_gpu_device = torch.device('cuda', device_id)
            torch_cpu_device = torch.device('cpu')

            pyt_tensors = []
            for tensor, shape in zip(tensors, shapes):
                # GPU-backed DALI tensors get a CUDA destination; everything
                # else lands on the host.
                device = torch_gpu_device if isinstance(tensor, TensorGPU) \
                    else torch_cpu_device
                pyt_tensors.append(
                    torch.zeros(shape,
                                dtype=to_torch_type[np.dtype(tensor.dtype())],
                                device=device))

            data_batches[current_data_batch] = pyt_tensors
        else:
            pyt_tensors = data_batches[current_data_batch]

        # Copy every DALI output into its (possibly reused) torch buffer.
        for tensor, pyt_tensor in zip(tensors, pyt_tensors):
            feed_ndarray(tensor, pyt_tensor)

        # Hand the buffers back to DALI and kick off the next iteration.
        p.release_outputs()
        p.schedule_run()

        return pyt_tensors
Example #7
0
def test_mxnet_iterator_feed_ndarray():
    """Check that the MXNet plugin's ``feed_ndarray`` copies a DALI GPU
    tensor into an ``mx.nd`` array correctly, both with DALI's internal
    stream (``cuda_stream=None``) and with the default CUDA stream
    (``cuda_stream=0``).
    """
    # NOTE: the original also imported DALIGenericIterator but never used it.
    from nvidia.dali.plugin.mxnet import feed_ndarray
    import mxnet as mx

    num_gpus = 1
    batch_size = 100
    pipes, _ = create_pipeline(lambda gpu: CustomPipe(batch_size=batch_size, num_threads=4, device_id=gpu, num_gpus=num_gpus,
                                                      data_paths=image_data_set), batch_size, num_gpus)
    for gpu_id in range(num_gpus):
        pipe = pipes[gpu_id]
        pipe.build()
        outs = pipe.run()
        out_data = outs[0].as_tensor()
        # Host-side reference copy of the pipeline output, computed once.
        expected = outs[0].as_cpu().as_array()
        with mx.Context(mx.gpu(gpu_id)):
            # Both stream variants must yield data identical to the reference.
            for stream in (None, 0):
                arr = mx.nd.zeros(out_data.shape(), dtype=np.float32)
                # Force allocation/pending-write completion before DALI
                # writes directly into the array's buffer.
                mx.base._LIB.MXNDArrayWaitToWrite(arr.handle)
                feed_ndarray(out_data, arr, cuda_stream=stream)
                np.testing.assert_equal(arr.asnumpy(), expected)