Example #1
def index_block_batches_T(indexed, tensor, batch, patchsize, nblocks, gpuid):

    # -- prepare data --
    print("indexed.shape ", indexed.shape)
    print("tensor.shape ", tensor.shape)
    print("batch.shape ", batch.shape)

    batchsize = batch.shape[2]
    indexed = indexed[:, :, :batchsize]
    numba.cuda.select_device(gpuid)
    indexed_nba = cuda.as_cuda_array(indexed)
    batch_nba = cuda.as_cuda_array(batch)
    tensor_nba = cuda.as_cuda_array(tensor)

    # -- prepare cuda --
    npix = tensor.shape[1]
    threads_per_block = 64
    blocks = npix // threads_per_block + 1

    # -- run cuda --
    index_block_batches_cuda_T[blocks,
                               threads_per_block](indexed_nba, tensor_nba,
                                                  batch_nba, patchsize,
                                                  nblocks)
    return indexed
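For orientation, a minimal self-contained sketch (names and shapes invented) of the pattern the first two examples share: cuda.as_cuda_array wraps a CUDA tensor zero-copy through __cuda_array_interface__, and the launch grid is sized by ceiling division. Note that Example #1's npix // threads_per_block + 1 over-allocates one block whenever npix divides evenly, which is harmless as long as the kernel guards its index:

import torch
from numba import cuda

@cuda.jit
def add_one(x):
    i = cuda.grid(1)
    if i < x.size:          # guard: the grid may be padded past x.size
        x[i] += 1.0

t = torch.zeros(1000, device="cuda")
x = cuda.as_cuda_array(t)   # zero-copy view; kernel writes land in t's storage
threads_per_block = 64
blocks = (x.size + threads_per_block - 1) // threads_per_block  # exact ceiling
add_one[blocks, threads_per_block](x)
assert torch.all(t == 1.0)  # the tensor sees the kernel's writes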
Example #2
def index_along_frames(patches, dframes):

    # -- set cuda device --
    device = patches.device
    gpuid = device.index
    numba.cuda.select_device(gpuid)

    # -- get shapes --
    naligns, npatches, nframes, nftrs = patches.shape
    naligns, npatches = dframes.shape
    fpatches = torch.zeros((naligns, npatches, nftrs))
    fpatches = fpatches.to(device)
    # print("naligns,npatches,nframes,nftrs",naligns,npatches,nframes,nftrs)

    # -- create numba cuda --
    dframes_nba = cuda.as_cuda_array(dframes)
    patches_nba = cuda.as_cuda_array(patches)
    fpatches_nba = cuda.as_cuda_array(fpatches)

    # -- exec indexing cuda-kernel --
    threads_per_block = (32, 32)
    blocks_aligns = naligns // threads_per_block[0] + (
        naligns % threads_per_block[0] != 0)
    blocks_patches = npatches // threads_per_block[1] + (
        npatches % threads_per_block[1] != 0)
    blocks = (blocks_aligns, blocks_patches)
    index_along_frames_cuda[blocks,
                            threads_per_block](fpatches_nba, patches_nba,
                                               dframes_nba)
    return fpatches
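The kernel launched above is not included in the example. As a rough idea of what a kernel with this 2-D (aligns, patches) launch shape could look like, here is a hypothetical sketch; the real index_along_frames_cuda may differ:

from numba import cuda

@cuda.jit
def index_along_frames_cuda(fpatches, patches, dframes):
    # One thread per (alignment, patch) pair, matching the 2-D launch grid.
    a, p = cuda.grid(2)
    if a < fpatches.shape[0] and p < fpatches.shape[1]:
        f = dframes[a, p]               # frame selected for this pair
        for c in range(fpatches.shape[2]):
            fpatches[a, p, c] = patches[a, p, f, c]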
Example #3
def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
    rand_mat = (cp.random.rand(nrows, ncols) * 10)
    rand_mat = cp.array(rand_mat, order=order).astype(dtype)

    if type == 'numpy':
        result = np.array(cp.asnumpy(rand_mat), order=order)

    if type == 'cupy':
        result = rand_mat

    if type == 'numba':
        result = nbcuda.as_cuda_array(rand_mat)

    if type == 'cudf':
        result = cudf.DataFrame()
        result = result.from_gpu_matrix(nbcuda.as_cuda_array(rand_mat))

    if type == 'pandas':
        result = cudf.DataFrame()
        result = result.from_gpu_matrix(nbcuda.as_cuda_array(rand_mat))
        result = result.to_pandas()

    if type == 'cuml':
        result = CumlArray(data=rand_mat,
                           dtype=dtype,
                           shape=rand_mat.shape,
                           order=order if order != 'K' else None)

    if out_dtype:
        return result, np.array(cp.asnumpy(rand_mat).astype(out_dtype),
                                order=order)
    else:
        return result, np.array(cp.asnumpy(rand_mat), order=order)
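For reference, a typical call (argument values arbitrary): the helper returns the requested container plus a host-side NumPy copy to assert against.

import numpy as np

X, X_ref = get_input('numba', nrows=100, ncols=4, dtype=np.float32)
# X: Numba device array wrapping the CuPy allocation (zero-copy);
# X_ref: NumPy copy of the same values for host-side comparison.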
Example #4
    def test_array_views(self):
        """Views created via array interface support:
            - Strided slices
            - Strided slices
        """
        h_arr = np.random.random(10)
        c_arr = cuda.to_device(h_arr)

        arr = cuda.as_cuda_array(c_arr)

        # __getitem__ interface accesses expected data

        # Direct views
        np.testing.assert_array_equal(arr.copy_to_host(), h_arr)
        np.testing.assert_array_equal(arr[:].copy_to_host(), h_arr)

        # Slicing
        np.testing.assert_array_equal(arr[:5].copy_to_host(), h_arr[:5])

        # Strided view
        np.testing.assert_array_equal(arr[::2].copy_to_host(), h_arr[::2])

        # View of strided array
        arr_strided = cuda.as_cuda_array(c_arr[::2])
        np.testing.assert_array_equal(arr_strided.copy_to_host(), h_arr[::2])

        # A strided-view-of-array and view-of-strided-array have the same
        # shape, strides, itemsize, and alloc_size
        self.assertEqual(arr[::2].shape, arr_strided.shape)
        self.assertEqual(arr[::2].strides, arr_strided.strides)
        self.assertEqual(arr[::2].dtype.itemsize, arr_strided.dtype.itemsize)
        self.assertEqual(arr[::2].alloc_size, arr_strided.alloc_size)
        self.assertEqual(arr[::2].nbytes,
                         arr_strided.size * arr_strided.dtype.itemsize)

        # __setitem__ interface propagates into external array

        # Writes to a slice
        arr[:5] = np.pi
        np.testing.assert_array_equal(
            c_arr.copy_to_host(),
            np.concatenate((np.full(5, np.pi), h_arr[5:])))

        # Writes to a slice from a view
        arr[:5] = arr[5:]
        np.testing.assert_array_equal(c_arr.copy_to_host(),
                                      np.concatenate((h_arr[5:], h_arr[5:])))

        # Writes through a view
        arr[:] = cuda.to_device(h_arr)
        np.testing.assert_array_equal(c_arr.copy_to_host(), h_arr)

        # Writes to a strided slice
        arr[::2] = np.pi
        np.testing.assert_array_equal(
            c_arr.copy_to_host()[::2],
            np.full(5, np.pi),
        )
        np.testing.assert_array_equal(c_arr.copy_to_host()[1::2], h_arr[1::2])
Example #5
    def test_consume_no_sync(self):
        # Create a foreign array with no stream
        f_arr = ForeignArray(cuda.device_array(10))

        with patch.object(cuda.cudadrv.driver.Stream,
                          'synchronize',
                          return_value=None) as mock_sync:
            cuda.as_cuda_array(f_arr)

        # Ensure the synchronize method of a stream was not called
        mock_sync.assert_not_called()
Example #6
    def test_consume_sync(self):
        # Create a foreign array with a stream
        s = cuda.stream()
        f_arr = ForeignArray(cuda.device_array(10, stream=s))

        with patch.object(cuda.cudadrv.driver.Stream,
                          'synchronize',
                          return_value=None) as mock_sync:
            cuda.as_cuda_array(f_arr)

        # Ensure the synchronize method of a stream was called
        mock_sync.assert_called_once_with()
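The ForeignArray helper used by these two tests (and by several later ones) is not shown. A minimal stand-in, assuming its only job is to hide the Numba type while still exposing the CUDA Array Interface, so Numba must import the object as a foreign producer and, by default, synchronize on any stream the interface advertises:

class ForeignArray:
    """Wraps a device array but exposes only __cuda_array_interface__,
    so consumers cannot recognize it as a Numba device array."""

    def __init__(self, arr):
        self._arr = arr
        self.__cuda_array_interface__ = arr.__cuda_array_interface__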
Example #7
def create_input(input_type, dtype, shape, order):
    rand_ary = cp.ones(shape, dtype=dtype, order=order)

    if input_type == 'numpy':
        return np.array(cp.asnumpy(rand_ary), dtype=dtype, order=order)

    elif input_type == 'numba':
        return cuda.as_cuda_array(rand_ary)

    elif input_type == 'series':
        return cudf.Series(cuda.as_cuda_array(rand_ary))

    else:
        return rand_ary
Example #8
def get_small_dataset(output_type):
    ary = [[1.0, 4.0, 4.0], [2.0, 2.0, 2.0], [5.0, 1.0, 1.0]]
    ary = cp.asarray(ary)

    if output_type == 'numba':
        return as_cuda_array(ary)

    elif output_type == 'cupy':
        return ary

    elif output_type == 'numpy':
        return cp.asnumpy(ary)

    else:
        return cudf.DataFrame.from_gpu_matrix(as_cuda_array(ary))
Example #9
    def __init__(self, minibatch: int, maxT: int, maxU: int,
                 alphabet_size: int, workspace, blank: int, num_threads: int,
                 stream):
        """
        Helper class to launch the CUDA Kernels to compute the Transducer Loss.

        Args:
            minibatch: Int representing the batch size.
            maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor.
            maxU: The maximum possible target sequence length. Represents U in the logprobs tensor.
            alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank).
            workspace: An allocated chunk of memory that will be sliced off and reshaped into required
                blocks used as working memory.
            blank: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab.
            num_threads: Number of OMP threads to launch.
            stream: Numba Cuda Stream.
        """
        self.minibatch_ = minibatch
        self.maxT_ = maxT
        self.maxU_ = maxU
        self.alphabet_size_ = alphabet_size
        self.gpu_workspace = cuda.as_cuda_array(
            workspace
        )  # a flat vector of floatX numbers that represents allocated memory slices
        self.blank_ = blank
        self.num_threads_ = num_threads
        self.stream_ = stream  # type: cuda.cudadrv.driver.Stream

        if num_threads > 0:
            numba.set_num_threads(min(multiprocessing.cpu_count(),
                                      num_threads))
        else:
            self.num_threads_ = numba.get_num_threads()
Example #10
def apply_binarize(in_col, width):
    buf = rmm.DeviceBuffer(size=(in_col.size * width))
    out = cuda.as_cuda_array(buf).view("int8").reshape((in_col.size, width))
    if out.size > 0:
        out[:] = 0
        binarize.forall(out.size)(in_col, out, width)
    return out
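The binarize kernel is not shown and its exact semantics are project-specific. A hypothetical sketch consistent with the launch above (one-hot-style writes into a width-wide int8 row per input element) might be:

from numba import cuda

@cuda.jit
def binarize(in_col, out, width):
    # forall(out.size) launches at least out.size threads; only the first
    # in_col.size of them do work here.
    i = cuda.grid(1)
    if i < in_col.size:
        out[i, in_col[i] % width] = 1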
Example #11
    def __call__(self, tensor, mode=0):
        r""" Converts float weights to quantized weights.

        Args:
            - tensor: input data
            - mode: GFPQ mode for param
                GFPQ_MODE_INIT(0): There is no valid parameter in param[].
                    Generate the parameter and filled in param[].
                GFPQ_MODE_UPDATE(1): There is parameter in param[]. Generate
                    new parameter, update param[] when the new parameter is
                    better.
                GFPQ_MODE_APPLY_ONLY(2): There is parameter in param[]. Don't
                    generate parameter. Just use the param[].
        """

        # NOTE: this early return disables the quantization below, leaving
        # the remaining lines unreachable as written.
        return tensor
        data_cuda_array = cuda.as_cuda_array(tensor.data.detach())
        data_p = data_cuda_array.device_ctypes_pointer
        self._param.mode = mode
        ret = self._libquant.HI_GFPQ_QuantAndDeQuant_GPU_PY(
            data_p, data_cuda_array.size, self._bit_width,
            ctypes.byref(self._param), self._stream.handle,
            self._cublas_handle)
        assert ret == 0, "HI_GFPQ_QuantAndDeQuant failed(%d)\n" % (ret)
        return tensor
Example #12
def convert_dtype(X, to_dtype=np.float32):
    """
    Convert X to be of dtype `to_dtype`.

    Only float dtypes are supported for overflow checking.
    Todo: support other dtypes if needed.
    """

    if isinstance(X, np.ndarray):
        dtype = X.dtype
        if dtype != to_dtype:
            X_m = X.astype(to_dtype)
            if len(X[X == np.inf]) > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m

    elif isinstance(X, cudf.Series) or isinstance(X, cudf.DataFrame):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        X_m = cp.asarray(X)
        X_m = X_m.astype(to_dtype)
        return cuda.as_cuda_array(X_m)

    else:
        raise TypeError("Received unsupported input type " % type(X))

    return X
Example #13
    def check_ipc_handle_serialization(self, index_arg=None, foreign=False):
        # prepare data for IPC
        arr = np.arange(10, dtype=np.intp)
        devarr = cuda.to_device(arr)
        if index_arg is not None:
            devarr = devarr[index_arg]
        if foreign:
            devarr = cuda.as_cuda_array(ForeignArray(devarr))
        expect = devarr.copy_to_host()

        # create IPC handle
        ctx = cuda.current_context()
        ipch = ctx.get_ipc_handle(devarr.gpu_data)

        # pickle
        buf = pickle.dumps(ipch)
        ipch_recon = pickle.loads(buf)
        self.assertIs(ipch_recon.base, None)
        self.assertEqual(tuple(ipch_recon.handle), tuple(ipch.handle))
        self.assertEqual(ipch_recon.size, ipch.size)

        # spawn new process for testing
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        args = (ipch, result_queue)
        proc = ctx.Process(target=serialize_ipc_handle_test, args=args)
        proc.start()
        succ, out = result_queue.get()
        if not succ:
            self.fail(out)
        else:
            np.testing.assert_equal(expect, out)
        proc.join(3)
Example #14
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `dtype`, raising a TypeError
    if the conversion would lose information.
    """
    would_lose_info = _typecast_will_lose_information(X, to_dtype)
    if would_lose_info:
        raise TypeError("Data type conversion would lose information.")

    if isinstance(X, np.ndarray):
        dtype = X.dtype
        if dtype != to_dtype:
            X_m = X.astype(to_dtype)
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame, pd.Series, pd.DataFrame)):
        return X.astype(to_dtype, copy=False)

    elif cuda.is_cuda_array(X):
        X_m = cp.asarray(X)
        X_m = X_m.astype(to_dtype, copy=False)

        if legacy:
            return cuda.as_cuda_array(X_m)
        else:
            return CumlArray(data=X_m)

    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    return X
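_typecast_will_lose_information is referenced above but not shown. A hedged sketch for NumPy inputs only, under the assumption that "losing information" means values outside the target dtype's representable range (the real helper presumably also handles GPU containers):

import numpy as np

def _typecast_will_lose_information(X, to_dtype):
    # Sketch: NumPy arrays only; returns True if any value of X falls
    # outside the representable range of to_dtype.
    to_dtype = np.dtype(to_dtype)
    if not isinstance(X, np.ndarray) or to_dtype.kind not in "iuf":
        return False
    info = np.finfo(to_dtype) if to_dtype.kind == "f" else np.iinfo(to_dtype)
    return bool((X < info.min).any() or (X > info.max).any())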
Example #15
    def predict(self, features_gen=None, as_cuda_array=False, flatten=True):
        if features_gen is None:
            features_gen = self._build_features()
        predicted_cva = torch.empty((self.diffusion_engine.num_defs_per_path *
                                     self.diffusion_engine.num_paths, 1),
                                    dtype=torch.float32,
                                    device=self.device)
        with cuda.devices.gpus[self.device.index]:
            d_predicted_cva = cuda.as_cuda_array(
                predicted_cva.view(self.diffusion_engine.num_defs_per_path,
                                   self.diffusion_engine.num_paths))
        if as_cuda_array:
            out = d_predicted_cva
        else:
            out = cuda.pinned_array((self.diffusion_engine.num_defs_per_path,
                                     self.diffusion_engine.num_paths),
                                    dtype=np.float32)
        if flatten:
            out = out.reshape(-1)
        while True:
            t = yield
            self._predict(t, features_gen, predicted_cva)
            if not as_cuda_array:
                d_predicted_cva.copy_to_host(out)
            yield out
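predict is a coroutine-style generator: advance it to the bare t = yield, then send a time index to receive that step's prediction. A usage sketch with assumed driver names (model, num_steps):

gen = model.predict()
next(gen)                   # run to the first `t = yield`
for t in range(num_steps):
    cva_t = gen.send(t)     # executes _predict(t, ...) and yields `out`
    next(gen)               # loop back to the next `t = yield`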
Example #16
def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
    rand_mat = (cp.random.rand(nrows, ncols) * 10)
    rand_mat = cp.array(rand_mat, dtype=dtype, order=order)

    if type == 'numpy':
        result = np.array(cp.asnumpy(rand_mat), order=order)

    if type == 'cupy':
        result = rand_mat

    if type == 'numba':
        result = nbcuda.as_cuda_array(rand_mat)

    if type == 'cudf':
        result = cudf.DataFrame(rand_mat)

    if type == 'pandas':
        result = pdDF(cp.asnumpy(rand_mat))

    if type == 'cuml':
        result = CumlArray(data=rand_mat)

    if out_dtype:
        return result, np.array(cp.asnumpy(rand_mat).astype(out_dtype),
                                order=order)
    else:
        return result, np.array(cp.asnumpy(rand_mat), order=order)
Example #17
    def test_consume_sync_disabled(self):
        # Create a foreign array with a stream
        s = cuda.stream()
        f_arr = ForeignArray(cuda.device_array(10, stream=s))

        # Set sync to false before testing. The test suite should generally be
        # run with sync enabled, but stash the old value just in case it is
        # not.
        with override_config('CUDA_ARRAY_INTERFACE_SYNC', False):
            with patch.object(cuda.cudadrv.driver.Stream,
                              'synchronize',
                              return_value=None) as mock_sync:
                cuda.as_cuda_array(f_arr)

            # Ensure the synchronize method of a stream was not called
            mock_sync.assert_not_called()
Example #18
    def test_consume_no_stream(self):
        # Create a foreign array with no stream
        f_arr = ForeignArray(cuda.device_array(10))

        # Ensure that the imported array has no default stream
        c_arr = cuda.as_cuda_array(f_arr)
        self.assertEqual(c_arr.stream, 0)
Example #19
    def deserialize(cls, header, frames):
        # Deserialize the mask, value, and offset frames
        arrays = []

        for each_frame in frames:
            if hasattr(each_frame, "__cuda_array_interface__"):
                each_frame = cuda.as_cuda_array(each_frame)
            elif isinstance(each_frame, memoryview):
                each_frame = np.asarray(each_frame)
                each_frame = cudautils.to_device(each_frame)

            arrays.append(libcudf.cudf.get_ctype_ptr(each_frame))

        # Use from_offsets to get nvstring data.
        # Note: array items = [nbuf, sbuf, obuf]
        scount = header["nvstrings"]
        data = nvstrings.from_offsets(
            arrays[1],
            arrays[2],
            scount,
            nbuf=arrays[0],
            ncount=header["null_count"],
            bdevmem=True,
        )
        return column.as_column(data)
Example #20
    def auto_device(self, obj, stream=0, copy=True):
        """
        Create a DeviceRecord or DeviceArray like obj and optionally copy data
        from host to device. If obj already represents device memory, it is
        returned and no copy is made. Uses RMM for device memory allocation if
        necessary.
        """
        if cuda.driver.is_device_memory(obj):
            return obj, False
        if hasattr(obj, '__cuda_array_interface__'):
            new_dev_array = cuda.as_cuda_array(obj)
            # Allocate new output array using rmm and copy the numba device
            # array to an rmm owned device array
            out_dev_array = self.device_array_like(new_dev_array)
            out_dev_array.copy_to_device(new_dev_array)
            return out_dev_array, False
        else:
            if isinstance(obj, np.void):
                # raise NotImplementedError("DeviceRecord type not supported "
                #                            "by RMM")
                devobj = cuda.devicearray.from_record_like(obj, stream=stream)
            else:
                if not isinstance(obj, np.ndarray):
                    obj = np.asarray(obj)
                cuda.devicearray.sentry_contiguous(obj)
                devobj = self.device_array_like(obj, stream=stream)

            if copy:
                devobj.copy_to_device(obj, stream=stream)
            return devobj, True
Example #21
def device_to_host(obj: object) -> DeviceSerialized:
    header, frames = serialize(obj, serializers=["cuda", "pickle"])
    is_cuda = [hasattr(f, "__cuda_array_interface__") for f in frames]
    frames = [
        cuda.as_cuda_array(f).copy_to_host() if ic else f
        for ic, f in zip(is_cuda, frames)
    ]
    return DeviceSerialized(header, frames, is_cuda)
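Usage sketch: device_to_host pulls every CUDA frame produced by the serializers into host memory so the object can be spilled or shipped (assumes CuPy and the Dask serialize machinery imported above are available):

import cupy

ds = device_to_host(cupy.arange(10))
# Per the constructor above, ds carries (header, frames, is_cuda): the CUDA
# frames have been replaced by host copies, and is_cuda records which ones
# must be re-uploaded when moving the object back to the device.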
Example #22
def uniform_pix_sampling(aligned, S=2):
    nframes, nimages, ncolor, h, w = aligned.shape
    device = aligned.device
    gpuid = device.index
    numba.cuda.select_device(gpuid)
    sims = torch.zeros((S, nimages, ncolor, h, w)).to(device)
    # rands = np.random.choice(nframes,(h,w))

    aligned_nba = cuda.as_cuda_array(aligned)
    sims_nba = cuda.as_cuda_array(sims)
    index_bursts_by_frames(sims_nba, aligned_nba)

    masks = torch.zeros((S, nimages, ncolor, h, w)).to(device)
    masks_nba = cuda.as_cuda_array(masks)
    fill_masks(masks_nba, aligned_nba)

    return sims, masks
Example #23
def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
    if has_cupy:
        import cupy as cp
        rand_mat = (cp.random.rand(nrows, ncols) * 10)
        rand_mat = cp.array(rand_mat, order=order).astype(dtype)

        if type == 'numpy':
            result = np.array(cp.asnumpy(rand_mat), order=order)

        if type == 'cupy':
            result = rand_mat

        if type == 'numba':
            result = cuda.as_cuda_array(rand_mat)

        if type == 'dataframe':
            X_df = cudf.DataFrame()
            result = X_df.from_gpu_matrix(cuda.as_cuda_array(rand_mat))

        if out_dtype:
            return result, np.array(cp.asnumpy(rand_mat).astype(out_dtype),
                                    order=order)
        else:
            return result, np.array(cp.asnumpy(rand_mat), order=order)

    else:
        rand_mat = (np.random.rand(nrows, ncols) * 10)
        rand_mat = np.array(rand_mat, order=order).astype(dtype)

        if type == 'numpy':
            result = deepcopy(rand_mat)

        if type == 'cupy':
            result = None

        if type == 'numba':
            result = cuda.to_device(rand_mat)

        if type == 'dataframe':
            X_df = cudf.DataFrame()
            result = X_df.from_gpu_matrix(cuda.to_device(rand_mat))

        if out_dtype:
            return result, rand_mat.astype(out_dtype)
        else:
            return result, rand_mat
Example #24
def as_contiguous(arr):
    assert arr.ndim == 1
    cupy_dtype = arr.dtype
    if np.issubdtype(cupy_dtype, np.datetime64):
        cupy_dtype = np.dtype("int64")
        arr = arr.view("int64")
    out = cupy.ascontiguousarray(cupy.asarray(arr))
    return cuda.as_cuda_array(out).view(arr.dtype)
Example #25
def convert_dtype(X, to_dtype=np.float32):
    """
    Convert X to be of dtype `to_dtype`.

    Only float dtypes are supported for overflow checking.
    Todo: support other dtypes if needed.
    """

    # Using cuDF for converting numba and device array interface inputs
    # if CuPy not installed, temporary while CuPy conda package
    # causes nccl conflicts
    if isinstance(X, np.ndarray):
        dtype = X.dtype
        if dtype != to_dtype:
            X_m = X.astype(to_dtype)
            if len(X[X == np.inf]) > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m

    elif isinstance(X, cudf.Series):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        if has_cupy():
            import cupy as cp
            X_m = cp.asarray(X)
            X_m = X_m.astype(to_dtype)
            return cuda.as_cuda_array(X_m)
        else:
            warnings.warn("Using cuDF for dtype conversion, install "
                          "CuPy for faster data conversion.")
            if (len(X.shape) == 1):
                return cudf.Series(X).astype(to_dtype).to_gpu_array()
            else:
                X_df = cudf.DataFrame()
                X = X_df.from_gpu_matrix(X)
                X = convert_dtype(X, to_dtype=to_dtype)
                return X.as_gpu_matrix()

    elif isinstance(X, cudf.DataFrame):
        dtype = np.dtype(X[X.columns[0]]._column.dtype)
        if dtype != to_dtype:
            new_cols = [(col, X._cols[col].astype(to_dtype))
                        for col in X._cols]
            overflowed = sum([len(colval[colval >= np.inf])
                              for colname, colval in new_cols])

            if overflowed > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")

            return cudf.DataFrame(new_cols)

    else:
        raise TypeError("Received unsupported input type " % type(X))

    return X
Example #26
    def test_consume_stream(self):
        # Create a foreign array with a stream
        s = cuda.stream()
        f_arr = ForeignArray(cuda.device_array(10, stream=s))

        # Ensure that an imported array has the stream as its default stream
        c_arr = cuda.as_cuda_array(f_arr)
        self.assertTrue(c_arr.stream.external)
        self.assertEqual(c_arr.stream.handle.value, s.handle.value)
Example #27
def full(size, value, dtype):
    cupy_dtype = dtype
    if np.issubdtype(cupy_dtype, np.datetime64):
        time_unit, _ = np.datetime_data(cupy_dtype)
        cupy_dtype = np.int64
        value = np.datetime64(value, time_unit).view(cupy_dtype)

    out = cupy.full(size, value, cupy_dtype)
    return cuda.as_cuda_array(out).view(dtype)
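A hedged usage sketch of the datetime path: since CuPy has no datetime64 dtype, the value is materialized as int64 on device and only the final Numba view reinterprets it (assuming DeviceNDArray.view accepts the dtype change, as this code relies on):

import numpy as np

col = full(3, np.datetime64("2020-01-01"), np.dtype("datetime64[s]"))
# col is a Numba device array whose int64 payload is viewed as datetime64[s];
# col.copy_to_host() yields three identical NumPy timestamps.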
Example #28
    def as_device_array(self, obj):
        # We don't want to call as_cuda_array on objects that are already Numba
        # device arrays, because this results in exporting the array as a
        # Producer then importing it as a Consumer, which causes a
        # synchronization on the array's stream (if it has one) by default.
        # When we have a Numba device array, we can simply return it.
        if cuda.cudadrv.devicearray.is_cuda_ndarray(obj):
            return obj
        return cuda.as_cuda_array(obj)
Example #29
    def _build_labels_backward(self, as_cuda_tensor):
        d_spread_integral_now = self.diffusion_engine.d_spread_integrals[0, 1:]
        d_spread_integral_next = self.diffusion_engine.d_spread_integrals[1,
                                                                          1:]
        d_mtm_next = self.diffusion_engine.d_mtm_by_cpty[0]
        d_rate_integral_now = self.diffusion_engine.d_dom_rate_integral[0]
        d_rate_integral_next = self.diffusion_engine.d_dom_rate_integral[1]
        d_def = self.diffusion_engine.d_def_indicators[0]
        d_labels_by_cpty = self.diffusion_engine.d_mtm_by_cpty[1]
        t_out = torch.empty((self.diffusion_engine.num_defs_per_path,
                             self.diffusion_engine.num_paths),
                            dtype=torch.float32,
                            device=self.device)
        with cuda.devices.gpus[self.device.index]:
            d_out = cuda.as_cuda_array(t_out)
        if as_cuda_tensor:
            out = t_out
        else:
            out = cuda.pinned_array((self.diffusion_engine.num_defs_per_path,
                                     self.diffusion_engine.num_paths),
                                    dtype=np.float32)
        out[:] = 0
        if as_cuda_tensor:
            yield out.view(-1, 1)
        else:
            yield out.reshape(-1, 1)
        d_spread_integral_next.copy_to_device(
            self.diffusion_engine.spread_integrals[
                self.diffusion_engine.num_coarse_steps, 1:])
        d_rate_integral_next.copy_to_device(
            self.diffusion_engine.dom_rate_integral[
                self.diffusion_engine.num_coarse_steps])
        accumulate = False
        for t in range(self.diffusion_engine.num_coarse_steps - 1, -1, -1):
            d_spread_integral_now.copy_to_device(
                self.diffusion_engine.spread_integrals[t, 1:])
            d_rate_integral_now.copy_to_device(
                self.diffusion_engine.dom_rate_integral[t])
            d_mtm_next.copy_to_device(self.diffusion_engine.mtm_by_cpty[t + 1])
            d_def.copy_to_device(self.diffusion_engine.def_indicators[t])
            self.__cuda_build_labels_backward(d_spread_integral_now,
                                              d_spread_integral_next,
                                              d_rate_integral_now,
                                              d_rate_integral_next, d_mtm_next,
                                              d_labels_by_cpty, t > 0,
                                              accumulate)
            self.__cuda_aggregate_survival(d_labels_by_cpty, d_def, d_out)
            if as_cuda_tensor:
                yield out.view(-1, 1)
            else:
                d_out.copy_to_host(out)
                yield out.reshape(-1, 1)
            if not accumulate:
                accumulate = True
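Like predict in Example #15, this is a generator: it yields a zeroed label batch first and then one batch per coarse step, walking backward in time. A usage sketch with assumed consumer names (learner, consume):

gen = learner._build_labels_backward(as_cuda_tensor=False)
labels_T = next(gen)        # initial zeroed labels at the final time step
for labels in gen:          # one batch per step, from T-1 down to 0
    consume(labels)         # note: the same host buffer is reused each yield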
Example #30
def create_input(input_type, dtype, shape, order):
    float_dtypes = [np.float16, np.float32, np.float64]
    if dtype in float_dtypes:
        rand_ary = np.random.random(shape)
    else:
        rand_ary = cp.random.randint(100, size=shape)

    rand_ary = cp.array(rand_ary, dtype=dtype, order=order)

    if input_type == 'numpy':
        return np.array(cp.asnumpy(rand_ary), dtype=dtype, order=order)

    elif input_type == 'numba':
        return cuda.as_cuda_array(rand_ary)

    elif input_type == 'series':
        return cudf.Series(cuda.as_cuda_array(rand_ary))

    else:
        return rand_ary
Example #31
    def test_as_cuda_array(self):
        h_arr = np.arange(10)
        self.assertFalse(cuda.is_cuda_array(h_arr))
        d_arr = cuda.to_device(h_arr)
        self.assertTrue(cuda.is_cuda_array(d_arr))
        my_arr = MyArray(d_arr)
        self.assertTrue(cuda.is_cuda_array(my_arr))
        wrapped = cuda.as_cuda_array(my_arr)
        self.assertTrue(cuda.is_cuda_array(wrapped))
        # Their values must equal the original array
        np.testing.assert_array_equal(wrapped.copy_to_host(), h_arr)
        np.testing.assert_array_equal(d_arr.copy_to_host(), h_arr)
        # d_arr and wrapped must be the same buffer
        self.assertEqual(wrapped.device_ctypes_pointer.value,
                         d_arr.device_ctypes_pointer.value)
Example #32
    def test_kernel_arg(self):
        h_arr = np.arange(10)
        d_arr = cuda.to_device(h_arr)
        my_arr = MyArray(d_arr)
        wrapped = cuda.as_cuda_array(my_arr)

        @cuda.jit
        def mutate(arr, val):
            arr[cuda.grid(1)] += val

        val = 7
        mutate.forall(wrapped.size)(wrapped, val)

        np.testing.assert_array_equal(wrapped.copy_to_host(), h_arr + val)
        np.testing.assert_array_equal(d_arr.copy_to_host(), h_arr + val)
Example #33
    def test_ownership(self):
        # Get the deallocation queue
        ctx = cuda.current_context()
        deallocs = ctx.deallocations
        # Flush all deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
        # Make new device array
        d_arr = cuda.to_device(np.arange(100))
        # Convert it
        cvted = cuda.as_cuda_array(d_arr)
        # Drop reference to the original object such that
        # only `cvted` has a reference to it.
        del d_arr
        # There shouldn't be any new deallocations
        self.assertEqual(len(deallocs), 0)
        # Try to access the memory and verify its content
        np.testing.assert_equal(cvted.copy_to_host(), np.arange(100))
        # Drop last reference to the memory
        del cvted
        self.assertEqual(len(deallocs), 1)
        # Flush
        deallocs.clear()
Example #35
    def as_device_array(self, obj):
        # Unlike the variant in Example #28, this always re-wraps the object,
        # which can trigger a stream synchronization for foreign arrays.
        return cuda.as_cuda_array(obj)