Code example #1
    def __call__(self, buffer_alloc):
        """
        Allocates the GPUTensor object as a view of a pre-allocated buffer.

        Arguments:
            buffer_alloc (DeviceAllocation): Memory handle returned by pycuda
                allocator
        """
        tensor_description = self.tensor_description
        layout = tensor_description.layout
        dtype = self.transformer.storage_dtype(tensor_description.dtype)

        if layout:
            gpudata = int(buffer_alloc) + (layout.offset * dtype.itemsize)
            strides = tuple([s * dtype.itemsize for s in layout.strides])
            new_tensor = GPUArray(layout.shape,
                                  dtype,
                                  gpudata=gpudata,
                                  strides=strides)
        else:
            gpudata = int(buffer_alloc) + tensor_description.offset
            new_tensor = GPUArray(tensor_description.shape,
                                  dtype,
                                  gpudata=gpudata,
                                  strides=tensor_description.strides)
        self._tensor = new_tensor
        self.transformer.tensors[self.tensor_name] = self._tensor
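The view-of-a-buffer pattern used by __call__ above can be exercised directly with stock pycuda. A minimal sketch, assuming a context created by pycuda.autoinit; the pool size, byte offset, and shape are illustrative only:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.gpuarray import GPUArray

dtype = np.dtype(np.float32)
buffer_alloc = cuda.mem_alloc(16 * dtype.itemsize)  # pre-allocated pool

# View the second half of the pool as a (2, 4) tensor at a byte offset.
view = GPUArray((2, 4), dtype, gpudata=int(buffer_alloc) + 8 * dtype.itemsize)
view.fill(1.0)
print(view.get())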
Code example #2
    def __setitem__(self, key, value):
        sliced = self.__getitem__(key)

        # Use fill for scalar values
        if type(value) == np.float32 or type(value) == np.float64 or \
                type(value) == float:
            sliced.fill(value)
        elif type(value) == np.int32 or type(value) == np.int64 or \
                type(value) == int:
            sliced.fill(value)
        elif self.tensor.shape == () or np.prod(self.tensor.shape) == 1:
            sliced.fill(value)
        elif np.sum(self.tensor.strides) == 0:
            view = GPUArray((1, ), dtype=self.tensor.dtype)
            view.fill(value)
        else:
            # Convert to correct dtype if necessary
            if value.dtype != self.tensor.dtype:
                new_value = np.ndarray(self.tensor.shape, dtype=self.tensor.dtype)
                new_value[:] = value
                value = new_value

            # Reshape to satisfy pycuda if necessary
            if sliced.shape != value.shape:
                sliced = self.tensor.reshape(value.shape)

            if self.is_contiguous and self.strides_contiguous(value):
                sliced[:] = value
            elif type(value) == GPUArray:
                self.from_other(value, sliced)
            else:
                contig_tensor = GPUArray(value.shape, self.tensor.dtype)
                contig_tensor[:] = value
                self.from_other(contig_tensor, sliced)
Code example #3
File: blasext.py Project: zwghit/PyFR
    def errest(self, x, y, z, *, norm):
        if x.traits != y.traits != z.traits:
            raise ValueError('Incompatible matrix types')

        # Wrap
        xarr = GPUArray(x.leaddim*x.nrow, x.dtype, gpudata=x)
        yarr = GPUArray(y.leaddim*y.nrow, y.dtype, gpudata=y)
        zarr = GPUArray(z.leaddim*z.nrow, z.dtype, gpudata=z)

        # Norm type
        reduce_expr = 'a + b' if norm == 'l2' else 'max(a, b)'

        # Build the reduction kernel
        rkern = ReductionKernel(
            x.dtype, neutral='0', reduce_expr=reduce_expr,
            map_expr='pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2)',
            arguments='{0}* x, {0}* y, {0}* z, {0} atol, {0} rtol'
                      .format(npdtype_to_ctype(x.dtype))
        )

        class ErrestKernel(ComputeKernel):
            @property
            def retval(self):
                return self._retarr.get()

            def run(self, queue, atol, rtol):
                self._retarr = rkern(xarr, yarr, zarr, atol, rtol,
                                     stream=queue.cuda_stream_comp)

        return ErrestKernel()
Code example #4
    def get(self, tensor):
        """
        Copy the device tensor to a numpy array.

        Arguments:
            tensor (np.ndarray): Optional output array

        Returns:
            Numpy array containing tensor data
        """
        if np.sum(self.tensor.strides) != 0:
            if self.is_contiguous or self.tensor.shape == () or np.prod(self.tensor.shape) == 1:
                contig_tensor = self.tensor
            else:
                # Need to do memcpy from contiguous device memory
                contig_tensor = self.as_contiguous()

            if tensor is None:
                return contig_tensor.get()
            tensor[:] = contig_tensor.get()
        else:
            # Tensor is just a broadcasted scalar, get scalar value and fill output array
            view = GPUArray((1, ), dtype=self.tensor.dtype, gpudata=self.tensor.gpudata)[0]
            value = view.get()

            if tensor is None:
                out = np.ndarray(self.tensor.shape, dtype=self.tensor.dtype)
                out.fill(value)
                return out
            tensor.fill(value)

        return tensor
Code example #5
File: gputransform.py Project: leonllm/ngraph
    def __setitem__(self, key, value):
        sliced = self.__getitem__(key)

        # Use fill for scalar values
        # convert value to numpy
        if type(value) == float:
            value = np.float64(value)
        elif type(value) == int:
            value = np.int64(value)
        elif isinstance(value, np.ndarray):
            # handle 0-d and 1-d conversion to scalar
            if value.shape == ():
                value = value[()]
            elif value.shape == (1, ):
                value = value[0]

        # flex: added astype to deal with GPUArray dtype int16
        # FLEX TODO: assumed same behavior for all cases
        if type(value) in (np.int32, np.int64, int, np.uint32, np.float32,
                           np.float64):
            sliced.fill(value.astype(sliced.dtype))
        elif self.tensor.shape == () or np.prod(self.tensor.shape) == 1:
            sliced.fill(value.astype(sliced.dtype))
        elif np.sum(self.tensor.strides) == 0:
            view = GPUArray((1, ), dtype=self.tensor.dtype)
            view.fill(value.astype(sliced.dtype))
        else:
            # Convert to correct dtype if necessary
            if value.dtype != self.tensor.dtype:
                new_value = np.ndarray(value.shape, dtype=self.tensor.dtype)
                new_value[:] = value
                value = new_value

            # Reshape to satisfy pycuda if necessary
            if sliced.shape != value.shape:
                sliced = self.tensor.reshape(value.shape)

            if self.is_contiguous and self.strides_contiguous(value):
                if sliced.shape == ():
                    sliced.reshape((1, ))[:] = value.reshape((1, ))
                else:
                    sliced[:] = value
            elif type(value) == GPUArray:
                self.from_other(value, sliced)
            else:
                contig_tensor = GPUArray(value.shape, self.tensor.dtype)
                contig_tensor[:] = value
                self.from_other(contig_tensor, sliced)
Code example #6
    def __getitem__(self, index):
        if index is None or index == _none_slice or index == ():
            return self.tensor
        elif not isinstance(index, tuple):
            index = (index,)

        # Slice tensor by changing shape, strides, and base address
        new_shape = []
        new_offset = 0
        new_strides = []
        seen_ellipsis = False

        shape = self.tensor.shape
        dtype = self.tensor.dtype
        strides = self.tensor.strides

        # Iterate over axes of index to compute new offset, shape, strides
        array_axis = 0
        for index_axis in range(len(index)):
            index_entry = index[index_axis]

            if array_axis > len(shape):
                raise IndexError("Too many axes in index")

            if isinstance(index_entry, slice):
                # Standard slicing (start:stop:step)
                start, stop, idx_strides = index_entry.indices(shape[array_axis])

                new_offset += (start * strides[array_axis])
                new_shape.append(-((start - stop) // idx_strides))
                new_strides.append(idx_strides * strides[array_axis])

                array_axis += 1
            elif isinstance(index_entry, (int, np.integer)):
                # Single index value
                new_offset += (index_entry * strides[array_axis])
                array_axis += 1
            elif index_entry is Ellipsis:
                # Use same shape as original for these axes
                if seen_ellipsis:
                    raise IndexError(
                        "More than one ellipsis not allowed in index")
                seen_ellipsis = True

                remaining_index_count = len(index) - (index_axis + 1)
                new_array_axis = len(shape) - remaining_index_count
                if new_array_axis < array_axis:
                    raise IndexError("Invalid use of ellipsis in index")
                while array_axis < new_array_axis:
                    new_shape.append(shape[array_axis])
                    new_strides.append(strides[array_axis])
                    array_axis += 1
            else:
                raise IndexError("Invalid subindex %s in axis %d" % (index_entry, index_axis))

        # Create view
        return GPUArray(new_shape,
                        dtype,
                        strides=new_strides,
                        gpudata=(self.tensor.gpudata + new_offset))
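The slicing above only recomputes shape, strides, and a byte offset over the same allocation. A small self-contained sketch of that view mechanism with plain pycuda, assuming pycuda.autoinit; the array contents are arbitrary:

import numpy as np
import pycuda.autoinit
from pycuda.gpuarray import GPUArray, to_gpu

base = to_gpu(np.arange(12, dtype=np.float32).reshape(3, 4))

# View of the second row: same device allocation, offset by one row of bytes.
row1 = GPUArray((4,), base.dtype,
                gpudata=int(base.gpudata) + base.strides[0],
                strides=(base.strides[1],))
print(row1.get())  # expected: [4. 5. 6. 7.]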
Code example #7
File: protobackends.py Project: kevinyuan/aeon
 def consume(self, buf_index, hostlist, devlist):
     assert 0 <= buf_index < 2, 'Can only double buffer'
     self.ctx.push()
     hbuf = hostlist[buf_index]
     if devlist[buf_index] is None:
         shape, dtype = hbuf.shape[::-1], hbuf.dtype
         devlist[buf_index] = GPUArray(shape, dtype)
     devlist[buf_index].set(hbuf.T)
     self.ctx.pop()
Code example #8
def rand(shape, dtype=numpy.float32, stream=None):
    from pycuda.gpuarray import GPUArray
    from pycuda.elementwise import get_elwise_kernel

    result = GPUArray(shape, dtype)
    
    if dtype == numpy.float32:
        func = get_elwise_kernel(
            "float *dest, unsigned int seed", 
            md5_code + """
            #define POW_2_M32 (1/4294967296.0f)
            dest[i] = a*POW_2_M32;
            if ((i += total_threads) < n)
                dest[i] = b*POW_2_M32;
            if ((i += total_threads) < n)
                dest[i] = c*POW_2_M32;
            if ((i += total_threads) < n)
                dest[i] = d*POW_2_M32;
            """,
            "md5_rng_float")
    elif dtype == numpy.float64:
        func = get_elwise_kernel(
            "double *dest, unsigned int seed", 
            md5_code + """
            #define POW_2_M32 (1/4294967296.0)
            #define POW_2_M64 (1/18446744073709551616.)

            dest[i] = a*POW_2_M32 + b*POW_2_M64;

            if ((i += total_threads) < n)
            {
              dest[i] = c*POW_2_M32 + d*POW_2_M64;
            }
            """,
            "md5_rng_float")
    elif dtype in [numpy.int32, numpy.uint32]:
        func = get_elwise_kernel(
            "unsigned int *dest, unsigned int seed", 
            md5_code + """
            dest[i] = a;
            if ((i += total_threads) < n)
                dest[i] = b;
            if ((i += total_threads) < n)
                dest[i] = c;
            if ((i += total_threads) < n)
                dest[i] = d;
            """,
            "md5_rng_int")
    else:
        raise NotImplementedError;

    func.set_block_shape(*result._block)
    func.prepared_async_call(result._grid, stream,
            result.gpudata, numpy.random.randint(2**31-1), result.size)
    
    return result
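A usage sketch for rand() above. It assumes the legacy pycuda interface this snippet targets (md5_code in scope, kernels exposing set_block_shape, and the GPUArray _block/_grid attributes) and a context from pycuda.autoinit; the shape is arbitrary:

import numpy
import pycuda.autoinit

samples = rand((4, 256), dtype=numpy.float32)
print(samples.get().mean())  # uniform values in [0, 1), mean near 0.5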
Code example #9
def toGpuArray(f):
    """Converts a waLBerla GPUField to a pycuda GPUArray"""
    if not f:
        return None
    dtype = np.dtype(f.dtypeStr)
    strides = [dtype.itemsize * a for a in f.strides]
    return GPUArray(f.sizeWithGhostLayers,
                    dtype,
                    gpudata=f.ptr,
                    strides=strides)
Code example #10
File: parray.py Project: bionet/vtem
def arrayp2g(pary):
    """convert a PitchArray to a GPUArray"""
    from pycuda.gpuarray import GPUArray
    result = GPUArray(pary.shape, pary.dtype)
    if pary.size:
        if pary.M == 1:
            cuda.memcpy_dtod(result.gpudata, pary.gpudata, pary.mem_size * pary.dtype.itemsize)
        else:
            PitchTrans(pary.shape, result.gpudata, _pd(result.shape), pary.gpudata, pary.ld, pary.dtype)
            
    return result
Code example #11
File: util.py Project: smessing/striate
def dot(x,y):
  timer.start()
  if isinstance(x, GPUArray):
    assert isinstance(y, GPUArray)
    if x.shape == (1,):
      assert y.shape[0] == 1
      y *= scalar(x)
      return y.ravel()
    elif y.shape == (1,):
      assert x.shape[1] == 1
      x *= scalar(y)
      return x.ravel()
    elif len(x.shape) == 1 and len(y.shape) == 1:
      return scalar(pycuda.gpuarray.dot(x,y))
    else:
      needs_ravel = False
      if len(x.shape) == 1:
        needs_ravel = True
        x = x.reshape((1,) + x.shape)
      if len(y.shape) == 1:
        needs_ravel = True
        y = y.reshape(y.shape + (1,))

      #result = linalg.dot(x, y)
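      # cuBLAS-style sgemm is column-major, so passing the row-major x and y as
      # transposed operands ('t', 't') makes this call compute the product of
      # x and y in transposed layout in result; transpose() below flips it back.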
      result = GPUArray((y.shape[1], x.shape[0]), dtype=x.dtype)
      sgemm('t', 't', x.shape[0], y.shape[1], x.shape[1], 1.0,
            x.gpudata, x.shape[1], y.gpudata, y.shape[1], 0.0,
            result.gpudata, result.shape[1])
      result = transpose(result)

      if needs_ravel:
        assert result.shape[1] == 1 or result.shape[0] == 1
        result = result.ravel()
      timer.end('dot')
      return result
  else:
    return np.dot(x,y)
Code example #12
File: cuda_kernel.py Project: sportsbitenews/fastnet
def dot(x, y):
    if not CUBLAS_ENABLED:
        return gpuarray.to_gpu(np.dot(x.get(), y.get()))

    if isinstance(x, GPUArray):
        result = GPUArray((y.shape[1], x.shape[0]), dtype=x.dtype)
        #util.log_info('%s %s %s', x.shape, y.shape, result.shape)
        #util.log_info('%s %s %s', x.ptr, y.ptr, result.ptr)
        sgemm('t', 't', x.shape[0], y.shape[1], x.shape[1], 1.0, x.gpudata,
              x.shape[1], y.gpudata, y.shape[1], 0.0, result.gpudata,
              result.shape[1])
        result = transpose(result)
        return result
    else:
        return np.dot(x, y)
Code example #13
File: cuda_kernel.py Project: phecy/striate
def dot(x, y):
    timer.start()
    if isinstance(x, GPUArray):
        assert isinstance(y, GPUArray)
        if x.shape == (1, ):
            assert y.shape[0] == 1
            y *= scalar(x)
            return y.ravel()
        elif y.shape == (1, ):
            assert x.shape[1] == 1
            x *= scalar(y)
            return x.ravel()
        elif len(x.shape) == 1 and len(y.shape) == 1:
            return scalar(pycuda.gpuarray.dot(x, y))
        else:
            needs_ravel = False
            if len(x.shape) == 1:
                needs_ravel = True
                x = x.reshape((1, ) + x.shape)
            if len(y.shape) == 1:
                needs_ravel = True
                y = y.reshape(y.shape + (1, ))
            #result = linalg.dot(x, y)
            result = GPUArray((y.shape[1], x.shape[0]), dtype=x.dtype)
            sgemm('t', 't', x.shape[0], y.shape[1], x.shape[1], 1.0, x.gpudata,
                  x.shape[1], y.gpudata, y.shape[1], 0.0, result.gpudata,
                  result.shape[1])
            result = transpose(result)

            if needs_ravel:
                assert result.shape[1] == 1 or result.shape[0] == 1
                result = result.ravel()
            timer.end('dot')
            return result
    else:
        return np.dot(x, y)
Code example #14
    def as_contiguous(self):
        """
        Creates a new GPUArray with the same dimensions, but using contiguous memory

        Returns:
            New contiguous GPUArray with separate underlying device allocation
        """
        contig_tensor = GPUArray(self.tensor.shape, dtype=self.tensor.dtype)
        src_strides = [s // self.tensor.dtype.itemsize for s in self.tensor.strides]
        dst_strides = [s // contig_tensor.dtype.itemsize for s in contig_tensor.strides]
        kernel = _get_copy_transpose_kernel(self.tensor.dtype,
                                            self.tensor.shape,
                                            range(len(self.tensor.shape)))
        params = [contig_tensor.gpudata, self.tensor.gpudata] + list(kernel.args)
        params = params + src_strides + dst_strides
        kernel.prepared_async_call(kernel.grid, kernel.block, None, *params)
        return contig_tensor
Code example #15
    def __call__(self, buffer_alloc):
        """
        Allocates the GPUTensor object as a view of a pre-allocated buffer.

        Arguments:
            buffer_alloc (DeviceAllocation): Memory handle returned by pycuda
                allocator
        """
        tensor_description = self.tensor_description

        gpudata = int(buffer_alloc) + tensor_description.offset
        new_tensor = GPUArray(tensor_description.shape,
                              tensor_description.dtype,
                              gpudata=gpudata,
                              strides=tensor_description.strides)

        self._tensor = new_tensor
        self.transformer.tensors[self.tensor_name] = self._tensor
Code example #16
File: cuda_extension.py Project: xuhenry/walberla
def toGpuArray(f, withGhostLayers=True):
    """Converts a waLBerla GPUField to a pycuda GPUArray"""
    if not f:
        return None
    dtype = np.dtype(f.dtypeStr)
    strides = [dtype.itemsize * a for a in f.strides]
    res = GPUArray(f.sizeWithGhostLayers,
                   dtype,
                   gpudata=f.ptr,
                   strides=strides)
    if withGhostLayers is True:
        return res

    ghostLayers = normalizeGhostlayerInfo(f, withGhostLayers)
    glCutoff = [f.nrOfGhostLayers - gl for gl in ghostLayers]
    res = res[glCutoff[0]:-glCutoff[0] if glCutoff[0] > 0 else None,
              glCutoff[1]:-glCutoff[1] if glCutoff[1] > 0 else None,
              glCutoff[2]:-glCutoff[2] if glCutoff[2] > 0 else None, :]
    return res
Code example #17
File: protobackends.py Project: kevinyuan/aeon
    def consume(self, buf_index, hostlist, devlist):
        assert 0 <= buf_index < 2, 'Can only double buffer'
        hbuf = hostlist[buf_index]

        frag_sz, ndims, ndtype = hbuf.shape[0] // self.num_dev, hbuf.shape[
            1], hbuf.dtype

        # Create fragment array destination if missing
        if devlist[buf_index] is None:
            devlist[buf_index] = []
            for ctx in self.ctxs:
                ctx.push()
                devlist[buf_index].append(GPUArray((ndims, frag_sz), ndtype))
                ctx.pop()

        # Initiate the transfer
        for idx, ctx, dbuf, strm in zip(self.device_ids, self.ctxs,
                                        devlist[buf_index], self.streams):
            ctx.push()
            dbuf.set_async(hbuf[idx * frag_sz:(idx + 1) * frag_sz, :].T, strm)
            ctx.pop()
Code example #18
# Generate coordinates of non-uniform points.
kx = np.random.uniform(-np.pi, np.pi, size=M)
ky = np.random.uniform(-np.pi, np.pi, size=M)

# Generate source strengths.
c = (np.random.standard_normal((n_transf, M)) + 1j * np.random.standard_normal(
    (n_transf, M)))

# Cast to desired datatype.
kx = kx.astype(dtype)
ky = ky.astype(dtype)
c = c.astype(complex_dtype)

# Allocate memory for the uniform grid on the GPU.
fk_gpu = GPUArray((n_transf, N1, N2), dtype=complex_dtype)

# Initialize the plan and set the points.
plan = cufinufft(1, (N1, N2), n_transf, eps=eps, dtype=dtype)
plan.set_pts(to_gpu(kx), to_gpu(ky))

# Execute the plan, reading from the strengths array c and storing the
# result in fk_gpu.
plan.execute(to_gpu(c), fk_gpu)

# Retrieve the result from the GPU.
fk = fk_gpu.get()

# Check accuracy of the transform at position (nt1, nt2).
nt1 = int(0.37 * N1)
nt2 = int(0.26 * N2)
Code example #19
# Generate coordinates of non-uniform points.
kx = np.random.uniform(-np.pi, np.pi, size=M)
ky = np.random.uniform(-np.pi, np.pi, size=M)

# Generate grid values.
fk = (np.random.standard_normal((n_transf, N1, N2))
      + 1j * np.random.standard_normal((n_transf, N1, N2)))

# Cast to desired datatype.
kx = kx.astype(dtype)
ky = ky.astype(dtype)
fk = fk.astype(complex_dtype)

# Allocate memory for the nonuniform coefficients on the GPU.
c_gpu = GPUArray((n_transf, M), dtype=complex_dtype)

# Initialize the plan and set the points.
plan = cufinufft(2, (N1, N2), n_transf, eps=eps, dtype=dtype)
plan.set_pts(to_gpu(kx), to_gpu(ky))

# Execute the plan, reading from the uniform grid fk and storing the result
# in c_gpu.
plan.execute(c_gpu, to_gpu(fk))

# Retrieve the result from the GPU.
c = c_gpu.get()

# Check accuracy of the transform at index jt.
jt = M // 2
Code example #20
 def __init__(self, value):
     if not isinstance(value, GPUArray):
         value = GPUArray.to_gpu(value)
     self.shape = value.shape
     self.dtype = value.dtype
Code example #21
 def fft(self, src: cua.GPUArray, dest: cua.GPUArray = None):
     """
     Compute the forward FFT
     :param src: the source GPUarray
     :param dest: the destination GPUarray. Should be None for an inplace transform
     :return: the transformed array. For a R2C inplace transform, the complex view of the
         array is returned.
     """
     if self.inplace:
         if dest is not None:
             if src.gpudata != dest.gpudata:
                 raise RuntimeError(
                     "VkFFTApp.fft: dest is not None but this is an inplace transform"
                 )
         if self.batch_shape is not None:
             s = src.reshape(self.batch_shape)
         else:
             s = src
         _vkfft_cuda.fft(self.app, int(s.gpudata), int(s.gpudata))
         if self.norm == "ortho":
             if self.precision == 2:
                 src *= np.float16(self._get_fft_scale(norm=0))
             elif self.precision == 4:
                 src *= np.float32(self._get_fft_scale(norm=0))
             elif self.precision == 8:
                 src *= np.float64(self._get_fft_scale(norm=0))
         if self.r2c:
             if src.dtype == np.float32:
                 return src.view(dtype=np.complex64)
             elif src.dtype == np.float64:
                 return src.view(dtype=np.complex128)
         return src
     else:
         if dest is None:
             raise RuntimeError(
                 "VkFFTApp.fft: dest is None but this is an out-of-place transform"
             )
         elif src.gpudata == dest.gpudata:
             raise RuntimeError(
                 "VkFFTApp.fft: dest and src are identical but this is an out-of-place transform"
             )
         if self.r2c:
             assert (src.size == dest.size // dest.shape[-1] * 2 *
                     (dest.shape[-1] - 1))
         if self.batch_shape is not None:
             s = src.reshape(self.batch_shape)
             if self.r2c:
                 c_shape = tuple(
                     list(self.batch_shape[:-1]) +
                     [self.batch_shape[-1] // 2 + 1])
                 d = dest.reshape(c_shape)
             else:
                 d = dest.reshape(self.batch_shape)
         else:
             s, d = src, dest
         _vkfft_cuda.fft(self.app, int(s.gpudata), int(d.gpudata))
         if self.norm == "ortho":
             if self.precision == 2:
                 dest *= np.float16(self._get_fft_scale(norm=0))
             elif self.precision == 4:
                 dest *= np.float32(self._get_fft_scale(norm=0))
             elif self.precision == 8:
                 dest *= np.float64(self._get_fft_scale(norm=0))
         return dest
Code example #22
 def ifft(self, src: cua.GPUArray, dest: cua.GPUArray = None):
     """
     Compute the backward FFT
     :param src: the source GPUarray
     :param dest: the destination GPUarray. Should be None for an inplace transform
     :return: the transformed array. For a C2R inplace transform, the float view of the
         array is returned.
     """
     if self.inplace:
         if dest is not None:
             if src.gpudata != dest.gpudata:
                 raise RuntimeError(
                     "VkFFTApp.fft: dest!=src but this is an inplace transform"
                 )
         if self.batch_shape is not None:
             if self.r2c:
                 src_shape = tuple(
                     list(self.batch_shape[:-1]) +
                     [self.batch_shape[-1] // 2])
                 s = src.reshape(src_shape)
             else:
                 s = src.reshape(self.batch_shape)
         else:
             s = src
         _vkfft_cuda.ifft(self.app, int(s.gpudata), int(s.gpudata))
         if self.norm == "ortho":
             if self.precision == 2:
                 src *= np.float16(self._get_ifft_scale(norm=0))
             elif self.precision == 4:
                 src *= np.float32(self._get_ifft_scale(norm=0))
             elif self.precision == 8:
                 src *= np.float64(self._get_ifft_scale(norm=0))
         if self.r2c:
             if src.dtype == np.complex64:
                 return src.view(dtype=np.float32)
             elif src.dtype == np.complex128:
                 return src.view(dtype=np.float64)
         return src
     if not self.inplace:
         if dest is None:
             raise RuntimeError(
                 "VkFFTApp.ifft: dest is None but this is an out-of-place transform"
             )
         elif src.gpudata == dest.gpudata:
             raise RuntimeError(
                 "VkFFTApp.ifft: dest and src are identical but this is an out-of-place transform"
             )
         if self.r2c:
             assert (dest.size == src.size // src.shape[-1] * 2 *
                     (src.shape[-1] - 1))
             # Special case, src and dest buffer sizes are different,
             # VkFFT is configured to go back to the source buffer
             if self.batch_shape is not None:
                 src_shape = tuple(
                     list(self.batch_shape[:-1]) +
                     [self.batch_shape[-1] // 2 + 1])
                 s = src.reshape(src_shape)
                 d = dest.reshape(self.batch_shape)
             else:
                 s, d = src, dest
             _vkfft_cuda.ifft(self.app, int(d.gpudata), int(s.gpudata))
         else:
             if self.batch_shape is not None:
                 s = src.reshape(self.batch_shape)
                 d = dest.reshape(self.batch_shape)
             else:
                 s, d = src, dest
             _vkfft_cuda.ifft(self.app, int(s.gpudata), int(d.gpudata))
         if self.norm == "ortho":
             if self.precision == 2:
                 dest *= np.float16(self._get_ifft_scale(norm=0))
             elif self.precision == 4:
                 dest *= np.float32(self._get_ifft_scale(norm=0))
             elif self.precision == 8:
                 dest *= np.float64(self._get_ifft_scale(norm=0))
         return dest
Code example #23
File: array_cuda.py Project: ueno-phys/pycbc
def zeros(length, dtype=np.float64):
    result = GPUArray(length, dtype=dtype)
    nwords = result.nbytes // 4  # memset_d32 expects an integer word count
    pycuda.driver.memset_d32(result.gpudata, 0, nwords)
    return result
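A short usage sketch for zeros() above, assuming a context from pycuda.autoinit; the length and dtype are illustrative:

import numpy as np
import pycuda.autoinit

buf = zeros(1024, dtype=np.float32)
assert not buf.get().any()  # every 32-bit word was cleared by memset_d32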
Code example #24
def test_project_shepp_logan(with_spline):
    from pycuda.gpuarray import to_gpu, GPUArray

    from sympy.matrices.dense import MutableDenseMatrix
    MutableDenseMatrix.__hash__ = lambda x: 1  # hash(tuple(x))
    try:
        import pyconrad.autoinit
        phantom3d = pyconrad.phantoms.shepp_logan(100, 100, 100)
        pyconrad.imshow(phantom3d, 'phantom')
    except Exception:
        phantom3d = np.random.rand(30, 31, 32)

    for i, projection_matrix in enumerate((m1, )):

        volume = pystencils.fields('volume: float32[100,100,100]')
        projections = pystencils.fields('projections: float32[1024,960]')
        volume.set_coordinate_origin_to_field_center()
        volume.coordinate_transform = sympy.rot_axis2(0.2)
        # volume.coordinate_transform = sympy.rot_axis3(0.1)
        volume.coordinate_transform = 3 * volume.coordinate_transform
        projections.set_coordinate_origin_to_field_center()

        kernel = forward_projection(volume,
                                    projections,
                                    projection_matrix,
                                    step_size=1,
                                    cubic_bspline_interpolation=with_spline)
        print(kernel)
        kernel = kernel.compile('gpu')
        # print(kernel.code)

        volume_gpu = to_gpu(np.ascontiguousarray(phantom3d, np.float32))
        if with_spline:
            pystencils.gpucuda.texture_utils.prefilter_for_cubic_bspline(
                volume_gpu)
        projection_gpu = GPUArray(projections.spatial_shape, np.float32)

        kernel(volume=volume_gpu, projections=projection_gpu)

        pyconrad.imshow(volume_gpu, 'volume ' + str(with_spline))
        pyconrad.imshow(projection_gpu,
                        'projections ' + str(i) + str(with_spline))

    for i, projection_matrix in enumerate((m1, )):
        angle = pystencils_reco.typed_symbols('angle', 'float32')

        volume = pystencils.fields('volume: float32[100,100,100]')
        projections = pystencils.fields('projections: float32[1024,960]')
        volume.set_coordinate_origin_to_field_center()
        volume.coordinate_transform = sympy.rot_axis2(angle)
        # volume.coordinate_transform = sympy.rot_axis3(0.1)
        volume.coordinate_transform = 3 * volume.coordinate_transform
        projections.set_coordinate_origin_to_field_center()

        kernel = forward_projection(volume,
                                    projections,
                                    projection_matrix,
                                    step_size=1,
                                    cubic_bspline_interpolation=with_spline)
        print(kernel)
        kernel = kernel.compile('gpu')
        # print(kernel.code)

        volume_gpu = to_gpu(np.ascontiguousarray(phantom3d, np.float32))
        if with_spline:
            pystencils.gpucuda.texture_utils.prefilter_for_cubic_bspline(
                volume_gpu)
        projection_gpu = GPUArray(projections.spatial_shape, np.float32)

        for phi in np.arange(0, np.pi, np.pi / 100):
            kernel(volume=volume_gpu, projections=projection_gpu, angle=phi)
            pyconrad.imshow(projection_gpu, 'rotation!' + str(with_spline))
        pyconrad.close_all_windows()
Code example #25
print(arg_dict)

formula = arg_dict.pop('formula', 'CH3COOH')
zoom = arg_dict.pop('zoom', 1.5)
repeat = arg_dict.pop('repeat', 4)
nlaunch = arg_dict.pop('nlaunch', 1)
block_per_sm = arg_dict.pop('block_per_sm', 8)
block_size = arg_dict.pop('block_size', 128)
device = pycuda.driver.Device(arg_dict.pop('device', 0))

njobs = device.MULTIPROCESSOR_COUNT * block_per_sm * repeat

g = Graph.from_ase(molecule(formula), adjacency=dict(h=zoom))

kernel = Kernel()
''' generate jobs '''
jobs = [Job(0, 0, GPUArray(len(g.nodes)**2, np.float32)) for i in range(njobs)]
''' call GPU kernel '''
for i in range(nlaunch):
    kernel.kernel._launch_kernel([g], jobs, nodal=False, lmin=0)

R = jobs[0].vr_gpu.get().reshape(len(g.nodes), -1)
r = R.sum()

print('Nodal similarity:\n', R, sep='')
print('Overall similarity:\n', r, sep='')

for job in jobs:
    assert (np.abs(job.vr_gpu.get().sum() - r) < r * 1e-6)
print('**ALL PASSED**')
Code example #26
File: ghz.py Project: fjarri-attic/bellsim-letter
def calculation(in_queue, out_queue):

    device_num, params = in_queue.get()

    chunk_size = params['chunk_size']
    chunks_num = params['chunks_num']
    particles = params['particles']
    state = params['state']
    representation = params['representation']
    quantities = params['quantities']

    decoherence = params['decoherence']
    if decoherence is not None:
        decoherence_steps = decoherence['steps']
        decoherence_coeff = decoherence['coeff']
    else:
        decoherence_steps = 0
        decoherence_coeff = 1

    binning = params['binning']
    if binning is not None:
        s = set()
        for names, _, _ in binning:
            s.update(names)
        quantities = sorted(list(s))

    c_dtype = numpy.complex128
    c_ctype = 'double2'
    s_dtype = numpy.float64
    s_ctype = 'double'
    Fs = []

    cuda.init()

    device = cuda.Device(device_num)
    ctx = device.make_context()
    free, total = cuda.mem_get_info()
    max_chunk_size = float(total) / len(quantities) / numpy.dtype(
        c_dtype).itemsize / 1.1
    max_chunk_size = 10**int(numpy.log(max_chunk_size) / numpy.log(10))
    #print free, total, max_chunk_size

    if max_chunk_size > chunk_size:
        subchunk_size = chunk_size
        subchunks_num = 1
    else:
        assert chunk_size % max_chunk_size == 0
        subchunk_size = max_chunk_size
        subchunks_num = chunk_size / subchunk_size

    buffers = []
    for quantity in sorted(quantities):
        buffers.append(GPUArray(subchunk_size, c_dtype))

    stream = cuda.Stream()

    # compile code
    try:
        source = TEMPLATE.render(c_ctype=c_ctype,
                                 s_ctype=s_ctype,
                                 particles=particles,
                                 state=state,
                                 representation=representation,
                                 quantities=quantities,
                                 decoherence_coeff=decoherence_coeff)
    except:
        print exceptions.text_error_template().render()
        raise

    try:
        module = SourceModule(source, no_extern_c=True)
    except:
        for i, l in enumerate(source.split("\n")):
            print i + 1, ":", l
        raise

    kernel_initialize = module.get_function("initialize")
    kernel_calculate = module.get_function("calculate")
    kernel_decoherence = module.get_function("decoherence")

    # prepare call parameters

    gen_block_size = min(kernel_initialize.max_threads_per_block,
                         kernel_calculate.max_threads_per_block)
    gen_grid_size = device.get_attribute(
        cuda.device_attribute.MULTIPROCESSOR_COUNT)
    gen_block = (gen_block_size, 1, 1)
    gen_grid = (gen_grid_size, 1, 1)

    num_gen = gen_block_size * gen_grid_size
    assert num_gen <= 20000

    # prepare RNG states

    #seeds = to_gpu(numpy.ones(size, dtype=numpy.uint32))
    seeds = to_gpu(
        numpy.random.randint(0, 2**32 - 1, size=num_gen).astype(numpy.uint32))
    state_type_size = sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
    states = cuda.mem_alloc(num_gen * state_type_size)

    #prev_stack_size = cuda.Context.get_limit(cuda.limit.STACK_SIZE)
    #cuda.Context.set_limit(cuda.limit.STACK_SIZE, 1<<14) # 16k
    kernel_initialize(states,
                      seeds.gpudata,
                      block=gen_block,
                      grid=gen_grid,
                      stream=stream)
    #cuda.Context.set_limit(cuda.limit.STACK_SIZE, prev_stack_size)

    # run calculation
    args = [states] + [buf.gpudata
                       for buf in buffers] + [numpy.int32(subchunk_size)]

    if binning is None:

        results = {
            quantity: numpy.zeros(
                (decoherence_steps + 1, chunks_num * subchunks_num), c_dtype)
            for quantity in quantities
        }
        for i in xrange(chunks_num * subchunks_num):
            kernel_calculate(*args,
                             block=gen_block,
                             grid=gen_grid,
                             stream=stream)

            for k in xrange(decoherence_steps + 1):
                if k > 0:
                    kernel_decoherence(*args,
                                       block=gen_block,
                                       grid=gen_grid,
                                       stream=stream)

                for j, quantity in enumerate(sorted(quantities)):
                    F = (gpuarray.sum(buffers[j], stream=stream) /
                         buffers[j].size).get()
                    results[quantity][k, i] = F

        for quantity in sorted(quantities):
            results[quantity] = results[quantity].reshape(
                decoherence_steps + 1, chunks_num,
                subchunks_num).mean(2).real.tolist()

        out_queue.put(results)

    else:

        bin_accums = [
            numpy.zeros(tuple([binnum] * len(vals)), numpy.int64)
            for vals, binnum, _ in binning
        ]
        bin_edges = [None] * len(binning)

        for i in xrange(chunks_num * subchunks_num):
            bin_edges = []
            kernel_calculate(*args,
                             block=gen_block,
                             grid=gen_grid,
                             stream=stream)
            results = {
                quantity: buffers[j].get().real
                for j, quantity in enumerate(sorted(quantities))
            }

            for binparam, bin_accum in zip(binning, bin_accums):
                qnames, binnum, ranges = binparam
                sample_lines = [results[quantity] for quantity in qnames]
                sample = numpy.concatenate(
                    [arr.reshape(subchunk_size, 1) for arr in sample_lines],
                    axis=1)

                hist, edges = numpy.histogramdd(sample, binnum, ranges)
                bin_accum += hist
                bin_edges.append(numpy.array(edges))

        results = [[acc.tolist(), edges.tolist()]
                   for acc, edges in zip(bin_accums, bin_edges)]

        out_queue.put(results)

    #ctx.pop()
    ctx.detach()
Code example #27
 def _make_array(self, shape, dtype):
     return GPUArray(shape, dtype)