Example #1
    def __init__(self, dtype_out,
            neutral, reduce_expr, map_expr=None, arguments=None,
            name="reduce_kernel", keep=False, options=None, preamble=""):
        ReductionKernel.__init__(self, dtype_out,
            neutral, reduce_expr, map_expr, arguments,
            name, keep, options, preamble)

        self.shared_size = self.block_size * self.dtype_out.itemsize
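For context, a minimal sketch of what that last line precomputes: the number of shared-memory bytes each block needs for its partial results. The block_size of 512 is an illustrative assumption, not taken from the snippet above.

import numpy

block_size = 512  # threads per block (illustrative assumption)
itemsize = numpy.dtype(numpy.float32).itemsize  # 4 bytes for float32
shared_size = block_size * itemsize  # 2048 bytes of shared memory per block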
Example #2
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
from pycuda.reduction import ReductionKernel

a = gpuarray.arange(400, dtype=numpy.float32)
b = gpuarray.arange(400, dtype=numpy.float32)

print(a)

krnl = ReductionKernel(numpy.float32,
                       neutral="0",
                       reduce_expr="a+b",
                       map_expr="x[i]*y[i]",
                       arguments="float *x, float *y")

my_dot_prod = krnl(a, b).get()
print(my_dot_prod)
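A quick CPU cross-check of the result (a small addition, not part of the original example):

print(numpy.dot(a.get(), b.get()))  # should match my_dot_prod up to float32 rounding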
Example #3
    def test_struct_reduce(self):
        preamble = """
        struct minmax_collector
        {
            float cur_min;
            float cur_max;

            __device__
            minmax_collector()
            { }

            __device__
            minmax_collector(float cmin, float cmax)
            : cur_min(cmin), cur_max(cmax)
            { }

            __device__ minmax_collector(minmax_collector const &src)
            : cur_min(src.cur_min), cur_max(src.cur_max)
            { }

            __device__ minmax_collector(minmax_collector const volatile &src)
            : cur_min(src.cur_min), cur_max(src.cur_max)
            { }

            __device__ minmax_collector volatile &operator=(
                minmax_collector const &src) volatile
            {
                cur_min = src.cur_min;
                cur_max = src.cur_max;
                return *this;
            }
        };

        __device__
        minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
        {
            return minmax_collector(
                fminf(a.cur_min, b.cur_min),
                fmaxf(a.cur_max, b.cur_max));
        }
        """
        mmc_dtype = np.dtype([("cur_min", np.float32), ("cur_max", np.float32)])

        from pycuda.curandom import rand as curand
        a_gpu = curand((20000,), dtype=np.float32)
        a = a_gpu.get()

        from pycuda.tools import register_dtype
        register_dtype(mmc_dtype, "minmax_collector")

        from pycuda.reduction import ReductionKernel
        red = ReductionKernel(mmc_dtype,
                neutral="minmax_collector(10000, -10000)",
                # FIXME: needs infinity literal in real use, ok here
                reduce_expr="agg_mmc(a, b)", map_expr="minmax_collector(x[i], x[i])",
                arguments="float *x", preamble=preamble)

        minmax = red(a_gpu).get()
        # print(minmax["cur_min"], minmax["cur_max"])
        # print(np.min(a), np.max(a))

        assert minmax["cur_min"] == np.min(a)
        assert minmax["cur_max"] == np.max(a)
Example #4
    def __init__(self, img_size, **kwargs):
        self.num = CorrelStage.num
        CorrelStage.num += 1
        self.verbose = kwargs.get("verbose", 0)
        self.debug(2, "Initializing with resolution", img_size)
        self.h, self.w = img_size
        self._ready = False
        self.nbIter = kwargs.get("iterations", 5)
        self.showDiff = kwargs.get("show_diff", False)
        if self.showDiff:
            import cv2
            cv2.namedWindow("Residual",
                            cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
        self.mul = kwargs.get("mul", 3)
        # These two store the dimensions of the last resampled array,
        # so that the output array is allocated only once (see resampleD)
        self.rX, self.rY = -1, -1
        # self.loop will be incremented every time getDisp is called
        # It will be used to measure performance and output some info
        self.loop = 0

        # Allocating stuff #

        # Grid and block for kernels called with the size of the image #
        # All the images and arrays in the kernels will be in order (x,y)
        self.grid = (int(ceil(self.w / 32)), int(ceil(self.h / 32)))
        self.block = (int(ceil(self.w / self.grid[0])),
                      int(ceil(self.h / self.grid[1])), 1)
        self.debug(3, "Default grid:", self.grid, "block", self.block)

        # We need the number of fields to allocate the G tables #
        self.Nfields = kwargs.get("Nfields")
        if self.Nfields is None:
            self.Nfields = len(kwargs.get("fields")[0])

        # Allocating everything we need #
        self.devG = []
        self.devFieldsX = []
        self.devFieldsY = []
        for i in range(self.Nfields):
            # devG stores the G arrays (used to compute the search direction)
            self.devG.append(gpuarray.empty(img_size, np.float32))
            # devFieldsX/Y store the fields value along X and Y
            self.devFieldsX.append(gpuarray.empty((self.h, self.w),
                                                  np.float32))
            self.devFieldsY.append(gpuarray.empty((self.h, self.w),
                                                  np.float32))
        # self.H stores the Hessian matrix
        self.H = np.zeros((self.Nfields, self.Nfields), np.float32)
        # and devHi stores its inverse
        self.devHi = gpuarray.empty((self.Nfields, self.Nfields), np.float32)
        # devOut is written with the difference of the images
        self.devOut = gpuarray.empty((self.h, self.w), np.float32)
        # devX stores the value of the parameters (what is actually computed)
        self.devX = gpuarray.empty((self.Nfields,), np.float32)
        # To store the search direction
        self.devVec = gpuarray.empty((self.Nfields,), np.float32)
        # To store the original image on the device
        self.devOrig = gpuarray.empty(img_size, np.float32)
        # To store the gradient along X of the original image on the device
        self.devGradX = gpuarray.empty(img_size, np.float32)
        # And along Y
        self.devGradY = gpuarray.empty(img_size, np.float32)

        # Locating the kernel file #
        kernelFile = kwargs.get("kernel_file")
        if kernelFile is None:
            self.debug(2, "Kernel file not specified")
            from crappy import __path__ as crappyPath
            kernelFile = crappyPath[0] + "/data/kernels.cu"
        # Reading kernels and compiling module #
        with open(kernelFile, "r") as f:
            self.debug(3, "Sourcing module")
            self.mod = SourceModule(f.read() % (self.w, self.h, self.Nfields))
        # Assigning functions to the kernels #
        # These kernels are defined in data/kernels.cu
        self._resampleOrigKrnl = self.mod.get_function('resampleO')
        self._resampleKrnl = self.mod.get_function('resample')
        self._gradientKrnl = self.mod.get_function('gradient')
        self._makeGKrnl = self.mod.get_function('makeG')
        self._makeDiff = self.mod.get_function('makeDiff')
        self._dotKrnl = self.mod.get_function('myDot')
        self._addKrnl = self.mod.get_function('kadd')
        # These use PyCUDA's reduction module to generate efficient kernels
        self._mulRedKrnl = ReductionKernel(np.float32,
                                           neutral="0",
                                           reduce_expr="a+b",
                                           map_expr="x[i]*y[i]",
                                           arguments="float *x, float *y")
        self._leastSquare = ReductionKernel(np.float32,
                                            neutral="0",
                                            reduce_expr="a+b",
                                            map_expr="x[i]*x[i]",
                                            arguments="float *x")
        # We could have used mulRedKrnl(x, x), but this is probably faster.

        # Getting texture references #
        self.tex = self.mod.get_texref('tex')
        self.tex_d = self.mod.get_texref('tex_d')
        self.texMask = self.mod.get_texref('texMask')
        # Setting proper flags #
        # All textures use normalized coordinates except for the mask
        for t in [self.tex, self.tex_d]:
            t.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
        for t in [self.tex, self.tex_d, self.texMask]:
            t.set_filter_mode(cuda.filter_mode.LINEAR)
            t.set_address_mode(0, cuda.address_mode.BORDER)
            t.set_address_mode(1, cuda.address_mode.BORDER)

        # Preparing kernels for less overhead when called #
        self._resampleOrigKrnl.prepare("Pii", texrefs=[self.tex])
        self._resampleKrnl.prepare("Pii", texrefs=[self.tex_d])
        self._gradientKrnl.prepare("PP", texrefs=[self.tex])
        self._makeDiff.prepare("PPPP",
                               texrefs=[self.tex, self.tex_d, self.texMask])
        self._addKrnl.prepare("PfP")
        # Reading original image if provided #
        if kwargs.get("img") is not None:
            self.setOrig(kwargs.get("img"))
        # Reading fields if provided #
        if kwargs.get("fields") is not None:
            self.setFields(kwargs.get("fields"))
        # Reading mask if provided #
        if kwargs.get("mask") is not None:
            self.setMask(kwargs.get("mask"))
Example #5
from pycuda.reduction import ReductionKernel
import numpy

dot = ReductionKernel(dtype_out=numpy.float32,
                      neutral="0",
                      reduce_expr="a+b",
                      map_expr="x[i]*y[i]",
                      arguments="const float *x, const float *y")

from pycuda.curandom import rand as curand

x = curand((1000 * 1000), dtype=numpy.float32)
y = curand((1000 * 1000), dtype=numpy.float32)
x_dot_y = dot(x, y).get()
x_dot_y_cpu = numpy.dot(x.get(), y.get())
print(x_dot_y)
print(x_dot_y_cpu)
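Because the GPU reduction accumulates the float32 products in a different order than numpy.dot, the two results agree only up to rounding; a hedged check (the tolerance is an assumption):

assert numpy.allclose(x_dot_y, x_dot_y_cpu, rtol=1e-4)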
Example #6
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.reduction import ReductionKernel

# Length of the vector
vector_length = 400

# Vectors A and B
input_vector_a = gpuarray.arange(vector_length, dtype=numpy.int32)
input_vector_b = gpuarray.arange(vector_length, dtype=numpy.int32)

# Parallel reduction operation
dot_product = ReductionKernel(numpy.int32,
                              arguments="int *x, int *y",
                              map_expr="x[i]*y[i]",
                              reduce_expr="a+b",
                              neutral="0")

# Run the kernel
dot_result = dot_product(input_vector_a, input_vector_b).get()

# Print the results
print("Vector A")
print(input_vector_a)

print("Vector B")
print(input_vector_b)

print("Resultado do Produto A * B")
print(dot_product)
Example #7
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import pycuda.autoinit
import numpy
from pycuda.reduction import ReductionKernel

n = 5
start = drv.Event()
end = drv.Event()
start.record()
d_a = gpuarray.arange(n, dtype=numpy.uint32)
d_b = gpuarray.arange(n, dtype=numpy.uint32)
# Reduction kernel
kernel = ReductionKernel(numpy.uint32,
                         neutral="0",
                         reduce_expr="a+b",
                         map_expr="d_a[i]*d_b[i]",
                         arguments="unsigned int *d_a, unsigned int *d_b")
# Dot product
d_result = kernel(d_a, d_b).get()
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("Vector A")
print(d_a)
print("Vector B")
print(d_b)
print("The computed dot product using reduction:")
print(d_result)
print("Dot Product on GPU")
print("%fs" % (secs))
Example #8
import numpy as np
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda.reduction import ReductionKernel

mod = SourceModule("""
  
  __global__ void CopyRow(float *x, float *y, int n)
  {
    
    for (int j = threadIdx.x + blockIdx.x*blockDim.x; j < n; j += blockDim.x * gridDim.x) {
        x[j] = y[j];
    }
  }
  """)
CopyRow = mod.get_function("CopyRow")

compareGpu = ReductionKernel(np.float32,
                             neutral="0",
                             reduce_expr="max(a, b)",
                             map_expr="abs( abs(x[i]) - abs(y[i]) )",
                             arguments="float *x, float *y")


def gpuMatMul(A, B, C, transa='n', transb='n', block=(32, 32, 1)):
    block = (TileMat, TileMat, 1)

    transa = transa.lower()
    transb = transb.lower()
    bx, by, bz = block
    Arow, Acol = A.shape
    Brow, Bcol = B.shape
    if bx > Tile:
        bx = Tile
    if by > Tile:
Example #9
import numpy as np
from pycuda import autoinit
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel

calc_ll_by_der_per_sample = ElementwiseKernel(
    "double *err_by_der, double sigma_lm, double *ll_by_der",
    "ll_by_der[i] = -0.5 * err_by_der[i]*err_by_der[i] / (sigma_lm*sigma_lm)",
    "calc_ll_by_der_per_sample")


#calc_diff = ElementwiseKernel(
#        "double *x, double *out, int N",
#        """out[i] = (x[i+1]-x[i])/dt - (y[i+1]-y[i])/dt""",
#        "calc_err_by_der_per_sample")

calc_sum_prime = ReductionKernel(np.float64,
                                 neutral="0",
                                 reduce_expr="a+b",
                                 map_expr=" i < N-1 ? x[i+1]-x[i] : 0",
                                 arguments="double *x, int N")

calc_sum_double_prime = ReductionKernel(
    np.float64,
    neutral="0",
    reduce_expr="a+b",
    map_expr=" ((0 < i) && (i < N-1)) ? x[i+1]-2*x[i]+x[i-1] : 0",
    arguments="double *x, int N")

calc_sum_abs_double_prime = ReductionKernel(
    np.float64,
    neutral="0",
    reduce_expr="a+b",
    map_expr=" ((0 < i) && (i < N-1)) ? abs(x[i+1]-2*x[i]+x[i-1]) : 0",
Example #10
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.scan import InclusiveScanKernel
from pycuda.reduction import ReductionKernel

seq = np.array([1, 2, 3, 4], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
sum_gpu = InclusiveScanKernel(np.int32, "a+b")
# analogous to lambda a, b: a + b
print(sum_gpu(seq_gpu).get())
print(np.cumsum(seq))

seq = np.array([1, 100, -3, -10000, 4, 10000, 66, 14, 21], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
max_gpu = InclusiveScanKernel(np.int32, "a > b ? a : b")
print(max_gpu(seq_gpu).get())
print(max_gpu(seq_gpu).get()[-1])
# print(np.max(seq))  # also 10000

# Dot product computed in parallel on the GPU;
# for now this only handles plain vectors, not 2D matrices

dot_prod = ReductionKernel(np.float32,
                           neutral="0",
                           reduce_expr="a+b",
                           map_expr="vec1[i]*vec2[i]",
                           arguments="float *vec1, float *vec2")

x = np.array([1, 2, 3]).astype(np.float32)
y = np.array([6, 7, 8]).astype(np.float32)
device_x = gpuarray.to_gpu(x)
device_y = gpuarray.to_gpu(y)
product = dot_prod(device_x, device_y)
print(product.get())
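
A quick CPU cross-check of the dot product above:

print(np.dot(x, y))  # 1*6 + 2*7 + 3*8 = 44.0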
Example #11
from pycuda import gpuarray as ga
from pycuda import driver
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel
import numpy as np

_axby = ElementwiseKernel(
    """pycuda::complex<double> a, pycuda::complex<double> *x,
       pycuda::complex<double> b, pycuda::complex<double> *y""",
    "x[i] = a * x[i] + b * y[i]")

_norm = ReductionKernel(np.complex128,
                        neutral="0",
                        reduce_expr="a+b",
                        map_expr="pow(abs(x[i]), 2)",
                        arguments="pycuda::complex<double> *x")


class Grid:
    def __init__(self, array):
        # Get the array.
        if type(array) is np.ndarray:
            self.g = ga.to_gpu(array)  # Copy data to the GPU.
        elif type(array) is ga.GPUArray:
            self.g = array  # GPUArray already initialized.
        else:
            raise TypeError("Invalid type")

#         # Create the aby function.
#         if self.g.dtype is np.dtype('complex128'):
#             cuda_type = 'pycuda::complex<double>'
Example #12
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel
from pycuda.curandom import rand as curand
import numpy

a = curand((1000 * 1000), dtype=numpy.float32)
b = curand((1000 * 1000), dtype=numpy.float32)

piKernel = ReductionKernel(numpy.float32,
                           neutral="0",
                           reduce_expr="a+b",
                           map_expr="float(x[i] * x[i] + y[i] * y[i]) <= 1.0f",
                           arguments="float *x, float*y")

pi = (4.0 * piKernel(a, b).get()) / (1000 * 1000)

print(pi)
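
The Monte Carlo estimate converges like 1 / sqrt(N), so with 10^6 samples the error is typically on the order of 1e-3. A quick check against numpy's reference value:

print(abs(pi - numpy.pi))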
Example #13
import time
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel

'''
size = 5
knl = ReductionKernel(dtype_out=np.float32, neutral="0", reduce_expr="a+b",
                      map_expr="x[i]", arguments="float *x")
a = np.random.randint(5, size=size).astype(np.float32)
a_gpu = gpuarray.to_gpu(a)
result_gpu = knl(a_gpu)

print(a)
print("\n")
print(reduction_cpu(a, size))
print("\n")
print(result_gpu.get())
'''

knl = ReductionKernel(dtype_out=np.float32, neutral="0", reduce_expr="a+b",
                      map_expr="x[i]", arguments="float *x")

time_cpu = []
time_knl = []

N = range(1, 3000)
for i in N:
  size = 32 * i
  a = np.random.randint(5, size = size).astype(np.float32)
  a_gpu = gpuarray.to_gpu(a)

  start = time.time()
  reduction_cpu(a, size)
  time_cpu.append(time.time() - start)

  start = time.time()
Example #14
                                       "sigmoid_double")

tanh_float_ker = ElementwiseKernel("float *Y, float *x", """
    Y[i] = tanh(x[i])
    """, "tanh_float")

tanh_double_ker = ElementwiseKernel(
    f"double *Y, double *x", """
    double pos_exp = exp (x[i]);
    double neg_exp = exp (-x[i]);
    Y[i] = (pos_exp - neg_exp) / (pos_exp + neg_exp)
    """, "tanh_double")

exp_sum_float_ker = ReductionKernel(np.float32,
                                    neutral="0.0",
                                    reduce_expr="a+b",
                                    map_expr="exp (x[i])",
                                    arguments=f"float *x")

softmax_float_ker = ElementwiseKernel("float *Y, float *x, float s",
                                      "Y[i] = exp (x[i]) / s", "softmax_float")

exp_sum_double_ker = ReductionKernel(np.float64,
                                     neutral="0.0",
                                     reduce_expr="a+b",
                                     map_expr="exp (x[i])",
                                     arguments="double *x")

softmax_double_ker = ElementwiseKernel("double *Y, double *x, double s",
                                       "Y[i] = exp (x[i]) / s",
                                       "softmax_double")
Example #15
#
# The utility functions for GPU computation
#
import numpy as np

from ..util import gpu_init

try:
    from pycuda.reduction import ReductionKernel
    from pycuda.elementwise import ElementwiseKernel

    # log|A| where A is a lower-triangular matrix:
    # logDiagSum(A, A.shape[0]+1)
    logDiagSum = ReductionKernel(np.float64,
                                 neutral="0",
                                 reduce_expr="a+b",
                                 map_expr="i%step==0?log(x[i]):0",
                                 arguments="double *x, int step")

    strideSum = ReductionKernel(np.float64,
                                neutral="0",
                                reduce_expr="a+b",
                                map_expr="i%step==0?x[i]:0",
                                arguments="double *x, int step")

    # np.trace(np.dot(A, B)), equivalently (A*B.T).sum(); A is a1 x a2, B is a2 x a1
    traceDot = ReductionKernel(
        np.float64,
        neutral="0",
        reduce_expr="a+b",
        map_expr="A[i]*B[(i%a1)*a2+i/a1]",