import cudapy as cp
from time import time

# Kernel: writes float(idx) into A[idx], i.e. fills A with its own indices.
# The `idx < len(A)` test leaves A untouched for any idx at or past its end.
# `idx` is never assigned in this file -- presumably cudapy supplies it as the
# per-thread x index when the compiled kernel is launched (TODO confirm).
#
# NOTE(review): the "<name> : <type>" comment directly above each kernel looks
# like a type signature meant for cudapy; keep it attached to the def and
# confirm whether cudapy actually reads it.

# rangeId : void (float* A)
def rangeId(A):
    if idx < len(A):
        A[idx] = float(idx)

# Compiled form of rangeId; it is not called in the lines visible here.
__rangeIdCall = cp.compile(rangeId)

# Kernel: computes one element of C = A * B per (idy, idx) pair.
# All three matrices are flat arrays indexed row-major: element (r, c) of a
# matrix with K columns lives at r * K + c. idy selects the output row and idx
# the output column; pairs outside the m x p output return without writing.
# `idx` and `idy` are not defined in this file -- presumably cudapy provides
# them as per-thread x / y indices (TODO confirm).

# matrixMultiply : void (float* A, float* B, float* C, int m, int n, int p)
# A is m x n
# B is n x p
# C is m x p
def matrixMultiply(A, B, C, m, n, p):
    row = idy
    col = idx
    result = 0.0
    if row >= m or col >= p:
        return
    for i in xrange(n):
        result += A[row * n + i] * B[i * p + col]
    C[row * p + col] = result

# Compiled form of matrixMultiply; it is not called in the lines visible here.
__matrixMultiplyCall = cp.compile(matrixMultiply)

# NOTE(review): m's use lies outside this view -- presumably a matrix
# dimension for a matrixMultiply run; confirm against the rest of the file.
m = 1200
# mandelbrot : void (float, float, float, float, int, int, int, int*) def mandelbrot(x0, y0, x1, y1, width, height, maxIter, output): if idx >= width * height: return dx = (x1 - x0) / float(width) dy = (y1 - y0) / float(height) x = x0 + float(idx % width) * dx y = y0 + float(idx / width) * dy output[idx] = mandel(x, y, maxIter) __mandelbrotCall = cp.compile([mandelbrot, mandel]) def mandelbrotCall(x0, y0, x1, y1, width, height, maxIter): cudaResult = cp.CudaArray.allocate(width * height, cp.Int) __mandelbrotCall(cp.dim3(width * height))(x0, y0, x1, y1, width, height, maxIter, cudaResult) return cudaResult.toHost() def scaleAndShift(x0, y0, x1, y1, scale, shiftX, shiftY): x0 *= scale x1 *= scale y0 *= scale y1 *= scale x0 += shiftX x1 += shiftX
import cudapy as cp

# Kernel: for every idx below len(X), replaces Y[idx] with
# alpha * X[idx] + Y[idx]. Indices at or past len(X) are left untouched.
# `idx` is never assigned in this file -- presumably cudapy supplies it as the
# per-thread index when the compiled kernel is launched (TODO confirm).
# The kernel body is kept exactly as written because it is handed to
# cp.compile rather than run as ordinary Python.

# saxpy : void (float alpha, float* X, float* Y)
def saxpy(alpha, X, Y):
    if idx < len(X):
        Y[idx] = alpha * X[idx] + Y[idx]

# Compile the kernel function
saxpyCall = cp.compile(saxpy)

# list(...) keeps X and Y real lists on Python 3, where map() returns an
# iterator that has no len(); on Python 2 the values are unchanged.
X = list(map(float, range(100)))
Y = list(map(float, range(100)))

# Transfer Y to device memory
Y = cp.CudaArray(Y)

# Make the SAXPY call
# NOTE(review): X is handed over as a plain Python list while Y is a
# CudaArray -- presumably cudapy copies list arguments to the device itself;
# confirm against the cudapy API.
saxpyCall(len(X))(5.0, X, Y)

# Convert the result back to Python list
result = Y.toList()

# Parenthesised so the line parses on both Python 2 and Python 3; with a
# single argument the printed output is the same on either.
print(result)