def sqrt_normalize_gpu(img):
    """Apply a signed square root to a 4-channel image on the GPU.

    ``img[:, :, :3]`` and ``img[:, :, 3]`` are uploaded as two separate GPU
    arrays.  Every element ``v`` is replaced by ``s * sqrt(|v|)`` where ``s``
    is picked by ``gpuarray.if_positive`` from cached +1 / -1 arrays, and the
    two results are passed to ``normalize_gpu``.

    The +1 / -1 arrays live in the module-level globals ``posr``, ``negr``,
    ``posa`` and ``nega`` so they are only allocated when needed.

    Parameters
    ----------
    img : array indexable as ``img[:, :, c]`` with at least four channels.

    Returns
    -------
    Whatever ``normalize_gpu(rgb, a)`` returns.
    """
    global posr, negr, posa, nega, stream

    rgb = gpuarray.to_gpu(img[:, :, :3].copy())
    a = gpuarray.to_gpu(img[:, :, 3].copy())

    # Truth-testing a GPU array is not a dependable "not yet initialised"
    # check, and masks cached for a previous image are unusable if the
    # incoming image has a different shape or dtype.  Test explicitly.
    masks_stale = (
        not isinstance(posr, gpuarray.GPUArray)
        or posr.shape != rgb.shape
        or posr.dtype != rgb.dtype
    )
    if masks_stale:
        posr = gpuarray.zeros_like(rgb) + 1
        negr = gpuarray.zeros_like(rgb) - 1
        posa = gpuarray.zeros_like(a) + 1
        nega = gpuarray.zeros_like(a) - 1

    rgb = cumath.sqrt(abs(rgb), stream=stream) * gpuarray.if_positive(
        rgb, posr, negr, stream=stream)
    a = cumath.sqrt(abs(a), stream=stream) * gpuarray.if_positive(
        a, posa, nega, stream=stream)
    return normalize_gpu(rgb, a)
def test_sqrt(self):
    """tests if the sqrt function works"""
    a = simplearray.array(10).fill_arange() + 1
    b = cumath.sqrt(a)

    # Compare each GPU result against the CPU reference.  ``assertTrue`` is
    # used because the ``assert_`` alias is deprecated and no longer exists
    # in current unittest releases.
    for i in range(10):
        self.assertTrue(abs(math.sqrt(a[i]) - b[i]) < 1e-3)
def gaussian_norm(data, sigma=0.5, **kwargs):
    """
    Performs Gaussian normalization of an input dataset.

    Every voxel is normalized by subtracting the mean and dividing by the
    standard deviation, both estimated in a Gaussian neighbourhood around it.

    Parameters
    ----------
    data : 2 or 3 dimensional array
        The data to be filtered
    sigma : float or array of floats
        The standard deviation of the Gaussian filter used to estimate the
        mean and standard deviation of the kernel. Controls the radius and
        strength of the filter. If an array is given, it has to satisfy
        `len(sigma) = data.ndim`. Default: 0.5
    **kwargs : other named parameters
        Parameters are passed to `conv.make_gaussian_1d`. ``keep_gpu`` is
        always forced to ``True``.

    Returns
    -------
    result : 2 or 3 dimensional filtered `GPUArray`
        The result of the filtering resulting from PyCuda. Use `.get()` to
        retrieve the corresponding Numpy array.
    """
    # Intermediate results must stay on the device for the GPU math below.
    gpu_opts = dict(kwargs, keep_gpu=True)

    centered = gaussian_center(data, sigma=sigma, **gpu_opts)
    local_std = cumath.sqrt(gaussian(centered**2, sigma=sigma, **gpu_opts))

    # TODO numerical precision: denominators below ~1e-7 are not guarded.
    centered /= local_std
    return centered
def lcc_to_sphere_cuda(self, x, y, R=6370, truelat0=31.7, truelat1=31.7,
                       ref_lat=31.68858, stand_lon=-113.7):
    """Invert a Lambert conformal conic projection on the GPU.

    Parameters
    ----------
    x, y : projected coordinates; they are divided by ``R`` and fed to
        ``cumath`` functions, so they are expected to be GPU arrays
        (TODO confirm with callers).
    R : sphere radius, in the same length unit as ``x`` and ``y``.
    truelat0, truelat1 : the two true latitudes, in degrees.
    ref_lat : reference latitude, in degrees.
    stand_lon : standard longitude, in degrees.

    Returns
    -------
    (phis, lambdas) : latitudes and longitudes in degrees.
    """
    lat_ref = np.radians(ref_lat)
    lat_a = np.radians(truelat0)
    lat_b = np.radians(truelat1)
    lon_ref = np.radians(stand_lon)

    if truelat0 != truelat1:
        n = (np.log(np.cos(lat_a) / np.cos(lat_b)) / np.log(
            np.tan(np.pi / 4 + lat_b / 2) / np.tan(np.pi / 4 + lat_a / 2)))
    else:
        # NOTE(review): with a single true latitude the cone constant is
        # taken from ref_lat here, not from truelat0 -- confirm this is
        # intended.
        n = np.sin(lat_ref)

    F = (np.cos(lat_a) * np.power(np.tan(np.pi / 4 + lat_a / 2), n) / n)
    rho0 = F / np.power(np.tan(np.pi / 4 + lat_ref / 2), n)

    # Work in units of the sphere radius from here on.
    x_unit = x / R
    y_unit = y / R
    dy = y_unit - rho0
    rho = cumath.sqrt(x_unit * x_unit + dy * dy)

    inv_n = 1.0 / n
    numer = F**inv_n
    denom = rho**inv_n
    angle = self.atan2(numer, denom)

    phis = 360 * (angle - np.pi / 4) / np.pi
    lambdas = (cumath.asin(x_unit / rho) / n + lon_ref) * 180 / np.pi
    return phis, lambdas
def _get_updates(self, grads):
    """Compute the parameter increments for one optimisation step.

    Parameters
    ----------
    grads : list, length = len(coefs_) + len(intercepts_)
        Gradients with respect to coefs_ and intercepts_ of the MLP model,
        aligned one-to-one with the parameters.

    Returns
    -------
    updates : list, length = len(grads)
        The values to add to params
    """
    self.t += 1
    b1 = self.beta_1
    b2 = self.beta_2

    # Exponential moving average of the gradients.
    first_moments = []
    for m, grad in zip(self.ms, grads):
        first_moments.append(b1 * m + (1 - b1) * grad)
    self.ms = first_moments

    # Exponential moving average of the squared gradients.
    second_moments = []
    for v, grad in zip(self.vs, grads):
        second_moments.append(b2 * v + (1 - b2) * (grad**2))
    self.vs = second_moments

    # Step size with the bias correction of both averages folded in.
    self.learning_rate = (self.learning_rate_init *
                          np.sqrt(1 - b2**self.t) / (1 - b1**self.t))

    updates = []
    for m, v in zip(self.ms, self.vs):
        updates.append(
            -self.learning_rate * m / (cumath.sqrt(v) + self.epsilon))
    return updates
def magnitude(vec, vec2):
    """Combine the dot product and Euclidean norms of two vectors on the GPU.

    Both inputs are uploaded with ``to_gpu_async``.  The value computed is
    ``dot(vec, vec2) / |vec| + |vec2|`` in float32; it is printed and then
    copied back to the host.

    Parameters
    ----------
    vec, vec2 : host arrays accepted by ``gpuarr.to_gpu_async``.

    Returns
    -------
    The host copy (``.get()``) of the computed value.
    """
    gpu_arry = gpuarr.to_gpu_async(vec)
    gpu_arry2 = gpuarr.to_gpu_async(vec2)

    mag = cumath.sqrt(gpuarr.dot(gpu_arry, gpu_arry, dtype=np.float32))
    mag2 = cumath.sqrt(gpuarr.dot(gpu_arry2, gpu_arry2, dtype=np.float32))

    # NOTE(review): operator precedence makes this (dot / mag) + mag2.  If a
    # normalised dot product was intended it should be dot / (mag * mag2);
    # left unchanged pending confirmation.
    product = gpuarr.dot(gpu_arry, gpu_arry2, dtype=np.float32) / mag + mag2

    # Function-call form prints the same text on Python 2 and 3.
    print(product)
    return product.get()
def _sigma(self, sliceset, u, lower_bounds, upper_bounds):
    """Return the square root of the per-slice output of ``sorted_std_per_slice``.

    A float64 GPU buffer with one entry per slice is allocated, filled by the
    ``sorted_std_per_slice`` kernel from ``u`` and the slice bounds, and its
    element-wise square root is returned as a GPU array.
    """
    threads_per_block = 256
    # NOTE(review): floor division leaves a remainder of slices without a
    # thread of their own unless the kernel loops over slices -- confirm.
    n_blocks = max(sliceset.n_slices // threads_per_block, 1)

    per_slice = gpuarray.zeros(sliceset.n_slices, dtype=np.float64)
    # NOTE(review): the kernel is given self.n_slices while the buffer is
    # sized from sliceset.n_slices; presumably these agree -- verify.
    sorted_std_per_slice(
        lower_bounds.gpudata,
        upper_bounds.gpudata,
        u.gpudata,
        self.n_slices,
        per_slice.gpudata,
        block=(threads_per_block, 1, 1),
        grid=(n_blocks, 1, 1),
    )
    return cumath.sqrt(per_slice)
def diag_gpu(A, v1):
    """Run a Lanczos-style three-term recurrence for ``A`` on the GPU.

    Starting from ``v1`` (stored as row 1 of the basis ``Q``; row 0 stays
    zero), each step computes ``w = A^T q_i`` with cuBLAS, the coefficient
    ``alpha_i = w . q_i``, removes the ``q_i`` and ``q_{i-1}`` components
    from ``w``, stores ``beta_{i+1} = ||w||`` and the normalised ``w`` as
    ``q_{i+1}``.  The resulting ``alpha`` and ``beta`` vectors are printed.

    Parameters
    ----------
    A : square host array; the kernels are declared for ``double`` data, so
        float64 is assumed (TODO confirm callers always pass float64).
    v1 : start vector of length ``A.shape[0]``.

    Returns
    -------
    None.  The coefficients are only printed.
    """
    # handle
    current_handle = cublas.cublasCreate()
    # The cuBLAS handle is a device resource: release it even if a kernel
    # launch or transfer below raises.
    try:
        m = A.shape[0]
        Q = np.zeros((m, m), dtype=np.float64)
        # Q[0, :] = 0.0 is implied by the zero initialisation
        Q[1, :] = v1.copy()
        beta = np.zeros(m, dtype=np.float64)
        alpha = np.zeros(m, dtype=np.float64)

        # move data onto the GPU
        A_gpu = gpuarray.to_gpu(A)
        Q_gpu = gpuarray.to_gpu(Q)
        beta_gpu = gpuarray.to_gpu(beta)
        alpha_gpu = gpuarray.to_gpu(alpha)
        w = gpuarray.zeros(m, dtype=np.float64)

        # we define three kernels for simple arithmetic
        # (using -= to do inplace subtraction gives an incorrect answer)
        w_scale = ElementwiseKernel(
            arguments="double *w, double *alpha, double *beta, double *Q1, double *Q2, int loop_index",
            operation="w[i] = w[i] - (alpha[loop_index] * Q1[i]) - (beta[loop_index] * Q2[i])",
            name="element_wise_w_building")

        # sum of squares of x, i.e. the squared 2-norm
        norm_krnl = ReductionKernel(np.float64,
                                    neutral="0.0",
                                    reduce_expr="a+b",
                                    map_expr="x[i]*x[i]",
                                    arguments="double *x")

        # the kernel name must not have spaces!
        ediv = ElementwiseKernel(
            arguments="double *a, double *b, double *c, int loop_index",
            operation="a[i] = b[i] / c[loop_index+1]",
            name="element_wise_division")

        for i in range(1, m-1):
            # w = A^T * Q[i]
            cublas.cublasDgemv(handle=current_handle,
                               trans='T',
                               m=m, n=m,  # Hermitian matrix
                               alpha=1.0,
                               beta=0.0,
                               A=A_gpu.gpudata,
                               lda=m,
                               x=Q_gpu[i, :].gpudata,
                               incx=1,
                               y=w.gpudata,
                               incy=1,
                               )

            # alpha[i] = w . Q[i], expressed as a 1x1 matrix product
            cublas.cublasDgemm(handle=current_handle,
                               transa='n', transb='n',
                               m=1, n=1, k=m,
                               lda=1, ldb=m, ldc=1,
                               alpha=1.0,
                               beta=0.0,
                               A=w.gpudata,
                               B=Q_gpu[i, :].gpudata,
                               C=alpha_gpu[i].gpudata)

            w_scale(w, alpha_gpu, beta_gpu, Q_gpu[i, :], Q_gpu[i-1, :], i)
            beta_gpu[i+1] = cumath.sqrt(norm_krnl(w))
            ediv(Q_gpu[i+1, :], w, beta_gpu, i)
        # end of loop

        # last 2 steps
        cublas.cublasDgemv(handle=current_handle,
                           trans='T',
                           m=m, n=m,  # Hermitian matrix
                           alpha=1.0,
                           beta=0.0,
                           A=A_gpu.gpudata,
                           lda=m,
                           x=Q_gpu[-1, :].gpudata,
                           incx=1,
                           y=w.gpudata,
                           incy=1,)

        cublas.cublasDgemm(handle=current_handle,
                           transa='n', transb='n',
                           m=1, n=1, k=m,
                           lda=1, ldb=m, ldc=1,
                           alpha=1.0,
                           beta=0.0,
                           A=w.gpudata,
                           B=Q_gpu[-1, :].gpudata,
                           C=alpha_gpu[-1].gpudata)

        # retrieve the alphas and betas
        alpha_cpu = alpha_gpu.get()
        beta_cpu = beta_gpu.get()

        print("GPU: ", alpha_cpu, beta_cpu, sep="\n\n")

        # make tridiagonal matrix out of alpha and B
        # Tri = np.zeros(matrix_size)
    finally:
        cublas.cublasDestroy(current_handle)

    return
def sqrt_t(self, a, out):
    """Write the element-wise square root of ``a`` into ``out``.

    Delegates to ``cumath.sqrt`` with ``out`` passed by keyword; nothing is
    returned.
    """
    cumath.sqrt(a, out=out)
N = 100000

# --- Create random vectors on the CPU
h_a = np.random.randn(1, N)
h_b = np.random.randn(1, N)

# --- Set CPU arrays as single precision
h_a = h_a.astype(np.float32)
h_b = h_b.astype(np.float32)

d_a = gpuarray.to_gpu(h_a)
d_b = gpuarray.to_gpu(h_b)

# --- Time sqrt(|a|) + exp(b) on the device between the two events
start.record()
d_c = (cumath.sqrt(cumath.fabs(d_a)) + cumath.exp(d_b))
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("Processing time = %fs" % (secs))

# --- Compare against the CPU reference.  exp(b) can reach ~1e2, where one
# --- float32 rounding step is already close to 1e-5, so a relative term is
# --- added to the original absolute tolerance.
h_c = d_c.get()
h_ref = np.sqrt(np.abs(h_a)) + np.exp(h_b)
if np.allclose(h_c, h_ref, rtol=1e-5, atol=1e-5):
    print("Test passed!")
else:
    print("Error!")

# --- Flush context printf buffer
cuda.Context.synchronize()
def sqrt_t(self, a, out):
    """Write the element-wise square root of ``a`` into ``out``.

    ``out`` must be passed by keyword: the second positional parameter of
    the ``cumath`` unary functions is the stream, not the output array, so a
    positional ``out`` would not receive the result.
    """
    cumath.sqrt(a, out=out)
def cuda_field(ab, krv, cartesian=True, bohren=True):
    '''Returns the field scattered by the particle at each coordinate

    Parameters
    ----------
    ab : numpy.ndarray
        Mie scattering coefficients. Indexed as ``ab[n, 0]`` and
        ``ab[n, 1]`` for ``n`` in ``1 .. ab.shape[0] - 1``.
    krv : numpy.ndarray
        Reduced vector displacements of particle from image coordinates.
        Columns 0, 1 and 2 are read as the x, y and z components.
    cartesian : bool
        If set, return field projected onto Cartesian coordinates.
        Otherwise, return polar projection.
    bohren : bool
        If set, use sign convention from Bohren and Huffman.
        Otherwise, use opposite sign convention.

    Returns
    -------
    field : numpy.ndarray
        [3, npts] array of complex vector values of the scattered field
        at each coordinate, copied back from the GPU with ``.get()``.
    '''

    nc = ab.shape[0]  # number of partial waves in sum

    # GEOMETRY
    # 1. particle displacement [pixel]
    # Note: The sign convention used here is appropriate
    # for illumination propagating in the -z direction.
    # This means that a particle forming an image in the
    # focal plane (z = 0) is located at positive z.
    # Accounting for this by flipping the axial coordinate
    # is equivalent to using a mirrored (left-handed)
    # coordinate system.
    kx = gpuarray.to_gpu(krv[:, 0]).astype(np.float32)
    ky = gpuarray.to_gpu(krv[:, 1]).astype(np.float32)
    kz = gpuarray.to_gpu(-krv[:, 2]).astype(np.float32)
    npts = len(kx)

    # 2. geometric factors
    # NOTE(review): krho == 0 or kr == 0 makes the divisions below singular;
    # no guard is applied here.
    krho = cumath.sqrt(kx * kx + ky * ky)
    cosphi = kx / krho
    sinphi = ky / krho

    kr = cumath.sqrt(krho * krho + kz * kz)
    costheta = kz / kr
    sintheta = krho / kr

    sinkr = cumath.sin(kr)
    coskr = cumath.cos(kr)

    # SPECIAL FUNCTIONS
    # starting points for recursive function evaluation ...
    # 1. Riccati-Bessel radial functions, page 478.
    # Particles above the focal plane create diverging waves
    # described by Eq. (4.13) for $h_n^{(1)}(kr)$. These have z > 0.
    # Those below the focal plane appear to be converging from the
    # perspective of the camera. They are described by Eq. (4.14)
    # for $h_n^{(2)}(kr)$, and have z < 0. We can select the
    # appropriate case by applying the correct sign of the imaginary
    # part of the starting functions...
    # NOTE(review): kz / abs(kz) is undefined where kz == 0 -- confirm
    # callers never place coordinates exactly in that plane.
    factor = 1.j * kz / abs(kz)
    if not bohren:
        factor *= -1.
    xi_nm2 = coskr + factor * sinkr  # \xi_{-1}(kr)
    xi_nm1 = sinkr - factor * coskr  # \xi_0(kr)

    # 2. Angular functions (4.47), page 95
    pi_nm1 = 0.  # \pi_0(\cos\theta)
    pi_n = 1.  # \pi_1(\cos\theta)

    # 3. Vector spherical harmonics: [r,theta,phi]
    # mo1n is zero-filled because its radial row is never written below.
    mo1n = gpuarray.zeros([3, npts], dtype=np.complex64)
    ne1n = gpuarray.empty([3, npts], dtype=np.complex64)

    # storage for scattered field
    es = gpuarray.zeros([3, npts], dtype=np.complex64)

    # COMPUTE field by summing partial waves
    for n in range(1, nc):
        # upward recurrences ...
        # 4. Legendre factor (4.47)
        # Method described by Wiscombe (1980)
        swisc = pi_n * costheta
        twisc = swisc - pi_nm1
        tau_n = pi_nm1 - n * twisc  # -\tau_n(\cos\theta)

        # ... Riccati-Bessel function, page 478
        xi_n = (2. * n - 1.) * (xi_nm1 / kr) - xi_nm2  # \xi_n(kr)

        # ... Deirmendjian's derivative
        dn = (n * xi_n) / kr - xi_nm1

        # vector spherical harmonics (4.50)
        # mo1n[0, :] = 0.j  # no radial component
        mo1n[1, :] = pi_n * xi_n  # ... divided by cosphi/kr
        mo1n[2, :] = tau_n * xi_n  # ... divided by sinphi/kr

        # ... divided by cosphi sintheta/kr^2
        ne1n[0, :] = n * (n + 1.) * pi_n * xi_n
        ne1n[1, :] = tau_n * dn  # ... divided by cosphi/kr
        ne1n[2, :] = pi_n * dn  # ... divided by sinphi/kr

        # prefactor, page 93
        en = 1.j**n * (2. * n + 1.) / n / (n + 1.)

        # the scattered field in spherical coordinates (4.45)
        es += np.complex64(1.j * en * ab[n, 0]) * ne1n
        es -= np.complex64(en * ab[n, 1]) * mo1n

        # upward recurrences ...
        # ... angular functions (4.47)
        # Method described by Wiscombe (1980)
        pi_nm1 = pi_n
        pi_n = swisc + ((n + 1.) / n) * twisc

        # ... Riccati-Bessel function
        xi_nm2 = xi_nm1
        xi_nm1 = xi_n
    # n: multipole sum

    # geometric factors were divided out of the vector
    # spherical harmonics for accuracy and efficiency ...
    # ... put them back at the end.
    radialfactor = 1. / kr
    es[0, :] *= cosphi * sintheta * radialfactor**2
    es[1, :] *= cosphi * radialfactor
    es[2, :] *= sinphi * radialfactor

    # By default, the scattered wave is returned in spherical
    # coordinates. Project components onto Cartesian coordinates.
    # Assumes that the incident wave propagates along z and
    # is linearly polarized along x
    if cartesian:
        ec = gpuarray.empty_like(es)

        ec[0, :] = es[0, :] * sintheta * cosphi
        ec[0, :] += es[1, :] * costheta * cosphi
        ec[0, :] -= es[2, :] * sinphi

        ec[1, :] = es[0, :] * sintheta * sinphi
        ec[1, :] += es[1, :] * costheta * sinphi
        ec[1, :] += es[2, :] * cosphi
        ec[2, :] = es[0, :] * costheta - es[1, :] * sintheta
        return ec.get()
    else:
        return es.get()
def execute(self):
    """Iteratively merge the iterator's images into ``self.resulting_image``.

    A reference image is read, filled with the mid-level value
    ``2**color_depth / 2`` and uploaded as the initial estimate.  Two CUDA
    kernels (``dist_colors`` and ``img_merge_std``) are compiled from the
    sources held on the instance.  For a fixed number of iterations every
    image whose shape matches the reference is passed to ``dist_colors``
    together with the current estimate and the ``std_cu`` buffer; the
    accumulated ``avrimg_cu`` divided by the image count becomes the new
    estimate.  A second pass through ``img_merge_std`` refills ``std_cu``,
    which is then divided by the image count and square-rooted in place.
    Images with a different shape are discarded from the iterator.

    The final estimate is copied back to the host and stored in
    ``self.resulting_image.image``.  Elapsed times are printed.
    """
    # time.clock no longer exists on current Python; prefer perf_counter and
    # fall back to clock only on interpreters that lack it.  Note that
    # perf_counter measures wall-clock time.
    timer = getattr(time, "perf_counter", None) or time.clock

    resimg = self.images_iterator.read_reference_image()
    self.resulting_image = self.images_iterator.read_reference_image()
    shape = resimg.shape

    # Start from a uniform mid-level image.
    resimg.image[:] = 2**resimg.color_depth / 2
    resimg_nda = np.ndarray(shape=resimg.image.shape,
                            dtype=resimg.image.dtype)
    resimg_nda[:] = resimg.image[:]

    resimg_cu = gpuarray.to_gpu(resimg_nda)
    imgarr_cu = gpuarray.to_gpu(resimg_nda)
    avrimg_cu = gpuarray.zeros_like(resimg_cu)
    std_cu = gpuarray.zeros(shape[:2], dtype=resimg.dtype)
    std_cu.fill(np.float32(2**resimg.color_depth))
    dist_cu = gpuarray.zeros(shape[:2], dtype=resimg.dtype)
    # np.bool_ is the NumPy boolean scalar type; the bare np.bool alias is
    # gone from current NumPy releases.
    flags_cu = gpuarray.zeros(shape[:2], dtype=np.bool_)

    iter_cnt = 5

    print(shape)

    # One 32x32 thread block per tile, rounded up to cover the whole image.
    th_x = 32
    th_y = 32
    blk_x = int(shape[0] / th_x) + 1
    blk_y = int(shape[1] / th_y) + 1
    grid_im = (blk_x, blk_y, 1)
    block_im = (th_x, th_y, 1)
    print(block_im)
    print(grid_im)

    mod_dist_colors = SourceModule(self.__kernel_dist_colors)
    mod_std = SourceModule(self.__kernel_std)

    dist_colors = mod_dist_colors.get_function("dist_colors")
    img_merge_std = mod_std.get_function("img_merge_std")

    ca = timer()
    for itr in range(iter_cnt):
        img_cnt = 0.0
        for imgarr in self.images_iterator:
            if shape != imgarr.shape:
                self.images_iterator.discard_image()
                continue
            img_cnt += 1
            imgarr_cu.set(imgarr.image)
            dist_colors(imgarr_cu, resimg_cu, avrimg_cu, std_cu,
                        np.int32(shape[0]), np.int32(shape[1]),
                        np.int32(itr), np.float32(10.0),
                        block=block_im, grid=grid_im)
        cb = timer()
        print("avg clock: %1.4f" % (cb - ca))

        resimg_cu = avrimg_cu[:] / np.float32(img_cnt)

        std_cu.fill(0.0)
        for imgarr in self.images_iterator:
            imgarr_cu.set(imgarr.image)
            img_merge_std(imgarr_cu, resimg_cu, std_cu,
                          np.int32(shape[0]), np.int32(shape[1]),
                          block=block_im, grid=grid_im)
        cb = timer()
        print("std clock: %1.4f" % (cb - ca))

        std_cu /= np.float32(img_cnt)
        cumath.sqrt(std_cu, out=std_cu)
        avrimg_cu.fill(0.0)

    self.resulting_image.image = np.array(resimg_cu.get())
def random_normal(loc=0.0, scale=1.0, size=None):
    """Draw normally distributed samples on the GPU (Box-Muller transform).

    Parameters
    ----------
    loc : offset added to every sample.
    scale : factor applied to every sample before ``loc`` is added.
    size : shape forwarded to ``curandom.rand``.  The default ``None`` is
        forwarded unchanged; presumably callers always pass an explicit
        shape -- verify.

    Returns
    -------
    CUDAArray wrapping ``scale * z + loc`` for the generated samples ``z``.
    """
    u1 = curandom.rand(size, dtype=numpy.float64)
    u2 = curandom.rand(size, dtype=numpy.float64)

    # The uniform generator's range includes 0, where log() diverges.
    # 1 - u1 is equally uniform but lies in (0, 1], keeping every sample
    # finite.
    radius = cumath.sqrt(-2. * cumath.log(1. - u1))
    z1 = radius * cumath.cos(2. * numpy.pi * u2)
    return CUDAArray(scale * z1 + loc)
def sqrt(self):
    """Return a new ``CUDAArray`` wrapping ``cumath.sqrt(self.arr)``."""
    return CUDAArray(cumath.sqrt(self.arr))