def impl_test_binaryop_2d(self, dtype):
    """Check the element-wise binary operations of ``misc`` against NumPy.

    Random scalar (0-d), vector (length 3) and matrix (3 x 2) operand pairs
    of ``dtype`` are built on the host, copied to the GPU with
    ``gpuarray.to_gpu`` and run through ``misc.add``, ``misc.subtract``,
    ``misc.multiply`` and ``misc.divide``. Every result is fetched with
    ``.get()`` and compared with the NumPy reference via ``np.allclose``.

    Parameters
    ----------
    dtype : type
        Element type of the operands. Subclasses of ``numbers.Integral``
        get integers drawn from [1, 10); every other type gets normally
        distributed values (scale 5.0).

    Raises
    ------
    AssertionError
        If a GPU result is not close to the NumPy reference.
    """
    integral = issubclass(dtype, numbers.Integral)

    # The order of the random draws is kept fixed so that a seeded run
    # produces the same operands as before.
    if integral:
        a_sca = np.array(np.random.randint(1, 10), dtype=dtype)
        b_sca = np.array(np.random.randint(1, 10), dtype=dtype)
        a_vec = np.random.randint(1, 10, 3).astype(dtype)
        b_vec = np.random.randint(1, 10, 3).astype(dtype)
        a_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
        b_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
    else:
        a_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
        b_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
        a_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
        b_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
        a_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
        b_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)

    host_pairs = ((a_sca, b_sca), (a_vec, b_vec), (a_mat, b_mat))
    device_pairs = [(gpuarray.to_gpu(lhs), gpuarray.to_gpu(rhs))
                    for lhs, rhs in host_pairs]

    if integral:
        # Integer operands must be compared with the integer quotient: `/`
        # is true division under Python 3 and would produce a fractional
        # reference. All operands are >= 1, so floor division and
        # truncating division agree.
        def reference_divide(lhs, rhs):
            return lhs // rhs
    else:
        def reference_divide(lhs, rhs):
            return lhs / rhs

    # (GPU operation, host reference), checked in this order:
    # addition, subtraction, multiplication, division.
    cases = (
        (misc.add, lambda lhs, rhs: lhs + rhs),
        (misc.subtract, lambda lhs, rhs: lhs - rhs),
        (misc.multiply, lambda lhs, rhs: lhs * rhs),
        (misc.divide, reference_divide),
    )
    for gpu_op, host_op in cases:
        for (lhs_gpu, rhs_gpu), (lhs, rhs) in zip(device_pairs, host_pairs):
            assert np.allclose(gpu_op(lhs_gpu, rhs_gpu).get(),
                               host_op(lhs, rhs))
def thunk():
    """Evaluate the node on the GPU and publish its three results.

    Reads ``inputs[0]``, ``inputs[1]`` and ``inputs[2]`` together with the
    enclosing-scope values ``A``, ``b``, ``x``, ``depth`` and
    ``self.base.n``. Host copies of the results are stored in
    ``outputs[0][0]``, ``outputs[1][0]`` and ``outputs[2][0]``, and every
    entry of ``node.outputs`` is flagged as computed in ``compute_map``.

    NOTE(review): the result names (``dalpha``, ``dLq_t``, ``dLq_f``)
    suggest gradients with respect to ``alpha`` and the two ``x_*`` inputs;
    confirm against the enclosing Op.
    """
    # inputs[0] is squeezed and reshaped into a column; only index 0 along
    # the leading axis of inputs[1] and inputs[2] is used.
    alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
    x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
    x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])

    # exp(x . A + b), normalised by its sum along axis 1.
    q_t = cumath.exp(misc.add(linalg.dot(x_t, A), b))
    q_f = cumath.exp(misc.add(linalg.dot(x_f, A), b))
    norm_t = misc.sum(q_t, axis=1, keepdims=True)
    norm_f = misc.sum(q_f, axis=1, keepdims=True)
    q_t = misc.divide(q_t, norm_t)
    q_f = misc.divide(q_f, norm_f)

    # Blend of the two normalised arrays, weighted by alpha and 1 - alpha.
    mix = misc.multiply(q_t, alpha) + misc.multiply(q_f, 1 - alpha)

    q_diff = q_t - q_f
    ratio = q_diff / mix
    term_a = misc.sum(x * ratio, axis=1)
    factor = 2 * depth + self.base.n
    term_b = factor * misc.sum(q_diff, axis=1) / misc.sum(mix, axis=1)
    term_c = misc.sum(x, axis=1) * misc.sum(ratio, axis=1)
    dalpha = term_a - term_b + term_c
    # Drop intermediates that are no longer needed.
    del q_diff, term_a, factor, term_b, term_c

    inv_mix = 1 / mix
    shift_a = misc.multiply(
        depth[:, None] * (self.base.n - 1) / self.base.n, inv_mix)
    shift_b = (self.base.n + depth[:, None]) / cumath.log(
        misc.sum(mix, axis=1, keepdims=True))
    weights = misc.multiply(
        misc.subtract((x * inv_mix) - shift_a, shift_b), alpha)
    del mix, inv_mix, shift_a, shift_b

    # All-zero array added to a single row inside the loop.
    # NOTE(review): presumably only there to expand that row to
    # (x_t.shape[1], q_t.shape[1]) through misc.add -- confirm.
    cast = gpuarray.zeros((x_t.shape[1], q_t.shape[1]),
                          dtype=theano.config.floatX)
    dLq_t = gpuarray.zeros(x_t.shape, dtype=theano.config.floatX)
    dLq_f = gpuarray.zeros(x_f.shape, dtype=theano.config.floatX)

    for row in range(q_t.shape[0]):
        # "t" half: fills row `row` of dLq_t.
        q_row = q_t[None, row, :]
        scaled = misc.multiply(q_row, A)
        total = misc.sum(scaled, axis=1, keepdims=True)
        total = misc.multiply(total, misc.add(q_row, cast))
        dLq_t[row, :] = misc.sum(
            misc.multiply(weights[None, row, :], scaled - total), axis=1)

        # "f" half: same computation on q_f, fills row `row` of dLq_f.
        q_row = q_f[None, row, :]
        scaled = misc.multiply(q_row, A)
        total = misc.sum(scaled, axis=1, keepdims=True)
        total = misc.multiply(total, misc.add(q_row, cast))
        dLq_f[row, :] = misc.sum(
            misc.multiply(weights[None, row, :], scaled - total), axis=1)

    # Copy the results back to the host, one per output slot.
    for slot, result in enumerate((dalpha, dLq_t, dLq_f)):
        outputs[slot][0] = result.get()

    for out_var in node.outputs:
        compute_map[out_var][0] = True
def impl_test_binaryop_2d(self, dtype):
    """Compare ``misc`` element-wise binary operations with NumPy results.

    Builds random operand pairs of ``dtype`` in three shapes (0-d scalar,
    length-3 vector, 3 x 2 matrix), uploads them with ``gpuarray.to_gpu``
    and checks ``misc.add``, ``misc.subtract``, ``misc.multiply`` and
    ``misc.divide`` against the equivalent host expression using
    ``np.allclose``.

    Parameters
    ----------
    dtype : type
        Element type of the operands. For subclasses of
        ``numbers.Integral`` the operands are integers from [1, 10);
        otherwise they are drawn from a normal distribution (scale 5.0).

    Raises
    ------
    AssertionError
        If any GPU result is not close to its host reference.
    """
    use_integers = issubclass(dtype, numbers.Integral)

    # Draw order is unchanged so seeded runs keep producing the same data.
    if use_integers:
        a_sca = np.array(np.random.randint(1, 10), dtype=dtype)
        b_sca = np.array(np.random.randint(1, 10), dtype=dtype)
        a_vec = np.random.randint(1, 10, 3).astype(dtype)
        b_vec = np.random.randint(1, 10, 3).astype(dtype)
        a_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
        b_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
    else:
        a_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
        b_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
        a_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
        b_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
        a_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
        b_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)

    operands = ((a_sca, b_sca), (a_vec, b_vec), (a_mat, b_mat))
    operands_gpu = [(gpuarray.to_gpu(first), gpuarray.to_gpu(second))
                    for first, second in operands]

    def expected_quotient(first, second):
        # For integer dtypes the reference has to be the integer quotient;
        # `/` means true division under Python 3 and would give fractions.
        # Operands are >= 1, so floor and truncating division coincide.
        if use_integers:
            return first // second
        return first / second

    # Checked in this order: addition, subtraction, multiplication,
    # division.
    checks = (
        (misc.add, lambda first, second: first + second),
        (misc.subtract, lambda first, second: first - second),
        (misc.multiply, lambda first, second: first * second),
        (misc.divide, expected_quotient),
    )
    for gpu_op, host_op in checks:
        for (first_gpu, second_gpu), (first, second) in zip(operands_gpu,
                                                            operands):
            assert np.allclose(gpu_op(first_gpu, second_gpu).get(),
                               host_op(first, second))
def thunk():
    """Evaluate the node on the GPU and publish its single result.

    Reads ``inputs[0]``, ``inputs[1]`` and ``inputs[2]`` together with the
    enclosing-scope values ``A``, ``b``, ``x``, ``depth`` and ``self.n``.
    The host copy of the summed result is stored in ``outputs[0][0]`` and
    every entry of ``node.outputs`` is flagged as computed in
    ``compute_map``.
    """
    # inputs[0] is squeezed and reshaped into a column; only index 0 along
    # the leading axis of inputs[1] and inputs[2] is used.
    alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
    x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
    x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])

    # exp(x . A + b), normalised by its sum along axis 1.
    q_t = cumath.exp(misc.add(linalg.dot(x_t, A), b))
    q_f = cumath.exp(misc.add(linalg.dot(x_f, A), b))
    norm_t = misc.sum(q_t, axis=1, keepdims=True)
    norm_f = misc.sum(q_f, axis=1, keepdims=True)
    q_t = misc.divide(q_t, norm_t)
    q_f = misc.divide(q_f, norm_f)

    # Blend of the two normalised arrays, weighted by alpha and 1 - alpha.
    mix = misc.multiply(q_t, alpha) + misc.multiply(q_f, 1 - alpha)

    # log(mix) with its axis-1 sum divided by self.n subtracted.
    log_mix = cumath.log(mix)
    log_offset = misc.sum(log_mix, axis=1, keepdims=True) / self.n
    log_mix = misc.subtract(log_mix, log_offset)

    term_a = misc.sum(x * log_mix, axis=1)
    term_b = (self.n + depth) * cumath.log(misc.sum(mix, axis=1))
    term_c = depth * log_offset

    outputs[0][0] = misc.sum(term_a - term_b + term_c).get()

    for out_var in node.outputs:
        compute_map[out_var][0] = True
def __rmul__(self, other):
    """Handle ``other * self`` by delegating to ``cumisc.multiply``.

    The operands are passed in reflected order: ``other`` first, then
    ``self``.
    """
    product = cumisc.multiply(other, self)
    return product


def __rdiv__(self, other):
    """Handle reflected division by delegating to ``cumisc.divide``.

    The operands are passed in reflected order: ``other`` first, then
    ``self``. ``__rdiv__`` is the Python 2 hook for ``other / self``;
    Python 3 looks up ``__rtruediv__`` instead.
    """
    quotient = cumisc.divide(other, self)
    return quotient
def __mul__(self, other):
    """Handle ``self * other`` by delegating to ``cumisc.multiply``."""
    product = cumisc.multiply(self, other)
    return product


def __div__(self, other):
    """Handle ``self / other`` by delegating to ``cumisc.divide``.

    ``__div__`` is the Python 2 hook for ``/``; Python 3 looks up
    ``__truediv__`` instead.
    """
    quotient = cumisc.divide(self, other)
    return quotient
def _impl_test_binaryop_2d(self, dtype):
    """Check the element-wise binary operations of ``misc`` against NumPy.

    Random scalar (0-d), vector (length 3) and matrix (3 x 2) operand pairs
    of ``dtype`` are uploaded with ``gpuarray.to_gpu`` and run through
    ``misc.add``, ``misc.subtract``, ``misc.multiply`` and ``misc.divide``.
    Each result is compared with the host reference by ``assert_allclose``,
    using the tolerances ``dtype_to_rtol[dtype]`` and
    ``dtype_to_atol[dtype]``. For integral dtypes the division reference is
    ``//``; otherwise it is ``/``. Finally, adding a matrix to one created
    with ``order='F'`` is expected to raise ``ValueError``.

    Parameters
    ----------
    dtype : type
        Element type of the operands. Subclasses of ``numbers.Integral``
        get integers drawn from [1, 10); every other type gets normally
        distributed values (scale 5.0).
    """
    integral = issubclass(dtype, numbers.Integral)

    # The order of the random draws is part of the behaviour (seeded runs).
    if integral:
        a_sca = np.array(np.random.randint(1, 10), dtype=dtype)
        b_sca = np.array(np.random.randint(1, 10), dtype=dtype)
        a_vec = np.random.randint(1, 10, 3).astype(dtype)
        b_vec = np.random.randint(1, 10, 3).astype(dtype)
        a_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
        b_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
        b_mat_f = np.random.randint(1, 10, 6).reshape(
            (3, 2)).astype(dtype, order='F')
    else:
        a_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
        b_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
        a_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
        b_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
        a_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
        b_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
        b_mat_f = np.random.normal(
            scale=5.0, size=(3, 2)).astype(dtype, order='F')

    host_pairs = ((a_sca, b_sca), (a_vec, b_vec), (a_mat, b_mat))
    device_pairs = [(gpuarray.to_gpu(lhs), gpuarray.to_gpu(rhs))
                    for lhs, rhs in host_pairs]
    a_mat_gpu = device_pairs[2][0]
    b_mat_f_gpu = gpuarray.to_gpu(b_mat_f)

    def check(actual, expected):
        # Tolerances are looked up per comparison, after both values exist.
        assert_allclose(actual, expected,
                        rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])

    if integral:
        def reference_divide(lhs, rhs):
            return lhs // rhs
    else:
        def reference_divide(lhs, rhs):
            return lhs / rhs

    # (GPU operation, host reference), checked in this order:
    # addition, subtraction, multiplication, division.
    cases = (
        (misc.add, lambda lhs, rhs: lhs + rhs),
        (misc.subtract, lambda lhs, rhs: lhs - rhs),
        (misc.multiply, lambda lhs, rhs: lhs * rhs),
        (misc.divide, reference_divide),
    )
    for gpu_op, host_op in cases:
        for (lhs_gpu, rhs_gpu), (lhs, rhs) in zip(device_pairs, host_pairs):
            check(gpu_op(lhs_gpu, rhs_gpu).get(), host_op(lhs, rhs))

    # mismatched order
    assert_raises(ValueError, misc.add, a_mat_gpu, b_mat_f_gpu)
for step in xrange(N_TIMESTEPS): # print step # Implementing split-step method # Update wavefunction and resovoir, record density cu_fft.fft(psi_gpu, psi_gpu, plan_forward) psi_gpu *= kineticFactorHalf_gpu cu_fft.ifft(psi_gpu, psi_gpu, plan_inverse, scale=True) # currentDensity_gpu = abs(psi_gpu) ** 2 # currentDensity_gpu = psi_gpu.real **2 + psi_gpu.imag ** 2 currentDensity_gpu = (psi_gpu * psi_gpu.conj()).real # modSquared.prepared_call(grid, block, psi_gpu.gpudata, # currentDensity_gpu.gpudata, 1024) # n_gpu *= cumath.exp(-gammaRdt_gpu + Rdt_gpu * currentDensity_gpu) n_gpu *= cumath.exp(misc.add(- gammaRdt_gpu, - misc.multiply(Rdt_gpu, currentDensity_gpu))) n_gpu += Pdt_gpu psi_gpu *= cumath.exp( misc.add( misc.add(misc.multiply(expFactorPolFirst_gpu, n_gpu), misc.multiply(expFactorPolSecond_gpu, currentDensity_gpu)), expFactorPolThird_gpu)) # psiNonlinear.prepared_call(grid, block, expFactorPolFirst, # expFactorPolSecond, expFactorPolThird, # psi_gpu.gpudata, n_gpu.gpudata, # currentDensity_gpu.gpudata, 1024) cu_fft.fft(psi_gpu, psi_gpu, plan_forward) # record spectrum drv.memcpy_dtod(spectrum[step, :].gpudata, psi_gpu[N//2, :].gpudata,
# print step # Implementing split-step method # Update wavefunction and resovoir, record density cu_fft.fft(psi_gpu, psi_gpu, plan_forward) psi_gpu *= kineticFactorHalf_gpu cu_fft.ifft(psi_gpu, psi_gpu, plan_inverse, scale=True) # currentDensity_gpu = abs(psi_gpu) ** 2 # currentDensity_gpu = psi_gpu.real **2 + psi_gpu.imag ** 2 currentDensity_gpu = (psi_gpu * psi_gpu.conj()).real # modSquared.prepared_call(grid, block, psi_gpu.gpudata, # currentDensity_gpu.gpudata, 1024) # n_gpu *= cumath.exp(-gammaRdt_gpu + Rdt_gpu * currentDensity_gpu) n_gpu *= cumath.exp( misc.add(-gammaRdt_gpu, -misc.multiply(Rdt_gpu, currentDensity_gpu))) n_gpu += Pdt_gpu psi_gpu *= cumath.exp( misc.add( misc.add( misc.multiply(expFactorPolFirst_gpu, n_gpu), misc.multiply(expFactorPolSecond_gpu, currentDensity_gpu)), expFactorPolThird_gpu)) # psiNonlinear.prepared_call(grid, block, expFactorPolFirst, # expFactorPolSecond, expFactorPolThird, # psi_gpu.gpudata, n_gpu.gpudata, # currentDensity_gpu.gpudata, 1024) cu_fft.fft(psi_gpu, psi_gpu, plan_forward) # record spectrum
def fft_gpu(window_a, search_area):
    """Cross-correlate two stacks of windows with batched FFTs on the GPU.

    Both stacks are cast to float32 and uploaded, then forward transformed.
    The transform of ``search_area`` is multiplied by the complex conjugate
    of the transform of ``window_a`` and the product is inverse transformed.
    The result is copied to the host and ``fftshift``-ed along axes 1 and 2.
    All GPU buffers are freed explicitly before returning.

    Inputs:
        window_a: 3D numpy array
            stack of interrogation windows of the first frame
            (output of the window slice function)
        search_area: 3D numpy array
            stack of interrogation windows of the second frame
            (output of the window slice function)

    Outputs:
        corr_gpu: 3D numpy array
            stack of correlation functions, one per image pair

    NOTE(review): FFT plans and output buffers are sized from
    ``window_a.shape`` only, so ``search_area`` is assumed to have the same
    shape -- confirm with the caller.
    """
    dims = np.array(window_a.shape).astype(np.int32)
    batch_size, win_h, win_w = dims
    # Size of the real-to-complex output: last axis is win_w // 2 + 1.
    spectrum_shape = (batch_size, win_h, win_w // 2 + 1)

    frame_a = window_a.astype(np.float32)
    frame_b = search_area.astype(np.float32)

    # Upload the inputs and allocate the output buffers on the GPU.
    d_frame_a = gpuarray.to_gpu(frame_a)
    d_frame_b = gpuarray.to_gpu(frame_b)
    d_corr = gpuarray.empty_like(d_frame_a)
    d_spec_a = gpuarray.empty(spectrum_shape, np.complex64)
    d_spec_b = gpuarray.empty(spectrum_shape, np.complex64)

    # Forward transforms of both frames share one plan.
    forward_plan = cu_fft.Plan((win_h, win_w), np.float32, np.complex64,
                               batch=batch_size)
    cu_fft.fft(d_frame_a, d_spec_a, forward_plan)
    cu_fft.fft(d_frame_b, d_spec_b, forward_plan)

    # Multiply the second spectrum by the conjugate of the first. The name
    # is rebound on purpose so the unconjugated buffer is not kept alive.
    d_spec_a = d_spec_a.conj()
    d_product = cu_misc.multiply(d_spec_b, d_spec_a)

    # Inverse transform; the fourth positional argument is passed as True
    # (presumably the scaling flag -- confirm against cu_fft.ifft).
    inverse_plan = cu_fft.Plan((win_h, win_w), np.complex64, np.float32,
                               batch=batch_size)
    cu_fft.ifft(d_product, d_corr, inverse_plan, True)

    # Copy the result to the host and shift along axes 1 and 2.
    corr = fftshift(d_corr.get().real, axes=(1, 2))

    # Free GPU memory.
    for d_buffer in (d_frame_a, d_frame_b, d_spec_a,
                     d_corr, d_spec_b, d_product):
        d_buffer.gpudata.free()

    return corr
def _impl_test_binaryop_2d(self, dtype):
    """Compare ``misc`` element-wise binary operations with NumPy results.

    Builds random operand pairs of ``dtype`` in three shapes (0-d scalar,
    length-3 vector, 3 x 2 matrix), uploads them with ``gpuarray.to_gpu``
    and verifies ``misc.add``, ``misc.subtract``, ``misc.multiply`` and
    ``misc.divide`` with ``assert_allclose``. The tolerances come from
    ``dtype_to_rtol[dtype]`` and ``dtype_to_atol[dtype]``. The division
    reference is ``//`` for integral dtypes and ``/`` otherwise. The last
    check expects ``misc.add`` to raise ``ValueError`` when one matrix was
    created with ``order='F'``.

    Parameters
    ----------
    dtype : type
        Element type of the operands. For subclasses of
        ``numbers.Integral`` the operands are integers from [1, 10);
        otherwise they are drawn from a normal distribution (scale 5.0).
    """
    use_integers = issubclass(dtype, numbers.Integral)

    # Draw order is unchanged so seeded runs keep producing the same data.
    if use_integers:
        a_sca = np.array(np.random.randint(1, 10), dtype=dtype)
        b_sca = np.array(np.random.randint(1, 10), dtype=dtype)
        a_vec = np.random.randint(1, 10, 3).astype(dtype)
        b_vec = np.random.randint(1, 10, 3).astype(dtype)
        a_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
        b_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
        b_mat_f = np.random.randint(1, 10, 6).reshape(
            (3, 2)).astype(dtype, order='F')
    else:
        a_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
        b_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
        a_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
        b_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
        a_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
        b_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
        b_mat_f = np.random.normal(
            scale=5.0, size=(3, 2)).astype(dtype, order='F')

    operands = ((a_sca, b_sca), (a_vec, b_vec), (a_mat, b_mat))
    operands_gpu = [(gpuarray.to_gpu(first), gpuarray.to_gpu(second))
                    for first, second in operands]
    a_mat_gpu = operands_gpu[-1][0]
    b_mat_f_gpu = gpuarray.to_gpu(b_mat_f)

    def verify(actual, expected):
        # Tolerances are looked up per comparison, after both values exist.
        assert_allclose(actual, expected,
                        rtol=dtype_to_rtol[dtype], atol=dtype_to_atol[dtype])

    def expected_quotient(first, second):
        if use_integers:
            return first // second
        return first / second

    # Checked in this order: addition, subtraction, multiplication,
    # division.
    checks = (
        (misc.add, lambda first, second: first + second),
        (misc.subtract, lambda first, second: first - second),
        (misc.multiply, lambda first, second: first * second),
        (misc.divide, expected_quotient),
    )
    for gpu_op, host_op in checks:
        for (first_gpu, second_gpu), (first, second) in zip(operands_gpu,
                                                            operands):
            verify(gpu_op(first_gpu, second_gpu).get(),
                   host_op(first, second))

    # mismatched order
    assert_raises(ValueError, misc.add, a_mat_gpu, b_mat_f_gpu)