def softmax(x, deriv=False):
    """Return the softmax of ``x``, or the derivative term when requested.

    Parameters
    ----------
    x
        Input values. With ``deriv=True`` any object supporting ``*`` and
        ``-`` works; otherwise it must be accepted by ``cm.exp`` and
        ``gpu.sum`` (presumably a GPU array -- confirm against callers).
    deriv : bool, optional
        When true, ``x`` is treated as an already computed activation and
        ``x * (1.0 - x)`` is returned.

    Returns
    -------
    ``exp(x)`` divided by the sum of all its elements, or ``x * (1.0 - x)``
    when ``deriv`` is true.
    """
    if deriv:
        return x * (1.0 - x)

    # Exponentiate once and reuse the result for both the normaliser and the
    # numerator; evaluating cm.exp(x) twice costs a second pass and a second
    # temporary for no benefit.
    exp_x = cm.exp(x)

    # 1x1 host buffer that receives the scalar sum.
    np_t = np.array([[0.0]])
    # NOTE(review): the maximum is not subtracted before exponentiating (that
    # step was left commented out in the original), so large inputs may
    # overflow.
    gpu.sum(exp_x).get(np_t)
    return exp_x / np_t.ravel()[0]
def compute_one(self, amp, r, buffer):
    """Evaluate the two separable exponential factors for ``r`` and hand
    them to ``self.outer`` together with ``buffer``.

    ``amp`` scales the x factor only, so it enters the combined result once.
    What ``self.outer`` does with ``buffer`` is not visible here (presumably
    it writes or accumulates the outer product -- confirm).
    """
    # x factor: exp(iqx * x + iqxsq * z), stored in the preallocated _ex.
    x_exponent = self.iqx * r.x() + self.iqxsq * r.z()
    cumath.exp(x_exponent, out=self._ex)

    # y factor: exp(iqy * y + iqysq * z), stored in the preallocated _ey.
    y_exponent = self.iqy * r.y() + self.iqysq * r.z()
    cumath.exp(y_exponent, out=self._ey)

    self._ex *= amp

    width = np.int32(self.w)
    height = np.int32(self.h)
    self.outer(self._ex, self._ey, buffer, width, height,
               block=self.block, grid=self.grid)
def thunk():
    """Compute three gradient outputs on the GPU and store them in ``outputs``.

    Reads ``inputs`` (three host arrays) and the enclosing-scope names ``A``,
    ``b``, ``x``, ``depth``, ``self``, ``node`` and ``compute_map``; none of
    them is defined in this function. Writes ``outputs[0][0]`` (dalpha),
    ``outputs[1][0]`` (dLq_t) and ``outputs[2][0]`` (dLq_f) as host arrays
    and marks every variable in ``node.outputs`` as computed.
    """
    # Upload the inputs; alpha becomes a column, x_t / x_f take the first
    # slice along the leading axis.
    alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
    x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
    x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])

    # exp(x . A + b), then divide every row by its sum along axis 1.
    Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
    Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
    Xtn = misc.sum(Xt, axis=1, keepdims=True)
    Xfn = misc.sum(Xf, axis=1, keepdims=True)
    Xt = misc.divide(Xt, Xtn)
    Xf = misc.divide(Xf, Xfn)

    # Mixture of the two normalised matrices, weighted by alpha and 1 - alpha.
    w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)

    # Gradient with respect to alpha.
    dq = Xt - Xf
    qdw = dq / w
    t1 = misc.sum(x * qdw, axis=1)
    f = 2 * depth + self.base.n
    t2 = f * misc.sum(dq, axis=1) / misc.sum(w, axis=1)
    t3 = misc.sum(x, axis=1) * misc.sum(qdw, axis=1)
    dalpha = t1 - t2 + t3
    # Drop the temporaries before the next batch of allocations.
    del dq, t1, f, t2, t3

    # Common factor F shared by both input gradients below.
    iw = 1 / w
    S1 = misc.multiply(
        depth[:, None] * (self.base.n - 1) / self.base.n, iw)
    S2 = (self.base.n + depth[:, None]) / cumath.log(
        misc.sum(w, axis=1, keepdims=True))
    F = misc.multiply(misc.subtract((x * iw) - S1, S2), alpha)
    del w, iw, S1, S2

    # Zero matrix added to a single row below; presumably this expands the
    # row to the shape of A -- TODO confirm misc.add broadcasting.
    cast = gpuarray.zeros((x_t.shape[1], Xt.shape[1]),
                          dtype=theano.config.floatX)
    dLq_t = gpuarray.zeros(x_t.shape, dtype=theano.config.floatX)
    dLq_f = gpuarray.zeros(x_f.shape, dtype=theano.config.floatX)

    # One row at a time: the same expression is evaluated for Xt and for Xf.
    # NOTE(review): both rows are weighted with the same F -- confirm that is
    # intended for dLq_f.
    for i in range(Xt.shape[0]):
        S1 = misc.multiply(Xt[None, i, :], A)
        S2 = misc.sum(S1, axis=1, keepdims=True)
        S2 = misc.multiply(S2, misc.add(Xt[None, i, :], cast))
        dLq_t[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                               axis=1)

        S1 = misc.multiply(Xf[None, i, :], A)
        S2 = misc.sum(S1, axis=1, keepdims=True)
        S2 = misc.multiply(S2, misc.add(Xf[None, i, :], cast))
        dLq_f[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                               axis=1)

    # Copy the results back to the host.
    outputs[0][0] = dalpha.get()
    outputs[1][0] = dLq_t.get()
    outputs[2][0] = dLq_f.get()

    for v in node.outputs:
        compute_map[v][0] = True
def test_exp(self):
    """Check ``cumath.exp`` element by element against ``math.exp``.

    The input is ``simplearray.array(100).fill_arange()/10`` and every
    element must agree with the host result to within an absolute
    tolerance of 1e-2.
    """
    a = simplearray.array(100).fill_arange()/10
    b = cumath.exp(a)

    for i in range(100):
        # assertTrue replaces the deprecated alias assert_, which was
        # removed from unittest in Python 3.12.
        self.assertTrue(abs(math.exp(a[i]) - b[i]) < 1e-2)
def forward_gpu(self, x, temperature):
    """Run one forward step with the gate pre-activations computed on the
    GPU, update ``self.c`` and ``self.h``, and return
    ``Softmax(weights . h + biases, temperature)``.
    """
    n = self.hidden_s

    # Stack the previous hidden state on top of the input and upload it.
    stacked = np.concatenate((self.h, x))
    stacked_gpu = gpu.to_gpu(stacked.astype(np.float32))

    # All four gates share one weight matrix and one bias vector, in the
    # order forget, select, write, add.
    gate_weights = np.concatenate(
        (self.forget_w, self.sel_w, self.write_w, self.add_w))
    gate_biases = np.concatenate(
        (self.forget_b, self.sel_b, self.write_b, self.add_b))
    gate_weights_gpu = gpu.to_gpu(gate_weights.astype(np.float32))
    gate_biases_gpu = gpu.to_gpu(gate_biases.astype(np.float32))

    z = gpu.zeros((n * 4, 1), np.float32)
    self.kernel(gate_weights_gpu, stacked_gpu, z,
                grid=(self.num_block, 1, 1),
                block=(self.num_thread_per_block, 1, 1))
    z += gate_biases_gpu

    # First three gates: logistic sigmoid. Last gate: scaled tanh.
    sigmoid_rows = z[:n * 3, :1]
    z[:n * 3, :1] = 1.0 / (gpum.exp(-1 * sigmoid_rows) + 1.0)
    tanh_rows = z[n * 3:, :1]
    z[n * 3:, :1] = 1.7159 * gpum.tanh(2.0 / 3.0 * tanh_rows)

    # Bring the activations back and split them by gate.
    z_cpu = z.get()
    forget_gate = z_cpu[:n, :1]
    select_gate = z_cpu[n:n * 2, :1]
    write_gate = z_cpu[n * 2:n * 3, :1]
    add_gate = z_cpu[n * 3:, :1]

    # Update cell and hidden state.
    self.c = forget_gate * self.c + select_gate * add_gate
    self.h = write_gate * Tanh(self.c)

    # Output layer.
    scores = np.dot(self.weights, self.h) + self.biases
    return Softmax(scores, temperature)
def calculate_attenuation_gpu(projections_gpu, energy, p, pool):
    """Combine the per-material projections into one attenuation array.

    For every key ``mat`` of ``projections_gpu`` the projection is added to
    a running total with weight ``-get_absorbtion_coefs(energy, mat)``; the
    result is ``exp(total) * energy * p``. The accumulator is float32, has
    the shape of the first projection and is allocated through
    ``pool.allocate``.
    """
    first_material = next(iter(projections_gpu))
    total = gpuarray.zeros(projections_gpu[first_material].shape,
                           dtype=np.float32,
                           allocator=pool.allocate)

    for material in projections_gpu:
        projection = projections_gpu[material]
        weight = -get_absorbtion_coefs(energy, material)
        # mul_add(1.0, other, w) is expected to yield 1.0 * total + w * other.
        total = total.mul_add(1.0, projection, weight)

    return cumath.exp(total) * energy * p
def fprop(self, input, output):
    """Write a softmax of ``input`` into ``output``.

    Steps: take a maximum per batch entry, shift by it, exponentiate, sum
    and divide. The helpers are defined elsewhere; their names suggest the
    reductions run over the columns of a ``(?, batchSize)`` matrix --
    TODO confirm.
    """
    # Shift by the maximum before exponentiating.
    peak = gpuarray.zeros((1, self.batchSize), dtype=np.float32)
    col_max_reduce(peak, input)
    add_vec_to_cols(input, peak, output, alpha=-1)

    # Exponentiate and copy the result back into the output buffer.
    exponentials = cumath.exp(output)
    gpu_copy_to(exponentials, output)

    # Normalise by the sum of the exponentials.
    total = gpuarray.zeros(peak.shape, dtype=np.float32)
    add_col_sum_to_vec(total, output, alpha=0)
    div_vec_to_cols(output, total)
def logsumexp(mat):
    """Return log(sum(exp(mat))) reduced with axis argument 1.

    The maximum is subtracted before exponentiating and added back at the
    end. The returned array is the one produced by ``max_by_axis``, updated
    in place.
    """
    peak = max_by_axis(mat, 1)
    shifted = add_vec_to_mat(mat, peak, 0, substract=True)
    log_of_sum = cumath.log(matrix_sum_out_axis(cumath.exp(shifted), 1))
    peak += log_of_sum
    return peak
def exp(d_a, mode=MathModes.ACC):
    """Return the element-wise exponential of ``d_a`` as a new array.

    Parameters
    ----------
    d_a
        Input device array.
    mode
        ``MathModes.ACC`` delegates to ``cumath.exp``; any other value
        launches ``exp_fast_kernel`` instead.

    Returns
    -------
    A new array with the same shape as ``d_a``.
    """
    if mode == MathModes.ACC:
        return cumath.exp(d_a)

    d_out = gpuarray.zeros_like(d_a)
    if d_a.size == 0:
        # Nothing to launch. Without this guard thread_size would be 0 and
        # the grid computation below would divide by zero.
        return d_out

    # One thread per element, capped at the maximum block size; enough
    # blocks to cover every element.
    thread_size = min(d_a.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_a.size / float(thread_size))), 1)
    exp_fast_kernel(d_a, d_out, numpy.int32(d_a.size),
                    block=(thread_size, 1, 1), grid=(block_size, 1, 1))
    return d_out
def update_momentum(self, factor):
    """
    update_momentum - multiply the wavefunction by exp(-i * factor * m * V)

    Parameters
    ----------
    factor - scales the exponent; essentially the size of the time step dt

    Returns
    -------
    None.
    """
    if self.gpu:
        # Device path: g_psi_hat is used as scratch space for the exponent
        # and then for the phase factor itself.
        self.g_psi_hat[...] = -1.0j * factor * self.m * self.g_V
        cumath.exp(self.g_psi_hat, out=self.g_psi_hat)
        self.g_psi *= self.g_psi_hat
        return

    # Host path: same update with NumPy, applied in place.
    exponent = -1j * factor * self.m * self.V
    self.psi *= np.exp(exponent)
def thunk():
    """Evaluate the scalar output on the GPU and store it in ``outputs[0][0]``.

    Reads ``inputs`` (three host arrays) and the enclosing-scope names ``A``,
    ``b``, ``x``, ``depth``, ``self``, ``node`` and ``compute_map``. Marks
    every variable in ``node.outputs`` as computed.
    """
    def normalised_exp(x_dev):
        # exp(x_dev . A + b), each row divided by its sum along axis 1.
        scores = cumath.exp(misc.add(linalg.dot(x_dev, A), b))
        return misc.divide(scores, misc.sum(scores, axis=1, keepdims=True))

    # Upload the inputs; alpha becomes a column, x_t / x_f take the first
    # slice along the leading axis.
    alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
    x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
    x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])

    Xt = normalised_exp(x_t)
    Xf = normalised_exp(x_f)

    # Mixture weighted by alpha and 1 - alpha, then its log with the
    # per-row sum / self.n removed.
    mix = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
    log_mix = cumath.log(mix)
    log_offset = misc.sum(log_mix, axis=1, keepdims=True) / self.n
    centred = misc.subtract(log_mix, log_offset)

    data_term = misc.sum(x * centred, axis=1)
    norm_term = (self.n + depth) * cumath.log(misc.sum(mix, axis=1))
    depth_term = depth * log_offset

    outputs[0][0] = misc.sum(data_term - norm_term + depth_term).get()

    for out_var in node.outputs:
        compute_map[out_var][0] = True
def _fprop(self, X, X_space):
    """Return ``exp(X . W + b)`` divided by its sum over the classes.

    The bias is broadcast to the batch extent of ``X_space``; the normaliser
    is obtained by multiplying with ``self._sum_vector_classes`` and is then
    broadcast back to ``self.n_classes`` before the in-place division.
    """
    activations = scikits.cuda.linalg.dot(X, self.W)

    batch_extent = X_space.get_extent('b')
    bias, bias_space = self._b_space.broadcast(self.b, b=batch_extent)

    Y = cumath.exp(activations + bias)

    # Normaliser with extent w=1, expanded to one value per class.
    normaliser = scikits.cuda.linalg.dot(Y, self._sum_vector_classes)
    normaliser_space = bias_space.with_extents(w=1)
    normaliser, normaliser_space = normaliser_space.broadcast(
        normaliser, w=self.n_classes)

    Y /= normaliser
    return Y
def f(A_t, A_w, dz = delta_z):
    """Advance the field by one step of length ``dz``.

    The step is: frequency-domain factor for ``dz/2``, time-domain factor
    for ``dz``, optional nonlinear and apodization terms, then the
    frequency-domain factor for ``dz/2`` again.

    Parameters
    ----------
    A_t, A_w
        Time-domain and frequency-domain arrays. They are stored on the
        function as ``f.A_t`` / ``f.A_w`` and updated by the helpers.
    dz
        Step length. The exponential factors are cached on the function
        object and rebuilt only when ``dz`` differs from ``f.delta_z``
        (which must have been initialised by the enclosing code).

    Returns
    -------
    tuple
        ``(f.A_t, f.A_w)`` after the step.
    """
    if f.delta_z != dz:
        f.w_exp = cumath.exp(-1j * dz/2. * w_op)
        f.t_exp = cumath.exp(-1j * dz * t_op)
        f.delta_z = dz

    ## Dispersion (I pass)
    f.A_t = A_t
    f.A_w = A_w
    prod(A_w, f.w_exp, A_w)
    ifft_g(f.A_w, f.A_t)  ## Scale factor included in fft_g

    ## Constant potential term
    prod(f.A_t, f.t_exp, f.A_t)

    ## Nonlinear operator as intensity dependency
    # Uses dz (not the outer delta_z) so the nonlinear phase matches the
    # step length used for the cached factors above.
    if nlin != 0:
        f.A_t = f.A_t * cumath.exp(-1j * dz * nlin * f.A_t * f.A_t.conj())

    ## Additional nonlinear terms as a function t_nl_op(A(t),dt,z)
    if t_nl_op is not None:
        f.A_t = f.A_t * cumath.exp(-1j * dz * t_nl_op(f.A_t, dt, z0 + dz/2))

    ## Apodization
    if apod:
        prod(f.A_t, apod_array, f.A_t)

    fft_g(f.A_t, f.A_w)  ## Scale factor included in fft_g

    ## Dispersion (II pass)
    prod(f.A_w, f.w_exp, f.A_w)
    ifft_g(f.A_w, f.A_t)  ## Scale factor included in fft_g

    return f.A_t, f.A_w
def shift_trev_freq(self):
    """GPU implementation of shift_trev_freq.

    Multiplies the transform of ``dev_mtw_memory`` by
    ``exp(dev_omegaj_mtw * t_rev)``, transforms back into a freshly
    requested ``dev_mtw_memory`` and zeroes its last ``buffer_size`` entries.
    """
    turn = self.RFParams.counter[0]
    t_rev = self.RFParams.t_rev[turn]
    owner = id(self)

    # Apply the phase factor in the frequency domain.
    spectrum = bm.rfft(self.dev_mtw_memory, self.n_mtw_fft)
    spectrum *= cm.exp(self.dev_omegaj_mtw * t_rev)

    # Request the destination buffer, then copy the first n_mtw_memory
    # samples of the inverse transform into it.
    self.dev_mtw_memory = get_gpuarray(
        (self.n_mtw_memory, bm.precision.real_t, owner, 'mtw_m'))
    shifted = bm.irfft(spectrum, caller_id=owner)
    gpu_copy_d2d(self.dev_mtw_memory, shifted,
                 range=range(self.n_mtw_memory))

    # Clear the trailing buffer region.
    tail = slice(-int(self.buffer_size), None)
    set_zero_real(self.dev_mtw_memory, slice=tail)
def shift_trev_freq(self):
    """
    Method to shift the induced voltage by a revolution period in the
    frequency domain

    The transform of ``dev_mtw_memory`` is multiplied by
    ``exp(dev_omegaj_mtw * t_rev)``, transformed back into a freshly
    requested ``dev_mtw_memory``, and the last ``buffer_size`` entries are
    set to zero.
    """
    t_rev = self.RFParams.t_rev[self.RFParams.counter[0]]

    # Shift in frequency domain
    dev_induced_voltage_f = bm.rfft(self.dev_mtw_memory, self.n_mtw_fft)
    dev_induced_voltage_f *= cm.exp(self.dev_omegaj_mtw * t_rev)

    self.dev_mtw_memory = get_gpuarray(
        (self.n_mtw_memory, bm.precision.real_t, id(self), 'mtw_m'))

    # caller_id must be id(self), the same key used for get_gpuarray above.
    # The previous self(id) tried to call the instance with the builtin id.
    dummy = bm.irfft(dev_induced_voltage_f, caller_id=id(self))
    gpu_copy_d2d(self.dev_mtw_memory, dummy,
                 range=range(0, self.n_mtw_memory))

    # Clear the trailing buffer region.
    set_zero_real(self.dev_mtw_memory,
                  slice=slice(-int(self.buffer_size), None, None))
GPU_find_max_in_shrmem(all_l_rhots_gpu, grid=grd, block=blk, shared=int(max_tpb*8)) # Indexes are not contiguous griddimx = int(nclmns / max_tpb) griddimy = int(nsamps) # One thread per sample-time grd = (griddimx, griddimy, 1) blk = (max_tpb, 1, 1) maxes = np.array(all_l_rhots_gpu[:,0][1::nmodes].get()).astype(np.float64) maxes_gpu = gpuarray.to_gpu(maxes) GPU_bcast_vec_to_matrix(all_l_rhots_gpu, -maxes_gpu, grid=grd, block=blk, shared=8) # ***** THIS IS CORRECT AND WORKING UP THROUGH HERE AS OF AUGUST 10TH 2016 ***** ''' Marginalize over Time ''' all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu) # exponentiate GPU_nv_reduc(all_l_rhots_gpu) # sum over time lnL_gpu = maxes_gpu + cumath.log(all_l_rhots_gpu) # TIMES DELTA T FIXME
# Timing fragment (Python 2 print statements): every section is bracketed by
# the two events and reported as time_till(end) * 1e-3, labelled as seconds.
start = drv.Event()
end = drv.Event()

x = np.random.normal(size=N)

# Host -> device transfer.
start.record()
dX = gpuarray.to_gpu(x)
end.record()
end.synchronize()
print "Transfer to GPU time: %fs" % (start.time_till(end) * 1e-3)

print "Timing vectorized exponentiation:"

# Element-wise exp on the device.
start.record()
dexpX = cumath.exp(dX)
end.record()
end.synchronize()
print "GPU array calc time: %fs" % (start.time_till(end) * 1e-3)

# Same computation with NumPy on the host, timed with the same events.
start.record()
expX = np.exp(x)
end.record()
end.synchronize()
print "CPU calc time: %fs" % (start.time_till(end) * 1e-3)

print "Timing vectorized dot product/sum of squares:"

# Dot product on the device; the matching synchronize/print is outside this
# excerpt.
start.record()
gpuarray.dot(dX, dX)
end.record()
def demosaick_gpu(img):
    """Run the two-path demosaicking computation on the GPU.

    ``img`` is uploaded to the device. Path 1 works on patches of the log
    image, path 2 on patches of the raw image; the two results are
    multiplied, summed along axis 1, clamped to [0, 1] and passed through
    ``p2im`` on the host. Weights come from the module-level ``_wts``.
    """
    img = gp.to_gpu(img)
    # Path-2 patches are taken before the in-place log below; path-1 patches
    # are taken after it.
    p2x = im2col(img, _i2c2)
    cm.log(img + _eps, out=img)
    p1x = im2col(img, _i2c1)

    # Leading two dimensions of each patch array.
    wA = p1x.shape[0]
    wB = p2x.shape[0]
    hA = p1x.shape[1]
    hB = p2x.shape[1]

    # Path 1
    # Linear map, exp (undoing the log), second linear map plus bias.
    p1x = p1x.reshape([wA * hA, 576])
    p1y = lg.dot(p1x, _wts.int1)
    cm.exp(p1y, out=p1y)

    p1y = p1y.reshape([wA * hA * 64, 3 * _ofac])
    p1x = lg.dot(p1y, _wts.int2)
    msc.add_matvec(p1x, _wts.int2b, out=p1x)
    p1x = p1x.reshape([wA * hA * 64 * 3, _ofac])

    # Path 2
    # conv1: matrix product + bias, then clamp below at zero.
    p2x = p2x.reshape([wB * hB, 64])
    p2y = lg.dot(p2x, _wts.c1)
    msc.add_matvec(p2y, _wts.c1b, out=p2y)
    gp.maximum(p2y, 0., p2y)
    p2y = p2y.reshape([wB, hB, _numsel])

    # conv2: the four one-pixel-shifted crops of p2y are copied into the
    # scratch buffer pTT in turn and each is multiplied by its own weight
    # matrix; the first product creates p2x, the other three are added to it
    # (presumably add_dot accumulates into its third argument -- confirm).
    shI = [wB - 1, hB - 1, _numsel]
    shM = [(wB - 1) * (hB - 1), _numsel]

    # NOTE(review): this allocation is replaced by lg.dot below before it is
    # used, and the first reshape(shI) of pTT is a no-op.
    p2x = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2x = lg.dot(pTT, _wts.c200)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c201, p2x)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c210, p2x)

    pTT = pTT.reshape(shI)
    pTT[...] = p2y[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c211, p2x)

    msc.add_matvec(p2x, _wts.c2b, out=p2x)
    gp.maximum(p2x, 0., p2x)
    p2x = p2x.reshape(shI)

    # conv 3: same four-crop scheme, reading p2x and writing p2y.
    shI = [wB - 2, hB - 2, _numsel]
    shM = [(wB - 2) * (hB - 2), _numsel]

    # NOTE(review): as above, this p2y allocation is replaced before use.
    p2y = gp.empty(shM, dtype=np.float32)
    pTT = gp.empty(shI, dtype=np.float32)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 0:-1, :]
    pTT = pTT.reshape(shM)
    p2y = lg.dot(pTT, _wts.c300)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[0:-1, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c301, p2y)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 0:-1, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c310, p2y)

    pTT = pTT.reshape(shI)
    pTT[...] = p2x[1:, 1:, :]
    pTT = pTT.reshape(shM)
    lg.add_dot(pTT, _wts.c311, p2y)

    msc.add_matvec(p2y, _wts.c3b, out=p2y)
    gp.maximum(p2y, 0., p2y)

    # Output layer of path 2, reshaped to line up with path 1.
    p2x = lg.dot(p2y, _wts.sout)
    msc.add_matvec(p2x, _wts.soutb, out=p2x)
    gp.maximum(p2x, 0., p2x)

    p2x = p2x.reshape(p1x.shape)

    # Combine
    # Element-wise product of the two paths, summed along axis 1 and clamped
    # to [0, 1].
    p1x *= p2x
    p1 = msc.sum(p1x, axis=1)
    gp.maximum(p1, 0., p1)
    gp.minimum(p1, 1., p1)
    p1 = p1.reshape([wA, hA, 64 * 3])

    # Back to the host; p2im produces the returned image.
    im = p2im(p1.get())

    return im
# Timing fragment (Python 2 print statements). `start` and `end` are events
# created above this excerpt; each section reports time_till(end) * 1e-3,
# labelled as seconds.
x = np.random.normal(size = n)
x_short = np.random.normal(size = 8)

# Host -> device transfer of both arrays.
start.record()
dev_x = gpuarray.to_gpu(x)
dev_x_short = gpuarray.to_gpu(x_short)
end.record()
end.synchronize()
print "Transfer to GPU time: %fs" %(start.time_till(end)*1e-3)

print "Timing vectorized exponentiation:"

# First exp call, on the 8-element array, reported as "initial" (presumably
# to separate one-off start-up cost from the measurement below -- confirm).
start.record()
dev_expx_short = cumath.exp(dev_x_short)
end.record()
end.synchronize()
print "GPU array calc time (initial): %fs" %(start.time_till(end)*1e-3)

# exp on the full array.
start.record()
dev_expx = cumath.exp(dev_x)
end.record()
end.synchronize()
print "GPU array calc time: %fs" %(start.time_till(end)*1e-3)

# Same computation with NumPy on the host, timed with the same events.
start.record()
exp_x = np.exp(x)
end.record()
end.synchronize()
print "CPU calc time: %fs" %(start.time_till(end)*1e-3)
def bptt(self, data, temperature, length):
    """Full back propagation through time over ``length`` steps.

    A forward pass (GPU or CPU, chosen by ``self.gpu``) records every gate
    pre-activation and activation, the cell and hidden states and the
    predictions. A backward pass from the last step to the first then
    accumulates the gradients. ``self.c`` and ``self.h`` are left at their
    values after the last forward step.

    Parameters
    ----------
    data
        Indexable sequence; ``data[i]`` is the input at step ``i`` and
        ``data[i + 1]`` the target, so at least ``length + 1`` entries are
        read.
    temperature
        Passed through to ``Softmax``.
    length
        Number of steps to unroll.

    Returns
    -------
    tuple
        ``(D_forget_w, D_forget_b, D_sel_w, D_sel_b, D_add_w, D_add_b,
        D_write_w, D_write_b, D_weights, D_biases, loss / length)``, with
        every gradient clipped to [-30, 30].
    """
    loss = 0

    # Gradient accumulators.
    D_forget_w = np.zeros((self.hidden_s, self.input_s + self.hidden_s))
    D_forget_b = np.zeros((self.hidden_s, 1))
    D_sel_w = np.zeros((self.hidden_s, self.input_s + self.hidden_s))
    D_sel_b = np.zeros((self.hidden_s, 1))
    D_add_w = np.zeros((self.hidden_s, self.input_s + self.hidden_s))
    D_add_b = np.zeros((self.hidden_s, 1))
    D_write_w = np.zeros((self.hidden_s, self.input_s + self.hidden_s))
    D_write_b = np.zeros((self.hidden_s, 1))
    D_biases = np.zeros((self.output_s, 1))
    D_weights = np.zeros((self.output_s, self.hidden_s))

    # Per-step history recorded by the forward pass: *_in holds gate
    # pre-activations, *_ holds gate activations.
    hANDx = []
    forget_in = []
    sel_in = []
    add_in = []
    write_in = []
    c_hist = []
    h_hist = []
    forget_ = []
    sel_ = []
    add_ = []
    write_ = []
    prediction = []

    # Cell state before the first step; needed by the forget gate at i == 0.
    c_init = np.copy(self.c)
    E_over_write_next = np.zeros((1, self.hidden_s))
    E_over_c_next = np.zeros((1, self.hidden_s))

    # first forward propagation
    if self.gpu:
        # in gpu mode: the four gates share one stacked weight matrix, in
        # the order forget, select, write, add
        all_weights = np.concatenate((self.forget_w, self.sel_w, self.write_w, self.add_w))
        all_biases = np.concatenate((self.forget_b, self.sel_b, self.write_b, self.add_b))
        all_weights_gpu = gpu.to_gpu(all_weights.astype(np.float32))
        all_biases_gpu = gpu.to_gpu(all_biases.astype(np.float32))
        z = gpu.zeros((self.hidden_s * 4, 1), np.float32)
        for i in range(length):
            x = data[i]
            # obtain z
            hx = np.concatenate((self.h, x))
            hANDx.append(hx)
            hx_gpu = gpu.to_gpu(hx.astype(np.float32))
            self.kernel(all_weights_gpu, hx_gpu, z, grid=(self.num_block, 1, 1),
                        block=(self.num_thread_per_block, 1, 1))
            z += all_biases_gpu
            z_cpu = z.get()
            forget_in.append(z_cpu[:self.hidden_s, :1])
            sel_in.append(z_cpu[self.hidden_s:self.hidden_s * 2, :1])
            write_in.append(z_cpu[self.hidden_s * 2:self.hidden_s * 3, :1])
            add_in.append(z_cpu[self.hidden_s * 3:, :1])
            # non-linearity
            z[:self.hidden_s * 3, :1] = 1.0 / (gpum.exp(-1 * z[:self.hidden_s * 3, :1]) + 1.0)
            z[self.hidden_s * 3:, :1] = 1.7159 * gpum.tanh(2 / 3.0 * z[self.hidden_s * 3:, :1])
            z_cpu = z.get()
            forget_.append(z_cpu[:self.hidden_s, :1])
            sel_.append(z_cpu[self.hidden_s:self.hidden_s * 2, :1])
            write_.append(z_cpu[self.hidden_s * 2:self.hidden_s * 3, :1])
            add_.append(z_cpu[self.hidden_s * 3:, :1])
            # update cell and hidden
            self.c = z_cpu[:self.hidden_s, :1] * self.c + z_cpu[self.hidden_s:self.hidden_s * 2, :1] \
                * z_cpu[self.hidden_s * 3:, :1]
            self.h = z_cpu[self.hidden_s * 2: self.hidden_s * 3, :1] * Tanh(self.c)
            # "+ 0" stores a copy rather than a reference
            c_hist.append(self.c + 0)
            h_hist.append(self.h + 0)
            # output
            res = Softmax(np.dot(self.weights, self.h) + self.biases, temperature)
            prediction.append(res)
            loss += -np.log(res[np.argmax(data[i + 1]), 0])
    else:
        for i in range(length):
            x = data[i]
            info = np.concatenate((self.h, x), axis=0)
            hANDx.append(info)
            a = np.dot(self.forget_w, info) + self.forget_b
            forget_in.append(a)
            forget = Sigmoid(a)
            forget_.append(forget)
            a = np.dot(self.sel_w, info) + self.sel_b
            sel_in.append(a)
            select = Sigmoid(a)
            sel_.append(select)
            a = np.dot(self.add_w, info) + self.add_b
            add_in.append(a)
            add = Tanh(a)
            add_.append(add)
            self.c = self.c * forget + select * add
            a = np.dot(self.write_w, info) + self.write_b
            write_in.append(a)
            write = Sigmoid(a)
            write_.append(write)
            c_hist.append(np.copy(self.c))
            self.h = write * Tanh(self.c)
            h_hist.append(np.copy(self.h))
            a = np.dot(self.weights, self.h) + self.biases
            res = Softmax(a, temperature)
            prediction.append(res)
            loss += -np.log(res[np.argmax(data[i+1]), 0])

    # back propagation through time
    for i in range(length-1, -1, -1):
        # stacked (hidden, input) vector of this step as a row
        hx_t = np.transpose(hANDx[i])

        # obtain current layer delta
        delta = prediction[i] - data[i+1]
        D_biases += delta
        D_weights += np.dot(delta, h_hist[i].T)

        # obtain E_over_h w.r.t. current layer delta
        delta_h = np.dot(delta.T, self.weights)

        # obtain E_over_h w.r.t. write gate (of the following step)
        if i == length-1:
            write_h = np.zeros((1, self.hidden_s))
        else:
            diag_sigmoid_grad = numpy.matlib.repmat(sigmoid_grad(write_in[i+1]), 1, self.hidden_s)
            # the first hidden_s columns of a gate matrix multiply the hidden state
            write_w_part = self.write_w[:, :self.hidden_s]
            write_over_h = diag_sigmoid_grad * write_w_part
            write_h = np.dot(E_over_write_next, write_over_h)

        # obtain E_over_h w.r.t. memory cell (of the following step)
        if i == length-1:
            c_h = np.zeros((1, self.hidden_s))
        else:
            # part A: forget_over_h
            diag_sigmoid_grad = numpy.matlib.repmat(sigmoid_grad(forget_in[i+1]), 1, self.hidden_s)
            forget_w_part = self.forget_w[:, :self.hidden_s]
            forget_over_h = diag_sigmoid_grad * forget_w_part
            forget_over_h *= numpy.matlib.repmat(c_hist[i], 1, self.hidden_s)
            # part B: sel_over_h
            diag_sigmoid_grad = numpy.matlib.repmat(sigmoid_grad(sel_in[i+1]), 1, self.hidden_s)
            sel_w_part = self.sel_w[:, :self.hidden_s]
            sel_over_h = diag_sigmoid_grad * sel_w_part
            sel_over_h *= numpy.matlib.repmat(add_[i+1], 1, self.hidden_s)
            # part C: add_over_h
            # The add gate is a Tanh unit (its own update below uses
            # tanh_grad), so its derivative here must be tanh_grad as well;
            # sigmoid_grad was used before.
            diag_tanh_grad = numpy.matlib.repmat(tanh_grad(add_in[i+1]), 1, self.hidden_s)
            add_w_part = self.add_w[:, :self.hidden_s]
            add_over_h = diag_tanh_grad * add_w_part
            add_over_h *= numpy.matlib.repmat(sel_[i+1], 1, self.hidden_s)
            # finally c_h
            c_over_h = forget_over_h + sel_over_h + add_over_h
            c_h = np.dot(E_over_c_next, c_over_h)

        # obtain E_over_h and relevant gradients
        E_over_h = delta_h + write_h + c_h

        # write gate update
        update_write = E_over_h * np.transpose(Tanh(c_hist[i]))
        update_write *= np.transpose(sigmoid_grad(write_in[i]))
        D_write_b += update_write.T
        D_write_w += np.dot(update_write.T, hx_t)

        # memory cell update, with E_over_c recursively, and update E_over_c_next as well
        E_over_c = E_over_h * np.transpose(write_[i]) * np.transpose(tanh_grad(c_hist[i]))
        if i == length-1:
            E_over_c_next = E_over_c
        else:
            E_over_c += E_over_c_next * np.transpose(forget_[i+1])
            E_over_c_next = E_over_c

        # forget gate update
        if i == 0:
            c_last = c_init
        else:
            c_last = c_hist[i-1]
        update_forget = E_over_c * np.transpose(c_last) * np.transpose(sigmoid_grad(forget_in[i]))
        D_forget_b += update_forget.T
        D_forget_w += np.dot(update_forget.T, hx_t)

        # sel update
        update_sel = E_over_c * np.transpose(add_[i])
        update_sel *= np.transpose(sigmoid_grad(sel_in[i]))
        D_sel_b += update_sel.T
        D_sel_w += np.dot(update_sel.T, hx_t)

        # add update
        update_add = E_over_c * np.transpose(sel_[i])
        update_add *= np.transpose(tanh_grad(add_in[i]))
        D_add_b += update_add.T
        D_add_w += np.dot(update_add.T, hx_t)

        # update E_over_write
        E_over_write_next = E_over_h * np.transpose(Tanh(c_hist[i]))

    # clip every gradient in place
    for each in [D_forget_w, D_forget_b, D_sel_w, D_sel_b, D_add_w, D_add_b,
                 D_write_w, D_write_b, D_weights, D_biases]:
        np.clip(each, -30, 30, out=each)

    return D_forget_w, D_forget_b, D_sel_w, D_sel_b, D_add_w, D_add_b, \
        D_write_w, D_write_b, D_weights, D_biases, loss/(length+0.0)
def softmax(mat):
    """Return ``exp(mat - logsumexp(mat))``.

    ``add_vec_to_mat`` is called with ``inplace=True``, so ``mat`` itself is
    presumably overwritten with the shifted values -- callers that still
    need the original should pass a copy (confirm against the helper).
    """
    negated_log_norm = -logsumexp(mat)
    shifted = add_vec_to_mat(mat, negated_log_norm, inplace=True)
    return cumath.exp(shifted)
def marginalize_all_lnL(mod, all_l_rhots_gpu, nmodes, nsamps, ntimes, nclmns, delta_t):
    """Marginalise over time with a max-shifted exp / sum / log on the GPU.

    The kernels are looked up in ``mod``. The maxima found on the device are
    subtracted, the matrix is exponentiated and summed, and the result is
    ``maxes + log((sum - padwidth) * delta_t)``, returned as a host array.
    """
    max_tpb = 1024

    # Recopy constants into device constant memory: look up all four symbols
    # first, then upload them in the same order.
    constants = (("nmodes", nmodes), ("nsamps", nsamps),
                 ("ntimes", ntimes), ("nclmns", nclmns))
    symbols = [mod.get_global(label)[0] for label, _ in constants]
    for symbol, (_, value) in zip(symbols, constants):
        cuda.memcpy_htod(symbol, np.array(value, ndmin=1).astype(np.int32))

    # Get GPU functions
    find_max = mod.get_function("find_max_in_shrmem")
    sum_reduce = mod.get_function("nv_reduc")
    shift_by_vec = mod.get_function("bcast_vec_to_matrix")

    def pow2_at_least(count):
        return 2**(count - 1).bit_length()

    # First-pass launch shape: one thread per sample-time.
    n_tiles = int(nclmns / max_tpb)
    n_series = int(nsamps)
    tile_grid = (n_tiles, n_series, 1)
    tile_block = (max_tpb, 1, 1)

    # Second-pass launch shape: only as many threads as the first pass had
    # blocks in x. This works as long as rhoTS is not longer than 1024^2.
    n_merge = pow2_at_least(n_tiles)
    merge_grid = (1, n_series, 1)
    merge_block = (n_merge, 1, 1)

    print("Finding Maximum...\n")
    find_max(all_l_rhots_gpu, grid=tile_grid, block=tile_block,
             shared=int(max_tpb*8))
    find_max(all_l_rhots_gpu, grid=merge_grid, block=merge_block,
             shared=int(n_merge*8))

    # Collect the maxes through the host
    maxes = np.array(all_l_rhots_gpu[:, 0][nmodes-2::nmodes].get()).astype(np.float64)
    maxes_gpu = gpuarray.to_gpu(maxes)

    shift_by_vec(all_l_rhots_gpu, -maxes_gpu, grid=tile_grid,
                 block=tile_block, shared=8)

    # Exponentiating the zero padding adds one extra 1 per padded column to
    # the sum; padwidth is the amount subtracted to offset that.
    padwidth = nclmns - ntimes
    all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu)

    print("Reducing final answer...\n")
    sum_reduce(all_l_rhots_gpu, grid=tile_grid, block=tile_block,
               shared=max_tpb*8)
    sum_reduce(all_l_rhots_gpu, grid=merge_grid, block=merge_block,
               shared=n_merge*8)

    sums = (all_l_rhots_gpu[:, 0][nmodes-1::nmodes].get() - padwidth).astype(np.float64)
    lnL_gpu = gpuarray.to_gpu(sums)
    lnL_gpu = maxes_gpu + cumath.log(lnL_gpu*delta_t)

    return lnL_gpu.get()
# Main time-stepping loop (fragment; every *_gpu array, both FFT plans and
# N_TIMESTEPS are defined above this excerpt).
for step in xrange(N_TIMESTEPS):
    # print step
    # Implementing split-step method
    # Update wavefunction and reservoir, record density

    # Multiply by kineticFactorHalf_gpu between a forward and an inverse FFT.
    cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
    psi_gpu *= kineticFactorHalf_gpu
    cu_fft.ifft(psi_gpu, psi_gpu, plan_inverse, scale=True)

    # |psi|^2, computed as the real part of psi * conj(psi).
    # currentDensity_gpu = abs(psi_gpu) ** 2
    # currentDensity_gpu = psi_gpu.real **2 + psi_gpu.imag ** 2
    currentDensity_gpu = (psi_gpu * psi_gpu.conj()).real
    # modSquared.prepared_call(grid, block, psi_gpu.gpudata,
    #                          currentDensity_gpu.gpudata, 1024)

    # Reservoir update.
    # NOTE(review): the commented-out formula below has "+ Rdt * density",
    # the active code has "- Rdt * density" -- confirm which sign is intended.
    # n_gpu *= cumath.exp(-gammaRdt_gpu + Rdt_gpu * currentDensity_gpu)
    n_gpu *= cumath.exp(
        misc.add(-gammaRdt_gpu, -misc.multiply(Rdt_gpu, currentDensity_gpu)))
    n_gpu += Pdt_gpu

    # psi *= exp(First * n + Second * density + Third).
    psi_gpu *= cumath.exp(
        misc.add(
            misc.add(
                misc.multiply(expFactorPolFirst_gpu, n_gpu),
                misc.multiply(expFactorPolSecond_gpu, currentDensity_gpu)),
            expFactorPolThird_gpu))
    # psiNonlinear.prepared_call(grid, block, expFactorPolFirst,
    #                            expFactorPolSecond, expFactorPolThird,
    #                            psi_gpu.gpudata, n_gpu.gpudata,
    #                            currentDensity_gpu.gpudata, 1024)

    # Forward FFT again; the rest of the loop body is outside this excerpt.
    cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
def batch_backward_gpu(self, X, Y, mini_batch_size):
    """the backward propagation algorithm to compute the gradient of the cost
    function, with a batched version for using CUDA-based gpu

    Parameters
    ----------
    X
        Host array of inputs; it is cast to float32 and uploaded. The
        pre-activation buffers have ``mini_batch_size`` columns, so
        presumably there is one column per sample -- confirm.
    Y
        Host array of targets, subtracted from the softmax output.
    mini_batch_size
        Number of columns used for every intermediate buffer.

    Returns
    -------
    tuple
        ``(D_biases, D_weights)``: two lists of host arrays, one entry per
        layer, in the same order as ``self.biases`` / ``self.weights``.
    """
    D_biases = [np.zeros(b.shape) for b in self.biases]
    D_weights = [np.zeros(w.shape) for w in self.weights]
    weights_gpu = [gpu.to_gpu(self.weights[i].astype(np.float32)) for i in range(self.num_layers - 1)]

    # feed-forward
    activation = gpu.to_gpu(X.astype(np.float32))
    activations = [activation]
    zs = []
    count = 0
    for b, w in zip(self.biases, self.weights):
        # gpu mode variables
        Z = gpu.zeros((self.sizes[count + 1], mini_batch_size), np.float32)
        kernel = self.kernel1[count]
        kernel(weights_gpu[count], activation, Z, grid=(self.num_block1[count], 1, 1),
               block=(self.num_thread_per_block, 1, 1))
        # add the bias to every column
        Z += gpu.to_gpu(mat.repmat(b, 1, mini_batch_size).astype(np.float32))
        zs.append(Z)
        if count < self.num_layers - 2:
            # hidden layer: act selects sigmoid (0), scaled tanh (1) or the
            # custom kernel3 activation (anything else)
            if self.act == 0:
                activation = 1.0 / (1.0 + gpum.exp(-1 * Z))
                activations.append(activation)
            elif self.act == 1:
                activation = 1.7159 * gpum.tanh(2.0 / 3.0 * Z)
                activations.append(activation)
            else:
                activation = gpu.zeros(Z.shape, np.float32)
                kernel = self.kernel3[count]
                kernel(Z, activation, grid=(self.num_block3[count], 1, 1),
                       block=(self.num_thread_per_block, 1, 1))
                activations.append(activation)
        else:
            # output layer: softmax on the host
            activation = Softmax(Z.get())
            activations.append(activation)
        count += 1

    # backward
    # first the special case for output layer (due to softmax layer)
    delta = activations[-1] - Y
    D_biases[-1] = np.array([np.sum(delta, axis=1)]).T
    delta = gpu.to_gpu(delta.astype(np.float32))
    D_weights_gpu = gpu.zeros(D_weights[-1].shape, np.float32)
    kernel = self.kernel5[-1]
    # these are for handling the bug in PyCuda library regarding matrix transpose
    a_t = gpu.to_gpu(np.zeros((activations[-2].shape[1], activations[-2].shape[0]), np.float32) +
                     gpu.transpose(activations[-2]).get())
    # execute the kernel to update D_weights[-1]
    kernel(delta, a_t, D_weights_gpu, grid=(self.num_block5[-1], 1, 1),
           block=(self.num_thread_per_block, 1, 1))
    D_weights[-1] = D_weights_gpu.get()

    # then compute the derivative of other hidden layers, from large to small
    count = 0
    for l in range(2, self.num_layers):
        Z = zs[-l]
        if self.act == 0:
            # Evaluate the sigmoid once and reuse it; the derivative is
            # s * (1 - s).
            sig = 1.0 / (1.0 + gpum.exp(-1 * Z))
            grad = sig * (1 - sig)
        elif self.act == 1:
            grad = 1.7159 * 2 / 3.0 * (1 - (gpum.tanh(2.0/3.0 * Z)) ** 2)
        else:
            grad = gpu.zeros(Z.shape, np.float32)
            kernel = self.kernel4[count]
            kernel(Z, grad, grid=(self.num_block4[count], 1, 1),
                   block=(self.num_thread_per_block, 1, 1))

        # product = (transposed weights of the following layer) x delta
        product = gpu.zeros((self.weights[-l+1].shape[1], mini_batch_size), np.float32)
        kernel = self.kernel2[count]
        weights_t = gpu.to_gpu((np.zeros((weights_gpu[-l + 1].shape[1], weights_gpu[-l + 1].shape[0])) +
                                self.weights[-l+1].T).astype(np.float32))
        kernel(weights_t, delta, product, grid=(self.num_block2[count], 1, 1),
               block=(self.num_thread_per_block, 1, 1))
        delta = product * grad

        # for each weights and biases
        D_biases[-l] = np.array([np.sum(delta.get(), axis=1)]).T
        kernel = self.kernel5[-l]
        a_t = gpu.to_gpu(np.zeros((activations[-l-1].shape[1], activations[-l-1].shape[0]), np.float32) +
                         gpu.transpose(activations[-l-1]).get())
        D_weights_gpu = gpu.zeros(D_weights[-l].shape, np.float32)
        kernel(delta, a_t, D_weights_gpu, grid=(self.num_block5[-l], 1, 1),
               block=(self.num_thread_per_block, 1, 1))
        D_weights[-l] = D_weights_gpu.get()
        count += 1

    return D_biases, D_weights
# Main time-stepping loop (fragment; every *_gpu array, both FFT plans,
# `spectrum` and N_TIMESTEPS are defined above this excerpt).
for step in xrange(N_TIMESTEPS):
    # print step
    # Implementing split-step method
    # Update wavefunction and reservoir, record density

    # Multiply by kineticFactorHalf_gpu between a forward and an inverse FFT.
    cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
    psi_gpu *= kineticFactorHalf_gpu
    cu_fft.ifft(psi_gpu, psi_gpu, plan_inverse, scale=True)

    # |psi|^2, computed as the real part of psi * conj(psi).
    # currentDensity_gpu = abs(psi_gpu) ** 2
    # currentDensity_gpu = psi_gpu.real **2 + psi_gpu.imag ** 2
    currentDensity_gpu = (psi_gpu * psi_gpu.conj()).real
    # modSquared.prepared_call(grid, block, psi_gpu.gpudata,
    #                          currentDensity_gpu.gpudata, 1024)

    # Reservoir update.
    # NOTE(review): the commented-out formula below has "+ Rdt * density",
    # the active code has "- Rdt * density" -- confirm which sign is intended.
    # n_gpu *= cumath.exp(-gammaRdt_gpu + Rdt_gpu * currentDensity_gpu)
    n_gpu *= cumath.exp(misc.add(- gammaRdt_gpu,
                                 - misc.multiply(Rdt_gpu, currentDensity_gpu)))
    n_gpu += Pdt_gpu

    # psi *= exp(First * n + Second * density + Third).
    psi_gpu *= cumath.exp(
        misc.add(
            misc.add(misc.multiply(expFactorPolFirst_gpu, n_gpu),
                     misc.multiply(expFactorPolSecond_gpu, currentDensity_gpu)),
            expFactorPolThird_gpu))
    # psiNonlinear.prepared_call(grid, block, expFactorPolFirst,
    #                            expFactorPolSecond, expFactorPolThird,
    #                            psi_gpu.gpudata, n_gpu.gpudata,
    #                            currentDensity_gpu.gpudata, 1024)

    cu_fft.fft(psi_gpu, psi_gpu, plan_forward)

    # record spectrum
    # (the call below continues past the end of this excerpt)
    drv.memcpy_dtod(spectrum[step, :].gpudata, psi_gpu[N//2, :].gpudata,
def softmax(mat):
    """Return ``exp(mat - logsumexp(mat))`` as a new array.

    ``add_vec_to_mat`` is called without an in-place flag and the result is
    taken from its return value, so ``mat`` is presumably left untouched
    (confirm against the helper).
    """
    # The previous version first allocated an empty_like(mat) buffer that
    # was overwritten before being used; that dead allocation is gone.
    L = logsumexp(mat)
    shifted = add_vec_to_mat(mat, L, substract=True)
    return cumath.exp(shifted)
# Timing fragment (Python 2 print statements): every section is bracketed by
# the two events and reported as time_till(end) * 1e-3, labelled as seconds.
start = drv.Event()
end = drv.Event()

x = np.random.normal(size = N)

# Host -> device transfer.
start.record()
dX = gpuarray.to_gpu(x)
end.record()
end.synchronize()
print "Transfer to GPU time: %fs" %(start.time_till(end)*1e-3)

print "Timing vectorized exponentiation:"

# Element-wise exp on the device.
start.record()
dexpX = cumath.exp(dX)
end.record()
end.synchronize()
print "GPU array calc time: %fs" %(start.time_till(end)*1e-3)

# Same computation with NumPy on the host, timed with the same events.
start.record()
expX = np.exp(x)
end.record()
end.synchronize()
print "CPU calc time: %fs" %(start.time_till(end)*1e-3)

print "Timing vectorized dot product/sum of squares:"

# Dot product on the device; the matching synchronize/print is outside this
# excerpt.
start.record()
gpuarray.dot(dX,dX)
end.record()
def marginalize_all_lnL(mod, all_l_rhots_gpu, nmodes, nsamps, ntimes, nclmns, delta_t):
    """Marginalise over time on the GPU and return the result on the host.

    Sequence: find the maxima, subtract them, exponentiate, sum, and return
    ``maxes + log((sum - padwidth) * delta_t)``. All kernels and device
    constants are looked up in ``mod``.
    """
    max_tpb = 1024

    # Recopy constants into device constant memory. Every symbol is resolved
    # before the first upload.
    names = ("nmodes", "nsamps", "ntimes", "nclmns")
    values = (nmodes, nsamps, ntimes, nclmns)
    device_symbols = [mod.get_global(name)[0] for name in names]
    for device_symbol, value in zip(device_symbols, values):
        cuda.memcpy_htod(device_symbol,
                         np.array(value, ndmin=1).astype(np.int32))

    # Get GPU functions
    kernel_max = mod.get_function("find_max_in_shrmem")
    kernel_sum = mod.get_function("nv_reduc")
    kernel_shift = mod.get_function("bcast_vec_to_matrix")

    def next_pow2(value):
        return 2**(value - 1).bit_length()

    tiles_x = int(nclmns / max_tpb)
    rows_y = int(nsamps)
    combine_threads = next_pow2(tiles_x)

    # Launch configurations: one thread per sample-time for the per-tile
    # passes; for the combining passes only as many threads as there were
    # blocks in x (valid while rhoTS is not longer than 1024^2).
    per_tile = dict(grid=(tiles_x, rows_y, 1), block=(max_tpb, 1, 1))
    combine = dict(grid=(1, rows_y, 1), block=(combine_threads, 1, 1))

    print("Finding Maximum...\n")
    kernel_max(all_l_rhots_gpu, shared=int(max_tpb * 8), **per_tile)
    kernel_max(all_l_rhots_gpu, shared=int(combine_threads * 8), **combine)

    # Collect the maxes through the host
    host_maxes = all_l_rhots_gpu[:, 0][nmodes - 2::nmodes].get()
    maxes = np.array(host_maxes).astype(np.float64)
    maxes_gpu = gpuarray.to_gpu(maxes)

    kernel_shift(all_l_rhots_gpu, -maxes_gpu, shared=8, **per_tile)

    # Exponentiating the zero padding adds one extra 1 per padded column to
    # the sum; padwidth is the amount subtracted to offset that.
    padwidth = nclmns - ntimes
    all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu)

    print("Reducing final answer...\n")
    kernel_sum(all_l_rhots_gpu, shared=max_tpb * 8, **per_tile)
    kernel_sum(all_l_rhots_gpu, shared=combine_threads * 8, **combine)

    host_sums = all_l_rhots_gpu[:, 0][nmodes - 1::nmodes].get()
    lnL = (host_sums - padwidth).astype(np.float64)
    lnL_gpu = gpuarray.to_gpu(lnL)
    lnL_gpu = maxes_gpu + cumath.log(lnL_gpu * delta_t)

    return lnL_gpu.get()
def logsumexp(mat):
    """Return log(sum(exp(mat))) reduced with axis argument 1.

    The maximum is subtracted before exponentiating and added back to the
    log of the sum, so the exponentials cannot exceed 1.
    """
    peak = max_by_axis(mat, 1)
    shifted = add_vec_to_mat(mat, -peak, 0)
    summed = matrix_sum_out_axis(cumath.exp(shifted), 1)
    return peak + cumath.log(summed)
def exp(self):
    """Return a new ``CUDAArray`` wrapping ``cumath.exp`` of ``self.arr``."""
    exponentiated = cumath.exp(self.arr)
    return CUDAArray(exponentiated)
# Benchmark fragment: times the host-to-device transfer and cumath.exp against
# np.exp.  ``start`` (a second event) and ``n`` are defined before this point.
# The print statements are written as single-argument calls so the script
# parses on Python 3 as well as Python 2; the printed text is unchanged.
end = drv.Event()

x = np.random.normal(size=n)
x_short = np.random.normal(size=8)

# --- Host -> device transfer
start.record()
dev_x = gpuarray.to_gpu(x)
dev_x_short = gpuarray.to_gpu(x_short)
end.record()
end.synchronize()
# time_till() is scaled by 1e-3 before being printed as seconds.
print("Transfer to GPU time: %fs" % (start.time_till(end) * 1e-3))

print("Timing vectorized exponentiation:")

# --- First call, on the 8-element array, reported separately as "initial"
#     (presumably to isolate one-time start-up cost -- TODO confirm).
start.record()
dev_expx_short = cumath.exp(dev_x_short)
end.record()
end.synchronize()
print("GPU array calc time (initial): %fs" % (start.time_till(end) * 1e-3))

# --- Full-size GPU exponentiation
start.record()
dev_expx = cumath.exp(dev_x)
end.record()
end.synchronize()
print("GPU array calc time: %fs" % (start.time_till(end) * 1e-3))

# --- CPU reference; note that it is bracketed by the same CUDA events.
start.record()
exp_x = np.exp(x)
end.record()
end.synchronize()
print("CPU calc time: %fs" % (start.time_till(end) * 1e-3))
for i in range(10): gpu_ind.set(indices[i]) gpuarray.take(probs, gpu_ind, out=selected_probs) utils.scalar_sub(selected_probs, 1.0, selected_probs) gpuarray.multi_put([selected_probs], gpu_ind, out=[probs]) #print probs t1 = time.clock() for i in range(N): # get the softmax probs first utils.max(scores, 1, maxscores, maxscoreids) utils.sub_matvec(scores, maxscores, 0, deltas) cumath.exp(deltas, out=deltas) scm.sum(deltas, 1, sumdeltas) utils.div_matvec(deltas, sumdeltas, 0, probs) # probs.get(cpu_probs) # cpu_probs[np.arange(B), indices[i]] -= 1 # probs.set(cpu_probs) gpu_ind.set(indices[i]) gpuarray.take(probs, gpu_ind, out=selected_probs) utils.scalar_sub(selected_probs, 1.0, selected_probs) gpuarray.multi_put([selected_probs], gpu_ind, out=[probs]) t2 = time.clock() #print probs print 'tdiff = %.3f, per loop = %.6f, wps = %.3f' % ((t2-t1), (t2-t1)/N,
for i in range(10): gpu_ind.set(indices[i]) gpuarray.take(probs, gpu_ind, out=selected_probs) utils.scalar_sub(selected_probs, 1.0, selected_probs) gpuarray.multi_put([selected_probs], gpu_ind, out=[probs]) #print probs t1 = time.clock() for i in range(N): # get the softmax probs first utils.max(scores, 1, maxscores, maxscoreids) utils.sub_matvec(scores, maxscores, 0, deltas) cumath.exp(deltas, out=deltas) scm.sum(deltas, 1, sumdeltas) utils.div_matvec(deltas, sumdeltas, 0, probs) # probs.get(cpu_probs) # cpu_probs[np.arange(B), indices[i]] -= 1 # probs.set(cpu_probs) gpu_ind.set(indices[i]) gpuarray.take(probs, gpu_ind, out=selected_probs) utils.scalar_sub(selected_probs, 1.0, selected_probs) gpuarray.multi_put([selected_probs], gpu_ind, out=[probs]) t2 = time.clock() #print probs print 'tdiff = %.3f, per loop = %.6f, wps = %.3f' % ((t2 - t1),
def register_multiple_images_subpix_cuda(stack, template):
    """Register every frame of ``stack`` to ``template`` on the GPU.

    For each frame the cross-correlation with the template is formed in
    Fourier space, a coarse peak is located on a 2x zero-padded inverse FFT,
    and the peak is then refined with a matrix-multiply DFT on a grid
    upsampled by ``usfac`` (100).  The frame is finally shifted by multiplying
    its spectrum with the matching phase ramp and written back into ``stack``.

    Args:
        stack: 3-D array indexed ``[frame, row, column]``.  Frames are
            uploaded with ``stack.dtype`` and read by the ``load_convert``
            kernel through an ``unsigned short`` pointer (assumes a uint16
            stack -- TODO confirm against callers).
        template: 2-D reference image; its shape must equal
            ``stack.shape[1:]``.

    Returns:
        ``(stack, move_list)``.  ``stack`` holds the registered frames: the
        input array is overwritten in place unless it contained NaNs, in
        which case a copy is made first.  ``move_list`` is a ``(Z, 2)`` float
        array of ``(row_shift, col_shift)`` per frame.

    Raises:
        ValueError: if ``template`` and the frames differ in shape.
    """
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as drv
    import pycuda.cumath as cumath
    import skcuda.fft as cu_fft
    import skcuda.linalg as lin
    import skcuda.cublas as cub
    from numpy import pi, newaxis, floor
    import cmath
    from pycuda.elementwise import ElementwiseKernel
    from pycuda.compiler import SourceModule
    from numpy import conj, abs, arctan2, sqrt, real, imag, shape, zeros, trunc, ceil, floor, fix
    from numpy.fft import fftshift, ifftshift
    fft2, ifft2 = fftn, ifftn = fast_ffts.get_ffts(nthreads=1,
                                                   use_numpy_fft=False)

    # Every kernel flattens the 2-D launch grid into one linear index and
    # guards it against the element count passed as its last argument.
    mod = SourceModule("""
    #include <pycuda-complex.hpp>

    __global__ void load_convert(unsigned short *a, float *b,int f, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ;
        int offset = f * imlen;
        if (idx <imlen)
        {
            b[idx] = (float)a[offset+idx];
        }
    }

    __global__ void convert_export(float *a, unsigned short *b,int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ;
        if (idx <imlen)
        {
            b[idx] = (unsigned short)(a[idx]>0 ? a[idx] : 0) ;
        }
    }

    __global__ void multiply_comp_float(pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ;
        if (idx <imlen)
        {
            z[idx] = x[idx] * y[idx];
        }
    }

    __global__ void calc_conj(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ;
        if (idx <imlen)
        {
            y[idx]._M_re = x[idx]._M_re;
            y[idx]._M_im = -x[idx]._M_im;
        }
    }

    __global__ void convert_multiply(float *x, pycuda::complex<float> *y, float sx, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ;
        if (idx <imlen)
        {
            y[idx]._M_re = 0;
            y[idx]._M_im = x[idx] * sx;
        }
    }

    __global__ void transfer_array(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlenl, int imlen, int nlargeh, int nh)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ;
        int offset = imlenl*3/4;
        if (idx<imlen)
        {
            int target_ind = (offset+(idx/nh)*nlargeh + (idx % nh))%imlenl;
            x[target_ind] = y[idx];
        }
    }

    __global__ void calc_shiftmatrix(float *x, float *y, pycuda::complex<float> *z, float sx, float sy,float dg, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ;
        if (idx <imlen)
        {
            z[idx]._M_re = 0;
            z[idx]._M_im = x[idx] * sx + y[idx] * sy + dg;
        }
    }

    __global__ void sub_float(float *x, float *y, float sv, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ;
        if (idx <imlen)
        {
            x[idx] = y[idx]-sv;
        }
    }
    """)

    load_convert_kernel = mod.get_function('load_convert')
    convert_export_kernel = mod.get_function('convert_export')
    convert_multiply_kernel = mod.get_function('convert_multiply')
    multiply_float_kernel = mod.get_function('multiply_comp_float')
    transfer_array_kernel = mod.get_function('transfer_array')
    calc_shiftmatrix_kernel = mod.get_function('calc_shiftmatrix')
    conj_kernel = mod.get_function('calc_conj')
    sub_float_kernel = mod.get_function('sub_float')

    Z = stack.shape[0]
    M = stack.shape[1]
    N = stack.shape[2]
    max_memsize = 4200000000
    imlen = M * N
    # Element count of the half-spectrum produced by the real-to-complex FFT.
    half_imlen = M * (N // 2 + 1)
    grid_dim = (64, int(imlen / (512 * 64)) + 1, 1)
    block_dim = (512, 1, 1)  # 512 threads per block
    # Number of frames per upload batch so a batch stays within max_memsize.
    stack_bin = int(max_memsize / (M * N * stack.itemsize))
    # Ceiling division: the number of batches that actually contain frames.
    # (floor(Z / stack_bin) + 1 would add an empty batch whenever Z is an
    # exact multiple of stack_bin, and indexing that batch's first frame
    # raises IndexError.)
    stack_ite = (Z + stack_bin - 1) // stack_bin
    usfac = 100  # needs to be bigger than 10

    if not template.shape == stack.shape[1:]:
        raise ValueError("Images must have same shape.")

    # NaN != NaN, so these masks select exactly the NaN entries; work on
    # copies so the caller's arrays are not zeroed.
    if np.any(np.isnan(template)):
        template = template.copy()
        template[template != template] = 0
    if np.any(np.isnan(stack)):
        stack = stack.copy()
        stack[stack != stack] = 0

    mlarge = M * 2
    nlarge = N * 2

    plan_forward = cu_fft.Plan((M, N), np.float32, np.complex64)
    plan_inverse = cu_fft.Plan((M, N), np.complex64, np.float32)
    plan_inverse_big = cu_fft.Plan((mlarge, nlarge), np.complex64, np.float32)

    template_gpu = gpuarray.to_gpu(template.astype('float32'))
    source_gpu = gpuarray.empty((M, N), np.float32)
    ifft_gpu = gpuarray.empty((M, N), np.float32)
    result_gpu = gpuarray.empty((M, N), np.uint16)

    templatef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    sourcef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu1 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu2 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    shiftmatrix = gpuarray.empty((M, N // 2 + 1), np.complex64)

    # The template spectrum is computed once and conjugated up front.
    cu_fft.fft(template_gpu, templatef_gpu, plan_forward, scale=True)
    templatef_gpu = templatef_gpu.conj()

    move_list = np.zeros((Z, 2))

    largearray1_gpu = gpuarray.zeros((mlarge, nlarge // 2 + 1), np.complex64)
    largearray2_gpu = gpuarray.empty((mlarge, nlarge), np.float32)
    imlenl = mlarge * (nlarge // 2 + 1)

    zoom_factor = 1.5
    dftshiftG = trunc(ceil(usfac * zoom_factor) / 2)  # Center of output array at dftshift+1
    upsample_dim = int(ceil(usfac * zoom_factor))

    term1c = (ifftshift(np.arange(N, dtype='float') - floor(N / 2)).
              T[:, newaxis]) / N  # fftfreq
    # output points
    term2c = ((np.arange(upsample_dim, dtype='float')) / usfac)[newaxis, :]
    term1r = (np.arange(upsample_dim, dtype='float').T)[:, newaxis]
    term2r = (ifftshift(np.arange(M, dtype='float')) -
              floor(M / 2))[newaxis, :]  # fftfreq

    term1c_gpu = gpuarray.to_gpu(
        term1c[:int(floor(N / 2) + 1), :].astype('float32'))
    term2c_gpu = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu = gpuarray.to_gpu(term1r.astype('float32'))
    term2r_gpu = gpuarray.to_gpu(term2r.astype('float32'))
    # Pristine copies: sub_float writes (original - offset) into the working
    # arrays on every frame.
    term2c_gpu_ori = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu_ori = gpuarray.to_gpu(term1r.astype('float32'))

    kernc_gpu = gpuarray.zeros((N // 2 + 1, upsample_dim), np.float32)
    kernr_gpu = gpuarray.zeros((upsample_dim, M), np.float32)
    kernc_gpuc = gpuarray.zeros((N // 2 + 1, upsample_dim), np.complex64)
    kernr_gpuc = gpuarray.zeros((upsample_dim, M), np.complex64)

    Nr = np.fft.ifftshift(np.linspace(-np.fix(M / 2), np.ceil(M / 2) - 1, M))
    Nc = np.fft.ifftshift(np.linspace(-np.fix(N / 2), np.ceil(N / 2) - 1, N))
    [Nc, Nr] = np.meshgrid(Nc, Nr)
    Nc_gpu = gpuarray.to_gpu((Nc[:, :N // 2 + 1] / N).astype('float32'))
    Nr_gpu = gpuarray.to_gpu((Nr[:, :N // 2 + 1] / M).astype('float32'))

    upsampled1 = gpuarray.empty((upsample_dim, N // 2 + 1), np.complex64)
    upsampled2 = gpuarray.empty((upsample_dim, upsample_dim), np.complex64)

    # Device staging buffer for one batch of raw frames, plus the 3-D copy
    # descriptor that is reused for every batch.
    source_stack = gpuarray.empty((stack_bin, M, N), dtype=stack.dtype)
    copy = drv.Memcpy3D()
    copy.set_src_host(stack.data)
    copy.set_dst_device(source_stack.gpudata)
    copy.width_in_bytes = copy.src_pitch = stack.strides[1]
    copy.src_height = copy.height = M

    # The cuBLAS handle is only needed inside the loop; create it here and
    # release it in ``finally`` so it is not leaked if a frame fails.
    cub_h = cub.cublasCreate()
    try:
        for zb in range(stack_ite):
            zrange = np.arange(zb * stack_bin, min((stack_bin * (zb + 1)), Z))
            copy.depth = len(zrange)
            copy.src_z = int(zrange[0])
            copy()

            for i in range(len(zrange)):
                t = zb * stack_bin + i  # frame index in the full stack

                load_convert_kernel(source_stack, source_gpu.gpudata,
                                    np.int32(i), np.int32(imlen),
                                    block=block_dim, grid=grid_dim)
                cu_fft.fft(source_gpu, sourcef_gpu, plan_forward, scale=True)

                # Cross-power spectrum, embedded into the 2x larger spectrum
                # and inverse-transformed for the coarse peak search.
                multiply_float_kernel(sourcef_gpu, templatef_gpu, prod_gpu1,
                                      np.int32(half_imlen),
                                      block=block_dim, grid=grid_dim)
                transfer_array_kernel(largearray1_gpu, prod_gpu1,
                                      np.int32(imlenl), np.int32(half_imlen),
                                      np.int32(nlarge // 2 + 1),
                                      np.int32(N // 2 + 1),
                                      block=block_dim, grid=grid_dim)
                cu_fft.ifft(largearray1_gpu, largearray2_gpu,
                            plan_inverse_big, scale=True)

                peakind = cub.cublasIsamax(cub_h, largearray2_gpu.size,
                                           largearray2_gpu.gpudata, 1)
                rloc, cloc = np.unravel_index(peakind, largearray2_gpu.shape)

                # Peaks past the midpoint are wrapped to negative shifts.
                md2 = trunc(mlarge / 2)
                nd2 = trunc(nlarge / 2)
                if rloc > md2:
                    row_shift2 = rloc - mlarge
                else:
                    row_shift2 = rloc
                if cloc > nd2:
                    col_shift2 = cloc - nlarge
                else:
                    col_shift2 = cloc
                # Halve: the search grid is twice the image size.
                row_shiftG = row_shift2 / 2.
                col_shiftG = col_shift2 / 2.

                # Initial shift estimate in upsampled grid
                row_shiftG0 = round(row_shiftG * usfac) / usfac
                col_shiftG0 = round(col_shiftG * usfac) / usfac

                # Matrix multiply DFT around the current shift estimate
                roffG = dftshiftG - row_shiftG0 * usfac
                coffG = dftshiftG - col_shiftG0 * usfac

                sub_float_kernel(term2c_gpu, term2c_gpu_ori,
                                 np.float32(coffG / usfac),
                                 np.int32(term2c_gpu.size),
                                 block=block_dim, grid=grid_dim)
                sub_float_kernel(term1r_gpu, term1r_gpu_ori,
                                 np.float32(roffG),
                                 np.int32(term1r_gpu.size),
                                 block=block_dim, grid=grid_dim)

                lin.dot(term1c_gpu, term2c_gpu, handle=cub_h, out=kernc_gpu)
                lin.dot(term1r_gpu, term2r_gpu, handle=cub_h, out=kernr_gpu)

                # Turn the real phase arguments into purely imaginary numbers
                # and exponentiate them in place to obtain the DFT kernels.
                convert_multiply_kernel(kernc_gpu, kernc_gpuc,
                                        np.float32(-2 * pi),
                                        np.int32(kernc_gpu.size),
                                        block=block_dim, grid=grid_dim)
                convert_multiply_kernel(kernr_gpu, kernr_gpuc,
                                        np.float32(-2 * pi / (M * usfac)),
                                        np.int32(kernr_gpu.size),
                                        block=block_dim, grid=grid_dim)
                cumath.exp(kernc_gpuc, out=kernc_gpuc)
                cumath.exp(kernr_gpuc, out=kernr_gpuc)

                conj_kernel(prod_gpu1, prod_gpu2, np.int32(half_imlen),
                            block=block_dim, grid=grid_dim)

                lin.dot(kernr_gpuc, prod_gpu2, handle=cub_h, out=upsampled1)
                lin.dot(upsampled1, kernc_gpuc, handle=cub_h, out=upsampled2)

                CCG = conj(upsampled2.get()) / (md2 * nd2 * usfac**2)
                rlocG, clocG = np.unravel_index(abs(CCG).argmax(), CCG.shape)
                CCGmax = CCG[rlocG, clocG]

                rlocG = rlocG - dftshiftG  # + 1 was a questionable/failed hack
                clocG = clocG - dftshiftG  # - 1 was a questionable/failed hack
                row_shiftG = row_shiftG0 + rlocG / usfac
                col_shiftG = col_shiftG0 + clocG / usfac

                diffphaseG = arctan2(imag(CCGmax), real(CCGmax))

                # Compute registered version of source stack
                calc_shiftmatrix_kernel(Nr_gpu, Nc_gpu, shiftmatrix,
                                        np.float32(row_shiftG * 2 * np.pi),
                                        np.float32(col_shiftG * 2 * np.pi),
                                        np.float32(diffphaseG),
                                        np.int32(half_imlen),
                                        block=block_dim, grid=grid_dim)
                cumath.exp(shiftmatrix, out=shiftmatrix)
                multiply_float_kernel(sourcef_gpu, shiftmatrix, prod_gpu1,
                                      np.int32(half_imlen),
                                      block=block_dim, grid=grid_dim)
                cu_fft.ifft(prod_gpu1, ifft_gpu, plan_inverse)
                # Negative values are clamped to 0 on conversion to
                # unsigned short.
                convert_export_kernel(ifft_gpu, result_gpu, np.int32(imlen),
                                      block=block_dim, grid=grid_dim)

                move_list[t, :] = (row_shiftG, col_shiftG)
                stack[t, :, :] = result_gpu.get()
    finally:
        cub.cublasDestroy(cub_h)

    return (stack, move_list)
# --- Problem size
N = 100000

# --- Draw the two host vectors, then narrow them to single precision
h_a = np.random.randn(1, N)
h_b = np.random.randn(1, N)
h_a, h_b = h_a.astype(np.float32), h_b.astype(np.float32)
h_c = np.empty_like(h_a)

# --- Upload both operands to the device
d_a, d_b = gpuarray.to_gpu(h_a), gpuarray.to_gpu(h_b)

# --- Time sqrt(|a|) + exp(b) between the two events
start.record()
d_c = cumath.sqrt(cumath.fabs(d_a)) + cumath.exp(d_b)
end.record()
end.synchronize()
secs = 1e-3 * start.time_till(end)
print("Processing time = %fs" % (secs))

# --- Download the result and compare it with NumPy element by element
h_c = d_c.get()
if not np.all(abs(h_c - (np.sqrt(np.abs(h_a)) + np.exp(h_b))) < 1e-5):
    print("Error!")
else:
    print("Test passed!")

# --- Flush context printf buffer
cuda.Context.synchronize()
def compute_displace(self, amp, r, buffer):
    """Fill ``buffer`` via ``self.outer`` from two exponential factors.

    ``self._ex`` receives ``exp(self._iqx * r.x() + self._iqxz * r.z())`` and
    ``self._ey`` receives ``exp(self._iqy * r.y() + self._iqyz * r.z())``.
    ``self._ey`` is then scaled in place by ``amp`` and both factors are
    handed to ``self.outer`` together with ``buffer``.
    """
    x_argument = self._iqx * r.x() + self._iqxz * r.z()
    cumath.exp(x_argument, out=self._ex)

    y_argument = self._iqy * r.y() + self._iqyz * r.z()
    cumath.exp(y_argument, out=self._ey)

    # The amplitude is folded into the y factor only.
    self._ey *= amp
    self.outer(self._ey, self._ex, buffer)
grid=grd, block=blk, shared=int(max_tpb * 8)) # Indexes are not contiguous griddimx = int(nclmns / max_tpb) griddimy = int(nsamps) # One thread per sample-time grd = (griddimx, griddimy, 1) blk = (max_tpb, 1, 1) maxes = np.array(all_l_rhots_gpu[:, 0][1::nmodes].get()).astype(np.float64) maxes_gpu = gpuarray.to_gpu(maxes) GPU_bcast_vec_to_matrix(all_l_rhots_gpu, -maxes_gpu, grid=grd, block=blk, shared=8) # ***** THIS IS CORRECT AND WORKING UP THROUGH HERE AS OF AUGUST 10TH 2016 ***** ''' Marginalize over Time ''' all_l_rhots_gpu = cumath.exp(all_l_rhots_gpu) # exponentiate GPU_nv_reduc(all_l_rhots_gpu) # sum over time lnL_gpu = maxes_gpu + cumath.log(all_l_rhots_gpu) # TIMES DELTA T FIXME
def sigmoid(x, deriv=False):
    """Logistic activation, or its derivative term when ``deriv`` is true.

    With ``deriv`` false this returns ``1 / (1 + exp(-x))`` using ``cm.exp``.
    With ``deriv`` true it returns ``x * (1 - x)``, i.e. ``x`` is treated as
    an already-activated value rather than the raw input.
    """
    if not deriv:
        return 1.0 / (1.0 + cm.exp(-x))
    return x * (1.0 - x)
def relu(x, deriv=False):
    """Rectifier activation, or its derivative term when ``deriv`` is true.

    With ``deriv`` false this returns ``gpu.maximum(x, 0)``.  With ``deriv``
    true it returns ``1 - exp(-x)`` using ``cm.exp``.
    """
    if not deriv:
        return gpu.maximum(x, 0)
    # NOTE(review): 1 - exp(-x) is a smooth curve, not the 0/1 step of an
    # exact ReLU derivative -- confirm this approximation is intended.
    return 1.0 - cm.exp(-x)
########
# Check a * exp(1j * b) computed on the GPU against the same NumPy expression.
N = 10

# --- Create random vectors on the CPU
h_a = np.random.randn(1, N)
h_b = np.random.randn(1, N)

# --- Set CPU arrays as single precision
h_a = h_a.astype(np.float32)
h_b = h_b.astype(np.float32)
h_c = np.empty_like(h_a, dtype=np.complex64)

d_a = gpuarray.to_gpu(h_a)
d_b = gpuarray.to_gpu(h_b)

d_c = d_a * cumath.exp(1j * d_b)

h_c = d_c.get()

# Compare with a tolerance: the GPU and CPU exponentials are not guaranteed
# to agree bit for bit, so an exact equality test reports "Error!" for
# results that differ only by rounding.
if np.allclose(h_c, h_a * np.exp(1j * h_b), rtol=1e-5, atol=1e-6):
    print("Test passed!")
else:
    print("Error!")

print(h_c)
print(h_a * np.exp(1j * h_b))

# --- Flush context printf buffer
cuda.Context.synchronize()