def p_norm(ap, p, TPBX, TPBY):
    """Return the p-norm of *ap*, computed with CUDA reductions.

    Delegates the elementwise stage to ``p_norm_helper`` (presumably
    |x|**p per element — confirm against the helper), sums the flattened
    device result with a CUDA sum-reduction, and takes the 1/p root.

    Args:
        ap: Input array (host or device; whatever p_norm_helper accepts).
        p: Norm order; the final result is the reduction raised to 1/p.
        TPBX, TPBY: Threads-per-block dimensions forwarded to the helper.

    Returns:
        Scalar p-norm value.
    """
    per_element = p_norm_helper(ap, p, TPBX, TPBY)
    flattened = per_element.flatten()
    # Build the sum-reduction kernel and apply it in one step.
    total = cuda.reduce(lambda x, y: x + y)(flattened)
    return total ** (1 / p)
def setup(self):
    """Prepare device-side fixtures for the reduction benchmarks.

    Compiles the no-op kernel, creates a stream, and for each of
    float32/float64 allocates a zero-filled host array of length
    ``self.n``, its device copy, and a one-element device result
    buffer. Also builds the shared sum-reduction. Blocks until all
    async transfers on the stream have finished.
    """
    self.no_op = cuda.jit(argtypes=())(no_op)
    self.stream = cuda.stream()
    # One host buffer, one device mirror, and one result slot per dtype.
    for name, dtype in (("f32", np.float32), ("f64", np.float64)):
        host = np.zeros(self.n, dtype=dtype)
        setattr(self, name, host)
        setattr(self, "d_" + name, cuda.to_device(host, self.stream))
        setattr(self, "res_" + name, cuda.to_device(np.zeros(1, dtype=dtype)))
    self.sum_reduce = cuda.reduce(lambda x, y: x + y)
    # Ensure the async to_device copies are complete before timing starts.
    self.stream.synchronize()
def test_prod_reduce(self):
    """A multiplicative CUDA reduction over 1..64 matches numpy's prod."""
    multiply = cuda.reduce(lambda x, y: x * y)
    # 1.0, 2.0, ..., 64.0 as float64 — same values as arange(64) + 1.
    data = np.arange(1, 65, dtype=np.float64)
    self.assertTrue(np.allclose(data.prod(), multiply(data, init=1)))
def test_prod_reduce(self):
    """Product-reduction on the GPU agrees with the host-side product."""
    reducer = cuda.reduce(lambda lhs, rhs: lhs * rhs)
    values = 1.0 + np.arange(64, dtype=np.float64)
    expected = values.prod()
    # init=1 is the multiplicative identity for the product reduction.
    actual = reducer(values, init=1)
    self.assertTrue(np.allclose(expected, actual))