import time

import numpy
import numpy as np
import pytest
from numpy.linalg import norm

from reikna import cluda
from reikna.linalg import MatrixMul

# get_test_array, ref_dot, transpose and diff_is_negligible are assumed to be
# the usual helpers from the surrounding test suite.


def check_errors(thr, a_shape, a_dtype, b_shape, b_dtype,
        transposed_a=False, transposed_b=False):
    a = get_test_array(a_shape, a_dtype)
    b = get_test_array(b_shape, b_dtype)

    # Reference result computed on the host
    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b
    res_ref = ref_dot(a_ref, b_ref)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.empty_like(res_ref)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev,
        transposed_a=transposed_a, transposed_b=transposed_b)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    assert diff_is_negligible(res_dev.get(), res_ref)
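# A hedged driver for check_errors, assuming an OpenCL thread created through
# reikna's cluda; the shapes and dtypes below are illustrative.
thr = cluda.ocl_api().Thread.create()
check_errors(thr, (64, 32), numpy.float32, (32, 48), numpy.float32)
check_errors(thr, (32, 64), numpy.float32, (32, 48), numpy.float32, transposed_a=True)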
def modified_gemm_gpu(A, B, C):
    # Computes A @ B + C, with the matrix product performed on the GPU.
    # A fresh CUDA thread is created here, so the host inputs are uploaded
    # to it before the kernel call (passing host arrays directly would fail).
    api = cluda.cuda_api()
    thr = api.Thread.create()

    a_dev = thr.to_device(A)
    b_dev = thr.to_device(B)
    res_arr = thr.array((A.shape[0], B.shape[1]), dtype=A.dtype)

    mul = MatrixMul(a_dev, b_dev, out_arr=res_arr)
    mulc = mul.compile(thr)
    mulc(res_arr, a_dev, b_dev)

    # Download the product and add C on the host
    return res_arr.get() + C
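# A minimal usage sketch for modified_gemm_gpu; requires a CUDA device and
# pycuda behind reikna's cuda_api. The shapes below are illustrative.
A = numpy.random.randn(64, 32).astype(numpy.float32)
B = numpy.random.randn(32, 48).astype(numpy.float32)
C = numpy.random.randn(64, 48).astype(numpy.float32)
out = modified_gemm_gpu(A, B, C)
assert numpy.allclose(out, A.dot(B) + C, atol=1e-3)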
def check_performance(thr_and_double, perf_shape,
        bwo=None, transposed_a=False, transposed_b=False):
    thr, double = thr_and_double
    dtype = numpy.float64 if double else numpy.float32

    batch, size = perf_shape
    shape = (batch, size, size)

    a = get_test_array(shape, dtype)
    b = get_test_array(shape, dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_ref = ref_dot(a_ref, b_ref)
    res_dev = thr.array(res_ref.shape, dtype=dtype)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev, block_width_override=bwo,
        transposed_a=transposed_a, transposed_b=transposed_b)

    # An unsupported block width makes compilation fail; skip instead of erroring
    try:
        dotc = dot.compile(thr)
    except ValueError:
        pytest.skip()

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        dotc(res_dev, a_dev, b_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    assert diff_is_negligible(thr.from_device(res_dev), res_ref)

    # Best time over the attempts, plus the nominal FLOP count
    # (2 * size**3 per matrix in the batch)
    return min(times), batch * size ** 3 * 2
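# check_performance returns (best time, nominal FLOPs), so a hedged driver can
# report throughput directly; the fixture tuple and shape below are illustrative.
thr = cluda.ocl_api().Thread.create()
best_time, flops = check_performance((thr, False), (16, 256))
print('%.2f GFLOPS' % (flops / best_time / 1e9))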
def main():
    api = cluda.ocl_api()
    thr = api.Thread.create()
    print(thr)

    shape1 = (100, 200)
    shape2 = (200, 100)

    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    res_reference = numpy.dot(a, b)
    print(res_reference)
def main():
    api = cluda.ocl_api()
    # thr = api.Thread.create()
    thr = api.Thread.create({'exclude_devices': 'Iris Pro'})

    n = 6000
    m = 3000
    shape1 = (n, m)
    shape2 = (m, n)

    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)

    # Accumulate the wall-clock time of 10 GPU runs, synchronizing
    # around each call so only the kernel time is counted
    gt = 0
    for i in range(10):
        thr.synchronize()
        gpu_start = time.time()
        dotc(res_dev, a_dev, b_dev)
        thr.synchronize()
        gt += time.time() - gpu_start
    print(gt)

    # Accumulate the time of 10 numpy runs on the host for comparison
    ct = 0
    res_reference = None
    for i in range(10):
        t = time.time()
        res_reference = numpy.dot(a, b)
        ct += time.time() - t
    print(ct)

    # Relative error check of the GPU result against numpy
    print(norm(res_dev.get() - res_reference) / norm(res_reference) < 1e-6)
def run():
    api = cluda.ocl_api()
    thr = api.Thread.create()

    n = 3000
    shape1 = (n, n)
    shape2 = (n, n)

    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    res_reference = numpy.dot(a, b)
    print(norm(res_dev.get() - res_reference) / norm(res_reference) < 1e-6)
def create(thr, a_size, b_size, dtype=np.complex128, compile_=True):
    # Builds a MatrixMul computation from array metadata only;
    # optionally returns it already compiled for the given thread.
    mm = MatrixMul(thr.array(a_size, dtype=dtype), thr.array(b_size, dtype=dtype))
    if compile_:
        mm = mm.compile(thr)
    return mm
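# A short usage example for create; the compiled computation takes
# (output, a, b), as in the demos above. Shapes are illustrative.
thr = cluda.ocl_api().Thread.create()
mmc = create(thr, (100, 200), (200, 50))

a_dev = thr.to_device(np.random.randn(100, 200).astype(np.complex128))
b_dev = thr.to_device(np.random.randn(200, 50).astype(np.complex128))
res_dev = thr.array((100, 50), dtype=np.complex128)
mmc(res_dev, a_dev, b_dev)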