def check_errors(thr, a_shape, a_dtype, b_shape, b_dtype,
        transposed_a=False, transposed_b=False):

    a = get_test_array(a_shape, a_dtype)
    b = get_test_array(b_shape, b_dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b
    res_ref = ref_dot(a_ref, b_ref)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.empty_like(res_ref)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev,
        transposed_a=transposed_a, transposed_b=transposed_b)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    assert diff_is_negligible(res_dev.get(), res_ref)
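# `get_test_array`, `ref_dot` and `diff_is_negligible` are helpers from the
# surrounding test suite. A minimal sketch of the reference product, assuming
# it is just a broadcasting numpy matmul over the leading (batch) dimensions:
def ref_dot(a, b):
    return numpy.matmul(a, b)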
def modified_gemm_gpu(A, B, C):
    # Computes A @ B on the GPU and adds C on the host.
    # A, B and C are assumed to be numpy arrays, so they have to be
    # transferred to the device before the compiled computation is called.
    api = cluda.cuda_api()
    thr = api.Thread.create()

    a_dev = thr.to_device(A)
    b_dev = thr.to_device(B)
    res_arr = thr.array((A.shape[0], B.shape[1]), dtype=A.dtype)

    mul = MatrixMul(a_dev, b_dev, out_arr=res_arr)
    mulc = mul.compile(thr)
    mulc(res_arr, a_dev, b_dev)

    return res_arr.get() + C
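# Hypothetical usage (assumes a CUDA-capable device is available; the shapes
# are illustrative):
A = numpy.random.randn(128, 64).astype(numpy.float32)
B = numpy.random.randn(64, 32).astype(numpy.float32)
C = numpy.random.randn(128, 32).astype(numpy.float32)
res = modified_gemm_gpu(A, B, C)
assert numpy.allclose(res, A.dot(B) + C, atol=1e-3)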
def check_performance(thr_and_double, perf_shape,
        bwo=None, transposed_a=False, transposed_b=False):

    thr, double = thr_and_double
    dtype = numpy.float64 if double else numpy.float32

    batch, size = perf_shape
    shape = (batch, size, size)

    a = get_test_array(shape, dtype)
    b = get_test_array(shape, dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_ref = ref_dot(a_ref, b_ref)
    res_dev = thr.array(res_ref.shape, dtype=dtype)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev, block_width_override=bwo,
        transposed_a=transposed_a, transposed_b=transposed_b)

    try:
        dotc = dot.compile(thr)
    except ValueError:
        pytest.skip()

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        dotc(res_dev, a_dev, b_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    assert diff_is_negligible(thr.from_device(res_dev), res_ref)

    return min(times), batch * size ** 3 * 2
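# Rationale for the flop count returned above: a size x size matrix product
# performs size multiplies and size - 1 adds per output element
# (conventionally rounded to 2 * size flops), and there are
# batch * size**2 output elements, giving batch * size**3 * 2 in total.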
def main():
    api = cluda.ocl_api()
    thr = api.Thread.create()
    print(thr)

    shape1 = (100, 200)
    shape2 = (200, 100)

    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)
    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    res_reference = numpy.dot(a, b)
    print(res_reference)
def main():
    api = cluda.ocl_api()
    # thr = api.Thread.create()
    thr = api.Thread.create({'exclude_devices': 'Iris Pro'})

    n = 6000
    m = 3000
    shape1 = (n, m)
    shape2 = (m, n)

    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)
    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)

    # Time 10 GPU runs, synchronizing before and after each call
    # so that only kernel execution is measured.
    gt = 0
    for i in range(10):
        thr.synchronize()
        gpu_start = time.time()
        dotc(res_dev, a_dev, b_dev)
        thr.synchronize()
        gt += time.time() - gpu_start
    print(gt)

    # Time 10 CPU runs of the numpy reference.
    ct = 0
    res_reference = None
    for i in range(10):
        t = time.time()
        res_reference = numpy.dot(a, b)
        ct += time.time() - t
    print(ct)

    print(norm(res_dev.get() - res_reference) / norm(res_reference) < 1e-6)
def run():
    api = cluda.ocl_api()
    thr = api.Thread.create()

    n = 3000
    shape1 = (n, n)
    shape2 = (n, n)

    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)
    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    res_reference = numpy.dot(a, b)

    print(norm(res_dev.get() - res_reference) / norm(res_reference) < 1e-6)
def _build_plan(self, plan_factory, _device_params, output_arr, input_arr):
    plan = plan_factory()

    dtype = input_arr.dtype
    p_dtype = dtypes.real_for(dtype) if dtypes.is_complex(dtype) else dtype

    mode_shape = input_arr.shape if self._inverse else output_arr.shape

    current_mem = input_arr
    seq_axes = list(range(len(input_arr.shape)))
    current_axes = list(range(len(input_arr.shape)))

    for i, axis in enumerate(self._axes):
        current_mem, current_axes = self._add_transpose(
            plan, current_mem, current_axes, axis)

        tr_matrix = plan.persistent_array(
            self._get_transformation_matrix(
                p_dtype, mode_shape[axis], self._add_points[axis]))

        dot = MatrixMul(current_mem, tr_matrix)

        if i == len(self._axes) - 1 and current_axes == seq_axes:
            dot_output = output_arr
        else:
            # Cannot write to output if it is not the last transform,
            # or if we need to return to the initial axes order.
            dot_output = plan.temp_array_like(dot.parameter.output)

        plan.computation_call(dot, dot_output, current_mem, tr_matrix)
        current_mem = dot_output

    # If we ended up with the wrong order of axes,
    # return to the original order.
    if current_axes != seq_axes:
        tr_axes = [current_axes.index(i) for i in range(len(current_axes))]
        transpose = Transpose(current_mem, output_arr_t=output_arr, axes=tr_axes)
        plan.add_computation(transpose, output_arr, current_mem)

    return plan
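# Sanity check of the inverse-permutation expression above, with hypothetical
# values: if the axes currently sit in order [2, 0, 1], transposing by the
# computed tr_axes restores the sequential order.
current_axes = [2, 0, 1]
tr_axes = [current_axes.index(i) for i in range(len(current_axes))]
assert tr_axes == [1, 2, 0]
assert [current_axes[j] for j in tr_axes] == [0, 1, 2]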
def create(thr, a_size, b_size, dtype=np.complex128, compile_=True):
    mm = MatrixMul(
        thr.array(a_size, dtype=dtype),
        thr.array(b_size, dtype=dtype))
    if compile_:
        mm = mm.compile(thr)
    return mm
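# Hypothetical usage, following the Thread-creation and call pattern used in
# the other snippets here (shapes are illustrative):
api = cluda.ocl_api()
thr = api.Thread.create()
mmc = create(thr, (100, 200), (200, 100))
a_dev = thr.to_device(np.ones((100, 200), np.complex128))
b_dev = thr.to_device(np.ones((200, 100), np.complex128))
res_dev = thr.array((100, 100), np.complex128)
mmc(res_dev, a_dev, b_dev)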
def _build_plan(self, plan_factory, device_params, alpha, beta, alpha_i, beta_i, seed):
    plan = plan_factory()

    system = self._system
    representation = self._representation

    unitary = plan.persistent_array(self._system.unitary)

    needs_noise_matrix = (
        representation != Representation.POSITIVE_P and system.needs_noise_matrix())

    mmul = MatrixMul(alpha, unitary, transposed_b=True)

    if not needs_noise_matrix:
        # TODO: this could be sped up for repr != POSITIVE_P,
        # since in that case alpha == conj(beta), and we don't need
        # to do two multiplications.
        mmul_beta = MatrixMul(beta, unitary, transposed_b=True)
        trf_conj = self._make_trf_conj()
        mmul_beta.parameter.matrix_b.connect(
            trf_conj, trf_conj.output, matrix_b_p=trf_conj.input)

        plan.computation_call(mmul, alpha, alpha_i, unitary)
        plan.computation_call(mmul_beta, beta, beta_i, unitary)
    else:
        noise_matrix = system.noise_matrix()
        noise_matrix_dev = plan.persistent_array(noise_matrix)

        # If we're here, it's not positive-P, and alpha == conj(beta).
        # This means we can just calculate alpha, and then build beta from it.

        w = plan.temp_array_like(alpha)
        temp_alpha = plan.temp_array_like(alpha)

        plan.computation_call(mmul, temp_alpha, alpha_i, unitary)

        bijection = philox(64, 2)

        # Keeping the kernel the same so it can be cached.
        # The seed will be passed as the computation parameter instead.
        keygen = KeyGenerator.create(bijection, seed=numpy.int32(0))

        sampler = normal_bm(bijection, numpy.float64)

        plan.kernel_call(
            TEMPLATE.get_def("generate_apply_matrix_noise"),
            [w, seed],
            kernel_name="generate_apply_matrix_noise",
            global_size=alpha.shape,
            render_kwds=dict(
                bijection=bijection,
                keygen=keygen,
                sampler=sampler,
                mul_cr=functions.mul(numpy.complex128, numpy.float64),
                add_cc=functions.add(numpy.complex128, numpy.complex128),
                ))

        noise = plan.temp_array_like(alpha)
        plan.computation_call(mmul, noise, w, noise_matrix_dev)

        plan.kernel_call(
            TEMPLATE.get_def("add_noise"),
            [alpha, beta, temp_alpha, noise],
            kernel_name="add_noise",
            global_size=alpha.shape,
            render_kwds=dict(
                add=functions.add(numpy.complex128, numpy.complex128),
                conj=functions.conj(numpy.complex128)))

    return plan
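# In numpy terms, the no-noise branch above computes roughly the following
# (a hedged sketch: transposed_b=True means the second factor enters
# transposed, and trf_conj conjugates matrix_b on the fly):
#
#     alpha = alpha_i @ unitary.T
#     beta  = beta_i @ numpy.conj(unitary).T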
def test_out_arr_shape():
    a = numpy.empty((1, 22, 33), numpy.float32)
    b = numpy.empty((2, 3, 33, 44), numpy.float32)
    dot = MatrixMul(a, b)
    assert dot.parameter.output.shape == (2, 3, 22, 44)
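# For comparison, numpy.matmul broadcasts the leading (batch) dimensions
# the same way:
a = numpy.empty((1, 22, 33), numpy.float32)
b = numpy.empty((2, 3, 33, 44), numpy.float32)
assert numpy.matmul(a, b).shape == (2, 3, 22, 44)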