Code Example #1
File: test_matrixmul.py  Project: xexo7C8/reikna
def check_errors(thr,
                 a_shape,
                 a_dtype,
                 b_shape,
                 b_dtype,
                 transposed_a=False,
                 transposed_b=False):
    a = get_test_array(a_shape, a_dtype)
    b = get_test_array(b_shape, b_dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b

    res_ref = ref_dot(a_ref, b_ref)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.empty_like(res_ref)

    dot = MatrixMul(a_dev,
                    b_dev,
                    out_arr=res_dev,
                    transposed_a=transposed_a,
                    transposed_b=transposed_b)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    assert diff_is_negligible(res_dev.get(), res_ref)
Code Example #2
def modified_gemm_gpu(A, B, C):
    shape = (A.shape[0], B.shape[1])
    api = cluda.cuda_api()
    thr = api.Thread.create()
    res_arr = thr.array((shape[0], shape[1]), dtype=A.dtype)

    mul = MatrixMul(A, B, out_arr=res_arr)
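    # Compile the computation for this CUDA Thread, then call it to write the product into res_arr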
    mulc = mul.compile(thr)
    mulc(res_arr, A, B)

    return res_arr + C
Code Example #3
File: test_matrixmul.py  Project: xexo7C8/reikna
def check_performance(thr_and_double,
                      perf_shape,
                      bwo=None,
                      transposed_a=False,
                      transposed_b=False):

    thr, double = thr_and_double
    dtype = numpy.float64 if double else numpy.float32
    batch, size = perf_shape

    shape = (batch, size, size)

    a = get_test_array(shape, dtype)
    b = get_test_array(shape, dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_ref = ref_dot(a_ref, b_ref)
    res_dev = thr.array(res_ref.shape, dtype=dtype)

    dot = MatrixMul(a_dev,
                    b_dev,
                    out_arr=res_dev,
                    block_width_override=bwo,
                    transposed_a=transposed_a,
                    transposed_b=transposed_b)

    try:
        dotc = dot.compile(thr)
    except ValueError:
        pytest.skip()

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        dotc(res_dev, a_dev, b_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    assert diff_is_negligible(thr.from_device(res_dev), res_ref)

    return min(times), batch * size**3 * 2
Code Example #4
File: test_matrixmul.py  Project: fjarri/reikna
def check_errors(thr, a_shape, a_dtype, b_shape, b_dtype, transposed_a=False, transposed_b=False):
    a = get_test_array(a_shape, a_dtype)
    b = get_test_array(b_shape, b_dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b

    res_ref = ref_dot(a_ref, b_ref)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.empty_like(res_ref)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev,
        transposed_a=transposed_a, transposed_b=transposed_b)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    assert diff_is_negligible(res_dev.get(), res_ref)
Code Example #5
File: reikna_test.py  Project: csfoo/TF_binding
def main():
    api = cluda.ocl_api()
    thr = api.Thread.create()
    print(thr)
    shape1 = (100, 200)
    shape2 = (200, 100)

    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)
    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    res_reference = numpy.dot(a, b)
    print(res_reference)
Code Example #6
def main():
    api = cluda.ocl_api()
    # thr = api.Thread.create()
    thr = api.Thread.create({'exclude_devices': 'Iris Pro'})

    n = 6000
    m = 3000

    shape1 = (n, m)
    shape2 = (m, n)

    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)


    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)

    gt = 0
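    # Time 10 GPU multiplications; synchronize before and after so the timing covers the kernel execution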
    for i in range(10):
        thr.synchronize()
        gpu_start = time.time()
        dotc(res_dev, a_dev, b_dev)
        thr.synchronize()
        gt += time.time() - gpu_start
    print(gt)

    ct = 0
    res_reference = None
    for i in range(10):
        t = time.time()
        res_reference = numpy.dot(a, b)
        ct += time.time() - t
    print(ct)

    print(norm(res_dev.get() - res_reference) / norm(res_reference) < 1e-6)
Code Example #7
File: reikna_test.py  Project: TomWerner/hpelm
def run():

    api = cluda.ocl_api()
    thr = api.Thread.create()
    
    n = 3000
    shape1 = (n, n)
    shape2 = (n, n)
    
    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)
    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)
    
    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)
    
    res_reference = numpy.dot(a, b)
    
    print(norm(res_dev.get() - res_reference) / norm(res_reference) < 1e-6)
Code Example #8
File: test_matrixmul.py  Project: fjarri/reikna
def check_performance(thr_and_double, perf_shape,
        bwo=None, transposed_a=False, transposed_b=False):

    thr, double = thr_and_double
    dtype = numpy.float64 if double else numpy.float32
    batch, size = perf_shape

    shape = (batch, size, size)

    a = get_test_array(shape, dtype)
    b = get_test_array(shape, dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_ref = ref_dot(a_ref, b_ref)
    res_dev = thr.array(res_ref.shape, dtype=dtype)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev, block_width_override=bwo,
        transposed_a=transposed_a, transposed_b=transposed_b)

    try:
        dotc = dot.compile(thr)
    except ValueError:
        pytest.skip()

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        dotc(res_dev, a_dev, b_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    assert diff_is_negligible(thr.from_device(res_dev), res_ref)

    return min(times), batch * size ** 3 * 2
Code Example #9
File: dht.py  Project: xexo7C8/reikna
    def _build_plan(self, plan_factory, _device_params, output_arr, input_arr):

        plan = plan_factory()

        dtype = input_arr.dtype
        p_dtype = dtypes.real_for(dtype) if dtypes.is_complex(dtype) else dtype

        mode_shape = input_arr.shape if self._inverse else output_arr.shape

        current_mem = input_arr
        seq_axes = list(range(len(input_arr.shape)))
        current_axes = list(range(len(input_arr.shape)))

        for i, axis in enumerate(self._axes):
            current_mem, current_axes = self._add_transpose(plan, current_mem, current_axes, axis)

            tr_matrix = plan.persistent_array(
                self._get_transformation_matrix(p_dtype, mode_shape[axis], self._add_points[axis]))

            dot = MatrixMul(current_mem, tr_matrix)
            if i == len(self._axes) - 1 and current_axes == seq_axes:
                dot_output = output_arr
            else:
                # Cannot write to output if it is not the last transform,
                # or if we need to return to the initial axes order
                dot_output = plan.temp_array_like(dot.parameter.output)
            plan.computation_call(dot, dot_output, current_mem, tr_matrix)
            current_mem = dot_output

        # If we ended up with the wrong order of axes,
        # return to the original order.

        if current_axes != seq_axes:
            tr_axes = [current_axes.index(i) for i in range(len(current_axes))]
            transpose = Transpose(current_mem, output_arr_t=output_arr, axes=tr_axes)
            plan.add_computation(transpose, output_arr, current_mem)

        return plan
Code Example #10
File: factories.py  Project: kaizhongkaizhong/WaveSyn
def create(thr, a_size, b_size, dtype=np.complex128, compile_=True):
    mm = MatrixMul(thr.array(a_size, dtype=dtype),
                   thr.array(b_size, dtype=dtype))
    if compile_:
        mm = mm.compile(thr)
    return mm
Code Example #11
File: generate_gpu.py  Project: fjarri/squeezed-sim
    def _build_plan(self, plan_factory, device_params, alpha, beta, alpha_i,
                    beta_i, seed):
        plan = plan_factory()

        system = self._system
        representation = self._representation

        unitary = plan.persistent_array(self._system.unitary)

        needs_noise_matrix = (
            representation != Representation.POSITIVE_P and system.needs_noise_matrix())

        mmul = MatrixMul(alpha, unitary, transposed_b=True)

        if not needs_noise_matrix:

            # TODO: this could be sped up for repr != POSITIVE_P,
            # since in that case alpha == conj(beta), and we don't need to do two multiplications.

            mmul_beta = MatrixMul(beta, unitary, transposed_b=True)
            trf_conj = self._make_trf_conj()
            mmul_beta.parameter.matrix_b.connect(trf_conj,
                                                 trf_conj.output,
                                                 matrix_b_p=trf_conj.input)

            plan.computation_call(mmul, alpha, alpha_i, unitary)
            plan.computation_call(mmul_beta, beta, beta_i, unitary)

        else:

            noise_matrix = system.noise_matrix()
            noise_matrix_dev = plan.persistent_array(noise_matrix)

            # If we're here, it's not positive-P, and alpha == conj(beta).
            # This means we can just calculate alpha, and then build beta from it.

            w = plan.temp_array_like(alpha)
            temp_alpha = plan.temp_array_like(alpha)

            plan.computation_call(mmul, temp_alpha, alpha_i, unitary)

            bijection = philox(64, 2)

            # Keeping the kernel the same so it can be cached.
            # The seed will be passed as the computation parameter instead.
            keygen = KeyGenerator.create(bijection, seed=numpy.int32(0))

            sampler = normal_bm(bijection, numpy.float64)

            plan.kernel_call(TEMPLATE.get_def("generate_apply_matrix_noise"),
                             [w, seed],
                             kernel_name="generate_apply_matrix_noise",
                             global_size=alpha.shape,
                             render_kwds=dict(
                                 bijection=bijection,
                                 keygen=keygen,
                                 sampler=sampler,
                                 mul_cr=functions.mul(numpy.complex128,
                                                      numpy.float64),
                                 add_cc=functions.add(numpy.complex128,
                                                      numpy.complex128),
                             ))

            noise = plan.temp_array_like(alpha)
            plan.computation_call(mmul, noise, w, noise_matrix_dev)

            plan.kernel_call(TEMPLATE.get_def("add_noise"),
                             [alpha, beta, temp_alpha, noise],
                             kernel_name="add_noise",
                             global_size=alpha.shape,
                             render_kwds=dict(
                                 add=functions.add(numpy.complex128,
                                                   numpy.complex128),
                                 conj=functions.conj(numpy.complex128)))

        return plan
Code Example #12
File: factories.py  Project: xialulee/WaveSyn
def create(thr, a_size, b_size, dtype=np.complex128, compile_=True):
    mm = MatrixMul(thr.array(a_size, dtype=dtype), thr.array(b_size, dtype=dtype))
    if compile_:
        mm = mm.compile(thr)
    return mm
Code Example #13
File: test_matrixmul.py  Project: xexo7C8/reikna
def test_out_arr_shape():
    a = numpy.empty((1, 22, 33), numpy.float32)
    b = numpy.empty((2, 3, 33, 44), numpy.float32)
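    # Leading (batch) dimensions are broadcast: (1, 22, 33) x (2, 3, 33, 44) -> (2, 3, 22, 44)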
    dot = MatrixMul(a, b)
    assert dot.parameter.output.shape == (2, 3, 22, 44)