Example #1
def check_errors(thr,
                 a_shape,
                 a_dtype,
                 b_shape,
                 b_dtype,
                 transposed_a=False,
                 transposed_b=False):
    a = get_test_array(a_shape, a_dtype)
    b = get_test_array(b_shape, b_dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b

    res_ref = ref_dot(a_ref, b_ref)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.empty_like(res_ref)

    dot = MatrixMul(a_dev,
                    b_dev,
                    out_arr=res_dev,
                    transposed_a=transposed_a,
                    transposed_b=transposed_b)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    assert diff_is_negligible(res_dev.get(), res_ref)
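
A note on context: the check_* snippets in this collection are excerpted from a test suite, so their imports and helper functions are not shown. A sketch of what they assume follows; the reikna module paths are the library's public ones, while get_test_array, ref_dot, transpose and diff_is_negligible are helpers defined elsewhere in that suite.

# Sketch of the imports assumed by the snippets in this collection.
import time

import numpy
import pytest
from numpy.linalg import norm          # used by the standalone examples below

from reikna import cluda
from reikna.linalg import MatrixMul

# get_test_array, ref_dot, transpose and diff_is_negligible are test-suite
# helpers: they build random input arrays, compute a NumPy reference
# product and check that the device result matches it.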
Example #2
def modified_gemm_gpu(A, B, C):
    # Multiply A and B with reikna's MatrixMul on the GPU, then add C.
    shape = (A.shape[0], B.shape[1])
    api = cluda.cuda_api()
    thr = api.Thread.create()
    res_arr = thr.array((shape[0], shape[1]), dtype=A.dtype)

    mul = MatrixMul(A, B, out_arr=res_arr)
    mulc = mul.compile(thr)
    mulc(res_arr, A, B)

    return res_arr + C
Example #3
def check_performance(thr_and_double,
                      perf_shape,
                      bwo=None,
                      transposed_a=False,
                      transposed_b=False):

    thr, double = thr_and_double
    dtype = numpy.float64 if double else numpy.float32
    batch, size = perf_shape

    shape = (batch, size, size)

    a = get_test_array(shape, dtype)
    b = get_test_array(shape, dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_ref = ref_dot(a_ref, b_ref)
    res_dev = thr.array(res_ref.shape, dtype=dtype)

    dot = MatrixMul(a_dev,
                    b_dev,
                    out_arr=res_dev,
                    block_width_override=bwo,
                    transposed_a=transposed_a,
                    transposed_b=transposed_b)

    try:
        dotc = dot.compile(thr)
    except ValueError:
        pytest.skip()

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        dotc(res_dev, a_dev, b_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    assert diff_is_negligible(thr.from_device(res_dev), res_ref)

    # Return the best time and the approximate FLOP count
    # (2 * size**3 per matrix in the batch).
    return min(times), batch * size**3 * 2
Example #4
def check_errors(thr, a_shape, a_dtype, b_shape, b_dtype, transposed_a=False, transposed_b=False):
    a = get_test_array(a_shape, a_dtype)
    b = get_test_array(b_shape, b_dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b

    res_ref = ref_dot(a_ref, b_ref)

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.empty_like(res_ref)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev,
        transposed_a=transposed_a, transposed_b=transposed_b)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    assert diff_is_negligible(res_dev.get(), res_ref)
Example #5
def main():
    api = cluda.ocl_api()
    thr = api.Thread.create()
    print(thr)
    shape1 = (100, 200)
    shape2 = (200, 100)

    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)
    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)

    res_reference = numpy.dot(a, b)
    print(res_reference)
Example #6
def main():
    api = cluda.ocl_api()
    # thr = api.Thread.create()
    thr = api.Thread.create({'exclude_devices': 'Iris Pro'})

    n = 6000
    m = 3000

    shape1 = (n, m)
    shape2 = (m, n)

    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)


    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)

    # Time ten GPU runs; synchronize before and after each call so that
    # only the kernel execution is measured.
    gt = 0
    for i in range(10):
        thr.synchronize()
        gpu_start = time.time()
        dotc(res_dev, a_dev, b_dev)
        thr.synchronize()
        gt += time.time() - gpu_start
    print(gt)

    # Time ten CPU runs of the NumPy reference for comparison.
    ct = 0
    res_reference = None
    for i in range(10):
        t = time.time()
        res_reference = numpy.dot(a, b)
        ct += time.time() - t
    print(ct)

    # The relative error between GPU and NumPy results should be negligible.
    print(norm(res_dev.get() - res_reference) / norm(res_reference) < 1e-6)
Example #7
def run():

    api = cluda.ocl_api()
    thr = api.Thread.create()
    
    n = 3000
    shape1 = (n, n)
    shape2 = (n, n)
    
    a = numpy.random.randn(*shape1).astype(numpy.float32)
    b = numpy.random.randn(*shape2).astype(numpy.float32)
    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_dev = thr.array((shape1[0], shape2[1]), dtype=numpy.float32)
    
    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev)
    dotc = dot.compile(thr)
    dotc(res_dev, a_dev, b_dev)
    
    res_reference = numpy.dot(a, b)
    
    print(norm(res_dev.get() - res_reference) / norm(res_reference) < 1e-6)
Example #8
def check_performance(thr_and_double, perf_shape,
        bwo=None, transposed_a=False, transposed_b=False):

    thr, double = thr_and_double
    dtype = numpy.float64 if double else numpy.float32
    batch, size = perf_shape

    shape = (batch, size, size)

    a = get_test_array(shape, dtype)
    b = get_test_array(shape, dtype)

    a_ref = transpose(a) if transposed_a else a
    b_ref = transpose(b) if transposed_b else b

    a_dev = thr.to_device(a)
    b_dev = thr.to_device(b)
    res_ref = ref_dot(a_ref, b_ref)
    res_dev = thr.array(res_ref.shape, dtype=dtype)

    dot = MatrixMul(a_dev, b_dev, out_arr=res_dev, block_width_override=bwo,
        transposed_a=transposed_a, transposed_b=transposed_b)

    try:
        dotc = dot.compile(thr)
    except ValueError:
        pytest.skip()

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        dotc(res_dev, a_dev, b_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    assert diff_is_negligible(thr.from_device(res_dev), res_ref)

    # Return the best time and the approximate FLOP count
    # (2 * size**3 per matrix in the batch).
    return min(times), batch * size ** 3 * 2
Example #9
def create(thr, a_size, b_size, dtype=np.complex128, compile_=True):
    mm = MatrixMul(thr.array(a_size, dtype=dtype),
                   thr.array(b_size, dtype=dtype))
    if compile_:
        mm = mm.compile(thr)
    return mm
Example #10
def create(thr, a_size, b_size, dtype=np.complex128, compile_=True):
    mm = MatrixMul(thr.array(a_size, dtype=dtype), thr.array(b_size, dtype=dtype))
    if compile_:
        mm = mm.compile(thr)
    return mm
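
The create helper in Examples #9 and #10 builds the computation from array metadata only (shape and dtype), so the operand data can be uploaded afterwards. A hypothetical usage sketch, following the standalone examples above:

import numpy as np
from reikna import cluda

thr = cluda.ocl_api().Thread.create()

# Random complex operands; the helper defaults to complex128.
a = np.random.rand(64, 64) + 1j * np.random.rand(64, 64)
b = np.random.rand(64, 64) + 1j * np.random.rand(64, 64)

mmc = create(thr, a.shape, b.shape)        # compiled MatrixMul
a_dev = thr.to_device(a)
b_dev = thr.to_device(b)
res_dev = thr.array((a.shape[0], b.shape[1]), dtype=np.complex128)

mmc(res_dev, a_dev, b_dev)                 # output array first, then operands
assert np.allclose(res_dev.get(), a @ b)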