# Multiply two flattened matrices on an OpenCL device and report how long the
# kernel build, the kernel run and the result read-back each took.
# NOTE(review): n, m, p and the helpers used below (random, float32, uint16,
# zeros, Buffer, mf, Program, CommandQueue, enqueue_copy, create_some_context,
# load_cl_text, perf_counter) are defined outside this excerpt; `random` is
# presumably numpy.random, since randint is called with a `size` keyword.

# Host-side operands, stored flat: A has n*m elements, B has m*p elements and
# the result C has n*p elements.
a = random.randint(2, size=(n * m)).astype(float32)
b = random.randint(2, size=(m * p)).astype(float32)
c = zeros((n * p), dtype=float32)

# Stage name -> elapsed seconds, in the order the stages ran.
TIMES = {}

ctx = create_some_context()

# Inputs are copied to the device at creation time; the output buffer is only
# sized here (c.nbytes) and filled by the kernel.
a_buf = Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
c_buf = Buffer(ctx, mf.WRITE_ONLY, c.nbytes)

try:
    pt = perf_counter()
    prg = Program(ctx, load_cl_text("multiply_matr.cl")).build()
    TIMES["Compilation"] = perf_counter() - pt

    with CommandQueue(ctx) as queue:
        pt = perf_counter()
        # One work-item per element of C (global size == c.shape).
        # NOTE(review): uint16 limits each dimension to 65535; kept as-is
        # because the kernel's parameter types are not visible here.
        event = prg.multiply(
            queue, c.shape, None,
            uint16(n), uint16(m), uint16(p),
            a_buf, b_buf, c_buf,
        )
        # Kernel launches are asynchronous: block until the kernel has really
        # finished, otherwise only the enqueue overhead is measured and the
        # actual run time leaks into the "Copying" figure.
        event.wait()
        TIMES["Execution"] = perf_counter() - pt

        pt = perf_counter()
        # Read the result back while the queue is still open; the queue must
        # not be used once the `with` block has exited.
        enqueue_copy(queue, c, c_buf)
        TIMES["Copying"] = perf_counter() - pt
finally:
    # Free device memory even if building or running the kernel failed.
    a_buf.release()
    b_buf.release()
    c_buf.release()

print("matrix A:")
print(a.reshape(n, m))
print("matrix B:")
print(b.reshape(m, p))
print("multiplied A*B:")
print(c.reshape(n, p))

# One "<stage>:<TAB><seconds>" line per measured stage.
print("\n".join("%s:\t%g" % i for i in TIMES.items()))