def main():
    cu_discriminant = vectorize([f4(f4, f4, f4), f8(f8, f8, f8)],
                                target='gpu')(poly.discriminant)

    N = 1e+8 // 2

    print 'Data size', N
    
    A, B, C = poly.generate_input(N, dtype=np.float32)
    D = np.empty(A.shape, dtype=A.dtype)

    stream = cuda.stream()

    print '== One'

    ts = time()

    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        dD = cuda.to_device(D, stream, copy=False)
        cu_discriminant(dA, dB, dC, out=dD, stream=stream)
        dD.to_host(stream)

    te = time()
    

    total_time = (te - ts)

    print 'Execution time %.4f' % total_time
    print 'Throughput %.2f' % (N / total_time)

    print '== Chunked'

    chunksize = 1e+7
    chunkcount = N // chunksize

    print 'Chunk size', chunksize

    sA = np.split(A, chunkcount)
    sB = np.split(B, chunkcount)
    sC = np.split(C, chunkcount)
    sD = np.split(D, chunkcount)

    device_ptrs = []

    ts = time()

    with stream.auto_synchronize():
        for a, b, c, d in zip(sA, sB, sC, sD):
            dA = cuda.to_device(a, stream)
            dB = cuda.to_device(b, stream)
            dC = cuda.to_device(c, stream)
            dD = cuda.to_device(d, stream, copy=False)
            cu_discriminant(dA, dB, dC, out=dD, stream=stream)
            dD.to_host(stream)
            device_ptrs.extend([dA, dB, dC, dD])

    te = time()

    total_time = (te - ts)

    print 'Execution time %.4f' % total_time
    print 'Throughput %.2f' % (N / total_time)


    if '-verify' in sys.argv[1:]:
        poly.check_answer(D, A, B, C)
def main():
    cu_discriminant = vectorize([f4(f4, f4, f4), f8(f8, f8, f8)],
                                target='cuda')(poly.discriminant)

    N = 1e+8 // 2

    print('Data size', N)

    A, B, C = poly.generate_input(N, dtype=np.float32)
    D = np.empty(A.shape, dtype=A.dtype)

    stream = cuda.stream()

    print('== One')

    ts = time()

    with stream.auto_synchronize():
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)
        dC = cuda.to_device(C, stream)
        dD = cuda.to_device(D, stream, copy=False)
        cu_discriminant(dA, dB, dC, out=dD, stream=stream)
        dD.to_host(stream)

    te = time()


    total_time = (te - ts)

    print('Execution time %.4f' % total_time)
    print('Throughput %.2f' % (N / total_time))

    print('== Chunked')

    chunksize = 1e+7
    chunkcount = N // chunksize

    print('Chunk size', chunksize)

    sA = np.split(A, chunkcount)
    sB = np.split(B, chunkcount)
    sC = np.split(C, chunkcount)
    sD = np.split(D, chunkcount)

    device_ptrs = []

    ts = time()

    with stream.auto_synchronize():
        for a, b, c, d in zip(sA, sB, sC, sD):
            dA = cuda.to_device(a, stream)
            dB = cuda.to_device(b, stream)
            dC = cuda.to_device(c, stream)
            dD = cuda.to_device(d, stream, copy=False)
            cu_discriminant(dA, dB, dC, out=dD, stream=stream)
            dD.to_host(stream)
            device_ptrs.extend([dA, dB, dC, dD])

    te = time()

    total_time = (te - ts)

    print('Execution time %.4f' % total_time)
    print('Throughput %.2f' % (N / total_time))


    if '-verify' in sys.argv[1:]:
        poly.check_answer(D, A, B, C)