Python examples of numbapro.cuda.pinned

Example #1

0

Show file

File: ga_representation.py Project: RelentlessResults/decompose001

    def symbolise(self, last_result=None):
        """Fit the factor vector with differential evolution and print the result.

        Builds per-parameter search bounds, allocates the host/device work
        buffers, configures the ``compute_sample_gpu`` kernel launch, and runs
        ``scipy.optimize.differential_evolution`` with
        ``self.evaluation_function_cuda`` as the objective while the host
        buffers are pinned. The best vector is evaluated once more afterwards
        and the optimiser result is printed.

        ``last_result`` is accepted but not used by this method.
        """
        self.show_visualisation()

        # One (low, high) pair per optimised parameter.
        # NOTE(review): the indexing below steps by 3 while the array is sized
        # with ``self.depth``; rows stay uninitialised (np.empty) unless
        # self.depth == 3 — TODO confirm.
        bounds = np.empty(shape=(self.height * self.depth, 2), dtype=np.float64)
        for k in range(self.height):
            bounds[k * 3] = [0.001, 1]  # wavelength as a fraction of the window width
            bounds[k * 3 + 1] = [-1, 2]  # offset where window is 0,1
            bounds[k * 3 + 2] = [-1, 1]  # amplitude. Can be negative

        self._working_results = np.zeros(shape=(self.width, self.height), dtype=np.float64)
        # cuda init
        self._factors = np.zeros(shape=(self.height * self.depth,), dtype=np.float64)
        # cuda.profile_start()
        self._d_working_results = cuda.to_device(self._working_results)

        # Launch dimensions must be integers. ``//`` keeps the Python 2 result
        # of the original ``/`` and avoids a float under Python 3 (or with
        # ``from __future__ import division``).
        # NOTE(review): if self.width is not a multiple of 128 the remainder is
        # not covered by the grid — confirm against the kernel.
        grid_dim = self.width // 128, self.height - 1
        block_dim = 128, 1, 1

        self.compute_samples_configured = compute_sample_gpu.configure(grid_dim, block_dim)

        with cuda.pinned(self._factors, self._working_results):
            output = scipy.optimize.differential_evolution(self.evaluation_function_cuda, bounds=bounds,
                                                           strategy='best2bin', maxiter=20, recombination=0.9,
                                                           mutation=(0.0001, 0.3), tol=0.01, init='latinhypercube',
                                                           popsize=15, disp=True, callback=self.step_callback)
        # cuda.profile_stop()

        # Re-evaluate at the best vector found by the optimiser.
        self.evaluation_function_cuda(output.x)
        # raw_input("Press Enter to continue...")
        print(output)

Example #2

0

Show file

File: driver.py Project: julianrcook/numbapro-examples

def driver(pricer, pinned=False):
    """Run ``pricer`` once, then print result and performance summaries.

    A ``(NumPath, NumStep + 1)`` array in Fortran order is created, its first
    column is set to ``StockPrice``, and ``pricer(paths, DT, InterestRate,
    Volatility)`` is timed. The report is computed from the last column of
    ``paths`` after the call.

    Args:
        pricer: Callable invoked as ``pricer(paths, DT, InterestRate,
            Volatility)``.
        pinned: When true, the call is made inside ``cuda.pinned(paths)``.

    Reads the module-level names ``NumPath``, ``NumStep``, ``StockPrice``,
    ``StrikePrice``, ``Maturity``, ``InterestRate``, ``Volatility`` and
    ``timer``. Passing ``--plot`` on the command line plots up to 100 paths.
    """
    paths = np.zeros((NumPath, NumStep + 1), order='F')
    paths[:, 0] = StockPrice
    # NOTE(review): under Python 2 this floors if both constants are ints.
    DT = Maturity / NumStep

    def timed_run():
        # Only the pricer call sits between the two timer reads.
        start = timer()
        pricer(paths, DT, InterestRate, Volatility)
        return start, timer()

    if pinned:
        from numbapro import cuda
        with cuda.pinned(paths):
            ts, te = timed_run()
    else:
        ts, te = timed_run()
    elapsed = te - ts

    ST = paths[:, -1]
    PaidOff = np.maximum(ST - StrikePrice, 0)
    optionprice = np.mean(PaidOff) * exp(-InterestRate * Maturity)

    # Single-argument print(...) calls produce the same output under
    # Python 2 (print statement) and Python 3 (print function).
    fmt = '%20s: %s'
    print('Result')
    print(fmt % ('stock price', np.mean(ST)))
    print(fmt % ('standard error', np.std(ST) / sqrt(NumPath)))
    print(fmt % ('paid off', np.mean(PaidOff)))
    print(fmt % ('option price', optionprice))
    print(fmt % ('Paths ', NumPath))
    print(fmt % ('NumStep ', NumStep))

    print('Performance')
    NumCompute = NumPath * NumStep
    print(fmt % ('Mstep/second', '%.2f' % (NumCompute / elapsed / 1e6)))
    print(fmt % ('time elapsed', '%.3fs' % elapsed))

    if '--plot' in sys.argv:
        from matplotlib import pyplot
        pathct = min(NumPath, 100)
        # range() works on both Python 2 and 3; at most 100 items here.
        for i in range(pathct):
            pyplot.plot(paths[i])
        print('Plotting %d/%d paths' % (pathct, NumPath))
        pyplot.show()

Example #3

0

Show file

File: driver.py Project: tishizuk/numbapro-examples

def driver(pricer, pinned=False):
    """Time one ``pricer`` call and print result and performance summaries.

    Allocates a Fortran-ordered ``(NumPath, NumStep + 1)`` array, seeds its
    first column with ``StockPrice``, and times ``pricer(paths, DT,
    InterestRate, Volatility)``. Statistics are taken from the last column
    of ``paths`` after the call.

    Args:
        pricer: Callable invoked as ``pricer(paths, DT, InterestRate,
            Volatility)``.
        pinned: When true, the call is made inside ``cuda.pinned(paths)``.

    Reads the module-level names ``NumPath``, ``NumStep``, ``StockPrice``,
    ``StrikePrice``, ``Maturity``, ``InterestRate``, ``Volatility`` and
    ``timer``. Passing ``--plot`` on the command line plots up to 100 paths.
    """
    paths = np.zeros((NumPath, NumStep + 1), order='F')
    paths[:, 0] = StockPrice
    # NOTE(review): under Python 2 this floors if both constants are ints.
    DT = Maturity / NumStep

    def timed_run():
        # Only the pricer call sits between the two timer reads.
        begin = timer()
        pricer(paths, DT, InterestRate, Volatility)
        return begin, timer()

    if pinned:
        from numbapro import cuda
        with cuda.pinned(paths):
            ts, te = timed_run()
    else:
        ts, te = timed_run()
    elapsed = te - ts

    ST = paths[:, -1]
    PaidOff = np.maximum(ST - StrikePrice, 0)
    optionprice = np.mean(PaidOff) * exp(-InterestRate * Maturity)

    # Single-argument print(...) calls produce the same output under
    # Python 2 (print statement) and Python 3 (print function).
    fmt = '%20s: %s'
    print('Result')
    print(fmt % ('stock price', np.mean(ST)))
    print(fmt % ('standard error', np.std(ST) / sqrt(NumPath)))
    print(fmt % ('paid off', np.mean(PaidOff)))
    print(fmt % ('option price', optionprice))

    print('Performance')
    NumCompute = NumPath * NumStep
    print(fmt % ('Mstep/second', '%.2f' % (NumCompute / elapsed / 1e6)))
    print(fmt % ('time elapsed', '%.3fs' % elapsed))

    if '--plot' in sys.argv:
        from matplotlib import pyplot
        pathct = min(NumPath, 100)
        # range() works on both Python 2 and 3; at most 100 items here.
        for i in range(pathct):
            pyplot.plot(paths[i])
        print('Plotting %d/%d paths' % (pathct, NumPath))
        pyplot.show()

Example #4

0

Show file

File: pinned.py Project: Aahung/numbapro-examples

# Regular (non-pinned) transfer: copy src to the device, run the kernel,
# copy the result back into dst. ``ts`` is expected to have been set
# before this point.
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)

copy_kernel(d_src, out=d_dst)

d_dst.copy_to_host(dst)
te = timer()

# Formatted as one string so the output is identical under Python 2 and 3.
print('regular %s' % (te - ts,))

# Drop the device array references before the pinned run below.
del d_src, d_dst

assert np.allclose(dst, src)

# Pinned (pagelocked) memory transfer

# src and dst stay pinned for the duration of the with-block.
with cuda.pinned(src, dst):
    ts = timer()
    stream = cuda.stream()  # use stream to trigger async memory transfer
    d_src = cuda.to_device(src, stream=stream)
    d_dst = cuda.device_array_like(dst, stream=stream)

    copy_kernel(d_src, out=d_dst, stream=stream)

    d_dst.copy_to_host(dst, stream=stream)
    # Wait for everything queued on the stream before reading the timer.
    stream.synchronize()
    te = timer()
    # Formatted as one string so the output is identical under Python 2 and 3.
    print('pinned %s' % (te - ts,))

assert np.allclose(dst, src)

Example #5

0

Show file

File: fftconvolve.py Project: codeWiz89/numbapro-examples

def main():
    """Convolve an image with a 5x5 kernel on the CPU and the GPU and plot both.

    The image is read from the path in ``sys.argv[1]``; when no argument is
    given, ``misc.lena()`` is used instead. The CPU result comes from
    ``fftconvolve``; the GPU result is produced with two cuFFT plans on
    separate streams. Timings for both paths are printed.
    """
    # Build Filter
    laplacian = np.array(
        [
            [-4, -1, 0, -1, -4],
            [-1, 2, 3, 2, -1],
            [0, 3, 4, 3, 0],
            [-1, 2, 3, 2, -1],
            [-4, -1, 0, -1, -4],
        ],
        dtype=np.float32,
    )

    # Build Image
    try:
        image = ndimage.imread(sys.argv[1], flatten=True).astype(np.float32)
    except IndexError:
        image = misc.lena().astype(np.float32)

    shape = image.shape
    print("Image size: %s" % (shape,))

    # Kernel placed in the top-left corner of an image-sized zero array.
    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    cpu_start = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode="same")
    cpu_stop = timer()
    print("CPU: %.2fs" % (cpu_stop - cpu_start))

    # GPU
    threadperblock = (32, 8)
    blockpergrid = best_grid_size(shape[::-1], threadperblock)
    print("kernel config: %s x %s" % (blockpergrid, threadperblock))

    plan_args = dict(shape=shape, itype=np.complex64, otype=np.complex64)

    # Trigger initialization the cuFFT system.
    # This takes significant time for small dataset.
    # We should not be including the time wasted here
    cufft.FFTPlan(**plan_args)

    # Start GPU timer
    gpu_start = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    stream1 = cuda.stream()
    stream2 = cuda.stream()

    fftplan1 = cufft.FFTPlan(stream=stream1, **plan_args)
    fftplan2 = cufft.FFTPlan(stream=stream2, **plan_args)

    # pagelock memory
    with cuda.pinned(image_complex, response_complex):

        # We can overlap the transfer of response_complex with the forward FFT
        # on image_complex.
        d_image_complex = cuda.to_device(image_complex, stream=stream1)
        d_response_complex = cuda.to_device(response_complex, stream=stream2)

        fftplan1.forward(d_image_complex, out=d_image_complex)
        fftplan2.forward(d_response_complex, out=d_response_complex)

        # The response spectrum must be ready before stream1 consumes it.
        stream2.synchronize()

        # presumably an in-place element-wise product of the two spectra —
        # verify against mult_inplace
        mult_inplace[blockpergrid, threadperblock, stream1](d_image_complex, d_response_complex)
        fftplan1.inverse(d_image_complex, out=d_image_complex)

        # implicitly synchronizes the streams
        host_result = d_image_complex.copy_to_host()
        cvimage_gpu = host_result.real / np.prod(shape)

    gpu_stop = timer()
    print("GPU: %.2fs" % (gpu_stop - gpu_start))

    # Plot the results
    panels = (("CPU", cvimage_cpu), ("GPU", cvimage_gpu))
    for position, (title, result) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.title(title)
        plt.imshow(result, cmap=plt.cm.gray)
        plt.axis("off")

    plt.show()

Example #6

0

Show file

# Regular (non-pinned) transfer: src goes to the device, the kernel runs,
# and the result is copied back into dst. ``ts`` is expected to have been
# set before this point.
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)

copy_kernel(d_src, out=d_dst)

d_dst.copy_to_host(dst)
te = timer()

# One formatted string prints the same under Python 2 and Python 3.
print('regular %s' % (te - ts,))

# Drop the device array references before the pinned run below.
del d_src, d_dst

assert np.allclose(dst, src)

# Pinned (pagelocked) memory transfer

# src and dst stay pinned for the duration of the with-block.
with cuda.pinned(src, dst):
    ts = timer()
    stream = cuda.stream()  # use stream to trigger async memory transfer
    d_src = cuda.to_device(src, stream=stream)
    d_dst = cuda.device_array_like(dst, stream=stream)

    copy_kernel(d_src, out=d_dst, stream=stream)

    d_dst.copy_to_host(dst, stream=stream)
    # Wait for everything queued on the stream before reading the timer.
    stream.synchronize()
    te = timer()
    # One formatted string prints the same under Python 2 and Python 3.
    print('pinned %s' % (te - ts,))

assert np.allclose(dst, src)