Code Example #1
def main():
    inp = np.arange(1000000, dtype=np.int32)
    factor = 4
    start, end = cuda.event(True), cuda.event(True)

    reses = []
    for (name, f) in [
        ("not shared", mult_by_x_not_shared),
        ("shared", mult_by_x_shared),
        ("not shared", mult_by_x_not_shared),
    ]:
        times = []
        for i in range(100):

            d_out = cuda.device_array_like(inp)

            start.record()
            f[blocks, threadsPerBlock](cuda.to_device(inp), d_out,
                                       cuda.to_device(np.array([factor])))
            end.record()
            end.synchronize()

            out = d_out.copy_to_host()

            # skip the first iteration, which includes JIT compilation time
            if i != 0:
                times.append(cuda.event_elapsed_time(start, end))
        print(
            f"{name}: {np.mean(times):.2f} +/- {np.std(times) / np.sqrt(len(times)):.3f} (max: {np.max(times):.2f})"
        )
        reses.append(out)
    assert np.all([reses[0] == reses_i for reses_i in reses])
Code Example #2
File: laplace_7pt_ex.py  Project: orgarten/hack18_dd
def laplace_3d_cuda(d, n):
    L = d.shape[0]
    M = d.shape[1]
    N = d.shape[2]

    blockdim = (8, 8, 8)
    griddim = (L // blockdim[0], M // blockdim[1], N // blockdim[2])
    #print(griddim)

    stream = cuda.stream()
    dd = cuda.to_device(d, stream)
    dn = cuda.to_device(n, stream)

    #%timeit -n 32 -r 16 d0td1_cuda_kernel[griddim, blockdim](dd, dn)
    # warm-up launches (triggers JIT compilation before timing)
    for i in range(0, 100):
        laplace_3d_cuda_opt_kernel[griddim, blockdim, stream](dd, dn, L, M, N)

    evtstart = cuda.event(timing=True)
    evtend = cuda.event(timing=True)
    evtstart.record(stream)
    for i in range(100):
        laplace_3d_cuda_opt_kernel[griddim, blockdim, stream](dd, dn, L, M, N)
    evtend.record(stream)
    evtend.synchronize()
    print(cuda.event_elapsed_time(evtstart, evtend) / 100.)

    dd.to_host()
Code Example #3
File: Timer.py  Project: NVIDIA/rapidAligner
    def __init__(self, label='', gpu=0):

        self.label = label
        self.gpu = gpu
        self.start = cuda.event()
        self.end = cuda.event()
        cuda.select_device(self.gpu)
        self.start.record()
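
The snippet above shows only the constructor; a minimal sketch of how such an event-based timer might be completed is given below (the `stop` method name and the print format are illustrative assumptions, not taken from the rapidAligner sources).

    def stop(self):
        # record the end event, wait for it to complete, then report the
        # elapsed GPU time between the two events in milliseconds
        self.end.record()
        self.end.synchronize()
        elapsed_ms = cuda.event_elapsed_time(self.start, self.end)
        print('{0}: {1:.3f} ms'.format(self.label, elapsed_ms))
        return elapsed_ms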
Code Example #4
def main(image1, image2):
    streams = []
    start_events = []
    end_events = []
    data1_gpu = []
    data2_gpu = []
    gpu_out = []
    out = []

    data_image1 = np.array(image1)
    data_image2 = np.array(image2)
    print(data_image1.shape, data_image2.shape)

    shape_A = data_image1.shape
    # flatten to 1-D arrays
    data_image1 = data_image1.ravel()
    data_image2 = data_image2.ravel()

    input1 = np.split(data_image1, X)
    input2 = np.split(data_image2, X)

    for _ in range(len(input1)):
        streams.append(cuda.stream())
        start_events.append(cuda.event())
        end_events.append(cuda.event())

    for i in range(len(input1)):
        data1_gpu.append(cuda.to_device(input1[i], stream=streams[i]))
        data2_gpu.append(cuda.to_device(input2[i], stream=streams[i]))

    t_start = perf_counter()
    for i in range(len(input1)):
        start_events[i].record(streams[i])
        sumImages[1, 32, streams[i]](data1_gpu[i], data2_gpu[i])
    t_end = perf_counter()

    for i in range(len(input1)):
        end_events[i].record(streams[i])
        gpu_out.append(data2_gpu[i].copy_to_host(stream=streams[i]))

    for i in range(len(gpu_out)):
        out = np.concatenate((out, gpu_out[i]))

    kernel_times = []

    for k in range(len(input1)):
        kernel_times.append(
            cuda.event_elapsed_time(start_events[k], end_events[k]))

    out = out.reshape(shape_A)
    out = out.astype('uint8')
    out = Image.fromarray(out)
    out.save("out_stream.png")
    print(f'Total time: {t_end - t_start}')
    print(f'Mean kernel duration (milliseconds): {np.mean(kernel_times)}')
    print(f'Kernel duration standard deviation \
          (milliseconds): {np.std(kernel_times)}')
Code Example #5
    def test_event_elapsed(self):
        N = 32
        dary = cuda.device_array(N, dtype=np.double)
        evtstart = cuda.event()
        evtend = cuda.event()

        evtstart.record()
        cuda.to_device(np.arange(N), to=dary)
        evtend.record()
        evtend.wait()
        evtend.synchronize()
        print(evtstart.elapsed_time(evtend))
Code Example #6
File: test_events.py  Project: ASPP/numba
    def test_event_elapsed(self):
        N = 32
        dary = cuda.device_array(N, dtype=np.double)
        evtstart = cuda.event()
        evtend = cuda.event()

        evtstart.record()
        cuda.to_device(np.arange(N), to=dary)
        evtend.record()
        evtend.wait()
        evtend.synchronize()
        print(evtstart.elapsed_time(evtend))
Code Example #7
    def test_event_elapsed_stream(self):
        N = 32
        stream = cuda.stream()
        dary = cuda.device_array(N, dtype=np.double)
        evtstart = cuda.event()
        evtend = cuda.event()

        evtstart.record(stream=stream)
        cuda.to_device(np.arange(N, dtype=np.double), to=dary, stream=stream)
        evtend.record(stream=stream)
        evtend.wait(stream=stream)
        evtend.synchronize()
        # Exercise the code path
        evtstart.elapsed_time(evtend)
Code Example #8
File: test_events.py  Project: cpcloud/numba
    def test_event_elapsed_stream(self):
        N = 32
        stream = cuda.stream()
        dary = cuda.device_array(N, dtype=np.double)
        evtstart = cuda.event()
        evtend = cuda.event()

        evtstart.record(stream=stream)
        cuda.to_device(np.arange(N), to=dary, stream=stream)
        evtend.record(stream=stream)
        evtend.wait(stream=stream)
        evtend.synchronize()
        # Exercise the code path
        evtstart.elapsed_time(evtend)
Code Example #9
 def __init__(self, shape, dtype, prealloc):
     self.device = cuda.get_current_device()
     self.freelist = deque()
     self.events = {}
     for i in range(prealloc):
         gpumem = cuda.device_array(shape=shape, dtype=dtype)
         self.freelist.append(gpumem)
         self.events[gpumem] = cuda.event(timing=False)
Code Example #10
 def __init__(self, shape, dtype, prealloc):
     self.device = cuda.get_current_device()
     self.freelist = deque()
     self.events = {}
     for i in range(prealloc):
         gpumem = cuda.device_array(shape=shape, dtype=dtype)
         self.freelist.append(gpumem)
         self.events[gpumem] = cuda.event(timing=False)
Code Example #11
def main():
    # Compile
    lots_of_copies()
    few_copies()

    # Now benchmark
    start, end = cuda.event(timing=True), cuda.event(timing=True)
    n = 200
    for f in [lots_of_copies, few_copies]:
        times = []
        for _ in range(n):
            start.record()
            f()
            end.record()
            end.synchronize()
            t = cuda.event_elapsed_time(start, end)
            times.append(t)
        print(f.__name__, np.mean(times), np.std(times) / np.sqrt(n))
Code Example #12
File: KMeans.py  Project: diogo-aos/masters_final
    def _set_up_profiling(self):
        # set up profiling for manual gpu mem management
        if self._cuda_mem == 'manual':
            if not hasattr(self, 'man_prof'):
                man_prof_events = ('data_ev1', 'data_ev2',
                                   'labels_ev1', 'labels_ev2',
                                   'dists_ev1', 'dists_ev2',
                                   'centroids_ev1', 'centroids_ev2',
                                   'kernel_ev1', 'kernel_ev2')
                self.man_prof = {key: cuda.event() for key in man_prof_events}
                man_prof_timings = ('data_timings', 'labels_timings',
                                    'dists_timings', 'kernel_timings',
                                    'centroids_timings')
                self.man_prof.update({key: list() for key in man_prof_timings})

        # set up profiling for auto gpu mem management
        elif self._cuda_mem == 'auto':
            if not hasattr(self, 'auto_prof'):
                auto_prof_events = ('kernel_ev1', 'kernel_ev2')
                self.auto_prof = {key: cuda.event() for key in auto_prof_events}
                self.auto_prof['kernel_timings'] = list()
Code Example #13
File: test_scan.py  Project: Chiroptera/ThesisWriting
def last_block_test():

    MAX_TPB = 512
    n = 1024

    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    start = timer()
    scan.exprefixsumNumba(a, reference, init = 0)
    end = timer()

    auxidx = -1

    elb = a.size
    p2elb = int(np.ceil(np.log2(elb)))
    telb = 2 ** p2elb
    tlb = telb // 2
    startIdx = 0

    sm_size = telb * a.itemsize

    aux = np.empty(1,dtype=np.int8)

    trash = cuda.device_array(1)

    e1, e2 = cuda.event(), cuda.event()

    e1.record()
    scan.last_scan[1, tlb, 0, sm_size](a, aux, -1, elb, startIdx)
    e2.record()
    # wait for the end event to complete before reading the elapsed time
    e2.synchronize()

    print("CPU took:    ", (end - start) * 1000, " ms")
    print("Kernel took: ", cuda.event_elapsed_time(e1, e2), " ms")

    print((a == reference).all())
Code Example #14
def test_last_block():

    MAX_TPB = 512
    n = 1024

    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    start = timer()
    MyScan.exprefixsumNumba(a, reference, init=0)
    end = timer()

    auxidx = -1

    elb = a.size
    p2elb = int(np.ceil(np.log2(elb)))
    telb = 2 ** p2elb
    tlb = telb // 2
    startIdx = 0

    sm_size = telb * a.itemsize

    aux = np.empty(1, dtype=np.int8)

    trash = cuda.device_array(1)

    e1, e2 = cuda.event(), cuda.event()

    e1.record()
    MyScan.last_scan[1, tlb, 0, sm_size](a, aux, -1, elb, startIdx)
    e2.record()
    # wait for the end event to complete before reading the elapsed time
    e2.synchronize()

    print("CPU took:    ", (end - start) * 1000, " ms")
    print("Kernel took: ", cuda.event_elapsed_time(e1, e2), " ms")

    print((a == reference).all())
Code Example #15
from timeit import default_timer as timer  # assumed source of `timer` used below

import numpy as np
from numba import cuda

@cuda.jit('void(float32[:], float32[:])')
def cu_copy_array(dst, src):
    i = cuda.grid(1)
    dst[i] = src[i]


BLOCKCOUNT = 25000
BLOCKSIZE = 256

aryA = np.arange(BLOCKSIZE * BLOCKCOUNT, dtype=np.float32)

print('data size: %.1fMB' % (aryA.size * aryA.dtype.itemsize / (2**20)))

evt_total_begin = cuda.event()
evt_total_end = cuda.event()

evt_kernel_begin = cuda.event()
evt_kernel_end = cuda.event()

t_total_begin = timer()
evt_total_begin.record()

# explicitly transfer memory
d_aryA = cuda.to_device(aryA)
d_aryB = cuda.device_array_like(aryA)

evt_kernel_begin.record()

t_kernel_begin = timer()
Code Example #16
                qz = z2[j]
                qw = w2[j]
                dx = px - qx
                dy = py - qy
                dz = pz - qz
                wprod = pw * qw
                dsq = dx * dx + dy * dy + dz * dz

                k = nbins - 1
                while dsq <= rbins_squared[k]:
                    cuda.atomic.add(result, k - 1, wprod)
                    k -= 1
                    if k <= 0:
                        break

    start = cuda.event()
    end = cuda.event()
    timing_nb = 0
    timing_nb_wall = 0

    d_x1 = cuda.to_device(x1.astype(np.float32))
    d_y1 = cuda.to_device(y1.astype(np.float32))
    d_z1 = cuda.to_device(z1.astype(np.float32))
    d_w1 = cuda.to_device(w1.astype(np.float32))

    d_x2 = cuda.to_device(x2.astype(np.float32))
    d_y2 = cuda.to_device(y2.astype(np.float32))
    d_z2 = cuda.to_device(z2.astype(np.float32))
    d_w2 = cuda.to_device(w2.astype(np.float32))

    d_rbins_squared = cuda.to_device(DEFAULT_RBINS_SQUARED.astype(np.float32))
Code Example #17
def intensityCalculations(GratingSeparation, WaveNumber, sourcePoints,
                          obsPoints, sourceAmp, sourcePhase):
    """This function is used as an abstraction layer for the CUDA kernel. This function does the type casting and is able to return values.
      
    Args:
      GratingSeparation (float):Constant passed in and used as distance (on the x-plane) between source and observation points
      WaveNumber (float):       Constant defined in global variables. I think it has to do with wave length; not sure why its named wavenumber
      sourcePoints (f4[:]):     Position of source points as an array of float32
      obsPoints (f4[:]):        Position of observation points as an array of float32
      sourceAmp (f4[:]):        Amplitudes from each source point as an array of float32
      sourcePhase (c8[:]):      Phase of each source point as an array of complex128
    Returns:
      return intensities, amplituteds, phases;
        intensities (f4[:]):	Array of intensities for each observation point as an array of float32
        amplitudes  (f4[:]):	Array of amplitudes  for each observation point as an array of float32
        phases	    (c8[:]):	Array of phases      for each observation point as an array of complex128
    Changelog:
    TODO:
    Author:
      Alec Buchanan - 3/2018
    """
    # Specify the number of CUDA threads
    arraySize = len(obsPoints)
    threadsperblock = 32
    blockspergrid = (arraySize + (threadsperblock - 1)) // threadsperblock

    # initialize output variables
    out_i = [0.0] * arraySize  # intensity
    out_a = [0.0] * arraySize  # amplitude
    out_p = [0.0] * arraySize  # phase

    # Cast datatypes so the kernel does not complain
    GratingSeparation = float(GratingSeparation)
    WaveNumber = float(WaveNumber)
    sourcePoints = np.array(sourcePoints,
                            dtype='f4')  # 32-bit float array, 4 bytes
    obsPoints = np.array(obsPoints, dtype='f4')  # 32-bit float array, 4 bytes
    sourcePhase = np.array(sourcePhase,
                           dtype='c8')  # 64-bit complex array, 8 bytes
    out_p = np.array(out_p, dtype='c8')  # 64-bit complex array, 8 bytes
    out_a = np.array(out_a, dtype='f4')  # 32-bit float array, 4 bytes
    out_i = np.array(out_i, dtype='f4')  # 32-bit float array, 4 bytes

    evt_total_begin = cuda.event()
    evt_total_end = cuda.event()

    evt_mem_begin = cuda.event()
    evt_mem_end = cuda.event()

    evt_mem2_begin = cuda.event()
    evt_mem2_end = cuda.event()

    evt_kernel_begin = cuda.event()
    evt_kernel_end = cuda.event()

    evt_total_begin.record()
    evt_mem_begin.record()

    d_sourcePoints = cuda.to_device(sourcePoints)
    d_obsPoints = cuda.to_device(obsPoints)
    d_sourceAmp = cuda.to_device(sourceAmp)
    d_sourcePhase = cuda.to_device(sourcePhase)
    d_out_i = cuda.to_device(out_i)
    d_out_a = cuda.to_device(out_a)
    d_out_p = cuda.to_device(out_p)

    evt_mem_end.record()
    evt_mem_end.synchronize()

    evt_kernel_begin.record()
    # call CUDA kernel
    intensityKernel[blockspergrid,
                    threadsperblock](GratingSeparation, WaveNumber,
                                     d_sourcePoints, d_obsPoints, d_sourceAmp,
                                     d_sourcePhase, d_out_p, d_out_a, d_out_i)

    evt_kernel_end.record()
    evt_kernel_end.synchronize()

    evt_mem2_begin.record()

    out_i = d_out_i.copy_to_host()
    out_a = d_out_a.copy_to_host()
    out_p = d_out_p.copy_to_host()

    evt_mem2_end.record()
    evt_mem2_end.synchronize()

    evt_total_end.record()
    evt_total_end.synchronize()

    print('total time: %fms' % evt_total_begin.elapsed_time(evt_total_end))
    print('mem-to-gpu time: %fms' % evt_mem_begin.elapsed_time(evt_mem_end))
    print('kernel time: %fms' % evt_kernel_begin.elapsed_time(evt_kernel_end))
    print('mem-from-gpu time: %fms' %
          evt_mem2_begin.elapsed_time(evt_mem2_end))
    # Remove Imaginary parts
    out_i = np.array(out_i, dtype='f4')
    out_a = np.array(out_a, dtype='f4')

    return out_i, out_a, out_p
Code Example #18
File: KMeans.py  Project: diogo-aos/masters_final
            new_centroids[i] = data[furtherDistsArgs[j]]
            j += 1

    return new_centroids


if __name__ == '__main__':

    n = 10000
    d = 200
    k = 50

    data = np.random.random((n, d)).astype(np.float32)
    centroids = np.random.random((k, d)).astype(np.float32)

    kt_start, kt_end = cuda.event(), cuda.event()

    # grid config
    tpb = 256
    bpg = int(np.ceil(float(n) / tpb))

    # compile kernel
    dData = cuda.to_device(data)
    dCentroids = cuda.to_device(centroids)
    dLabels = cuda.device_array(n, dtype=np.int32)
    dDists = cuda.device_array(n, dtype=np.float32)
    _cu_label_kernel_dists[bpg, tpb](dData, dCentroids, dLabels, dDists)

    ## data column major
    # GPU data
    data_t = data.T
Code Example #19
def mst_cluster_coassoc():
    t1,t2 = Timer(), Timer()

    #foldername = "/home/courses/aac2015/diogoaos/QCThesis/datasets/gaussmix1e4/"
    foldername = home + "QCThesis/datasets/gaussmix1e4/"

    print "Loading datasets"

    t1.tic()
    # dest = np.genfromtxt(foldername + "prot_dest.csr", dtype = np.int32, delimiter=",")
    # weight = np.genfromtxt(foldername + "prot_weight.csr", dtype = np.float32, delimiter=",")
    # fe = np.genfromtxt(foldername + "prot_fe.csr", dtype = np.int32, delimiter=",")

    dest = np.genfromtxt(foldername + "full_dest.csr", dtype = np.int32, delimiter=",")
    weight = np.genfromtxt(foldername + "full_weight.csr", dtype = np.float32, delimiter=",")
    fe = np.genfromtxt(foldername + "full_fe.csr", dtype = np.int32, delimiter=",")
    t1.tac()

    print "loading elapsed time : ", t1.elapsed

    fe = fe[:-1]
    od = np.empty_like(fe)
    outdegree_from_firstedge(fe, od, dest.size)

    # fix weights to dissimilarity
    weight = 100 - weight

    print "# edges : ", dest.size
    print "# vertices : ", fe.size
    print "edges/vertices ratio : ", dest.size * 1.0 / fe.size

    t1.tic()
    mst, n_edges = boruvka_minho_seq(dest, weight, fe, od)
    t1.tac()

    print "seq: time elapsed : ", t1.elapsed
    print "seq: mst size :", mst.size
    print "seq: n_edges : ", n_edges

    if n_edges < mst.size:
        mst = mst[:n_edges]
    mst.sort()

    ev1,ev2 = cuda.event(), cuda.event()

    ev1.record()
    d_dest = cuda.to_device(dest)
    d_weight = cuda.to_device(weight)
    d_fe = cuda.to_device(fe)
    d_od = cuda.to_device(od)
    ev2.record()
    ev2.synchronize()

    send_graph_time = cuda.event_elapsed_time(ev1, ev2)

    t2.tic()
    mst2, n_edges2 = boruvka_minho_gpu(d_dest, d_weight, d_fe, d_od, MAX_TPB=512, returnDevAry = True)
    t2.tac()

    ev1.record()
    mst2 = mst2.copy_to_host()
    n_edges2 = n_edges2.getitem(0)
    ev2.record()
    ev2.synchronize()

    recv_mst_time = cuda.event_elapsed_time(ev1, ev2)
    print("gpu: send graph time : ", send_graph_time)
    print("gpu: time elapsed : ", t2.elapsed)
    print("gpu: rcv mst time : ", recv_mst_time)
    print("gpu: mst size :", mst2.size)
    print("gpu: n_edges : ", n_edges2)

    if n_edges2 < mst2.size:
        mst2 = mst2[:n_edges2]
    mst2.sort()

    if n_edges == n_edges2:
        mst_is_equal = (mst == mst2).all()
    else:
        mst_is_equal = False
    print "mst gpu == seq : ", mst_is_equal
Code Example #20
 
import numpy as np
from numba import cuda


@cuda.jit
def kernel(array):
    thd = cuda.grid(1)
    num_iters = array.size // cuda.blockDim.x
    for j in range(num_iters):
        i = j * cuda.blockDim.x + thd
        for k in range(50):
            array[i] *= 2.0
            array[i] /= 2.0
 
 
data = np.random.randn(array_len).astype('float32')
data_gpu = cuda.to_device(data)
 
start_event = cuda.event()
end_event = cuda.event()
 
start_event.record()
kernel[1, 64](data_gpu)
end_event.record()
 
# waits until `end_event` is marked as finished
# end_event.synchronize()
 
print('Has the kernel started yet? {}'.format(start_event.query()))
print('Has the kernel ended yet? {}'.format(end_event.query()))
 
# computes how long the kernel execution took.
# print('Kernel execution time in milliseconds: %f ' %
#       cuda.event_elapsed_time(start_event, end_event))
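
For reference, a minimal sketch of the fully synchronized variant that the commented-out lines above hint at, reusing `kernel` and `data_gpu` from the snippet (`array_len` is still assumed to be defined elsewhere):

start_event.record()
kernel[1, 64](data_gpu)
end_event.record()
# block the host until the end event has completed, so the elapsed
# time between the two events can be read safely
end_event.synchronize()
print('Kernel execution time in milliseconds: %f' %
      cuda.event_elapsed_time(start_event, end_event))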
Code Example #21
File: KMeans.py  Project: diogo-aos/masters_final
    def _cu_label(self, data, centroids):
        #WARNING: data is being transposed when sending to GPU

        data_ev1, data_ev2 = cuda.event(), cuda.event()
        labels_ev1, labels_ev2 = cuda.event(), cuda.event()
        dists_ev1, dists_ev2 = cuda.event(), cuda.event()

        N, D = data.shape
        K, cD = centroids.shape

        if self._cuda_mem not in ('manual','auto'):
            raise Exception("cuda_mem = \'manual\' or \'auto\'")

        if self._gridDim is None or self._blockDim is None:
            self._compute_cuda_dims(data)

        labels = np.empty(N, dtype=np.int32)

        if self._cuda_mem == 'manual':
            # copy dataset and centroids, allocate memory

            ## cuda persistent handles
            # avoids redundant data transfer
            # if dataset has not been sent to device, send it and save handle
            if self._cudaDataHandle is None:
                dataT = np.ascontiguousarray(data.T)

                self.man_prof['data_ev1'].record()
                dData = cuda.to_device(dataT)
                self.man_prof['data_ev2'].record()
                self.man_prof['data_ev2'].synchronize()
                time_ms = cuda.event_elapsed_time(self.man_prof['data_ev1'],
                                                  self.man_prof['data_ev2'])
                self.man_prof['data_timings'].append(time_ms)
                self._cudaDataHandle = dData
            # otherwise just use handle
            else:
                dData = self._cudaDataHandle

            # avoids creating labels array in device more than once
            if self._cuda_labels_handle is None:
                dLabels = cuda.device_array_like(labels)
                self._cuda_labels_handle = dLabels
            else:
                dLabels = self._cuda_labels_handle

            # avoids creating dists array in device more than once
            if self._cuda_dists_handle is None:
                dDists = cuda.device_array_like(self._dists)
                self._cuda_dists_handle = dDists
            else:
                dDists = self._cuda_dists_handle

            # copy centroids to device
            self.man_prof['centroids_ev1'].record()
            dCentroids = cuda.to_device(centroids)
            self.man_prof['centroids_ev2'].record()

            # launch kernel
            self.man_prof['kernel_ev1'].record()
            _cu_label_kernel_dists[self._gridDim, self._blockDim](dData,
                                                                 dCentroids,
                                                                 dLabels,
                                                                 dDists)
            self.man_prof['kernel_ev2'].record()

            # cuda.synchronize()

            # self.man_prof['kernel_ev2'].synchronize()

            # copy labels from device to host
            self.man_prof['labels_ev1'].record()
            dLabels.copy_to_host(ary=labels)
            self.man_prof['labels_ev2'].record()

            # copy distance to centroids from device to host
            self.man_prof['dists_ev1'].record()
            dists = dDists.copy_to_host()
            self.man_prof['dists_ev2'].record()
            self._dists = dists

            # synchronize host with gpu before computing times
            self.man_prof['dists_ev2'].synchronize()

            # store timings
            time_ms = cuda.event_elapsed_time(self.man_prof['centroids_ev1'],
                                              self.man_prof['centroids_ev2'])
            self.man_prof['centroids_timings'].append(time_ms)

            time_ms = cuda.event_elapsed_time(self.man_prof['kernel_ev1'],
                                              self.man_prof['kernel_ev2'])
            self.man_prof['kernel_timings'].append(time_ms)

            time_ms = cuda.event_elapsed_time(self.man_prof['labels_ev1'],
                                              self.man_prof['labels_ev2'])
            self.man_prof['labels_timings'].append(time_ms)

            time_ms = cuda.event_elapsed_time(self.man_prof['dists_ev1'],
                                              self.man_prof['dists_ev2'])
            self.man_prof['dists_timings'].append(time_ms)

        elif self._cuda_mem == 'auto':
            self.auto_prof['kernel_ev1'].record()
            _cu_label_kernel_dists[self._gridDim,self._blockDim](data,
                                                                centroids,
                                                                labels,
                                                                self._dists)
            self.auto_prof['kernel_ev2'].record()
            self.auto_prof['kernel_ev2'].synchronize()
            time_ms = cuda.event_elapsed_time(self.auto_prof['kernel_ev1'],
                                              self.auto_prof['kernel_ev2'])
            self.auto_prof['kernel_timings'].append(time_ms)

        else:
            raise ValueError("CUDA memory management type may either \
                              be \'manual\' or \'auto\'.")

        return labels
Code Example #22
    def generate_batch(self,
                       end=None,
                       verbose=False,
                       fused=False,
                       nested_cva_at=None,
                       nested_im_at=None,
                       indicator_in_cva=False,
                       alpha=None,
                       im_window=None):
        if end is None:
            end = self.num_coarse_steps
        t = 0.
        self._reset()
        self.cuda_generate_exp1(self.d_exp_1, self.d_rng_states)
        self.stream.synchronize()
        self.cuda_compute_mtm(
            0, t, self.d_X, self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
            self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
            self.d_vanillas_on_fx_b8, self.d_irs_f32, self.d_irs_i32,
            self.d_zcs_f32, self.d_zcs_i32, self.dt, self.max_coarse_per_reset,
            self.cDtoH_freq, True)
        self.stream.synchronize()
        self.d_mtm_by_cpty[0].copy_to_host(ary=self.mtm_by_cpty[0],
                                           stream=self.stream)
        self.d_cash_flows_by_cpty[0].copy_to_host(
            ary=self.cash_flows_by_cpty[0], stream=self.stream)

        _cuda_bulk_diffuse_event_begin = [cuda.event() for i in range(end)]
        _cuda_bulk_diffuse_event_end = [cuda.event() for i in range(end)]

        _cuda_compute_mtm_event_begin = [cuda.event() for i in range(end)]
        _cuda_compute_mtm_event_end = [cuda.event() for i in range(end)]

        _cuda_nested_cva_event_begin = [cuda.event() for i in range(end)]
        _cuda_nested_cva_event_end = [cuda.event() for i in range(end)]

        _cuda_nested_im_event_begin = [cuda.event() for i in range(end)]
        _cuda_nested_im_event_end = [cuda.event() for i in range(end)]

        for coarse_idx in range(1, end + 1):
            t += self.dT
            idx_in_dev_arr = (coarse_idx - 1) % self.cDtoH_freq + 1
            if not fused:
                _cuda_bulk_diffuse_event_begin[coarse_idx -
                                               1].record(stream=self.stream)
                self.cuda_bulk_diffuse(
                    idx_in_dev_arr, t, self.d_X, self.d_def_indicators,
                    self.d_dom_rate_integral, self.d_spread_integrals,
                    self.d_irs_f32, self.d_irs_i32, self.d_exp_1,
                    self.d_rng_states, self.dt, self.max_coarse_per_reset)
                _cuda_bulk_diffuse_event_end[coarse_idx -
                                             1].record(stream=self.stream)
                _cuda_compute_mtm_event_begin[coarse_idx -
                                              1].record(stream=self.stream)
                self.cuda_compute_mtm(
                    idx_in_dev_arr, t, self.d_X, self.d_mtm_by_cpty,
                    self.d_cash_flows_by_cpty, self.d_vanillas_on_fx_f32,
                    self.d_vanillas_on_fx_i32, self.d_vanillas_on_fx_b8,
                    self.d_irs_f32, self.d_irs_i32, self.d_zcs_f32,
                    self.d_zcs_i32, self.dt, self.max_coarse_per_reset,
                    self.cDtoH_freq, False)
                _cuda_compute_mtm_event_end[coarse_idx -
                                            1].record(stream=self.stream)
            else:
                _cuda_bulk_diffuse_event_begin[coarse_idx -
                                               1].record(stream=self.stream)
                if idx_in_dev_arr == 1:
                    self.cuda_diffuse_and_price(
                        1, self.cDtoH_freq, t, self.d_X,
                        self.d_dom_rate_integral, self.d_spread_integrals,
                        self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
                        self.d_irs_f32, self.d_irs_i32,
                        self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
                        self.d_vanillas_on_fx_b8, self.d_rng_states, self.dt,
                        self.max_coarse_per_reset, self.cDtoH_freq)
                    self.cuda_oversimulate_defs(1, self.cDtoH_freq,
                                                self.d_def_indicators,
                                                self.d_spread_integrals,
                                                self.d_exp_1)
                _cuda_bulk_diffuse_event_end[coarse_idx -
                                             1].record(stream=self.stream)
            if nested_cva_at is not None:
                _cuda_nested_cva_event_begin[coarse_idx -
                                             1].record(stream=self.stream)
                if coarse_idx in nested_cva_at:
                    self.cuda_nested_cva(
                        idx_in_dev_arr, self.num_coarse_steps - coarse_idx, t,
                        self.d_X, self.d_def_indicators,
                        self.d_dom_rate_integral, self.d_spread_integrals,
                        self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
                        self.d_irs_f32, self.d_irs_i32,
                        self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
                        self.d_vanillas_on_fx_b8, self.d_exp_1,
                        self.d_rng_states, self.dt, self.cDtoH_freq,
                        indicator_in_cva, self.d_nested_cva,
                        self.d_nested_cva_sq)
                    self.d_nested_cva.copy_to_host(
                        ary=self.nested_cva[coarse_idx], stream=self.stream)
                    self.d_nested_cva_sq.copy_to_host(
                        ary=self.nested_cva_sq[coarse_idx], stream=self.stream)
                _cuda_nested_cva_event_end[coarse_idx -
                                           1].record(stream=self.stream)

            if nested_im_at is not None:
                _cuda_nested_im_event_begin[coarse_idx -
                                            1].record(stream=self.stream)
                if coarse_idx in nested_im_at:
                    for adam_iter in range(self.num_adam_iters):
                        adam_init = adam_iter == 0
                        step_size = self.lam * (adam_iter + 1)**(-self.gamma)
                        self.cuda_nested_im(
                            alpha, adam_init, step_size, idx_in_dev_arr,
                            im_window, t, self.d_X,
                            self.d_mtm_by_cpty[idx_in_dev_arr], self.d_irs_f32,
                            self.d_irs_i32, self.d_vanillas_on_fx_f32,
                            self.d_vanillas_on_fx_i32,
                            self.d_vanillas_on_fx_b8, self.d_rng_states,
                            self.dt, self.d_nested_im_by_cpty,
                            self.d_nested_im_std_by_cpty, self.d_nested_im_m,
                            self.d_nested_im_v, self.adam_b1, self.adam_b2,
                            adam_iter)
                    self.d_nested_im_by_cpty.copy_to_host(
                        ary=self.nested_im_by_cpty[coarse_idx],
                        stream=self.stream)
                _cuda_nested_im_event_end[coarse_idx -
                                          1].record(stream=self.stream)

            if coarse_idx % self.cDtoH_freq == 0:
                self.d_X[self.max_coarse_per_reset:].copy_to_host(
                    ary=self.X[coarse_idx - self.cDtoH_freq + 1:coarse_idx +
                               1],
                    stream=self.stream)
                self.d_spread_integrals[1:].copy_to_host(
                    ary=self.spread_integrals[coarse_idx - self.cDtoH_freq +
                                              1:coarse_idx + 1],
                    stream=self.stream)
                self.d_dom_rate_integral[1:].copy_to_host(
                    ary=self.dom_rate_integral[coarse_idx - self.cDtoH_freq +
                                               1:coarse_idx + 1],
                    stream=self.stream)
                self.d_def_indicators[1:].copy_to_host(
                    ary=self.def_indicators[coarse_idx - self.cDtoH_freq +
                                            1:coarse_idx + 1],
                    stream=self.stream)
                self.d_mtm_by_cpty[1:].copy_to_host(
                    ary=self.mtm_by_cpty[coarse_idx - self.cDtoH_freq +
                                         1:coarse_idx + 1],
                    stream=self.stream)
                self.d_cash_flows_by_cpty[1:].copy_to_host(
                    ary=self.cash_flows_by_cpty[coarse_idx - self.cDtoH_freq +
                                                1:coarse_idx + 1],
                    stream=self.stream)
                self.d_X[:self.max_coarse_per_reset].copy_to_device(
                    self.d_X[-self.max_coarse_per_reset:], stream=self.stream)
                self.d_spread_integrals[0].copy_to_device(
                    self.d_spread_integrals[self.cDtoH_freq],
                    stream=self.stream)
                self.d_dom_rate_integral[0].copy_to_device(
                    self.d_dom_rate_integral[self.cDtoH_freq],
                    stream=self.stream)
                self.d_def_indicators[0].copy_to_device(
                    self.d_def_indicators[self.cDtoH_freq], stream=self.stream)

        if end % self.cDtoH_freq != 0:
            start_idx = (end // self.cDtoH_freq) * self.cDtoH_freq + 1
            length = end % self.cDtoH_freq
            self.d_X[self.max_coarse_per_reset:self.max_coarse_per_reset +
                     length].copy_to_host(ary=self.X[start_idx:start_idx +
                                                     length],
                                          stream=self.stream)
            self.d_spread_integrals[1:length + 1].copy_to_host(
                ary=self.spread_integrals[start_idx:start_idx + length],
                stream=self.stream)
            self.d_dom_rate_integral[1:length + 1].copy_to_host(
                ary=self.dom_rate_integral[start_idx:start_idx + length],
                stream=self.stream)
            self.d_def_indicators[1:length + 1].copy_to_host(
                ary=self.def_indicators[start_idx:start_idx + length],
                stream=self.stream)
            self.d_mtm_by_cpty[1:length + 1].copy_to_host(
                ary=self.mtm_by_cpty[start_idx:start_idx + length],
                stream=self.stream)
            self.d_cash_flows_by_cpty[1:length + 1].copy_to_host(
                ary=self.cash_flows_by_cpty[start_idx:start_idx + length],
                stream=self.stream)

        if verbose:
            print('Everything was successfully queued!')

        for evt_cuda_bulk_diffuse_event, evt_cuda_compute_mtm_event, evt_cuda_nested_cva_event, evt_cuda_nested_im_event in zip(
                _cuda_bulk_diffuse_event_end, _cuda_compute_mtm_event_end,
                _cuda_nested_cva_event_end, _cuda_nested_im_event_end):
            evt_cuda_bulk_diffuse_event.synchronize()
            evt_cuda_compute_mtm_event.synchronize()
            evt_cuda_nested_cva_event.synchronize()
            evt_cuda_nested_im_event.synchronize()

        self.stream.synchronize()

        if not fused:
            print('cuda_bulk_diffuse average elapsed time per launch: {0} ms'.
                  format(
                      round(
                          sum(
                              cuda.event_elapsed_time(evt_begin, evt_end)
                              for evt_begin, evt_end in zip(
                                  _cuda_bulk_diffuse_event_begin,
                                  _cuda_bulk_diffuse_event_end)) / end, 3)))
            print('compute_mtm average elapsed time per launch: {0} ms'.format(
                round(
                    sum(
                        cuda.event_elapsed_time(evt_begin, evt_end)
                        for evt_begin, evt_end in zip(
                            _cuda_compute_mtm_event_begin,
                            _cuda_compute_mtm_event_end)) / end, 3)))
        else:
            print('cuda_diffuse_and_price elapsed time: {0} ms'.format(
                round(
                    sum(
                        cuda.event_elapsed_time(evt_begin, evt_end)
                        for evt_begin, evt_end in zip(
                            _cuda_bulk_diffuse_event_begin,
                            _cuda_bulk_diffuse_event_end)), 3)))

        if nested_cva_at is not None:
            print('cuda_nested_cva average elapsed time per launch: {0} ms'.
                  format(
                      round(
                          sum(
                              cuda.event_elapsed_time(evt_begin, evt_end)
                              for evt_begin, evt_end in zip(
                                  _cuda_nested_cva_event_begin,
                                  _cuda_nested_cva_event_end)) /
                          len(nested_cva_at), 3)))

        if nested_im_at is not None:
            print('cuda_nested_im average elapsed time per launch: {0} ms'.
                  format(
                      round(
                          sum(
                              cuda.event_elapsed_time(evt_begin, evt_end)
                              for evt_begin, evt_end in zip(
                                  _cuda_nested_im_event_begin,
                                  _cuda_nested_im_event_end)) /
                          len(nested_im_at), 3)))

        # TODO: port this to CUDA
        self.cash_pos_by_cpty = ne.evaluate('c*exp(-r)',
                                            local_dict={
                                                'c':
                                                self.cash_flows_by_cpty,
                                                'r':
                                                self.dom_rate_integral[:,
                                                                       None, :]
                                            })
        np.cumsum(self.cash_pos_by_cpty, axis=0, out=self.cash_pos_by_cpty)
        self.cash_pos_by_cpty *= np.exp(self.dom_rate_integral[:, None, :])
Code Example #23
from time import perf_counter  # assumed source of `perf_counter`

import numpy
from numba import cuda

data_A = []
data_B = []
streams = []
start_events = []
end_events = []
num_arrays = 100
A_gpu = []
B_gpu = []
C_gpu = []
C_out = []

# Host code
for _ in range(num_arrays):
    streams.append(cuda.stream())
    start_events.append(cuda.event())
    end_events.append(cuda.event())
    # Initialize the data arrays
    A = numpy.full((24, 12), 3, numpy.float64)  # matrix containing all 3's
    B = numpy.full((12, 22), 4, numpy.float64)  # matrix containing all 4's
    data_A.append(A)
    data_B.append(B)

t_start = perf_counter()

for i in range(num_arrays):
    # Copy the arrays to GPU
    A_gpu.append(cuda.to_device(data_A[i], stream=streams[i]))
    B_gpu.append(cuda.to_device(data_B[i], stream=streams[i]))

    # Allocate memory on the device for the result
Code Example #24
 def __init__(self, time_offset=0, cuda_stream=0):
     self._t_start = cuda.event(timing=True)
     self._t_end = cuda.event(timing=True)
     self._time_off = time_offset
     self._cuda_stream = cuda_stream