Example #1
    def cart2d_to_pol2d(self, M):
        Q1 = np.zeros((len(self.R), ), dtype=np.float64)
        Q2 = np.zeros((len(self.R), len(self.A)), dtype=np.float64)
        norm = np.zeros((len(self.R), ), dtype=np.float64)

        cl.enqueue_copy(self.queue, self.buf_M, M)
        cl.enqueue_fill_buffer(self.queue, self.buf_Q1e, np.float64(0.0), 0,
                               Q1.nbytes)
        cl.enqueue_fill_buffer(self.queue, self.buf_Q2e, np.float64(0.0), 0,
                               Q2.nbytes)
        cl.enqueue_fill_buffer(self.queue, self.buf_norm, np.float64(0.0), 0,
                               norm.nbytes)

        self.prg.cart2d_to_pol2d_project(self.queue, (self.r_len, 1), None,
                                         self.buf_fmt, self.buf_M,
                                         self.buf_Q1e, self.buf_Q2e,
                                         self.buf_norm)

        cl.enqueue_copy(self.queue, Q1, self.buf_Q1e)
        cl.enqueue_copy(self.queue, Q2, self.buf_Q2e)
        cl.enqueue_copy(self.queue, norm, self.buf_norm)

        cl.enqueue_barrier(self.queue).wait()

        Q1 /= norm.sum()

        return Q1, Q2

    def postproc_depose_scalar(self, fld):
        # Correct near axis deposition
        args_grid = [self.DataDev[fld+'_m'+str(m)].data \
                     for m in range(self.Args['M']+1)]

        WGS, WGS_tot = self.get_wgs(self.Args['Nx'])
        cl.enqueue_barrier(self.queue)
        self._treat_axis_d_knl(self.queue, (WGS_tot, ), (WGS, ), args_grid[0],
                               np.uint32(self.Args['Nx'])).wait()

        for m in range(1, self.Args['M'] + 1):
            self._treat_axis_c_knl(self.queue, (WGS_tot, ),
                                   (WGS, ), args_grid[m],
                                   np.uint32(self.Args['Nx'])).wait()

        # Divide by radius
        WGS, WGS_tot = self.get_wgs(self.Args['NxNr'])
        grid_str = ['NxNr', 'Nx', 'dV_inv']
        grid_args = [self.DataDev[arg].data for arg in grid_str]

        cl.enqueue_barrier(self.queue)
        self._divide_by_dv_d_knl(self.queue, (WGS_tot, ), (WGS, ),
                                 args_grid[0], *grid_args).wait()

        for m in range(1, self.Args['M'] + 1):
            self._divide_by_dv_c_knl(self.queue, (WGS_tot, ), (WGS, ),
                                     args_grid[m], *grid_args).wait()
Example #3
def debayer_frame(ctx, debayer_prg, data, rgb=False):
    q = cl.CommandQueue(ctx)

    yuv_buff = np.empty(FRAME_WIDTH * FRAME_HEIGHT + UV_SIZE * 2,
                        dtype=np.uint8)

    cam_g = cl.Buffer(ctx,
                      cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=data)
    yuv_g = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY,
                      FRAME_WIDTH * FRAME_HEIGHT + UV_SIZE * 2)

    local_worksize = (20, 20) if TICI else (4, 4)
    ev1 = debayer_prg.debayer10(q, (UV_WIDTH, UV_HEIGHT), local_worksize,
                                cam_g, yuv_g)
    cl.enqueue_copy(q, yuv_buff, yuv_g, wait_for=[ev1]).wait()
    cl.enqueue_barrier(q)
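    # The copy above already blocked via .wait(), so this barrier adds no
    # further synchronization here.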

    y = yuv_buff[:FRAME_WIDTH * FRAME_HEIGHT].reshape(
        (FRAME_HEIGHT, FRAME_WIDTH))
    u = yuv_buff[FRAME_WIDTH * FRAME_HEIGHT:FRAME_WIDTH * FRAME_HEIGHT +
                 UV_SIZE].reshape((UV_HEIGHT, UV_WIDTH))
    v = yuv_buff[FRAME_WIDTH * FRAME_HEIGHT + UV_SIZE:].reshape(
        (UV_HEIGHT, UV_WIDTH))

    if rgb:
        return yuv_to_rgb(y, u, v)
    else:
        return y, u, v
Example #4
def final(config, ctx, queue, program, buffers, debug=False):
    matrixSize = config['matrixSize']
    bandwidth = config['bandwidth']
    partitionNumber = config['partitionNumber']
    partitionSize = config['partitionSize']
    offdiagonalSize = config['offdiagonalSize']
    rhsSize = config['rhsSize']

    xo  = np.ones((partitionNumber * (partitionSize - 2 * offdiagonalSize), rhsSize), dtype=np.float32)
    tmp = np.ones((partitionNumber * (partitionSize - 2 * offdiagonalSize), rhsSize), dtype=np.float32)

    mf = cl.mem_flags
    xo_buf  = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=xo)
    tmp_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=tmp)

    kernel = program.reconstruct
    kernel.set_scalar_arg_dtypes([None, None, None, None, np.int32, np.int32, np.int32])

    cl.enqueue_barrier(queue)

    kernel(
        queue,
        (partitionNumber,),
        None,
        buffers[1], # Avwg buffer from factor; check that it is still readable and valid
        buffers[3], # x buffer from solve; check that it is still valid
        xo_buf,
        tmp_buf,
        np.int32(partitionSize),
        np.int32(offdiagonalSize),
        np.int32(rhsSize)
    )

    xtb = np.ones((partitionNumber * 2 * offdiagonalSize, rhsSize), dtype=np.float32)
    cl.enqueue_copy(queue, xtb, buffers[3])

    if debug:
        print("X(t,b):")
        print(xtb)

    cl.enqueue_copy(queue, xo, xo_buf)

    if debug:
        print("X':")
        print(xo)

    xtb = sparse.csr_matrix(xtb)
    xo = sparse.csr_matrix(xo)

    x = []
    for i in range(partitionNumber):
        t = i * (2 * offdiagonalSize)
        b = (i + 1) * (2 * offdiagonalSize)
        u = i * (partitionSize - 2 * offdiagonalSize)
        v = (i + 1) * (partitionSize - 2 * offdiagonalSize)
        x.append(xtb[t : t + offdiagonalSize, 0 : rhsSize])
        x.append(xo[u : v, 0 : rhsSize])
        x.append(xtb[b - offdiagonalSize : b, 0 : rhsSize])

    return sparse.vstack(x)
Example #5
 def begin_acquire(self, nthreads_per_block=64, cl_context=None):
     if api.is_gpu_api_cuda():
         self.gpu_funcs.reset_earliest_time_int(
             np.float32(1e9),
             np.int32(len(self.earliest_time_int_gpu)),
             self.earliest_time_int_gpu,
             block=(nthreads_per_block, 1, 1),
             grid=(len(self.earliest_time_int_gpu) // nthreads_per_block +
                   1, 1))
         self.channel_q_int_gpu.fill(0)
         self.channel_q_gpu.fill(0)
         self.channel_history_gpu.fill(0)
     elif api.is_gpu_api_opencl():
         comqueue = cl.CommandQueue(cl_context)
         self.gpu_funcs.reset_earliest_time_int(
             comqueue, (nthreads_per_block, 1, 1),
             (len(self.earliest_time_int_gpu) // nthreads_per_block + 1, 1),
             np.float32(1e9),
             np.int32(len(self.earliest_time_int_gpu)),
             self.earliest_time_int_gpu.data,
             g_times_l=True).wait()
         self.channel_q_int_gpu.fill(0, queue=comqueue)
         self.channel_q_gpu.fill(0, queue=comqueue)
         self.channel_history_gpu.fill(0, queue=comqueue)
         cl.enqueue_barrier(comqueue)
Example #6
 def run(self):
     self.program.Jacobian(self.queue, self.shape, block_shape, self.B, self.DB)
     self.run_key = True

     if self.maxiter is not None:
         niter = 0

     while self.run_key:
         self.program.Integrate(self.queue, self.shape, block_shape,
                                self.X, self.X1, self.B, self.DB,
                                np.float32(self.step), self.Error, self.Current)
         cl.enqueue_barrier(self.queue)
         cl.enqueue_copy(self.queue, self._Error, self.Error)
         error = np.max(self._Error)
         if not np.isnan(error):
             cl.enqueue_copy(self.queue, self._Current, self.Current)
             self.step /= error**0.1 + 0.5
             if error < 1:
                 cl.enqueue_copy(self.queue, self.X, self.X1)
                 cl.enqueue_copy(self.queue, self._X, self.X1)
         else:
             self.step /= 2.
         self.queue.finish()
         if self.maxiter is not None:
             niter += 1
             if niter == self.maxiter:
                 self.stop()
Example #7
def kmeans(it_n, class_n, data_n, centroids_x, centroids_y, data_x, data_y, partitioned):
    context = cl.create_some_context()
    queue = cl.CommandQueue(context)
    mf = cl.mem_flags
    for c in range(class_n):
        print(centroids_x[c], centroids_y[c])
    with open("kernel.cl", 'r') as fin:
        program = cl.Program(context, fin.read()).build()
    assign = program.assign
    buf_centroids_x = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=centroids_x)
    buf_centroids_y = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=centroids_y)
    buf_data_x = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data_x)
    buf_data_y = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data_y)
    buf_parts = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=partitioned)
    dbl_max = 100000.0
    for it in range(it_n):
        assign(queue, (data_n,), None, buf_centroids_x, buf_centroids_y,
               buf_data_x, buf_data_y, buf_parts, np.int32(class_n),
               np.int32(data_n), np.float32(dbl_max))
        cl.enqueue_barrier(queue)
        e = cl.enqueue_copy(queue, partitioned, buf_parts)
        e.wait()
        # Recompute the centroids on the host from the new assignment
        count = np.zeros(class_n).astype(np.int32)
        for c in range(class_n):
            centroids_x[c] = 0.0
            centroids_y[c] = 0.0
        for d in range(data_n):
            centroids_x[partitioned[d]] += data_x[d]
            centroids_y[partitioned[d]] += data_y[d]
            count[partitioned[d]] += 1
        for c in range(class_n):
            if count[c] != 0:
                centroids_x[c] /= count[c]
                centroids_y[c] /= count[c]
        buf_centroids_x = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=centroids_x)
        buf_centroids_y = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=centroids_y)
    print(partitioned)
    return partitioned
Example #8
	def sync(self, queue):
		'''
		Inject a pyopencl marker into the specified queue that waits
		for the attached event.
		'''
		if self._event is None: return
		cl.enqueue_barrier(queue, wait_for=[self._event])

    def postproc_depose_vector(self, vec_fld):
        args_raddiv_str = ['NxNr', 'Nx', 'dV_inv']
        args_raddiv = [self.DataDev[arg].data for arg in args_raddiv_str]

        for fld in [vec_fld + comp for comp in self.Args['vec_comps']]:
            # Correct near axis deposition
            args_fld = [self.DataDev[fld+'_m'+str(m)].data \
                         for m in range(self.Args['M']+1)]

            WGS, WGS_tot = self.get_wgs(self.Args['Nx'])
            self._treat_axis_d_knl(self.queue, (WGS_tot, ),
                                   (WGS, ), args_fld[0],
                                   np.uint32(self.Args['Nx'])).wait()

            for m in range(1, self.Args['M'] + 1):
                self._treat_axis_c_knl(self.queue, (WGS_tot, ),
                                       (WGS, ), args_fld[m],
                                       np.uint32(self.Args['Nx'])).wait()

            # Divide by radius
            WGS, WGS_tot = self.get_wgs(self.Args['NxNr'])
            cl.enqueue_barrier(self.queue)
            self._divide_by_dv_d_knl(self.queue, (WGS_tot, ), (WGS, ),
                                     args_fld[0], *args_raddiv).wait()

            for m in range(1, self.Args['M'] + 1):
                self._divide_by_dv_c_knl(self.queue, (WGS_tot, ), (WGS, ),
                                         args_fld[m], *args_raddiv).wait()
Example #10
def test_enqueue_barrier_marker(ctx_factory):
    ctx = ctx_factory()
    _skip_if_pocl(ctx.devices[0].platform, 'pocl crashes on enqueue_barrier')
    queue = cl.CommandQueue(ctx)
    cl.enqueue_barrier(queue)
    evt1 = cl.enqueue_marker(queue)
    evt2 = cl.enqueue_marker(queue, wait_for=[evt1])
    cl.enqueue_barrier(queue, wait_for=[evt1, evt2])
Example #12
    def step(self, c1, c2):
        if not self.opencl_fast_step:
            return super().step(c1, c2)

        cl.enqueue_copy(self.queue, self.buf_Q1e, self.Q1_exp)
        cl.enqueue_copy(self.queue, self.buf_Q2e, self.Q2_exp)

        try:
            self.Q1_cal
        except AttributeError:
            # i = 1
            self.prg.pol2d_to_pol3d_init(self.queue, (self.r_len, 1), None,
                                         self.buf_fmt, self.buf_Q1e,
                                         self.buf_Q2e, self.buf_P1,
                                         self.buf_P2)

            self.Q1_cal = np.empty_like(self.Q1_exp)
            self.Q2_cal = np.empty_like(self.Q2_exp)
            self.P1 = np.empty_like(self.Q1_exp)
            self.P2 = np.empty_like(self.Q2_exp)

        else:
            # i > 1
            cl.enqueue_copy(self.queue, self.buf_Q1c, self.Q1_cal)
            cl.enqueue_copy(self.queue, self.buf_Q2c, self.Q2_cal)
            cl.enqueue_copy(self.queue, self.buf_P1, self.P1)
            cl.enqueue_copy(self.queue, self.buf_P2, self.P2)

            self.prg.pol2d_to_pol3d_step(self.queue, (self.r_len, 1), None,
                                         self.buf_fmt, self.buf_Q1e,
                                         self.buf_Q2e, self.buf_Q1c,
                                         self.buf_Q2c, self.buf_P1,
                                         self.buf_P2, np.float64(c1),
                                         np.float64(c2))

        self.prg.norm_pol3d_angular(self.queue, (self.r_len, 1), None,
                                    self.buf_fmt, self.buf_P1, self.buf_P2,
                                    self.buf_norm)

        radial_norm = np.zeros((len(self.R), ), dtype=np.float64)

        cl.enqueue_copy(self.queue, self.Q1_cal, self.buf_Q1c)
        cl.enqueue_copy(self.queue, self.Q2_cal, self.buf_Q2c)
        cl.enqueue_copy(self.queue, self.P1, self.buf_P1)
        cl.enqueue_copy(self.queue, self.P2, self.buf_P2)
        cl.enqueue_copy(self.queue, radial_norm, self.buf_norm)

        cl.enqueue_barrier(self.queue).wait()

        self.P1 /= radial_norm.sum()

        self.M_cal = self.pol3d_to_cart2d(self.P1, self.P2)
        self.norm_cart2d(self.M_cal)

        assert np.isfinite(self.M_cal).all(), 'M_cal is not finite'

        self.Q1_cal, self.Q2_cal = self.cart2d_to_pol2d(self.M_cal)
Example #13
 def run(self, X):
     cl.enqueue_copy(self.queue, self.X, X)
     #cl.enqueue_acquire_gl_objects(self.queue, [self.X])
     
     self.out = np.zeros((self.nx,), dtype=np.float32)
     cl.enqueue_copy(self.queue, self.I, self.out)
     self.program.Solve(self.queue, (self.nx, self.na), None, self.A, self.r, self.X, self.I)
     #cl.enqueue_release_gl_objects(self.queue, [self.X])
     #self.queue.finish()
     cl.enqueue_barrier(self.queue)
     return self
Example #14
    def run(self, input_data):
        print("Running model")
        assert self.loaded, "Must have a model loaded first"

        assert all(input_data.shape == self.input_shape), \
            "Input data must be of shape "+str(self.input_shape)+\
            " but is of shape "+str(input_data.shape)

        # Set input data
        mf = cl.mem_flags
        self.bufs[self.input_buffer] = np.ascontiguousarray(input_data)
        self.opencl_bufs[self.input_buffer] = cl.Buffer(self.ctx,
            mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.bufs[self.input_buffer])

        with cl.CommandQueue(self.ctx) as queue:
            # Enqueue operations
            for i, op in enumerate(self.operations):
                print("Enqueing op", i)
                t = time.time()
                self.enqueue_op(queue, op)
                t = time.time() - t
                print("Took", t, "s")

                # TODO do I need a cl.enqueue_barrier(queue)?
                # or maybe cl.wait_for_events(event) and handle which outputs
                # are used for certain inputs?
                cl.enqueue_barrier(queue)
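                # Note: a CommandQueue created without the out-of-order flag is
                # in-order, so each op already waits for the previous one; the
                # barrier would only matter on an out-of-order queue.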

            # Get different output not requiring the custom op
            #
            # Note: only when we request the result does it actually run the
            # network, so this takes a long time
            prediction_boxes = None
            prediction_classes = None

            print("Fetching results")
            t = time.time()
            for tensor in self.tensors:
                buf = self.replace_buffers(tensor["buffer"])
                if tensor["name"] == "Squeeze":
                    prediction_boxes = self.load_buf(queue, buf)
                elif tensor["name"] == "convert_scores":
                    prediction_classes = self.load_buf(queue, buf)
            t = time.time() - t
            print("Took", t, "s")

        # Note: only the prediction boxes/classes buffers hold valid data
        # unless we load *all* the buffers in the loop above
        np.save("tflite_opencl.npy", {
            t["name"]: self.bufs[t["buffer"]] for t in self.tensors
        })
        print("Total number of tensors:", len(self.tensors))

        return prediction_boxes, prediction_classes
Example #15
    def kmeans_chunk_center(self, data, centers):
        data = data.astype(np.float32)
        centers = centers.astype(np.float32)
        k = len(centers)
        dim = len(centers[0])

        if not self.prg:
            self.__initialize_program(dim, k)

        out = np.zeros((10, 1), dtype=np.float32)
        new_centers = np.asarray(
            self.n_work_groups * [np.zeros(self._dimension, dtype=np.float32)
                                  for _ in range(k)],
            dtype=np.float32)
        data_assigns = np.empty((len(data), 1), dtype=np.int32)

        # create buffers
        data_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
        centers_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=centers)
        assigns_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, data_assigns.nbytes)
        new_centers_buf = cl.Buffer(
            self.ctx, cl.mem_flags.WRITE_ONLY, self.n_work_groups * dim * k * np.dtype('float32').itemsize
        )
        centers_counter_buf = cl.Buffer(
            self.ctx, cl.mem_flags.WRITE_ONLY, k*self.n_work_groups * np.dtype('int32').itemsize
        )
        out_buf = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=out)

        e = cl.enqueue_barrier(self.queue)
        e.wait()

        # run opencl extension
        self.prg.kmeans_chunk_center_cl(
            self.queue,
            (len(data),),
            None,
            assigns_buf,
            data_buf,
            centers_buf,
            centers_counter_buf,
            new_centers_buf, out_buf
        )

        # barrier
        e = cl.enqueue_barrier(self.queue)
        e.wait()

        # wait for it to finish and read out buffers
        cl.enqueue_copy(self.queue, data_assigns, assigns_buf)
        cl.enqueue_copy(self.queue, new_centers, new_centers_buf)
        cl.enqueue_copy(self.queue, out, out_buf)

        new_centers = new_centers[:k]
        self._data_assigns.extend(data_assigns.flatten().tolist())

        return new_centers.astype(dtype=np.float32)
Example #16
def runDijkstra(graph, source, costArray):
    context = cl.create_some_context()
    with open("Kernel.cl", 'r') as fin:
        program = cl.Program(context, fin.read()).build()
    maskArray = np.zeros(graph.vertexcount).astype(np.int32)
    maskArray[source] = 1
    updateCostArray = np.full(graph.vertexcount, np.inf, dtype=np.float32)
    updateCostArray[source] = 0
    costArray[source] = 0
    queue = cl.CommandQueue(context)
    mf = cl.mem_flags

    dijkstra_first = program.Dijkstra_first
    dijkstra_second = program.Dijkstra_second

    vertex = cl.Buffer(context,
                       mf.READ_ONLY | mf.COPY_HOST_PTR,
                       hostbuf=graph.vertexArray)
    edge = cl.Buffer(context,
                     mf.READ_ONLY | mf.COPY_HOST_PTR,
                     hostbuf=graph.edgeArray)
    weight = cl.Buffer(context,
                       mf.READ_ONLY | mf.COPY_HOST_PTR,
                       hostbuf=graph.weightArray)
    mask = cl.Buffer(context,
                     mf.READ_WRITE | mf.COPY_HOST_PTR,
                     hostbuf=maskArray)
    cost = cl.Buffer(context,
                     mf.READ_WRITE | mf.COPY_HOST_PTR,
                     hostbuf=costArray)
    updateCost = cl.Buffer(context,
                           mf.READ_WRITE | mf.COPY_HOST_PTR,
                           hostbuf=updateCostArray)

    while not maskArrayEmpty(maskArray, graph.vertexcount):

        dijkstra_first(queue, (graph.vertexcount, ), (1, ), vertex, edge,
                       weight, mask, cost, updateCost,
                       np.int32(graph.vertexcount), np.int32(graph.edgecount))
        dijkstra_second(queue, (graph.vertexcount, ), (1, ), vertex, edge,
                        weight, mask, cost, updateCost,
                        np.int32(graph.vertexcount))

        cl.enqueue_barrier(queue)
        e = cl.enqueue_copy(queue, maskArray, mask)
        e.wait()
    cl.enqueue_barrier(queue)
    e = cl.enqueue_copy(queue, costArray, cost)
    e.wait()
    return costArray  # the shortest-path weights from the source to each vertex
Example #17
def test_enqueue_barrier_marker(ctx_factory):
    ctx = ctx_factory()
    # Still relevant on pocl 1.0RC1.
    _xfail_if_pocl(
            ctx.devices[0].platform, (1, 0), "pocl crashes on enqueue_barrier")

    queue = cl.CommandQueue(ctx)

    if queue._get_cl_version() >= (1, 2) and cl.get_cl_header_version() <= (1, 1):
        pytest.skip("CL impl version >= 1.2, header version <= 1.1--cannot be sure "
                "that clEnqueueWaitForEvents is implemented")

    cl.enqueue_barrier(queue)
    evt1 = cl.enqueue_marker(queue)
    evt2 = cl.enqueue_marker(queue, wait_for=[evt1])
    cl.enqueue_barrier(queue, wait_for=[evt1, evt2])

    def test_setIterationFinished_000(self):
        basePath = 'C:/Private/PhD_Publications/Publication_of_Algorithm/Code/TrackingAlgorithm/TrackingAlgorithm/TestData/ReferenceDataForTests/UnitTests/OpenClKernels/setIterationFinished_000'
        inputPath = basePath + '/input'
        referencePath = basePath + '/output'
        referenceVariableName1 = 'dev_iterationFinished'

        self.linFitSearchRangeXvalues = np.float64(
            np.transpose(np.linspace(1, 200, 200)))
        self.setupTest()

        self.nrOfLocalAngleSteps = 64
        self.detectionKernelStrideSize = 2048
        self.nrOfStrides = 1
        self.nrOfDetectionAngleSteps = np.float64(
            self.nrOfStrides * self.detectionKernelStrideSize)

        self.loadDeviceVariable('dev_iterationFinished', inputPath)
        self.setWorkGroupSizes()

        self.prg.setIterationFinished(self.queue, (1, 1), None,
                                      self.dev_iterationFinished.data)
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.assertVectorEqualsExpectedResult(
            self.dev_iterationFinished,
            referencePath + '/' + referenceVariableName1 + '.npy')
    def test_calculateMembraneNormalVectors_000(self):
        basePath = 'C:/Private/PhD_Publications/Publication_of_Algorithm/Code/TrackingAlgorithm/TrackingAlgorithm/TestData/ReferenceDataForTests/UnitTests/OpenClKernels/calculateMembraneNormalVectors_000'
        inputPath = basePath + '/input'
        referencePath = basePath + '/output'
        referenceVariableName1 = 'dev_membraneNormalVectors'

        self.linFitSearchRangeXvalues = np.float64(
            np.transpose(np.linspace(1, 200, 200)))
        self.setupTest()

        self.nrOfLocalAngleSteps = 64
        self.detectionKernelStrideSize = 2048
        self.nrOfStrides = 1
        self.nrOfDetectionAngleSteps = np.float64(
            self.nrOfStrides * self.detectionKernelStrideSize)

        self.loadDeviceVariable('dev_membraneCoordinates', inputPath)
        self.loadDeviceVariable('dev_membraneNormalVectors', inputPath)
        self.loadDeviceVariable('gradientGlobalSize', inputPath)
        self.setWorkGroupSizes()

        self.prg.calculateMembraneNormalVectors(self.queue, self.gradientGlobalSize, None, \
                   self.dev_membraneCoordinates.data, \
                   self.dev_membraneNormalVectors.data \
                  )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.assertVector2EqualsExpectedResult(
            self.dev_membraneNormalVectors,
            referencePath + '/' + referenceVariableName1 + '.npy')
Example #20
    def pol3d_to_cart2d(self, P1, P2):
        M = np.zeros((self.row_len, self.col_len), dtype=np.float64)

        cl.enqueue_copy(self.queue, self.buf_P1, P1)
        cl.enqueue_copy(self.queue, self.buf_P2, P2)
        cl.enqueue_fill_buffer(self.queue, self.buf_M, np.float64(0.0), 0,
                               M.nbytes)

        self.prg.pol3d_to_cart2d(self.queue, (self.row_len, self.col_len),
                                 None, self.buf_fmt, self.buf_P1, self.buf_P2,
                                 self.buf_M)

        cl.enqueue_copy(self.queue, M, self.buf_M)

        cl.enqueue_barrier(self.queue).wait()

        return M
Example #21
    def pol3d_to_slice2d(self, P1, P2):
        S = np.zeros((self.row_len, self.col_len), dtype=np.float64)

        cl.enqueue_copy(self.queue, self.buf_P1, P1)
        cl.enqueue_copy(self.queue, self.buf_P2, P2)
        cl.enqueue_fill_buffer(self.queue, self.buf_S, np.float64(0.0), 0,
                               S.nbytes)

        self.prg.pol3d_to_slice2d(self.queue, (self.row_len, self.col_len),
                                  None, self.buf_fmt, self.buf_P1, self.buf_P2,
                                  self.buf_S)

        cl.enqueue_copy(self.queue, S, self.buf_S)

        cl.enqueue_barrier(self.queue).wait()

        return S
Example #22
def _enqueue_barrier(queue, wait_for):
    if queue.device.platform.name == "Portable Computing Language":
        # pocl 0.13 and below crash on clEnqueueBarrierWithWaitList
        evt = cl.enqueue_marker(queue, wait_for=wait_for)
        queue.finish()
        return evt
    else:
        return cl.enqueue_barrier(queue, wait_for=wait_for)
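
A minimal usage sketch of the helper above, assuming an OpenCL platform is available (the marker event is illustrative):

import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

marker = cl.enqueue_marker(queue)        # an earlier event to wait on
evt = _enqueue_barrier(queue, [marker])  # falls back to marker + finish on pocl
evt.wait()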
Example #24
def getUZCM_Ring(n, beta, hJ):
    '''
    Explicitly calculates the inner energy U, the partition function Z, the
    heat capacity C and the magnetisation M for a given beta and hJ on an
    Ising ring with n spins.
    '''
    beta = np.array(beta, dtype=np.double)
    beta_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=beta)
    h_g = cl.Buffer(ctx, mf.READ_WRITE, 8 * 2**n)
    u_g, z_g, c_g, m_g = (cl.Buffer(ctx, mf.WRITE_ONLY, beta.nbytes) for _ in range(4))
    u_h, z_h, c_h, m_h = (np.zeros_like(beta) for _ in range(4))
    cprg.tabulateH_Ring(cqu, (1,), None, struct.pack('i', n), struct.pack('d', hJ), h_g)
    cl.enqueue_barrier(cqu)
    cprg.getUZCM(cqu, beta.shape, None, struct.pack('i', n), beta_g, h_g, u_g, z_g, c_g, m_g)
    cl.enqueue_copy(cqu, u_h, u_g)
    cl.enqueue_copy(cqu, z_h, z_g)
    cl.enqueue_copy(cqu, c_h, c_g)
    cl.enqueue_copy(cqu, m_h, m_g)
    return u_h/n, z_h, c_h/n, m_h/n
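
A minimal usage sketch of getUZCM_Ring, assuming the module-level ctx, cqu, cprg and mf objects this snippet relies on are already set up (the spin count and field ratio below are illustrative):

import numpy as np

betas = np.linspace(0.1, 2.0, 64)          # inverse temperatures to scan
u, z, c, m = getUZCM_Ring(10, betas, 0.5)  # 10-spin ring at h/J = 0.5
print(u[:3], c[:3])                        # per-spin energy and heat capacity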
Example #25
    def calculateContourCenter(self):
        self.prg.calculateDs(self.queue, self.gradientGlobalSize, None, \
              self.dev_membraneCoordinates.data, \
              self.dev_ds.data \
            )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.calculateSumDs(self.queue, self.gradientGlobalSize, None, \
              self.dev_ds.data, self.dev_sumds.data \
            )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.calculateContourCenter(self.queue, (1,1), None, \
                 self.dev_membraneCoordinates.data, \
                 self.dev_ds.data, self.dev_sumds.data, \
                 self.dev_contourCenter.data, \
                 np.int32(self.nrOfDetectionAngleSteps) \
                )
        barrierEvent = cl.enqueue_barrier(self.queue)
Example #26
        def execute(self):
                if self.executionsPerReadback == 1:
                        self.program.Conway(self.queue, self.a.shape, None, self.ar_ySize, self.a_buf, self.dest_buf)
                else:
                        # Ping-pong between the two buffers so consecutive
                        # generations alternate source and destination
                        for i in range(self.executionsPerReadback // 2):
                                self.program.Conway(self.queue, self.a.shape, None, self.ar_ySize, self.a_buf, self.dest_buf)
                                self.program.Conway(self.queue, self.a.shape, None, self.ar_ySize, self.dest_buf, self.a_buf)
                                cl.enqueue_barrier(self.queue)
                        if self.executionsPerReadback % 2 == 1:
                                self.program.Conway(self.queue, self.a.shape, None, self.ar_ySize, self.a_buf, self.dest_buf)

                if self.executionsPerReadback % 2 == 1:
                        cl.enqueue_copy(self.queue, self.c, self.dest_buf).wait()
                else:
                        cl.enqueue_copy(self.queue, self.c, self.a_buf).wait()

                self.a = self.c
                # Refresh the buffers for the next generation
                mf = cl.mem_flags
                self.a_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.a)
                self.dest_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, self.a.nbytes)
Example #27
 def setStartingCoordinates(self,dev_initialMembraneCoordinatesX,dev_initialMembraneCoordinatesY, \
         dev_initialMembranNormalVectorsX,dev_initialMembranNormalVectorsY):
     cl.enqueue_copy_buffer(self.queue,
                            dev_initialMembraneCoordinatesX.data,
                            self.dev_membraneCoordinatesX.data).wait()  #<-
     cl.enqueue_copy_buffer(self.queue,
                            dev_initialMembraneCoordinatesY.data,
                            self.dev_membraneCoordinatesY.data).wait()
     cl.enqueue_copy_buffer(self.queue,
                            dev_initialMembranNormalVectorsX.data,
                            self.dev_membraneNormalVectorsX.data).wait()
     cl.enqueue_copy_buffer(self.queue,
                            dev_initialMembranNormalVectorsY.data,
                            self.dev_membraneNormalVectorsY.data).wait()
     barrierEvent = cl.enqueue_barrier(self.queue)
     self.queue.finish()

    def test_filterJumpedCoordinates_000(self):
        basePath = 'C:/Private/PhD_Publications/Publication_of_Algorithm/Code/TrackingAlgorithm/TrackingAlgorithm/TestData/ReferenceDataForTests/UnitTests/OpenClKernels/filterJumpedCoordinates_000'
        inputPath = basePath + '/input'
        referencePath = basePath + '/output'
        referenceVariableName1 = 'dev_membraneCoordinates'
        referenceVariableName2 = 'dev_membraneNormalVectors'

        self.linFitSearchRangeXvalues = np.float64(
            np.transpose(np.linspace(1, 200, 200)))
        self.setupTest()

        self.nrOfLocalAngleSteps = 64
        self.detectionKernelStrideSize = 2048
        self.nrOfStrides = 1
        self.nrOfDetectionAngleSteps = np.float64(
            self.nrOfStrides * self.detectionKernelStrideSize)

        self.loadDeviceVariable('dev_previousContourCenter', inputPath)
        self.loadDeviceVariable('dev_membraneCoordinates', inputPath)
        self.loadDeviceVariable('dev_membraneNormalVectors', inputPath)
        self.loadDeviceVariable('dev_previousInterpolatedMembraneCoordinates',
                                inputPath)
        self.loadDeviceVariable('dev_closestLowerNoneNanIndex', inputPath)
        self.loadDeviceVariable('dev_closestUpperNoneNanIndex', inputPath)
        self.maxCoordinateShift = np.float64(10.0)
        self.listOfGoodCoordinates_memSize = int(8192)

        self.setWorkGroupSizes()

        self.prg.filterJumpedCoordinates(self.queue, self.gradientGlobalSize, None, \
                 self.dev_previousContourCenter.data, \
                 self.dev_membraneCoordinates.data, \
                 self.dev_membraneNormalVectors.data, \
                    self.dev_previousInterpolatedMembraneCoordinates.data, \
                    cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), \
                 cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes), \
                 cl.LocalMemory(self.listOfGoodCoordinates_memSize), \
                 self.maxCoordinateShift \
                 )
        barrierEvent = cl.enqueue_barrier(self.queue)
        self.assertVector2EqualsExpectedResult(
            self.dev_membraneCoordinates,
            referencePath + '/' + referenceVariableName1 + '.npy')
        self.assertVector2EqualsExpectedResult(
            self.dev_membraneNormalVectors,
            referencePath + '/' + referenceVariableName2 + '.npy')
Example #29
 def setStartingMembraneNormals(self, dev_initialMembranNormalVectorsX,
                                dev_initialMembranNormalVectorsY):
     if self.resetNormalsAfterEachImage and self.getContourId() != 0:
         # Reset the contour normal vectors to radial vectors. We do this only
         # from the second image onward, since doing it for image 0 would
         # destroy the correspondence between the indexes of the contour
         # coordinates and their corresponding contour normals.
         cl.enqueue_copy_buffer(
             self.queue, self.dev_radialVectorsX.data,
             self.dev_membraneNormalVectorsX.data).wait()
         cl.enqueue_copy_buffer(
             self.queue, self.dev_radialVectorsY.data,
             self.dev_membraneNormalVectorsY.data).wait()
     else:  # copy contour normal vectors from last image to use as initial normal vectors for next image
         cl.enqueue_copy_buffer(
             self.queue, dev_initialMembranNormalVectorsX.data,
             self.dev_membraneNormalVectorsX.data).wait()
         cl.enqueue_copy_buffer(
             self.queue, dev_initialMembranNormalVectorsY.data,
             self.dev_membraneNormalVectorsY.data).wait()
     barrierEvent = cl.enqueue_barrier(self.queue)
Example #30
    def setStartingCoordinatesNew(self, dev_initialMembraneCoordinatesX,
                                  dev_initialMembraneCoordinatesY):
        cl.enqueue_copy_buffer(self.queue,
                               dev_initialMembraneCoordinatesX.data,
                               self.dev_membraneCoordinatesX.data).wait()  #<-
        cl.enqueue_copy_buffer(self.queue,
                               dev_initialMembraneCoordinatesY.data,
                               self.dev_membraneCoordinatesY.data).wait()

        #cl.enqueue_copy_buffer(self.queue,dev_initialMembraneCoordinatesX.data,self.dev_interpolatedMembraneCoordinatesX.data).wait()
        #cl.enqueue_copy_buffer(self.queue,dev_initialMembraneCoordinatesY.data,self.dev_interpolatedMembraneCoordinatesY.data).wait()

        cl.enqueue_copy_buffer(
            self.queue, dev_initialMembraneCoordinatesX.data,
            self.dev_previousInterpolatedMembraneCoordinatesX.data).wait()
        cl.enqueue_copy_buffer(
            self.queue, dev_initialMembraneCoordinatesY.data,
            self.dev_previousInterpolatedMembraneCoordinatesY.data).wait()
        barrierEvent = cl.enqueue_barrier(self.queue)
Example #31
def gpu_ifft(vec):
    """
    Uses the pyopencl and pyfft libraries to perform an fft on the GPU
    """
    from pyfft.cl import Plan as cl_plan
    import pyopencl as cl
    import pyopencl.array as cl_array
    import pyopencl.tools  # needed so that cl.tools is available below
    from numpy import complex64

    array_size = vec.shape
    # Find the available GPUs
    platform = cl.get_platforms()
    my_gpu_devices = platform[0].get_devices(device_type=cl.device_type.GPU)
    # Create a context using the GPUs found above
    ctx = cl.Context(devices=my_gpu_devices)
    # Create a command queue using that context
    queue = cl.CommandQueue(ctx)

    plan = cl_plan(array_size, dtype=complex64, queue=queue)

    alloc = cl.tools.ImmediateAllocator(queue)
    cl.tools.MemoryPool(alloc).stop_holding()
    vec = vec.astype(complex64)

    cl.enqueue_barrier(queue)
    gpu_data = cl_array.to_device(queue, vec, allocator=alloc, async_=True)
    gpu_data.queue.finish()
    
    cl.enqueue_barrier(queue)
    plan.execute(gpu_data.data, inverse=True)
    cl.enqueue_barrier(queue)

    ans = gpu_data.get()
    gpu_data.data.release()
    gpu_data.queue.finish()
    queue.flush()
    return ans
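
A minimal usage sketch, assuming a GPU platform and the pyfft package are installed (pyfft expects power-of-two sizes; the 1024-point input is illustrative):

import numpy as np

spectrum = np.random.rand(1024).astype(np.complex64)
time_domain = gpu_ifft(spectrum)  # inverse FFT computed on the GPU
print(time_domain.shape, time_domain.dtype)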
Example #32
    def propagate(self,
                  gpu_geometry,
                  rng_states,
                  nthreads_per_block=64,
                  max_blocks=1024,
                  max_steps=10,
                  use_weights=False,
                  scatter_first=0,
                  cl_context=None):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            curandStates.
        """
        nphotons = self.pos.size
        # bind node texture reference
        if api.is_gpu_api_cuda() and not self.node_texture_ref_bound:
            # we have to unroll, as pycuda doesn't seem to support vector times right now for binding
            self.unrolled_nodes = ga.to_gpu(
                gpu_geometry.nodes.get().ravel().view(np.uint32))
            self.unrolled_extra_nodes = ga.to_gpu(
                gpu_geometry.extra_nodes.ravel().view(np.uint32))
            self.unrolled_triangles = ga.to_gpu(
                gpu_geometry.triangles.get().ravel().view(np.uint32))
            self.unrolled_triangles4 = ga.to_gpu(
                gpu_geometry.triangles4.ravel().view(np.uint32))
            self.unrolled_vertices = ga.to_gpu(
                gpu_geometry.vertices.get().ravel().view(np.float32))
            self.unrolled_vertices4 = ga.to_gpu(
                gpu_geometry.vertices4.ravel().view(np.float32))
            self.node_texture_ref.set_address(self.unrolled_nodes.gpudata,
                                              self.unrolled_nodes.nbytes)
            self.extra_node_texture_ref.set_address(
                self.unrolled_extra_nodes.gpudata,
                self.unrolled_extra_nodes.nbytes)
            #self.unrolled_nodes.bind_to_texref_ext( self.node_texture_ref )
            #self.unrolled_extra_nodes.bind_to_texref_ext( self.extra_node_texture_ref )
            #self.unrolled_triangles.bind_to_texref_ext( self.triangles_texture_ref )
            self.triangles_texture_ref.set_address(
                self.unrolled_triangles4.gpudata,
                self.unrolled_triangles4.nbytes)
            #self.unrolled_vertices.bind_to_texref_ext( self.vertices_texture_ref )
            self.vertices_texture_ref.set_address(
                self.unrolled_vertices4.gpudata,
                self.unrolled_vertices4.nbytes)
            print "[BOUND TO TEXTURE MEMORY]"
            print "Nodes: ", self.unrolled_nodes.nbytes / 1.0e3, " kbytes"
            print "Extra nodes: ", self.unrolled_extra_nodes.nbytes / 1.0e3, " kbytes"
            print "Triangles: ", self.unrolled_triangles4.nbytes / 1.0e3, " kbytes"
            print "Vertices: ", self.unrolled_vertices4.nbytes / 1.0e3, " kbytes"
            print "Total: ", (self.unrolled_nodes.nbytes +
                              self.unrolled_extra_nodes.nbytes +
                              self.unrolled_triangles4.nbytes +
                              self.unrolled_vertices4.nbytes) / 1.0e3, "kbytes"
            self.node_texture_ref_bound = True

        # setup queue
        maxqueue = nphotons
        step = 0
        input_queue = np.empty(shape=maxqueue + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in range(self.ncopies):
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons,
                dtype=np.uint32) + copy * self.true_nphotons
        if api.is_gpu_api_cuda():
            input_queue_gpu = ga.to_gpu(input_queue)
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            input_queue_gpu = ga.to_device(comqueue,
                                           input_queue[1:])  # why the offset?

        output_queue = np.zeros(shape=maxqueue + 1, dtype=np.uint32)
        output_queue[0] = 1
        if api.is_gpu_api_cuda():
            output_queue_gpu = ga.to_gpu(output_queue)
        elif api.is_gpu_api_opencl():
            output_queue_gpu = ga.to_device(comqueue, output_queue)

        if use_weights:
            iuse_weights = 1
        else:
            iuse_weights = 0

        adapt_factor = 1.0
        start_prop = time.time()
        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            #if nphotons < nthreads_per_block * 16 * 8 or use_weights:
            #    nsteps = max_steps - step
            #else:
            #    nsteps = 1
            nsteps = 1

            start_step = time.time()
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max( int(adapt_factor*max_blocks), 1 )):
                #print nphotons, nthreads_per_block, max_blocks," : ",first_photon, photons_this_round, blocks, adapt_factor
                start_chunk = time.time()
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.propagate(np.int32(first_photon),
                                             np.int32(photons_this_round),
                                             input_queue_gpu[1:],
                                             output_queue_gpu,
                                             rng_states,
                                             self.pos,
                                             self.dir,
                                             self.wavelengths,
                                             self.pol,
                                             self.t,
                                             self.flags,
                                             self.last_hit_triangles,
                                             self.weights,
                                             np.int32(nsteps),
                                             np.int32(iuse_weights),
                                             np.int32(scatter_first),
                                             gpu_geometry.gpudata,
                                             block=(nthreads_per_block, 1, 1),
                                             grid=(blocks, 1))
                    #cuda.Context.get_current().synchronize()
                elif api.is_gpu_api_opencl():
                    self.gpu_funcs.propagate(
                        comqueue, (photons_this_round, 1, 1),
                        None,
                        np.int32(first_photon),
                        np.int32(photons_this_round),
                        input_queue_gpu.data,
                        output_queue_gpu.data,
                        rng_states.data,
                        self.pos.data,
                        self.dir.data,
                        self.wavelengths.data,
                        self.pol.data,
                        self.t.data,
                        self.flags.data,
                        self.last_hit_triangles.data,
                        self.weights.data,
                        np.int32(nsteps),
                        np.int32(iuse_weights),
                        np.int32(scatter_first),
                        gpu_geometry.world_scale,
                        gpu_geometry.world_origin.data,
                        np.int32(len(gpu_geometry.nodes)),
                        gpu_geometry.material_data['n'],
                        gpu_geometry.material_data['step'],
                        gpu_geometry.material_data["wavelength0"],
                        gpu_geometry.vertices.data,
                        gpu_geometry.triangles.data,
                        gpu_geometry.material_codes.data,
                        gpu_geometry.colors.data,
                        gpu_geometry.nodes.data,
                        gpu_geometry.extra_nodes.data,
                        gpu_geometry.material_data["nmaterials"],
                        gpu_geometry.material_data['refractive_index'].data,
                        gpu_geometry.material_data['absorption_length'].data,
                        gpu_geometry.material_data['scattering_length'].data,
                        gpu_geometry.material_data['reemission_prob'].data,
                        gpu_geometry.material_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['nsurfaces'],
                        gpu_geometry.surface_data['detect'].data,
                        gpu_geometry.surface_data['absorb'].data,
                        gpu_geometry.surface_data['reemit'].data,
                        gpu_geometry.surface_data['reflect_diffuse'].data,
                        gpu_geometry.surface_data['reflect_specular'].data,
                        gpu_geometry.surface_data['eta'].data,
                        gpu_geometry.surface_data['k'].data,
                        gpu_geometry.surface_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['model'].data,
                        gpu_geometry.surface_data['transmissive'].data,
                        gpu_geometry.surface_data['thickness'].data,
                        gpu_geometry.surface_data['nplanes'].data,
                        gpu_geometry.surface_data['wire_diameter'].data,
                        gpu_geometry.surface_data['wire_pitch'].data,
                        g_times_l=True).wait()
                end_chunk = time.time()
                chunk_time = end_chunk - start_chunk
                #print "chunk time: ",chunk_time
                #if chunk_time>2.5:
                #    adapt_factor *= 0.5
            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass
            end_step = time.time()
            #print "step time: ",end_step-start_step

            if step < max_steps:
                start_requeue = time.time()
                #print "reset photon queues"
                if api.is_gpu_api_cuda():
                    cuda.Context.get_current().synchronize(
                    )  # ensure all threads done
                    #temp = input_queue_gpu
                    #input_queue_gpu = output_queue_gpu
                    #output_queue_gpu = temp
                    # Assign with a numpy array of length 1 to silence
                    # warning from PyCUDA about setting array with different strides/storage orders.
                    #output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                    #nphotons = input_queue_gpu[:1].get()[0] - 1
                    # new style
                    output_queue_gpu.get(output_queue)
                    nphotons = output_queue[0] - 1
                    input_queue_gpu.set(output_queue)
                    output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))

                elif api.is_gpu_api_opencl():
                    temp_out = output_queue_gpu.get()
                    nphotons = temp_out[0]
                    input_queue_gpu.set(
                        temp_out[1:], queue=comqueue
                    )  # set the input queue to have index of photons still need to be run
                    output_queue_gpu[:1].set(
                        np.ones(shape=1, dtype=np.uint32),
                        queue=comqueue)  # reset first instance to be one
                end_requeue = time.time()
                #print "re-queue time (nphotons=",nphotons"): ",end_requeue-start_requeue
                if nphotons == 0:
                    break

        end_prop = time.time()
        print("propagation time: ", end_prop - start_prop, " secs")
        end_flags = self.flags.get()
        end_flag = np.max(end_flags)
        if end_flag & (1 << 31):
            print("WARNING: ABORTED PHOTONS", file=sys.stderr)
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)
Example #33
        int ai = k*n+gid;
        for (i = k+1; i < n; i++) {
            a[i*n+gid] = a[i*n+gid] - (a[i*n+k] / a[ak]) * a[ai];
            //a[i*n+gid] = gid;
        }
    }
""").build()

for k in range(n-1):
    kernel = prg.eliminate
    kernel.set_scalar_arg_dtypes([None, numpy.int32, numpy.int32])
    # I hope it also takes the last column
    # how can I put only k jobs into the queue
    event = kernel(queue, (n-k,), None, a_buf, numpy.int32(k), numpy.int32(n))
    #for (i = k+1; i < n; i++) {
    #    a[i*n+k] = 0
    #}
    # We need to wait for all jobs on each loop
    t1 = time()
    cl.enqueue_barrier(queue)
    t2 = time()
    cl.enqueue_copy(queue, a, a_buf)
    t3 = time()
    #print("t1: ", t1-t2, " t2: ", t2-t3)
    print("k: ", k)
    print(a)

cl.enqueue_copy(queue, a, a_buf)

#print a
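
The snippet above begins mid-kernel: the opening of the program source and the host-side setup were lost. A plausible reconstruction of the missing prelude, under assumptions (the names n, a, a_buf, queue, prg and the kernel signature are inferred from the fragment, not taken from the source):

import numpy
import pyopencl as cl
from time import time

n = 4
a = numpy.random.rand(n, n).astype(numpy.float32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    __kernel void eliminate(__global float *a, const int k, const int n) {
        // one work-item per column k .. n-1
        int gid = get_global_id(0) + k;
        int ak = k*n + k;   // pivot element a[k][k]
        int i;
        int ai = k*n+gid;
        for (i = k+1; i < n; i++) {
            a[i*n+gid] = a[i*n+gid] - (a[i*n+k] / a[ak]) * a[ai];
        }
    }
""").build()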
Example #34
 def enqueue_wait_for_events(self, events: Sequence[Event]) -> None:
     # OpenCL has some odd semantics for an empty wait list, hence the check
     if events:
         pyopencl.enqueue_barrier(self._pyopencl_command_queue,
                                  [x._pyopencl_event for x in events])
Example #35
    def __forward_ocl_vec4_interleaved(self, x, h0, c0, sm=False):
        def interleave(matrix):
            if len(matrix.shape) == 1:
                return np.squeeze(interleave(np.expand_dims(matrix, axis=0)),
                                  axis=0).copy()
            new = np.zeros_like(matrix)
            a, b, c, d = np.hsplit(matrix, 4)
            simd = 4
            rng = np.arange(0, matrix.shape[1], simd)
            new[:, rng] = a
            new[:, rng + 1] = b
            new[:, rng + 2] = c
            new[:, rng + 3] = d
            return new.copy()

        seq_len = x.shape[0]
        batch_size = x.shape[1]
        weights = interleave(
            np.concatenate((np.transpose(self.Wi), np.transpose(self.Wh)),
                           0)).T.astype(np.float32)
        ifcos = np.zeros((batch_size, 4 * self.hidden_size)).astype(np.float32)
        hy = np.zeros(
            (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
        cy = np.zeros(
            (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
        hy[0] = h0
        cy[0] = c0

        platform = cl.get_platforms()[0]  # Select the first platform [0]
        device = platform.get_devices()[
            0]  # Select the first device on this platform [0]
        context = cl.Context([device])  # Create a context with your device
        queue = cl.CommandQueue(
            context)  # Create a command queue with your context

        # Allocate on device
        x_gpu = cl.Buffer(context,
                          cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=x.copy('C'))  # cuda_alloc(x)
        weights_gpu = cl.Buffer(
            context, cl.mem_flags.COPY_HOST_PTR,
            hostbuf=weights.copy('C'))  # cuda_alloc(weights)
        bias_gpu = cl.Buffer(context,
                             cl.mem_flags.COPY_HOST_PTR,
                             hostbuf=interleave(
                                 self.B).copy('C'))  # cuda_alloc(self.B)
        ifcos_gpu = cl.Buffer(context,
                              cl.mem_flags.READ_WRITE
                              | cl.mem_flags.COPY_HOST_PTR,
                              hostbuf=ifcos.copy('C'))  # cuda_alloc(ifcos)
        hy_gpu = cl.Buffer(context,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=hy.copy('C'))  # cuda_alloc(hy)
        cy_gpu = cl.Buffer(context,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=cy.copy('C'))  # cuda_alloc(cy)

        kernelSource = ''
        kernelFilename = 'lstm_vec4_interleaved.cl'
        with open(kernelFilename, 'r') as file:
            kernelSource = file.read()

        program = cl.Program(context, kernelSource).build()

        M = np.int32(batch_size)
        K = np.int32(self.input_size + self.hidden_size)
        N = np.int32(4 * self.hidden_size)
        gemm_lws = (8, 8, 1)
        gemm_gws = int(M), int(N), gemm_lws[2]
        eltwise_lws = (8, 8, 1)
        eltwise_gws = int(M), int(self.hidden_size), eltwise_lws[2]
        events = []
        for i in range(0, seq_len):
            gemm_kernel = program.lstm_gemm
            gemm_kernel.set_args(x_gpu, hy_gpu, weights_gpu, bias_gpu,
                                 ifcos_gpu, M, K, N, np.int32(self.input_size),
                                 np.int32(self.hidden_size), np.int32(i))
            ev1 = cl.enqueue_nd_range_kernel(queue, gemm_kernel, gemm_gws,
                                             gemm_lws)
            events.append(ev1)
            cl.enqueue_barrier(queue)
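            # On an out-of-order queue this barrier would order the gemm result
            # before the eltwise kernel; the default in-order queue used here
            # already guarantees that, so it is effectively a no-op.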
            eltwise_kernel = program.lstm_eltwise
            eltwise_kernel.set_args(cy_gpu, ifcos_gpu, hy_gpu,
                                    np.int32(self.hidden_size),
                                    np.int32(batch_size), np.int32(i))
            ev2 = cl.enqueue_nd_range_kernel(queue, eltwise_kernel,
                                             eltwise_gws, eltwise_lws)
            events.append(ev2)
            cl.enqueue_barrier(queue)

        timer_start = datetime.datetime.now()
        cl.wait_for_events(events)
        execution_time = (datetime.datetime.now() -
                          timer_start).total_seconds() * 1000

        cl.enqueue_copy(queue, ifcos, ifcos_gpu)
        cl.enqueue_copy(queue, hy, hy_gpu)
        cl.enqueue_copy(queue, cy, cy_gpu)

        queue.finish()

        results = hy[1:], hy[-1:], cy[-1:]

        return results, execution_time
Example #36
    def __forward_ocl_naive(self, x, h0, c0, acc):
        seq_len = x.shape[0]
        batch_size = x.shape[1]
        weights = np.concatenate(
            (np.transpose(self.Wi), np.transpose(self.Wh)),
            0).astype(np.float32)
        ifcos = np.zeros((batch_size, 4 * self.hidden_size)).astype(np.float32)
        hy = np.zeros(
            (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
        cy = np.zeros(
            (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
        hy[0] = h0
        cy[0] = c0

        platform = cl.get_platforms()[0]  # Select the first platform [0]
        device = platform.get_devices()[
            0]  # Select the first device on this platform [0]
        context = cl.Context([device])  # Create a context with your device
        queue = cl.CommandQueue(
            context)  # Create a command queue with your context

        # Allocate on device
        x_gpu = cl.Buffer(context,
                          cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=x.copy('C'))  # cuda_alloc(x)
        weights_gpu = cl.Buffer(
            context, cl.mem_flags.COPY_HOST_PTR,
            hostbuf=weights.copy('C'))  # cuda_alloc(weights)
        bias_gpu = cl.Buffer(context,
                             cl.mem_flags.COPY_HOST_PTR,
                             hostbuf=self.B.copy('C'))  # cuda_alloc(self.B)
        ifcos_gpu = cl.Buffer(context,
                              cl.mem_flags.READ_WRITE
                              | cl.mem_flags.COPY_HOST_PTR,
                              hostbuf=ifcos.copy('C'))  # cuda_alloc(ifcos)
        hy_gpu = cl.Buffer(context,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=hy.copy('C'))  # cuda_alloc(hy)
        cy_gpu = cl.Buffer(context,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=cy.copy('C'))  # cuda_alloc(cy)

        kernelFilename = 'lstm_naive_acc.cl' if acc else 'lstm_naive.cl'
        with open(kernelFilename, 'r') as file:
            kernelSource = file.read()

        program = cl.Program(context, kernelSource).build()

        M = np.int32(batch_size)
        K = np.int32(self.input_size + self.hidden_size)
        N = np.int32(4 * self.hidden_size)

        gemm_lws = (1, 1, 1)
        gemm_gws = int(M), int(N), gemm_lws[2]
        eltwise_lws = (1, 1, 1)
        eltwise_gws = int(M), int(self.hidden_size), eltwise_lws[2]
        events = []
        for i in range(0, seq_len):
            gemm_kernel = program.lstm_gemm
            gemm_kernel.set_args(x_gpu, hy_gpu, weights_gpu, bias_gpu,
                                 ifcos_gpu, M, K, N, np.int32(self.input_size),
                                 np.int32(self.hidden_size), np.int32(i))
            ev1 = cl.enqueue_nd_range_kernel(queue, gemm_kernel, gemm_gws,
                                             gemm_lws)
            events.append(ev1)
            cl.enqueue_barrier(queue)
            eltwise_kernel = program.lstm_eltwise
            eltwise_kernel.set_args(cy_gpu, ifcos_gpu, hy_gpu,
                                    np.int32(self.hidden_size),
                                    np.int32(batch_size), np.int32(i))
            ev2 = cl.enqueue_nd_range_kernel(queue, eltwise_kernel,
                                             eltwise_gws, eltwise_lws)
            events.append(ev2)
            cl.enqueue_barrier(queue)

        timer_start = datetime.datetime.now()
        cl.wait_for_events(events)
        execution_time = (datetime.datetime.now() -
                          timer_start).total_seconds() * 1000

        cl.enqueue_copy(queue, ifcos, ifcos_gpu)
        cl.enqueue_copy(queue, hy, hy_gpu)
        cl.enqueue_copy(queue, cy, cy_gpu)

        queue.finish()

        results = hy[1:], hy[-1:], cy[-1:]

        return results, execution_time
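For reference, a NumPy sketch of what one timestep of the lstm_gemm and lstm_eltwise kernels is expected to compute; the i/f/g/o gate order inside the fused ifcos buffer and the bias shape are assumptions, not confirmed by the kernel sources:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step_reference(x_t, h_prev, c_prev, weights, bias):
    # lstm_gemm: one fused matmul of [x_t, h_prev] against the stacked
    # (input_size + hidden_size, 4 * hidden_size) weight matrix.
    ifco = np.concatenate((x_t, h_prev), axis=1) @ weights + bias
    i, f, g, o = np.split(ifco, 4, axis=1)  # assumed gate order
    # lstm_eltwise: gate nonlinearities and the recurrent state update.
    c_next = sigmoid(f) * c_prev + sigmoid(i) * np.tanh(g)
    h_next = sigmoid(o) * np.tanh(c_next)
    return h_next, c_next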
Beispiel #37
0
    def run(self):
        self.program.Solve(self.queue, self.shape[2:5], None, self.A, self.b,
                           self.x, self.threshold)
        cl.enqueue_barrier(self.queue)
        return self
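Note that enqueue_barrier only orders commands within the queue and does not block the host, so Solve may still be running when run() returns. If callers need completion on return, a blocking variant could look like this (a sketch, not part of the original class):

    def run_blocking(self):
        evt = self.program.Solve(self.queue, self.shape[2:5], None, self.A,
                                 self.b, self.x, self.threshold)
        evt.wait()  # block the host until the kernel has finished
        return self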
Beispiel #38
0
	def advance(self, obj, shift=False, corr=None, shcorr=None):
		'''
		Propagate a field through the current slab and transmit it
		through an interface with the next slab characterized by object
		contrast obj. The transmission overwrites the refractive index
		of the current slab with the interface reflection coefficients.

		If shift is True, the forward field is shifted by half a slab
		to agree with full-wave solutions and includes a
		backward-traveling contribution caused by reflection from the
		interface with the next slab.

		The relevant result (either the forward field or the
		half-shifted combined field) is copied into a device-side
		buffer for later retrieval and handling.

		If corr is not None, it should be a tuple as specified in the
		reset() docstring to override the default use of corrective
		terms in the spectral propagator.

		The argument shcorr is interpreted exactly as corr, but is used
		instead of corr for the propagation used to shift the field to
		the center of the slab.
		'''
		prog, grid = self.prog, self.grid
		fwdque, recvque, sendque = self.fwdque, self.recvque, self.sendque

		# Point to the field components
		fwd, bck, buf = self.fld

		if shift:
			# Ensure that a prior copy isn't using the buffer
			buf.sync(fwdque)
			# Copy the forward field for shifting if necessary
			cl.enqueue_copy(fwdque, buf, fwd)

		# Copy the sound speed extrema for the current slab
		speedlim = list(self.speedlim)
		# Push the next slab to its buffer (overwrites speed extrema)
		ocur, onxt, obevt = self.objupdate(obj)

		if self.phasetol is not None:
			# Figure maximum propagation distance to not
			# exceed maximum permissible phase deviation
			dzl = []
			for spd in speedlim:
				# Sign governs the sign of the phase deviation,
				# which is irrelevant, so ignore it here
				spdiff = max(abs(spd - 1.), 1e-8)
				# Preventing spdiff from reaching zero limits
				# maximum permissible propagation distance
				dzl.append(abs(0.5 * self.phasetol * spd / spdiff))
			# Subdivide the slab into maximum propagation distance
			nsteps = max(1, int(np.round(self.dz / min(dzl))))
		else: nsteps = 1
		dz = self.dz / nsteps

		# Ensure that no prior copy is using the field buffer
		fwd.sync(fwdque)

		# Propagate the forward field through the slab on the fwdque
		for i in range(nsteps): self.propagate(fwd, dz, corr=corr)

		# Ensure next slab has been received before handling interface
		cl.enqueue_barrier(fwdque, wait_for=[obevt])

		# Compute transmission through the interface
		# The reflected field is only of interest if a shift is desired
		transevt = prog.txreflect(fwdque, grid, None,
				fwd, bck if shift else None, ocur, onxt)
		# Hold the current contrast slab until the transmission is done
		ocur.attachevent(transevt)

		if shift:
			# Add the forward and backward fields
			prog.caxpy(fwdque, grid, None, buf, np.float32(1.), buf, bck)
			# Propagate the combined field a half step
			# Save the propagation event for delaying result copies
			pevt = self.propagate(buf, 0.5 * self.dz, corr=shcorr)

			# Handle Goertzel iterations to compute the Fourier
			# transform of the contrast source on the unit sphere
			if self._goertzel:
				# Compute the FFT of the source in the XY plane
				crt = self.scratch[0]
				prog.ctmul(fwdque, grid, None, crt, ocur, buf)
				self.fftplan.execute(crt)
				# Compute the next Goertzel iteration
				pn1, pn2 = self.goertzbuf
				dz = np.float32(self.dz)
				# The final argument (slab count) is not yet used
				nz = np.int32(0)
				prog.goertzelfft(fwdque, grid, None, pn1, pn2, crt, dz, nz)
				# Cycle the Goertzel buffers
				self.goertzbuf = [pn2, pn1]
			else:
				# Copy the shifted field into the result buffer
				# No result sync necessary, all mods occur on sendque
				evt = cl.enqueue_copy(sendque, self.result, buf, wait_for=[pevt])
				# Attach the copy event to the source buffer
				buf.attachevent(evt)
		else: 
			# Copy the forward field into the result buffer
			# Wait for transmissions to finish for consistency
			evt = cl.enqueue_copy(sendque, self.result, fwd, wait_for=[transevt])
			# Attach the copy event to the field buffer
			fwd.attachevent(evt)
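A hypothetical driver loop for advance(), to make the calling protocol concrete; propagator and slab_contrasts are illustrative names, and the real setup depends on the reset() interface mentioned in the docstring:

# Sketch: march the field through a stack of slabs, shifting each
# result to the slab center for comparison with full-wave solutions.
for obj in slab_contrasts:
    propagator.advance(obj, shift=True)
# After each call, propagator.result holds the copied field.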
Beispiel #39
0
def baum_welch(
        sequence,
        transition_probs,
        symbol_probs,
        initial_dist,
        accuracy = 1e-3,
        maxit    = 1):

    A  = pyopencl.Buffer(
            context, 
            pyopencl.mem_flags.READ_WRITE | pyopencl.mem_flags.COPY_HOST_PTR,
            hostbuf=transition_probs)
    B  = pyopencl.Buffer(
            context,
            pyopencl.mem_flags.READ_WRITE | pyopencl.mem_flags.COPY_HOST_PTR,
            hostbuf=symbol_probs)
    pi = pyopencl.Buffer(
            context,
            pyopencl.mem_flags.READ_WRITE | pyopencl.mem_flags.COPY_HOST_PTR,
            hostbuf=initial_dist)
    ob = pyopencl.Buffer(
            context,
            pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.COPY_HOST_PTR,
            hostbuf=sequence)

    T = len(sequence)
    N = len(transition_probs)
    M = len(symbol_probs[0])

    alpha = pyopencl.Buffer(
            context,
            pyopencl.mem_flags.READ_WRITE,
            T*N * numpy.dtype('float32').itemsize)

    beta = pyopencl.Buffer(
            context,
            pyopencl.mem_flags.READ_WRITE,
            T*N * numpy.dtype('float32').itemsize)

    matrix_buffer = pyopencl.Buffer(
            context,
            pyopencl.mem_flags.READ_WRITE,
            T*N*N * numpy.dtype('float32').itemsize)

    scratch = pyopencl.LocalMemory(
            2*kernel.WORK_GROUP_SIZE*N*N* numpy.dtype('float32').itemsize )

    reduced = pyopencl.Buffer(
            context,
            pyopencl.mem_flags.READ_WRITE,
            blocks*N*N * numpy.dtype('float32').itemsize)

    probability = pyopencl.Buffer(
            context,
            pyopencl.mem_flags.WRITE_ONLY,
            numpy.dtype('float32').itemsize )


    old_prob = 0.0
    new_prob = old_prob + accuracy + 1
    it       = 0
    while it < maxit:  # convergence test disabled: abs(new_prob - old_prob) > accuracy

        forward(ob, A, B, pi, T, N, alpha, matrix_buffer, scratch, reduced)
        # forward_naive(ob, A, B, pi, T, N, alpha, matrix_buffer, scratch)
        e = pyopencl.enqueue_barrier(queue)
        e.wait()
        backward_naive(ob, A, B, T, N, beta, matrix_buffer, scratch)
        e = pyopencl.enqueue_barrier(queue)
        e.wait()
        transition_probabilities(alpha, beta, A, B, ob, T, matrix_buffer)
        state_probabilities(alpha, beta, T)
        transitions = transition_counts(matrix_buffer, T-1, N, scratch) 
        states = state_counts(alpha, T-1, N, scratch)
        symbols = symbol_counts(alpha, ob, T, N, M, scratch)
        update(A, B, pi, alpha, transitions, states, symbols, probability, N, T)
        if it > 0:
            old_prob = new_prob
        new_prob = numpy.empty(1, dtype=numpy.float32)  # receives the scalar likelihood
        pyopencl.enqueue_copy(queue, new_prob, probability)
        it = it + 1

    transition_probs = numpy.zeros_like(transition_probs)
    symbol_probs     = numpy.zeros_like(symbol_probs)
    initial_dist     = numpy.zeros_like(initial_dist)

    pyopencl.enqueue_copy(queue, transition_probs, A)
    pyopencl.enqueue_copy(queue, symbol_probs, B)
    pyopencl.enqueue_copy(queue, initial_dist, pi)

    return transition_probs, symbol_probs, initial_dist, new_prob, it
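As a cross-check for the device kernels, the textbook forward recursion that forward() parallelizes can be written directly in NumPy (a reference sketch under standard HMM conventions, without any scaling the OpenCL version may apply):

import numpy

def forward_reference(ob, A, B, pi):
    # alpha[t, i] is the joint probability of the prefix ob[:t+1]
    # and the chain being in state i at time t.
    T, N = len(ob), len(A)
    alpha = numpy.zeros((T, N), dtype=numpy.float32)
    alpha[0] = pi * B[:, ob[0]]
    for t in range(1, T):
        alpha[t] = (alpha[t - 1] @ A) * B[:, ob[t]]
    return alpha  # sequence likelihood is alpha[-1].sum()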
Beispiel #40
0
    def trackContour(self):
        # Reset the contour normal vectors to radial vectors. This is done only
        # from the second image onward, since doing it for image 0 would
        # destroy the correspondence between the indexes of the contour
        # coordinates and their contour normals.
        if self.resetNormalsAfterEachImage and self.getContourId() != 0 \
                and self.nrOfTrackingIterations == 0:
            cl.enqueue_copy(self.queue, self.dev_membraneNormalVectorsX.data,
                            self.dev_radialVectorsX.data).wait()
            cl.enqueue_copy(self.queue, self.dev_membraneNormalVectorsY.data,
                            self.dev_radialVectorsY.data).wait()

        # tracking status variables
        self.nrOfTrackingIterations += 1

        stopInd = 1

        self.trackingFinished = np.array(1, dtype=np.int32)  # True
        self.dev_trackingFinished = cl_array.to_device(self.queue,
                                                       self.trackingFinished)

        self.iterationFinished = np.array(0, dtype=np.int32)  # False
        self.dev_iterationFinished = cl_array.to_device(
            self.queue, self.iterationFinished)

        self.dev_membraneCoordinates = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_membraneCoordinatesX,
            self.dev_membraneCoordinatesY)
        self.dev_membraneNormalVectors = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_membraneNormalVectorsX,
            self.dev_membraneNormalVectorsY)
        self.dev_previousInterpolatedMembraneCoordinates = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_previousInterpolatedMembraneCoordinatesX,
            self.dev_previousInterpolatedMembraneCoordinatesY)
        self.dev_membranePolarCoordinates = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_membranePolarTheta,
            self.dev_membranePolarRadius)
        self.dev_interpolatedMembraneCoordinates = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_interpolatedMembraneCoordinatesX,
            self.dev_interpolatedMembraneCoordinatesY)

        for strideNr in range(self.nrOfStrides):
            # set the starting index of the coordinate array for each kernel instance
            kernelCoordinateStartingIndex = np.int32(
                strideNr * self.detectionKernelStrideSize)

            self.prg.findMembranePosition(self.queue, self.trackingGlobalSize, self.trackingWorkGroupSize, self.sampler, \
                     self.dev_Img, self.imgSizeX, self.imgSizeY, \
                     self.buf_localRotationMatrices, \
                     self.buf_linFitSearchRangeXvalues, \
                     self.linFitParameter, \
                     cl.LocalMemory(self.fitIntercept_memSize), cl.LocalMemory(self.fitIncline_memSize), \
                     cl.LocalMemory(self.rotatedUnitVector_memSize), \
                     self.meanParameter, \
                     self.buf_meanRangeXvalues, self.meanRangePositionOffset, \
                     cl.LocalMemory(self.localMembranePositions_memSize), \
                     self.dev_membraneCoordinates.data, \
                     self.dev_membraneNormalVectors.data, \
                     self.dev_fitInclines.data, \
                     kernelCoordinateStartingIndex, \
                     self.inclineTolerance, \
                     self.inclineRefinementRange)

            barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.filterNanValues(self.queue, self.gradientGlobalSize, None, \
               self.dev_membraneCoordinates.data, \
               self.dev_membraneNormalVectors.data, \
               cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes) \
               )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.filterJumpedCoordinates(self.queue, self.gradientGlobalSize, None, \
                 self.dev_previousContourCenter.data, \
                 self.dev_membraneCoordinates.data, \
                 self.dev_membraneNormalVectors.data, \
                    self.dev_previousInterpolatedMembraneCoordinates.data, \
                    cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), \
                 cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes), \
                 cl.LocalMemory(self.listOfGoodCoordinates_memSize), \
                 self.maxCoordinateShift \
                 )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.calculateInterCoordinateAngles(self.queue, self.gradientGlobalSize, None, \
                  self.dev_interCoordinateAngles.data, \
                  self.dev_membraneCoordinates.data \
                    )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.filterIncorrectCoordinates(self.queue, self.gradientGlobalSize, None, \
                 self.dev_previousContourCenter.data, \
                    self.dev_interCoordinateAngles.data, \
                    self.dev_membraneCoordinates.data, \
                    self.dev_membraneNormalVectors.data, \
                    cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes), \
                    self.maxInterCoordinateAngle \
                    )
        barrierEvent = cl.enqueue_barrier(self.queue)

        # information regarding barriers: http://stackoverflow.com/questions/13200276/what-is-the-difference-between-clenqueuebarrier-and-clfinish

        ########################################################################
        ### Calculate contour center
        ########################################################################
        self.calculateContourCenter()

        ########################################################################
        ### Convert cartesian coordinates to polar coordinates
        ########################################################################
        self.prg.cart2pol(self.queue, self.gradientGlobalSize, None, \
              self.dev_membraneCoordinates.data, \
              self.dev_membranePolarCoordinates.data, \
              self.dev_contourCenter.data)
        barrierEvent = cl.enqueue_barrier(self.queue)

        ########################################################################
        ### Interpolate polar coordinates
        ########################################################################
        self.prg.sortCoordinates(self.queue, (1,1), None, \
              self.dev_membranePolarCoordinates.data, \
              self.dev_membraneCoordinates.data, \
              self.dev_membraneNormalVectors.data, \
              np.int32(self.nrOfDetectionAngleSteps) \
              )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.interpolatePolarCoordinatesLinear(self.queue, self.gradientGlobalSize, None, \
                   self.dev_membranePolarCoordinates.data, \
                   self.dev_radialVectors.data, \
                   self.dev_contourCenter.data, \
                   self.dev_membraneCoordinates.data, \
                   self.dev_interpolatedMembraneCoordinates.data, \
                   self.dev_interpolationAngles.data, \
                   self.nrOfAnglesToCompare \
                   )
        barrierEvent = cl.enqueue_barrier(self.queue)

        ########################################################################
        ### Check whether the tracking iteration has converged
        ########################################################################
        self.prg.checkIfTrackingFinished(self.queue, self.gradientGlobalSize, None, \
                 self.dev_interpolatedMembraneCoordinates.data, \
                 self.dev_previousInterpolatedMembraneCoordinates.data, \
                 self.dev_trackingFinished.data, \
                 self.coordinateTolerance)
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.checkIfCenterConverged(self.queue, (1,1), None, \
                self.dev_contourCenter.data, \
                self.dev_previousContourCenter.data, \
                self.dev_trackingFinished.data, \
                self.centerTolerance)
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.dev_membraneNormalVectorsX, self.dev_membraneNormalVectorsY = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_membraneNormalVectors)
        self.dev_previousInterpolatedMembraneCoordinatesX, self.dev_previousInterpolatedMembraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_previousInterpolatedMembraneCoordinates)
        self.dev_membraneCoordinatesX, self.dev_membraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_membraneCoordinates)
        self.dev_membranePolarTheta, self.dev_membranePolarRadius = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_membranePolarCoordinates)
        self.dev_interpolatedMembraneCoordinatesX, self.dev_interpolatedMembraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_interpolatedMembraneCoordinates)

        cl.enqueue_copy(self.queue, self.trackingFinished,
                        self.dev_trackingFinished.data).wait()

        barrierEvent = cl.enqueue_barrier(self.queue)

        cl.enqueue_copy(
            self.queue, self.dev_previousInterpolatedMembraneCoordinatesX.data,
            self.dev_interpolatedMembraneCoordinatesX.data).wait()
        cl.enqueue_copy(
            self.queue, self.dev_previousInterpolatedMembraneCoordinatesY.data,
            self.dev_interpolatedMembraneCoordinatesY.data).wait()
        cl.enqueue_copy(self.queue, self.dev_previousContourCenter.data,
                        self.dev_contourCenter.data).wait()

        self.prg.setIterationFinished(self.queue, (1, 1), None,
                                      self.dev_iterationFinished.data)
        barrierEvent = cl.enqueue_barrier(self.queue)

        cl.enqueue_copy(self.queue, self.iterationFinished,
                        self.dev_iterationFinished.data).wait()

        self.setStartingCoordinatesNew(self.dev_interpolatedMembraneCoordinatesX, \
                  self.dev_interpolatedMembraneCoordinatesY)
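Since trackContour() reads the device-side convergence flag back on every call, a host loop along these lines is the likely usage pattern (a hypothetical sketch; tracker is an illustrative name):

# Iterate the tracker until the device-side flag, read back at the
# end of trackContour(), reports convergence.
while True:
    tracker.trackContour()
    if tracker.trackingFinished:
        break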