Esempio n. 1
0
    def sort(self, num, keys, values, batch=1, direction=1):
        """Bitonic-sort ``keys`` (with ``values``) on the device and read them back.

        The host arrays are uploaded into fresh device buffers stored on
        ``self.keys`` / ``self.values``.  Depending on the array length the
        kernel wrappers ``local``, ``local1``, ``merge_global`` and
        ``merge_local`` of this object are invoked.  Afterwards the contents
        of ``self.d_tempKeys`` / ``self.d_tempValues`` are copied into those
        buffers and read back into the caller's ``keys`` and ``values``.

        :param num: element count handed to ``self.factorRadix2``; must be a
            power of 2 and <= max_num, otherwise nothing is sorted.
        :param keys: host array of keys; overwritten in place with the result.
        :param values: host array of values; overwritten in place.
        :param batch: forwarded unchanged to the kernel wrappers.
        :param direction: any non-zero value is normalised to ``True``.
        :returns: ``(keys, values)``, or ``None`` when ``num`` is rejected.
        """
        # Single-argument print calls produce the same output on Python 2 and 3.
        print("bitonic sort")
        # num must be a power of 2 and <= max_num
        _, remainder = self.factorRadix2(num)
        if remainder != 1:
            return None

        # COPY_HOST_PTR initialises the device buffers from the host arrays at
        # creation time, so no read-back is needed before sorting.
        self.keys = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=keys)
        self.values = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=values)

        direction = (direction != 0)
        # NOTE(review): the length actually sorted comes from keys.size, not
        # from ``num`` -- confirm callers always pass matching values.
        array_length = keys.size
        print("array_length %s" % (array_length,))

        limit = self.local_size_limit
        if array_length < limit:
            self.local(array_length, direction)
        else:
            self.local1(batch, array_length, direction)
            size = 2 * limit
            while size <= array_length:
                # Floor division keeps ``stride`` an integer under true
                # division as well; ``stride >>= 1`` below requires an int.
                stride = size // 2
                while stride > 0:
                    print("size, stride %s %s" % (size, stride))
                    if stride >= limit:
                        # NOTE(review): (stride, size) here vs (size, stride)
                        # for merge_local -- confirm against both signatures.
                        self.merge_global(batch, array_length, stride, size, direction)
                    else:
                        # merge_local ends the pass for this ``size``.
                        self.merge_local(batch, array_length, size, stride, direction)
                        break

                    stride >>= 1
                size <<= 1

        self.queue.finish()
        # need to copy back
        cl.enqueue_copy_buffer(self.queue, self.d_tempKeys, self.keys).wait()
        cl.enqueue_copy_buffer(self.queue, self.d_tempValues, self.values).wait()
        self.queue.finish()

        # copy to cpu to view results
        cl.enqueue_read_buffer(self.queue, self.keys, keys)
        cl.enqueue_read_buffer(self.queue, self.values, values)
        self.queue.finish()

        return keys, values
Esempio n. 2
0
	def copyBuffer(self, buf, dest=None):
		"""Enqueue a device copy of ``buf``.

		When ``dest`` is omitted, a destination is obtained from
		``self.allocate(buf.shape, buf.dtype)`` and returned.  When ``dest``
		is supplied, the copy goes into it and nothing is returned.
		"""
		allocate_target = dest is None
		target = self.allocate(buf.shape, buf.dtype) if allocate_target else dest

		cl.enqueue_copy_buffer(self.queue, buf, target)

		# Only a freshly allocated destination is handed back to the caller.
		if allocate_target:
			return target
Esempio n. 3
0
    def sort(self, num, keys, values, batch=1, direction=1):
        """Sort ``keys`` and ``values`` with the device bitonic-sort kernels.

        Fresh device buffers are created from the host arrays and stored on
        ``self.keys`` / ``self.values``.  The kernel wrappers of this object
        (``local``, ``local1``, ``merge_global``, ``merge_local``) are then
        driven according to the array length.  Finally ``self.d_tempKeys`` /
        ``self.d_tempValues`` are copied into those buffers, which are read
        back into ``keys`` and ``values``.

        :param num: element count checked with ``self.factorRadix2``; must be
            a power of 2 and <= max_num, otherwise the call does nothing.
        :param keys: host key array, overwritten in place with the result.
        :param values: host value array, overwritten in place.
        :param batch: passed through to the kernel wrappers.
        :param direction: normalised to a boolean (``direction != 0``).
        :returns: ``(keys, values)``; ``None`` if ``num`` is rejected.
        """
        # print(...) with a single pre-formatted string behaves identically
        # under Python 2 and Python 3.
        print("bitonic sort")
        # num must be a power of 2 and <= max_num
        _, remainder = self.factorRadix2(num)
        if remainder != 1:
            return None

        # The buffers are filled from the host arrays by COPY_HOST_PTR, so
        # reading them straight back to the host would be a no-op transfer.
        self.keys = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=keys)
        self.values = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=values)

        direction = direction != 0
        # NOTE(review): keys.size (not ``num``) decides how much is sorted --
        # confirm the two always agree at the call sites.
        array_length = keys.size
        print("array_length %s" % (array_length,))

        local_limit = self.local_size_limit
        if array_length < local_limit:
            self.local(array_length, direction)
        else:
            self.local1(batch, array_length, direction)
            size = 2 * local_limit
            while size <= array_length:
                # ``//`` guarantees an integer stride; with ``/`` the later
                # ``stride >>= 1`` fails once true division is in effect.
                stride = size // 2
                while stride > 0:
                    print("size, stride %s %s" % (size, stride))
                    if stride >= local_limit:
                        # NOTE(review): argument order (stride, size) differs
                        # from merge_local's (size, stride) -- verify.
                        self.merge_global(batch, array_length, stride, size, direction)
                    else:
                        # After merge_local the pass for this ``size`` is done.
                        self.merge_local(batch, array_length, size, stride, direction)
                        break

                    stride >>= 1
                size <<= 1

        self.queue.finish()
        # need to copy back
        cl.enqueue_copy_buffer(self.queue, self.d_tempKeys, self.keys).wait()
        cl.enqueue_copy_buffer(self.queue, self.d_tempValues, self.values).wait()
        self.queue.finish()

        # copy to cpu to view results
        cl.enqueue_read_buffer(self.queue, self.keys, keys)
        cl.enqueue_read_buffer(self.queue, self.values, values)
        self.queue.finish()

        return keys, values
Esempio n. 4
0
    def adjust_weights( self, context ):
        """
        Launch the quickprop weight-adjustment kernel and remember the gradient.

        The kernel is enqueued on ``context.opencl.queue`` with a global size of
        ``context._weights_buf_size``.  Afterwards ``context._gradient_buf`` is
        copied into ``self.prev_direction_buf``.
        """
        opencl = context.opencl
        global_size = ( int( context._weights_buf_size ), )
        kernel_args = (
            context._gradient_buf,
            self.prev_direction_buf,
            self.n,
            self.alpha,
            self._weights_delta_buf,
            context._weights_buf,
        )

        opencl.kernel_adjust_weights_quickprop( opencl.queue, global_size, *kernel_args )

        # keep the gradient just used so it is available on the next call
        pyopencl.enqueue_copy_buffer( opencl.queue, context._gradient_buf, self.prev_direction_buf )
Esempio n. 5
0
def test_copy_buffer(ctx_factory):
    """Check that a buffer-to-buffer copy on the device preserves the data."""
    ctx = ctx_factory()
    cmd_queue = cl.CommandQueue(ctx)
    flags = cl.mem_flags

    source_host = np.random.rand(50000).astype(np.float32)
    result_host = np.empty_like(source_host)

    source_dev = cl.Buffer(
        ctx, flags.READ_ONLY | flags.COPY_HOST_PTR, hostbuf=source_host)
    target_dev = cl.Buffer(ctx, flags.WRITE_ONLY, result_host.nbytes)

    # device -> device, then device -> host; each step is waited on
    copy_event = cl.enqueue_copy_buffer(cmd_queue, source_dev, target_dev)
    copy_event.wait()
    read_event = cl.enqueue_read_buffer(cmd_queue, target_dev, result_host)
    read_event.wait()

    assert la.norm(source_host - result_host) == 0
Esempio n. 6
0
def test_copy_buffer(ctx_factory):
    """Round-trip random float32 data through two device buffers.

    The data is uploaded into one buffer, copied into a second buffer on the
    device, read back, and compared with the original.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    mem_flags = cl.mem_flags

    expected = np.random.rand(50000).astype(np.float32)
    actual = np.empty_like(expected)

    upload_flags = mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR
    src_buf = cl.Buffer(context, upload_flags, hostbuf=expected)
    dst_buf = cl.Buffer(context, mem_flags.WRITE_ONLY, actual.nbytes)

    cl.enqueue_copy_buffer(queue, src_buf, dst_buf).wait()
    cl.enqueue_read_buffer(queue, dst_buf, actual).wait()

    difference = expected - actual
    assert la.norm(difference) == 0
Esempio n. 7
0
    def adjust_weights( self, context ):
        """
        Launch the rprop weight-adjustment kernel and remember the gradient.

        The kernel is enqueued on ``context.opencl.queue`` with a global size of
        ``context._weights_buf_size``.  Afterwards ``context._gradient_buf`` is
        copied into ``self.prev_gradient_buf``.
        """
        opencl = context.opencl
        global_size = ( int( context._weights_buf_size ), )
        kernel_args = (
            context._gradient_buf,
            self.prev_gradient_buf,
            self.n_buf,
            context._weights_buf,
        )

        opencl.kernel_adjust_weights_rprop( opencl.queue, global_size, *kernel_args )

        # keep the gradient just used so it is available on the next call
        pyopencl.enqueue_copy_buffer( opencl.queue, context._gradient_buf, self.prev_gradient_buf )
Esempio n. 8
0
 def copyBuffer(self, cl_queue, cl_buffer):
     """
     Device-to-device copy of ``cl_buffer`` into this object's existing allocation.

     Raises RuntimeError when either side no longer holds data.  Halo sizes
     and the per-float byte width of both buffers are checked with assertions
     before the copy is enqueued on ``cl_queue``.
     """
     if not self.holds_data:
         raise RuntimeError('The buffer has been freed before copyBuffer is called')
     if not cl_buffer.holds_data:
         raise RuntimeError('The provided cl_buffer is either not allocated, or has been freed before copyBuffer is called')

     # Both buffers must describe the same layout.
     assert cl_buffer.nx_halo == self.nx_halo, \
         "%s vs %s" % (cl_buffer.nx_halo, self.nx_halo)
     assert cl_buffer.ny_halo == self.ny_halo, \
         "%s vs %s" % (cl_buffer.ny_halo, self.ny_halo)
     assert cl_buffer.bytes_per_float == self.bytes_per_float, \
         "Provided cl_buffer itemsize is %s, but should have been %s" % (
             cl_buffer.bytes_per_float, self.bytes_per_float)

     # Everything matches: copy the whole halo-inclusive domain.
     byte_count = self.bytes_per_float * self.nx_halo * self.ny_halo
     pyopencl.enqueue_copy_buffer(cl_queue, cl_buffer.data, self.data, byte_count)
Esempio n. 9
0
 def setStartingCoordinates(self, dev_initialMembraneCoordinatesX,
                            dev_initialMembraneCoordinatesY,
                            dev_initialMembranNormalVectorsX,
                            dev_initialMembranNormalVectorsY):
     """Copy the given coordinates and normal vectors into this object's buffers.

     Each copy is enqueued on ``self.queue`` and waited for individually; a
     barrier is enqueued and the queue is finished before returning.
     """
     transfers = (
         (dev_initialMembraneCoordinatesX, self.dev_membraneCoordinatesX),
         (dev_initialMembraneCoordinatesY, self.dev_membraneCoordinatesY),
         (dev_initialMembranNormalVectorsX, self.dev_membraneNormalVectorsX),
         (dev_initialMembranNormalVectorsY, self.dev_membraneNormalVectorsY),
     )
     for source, target in transfers:
         cl.enqueue_copy_buffer(self.queue, source.data, target.data).wait()

     cl.enqueue_barrier(self.queue)
     self.queue.finish()
Esempio n. 10
0
 def setStartingMembraneNormals(self, dev_initialMembranNormalVectorsX,
                                dev_initialMembranNormalVectorsY):
     """Initialise the membrane normal vector buffers for the next image.

     If ``self.resetNormalsAfterEachImage`` is set and the contour id is not
     0, the normals are reset to the radial vectors.  Image 0 is excluded
     because resetting there would destroy the index correspondence between
     contour coordinates and their normals.  Otherwise the supplied normals
     (from the last image) are used as the starting normals.
     """
     reset_to_radial = self.resetNormalsAfterEachImage and self.getContourId() != 0
     if reset_to_radial:
         sources = (self.dev_radialVectorsX, self.dev_radialVectorsY)
     else:
         sources = (dev_initialMembranNormalVectorsX,
                    dev_initialMembranNormalVectorsY)
     targets = (self.dev_membraneNormalVectorsX, self.dev_membraneNormalVectorsY)

     for source, target in zip(sources, targets):
         cl.enqueue_copy_buffer(self.queue, source.data, target.data).wait()

     cl.enqueue_barrier(self.queue)
Esempio n. 11
0
    def process(self):
        """
        Feed this layer from its predecessors and enqueue its kernel.

        Does nothing until every previous layer reports ``processed``.  Then
        the relevant slices of the shared outputs buffer are copied into the
        inputs buffer, the layer kernel is enqueued behind those copies, and
        all following layers are asked to process in turn.
        """
        # bail out while any upstream layer is still pending
        if any(not link[0].processed for link in self._prev_layers):
            return

        src_buf = self.context._outputs_buf
        dst_buf = self.context._inputs_buf
        queue = self.opencl.queue
        pending = self._process_wait_for

        filled = 0
        for link in self._prev_layers:
            upstream, first_neuron, width = link[0], link[1], link[2]
            # offsets and counts are scaled by 4 to obtain byte quantities
            copy_event = pyopencl.enqueue_copy_buffer(
                queue,
                src_buf,
                dst_buf,
                byte_count=int(width * 4),
                src_offset=int((upstream._neurons_offset + first_neuron) * 4),
                dst_offset=int((self._inputs_offset + filled) * 4),
                wait_for=(upstream._process_event, ))
            pending.append(copy_event)
            filled += width

        # kernel arguments 2..6 describe this layer
        kernel = self.opencl.kernel_process_layer
        layer_args = (
            self._inputs_offset,
            self._weights_offset,
            self._neurons_offset,
            self._inputs_per_neuron,
            self._neuron_count,
        )
        for position, value in enumerate(layer_args, 2):
            kernel.set_arg(position, value)

        global_size = (int(self._neuron_count * 64), )
        self._process_event = pyopencl.enqueue_nd_range_kernel(
            queue, kernel, global_size, (64, ), wait_for=pending)
        del pending[:]

        self._processed = True

        for link in self._next_layers:
            link[0].process()
Esempio n. 12
0
    def setStartingCoordinatesNew(self, dev_initialMembraneCoordinatesX,
                                  dev_initialMembraneCoordinatesY):
        """Seed the current and the previous-interpolated coordinate buffers.

        The given X/Y coordinates are copied into ``dev_membraneCoordinates*``
        and into ``dev_previousInterpolatedMembraneCoordinates*``.  Every copy
        is waited for, and a barrier is enqueued at the end.
        """
        targets_per_source = (
            (dev_initialMembraneCoordinatesX, self.dev_membraneCoordinatesX),
            (dev_initialMembraneCoordinatesY, self.dev_membraneCoordinatesY),
            (dev_initialMembraneCoordinatesX,
             self.dev_previousInterpolatedMembraneCoordinatesX),
            (dev_initialMembraneCoordinatesY,
             self.dev_previousInterpolatedMembraneCoordinatesY),
        )
        for source, target in targets_per_source:
            cl.enqueue_copy_buffer(self.queue, source.data, target.data).wait()

        cl.enqueue_barrier(self.queue)
Esempio n. 13
0
    def process( self ):
        """
        Process signal by this layer.

        Returns immediately unless all previous layers are processed.
        Otherwise copies their outputs into this layer's input range,
        enqueues the OpenCL layer kernel (which produces the output array
        in the background) and triggers processing of the next layers.
        """
        for prev in self._prev_layers:
            source_layer = prev[0]
            if source_layer.processed:
                continue
            # at least one predecessor is not ready yet
            return

        outputs = self.context._outputs_buf
        inputs = self.context._inputs_buf
        queue = self.opencl.queue
        wait_list = self._process_wait_for

        input_cursor = 0
        for prev in self._prev_layers:
            source_layer = prev[0]
            span = prev[2]
            # element counts/offsets are multiplied by 4 to get bytes
            n_bytes = int( span * 4 )
            read_from = int( ( source_layer._neurons_offset + prev[1] ) * 4 )
            write_to = int( ( self._inputs_offset + input_cursor ) * 4 )
            wait_list.append( pyopencl.enqueue_copy_buffer(
                queue, outputs, inputs,
                byte_count = n_bytes,
                src_offset = read_from,
                dst_offset = write_to,
                wait_for = ( source_layer._process_event, )
                ) )
            input_cursor += span

        # configure and launch the layer kernel behind the copies
        kernel = self.opencl.kernel_process_layer
        numbered_args = (
            ( 2, self._inputs_offset ),
            ( 3, self._weights_offset ),
            ( 4, self._neurons_offset ),
            ( 5, self._inputs_per_neuron ),
            ( 6, self._neuron_count ),
            )
        for arg_index, arg_value in numbered_args:
            kernel.set_arg( arg_index, arg_value )

        work_items = ( int( self._neuron_count * 64 ), )
        work_group = ( 64, )
        self._process_event = pyopencl.enqueue_nd_range_kernel(
            queue, kernel, work_items, work_group,
            wait_for = wait_list
            )
        del wait_list[:]

        self._processed = True

        for following in self._next_layers:
            following[0].process()
Esempio n. 14
0
    def eval(self, calcs):
        """ Evaluate each calc and store in the k list if necessary.

        For every calc, ``calc.sph()`` is run and ``read_from_buffer()`` is
        called on the destination particle array.  Then, for each of the
        calc's ``nupdates`` properties, the matching step-property buffer is
        copied on the device:

        * calcs that do not integrate copy it into the update-property
          buffer, wait at ``particles.barrier()``, call ``particles.update()``
          when the calc is tagged ``"h"``, and push the property to the
          remote particles;
        * integrating calcs copy it into the ``k`` buffer selected by
          ``'k' + str(self.cstep)``.

        A final ``particles.barrier()`` closes the eval phase.

        :param calcs: iterable of calc objects, evaluated in order.
        """
        particles = self.particles
        k_num = 'k' + str(self.cstep)

        for calc in calcs:
            queue = calc.queue
            updates = calc.updates

            # get the destination particle array for this calc
            pa = self.arrays[calc.dnum]

            # Evaluate the calc. The result is stored in cl_tmpx, cl_tmpy, ...
            calc.sph()

            pa.read_from_buffer()

            for j in range(calc.nupdates):
                update_prop = updates[j]
                step_prop_buffer = pa.get_cl_buffer(self.step_props[j])

                if calc.integrates:
                    k_prop = self.k_props[calc.id][k_num][j]
                    k_prop_buffer = pa.get_cl_buffer(k_prop)

                    cl.enqueue_copy(
                        queue,
                        src=step_prop_buffer,
                        dest=k_prop_buffer,
                    ).wait()
                    continue

                update_prop_buffer = pa.get_cl_buffer(update_prop)

                # enqueue_copy_buffer names its destination parameter ``dst``
                # (unlike enqueue_copy's ``dest``), so pass (src, dst)
                # positionally rather than by keyword.
                cl.enqueue_copy_buffer(queue, step_prop_buffer,
                                       update_prop_buffer).wait()

                # ensure that all processes have reached this point
                particles.barrier()

                # update neighbor information if 'h' has been updated
                if calc.tag == "h":
                    particles.update()

                # update the remote particle properties
                self.rupdate_list[calc.dnum] = [update_prop]
                particles.update_remote_particle_properties(
                    self.rupdate_list)

        # ensure that the eval phase is completed for all processes
        particles.barrier()
Esempio n. 15
0
    def eval(self, calcs):
        """ Evaluate each calc and store in the k list if necessary.

        Each calc is evaluated with ``calc.sph()``, after which
        ``read_from_buffer()`` is called on its destination particle array
        (``self.arrays[calc.dnum]``).  For each of the calc's ``nupdates``
        properties, the step-property buffer is then copied on the device:

        * non-integrating calc: into the update-property buffer, followed by
          ``particles.barrier()``, ``particles.update()`` if the calc tag is
          ``"h"``, and an update of the remote particle properties;
        * integrating calc: into the ``k`` buffer for the current step
          (``'k' + str(self.cstep)``).

        The method ends with a ``particles.barrier()``.

        :param calcs: iterable of calc objects, evaluated in order.
        """
        particles = self.particles
        k_num = 'k' + str(self.cstep)

        for calc in calcs:
            queue = calc.queue
            updates = calc.updates
            nupdates = calc.nupdates

            # get the destination particle array for this calc
            pa = self.arrays[calc.dnum]

            # Evaluate the calc. The result is stored in cl_tmpx, cl_tmpy, ...
            calc.sph()

            pa.read_from_buffer()

            for j in range(nupdates):
                update_prop = updates[j]
                step_prop = self.step_props[j]
                step_prop_buffer = pa.get_cl_buffer(step_prop)

                if not calc.integrates:
                    update_prop_buffer = pa.get_cl_buffer(update_prop)

                    # Positional (src, dst): enqueue_copy_buffer does not
                    # accept a ``dest`` keyword; its parameter is ``dst``.
                    cl.enqueue_copy_buffer(queue, step_prop_buffer,
                                           update_prop_buffer).wait()

                    # ensure that all processes have reached this point
                    particles.barrier()

                    # update neighbor information if 'h' has been updated
                    if calc.tag == "h":
                        particles.update()

                    # update the remote particle properties
                    self.rupdate_list[calc.dnum] = [update_prop]
                    particles.update_remote_particle_properties(
                        self.rupdate_list)
                else:
                    k_prop = self.k_props[calc.id][k_num][j]
                    k_prop_buffer = pa.get_cl_buffer(k_prop)

                    cl.enqueue_copy(queue, src=step_prop_buffer,
                                    dest=k_prop_buffer,
                                    ).wait()

        # ensure that the eval phase is completed for all processes
        particles.barrier()
# Main loop
# For every sample size nx: upload nx elements, time ``memcpy_iterations``
# device-to-device copies, store the per-copy time in ``elapsed_times[i]``,
# and verify the copied data against the host source.
print('main loop')
from datetime import datetime
mf = cl.mem_flags
for i, nx in enumerate(seed_nxs):
	in_host = in_rand[:nx]
	out_host = np.zeros_like(in_host)
	in_gpu = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=in_host)
	# The destination needs only a size: passing ``hostbuf`` without a
	# *_HOST_PTR flag does not use the host data, so give the size directly.
	out_gpu = cl.Buffer(context, mf.WRITE_ONLY, in_host.nbytes)

	# Drain the queue so earlier work is not included in the measurement.
	queue.finish()
	t0 = datetime.now()
	for j in range(memcpy_iterations):
		#prg.copy_gpu(queue, a.shape, None, np.int32(nx), in_gpu, out_gpu)
		cl.enqueue_copy_buffer(queue, in_gpu, out_gpu)
	queue.finish()
	dt = datetime.now() - t0
	# NOTE(review): the extra division by 2 is kept from the original --
	# presumably it accounts for one read plus one write per copy; confirm.
	elapsed_times[i] = dt.total_seconds() / 2 / memcpy_iterations

	# Wait for the read-back before comparing: the transfer is not
	# guaranteed to be blocking in every PyOpenCL version.
	cl.enqueue_read_buffer(queue, out_gpu, out_host).wait()
	assert np.linalg.norm(in_host - out_host) == 0

	del in_host, out_host
	in_gpu.release()
	out_gpu.release()

	#print('%d/%d (%d %%)\r' % (i, num_samples, float(i)/num_samples*100)),
	#sys.stdout.flush()
	print('%d/%d (%d %%) dt = %g sec, nx = %d (%d Mbytes)' % (i, num_samples, float(i)/num_samples*100, elapsed_times[i], nx, nx*4//(1024**2)))
Esempio n. 17
0
 def reloadData(self):
     """Copy ``self.pos_cl`` into ``self.pos_n1_cl`` and ``self.pos_n2_cl``.

     The GL objects in ``self.gl_objects`` are acquired for OpenCL before the
     copies and released again afterwards, even if a copy raises.
     """
     import pyopencl as cl
     cl.enqueue_acquire_gl_objects(self.queue, self.gl_objects)
     try:
         cl.enqueue_copy_buffer(self.queue, self.pos_cl, self.pos_n1_cl).wait()
         cl.enqueue_copy_buffer(self.queue, self.pos_cl, self.pos_n2_cl).wait()
     finally:
         # Always hand the acquired objects back, including on failure.
         cl.enqueue_release_gl_objects(self.queue, self.gl_objects)
Esempio n. 18
0
    def trackContour(self):
        """Run one tracking iteration on the device.

        Increments ``self.nrOfTrackingIterations``, launches the detection,
        filtering, polar-conversion, sorting and interpolation kernels in
        sequence (separated by queue barriers), runs the convergence-check
        kernels and reads the ``trackingFinished`` and ``iterationFinished``
        flags back to the host.  At the end the interpolated coordinates and
        the contour center become the "previous" values, and the interpolated
        coordinates are passed to ``setStartingCoordinatesNew``.
        """
        if self.resetNormalsAfterEachImage and not self.getContourId(
        ) == 0 and self.nrOfTrackingIterations == 0:  # reset contour normal vector to radial vectors; we do this only starting for the second, since doing this for image 0, would destroy the correspondence of the indexes of the contour coordinates to their corresponding contour normals
            cl.enqueue_copy_buffer(
                self.queue, self.dev_radialVectorsX.data,
                self.dev_membraneNormalVectorsX.data).wait()
            cl.enqueue_copy_buffer(
                self.queue, self.dev_radialVectorsY.data,
                self.dev_membraneNormalVectorsY.data).wait()

        # tracking status variables
        self.nrOfTrackingIterations = self.nrOfTrackingIterations + 1

        # NOTE(review): stopInd is not read anywhere else in this method.
        stopInd = 1

        # Flag starts at 1 and is handed to the convergence-check kernels below.
        self.trackingFinished = np.array(1, dtype=np.int32)  # True
        self.dev_trackingFinished = cl_array.to_device(self.queue,
                                                       self.trackingFinished)

        # Flag starts at 0 and is handed to the setIterationFinished kernel.
        self.iterationFinished = np.array(0, dtype=np.int32)  # False
        self.dev_iterationFinished = cl_array.to_device(
            self.queue, self.iterationFinished)

        # Combine the separate X/Y (and theta/radius) device arrays into single
        # arrays for the kernels.
        # NOTE(review): presumably 2-component vector arrays -- confirm in helpers.
        self.dev_membraneCoordinates = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_membraneCoordinatesX,
            self.dev_membraneCoordinatesY)
        self.dev_membraneNormalVectors = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_membraneNormalVectorsX,
            self.dev_membraneNormalVectorsY)
        self.dev_previousInterpolatedMembraneCoordinates = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_previousInterpolatedMembraneCoordinatesX,
            self.dev_previousInterpolatedMembraneCoordinatesY)
        self.dev_membranePolarCoordinates = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_membranePolarTheta,
            self.dev_membranePolarRadius)
        self.dev_interpolatedMembraneCoordinates = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_interpolatedMembraneCoordinatesX,
            self.dev_interpolatedMembraneCoordinatesY)

        # One findMembranePosition launch per stride, each followed by a barrier.
        for strideNr in range(self.nrOfStrides):
            # set the starting index of the coordinate array for each kernel instance
            kernelCoordinateStartingIndex = np.int32(
                strideNr * self.detectionKernelStrideSize)

            self.prg.findMembranePosition(self.queue, self.trackingGlobalSize, self.trackingWorkGroupSize, self.sampler, \
                     self.dev_Img, self.imgSizeX, self.imgSizeY, \
                     self.buf_localRotationMatrices, \
                     self.buf_linFitSearchRangeXvalues, \
                     self.linFitParameter, \
                     cl.LocalMemory(self.fitIntercept_memSize), cl.LocalMemory(self.fitIncline_memSize), \
                     cl.LocalMemory(self.rotatedUnitVector_memSize), \
                     self.meanParameter, \
                     self.buf_meanRangeXvalues, self.meanRangePositionOffset, \
                     cl.LocalMemory(self.localMembranePositions_memSize), \
                     self.dev_membraneCoordinates.data, \
                     self.dev_membraneNormalVectors.data, \
                     self.dev_fitInclines.data, \
                     kernelCoordinateStartingIndex, \
                     self.inclineTolerance, \
                     self.inclineRefinementRange)

            barrierEvent = cl.enqueue_barrier(self.queue)

        # Filtering kernels; each works on the coordinate and normal-vector
        # arrays and is followed by a barrier.
        self.prg.filterNanValues(self.queue, self.gradientGlobalSize, None, \
               self.dev_membraneCoordinates.data, \
               self.dev_membraneNormalVectors.data, \
               cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes) \
               )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.filterJumpedCoordinates(self.queue, self.gradientGlobalSize, None, \
                 self.dev_previousContourCenter.data, \
                 self.dev_membraneCoordinates.data, \
                 self.dev_membraneNormalVectors.data, \
                    self.dev_previousInterpolatedMembraneCoordinates.data, \
                    cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), \
                 cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes), \
                 cl.LocalMemory(self.listOfGoodCoordinates_memSize), \
                 self.maxCoordinateShift \
                 )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.calculateInterCoordinateAngles(self.queue, self.gradientGlobalSize, None, \
                  self.dev_interCoordinateAngles.data, \
                  self.dev_membraneCoordinates.data \
                    )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.filterIncorrectCoordinates(self.queue, self.gradientGlobalSize, None, \
                 self.dev_previousContourCenter.data, \
                    self.dev_interCoordinateAngles.data, \
                    self.dev_membraneCoordinates.data, \
                    self.dev_membraneNormalVectors.data, \
                    cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes), cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes), \
                    self.maxInterCoordinateAngle \
                    )
        barrierEvent = cl.enqueue_barrier(self.queue)

        # information regarding barriers: http://stackoverflow.com/questions/13200276/what-is-the-difference-between-clenqueuebarrier-and-clfinish

        ########################################################################
        ### Calculate contour center
        ########################################################################
        self.calculateContourCenter()

        ########################################################################
        ### Convert cartesian coordinates to polar coordinates
        ########################################################################
        self.prg.cart2pol(self.queue, self.gradientGlobalSize, None, \
              self.dev_membraneCoordinates.data, \
              self.dev_membranePolarCoordinates.data, \
              self.dev_contourCenter.data)
        barrierEvent = cl.enqueue_barrier(self.queue)

        ########################################################################
        ### Sort and interpolate polar coordinates
        ########################################################################
        # sortCoordinates is launched with a global size of (1,1)
        self.prg.sortCoordinates(self.queue, (1,1), None, \
              self.dev_membranePolarCoordinates.data, \
              self.dev_membraneCoordinates.data, \
              self.dev_membraneNormalVectors.data, \
              np.int32(self.nrOfDetectionAngleSteps) \
              )
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.interpolatePolarCoordinatesLinear(self.queue, self.gradientGlobalSize, None, \
                   self.dev_membranePolarCoordinates.data, \
                   self.dev_radialVectors.data, \
                   self.dev_contourCenter.data, \
                   self.dev_membraneCoordinates.data, \
                   self.dev_interpolatedMembraneCoordinates.data, \
                   self.dev_interpolationAngles.data, \
                   self.nrOfAnglesToCompare \
                   )
        barrierEvent = cl.enqueue_barrier(self.queue)

        ########################################################################
        ### Check whether the contour and its center have converged
        ########################################################################
        self.prg.checkIfTrackingFinished(self.queue, self.gradientGlobalSize, None, \
                 self.dev_interpolatedMembraneCoordinates.data, \
                 self.dev_previousInterpolatedMembraneCoordinates.data, \
                 self.dev_trackingFinished.data, \
                 self.coordinateTolerance)
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.prg.checkIfCenterConverged(self.queue, (1,1), None, \
                self.dev_contourCenter.data, \
                self.dev_previousContourCenter.data, \
                self.dev_trackingFinished.data, \
                self.centerTolerance)
        barrierEvent = cl.enqueue_barrier(self.queue)

        # Split the combined arrays back into their separate component arrays.
        self.dev_membraneNormalVectorsX, self.dev_membraneNormalVectorsY = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_membraneNormalVectors)
        self.dev_previousInterpolatedMembraneCoordinatesX, self.dev_previousInterpolatedMembraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_previousInterpolatedMembraneCoordinates)
        self.dev_membraneCoordinatesX, self.dev_membraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_membraneCoordinates)
        self.dev_membranePolarTheta, self.dev_membranePolarRadius = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_membranePolarCoordinates)
        self.dev_interpolatedMembraneCoordinatesX, self.dev_interpolatedMembraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_interpolatedMembraneCoordinates)

        # Fetch the tracking-finished flag into the host-side array.
        cl.enqueue_read_buffer(self.queue, self.dev_trackingFinished.data,
                               self.trackingFinished).wait()

        barrierEvent = cl.enqueue_barrier(self.queue)

        # The current interpolated contour and center become the "previous"
        # values.
        cl.enqueue_copy_buffer(
            self.queue, self.dev_interpolatedMembraneCoordinatesX.data,
            self.dev_previousInterpolatedMembraneCoordinatesX.data).wait()
        cl.enqueue_copy_buffer(
            self.queue, self.dev_interpolatedMembraneCoordinatesY.data,
            self.dev_previousInterpolatedMembraneCoordinatesY.data).wait()
        cl.enqueue_copy_buffer(self.queue, self.dev_contourCenter.data,
                               self.dev_previousContourCenter.data).wait()

        # Run the setIterationFinished kernel on the device flag and read the
        # flag back to the host.
        self.prg.setIterationFinished(self.queue, (1, 1), None,
                                      self.dev_iterationFinished.data)
        barrierEvent = cl.enqueue_barrier(self.queue)

        cl.enqueue_read_buffer(self.queue, self.dev_iterationFinished.data,
                               self.iterationFinished).wait()

        # Hand the interpolated contour to setStartingCoordinatesNew.
        self.setStartingCoordinatesNew(self.dev_interpolatedMembraneCoordinatesX, \
                  self.dev_interpolatedMembraneCoordinatesY)
        pass
Esempio n. 19
0
    def trackContourSequentially(self):
        """Track the contour one detection-angle step at a time.

        For each of the ``nrOfDetectionAngleSteps`` steps the
        ``findMembranePosition`` kernel is launched for the current
        coordinate index, the coordinate and normal-vector arrays are read
        back to the host, and the entry for the following index is seeded on
        the host: its position is the rotation centre plus the rotated radius
        unit vector scaled by the current radius, its normal vector is a copy
        of the current one.  The seeded arrays are then uploaded again.

        After the loop the ``calculateMembraneNormalVectors`` kernel is run,
        the contour centre is recalculated, and the resulting coordinates are
        copied into the interpolated and previous-interpolated coordinate
        buffers, which also become the new starting coordinates.
        """
        for stepNumber in range(int(self.nrOfDetectionAngleSteps)):
            coordinateIndex = np.int32(stepNumber)
            nextIndex = coordinateIndex + 1

            # Rotation applied to the radius unit vector to reach the
            # following angle step.
            angle = self.angleStepSize * np.float64(nextIndex)
            cosAngle = np.cos(angle)
            sinAngle = np.sin(angle)
            radiusVectorRotationMatrix = np.array(
                [[cosAngle, -sinAngle],
                 [sinAngle, cosAngle]])

            # The kernel works on the combined X/Y representation
            # (presumably two-component vectors -- see helpers).
            self.dev_membraneNormalVectors = helpers.ToDoubleVectorOnDevice(
                self.queue,
                self.dev_membraneNormalVectorsX,
                self.dev_membraneNormalVectorsY)
            self.dev_membraneCoordinates = helpers.ToDoubleVectorOnDevice(
                self.queue,
                self.dev_membraneCoordinatesX,
                self.dev_membraneCoordinatesY)

            self.prg.findMembranePosition(
                self.queue, self.global_size, self.local_size,
                self.sampler,
                self.dev_Img, self.imgSizeX, self.imgSizeY,
                self.buf_localRotationMatrices,
                self.buf_linFitSearchRangeXvalues,
                self.linFitParameter,
                cl.LocalMemory(self.fitIntercept_memSize),
                cl.LocalMemory(self.fitIncline_memSize),
                cl.LocalMemory(self.rotatedUnitVector_memSize),
                self.meanParameter,
                self.buf_meanRangeXvalues,
                self.meanRangePositionOffset,
                cl.LocalMemory(self.localMembranePositions_memSize),
                self.dev_membraneCoordinates.data,
                self.dev_membraneNormalVectors.data,
                self.dev_fitInclines.data,
                coordinateIndex,
                self.inclineTolerance,
                self.inclineRefinementRange)

            cl.enqueue_barrier(self.queue)

            # Split the combined arrays back into separate X/Y arrays.
            (self.dev_membraneCoordinatesX,
             self.dev_membraneCoordinatesY) = helpers.ToSingleVectorsOnDevice(
                 self.queue, self.dev_membraneCoordinates)
            (self.dev_membraneNormalVectorsX,
             self.dev_membraneNormalVectorsY) = helpers.ToSingleVectorsOnDevice(
                 self.queue, self.dev_membraneNormalVectors)

            # Read everything back to the host, one array at a time.
            readBackPairs = (
                (self.dev_membraneCoordinatesX, self.host_membraneCoordinatesX),
                (self.dev_membraneCoordinatesY, self.host_membraneCoordinatesY),
                (self.dev_membraneNormalVectorsX, self.host_membraneNormalVectorsX),
                (self.dev_membraneNormalVectorsY, self.host_membraneNormalVectorsY),
            )
            for deviceArray, hostArray in readBackPairs:
                cl.enqueue_read_buffer(self.queue, deviceArray.data,
                                       hostArray).wait()

            currentMembraneCoordinate = np.array([
                self.host_membraneCoordinatesX[coordinateIndex],
                self.host_membraneCoordinatesY[coordinateIndex],
            ])
            radiusVector = currentMembraneCoordinate - self.rotationCenterCoordinate
            radiusVectorNorm = np.sqrt(radiusVector[0]**2 + radiusVector[1]**2)

            rotatedRadiusUnitVector = radiusVectorRotationMatrix.dot(
                self.radiusUnitVector)
            nextMembranePosition = (self.rotationCenterCoordinate
                                    + rotatedRadiusUnitVector * radiusVectorNorm)
            nextMembraneNormalVector = np.array([
                self.host_membraneNormalVectorsX[coordinateIndex],
                self.host_membraneNormalVectorsY[coordinateIndex],
            ])

            # The last entry has no successor to seed.
            if coordinateIndex >= self.host_membraneCoordinatesX.shape[0] - 1:
                continue

            self.host_membraneCoordinatesX[nextIndex] = nextMembranePosition[0]
            self.host_membraneCoordinatesY[nextIndex] = nextMembranePosition[1]
            self.host_membraneNormalVectorsX[nextIndex] = nextMembraneNormalVector[0]
            self.host_membraneNormalVectorsY[nextIndex] = nextMembraneNormalVector[1]

            # Upload the seeded host arrays for the next step.
            self.dev_membraneCoordinatesX = cl_array.to_device(
                self.queue, self.host_membraneCoordinatesX)
            self.dev_membraneCoordinatesY = cl_array.to_device(
                self.queue, self.host_membraneCoordinatesY)
            self.dev_membraneNormalVectorsX = cl_array.to_device(
                self.queue, self.host_membraneNormalVectorsX)
            self.dev_membraneNormalVectorsY = cl_array.to_device(
                self.queue, self.host_membraneNormalVectorsY)

        # calculate new normal vectors
        self.dev_membraneCoordinates = helpers.ToDoubleVectorOnDevice(
            self.queue,
            self.dev_membraneCoordinatesX,
            self.dev_membraneCoordinatesY)
        self.dev_membraneNormalVectors = helpers.ToDoubleVectorOnDevice(
            self.queue,
            self.dev_membraneNormalVectorsX,
            self.dev_membraneNormalVectorsY)

        self.prg.calculateMembraneNormalVectors(
            self.queue, self.gradientGlobalSize, None,
            self.dev_membraneCoordinates.data,
            self.dev_membraneNormalVectors.data)

        self.calculateContourCenter()

        (self.dev_membraneCoordinatesX,
         self.dev_membraneCoordinatesY) = helpers.ToSingleVectorsOnDevice(
             self.queue, self.dev_membraneCoordinates)
        (self.dev_membraneNormalVectorsX,
         self.dev_membraneNormalVectorsY) = helpers.ToSingleVectorsOnDevice(
             self.queue, self.dev_membraneNormalVectors)

        # Publish the tracked coordinates as both the current and the
        # previous interpolated contour (source first, destination second).
        copyPairs = (
            (self.dev_membraneCoordinatesX,
             self.dev_interpolatedMembraneCoordinatesX),
            (self.dev_membraneCoordinatesY,
             self.dev_interpolatedMembraneCoordinatesY),
            (self.dev_membraneCoordinatesX,
             self.dev_previousInterpolatedMembraneCoordinatesX),
            (self.dev_membraneCoordinatesY,
             self.dev_previousInterpolatedMembraneCoordinatesY),
        )
        for sourceArray, destinationArray in copyPairs:
            cl.enqueue_copy_buffer(self.queue, sourceArray.data,
                                   destinationArray.data).wait()

        self.setStartingCoordinatesNew(
            self.dev_interpolatedMembraneCoordinatesX,
            self.dev_interpolatedMembraneCoordinatesY)
        self.queue.finish()
Esempio n. 20
0
 def setContourCenter(self, dev_initialContourCenter):
     """Copy ``dev_initialContourCenter`` into ``self.dev_previousContourCenter``.

     The buffer-to-buffer copy is enqueued on ``self.queue`` and waited on
     before the method returns.
     """
     copyEvent = cl.enqueue_copy_buffer(
         self.queue,
         dev_initialContourCenter.data,
         self.dev_previousContourCenter.data)
     copyEvent.wait()
Esempio n. 21
0
    def start_training( self, context, training_data, training_results,
                        maximal_iterations = 10000, target_error = 0.01,
                        report = False ):
        """
        Starts training.

        Epochs are run while ``training_results.minimal_error`` is greater
        than ``target_error``.  The loop also stops once
        ``maximal_iterations`` epochs have been performed or as soon as an
        averaged error read back from the device is below ``target_error``.
        The error is fetched with a non-blocking read, so the value examined
        in an epoch may belong to an earlier epoch.

        @param context
            Training context; it must expose ``input_layer``,
            ``output_layer``, the OpenCL state ``opencl`` and the device
            buffers used below.

        @param training_data
            Sized sequence of (inputs, outputs) tuples.

        @param training_results
            TrainingResults structure where optimal results will be stored.

        @param maximal_iterations
            Maximal number of epochs to perform in this call.

        @param target_error
            Target absolute error.

        @param report
            Report object (optional); when truthy its ``process_iteration``
            is called each time an error value becomes available.

        @return None.  The outcome is accumulated into ``training_results``
            (``iterations``, ``minimal_error``, stored weights,
            ``opencl_time`` and ``total_time``).
        """

        # time.clock() was removed in Python 3.8.  Keep using it wherever it
        # still exists so measurements stay comparable with earlier runs and
        # only fall back to perf_counter() when it is gone.
        timer = getattr( time, 'clock', None ) or time.perf_counter
        start_time = timer()

        self.prepare_training( context )

        total_error = numpy.array( [1e12], numpy.float32 )
        total_error_buf = pyopencl.Buffer( 
            context.opencl.context, pyopencl.mem_flags.READ_WRITE | pyopencl.mem_flags.COPY_HOST_PTR,
            hostbuf = total_error )

        # Source of zeros used to reset both the gradient buffer and the
        # accumulated error.
        zeros_buf = pyopencl.Buffer( 
            context.opencl.context,
            pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.COPY_HOST_PTR,
            hostbuf = numpy.zeros( [context._weights_buf_size], numpy.float32 )
            )

        read_ready_event = None

        # Device copy of the expected outputs of the current sample.
        o_buf = pyopencl.Buffer( 
            context.opencl.context, pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.COPY_HOST_PTR,
            hostbuf = numpy.zeros( [context.output_layer.neuron_count], numpy.float32 )
            )

        context.opencl.kernel_setup_training_data.set_arg( 0, context._neurons_buf_size )
        context.opencl.kernel_setup_training_data.set_arg( 1, context._outputs_buf )
        context.opencl.kernel_setup_training_data.set_arg( 2, context.output_layer._neurons_offset )
        context.opencl.kernel_setup_training_data.set_arg( 3, context.output_layer.neuron_count )
        context.opencl.kernel_setup_training_data.set_arg( 4, o_buf )
        context.opencl.kernel_setup_training_data.set_arg( 5, pyopencl.LocalMemory( 32 * 4 ) )
        context.opencl.kernel_setup_training_data.set_arg( 6, context._errors_backpropagation_buf )
        context.opencl.kernel_setup_training_data.set_arg( 7, total_error_buf )

        # clear gradient
        pyopencl.enqueue_copy_buffer( 
            context.opencl.queue, zeros_buf, context._gradient_buf
            ).wait()

        i = 0
        calc_error_evt = None
        while training_results.minimal_error > target_error:
            if i >= maximal_iterations:
                break
            i += 1

            # Reset the accumulated error (one float32, hence 4 bytes).
            reset_total_error_evt = pyopencl.enqueue_copy_buffer( context.opencl.queue, zeros_buf, total_error_buf, byte_count = 4 )
            for j, ( inputs, outputs ) in enumerate( training_data, 1 ):
                evt = context.input_layer.set_inputs( inputs, is_blocking = False )
                context.input_layer._process_wait_for.append( evt )
                context.input_layer.process()

                evt = pyopencl.enqueue_write_buffer( 
                    context.opencl.queue, o_buf, outputs, is_blocking = False
                    )

                calc_error_evt = pyopencl.enqueue_nd_range_kernel( 
                    context.opencl.queue,
                    context.opencl.kernel_setup_training_data,
                    ( 32, ), ( 32, ),
                    wait_for = ( evt, context.output_layer._process_event, reset_total_error_evt )
                    )

                context.output_layer._calc_gradient_wait_for.append( calc_error_evt )
                context.input_layer.calc_weights_gradient()

                if not self.offline:
                    # Online mode: adjust weights after every sample and
                    # clear the gradient once it has been computed.
                    self.adjust_weights( context )
                    evt = pyopencl.enqueue_copy_buffer( 
                        context.opencl.queue, zeros_buf, context._gradient_buf,
                        wait_for = ( context.input_layer._calc_gradient_event, )
                        )
                    context.output_layer._calc_gradient_wait_for.append( evt )

                # Drain the queue periodically on very large data sets.
                if j % 20000 == 0:
                    context.opencl.queue.finish()

            if self.offline:
                # Offline mode: adjust weights once per epoch with self.n
                # divided by the number of samples.  The original value is
                # restored even when adjust_weights() raises, so a failed
                # epoch cannot leave self.n permanently scaled down.
                save_n = self.n
                self.n /= numpy.float32( len( training_data ) )
                try:
                    self.adjust_weights( context )
                finally:
                    self.n = save_n
                evt = pyopencl.enqueue_copy_buffer( context.opencl.queue, zeros_buf, context._gradient_buf )
                context.output_layer._calc_gradient_wait_for.append( evt )

            if read_ready_event and read_ready_event.command_execution_status == pyopencl.command_execution_status.COMPLETE:
                read_ready_event = None
                error_sum = total_error[0] / len( training_data )

                if report:
                    report.process_iteration( len( training_data ), self, training_results, error_sum, context )

                self.adjust_training_parameters( error_sum )

                if error_sum < training_results.minimal_error:
                    training_results.minimal_error = error_sum
                    training_results.store_weights( context )   # note: this call is blocking!

                if error_sum < target_error:
                    break

                training_results.opencl_time += context.opencl.gather_opencl_stats()

            if not read_ready_event:
                # we use nonblocking read to avoid waiting for GPU
                # this could lead to a delay in obtaining current error
                # error of current iteration can be returned in several iteration ahead
                read_ready_event = pyopencl.enqueue_read_buffer( 
                    context.opencl.queue, total_error_buf,
                    total_error, is_blocking = False,
                    wait_for = ( calc_error_evt, ) if calc_error_evt else None
                    )

        training_results.iterations += i

        # Final blocking read so the error of the last epoch is not lost.
        pyopencl.enqueue_read_buffer( 
            context.opencl.queue, total_error_buf,
            total_error, is_blocking = True,
            wait_for = ( calc_error_evt, ) if calc_error_evt else None
            )
        error_sum = total_error[0] / len( training_data )

        if error_sum < training_results.minimal_error:
            training_results.minimal_error = error_sum
            training_results.store_weights( context )

        training_results.opencl_time += context.opencl.gather_opencl_stats()
        training_results.total_time += timer() - start_time
Esempio n. 22
0
def test_overwrite_ecb():
    """Overwrite ``cl_empty_buffer`` with ``cl_zero_buffer`` via a buffer copy.

    ``zero_buffer.nbytes`` bytes are copied and the call blocks until the
    copy event has completed.
    """
    byte_count = zero_buffer.nbytes
    copy_event = cl.enqueue_copy_buffer(
        queue, cl_zero_buffer, cl_empty_buffer, byte_count)
    copy_event.wait()
Esempio n. 23
0
# Benchmark setup: host arrays and matching device buffers shared by the
# timed test_overwrite_* functions in this example.

# Host-side arrays of 200048 elements with dtype cl.array.vec.uint2.
zero_buffer = np.zeros(200048, dtype=cl.array.vec.uint2)
empty_buffer = np.empty(200048, dtype=cl.array.vec.uint2)
# Replace the uninitialised contents with a known, non-zero value.
empty_buffer.fill(16)
mf = cl.mem_flags
# Device buffers initialised from the host arrays (COPY_HOST_PTR).
cl_zero_buffer = cl.Buffer(ctx,
                           mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=zero_buffer)
# NOTE(review): this buffer is flagged READ_ONLY although it is the
# destination of the copy/fill commands below -- confirm this is intended.
cl_empty_buffer = cl.Buffer(ctx,
                            mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=empty_buffer)
# Two-element uint32 zero pattern handed to cl.enqueue_fill_buffer in
# test_overwrite_efb.
zero = np.zeros(2, np.uint32)
# make sure both buffers are initialised
cl.enqueue_copy(queue, empty_buffer, cl_empty_buffer)
print(empty_buffer)
# Overwrite the device buffer with the zero buffer once, outside the timed
# functions, and wait for the copy to finish.
cl.enqueue_copy_buffer(queue, cl_zero_buffer, cl_empty_buffer,
                       zero_buffer.nbytes).wait()

# Read the device buffer back into the host array and print it again so the
# effect of the overwrite can be inspected.
cl.enqueue_copy(queue, empty_buffer, cl_empty_buffer)
print(empty_buffer)


@timeit_repeat(reps)
def test_overwrite_ecb():
    """Timed case: overwrite ``cl_empty_buffer`` using a buffer-to-buffer copy.

    Copies ``zero_buffer.nbytes`` bytes from ``cl_zero_buffer`` and blocks
    until the copy event has completed.
    """
    byte_count = zero_buffer.nbytes
    copy_event = cl.enqueue_copy_buffer(
        queue, cl_zero_buffer, cl_empty_buffer, byte_count)
    copy_event.wait()


@timeit_repeat(reps)
def test_overwrite_efb():
    """Timed case: overwrite ``cl_empty_buffer`` using a pattern fill.

    Fills ``zero_buffer.nbytes`` bytes starting at offset 0 with the ``zero``
    pattern and blocks until the fill event has completed.
    """
    byte_count = zero_buffer.nbytes
    fill_event = cl.enqueue_fill_buffer(
        queue, cl_empty_buffer, zero, 0, byte_count)
    fill_event.wait()