Example 1
    def init(self, input_width):
        self.input_width = input_width
        weight_initializer = self.weight_initializer
        if not isinstance(weight_initializer, Initializer):
            # an Initializer class was passed: instantiate it with the layer dims
            weight_initializer = weight_initializer(
                self.input_width, self.units)
        bias_initializer = self.bias_initializer
        if not isinstance(bias_initializer, Initializer):
            bias_initializer = bias_initializer()
        self.weights_buf = weight_initializer((self.units, self.input_width))
        self.weights = array.to_device(self.queue, self.weights_buf)
        self.bias_buf = bias_initializer((self.units, 1))
        self.bias = array.to_device(self.queue, self.bias_buf)
        # should probably make this 2d so it can have dimensions (output_width, batch_size)
        self.output = array.zeros(self.queue, (self.batch_size, self.units),
                                  dtype=dtype)
        self.output_data = self.output.data
        self.input_width = cltypes.uint(self.input_width)
        self.output_width = cltypes.uint(self.units)
        max_output, max_batch_size = self.queue.device.max_work_item_sizes[:2]
        if self.output_width > max_output:
            raise ValueError(
                f"Layer output cannot exceed {max_output}, you gave {self.output_width}"
            )
        if self.batch_size > max_batch_size:
            raise ValueError(
                f"Batch size cannot exceed {max_batch_size}, you gave {self.batch_size}"
            )

        return self.units
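A note on the initializer convention above: `weight_initializer` may be passed either as a ready `Initializer` instance or as a class that `init` instantiates with the layer dimensions. A minimal sketch of that protocol (the `Initializer` base and the `GlorotUniform` example here are illustrative assumptions, not this project's actual classes):

import numpy as np

class Initializer:
    """Base: a callable that fills an array of the requested shape."""
    def __call__(self, shape):
        raise NotImplementedError

class GlorotUniform(Initializer):
    # hypothetical initializer; the real project's classes may differ
    def __init__(self, fan_in, fan_out):
        self.limit = np.sqrt(6.0 / (fan_in + fan_out))

    def __call__(self, shape):
        return np.random.uniform(-self.limit, self.limit, shape).astype(np.float32)

weight_initializer = GlorotUniform  # pass the class...
if not isinstance(weight_initializer, Initializer):
    # ...and init() instantiates it with (input_width, units)
    weight_initializer = weight_initializer(784, 128)
weights = weight_initializer((128, 784))  # shape (units, input_width)
print(weights.shape, weights.dtype)       # (128, 784) float32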
Example 2
    def inferSingleStepCL(self, pattern, weights):
        """
        __constant char* pattern,
        __constant float* weights,
        __global float* predictions,
        __global float* sums,
        uint const numBuckets

        :param pattern:
        :param param:
        :return:
        """
        cl_pattern = cl.Buffer(self._ctx,
                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=pattern)
        cl_weights = cl.Buffer(self._ctx,
                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=weights)
        predictions = np.empty(len(pattern), dtype=cltypes.float)
        cl_predictions = cl.Buffer(self._ctx, mf.WRITE_ONLY,
                                   predictions.nbytes)
        cl_sums = cl.Buffer(self._ctx, mf.READ_WRITE, 32 * len(pattern))
        self._prg.inferSingleStep(self._queue, (pattern.shape[0], ), None,
                                  cl_pattern, cl_weights, cl_predictions,
                                  cl_sums, cltypes.uint(self._numBuckets),
                                  cltypes.uint(self._maxinput))
        cl.enqueue_copy(self._queue, predictions, cl_predictions).wait()
        return predictions
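The host-side pattern here is the standard PyOpenCL round trip: copy inputs in with COPY_HOST_PTR, allocate a WRITE_ONLY output buffer, launch, then enqueue_copy the result back. A self-contained sketch of the same flow with a trivial kernel (the `double_it` kernel is illustrative only):

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

prg = cl.Program(ctx, """
__kernel void double_it(__constant float* inp, __global float* outp) {
    int gid = get_global_id(0);
    outp[gid] = 2.0f * inp[gid];
}
""").build()

host_in = np.arange(8, dtype=np.float32)
cl_in = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_in)
host_out = np.empty_like(host_in)
cl_out = cl.Buffer(ctx, mf.WRITE_ONLY, host_out.nbytes)

prg.double_it(queue, host_in.shape, None, cl_in, cl_out)
cl.enqueue_copy(queue, host_out, cl_out).wait()
print(host_out)  # [ 0.  2.  4.  6.  8. 10. 12. 14.]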
Example 3
    def compute(self, recordNum, pattern, bucketIdx, actValue, learn, infer):
        """
        Computes 1 step
        :param recordNum:
        :param pattern: indices of active columns in the TM layer
        :param classification: dict of bucketIdx and actualValue
        :param learn:
        :param infer:
        :return:
        """
        pattern = np.array(pattern, dtype=cltypes.uint)
        if not self._init_buffers:
            self._setup_buffers(pattern)

        ev_copy_pattern = cl.enqueue_copy(self._queue, self.cl_activeBitIdx, pattern)
        # update bit activations on device side
        ev_update_bit = self._prg.update_bit_activations(self._queue, (pattern.size,), None,
                                                         self.cl_bit_activations, self.cl_activeBitIdx,
                                                         wait_for=[ev_copy_pattern])

        multiStepPredictions = {}
        ev_learn = None
        if learn:
            ev_learn = [self._prg.learn(self._queue, (self.step_count * pattern.size,), None,
                                        self.cl_activeBitIdx, self.cl_table_average, self.cl_table_counts, self.alpha,
                                        self.actValueAlpha,
                                        cltypes.uint(bucketIdx), self._numBuckets,
                                        wait_for=[ev_update_bit])]
        if infer:
            """
                const __global float* averages,
                const __global uint* counts,
                const __global uint* activeBitIdx,
                __global float2* predictions, // the array of predictions
                __global const  uint* bitActivations, // the number of times each bit has been active
                uint const activeBits
            """
            # kernel for every active bit in each step
            ev_infer = self._prg.infer(self._queue, (self._numBuckets,), None,
                                       self.cl_table_average, self.cl_table_counts, self.cl_activeBitIdx,
                                       self.cl_predictions, self.cl_bit_activations,
                                       cltypes.uint(pattern.size), wait_for=ev_learn)

            cl.enqueue_copy(self._queue, self._predictions, self.cl_predictions, wait_for=[ev_infer]).wait()
        # print("Activations", self.bucket_activations)
        # multiStepPredictions['actualValues'] = predictions['x'] / len(pattern)
        # multiStepPredictions[step] = predictions['y'] / len(pattern)  # the probability for each bucket
        # print("Actual Values", multiStepPredictions['actualValues'])
        multiStepPredictions[1] = self._predictions.copy()
        # print("Probability", multiStepPredictions[1])
        self.bucket_activations[bucketIdx] += 1

        return multiStepPredictions
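Note how `compute` chains the copy, learn, and infer kernels through `wait_for` events instead of blocking between steps; only the final read back waits. A minimal sketch of the same dependency pattern:

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

prg = cl.Program(ctx, """
__kernel void inc(__global uint* buf) { buf[get_global_id(0)] += 1; }
""").build()

data = np.zeros(4, dtype=np.uint32)
buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data)

# chain: write -> kernel -> read, each step waiting on the previous event
ev_write = cl.enqueue_copy(queue, buf, np.ones(4, dtype=np.uint32))
ev_kernel = prg.inc(queue, (4,), None, buf, wait_for=[ev_write])
cl.enqueue_copy(queue, data, buf, wait_for=[ev_kernel]).wait()
print(data)  # [2 2 2 2]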
Example 4
    def _get_overlap_score_loop_bin(self, encoding):
        """
        Returns an array with boosted and non-boosted scores as a vector
        :param encoding: the encoded data
        :return:

        overlap_loop_bin(
            __constant long* activeBits, // active bits in sorted order (64-bit)
            __constant synapse_struct* synapses, // all the synapses
            __global uint2* overlaps, // columns to store overlap scores
            __constant float* boostFactors, // boost values for columns
            const float synPermConnected,
            const int synapsesPerColumn,
            const uint numActiveBits
        )
        """
        active_bits = np.where(encoding == 1)[0]
        cl.enqueue_copy(self._queue, self.cl_active_bits, active_bits)
        cl_synapses = self._get_cl_synapses_buffer()
        cl_boostFactors = self._get_cl_boost_factor_buffer()
        # array of overlap and boosted overlap scores
        overlap = np.zeros(self.columnCount, dtype=cl.array.vec.uint2)
        cl_overlap = cl.Buffer(self._ctx, mf.WRITE_ONLY, overlap.nbytes)
        self.prog.overlap_loop_bin(self._queue, (self.columnCount, ), None,
                                   self.cl_active_bits, cl_synapses,
                                   cl_overlap, cl_boostFactors,
                                   self.synPermConnected,
                                   self.synapsesPerColumn,
                                   cltypes.uint(active_bits.size)).wait()

        cl.enqueue_copy(self._queue, overlap, cl_overlap).wait()
        return overlap
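The returned array packs two scores per column into a `uint2`; going by the "overlap and boosted overlap scores" comment above, `x` holds the raw overlap and `y` the boosted overlap (the field order is an assumption). A small sketch of unpacking it on the host:

import numpy as np
import pyopencl.array as clarray

overlap = np.zeros(4, dtype=clarray.vec.uint2)
overlap['x'] = [3, 0, 7, 2]  # raw overlap per column (assumed field order)
overlap['y'] = [4, 0, 9, 2]  # boosted overlap per column
top2 = np.argsort(overlap['y'])[::-1][:2]
print(top2)  # indices of the two most strongly boosted columns: [2 0]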
Example 5
    def __init__(self,
                 queue,
                 activationThreshold=14,
                 cellsPerColumn=32,
                 columnCount=2048,
                 globalDecay=0.0,
                 initialPerm=0.21,
                 inputWidth=2048,
                 maxAge=0,
                 maxSegmentsPerCell=128,
                 maxSynapsesPerSegment=32,
                 minThreshold=11,
                 newSynapseCount=20,
                 outputType='normal',
                 pamLength=3,
                 permanenceDec=0.1,
                 permanenceInc=0.1,
                 seed=1960,
                 temporalImp='cl',
                 verbosity=0):
        if temporalImp != 'cl':
            raise ValueError('This implementation only supports OpenCL')
        self.activationThreshold = cltypes.uint(activationThreshold)
        self.columnCount = cltypes.uint(columnCount)
        self.cellsPerColumn = cltypes.uint(cellsPerColumn)
        self.globalDecay = cltypes.float(globalDecay)
        self.initialPerm = cltypes.float(initialPerm)
        self.maxAge = cltypes.uint(maxAge)
        self.maxSegmentsPerCell = cltypes.uint(maxSegmentsPerCell)
        self.maxSynapsesPerSegment = cltypes.uint(maxSynapsesPerSegment)
        self.minThreshold = cltypes.uint(minThreshold)
        self.newSynapseCount = cltypes.uint(newSynapseCount)
        self.outputType = outputType
        self.pamLength = cltypes.uint(pamLength)
        self.permanenceDec = cltypes.float(permanenceDec)
        self.permanenceInc = cltypes.float(permanenceInc)
        np.random.seed(seed)

        self.verbosity = verbosity
        self.inputWidth = inputWidth

        self._queue = queue
        self._ctx = queue.context

        self._setup_cl_buffers()
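The constructor takes only a CommandQueue and recovers the context from `queue.context`; a minimal sketch of building such a queue explicitly:

import pyopencl as cl

platform = cl.get_platforms()[0]   # pick the platform/device for your machine
device = platform.get_devices()[0]
ctx = cl.Context([device])
queue = cl.CommandQueue(ctx)
print(queue.context == ctx)  # True: the class above recovers ctx this way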
Example 6
    def __init__(self, queue, numBuckets, steps=[1], bits=2048, alpha=0.001, actValueAlpha=0.3, verbosity=False):
        self._prg = cl.Program(queue.context, kernel_src).build()
        self._learn_iteration = 0
        self.bit_activations = np.zeros(bits, dtype=cltypes.uint)
        self.bucket_activations = np.zeros(numBuckets, dtype=cltypes.uint)
        self.steps = steps
        self.step_count = len(steps)
        self.alpha = cltypes.float(alpha)
        self.actValueAlpha = cltypes.float(actValueAlpha)
        self.bits = bits  # number of bits in the input
        self._queue = queue  # the opencl queue
        self._ctx = queue.context  # the opencl context
        self._numBuckets = cltypes.uint(numBuckets)
        self._verbose = verbosity

        self._init_buffers = False
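Scalars are wrapped with `cltypes` so that later kernel launches pass arguments of exactly the width the kernel declares; a short sketch of what the wrappers are:

import numpy as np
from pyopencl import cltypes

alpha = cltypes.float(0.001)  # numpy float32 scalar, matches OpenCL "float"
n = cltypes.uint(2048)        # numpy uint32 scalar, matches OpenCL "uint"
print(alpha.dtype, n.dtype)   # float32 uint32
assert alpha.dtype == np.float32 and n.dtype == np.uint32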
Example 7
def run(ctx, src):
    w, h = 1024, 512
    depth = 1024
    img = np.zeros((h, w), dtype=cltypes.float)

    time = run_kernel(
        ctx,
        src,
        (w, h),
        Mem(img),
        *[cltypes.uint(k) for k in (w, h, depth)],
    )
    """
    for row in img[::16,::16]:
        for z in row:
            print("@" if z > 0.5 else ".", end="")
        print()
    """
    print("\t{:.3f} sec: {}".format(time, os.path.split(src)[1]))

    return (img, )
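`run_kernel` and `Mem` are helpers from this project; a hedged sketch of what an equivalent direct dispatch might look like, assuming the helper builds the program, binds one output buffer plus the scalar arguments, and returns the elapsed wall time:

import time
import numpy as np
import pyopencl as cl

def run_kernel_direct(ctx, kernel_src, global_size, img, *scalars):
    # assumed-equivalent of run_kernel(...): one WRITE_ONLY image buffer,
    # scalar args appended, elapsed wall time returned
    queue = cl.CommandQueue(ctx)
    buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, img.nbytes)
    prg = cl.Program(ctx, kernel_src).build()
    kernel = prg.all_kernels()[0]  # single-kernel source assumed
    start = time.time()
    kernel(queue, global_size, None, buf, *scalars)
    queue.finish()
    cl.enqueue_copy(queue, img, buf).wait()
    return time.time() - start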
Example 8
    def __init__(self, dims):
        self.width = dims[0]
        self.height = dims[1]

        import pyopencl as cl
        from pyopencl import cltypes
        import numpy as np
        from matplotlib import cm

        self.cm = cm
        self.np = np
        self.cl = cl

        # NOTE: hardcoded to platform 1, device 0; adjust for your machine
        self.ctx = cl.Context([cl.get_platforms()[1].get_devices()[0]])
        self.queue = cl.CommandQueue(self.ctx)
        self.prg = cl.Program(
            self.ctx, """
            #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
            __kernel void mandelbrot(__global float2 *q, __constant uchar4 *lut,
                             __global uchar4 *output, __global uint* output2, uint const maxiter)
            {
                const int gid = get_global_id(0);
                float real = q[gid].x;
                float imag = q[gid].y;
                output[gid] = (uchar4)(0,0,0,0);
                output2[gid] = 0;
                for(uint curiter = 0; curiter < maxiter; curiter++) {
                    float real2 = real*real, imag2 = imag*imag;
                    if (real2 + imag2 > 4.0f) {
                        output[gid] = lut[curiter];
                        output2[gid] = curiter;
                        return;
                    }
                    imag = 2 * real*imag + q[gid].y;
                    real = real2 - imag2 + q[gid].x;
                }
            }
        """).build()
        import time
        self.time = time
        self.centerx = (-0.74877 + -0.74872) / 2
        self.centery = (0.06505 + 0.06510) / 2
        self.padding = 2
        self.maxiter = cltypes.uint(64)

        # self.xmin = -np.pi
        # self.xmax = np.pi
        # self.ymin = -np.pi
        # self.ymax = np.pi
        self.update_pos()

        cmap = self.cm.get_cmap('gnuplot2', self.maxiter)
        cols = [(np.array(cmap(i)[:-1]) * 255).astype(cl.cltypes.uchar)
                for i in range(self.maxiter)]
        self.lut = np.zeros((self.maxiter, ), cl.cltypes.uchar4)
        for idx, i in enumerate(cols):
            self.lut[idx][0] = i[0]
            self.lut[idx][1] = i[1]
            self.lut[idx][2] = i[2]  # alpha channel is left at 0
        self.lut_opencl = cl.Buffer(self.ctx,
                                    cl.mem_flags.READ_ONLY
                                    | cl.mem_flags.COPY_HOST_PTR,
                                    hostbuf=self.lut)
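A hedged sketch of a compute pass for this class: the `draw` name, the `xmin`/`xmax`/`ymin`/`ymax` attributes (presumably set by `update_pos`), and the buffer handling are assumptions, while the kernel signature is taken from the source above.

    def draw(self):
        cl, np = self.cl, self.np
        mf = cl.mem_flags
        # complex-plane coordinate for every pixel, as float2 (x=re, y=im)
        xs = np.linspace(self.xmin, self.xmax, self.width, dtype=np.float32)
        ys = np.linspace(self.ymin, self.ymax, self.height, dtype=np.float32)
        q = np.zeros(self.width * self.height, dtype=cl.cltypes.float2)
        q['x'] = np.tile(xs, self.height)
        q['y'] = np.repeat(ys, self.width)

        q_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
        rgba = np.empty(self.width * self.height, dtype=cl.cltypes.uchar4)
        iters = np.empty(self.width * self.height, dtype=np.uint32)
        rgba_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, rgba.nbytes)
        iters_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, iters.nbytes)

        self.prg.mandelbrot(self.queue, (q.size, ), None, q_buf,
                            self.lut_opencl, rgba_buf, iters_buf, self.maxiter)
        cl.enqueue_copy(self.queue, rgba, rgba_buf)
        cl.enqueue_copy(self.queue, iters, iters_buf).wait()
        return rgba.reshape(self.height, self.width)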
Example 9
    colors = np.array(
        [
            # ... (palette values truncated in this excerpt)
        ],
        dtype=cltypes.float)
    colors_buf = cl.Buffer(ctx,
                           mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=colors)
    color_width = 8.0

    # Read source file
    with open(src_path, "r") as f:
        src = f.read()
    # Build program
    prg = cl.Program(ctx, src).build()

    # Execute OpenCL kernel on the device
    prg.render(queue, (width, height), None, image_buf, map_buf, colors_buf,
               *[cltypes.uint(x) for x in [width, height, depth, ssf]],
               cltypes.uint(len(colors)), cltypes.float(color_width))

    # Copy rendered image to host
    cl.enqueue_copy(queue, image, image_buf)

    # Flush queue
    queue.flush()
    queue.finish()

    # Draw low-resolution image in terminal
    for row in np.mean(image[::32, ::16], axis=2):
        for x in row:
            print("@" if x > 1e-4 else ".", end="")
        print()
Example 10
    def compute(self, recordNum, pattern, classification, learn, infer):
        """
        Computes 1 step
        :param recordNum:
        :param pattern: indices of active columns in the TM layer
        :param classification: dict of bucketIdx and actualValue
        :param learn:
        :param infer:
        :return:
        """
        if self.verbosity:
            print("  recordNum:", recordNum)
            print("  patternNZ (%d):" % len(pattern), pattern)
            print("  classificationIn:", classification)

        bucketIdx, actValue = classification['bucketIdx'], classification[
            'actValue']
        pattern = np.array(pattern).astype(cltypes.uint)
        self._patternNZHistory.append((recordNum, pattern))

        multiStepPredictions = {}
        if infer:
            multiStepPredictions = self.infer(pattern, classification)

        if learn and bucketIdx is not None:
            cl_activeBitIdx = cl.Buffer(self._ctx,
                                        mf.READ_ONLY | mf.COPY_HOST_PTR,
                                        hostbuf=pattern)
            for learnRecordNum, learnPattern in self._patternNZHistory:
                error = dict()
                targetDist = np.zeros(self._numBuckets + 1,
                                      dtype=cltypes.float)
                targetDist[bucketIdx] = 1.0

            for step, table in self._weights.items():
                # print("old table")
                # self._show_table(table)
                """
                 int* activeBitIdx
                 float2 *table, // x=histogram, y=moving average
                float const alpha, // moving average alpha
                float const actualValue, // actual input value
                int const bucketIdx, // bucket that actualValue falls into
                int const bucketCount,
                bool const learn,
                bool const infer,
                __global float *predictions
                """

                new_table = table.copy()
                cl_new_table = cl.Buffer(self._ctx,
                                         mf.WRITE_ONLY | mf.COPY_HOST_PTR,
                                         hostbuf=new_table)
                cl_table = cl.Buffer(self._ctx,
                                     mf.READ_ONLY | mf.COPY_HOST_PTR,
                                     hostbuf=table)
                if infer:
                    predictions = np.zeros(self._numBuckets,
                                           dtype=cl.array.vec.float2)
                    cl_predictions = cl.Buffer(self._ctx, mf.READ_WRITE,
                                               predictions.nbytes)
                else:
                    cl_predictions = cl.Buffer(self._ctx, mf.WRITE_ONLY, 1)
                self._prg.infer_compute(self._queue, (pattern.shape[0], ),
                                        None, cl_activeBitIdx, cl_table,
                                        cl_new_table,
                                        cltypes.float(self.actValueAlpha),
                                        cltypes.float(actValue),
                                        cltypes.uint(bucketIdx),
                                        cltypes.uint(self._numBuckets),
                                        cltypes.char(learn),
                                        cltypes.char(infer), cl_predictions)

                if learn:
                    cl.enqueue_copy(self._queue, self.steps[step],
                                    cl_new_table).wait()
                if infer:
                    cl.enqueue_copy(self._queue, predictions,
                                    cl_predictions).wait()
                print("Activations", self.bucket_activations)

                multiStepPredictions[step] = predictions['y'] / len(
                    pattern)  # the probability for each bucket
                print("Actual Values", multiStepPredictions['actualValues'])
                print("Probability", multiStepPredictions[step])
        self.bucket_activations[bucketIdx] += 1

        return multiStepPredictions
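Per the kernel comment above, each weight-table cell is a `float2` with `x` accumulating a histogram of activations and `y` a moving average of the actual value. A small host-side sketch of that update rule (the flat `bit * bucketCount + bucket` layout is an assumption):

import numpy as np
import pyopencl.array as clarray

bits, numBuckets = 8, 4
table = np.zeros(bits * numBuckets, dtype=clarray.vec.float2)

alpha, actValue, bucketIdx, bitIdx = 0.3, 7.5, 2, 5
cell = bitIdx * numBuckets + bucketIdx          # assumed flat layout
table['x'][cell] += 1                           # histogram of activations
table['y'][cell] += alpha * (actValue - table['y'][cell])  # moving average
print(table['x'][cell], table['y'][cell])       # 1.0 2.25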
Example 11
File: nn.py Project: JonnoFTW/nn-cl
    def train(self,
              epochs: int,
              loss: Loss,
              optimizer: Optimizer,
              x_train,
              y_train,
              x_test,
              y_test,
              x_validation=None,
              y_validation=None,
              batch_size: int = 1,
              shuffle: bool = True,
              validation_pct=None,
              validation_method='cross-validation',
              callbacks=()):
        """

        :param epochs: number of epochs to run
        :param loss:  a loss function
        :param optimizer: the optimizer to use
        :param x_train: a 2D array of shape (rows, features)
        :param y_train: a 2d array of shape (rows, output features),
                output_features is the number of values we want to predict
        :param x_test: testing data inputs
        :param y_test: testing data true values
        :param validation_method: a string to determine which validation method to use: 'holdout', 'cross-validation'
        :return: dict of recorded losses for 'batch', 'validation' and 'testing'

        For example, our input might be:
        x_train = [
            [0,1,1],
            [0,2,1],
            [1,2,1],
            [0,3,4],
        ]
        That is 4 rows with 3 features each, we might do a binary classification on this:
        y_train = [
            [0,1],
            [0,1],
            [1,0],
            [0,1]
        ]
        That is, each training input maps to one of these
        All this will be copied to the device

        Validation methods are:

        1. Specify x_validation,y_validation and the same provided dataset will be used to validate every epoch
        2. Specify validation_pct to determine how much of the training set will be set aside as validation.
           Specify validation_method to determine which method to use:
            * holdout: the same subset of x_train is used to validate each epoch
            * cross-validation: at the start of each epoch a random sample of x_train/y_train is set aside


        """

        if validation_pct is not None and x_validation is not None and y_validation is not None:
            raise ValueError(
                "Please set either validation_pct or (x_validation, y_validation), not both"
            )
        if (x_validation is not None) != (y_validation is not None):
            raise ValueError("Please set both (x_validation and y_validation)")

        x_train = x_train.astype(dtype)
        y_train = y_train.astype(dtype)

        if validation_pct:

            # slice off the last validation_ct from x_train,y_train
            if 0 <= validation_pct < 1:
                training_samples = int(x_train.shape[0] * (1 - validation_pct))
                validation_samples = int(x_train.shape[0] * validation_pct)
                if validation_method == 'holdout':
                    print(
                        f"Holding out last {validation_samples} samples of training data for validation"
                    )
                    # slice the validation rows off before truncating the
                    # training set, otherwise the validation slice is empty
                    x_validation = x_train[training_samples:]
                    y_validation = y_train[training_samples:]
                    x_train = x_train[:training_samples]
                    y_train = y_train[:training_samples]
                    x_val_gpu = array.to_device(self.queue, x_validation)
                    y_val_gpu = array.to_device(self.queue, y_validation)
                elif validation_method == 'cross-validation':
                    print(
                        f"Using cross-validation on last {validation_samples}")
                else:
                    raise ValueError("Invalid validation method")
                validation_user = False

            else:
                raise ValueError(
                    "validation_pct must be in the range 0 <= validation_pct < 1")
        elif x_validation is not None and y_validation is not None:
            print("User provided validation")
            x_validation = x_validation.astype(dtype)
            y_validation = y_validation.astype(dtype)
            x_val_gpu = array.to_device(self.queue, x_validation)
            y_val_gpu = array.to_device(self.queue, y_validation)
            validation_samples = len(x_validation)
            training_samples = x_train.shape[0]
            validation_user = True
        else:
            training_samples = x_train.shape[0]
        if len(x_train) != len(y_train):
            raise ValueError("X and Y for test/train must be same length")
        if training_samples % batch_size != 0:
            raise ValueError(
                "Training dataset must have rows divisible by batch size")

        input_features = cltypes.uint(x_train.shape[1])
        output_features = cltypes.uint(y_train.shape[1])
        if input_features != self.layers[0].input_width:
            raise ValueError(
                f"Input features (provided={input_features}) must be the same as layer_0 input width (required={self.layers[0].input_width})"
            )
        # Just copy all training and all testing data to the device
        for dn, ds in ("x_train",
                       x_train), ("y_train",
                                  y_train), ("x_validation",
                                             x_validation), ("y_validation",
                                                             y_validation):
            try:
                print("{}\n\tsize={}\n\tshape={}".format(
                    dn, humanize.naturalsize(ds.nbytes), ds.shape))
            except AttributeError:
                pass

        # x_train_gpu = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=x_train)
        x_train_gpu = array.to_device(self.queue, x_train)
        y_train_gpu = array.to_device(self.queue, y_train)

        # should probably check that our data won't exceed available device memory,
        # transparently queue up more data once it's been used
        losses = {'batch': [], 'validation': [], 'testing': []}
        for i in tqdm(range(epochs), desc='Epoch: ', position=0):
            # shuffle the rows
            if shuffle:
                self.shuffle(x_train_gpu.data, y_train_gpu.data,
                             training_samples, input_features, output_features)
            for idx in tqdm(range(training_samples // batch_size),
                            desc='Batch: ',
                            position=1,
                            unit=' batch'):
                idx = cltypes.uint(idx)
                # idx here is the batch number

                batch_x_gpu = x_train_gpu[idx * batch_size:idx * batch_size +
                                          batch_size]
                batch_y_gpu = y_train_gpu[idx * batch_size:idx * batch_size +
                                          batch_size]
                # copy all of these to the device?
                output = self.forward(batch_x_gpu, verbose=False)
                loss_val = loss.cpu(batch_y_gpu, output)
                # err = loss(batch_y_gpu, output, )
                losses['batch'].append(loss_val)
                # print(f"Mean Batch Loss={loss_val}")
                optimizer(loss, self, batch_x_gpu, batch_y_gpu)
                # if idx % 900 == 0:
                #     for c in callbacks:
                #         if c.batch_end:
                #             c(losses)
            # run the network and get error for the validation set
            # this should be a single batch of size validation_samples
            # will need to allocate specific validation arrays

            # if validation_user:
            #     # validate with user supplied validation data
            #     output = self.forward(x_val_gpu, 0)  # should probably be done as a single batch,
            #     val_loss = loss(y_val_gpu, output, 0)
            # else:
            #     # idx is the index of the validation set start position
            #     idx = len(x_train) - validation_samples
            #     output = self.forward(x_train_gpu, idx)
            #     val_loss = loss(y_train_gpu, output, idx)
            # losses['validation'].append(val_loss)
            # # collect metrics for training set
            # output = self.forward(x_test, 0)
            # test_loss = loss(y_test, output, 0)
            # losses['testing'].append(test_loss)
            for c in callbacks:
                c(losses)
        return losses
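One subtlety worth isolating is the holdout split: the validation rows must be sliced off *before* `x_train` is truncated, as fixed above. A standalone sketch of that arithmetic:

import numpy as np

x_train = np.arange(10, dtype=np.float32).reshape(10, 1)
validation_pct = 0.2
training_samples = int(x_train.shape[0] * (1 - validation_pct))

x_validation = x_train[training_samples:]  # last 20% held out first
x_train = x_train[:training_samples]       # then truncate the training set
print(len(x_train), len(x_validation))     # 8 2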
Example 12
    def __init__(self,
                 queue,
                 columnCount=2048,
                 globalInhibition=1,
                 inputWidth=500,
                 inputActive=33,
                 boostStrength=0.0,
                 numActiveColumnsPerInhArea=40,
                 potentialPct=.5,
                 stimulusThreshold=0,
                 seed=1956,
                 dutyCyclePeriod=1000,
                 spVerbosity=0,
                 spatialImp='cl',
                 synPermActiveInc=0.05,
                 synPermConnected=0.10,
                 synPermInactiveDec=0.008):
        if spatialImp != 'cl':
            raise ValueError(
                'This implementation only supports the OpenCL spatial pooler')
        if globalInhibition != 1:
            raise ValueError(
                'This implementation does not support local inhibition')
        self.columnCount = cltypes.uint(columnCount)
        self.globalInhibition = globalInhibition
        self.inputWidth = inputWidth
        self.boostStrength = boostStrength
        self.numActiveColumnPerInhArea = cltypes.uint(
            numActiveColumnsPerInhArea)
        self.potentialPct = cltypes.float(potentialPct)
        np.random.seed(seed)
        self.verbosity = spVerbosity
        self.synPermActiveInc = cltypes.float(synPermActiveInc)
        self.synPermConnected = cltypes.float(synPermConnected)
        self.synPermInactiveDec = cltypes.float(synPermInactiveDec)
        # store the TM as an array of int, either on or off
        self.columns = np.zeros(columnCount, dtype=cltypes.uint)
        self.synapsesPerColumn = cltypes.uint(inputWidth * potentialPct)
        self._stimulusThreshold = cltypes.uint(stimulusThreshold)
        self._dutyCyclePeriod = cltypes.uint(dutyCyclePeriod)

        self._activeDutyCycles = np.zeros(self.columnCount, cltypes.uint)
        self._overlapDutyCycles = np.zeros(self.columnCount, cltypes.uint)
        self._minOverlapDutyCycles = np.zeros(self.columnCount, cltypes.uint)
        self._boostFactors = np.ones(self.columnCount, dtype=cltypes.float)

        self._queue = queue
        self._ctx = queue.context
        self._updatePeriod = 50

        synapse_struct = np.dtype([('permanence', cltypes.float),
                                   ('bitIdx', cltypes.uint)])
        synapse_struct, synapse_struct_c_decl = cl.tools.match_dtype_to_c_struct(
            self._ctx.devices[0], "synapse_struct", synapse_struct)
        synapse_struct = cl.tools.get_or_register_dtype(
            'synapse_struct', synapse_struct)

        overlap_struct = np.dtype([('overlap', cltypes.uint),
                                   ('boosted', cltypes.uint)])
        overlap_struct, overlap_struct_c_decl = cl.tools.match_dtype_to_c_struct(
            self._ctx.devices[0], "overlap_struct", overlap_struct)
        self.overlap_struct = cl.tools.get_or_register_dtype(
            'overlap_struct', overlap_struct)

        self.synapses = np.zeros(
            (columnCount * self.synapsesPerColumn),
            dtype=synapse_struct)  # x is permanence value, y is input bit idx
        if spVerbosity >= 1:
            print(
                '------------CL  SpatialPooler Parameters ------------------')
            # print("Synapse Struct", synapse_struct_c_decl)
            print("Synapses\t", self.synapses.size)
            print("Columns\t", self.columnCount)
            print("Input Width\t", self.inputWidth)
            print("Synapses Per Column\t", self.synapsesPerColumn)
            print("Synapse Connection Threshold\t", self.synPermConnected)

        self.synPermMin_ = 0.0
        self.synPermMax_ = 1.0

        self.synapses['permanence'] = np.clip(
            np.random.normal(synPermConnected,
                             (self.synPermMax_ - self.synPermMin_) / 10,
                             size=self.synapses.shape[0]).astype(np.float32),
            0, 1)
        self.synapses_no_bit = self.synapses['permanence']
        input_synapses = np.arange(0, inputWidth)
        for column in range(self.columnCount):
            idx = column * self.synapsesPerColumn
            self.synapses['bitIdx'][idx:idx +
                                    self.synapsesPerColumn] = np.random.choice(
                                        input_synapses, self.synapsesPerColumn,
                                        False)
        bits, counts = np.unique(self.synapses['bitIdx'], return_counts=True)
        # array mapping each input bit to its synapse indexes
        max_count = np.max(counts)
        self.max_input_to_synapse = cltypes.int(max_count)
        self.input_bitIdx = np.full((len(counts) * max_count),
                                    -1,
                                    dtype=cltypes.int)

        for inputBitIdx in range(inputWidth):
            idx = inputBitIdx * max_count
            synapseIndexes = np.where(
                self.synapses['bitIdx'] == inputBitIdx)[0]
            self.input_bitIdx[idx:idx + synapseIndexes.size] = synapseIndexes

        # print("Connected synapses: ", np.where(self.synapses['permanence'] > synPermConnected)[0].size / float(
        #     self.synapses['permanence'].size))
        # each column connects to exactly columnCount*potentialPct inputs
        src = ''.join(
            [synapse_struct_c_decl, overlap_struct_c_decl, kernel_src])
        self.prog = cl.Program(self._ctx, src).build()
        # print (map(lambda x: x.get_info(pyopencl.kernel_info.FUNCTION_NAME), self.prog.all_kernels()))
        self._iterationNum = 0
        self._iterationLearnNum = 0
        self._inhibitionRadius = self.columnCount
        self.synapseCount = self.synapsesPerColumn * self.columnCount
        if spVerbosity >= 1:
            # self._show_synapses()
            pass

        # initialise host buffers for commonly used things
        # we only copy stuff between host and device when we need to

        self.overlap = np.zeros(
            self.columnCount,
            dtype=cltypes.uint2)  # array of overlap and boosted overlap scores
        self.cl_boost_factors = cl.Buffer(self._ctx,
                                          mf.READ_WRITE | mf.COPY_HOST_PTR,
                                          hostbuf=self._boostFactors)
        self.cl_overlap = cl.Buffer(self._ctx, mf.READ_WRITE,
                                    self.overlap.nbytes)

        encoding_temp = np.empty(
            inputWidth,
            dtype=cltypes.uchar)  # output is a np.uint8 == cltypes.uchar

        self.cl_encoding = cl.Buffer(self._ctx,
                                     mf.READ_ONLY,
                                     size=encoding_temp.nbytes)
        self.active_bits = np.zeros(inputActive, dtype=cltypes.long)
        self.inputActive = inputActive
        self.cl_active_bits = cl.Buffer(self._ctx,
                                        mf.READ_ONLY | mf.COPY_HOST_PTR,
                                        hostbuf=self.active_bits)
        self.cl_synapses = cl.Buffer(self._ctx,
                                     mf.READ_WRITE | mf.COPY_HOST_PTR,
                                     hostbuf=self.synapses)
        self.cl_input_bitIdx = cl.Buffer(self._ctx,
                                         mf.READ_ONLY | mf.COPY_HOST_PTR,
                                         hostbuf=self.input_bitIdx)

        self.cl_synapses_no_bit = cl.Buffer(self._ctx,
                                            mf.READ_ONLY | mf.COPY_HOST_PTR,
                                            hostbuf=self.synapses_no_bit)
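The `synapse_struct` round trip above is the standard PyOpenCL structured-dtype recipe: declare the numpy dtype, let `match_dtype_to_c_struct` pad it to the device's C layout, and prepend the returned declaration to the kernel source. A self-contained sketch:

import numpy as np
import pyopencl as cl
import pyopencl.tools
from pyopencl import cltypes

ctx = cl.create_some_context()
synapse_struct = np.dtype([('permanence', cltypes.float),
                           ('bitIdx', cltypes.uint)])
synapse_struct, c_decl = cl.tools.match_dtype_to_c_struct(
    ctx.devices[0], "synapse_struct", synapse_struct)
print(c_decl)  # C declaration to paste ahead of the kernel source

synapses = np.zeros(4, dtype=synapse_struct)
synapses['permanence'] = np.random.rand(4).astype(np.float32)
synapses['bitIdx'] = np.arange(4, dtype=np.uint32)
mf = cl.mem_flags
cl_synapses = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=synapses)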