Esempio n. 1
0
    for K, C, N in ((3072,3072*1,32),(3072,3072*1,64),(3072,3072*1,96),(3072,3072*1,128),
                    (3072,3072*2,32),(3072,3072*2,64),(3072,3072*2,96),(3072,3072*2,128),
                    (3072,3072*3,32),(3072,3072*3,64),(3072,3072*3,96),(3072,3072*3,128),
                    (3072,3072*4,32),(3072,3072*4,64),(3072,3072*4,96),(3072,3072*4,128),): 
                    #(3072,3072,32+128*0),(3072,3072,64+128*0),(3072,3072,96+128*0),(3072,3072,128+128*0),
                    #(3072,3072,32+128*1),(3072,3072,64+128*1),(3072,3072,96+128*1),(3072,3072,128+128*1),
                    #(3072,3072,32+128*2),(3072,3072,64+128*2),(3072,3072,96+128*2),(3072,3072,128+128*2),
                    #(3072,3072,32+128*3),(3072,3072,64+128*3),(3072,3072,96+128*3),(3072,3072,128+128*3),): 
        for op,  dimA,  dimB,  dimC in (
          ("nn", (K,C), (C,N), (K,N) ),  # fprop
          ("tn", (K,C), (K,N), (C,N) ),  # bprop
          ("nt", (K,N), (C,N), (K,C) )): # update

            repeat = 5000 if C <= 3072 else 500

            devA1 = ng.empty(dimA, dtype=dtype)
            devB1 = ng.empty(dimB, dtype=dtype)
            devC1 = ng.empty(dimC, dtype=dtype)

            # fill with uniform randoms from -1 to 1
            devA1[:] = 2 * (.5 - ng.rand())
            devB1[:] = 2 * (.5 - ng.rand())

            # just alias if same dtype
            if dtype is np.float32:
                devA2 = devA1
                devB2 = devB1
            # otherwise copy
            else:
                devA2 = ng.empty(dimA, dtype=np.float32)
                devB2 = ng.empty(dimB, dtype=np.float32)
Esempio n. 2
0
                for k in size:
                    print("op,M,N,K: ", op, m, n, k)

                    dimA = (m, k) if op[0] == 'n' else (k, m)
                    dimB = (k, n) if op[1] == 'n' else (n, k)
                    dimC = (m, n)

                    cpuA = np.random.uniform(-1.0, 1.0,
                                             dimA).astype(np.float32)
                    cpuB = np.random.uniform(-1.0, 1.0,
                                             dimB).astype(np.float32)
                    #cpuB = np.identity(n, dtype=dtype)

                    devA = ng.array(cpuA, dtype=dtype)
                    devB = ng.array(cpuB, dtype=dtype)
                    devC = ng.empty(dimC, dtype=dtype)

                    #repeat = min(int(50.0 * 4096**3 / (m * n * k)), 1000)

                    if op[0] == 't': cpuA, devA = cpuA.T, devA.T
                    if op[1] == 't': cpuB, devB = cpuB.T, devB.T

                    ng.dot(devA, devB, devC, repeat=1)

                    #context.synchronize()

                    cpuC = np.dot(cpuA, cpuB)

                    cpuD = devC.get()
                    diff = np.absolute(cpuC - cpuD)
                    max_diff = diff.max()
Esempio n. 3
0
            layers.append(layer)

            # find the size of the largest buffers so they can be shared
            if layer.sizeF > max_weights:
                max_weights = layer.sizeF
                max_weight_layer = layer

            if layer.sizeO > max_deltas:
                max_deltas = layer.sizeO
                max_delta_layer = layer

        # for layer in sorted(layers, key=lambda l: l.sizeO, reverse=True):
        #     print "%d %s" % (layer.sizeO, layer)

        # Init shared buffers (assumes consistent dtype for now)
        shared_deltas[0] = ng.empty(max_delta_layer.dimO2,
                                    dtype=max_delta_layer.dtype)
        shared_deltas[1] = ng.empty(max_delta_layer.dimO2,
                                    dtype=max_delta_layer.dtype)
        shared_weights = ng.empty(max_weight_layer.dimF2,
                                  dtype=max_weight_layer.dtype)

        prev_layer = None
        delta = False
        for layer in layers:

            print layer

            # Initialize buffers.  Alternate shared delta buffer.
            # One layer can't have the same buffer for both error in and error out.
            layer.init_activations()
            layer.init_weights(shared=shared_weights)
# bprop(nn): NK   x KC   = NC
# updat(tn): NK^T x NC   = KC

repeat = 2000


for K, C, N in ((3072,3072,32),):

    total  = 0

    for op,  dimA,  dimB,  dimC in (
      ("nn", (K,C), (C,N), (K,N) ),   # fprop
      ("tn", (K,C), (K,N), (C,N) ),   # bprop
      ("nt", (K,N), (C,N), (K,C) ),): # update

        devA = ng.empty(dimA, dtype=np.float32)
        devB = ng.empty(dimB, dtype=np.float32)
        devC = ng.empty(dimC, dtype=np.float32)

        # fill with uniform randoms from -1 to 1
        devA[:] = 2 * (.5 - ng.rand())
        devB[:] = 2 * (.5 - ng.rand())

        total += cublas_dot(op, devA, devB, devC, repeat=repeat, warmup=True)

    print "N2 Total: ", total
    total = 0

    for op,  dimA,  dimB,  dimC in (
      ("nt", (N,C), (K,C), (N,K) ),   # fprop
      ("nn", (N,K), (K,C), (N,C) ),   # bprop
Esempio n. 5
0
            dimA = (m,k) if op[0] == 'n' else (k,m)
            dimB = (k,n) if op[1] == 'n' else (n,k)
            dimC = (m,n)

            if data_type == "All Ones":
                cpuA = np.ones(dimA, dtype=dtype).astype(np.float32)
                cpuB = np.ones(dimB, dtype=dtype).astype(np.float32)
                #cpuB = np.identity(n, dtype=np.float32)
            else:
                cpuA = np.random.uniform(-1.0, 1.0, dimA).astype(np.float32)
                cpuB = np.random.uniform(-1.0, 1.0, dimB).astype(np.float32)

            devA = ng.array(cpuA, dtype=dtype)
            devB = ng.array(cpuB, dtype=dtype)
            devC = ng.empty(dimC, dtype=dtype)

            if op[0] == 't': cpuA, devA = cpuA.T, devA.T
            if op[1] == 't': cpuB, devB = cpuB.T, devB.T

            ng.dot(devA, devB, devC, repeat=repeat)

            if cpu:

                cpuC = np.dot(cpuA, cpuB)

                cpuD = devC.get()
                diff = np.absolute(cpuC - cpuD)

                print diff.max()
                print cpuD[::max(m//4,1),::max(n//4,1)]
Esempio n. 6
0
class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.
    Note that some functions defined in the generic Backend class, such as
    cross-map pooling and normalization, and adaDelta, are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        """
        Create the NervanaGPU handle, seed the host random number generator
        and record the device id for this backend.

        Arguments:
            rng_seed: Seed value stored on the instance and consumed by
                      rng_init.
            stochastic_round (bool, optional): Forwarded to the NervanaGPU
                                               constructor.
            device_id (int, optional): Stored as self.device_id.  A value of
                                       None is replaced by 0.
        """
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        # The seed has to be on the instance before rng_init looks for it.
        self.rng_seed = rng_seed
        self.rng_init()
        if device_id is None:
            device_id = 0
        self.device_id = device_id

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this class.

        The mem_pool lives on the device and cannot be serialized.  When one
        is present, the returned state carries mem_pool=None together with a
        'mem_pool_pickle' entry (shape and dtype) from which __setstate__
        re-allocates the pool.  The state is built from a copy of the instance
        dictionary so that serializing does not discard the mem_pool of the
        live instance.

        Returns:
            dict: A shallow copy of self.__dict__ with the mem_pool entry
                  replaced as described above.
        """
        state = self.__dict__.copy()
        pool = getattr(self, 'mem_pool', None)
        if pool is not None:
            # NOTE: the dtype is recorded as float32 regardless of the dtype
            # the pool was allocated with (unchanged from the prior behavior).
            state['mem_pool_pickle'] = {'shape': pool.shape,
                                        'dtype': np.float32}
            state['mem_pool'] = None

        return state

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        Arguments:
            state (dict): The contents of the backend class instance as
                          produced by __getstate__.  The mem_pool is not part
                          of it; if a 'mem_pool_pickle' entry is present, a
                          new pool of that shape and dtype is allocated.
        """
        self.__dict__.update(state)
        pool_info = getattr(self, 'mem_pool_pickle', None)
        if pool_info is None:
            # The instance was serialized before any mem_pool was allocated;
            # there is nothing to rebuild.
            self.mem_pool = None
        else:
            self.mem_pool = self.ng.empty(pool_info['shape'],
                                          dtype=pool_info['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocate the temporary-storage memory pool and keep it on the
        instance as self.mem_pool.

        Arguments:
            shape: Shape handed to self.ng.empty.
            dtype (dtype, optional): Element type handed to self.ng.empty.
        """
        pool = self.ng.empty(shape, dtype=dtype)
        self.mem_pool = pool

    def alloc_host_mem(self, shape, dtype):
        """
        Allocate a host buffer through drv.pagelocked_empty.

        Arguments:
            shape: Shape of the buffer.
            dtype: Element type of the buffer.

        Returns:
            The buffer returned by drv.pagelocked_empty (C order,
            mem_flags=0).
        """
        host_buf = drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0)
        return host_buf

    def create_stream(self):
        """
        Create a new stream object.

        Returns:
            A freshly constructed drv.Stream.
        """
        stream = drv.Stream()
        return stream

    def async_copy(self, dest, src, stream=None):
        """
        Issue a host-to-device copy via drv.memcpy_htod_async.

        Arguments:
            dest: Destination object; its gpudata attribute is used as the
                  copy target.
            src: Source buffer handed to drv.memcpy_htod_async.
            stream (optional): Stream handed to drv.memcpy_htod_async.
        """
        target = dest.gpudata
        drv.memcpy_htod_async(target, src, stream)

    def rng_init(self):
        """
        Seed numpy's global pseudo random number generator.  Random numbers
        are generated on the host using numpy, then transferred to device.

        The seed is taken from self.rng_seed when that attribute has been set
        on the instance; otherwise numpy is seeded with None.
        """
        if 'rng_seed' not in vars(self):
            seed = None
        else:
            seed = self.rng_seed
            logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Set up FLOP timing: create the start/end events and a FlopsDecorator
        that wraps the named calls to record elapsed time and number of
        operations.

        Arguments:
           decorate_fc (list): function names of the fully connected layer
                               forward/backward/update calls to time.
           decorate_conv (list): function names of the convolutional layer
                                 forward/backward/update calls to time.
           decorate_ew (list): function names of the element-wise calls to
                               time.

        Notes:
            Must be called before the FLOP timer is started for the first
            time.
        """
        self.start, self.end = drv.Event(), drv.Event()
        timer = FlopsDecorator(self)
        self.flop_timer = timer
        timer.decorate(decorate_fc=decorate_fc,
                       decorate_conv=decorate_conv,
                       decorate_ew=decorate_ew)

    def flop_timing_start(self):
        """
        Start a new FLOP timer by recording the start event.

        Returns:
            The result of self.start.record(); callers treat it as a dummy
            value.
        """
        return self.start.record()

    # Backward-compatible alias for the original, misspelled method name.
    flop_timinig_start = flop_timing_start

    def flop_timing_finish(self, start_time):
        """
        Complete the current FLOP timing.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: elapsed time between the start and end events, as reported
                   by the end event's time_since.
                   NOTE(review): pycuda documents Event.time_since in
                   milliseconds, not seconds -- confirm what callers expect.
        """
        stop_event = self.end
        stop_event.record()
        # Wait for the end event so that the elapsed time can be read.
        stop_event.synchronize()
        return stop_event.time_since(self.start)

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Draw uniform random numbers with numpy on the host and copy them into
        a new GPUTensor.

        Arguments:
            low (float, optional): Lower bound passed to np.random.uniform.
            high (float, optional): Upper bound passed to np.random.uniform.
            shape (int or tuple, optional): Shape of the sample.
            dtype (dtype, optional): Element type of the GPUTensor.  Passing
                                     None is not supported.
            persist_values (bool, optional): Accepted but not used here.
            name (string, optional): Name for the GPUTensor.
            allocator (optional): Memory allocator handed to GPUTensor.

        Returns:
            The result of GPUTensor.set on the host sample.
        """
        host_vals = np.random.uniform(low, high, shape)
        tensor = GPUTensor(host_vals.shape, dtype, allocator=allocator,
                           name=name, rounding=self.ng.round_mode)
        return tensor.set(host_vals)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Draw Gaussian/Normal random numbers with numpy on the host and copy
        them into a new GPUTensor.

        Arguments:
            loc (float, optional): Mean passed to np.random.normal.
            scale (float, optional): Standard deviation passed to
                                     np.random.normal.
            size (int or tuple, optional): Shape of the sample.
            dtype (dtype, optional): Element type of the GPUTensor.
            persist_values (bool, optional): Accepted but not used here.
            name (string, optional): Name for the GPUTensor.
            allocator (optional): Memory allocator handed to GPUTensor.

        Returns:
            The result of GPUTensor.set on the host sample.
        """
        host_vals = np.random.normal(loc, scale, size)
        tensor = GPUTensor(host_vals.shape, dtype, allocator=allocator,
                           name=name, rounding=self.ng.round_mode)
        return tensor.set(host_vals)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward propagate through a fully connected layer: the product of
        weights and inputs is written to out, giving the pre-activations.

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Either the dataset input values (first layer)
                                or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            layer (Layer): The layer object.  Not used here.
        """
        lhs, rhs = weights, inputs
        self.ng.dot(lhs, rhs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected layer: the
        product of the transposed weights and deltas is written to out.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer.
            layer (Layer): The layer object.  Not used here.
        """
        weights_t = weights.T
        self.ng.dot(weights_t, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Compute the gradient for a fully connected layer: the product of
        deltas and the transposed inputs is written to out.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Either the dataset input values (first layer)
                                or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer.
            layer (Layer): The layer object.  Not used here.
        """
        inputs_t = inputs.T
        self.ng.dot(deltas, inputs_t, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward propagate the inputs of a convolutional network layer to
        produce output pre-activations.  The work is delegated to
        NervanaGPU.fprop_conv; only out, inputs, weights and fpropbuf are
        forwarded, the remaining arguments are accepted for interface
        compatibility and not used here.

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Either the dataset input values (first layer)
                                or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            ofmshape (tuple): Dimensions of each output feature map.
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map.
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fpropbuf: Passed to NervanaGPU.fprop_conv as its `layer` argument.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default).

        Dimension naming used by the convolution kernels:
            N: Number of images in mini-batch
            C: Number of input feature maps
            K: Number of output feature maps
            D, H, W: Depth, height and width of input image
            T, R, S: Depth, height and width of filter kernel
        """
        conv_args = dict(layer=fpropbuf, I=inputs, F=weights, O=out,
                         alpha=1.0, repeat=1)
        self.ng.fprop_conv(**conv_args)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward propagate the error through a convolutional network layer.
        The work is delegated to NervanaGPU.bprop_conv; only out, weights,
        deltas and bpropbuf are forwarded, the remaining arguments are
        accepted for interface compatibility and not used here.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer.
            ofmshape (tuple): Dimensions of each output feature map.
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map.
            links (GPUTensor): Input receptive field indices.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field.
            stride (int): Number of neurons to shift the filter at each step.
            nifm (int): Total number of input feature maps.
            ngroups (int): Number of groups.
            bpropbuf: Passed to NervanaGPU.bprop_conv as its `layer` argument.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default).
        """
        conv_args = dict(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                         alpha=1.0, repeat=1)
        self.ng.bprop_conv(**conv_args)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Compute the updated gradient for a convolutional network layer.
        The work is delegated to NervanaGPU.update_conv; only out, inputs,
        deltas and updatebuf are forwarded, the remaining arguments are
        accepted for interface compatibility and not used here.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Either the dataset input values (first layer)
                                or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer.
            ofmshape (tuple): Dimensions of each output feature map.
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map.
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fwidth (int): Filter width.
            updatebuf: Passed to NervanaGPU.update_conv as its `layer`
                       argument.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default).
            layer (Layer): The layer object.
        """
        conv_args = dict(layer=updatebuf, I=inputs, E=deltas, grad_F=out,
                         alpha=1.0, repeat=1)
        self.ng.update_conv(**conv_args)

    def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape,
                   ifmshape, links, nifm, padding, stride, fpropbuf):
        """
        Forward propagate the inputs of a Pooling network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Only out, inputs, op and fpropbuf are used by this implementation; the
        remaining arguments are accepted for interface compatibility.

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            op (string): The type of pooling operation to apply
                         (case-insensitive).  Only "max" is handled by this
                         backend.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            fpropbuf: Passed to NervanaGPU.fprop_pool as its `layer` argument.

        Raises:
            AttributeError: if op is anything other than "max".
        """
        op = op.lower()
        if op != "max":
            # Interpolate the op into the message; passing it as a second
            # exception argument would leave the "%s" unformatted.
            raise AttributeError("unexpected pooling op type: %s" % op)
        self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1)

    def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize,
                   ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding,
                   stride, bpropbuf):
        """
        Backward propagate the error through a pooling network layer.

        Only out, inputs, deltas, op and bpropbuf are used by this
        implementation; the remaining arguments are accepted for interface
        compatibility.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            fouts (GPUTensor): Forward propagated outputs from the previous
                               layer.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            op (string): The type of pooling operation to apply
                         (case-insensitive).  Only "max" is handled by this
                         backend.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                              each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            fpsize (int): The size of each filter.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            bpropbuf: Passed to NervanaGPU.bprop_pool as its `layer` argument.

        Raises:
            AttributeError: if op is anything other than "max".
        """
        op = op.lower()
        if op != "max":
            # Interpolate the op into the message; passing it as a second
            # exception argument would leave the "%s" unformatted.
            raise AttributeError("unexpected pooling op type: %s" % op)
        self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas, grad_I=out,
                           repeat=1)

    def logistic(self, x, out):
        """
        Apply the logistic sigmoid nonlinearity, 1/(1+exp(-x)), writing the
        result to out.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        sigmoid = self.ng.sig
        sigmoid(x, out=out)
        return out

    def rectlin(self, x, out):
        """
        Apply the rectified linear nonlinearity (elementwise maximum of x and
        zero), writing the result to out.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        floor = 0.
        self.ng.maximum(x, floor, out=out)
        return out

    def rectleaky(self, x, slope, out):
        """
        Leaky rectified linear nonlinearity: the elementwise maximum of x and
        x*slope is assigned into out.

        Arguments:
            x (GPUTensor): Input tensor
            slope (numeric): Factor applied to x for the second operand.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out (returned for consistency with the
                       other activation methods of this backend)
        """
        out[:] = self.ng.maximum(x, x*slope)
        return out

    def rectleaky_derivative(self, x, slope, out):
        """
        Derivative of the leaky rectified linear nonlinearity:
        greater(x, 0) * (1 - slope) + slope is assigned into out.

        Arguments:
            x (GPUTensor): Input tensor
            slope (numeric): Slope used in the expression above.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out (returned for consistency with the
                       other activation methods of this backend)
        """
        out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope
        return out

    def sum(self, tsr, axes, out):
        """
        Sum the elements of a tensor along the specified axis.

        Arguments:
            tsr  (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed.  If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0]*shape[1] rows) and reduced along axis 0.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.sum(tsr, axis=axes, out=out)
            return out
        n_elems = tsr.shape[0]*tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.sum(column, axis=0, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Compute the arithmetic mean of the elements along the specified axis.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed.  If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0]*shape[1] rows) and reduced along axis 0.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.mean(tsr, axis=axes, out=out)
            return out
        n_elems = tsr.shape[0]*tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.mean(column, axis=0, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Compute the minimum of the elements along the specified axis.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed.  If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0]*shape[1] rows) and reduced along axis 0.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.min(tsr, axis=axes, out=out)
            return out
        n_elems = tsr.shape[0]*tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.min(column, axis=0, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Compute the maximum of the elements along the specified axis.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed.  If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0]*shape[1] rows) and reduced along axis 0.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.max(tsr, axis=axes, out=out)
            return out
        n_elems = tsr.shape[0]*tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.max(column, axis=0, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Compute the variance of the elements along the specified axes as the
        mean of the squared deviations from a caller-supplied mean.

        Arguments:
            tsr (GPUTensor): the tensor on which to compute the variance
            axes (int, list, optional): the dimension(s) along which to
                                        compute the variance; passed
                                        unchanged as the axis of the mean
                                        reduction.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the tensor containing the mean of tsr.  Required
                              despite the None default.

        Returns:
            GPUTensor: reference to out

        Raises:
            ValueError: if mean is None.
        """
        if mean is not None:
            squared_dev = self.ng.square(tsr-mean)
            self.ng.mean(squared_dev, axis=axes, out=out)
            return out
        logger.error("GPUTensor requires mean to be specified.")
        raise ValueError("mean not specified")

    def fabs(self, x, out):
        """
        Compute the absolute value of the elements in a tensor, writing the
        result to out.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        absolute = self.ng.fabs
        absolute(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Compute the square root of the elements in a tensor, writing the
        result to out.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        square_root = self.ng.sqrt
        square_root(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with zeros.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; not used by this
                                             implementation.

        Returns:
            GPUTensor: output
        """
        tensor = self.ng.zeros(shape, dtype=dtype)
        return tensor

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with ones.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; not used by this
                                             implementation.

        Returns:
            GPUTensor: output
        """
        tensor = self.ng.ones(shape, dtype=dtype)
        return tensor

    def empty(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor without initializing its contents.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; not used by this
                                             implementation.

        Returns:
            GPUTensor: output
        """
        tensor = self.ng.empty(shape, dtype=dtype)
        return tensor

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Allocate a new GPUTensor with the shape of the supplied numpy array
        and fill it with that array's contents.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype, optional): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; not used by this
                                             implementation.
            name (string): Name for the GPUTensor
            allocator (pycuda): Pycuda memory allocator

        Returns:
            GPUTensor: output
        """
        tensor = GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                           rounding=self.ng.round_mode)
        return tensor.set(ary)

    def add(self, left, right, out):
        """
        Compute left + right element by element and store the result in out.

        Arguments:
            left (GPUTensor, numeric): first operand.
            right (GPUTensor, numeric): second operand.
            out (GPUTensor): destination for the sum.

        Returns:
            GPUTensor: the same object passed in as out
        """
        elementwise_add = self.ng.add
        elementwise_add(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Compute left - right element by element and store the result in out.

        Arguments:
            left (GPUTensor, numeric): value to subtract from.
            right (GPUTensor, numeric): value being subtracted.
            out (GPUTensor): destination for the difference.

        Returns:
            GPUTensor: the same object passed in as out
        """
        elementwise_sub = self.ng.subtract
        elementwise_sub(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Compute left * right element by element and store the result in out.

        Arguments:
            left (GPUTensor, numeric): first factor.
            right (GPUTensor, numeric): second factor.
            out (GPUTensor): destination for the product.

        Returns:
            GPUTensor: the same object passed in as out
        """
        elementwise_mul = self.ng.multiply
        elementwise_mul(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Compute left / right element by element and store the result in out.

        Arguments:
            left (GPUTensor, numeric): numerator.
            right (GPUTensor, numeric): denominator.
            out (GPUTensor): destination for the quotient.

        Returns:
            GPUTensor: the same object passed in as out
        """
        elementwise_div = self.ng.divide
        elementwise_div(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Test left > right element by element and store the result in out.

        Arguments:
            left (GPUTensor, numeric): first operand of the comparison.
            right (GPUTensor, numeric): second operand of the comparison.
            out (GPUTensor): destination for the comparison result.

        Returns:
            GPUTensor: the same object passed in as out
        """
        compare_gt = self.ng.greater
        compare_gt(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Test left == right element by element and store the result in out.
        The operands are expected to share a shape, or to be broadcastable
        to a common one.

        Arguments:
            left (GPUTensor, numeric): first operand of the comparison.
            right (GPUTensor, numeric): second operand of the comparison.
            out (GPUTensor): destination for the comparison result.

        Returns:
            GPUTensor: the same object passed in as out
        """
        compare_eq = self.ng.equal
        compare_eq(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Test left != right element by element and store the result in out.

        Arguments:
            left (GPUTensor, numeric): first operand of the comparison.
            right (GPUTensor, numeric): second operand of the comparison.
            out (GPUTensor): destination for the comparison result.

        Returns:
            GPUTensor: the same object passed in as out
        """
        compare_ne = self.ng.not_equal
        compare_ne(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Limit every element of a to the range [a_min, a_max] and store the
        result in out.

        Arguments:
            a (GPUTensor): tensor to clip.
            a_min (float): lower bound.
            a_max (float): upper bound.
            out (GPUTensor): destination for the clipped values.

        Returns:
            GPUTensor: the same object passed in as out
        """
        clamp = self.ng.clip
        clamp(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Take the natural (base-e) logarithm of every element of a and store
        the result in out.

        Arguments:
            a (GPUTensor): tensor to transform.
            out (GPUTensor): destination for the logarithms.

        Returns:
            GPUTensor: the same object passed in as out
        """
        natural_log = self.ng.log
        natural_log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Take the hyperbolic tangent of every element of a and store the
        result in out.

        Arguments:
            a (GPUTensor): tensor to transform.
            out (GPUTensor): destination for the results.

        Returns:
            GPUTensor: the same object passed in as out
        """
        hyperbolic_tangent = self.ng.tanh
        hyperbolic_tangent(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Find the index of the largest value along the given axis and store
        the indices in out.  The original notes say that when several
        elements tie for the maximum, the first one is reported.

        Arguments:
            a (GPUTensor): The GPUTensor to search for maximum indices
            out (GPUTensor): Where to store the result.  Should be of the
                             appropriate type and expected shape
            axis (int): The dimension along which to search (default 0).
                        The original notes say None selects the overall
                        maximum of a flattened view of a.

        Returns:
            GPUTensor: the same object passed in as out
        """
        index_of_max = self.ng.argmax
        index_of_max(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity along axis 0:
        exp(x - max(x)) / sum_i exp(x_i - max(x_i))

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): destination for the softmax values.

        Returns:
            GPUTensor: the same object passed in as out
        """
        def shifted_exp():
            # exp of x after subtracting the maximum taken along axis 0.
            # Built afresh for each use, as two separate expressions.
            return self.ng.exp(x - self.ng.max(x, axis=0))

        # Reciprocal of the normalizing sum first, then the numerator.
        inverse_total = self.ng.reciprocal(self.ng.sum(shifted_exp(), axis=0))
        out[:] = inverse_total * shifted_exp()
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.  Not provided by this backend:
        the call always fails, and the error message directs callers to the
        shortcut formulation instead.

        Arguments:
            y (GPUTensor): input tensor (unused).
            err (GPUTensor): backpropagated error (unused).
            out (GPUTensor): where the result would be stored (unused).

        Raises:
            NotImplementedError: always.
        """
        # The former trailing `return out` could never execute and was
        # removed.
        raise NotImplementedError("Softmax gradient should use shortcut")

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Fill tsr with a binary mask for dropout layers by delegating to
        self.ng.dropout.  Nothing is returned; the mask is written into tsr.

        Arguments:
            tsr (GPUTensor): Output tensor receiving the mask
            keepthresh (float): fraction of ones
            dtype (dtype, optional): part of the backend interface; not
                                     referenced by this implementation.
        """
        generate_mask = self.ng.dropout
        generate_mask(out=tsr, keep=keepthresh)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Gradient descent step with momentum.

        The velocity becomes momentum_coef * velocity - learning_rate *
        gradient, and the parameters are then shifted by the new velocity.
        Both vs_item and ps_item are overwritten in place.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch; not referenced by this implementation.
        """
        decayed_velocity = vs_item * momentum_coef
        gradient_step = us_item * learning_rate
        vs_item[:] = decayed_velocity - gradient_step
        # Apply the freshly updated velocity to the parameters.
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Gradient descent step with momentum and weight decay.

        The velocity becomes
            momentum_coef * velocity - learning_rate * gradient
                                     - learning_rate * wd * params
        and the parameters are then shifted by the new velocity.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch; not referenced by this implementation.

        Outputs:
            vs_item, the updated velocity (overwritten in place).
            ps_item, the updated weights (overwritten in place).
            us_item is only read here.
        """
        momentum_term = vs_item * momentum_coef
        gradient_term = us_item * learning_rate
        decay_term = learning_rate * wd * ps_item
        vs_item[:] = momentum_term - gradient_term - decay_term
        # Apply the freshly updated velocity to the parameters.
        ps_item[:] = ps_item + vs_item

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item, ss_item,
                   rho, epsilon):
        """
        AdaDelta parameter update (Zeiler, http://arxiv.org/abs/1212.5701).
        All tensors marked "updated" are overwritten in place.

        Arguments:
            ps_item: weight / parameter (updated)
            us_item: update (gradient); only read
            gs_item: running average of the squared gradient (updated)
            ds_item: running average of the squared delta (updated)
            ls_item: the step applied to the parameters (updated)
            ss_item: scratch space; not referenced by this implementation
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        keep = 1.0 - rho

        # Running average of the squared gradient, E[Grad^2].
        gs_item[:] = gs_item * rho + keep * us_item * us_item

        # Step: -sqrt((E[Delt^2] + eps) / (E[Grad^2] + eps)) * gradient.
        rms_ratio = self.ng.sqrt((ds_item + epsilon) / (gs_item + epsilon))
        ls_item[:] = rms_ratio * (-1.0) * us_item

        # Running average of the squared step, E[Delt^2].
        ds_item[:] = ds_item * rho + keep * ls_item * ls_item

        # Move the parameters by the step.
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity, scratch_space,
                   gamma, epsilon, learning_rate, momentum_coef):
        """
        RMS-scaled gradient update, optionally with momentum.

        Arguments:
            params: parameters (updated in place)
            updates: gradient; only read
            run_squares: running average of the squared gradient (updated)
            velocity: velocity tensor; only updated when momentum_coef is
                      non-zero
            scratch_space: not referenced by this implementation
            gamma: decay constant of the running average
            epsilon: small constant added to the root of run_squares for
                     stability
            learning_rate: learning rate
            momentum_coef: momentum coefficient; 0 selects the plain update
        """
        # Refresh the running average of squared gradients.
        run_squares[:] = gamma * run_squares + (1. - gamma) * updates * updates

        # Gradient scaled by lr / (sqrt(run_squares) + epsilon).
        inverse_rms = self.ng.reciprocal(self.ng.sqrt(run_squares) + epsilon)
        scaled_step = learning_rate * updates * inverse_rms

        if momentum_coef == 0:
            params[:] = params - scaled_step
            return

        velocity[:] = velocity * momentum_coef - scaled_step
        params[:] = params + velocity

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xvar, xhat, out):
        """
        Forward pass of batch normalization, expressed as three compound
        assignments (described upstream as running in 3 kernel calls).

        Arguments:
            inputs: input data to be normalized (statistics along axis 1)
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xvar: updated; receives 1 / sqrt(var(inputs) + eps), i.e. the
                  reciprocal standard deviation rather than the variance
            xhat: normalized input (updated)
            out: normalized and rescaled input (updated)

        Returns:
            the same object passed in as out
        """
        variance = self.ng.var(inputs, axis=1)
        xvar[:] = self.ng.reciprocal(self.ng.sqrt(variance + eps))

        centered = inputs - self.ng.mean(inputs, axis=1)
        xhat[:] = xvar * centered

        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Backward pass of batch normalization, expressed as four compound
        assignments (described upstream as running with 4 kernel calls).
        Nothing is returned; results are written into the arguments.

        Arguments:
            xhat: normalized input data (overwritten as scratch)
            error: backpropagated deltas (overwritten with the result)
            xvar: value precomputed by the forward pass; only read
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        # Width of axis 1, read before xhat is overwritten (shape is fixed).
        n_cols = float(xhat.shape[1])

        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)

        # xhat is reused to hold the term subtracted from the error.
        xhat[:] = (xhat * gamma_updates + beta_updates) / n_cols
        error[:] = xvar * gamma * (error - xhat)
Esempio n. 7
0
cpuI[-1,:] = 0.0

# cpu output arrays
cpuO = np.zeros(dimO,             dtype=np.float32)
cpuB = np.zeros(slicable(dimI,1), dtype=np.float32)
cpuU = np.zeros(slicable(dimF),   dtype=np.float32)

# give gpu the input array without zero padding (not needed)
devI = ng.array(cpuI[:-1,:].reshape(dimI), dtype=dtype)
devF = ng.array(cpuF.reshape(dimF), dtype=dtype)
devE = ng.array(cpuE, dtype=dtype)

devO = devB = devU = 0

if "fprop"  in ops:
    devO = ng.empty(dimO, dtype=dtype)
    ng.fprop_conv(conv,  devI, devF, devO, alpha=1.0, repeat=repeat)

if "bprop"  in ops:
    devB = ng.empty(dimI, dtype=dtype)
    ng.bprop_conv(conv,  devF, devE, devB, alpha=1.0, repeat=repeat)

if "update" in ops:
    devU = ng.empty(dimF, dtype=dtype)
    ng.update_conv(conv, devI, devE, devU, alpha=1.0, repeat=repeat)


def pixel_indices(mt, pr, qs):

    T,R,S = conv.TRS
    D,H,W = conv.DHW
Esempio n. 8
0
    N,C,K = conv.NCK
    D,H,W = conv.DHW
    T,R,S = conv.TRS
    M,P,Q = conv.MPQ
    pad_d, pad_h, pad_w = conv.padding
    str_d, str_h, str_w = conv.strides
    alpha, beta = (1.0, 0.0)

    dimI = conv.dimI2
    dimF = conv.dimF2
    dimO = conv.dimO2

    print "cudnn:"

    cuI = ng.empty(dimI[::-1], dtype=np.float32)
    cuF = ng.empty(dimF[::-1], dtype=np.float32)
    cuE = ng.empty(dimO[::-1], dtype=np.float32)
    cuB = ng.empty(dimI[::-1], dtype=np.float32)
    cuU = ng.empty(dimF[::-1], dtype=np.float32)
    cuO = ng.empty(dimO[::-1], dtype=np.float32)
    cuI[:] = 2 * (.5 - ng.rand())
    cuF[:] = 2 * (.5 - ng.rand())
    cuE[:] = 2 * (.5 - ng.rand())

    #print drv.mem_get_info()

    I_data = ctypes.c_void_p(int(cuI.gpudata))
    F_data = ctypes.c_void_p(int(cuF.gpudata))
    O_data = ctypes.c_void_p(int(cuO.gpudata))
    E_data = ctypes.c_void_p(int(cuE.gpudata))
Esempio n. 9
0
    dimO = (X, N, K)

if ones:
    cpuI = np.ones(dimI, dtype=np.float32)
    cpuE = np.ones(dimO, dtype=np.float32)
    cpuW = np.ones(dimW, dtype=np.float32)
else:
    cpuI = np.random.uniform(-1.0, 1.0, dimI).astype(dtype).astype(np.float32)
    cpuE = np.random.uniform(-1.0, 1.0, dimO).astype(dtype).astype(np.float32)
    cpuW = np.random.uniform(-1.0, 1.0, dimW).astype(dtype).astype(np.float32)

devI = ng.array(cpuI, dtype=dtype)
devE = ng.array(cpuE, dtype=dtype)
devW = ng.array(cpuW, dtype=dtype)

devO = ng.empty(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)
devU = ng.empty(dimW, dtype=dtype)

if Nin:
    ng.batched_dot(devW, devI, devO, repeat=repeat, size=size)  # fprop
    ng.batched_dot(devW.T, devE, devB, repeat=repeat, size=size)  # bprop
    ng.batched_dot(devE, devI.T, devU, repeat=repeat, size=size)  # update
else:
    ng.batched_dot(devI, devW.T, devO, repeat=repeat, size=size)  # fprop
    ng.batched_dot(devE, devW, devB, repeat=repeat, size=size)  # bprop
    ng.batched_dot(devE.T, devI, devU, repeat=repeat, size=size)  # update

if cpu:

    cpuO = np.empty(dimO, dtype=np.float32)
Esempio n. 10
0
    dimO = (X,N,K)

if ones:
    cpuI = np.ones(dimI, dtype=np.float32)
    cpuE = np.ones(dimO, dtype=np.float32)
    cpuW = np.ones(dimW, dtype=np.float32)
else:
    cpuI = np.random.uniform(-1.0, 1.0, dimI).astype(dtype).astype(np.float32)
    cpuE = np.random.uniform(-1.0, 1.0, dimO).astype(dtype).astype(np.float32)
    cpuW = np.random.uniform(-1.0, 1.0, dimW).astype(dtype).astype(np.float32)

devI = ng.array(cpuI, dtype=dtype)
devE = ng.array(cpuE, dtype=dtype)
devW = ng.array(cpuW, dtype=dtype)

devO = ng.empty(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)
devU = ng.empty(dimW, dtype=dtype)

if Nin:
    ng.batched_dot(devW,   devI,   devO, repeat=repeat, size=size) # fprop
    ng.batched_dot(devW.T, devE,   devB, repeat=repeat, size=size) # bprop
    ng.batched_dot(devE,   devI.T, devU, repeat=repeat, size=size) # update
else:
    ng.batched_dot(devI,   devW.T, devO, repeat=repeat, size=size) # fprop
    ng.batched_dot(devE,   devW,   devB, repeat=repeat, size=size) # bprop
    ng.batched_dot(devE.T, devI,   devU, repeat=repeat, size=size) # update

if cpu:

    cpuO = np.empty(dimO, dtype=np.float32)
Esempio n. 11
0
            layers.append(layer)

            # find the size of the largest buffers so they can be shared
            if layer.sizeF > max_weights:
                max_weights = layer.sizeF
                max_weight_layer = layer

            if layer.sizeO > max_deltas:
                max_deltas = layer.sizeO
                max_delta_layer = layer

        # for layer in sorted(layers, key=lambda l: l.sizeO, reverse=True):
        #     print("%d %s" % (layer.sizeO, layer))

        # Init shared buffers (assumes consistent dtype for now)
        shared_deltas[0] = ng.empty(max_delta_layer.dimO2,  dtype=max_delta_layer.dtype)
        shared_deltas[1] = ng.empty(max_delta_layer.dimO2,  dtype=max_delta_layer.dtype)
        shared_weights   = ng.empty(max_weight_layer.dimF2, dtype=max_weight_layer.dtype)

        prev_layer = None
        delta = False
        for layer in layers:

            print(layer)

            # Initialize buffers.  Alternate shared delta buffer.
            # One layer can't have the same buffer for both error in and error out.
            layer.init_activations()
            layer.init_weights(shared=shared_weights)
            layer.init_deltas(shared=shared_deltas[delta])
Esempio n. 12
0
            # find the size of the largest buffers so they can be shared
            if layer.sizeF > max_weights:
                max_weights = layer.sizeF
                max_weight_layer = layer

            if layer.sizeI > max_deltas and type(prev_layer) is not DataLayer:
                max_deltas = layer.sizeI
                max_delta_layer = layer

            prev_layer = layer
            layers.append(layer)

        # Init shared buffers (assumes consistent dtype for now)
        shared_deltas.append(
            ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))
        shared_deltas.append(
            ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))
        if inception:
            shared_deltas.append(
                ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))
            shared_deltas.append(
                ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))

        shared_updates = ng.empty(max_weight_layer.dimF, dtype=np.float32)

        for i, layer in enumerate(layers):
            if verbose:
                print(layer)

            # Initialize buffers.  Alternate shared delta buffer.
Esempio n. 13
0
cpuI[-1,:] = 0.0

# cpu output arrays
cpuO = np.zeros(dimO,             dtype=np.float32)
cpuB = np.zeros(slicable(dimI,1), dtype=np.float32)
cpuU = np.zeros(slicable(dimF),   dtype=np.float32)

# give gpu the input array without zero padding (not needed)
devI = ng.array(cpuI[:-1,:].reshape(dimI), dtype=dtype)
devF = ng.array(cpuF.reshape(dimF), dtype=dtype)
devE = ng.array(cpuE, dtype=dtype)

devO = devB = devU = 0

if "fprop"  in ops:
    devO = ng.empty(dimO, dtype=dtype)
    ng.fprop_conv(conv,  devI, devF, devO, alpha=1.0, repeat=repeat)

if "bprop"  in ops:
    devB = ng.empty(dimI, dtype=dtype)
    ng.bprop_conv(conv,  devF, devE, devB, alpha=1.0, repeat=repeat)

if "update" in ops:
    devU = ng.empty(dimF, dtype=dtype)
    ng.update_conv(conv, devI, devE, devU, alpha=1.0, repeat=repeat)


def pixel_indices(mt, pr, qs):

    T,R,S = conv.TRS
    D,H,W = conv.DHW
Esempio n. 14
0
    N, C, K = conv.NCK
    D, H, W = conv.DHW
    T, R, S = conv.TRS
    M, P, Q = conv.MPQ
    pad_d, pad_h, pad_w = conv.padding
    str_d, str_h, str_w = conv.strides
    alpha, beta = (1.0, 0.0)

    dimI = conv.dimI2
    dimF = conv.dimF2
    dimO = conv.dimO2

    print "cudnn:"

    cuI = ng.empty(dimI[::-1], dtype=np.float32)
    cuF = ng.empty(dimF[::-1], dtype=np.float32)
    cuE = ng.empty(dimO[::-1], dtype=np.float32)
    cuB = ng.empty(dimI[::-1], dtype=np.float32)
    cuU = ng.empty(dimF[::-1], dtype=np.float32)
    cuO = ng.empty(dimO[::-1], dtype=np.float32)
    cuI[:] = 2 * (.5 - ng.rand())
    cuF[:] = 2 * (.5 - ng.rand())
    cuE[:] = 2 * (.5 - ng.rand())

    #print drv.mem_get_info()

    I_data = ctypes.c_void_p(int(cuI.gpudata))
    F_data = ctypes.c_void_p(int(cuF.gpudata))
    O_data = ctypes.c_void_p(int(cuO.gpudata))
    E_data = ctypes.c_void_p(int(cuE.gpudata))
Esempio n. 15
0
cpuI = np.random.uniform(0.0, 9.4, slicable(dimI,1)).astype(np.float16).astype(np.float32)

# zero pad the last row of cpu input for the sake of numpy
if pool.op == "max":
    cpuI[-1,:] = np.finfo(cpuI.dtype).min
else:
    cpuI[-1,:] = 0

# cpu output arrays
cpuO = np.empty(dimO, dtype=np.float32)
cpuB = np.zeros(slicable(dimI,1), dtype=np.float32)

# give gpu the input array without zero padding (not needed)
devI = ng.array(cpuI[:-1,:].reshape(dimI), dtype=dtype)
devO = ng.zeros(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)

ng.fprop_pool(pool, devI, devO, repeat=repeat)

ng.bprop_pool(pool, devI, devO, devB, repeat=repeat)

def pixel_indices(kj, mt, pr, qs):

    C       = pool.C
    J,T,R,S = pool.JTRS
    D,H,W = pool.DHW
    HW    = H*W
    DHW   = D*H*W
    imax  = C*D*H*W
    idx   = []
Esempio n. 16
0
class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.
    Note that some functions defined in the generic Backend class, such as
    cross-map pooling, normalization and adaDelta, are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        """
        Create the wrapped NervanaGPU object, seed the random number
        generator and record the device id.

        Arguments:
            rng_seed: seed stored on the instance and used by rng_init
            stochastic_round (bool, optional): forwarded to NervanaGPU
            device_id (int, optional): stored on the instance; None is
                                       treated as 0
        """
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)

        # rng_init reads self.rng_seed, so store the seed first.
        self.rng_seed = rng_seed
        self.rng_init()

        if device_id is None:
            self.device_id = 0
        else:
            self.device_id = device_id

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this class.

        The memory pool lives on the device and cannot be serialized, so the
        returned state carries a `mem_pool_pickle` description (shape and
        dtype) from which __setstate__ can allocate a new pool, and has
        `mem_pool` set to None.

        The state is built on a shallow copy of the instance dictionary, so
        the live instance keeps its memory pool.  Previously pickling set
        self.mem_pool to None on the object being pickled.

        Returns:
            dict: copy of self.__dict__ with the mem_pool replaced by its
                  description.
        """
        state = self.__dict__.copy()
        pool = getattr(self, 'mem_pool', None)
        if pool is not None:
            # NOTE(review): dtype is recorded as float32 regardless of the
            # pool's actual dtype, as before -- confirm this is intended.
            state['mem_pool_pickle'] = {'shape': pool.shape,
                                        'dtype': np.float32}
            state['mem_pool'] = None

        return state

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        The instance dictionary is restored from state.  When the state
        carries a `mem_pool_pickle` description (written by __getstate__ for
        backends that had a memory pool), a new pool of that shape and dtype
        is allocated through self.ng.  A state without that description is
        restored as-is; previously this raised AttributeError.

        Arguments:
            state (dict): The contents of the backend class instance, except
                          for the mem_pool which is on device and cannot be
                          serialized.
        """
        self.__dict__.update(state)
        pool_spec = self.__dict__.get('mem_pool_pickle')
        if pool_spec is not None:
            self.mem_pool = self.ng.empty(pool_spec['shape'],
                                          dtype=pool_spec['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocate the memory pool used for temporary storage and keep it on
        the instance as self.mem_pool.

        Arguments:
            shape (tuple): shape of the pool tensor
            dtype (dtype, optional): datatype of the pool tensor
        """
        pool = self.ng.empty(shape, dtype=dtype)
        self.mem_pool = pool

    def alloc_host_mem(self, shape, dtype):
        """
        Allocate a page-locked host buffer through drv.pagelocked_empty,
        requesting C ordering and mem_flags of 0.

        Arguments:
            shape: shape of the host buffer
            dtype: datatype of the host buffer

        Returns:
            the buffer returned by drv.pagelocked_empty
        """
        host_buffer = drv.pagelocked_empty(shape, dtype, order="C",
                                           mem_flags=0)
        return host_buffer

    def create_stream(self):
        """
        Create a new stream object.

        Returns:
            a freshly constructed drv.Stream
        """
        stream = drv.Stream()
        return stream

    def async_copy(self, dest, src, stream=None):
        """
        Start a host-to-device copy of src into dest via
        drv.memcpy_htod_async.  Nothing is returned.

        Arguments:
            dest: destination; its gpudata attribute is the copy target
            src: source host buffer
            stream (optional): stream passed through to the copy call
        """
        device_target = dest.gpudata
        drv.memcpy_htod_async(device_target, src, stream)

    def rng_init(self):
        """
        Initialize and seed the pseudo random number generator.  Random
        numbers are generated on the host using numpy, then transferred to
        the device.

        When the instance has its own rng_seed attribute that value is
        logged and used as the seed; otherwise numpy is seeded with None.
        """
        if 'rng_seed' not in self.__dict__:
            np.random.seed(None)
            return

        seed = self.rng_seed
        logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Set up FLOP timing.  Creates the start/end events used by the timing
        calls and a FlopsDecorator that wraps the named MOP calls to record
        elapsed time and number of operations.

        Arguments:
           decorate_fc (list): names of the fully connected layer
                               forward/backward/update calls to time.
           decorate_conv (list): names of the convolutional layer
                                 forward/backward/update calls to time.
           decorate_ew (list): names of the element-wise calls to time.

        Notes:
            Must be called before the first timing start call.
        """
        self.start = drv.Event()
        self.end = drv.Event()

        timer = FlopsDecorator(self)
        self.flop_timer = timer
        timer.decorate(decorate_fc=decorate_fc,
                       decorate_conv=decorate_conv,
                       decorate_ew=decorate_ew)

    def flop_timinig_start(self):
        """
        Start a new FLOP timer by recording the start event.

        NOTE(review): the method name is spelled `flop_timinig_start` (sic)
        while neighbouring docstrings refer to `flop_timing_start`; the name
        is kept because callers may depend on it.

        Returns:
            the result of self.start.record() (a dummy value, not used)
        """
        start_event = self.start
        return start_event.record()

    def flop_timing_finish(self, start_time):
        """
        Complete the current FLOP timing: record the end event, wait for it,
        and measure the time elapsed since the start event.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: self.end.time_since(self.start).  NOTE(review): pycuda
                   documents Event.time_since in milliseconds, although
                   this method was previously documented as returning
                   seconds -- confirm what callers expect.
        """
        stop_event = self.end
        stop_event.record()
        # Block until the end event has actually occurred.
        stop_event.synchronize()
        elapsed = stop_event.time_since(self.start)
        return elapsed

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Draw uniform random numbers on the host with numpy and convert them
        to a GPUTensor via self.ng.array.
        If called with dtype=None it will probably explode.

        Arguments:
            low (float): lower bound passed to np.random.uniform
            high (float): upper bound passed to np.random.uniform
            shape: shape of the sample
            dtype (dtype, optional): datatype of the resulting tensor
            persist_values, allocator: part of the backend interface; not
                                       referenced by this implementation
            name (string, optional): name for the resulting tensor

        Returns:
            the tensor returned by self.ng.array
        """
        host_sample = np.random.uniform(low, high, shape)
        device_sample = self.ng.array(host_sample, dtype=dtype, name=name)
        return device_sample

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Draw Gaussian/Normal random numbers on the host with numpy and
        convert them to a GPUTensor via self.ng.array.

        Arguments:
            loc (float): mean passed to np.random.normal
            scale (float): standard deviation passed to np.random.normal
            size: shape of the sample
            dtype (dtype, optional): datatype of the resulting tensor
            persist_values, allocator: part of the backend interface; not
                                       referenced by this implementation
            name (string, optional): name for the resulting tensor

        Returns:
            the tensor returned by self.ng.array
        """
        host_sample = np.random.normal(loc, scale, size)
        device_sample = self.ng.array(host_sample, dtype=dtype, name=name)
        return device_sample

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward pass of a fully connected layer: out = weights . inputs,
        giving the pre-activations that an activation function will later
        transform.  Nothing is returned; the result is written into out.

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Either the dataset input values (first
                                layer), or the outputs from the previous
                                layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            layer (Layer): The layer object; not referenced here.
        """
        matmul = self.ng.dot
        matmul(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward pass of a fully connected layer: out = weights.T . deltas.
        Nothing is returned; the result is written into out.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object; not referenced here.
        """
        weights_transposed = weights.T
        self.ng.dot(weights_transposed, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Weight gradient of a fully connected layer: out = deltas . inputs.T.
        Nothing is returned; the result is written into out.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Either the dataset input values (first
                                layer), or the outputs from the previous
                                layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object; not referenced here.
        """
        inputs_transposed = inputs.T
        self.ng.dot(deltas, inputs_transposed, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward pass of a convolutional layer, producing output
        pre-activations (ready for transformation by an activation
        function).  The work is delegated to self.ng.fprop_conv with
        alpha=1.0 and repeat=1; nothing is returned.

        Only out, inputs, weights and fpropbuf are referenced by this
        implementation.  The remaining arguments belong to the generic
        backend signature.

        Arguments:
            out (GPUTensor): Where to store the forward propagated results
                             (passed as O).
            inputs (GPUTensor): Either the dataset input values (first
                                layer), or the outputs from the previous
                                layer (passed as I).
            weights (GPUTensor): The weight coefficient values for this
                                 layer (passed as F).
            ofmshape (tuple): Dimensions of each output feature map.
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map.
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along
                           each dimension of each local receptive field.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fpropbuf: Passed to self.ng.fprop_conv as its `layer` argument.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default).
        """
        # Dimension naming used by the underlying convolution call:
        #   N: Number of images in mini-batch
        #   C: Number of input feature maps
        #   K: Number of output feature maps
        #
        #   D: Depth  of input image
        #   H: Height of input image
        #   W: Width  of input image
        #
        #   T: Depth  of filter kernel
        #   R: Height of filter kernel
        #   S: Width  of filter kernel
        call_args = dict(layer=fpropbuf, I=inputs, F=weights, O=out,
                         alpha=1.0, repeat=1)
        self.ng.fprop_conv(**call_args)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward pass of a convolutional layer.  The work is delegated to
        self.ng.bprop_conv with alpha=1.0 and repeat=1; nothing is returned.

        Only out, weights, deltas and bpropbuf are referenced by this
        implementation.  The remaining arguments belong to the generic
        backend signature.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors
                             (passed as grad_I).
            weights (GPUTensor): The weight coefficient values for this
                                 layer (passed as F).
            deltas (GPUTensor): The error values for this layer (passed as
                                E).
            ofmshape (tuple): Dimensions of each output feature map
                              (typically height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map
                              (typically height and width).
            links (GPUTensor): Input receptive field indices.
            padding (int): Number of additional elements to include along
                           each dimension of each local receptive field.
            stride (int): Number of neurons to shift the filter at each step.
            nifm (int): Total number of input feature maps.
            ngroups (int): Number of groups.
            bpropbuf: Passed to self.ng.bprop_conv as its `layer` argument.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default).
        """
        call_args = dict(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                         alpha=1.0, repeat=1)
        self.ng.bprop_conv(**call_args)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Compute the weight gradient of a convolutional layer.

        Only ``out``, ``inputs``, ``deltas`` and ``updatebuf`` are handed to
        ``self.ng.update_conv``.  Every other argument is accepted so the
        signature lines up with the other backends, but is not read here.

        Arguments:
            out (GPUTensor): Destination for the gradient (forwarded as
                             ``grad_F``).
            inputs (GPUTensor): Inputs seen by the layer during fprop
                                (forwarded as ``I``).
            weights (GPUTensor): Not read by this method.
            deltas (GPUTensor): Error values for this layer (forwarded as
                                ``E``).
            ofmshape (tuple): Not read by this method.
            ofmsize (int): Not read by this method.
            ofmlocs (GPUTensor): Not read by this method.
            ifmshape (tuple): Not read by this method.
            links (GPUTensor): Not read by this method.
            nifm (int): Not read by this method.
            padding (int): Not read by this method.
            stride (int): Not read by this method.
            ngroups (int): Not read by this method.
            fwidth (int): Not read by this method.
            updatebuf: Forwarded as the ``layer`` argument of the kernel
                       call.  NOTE(review): presumably the convolution layer
                       descriptor rather than a scratch tensor -- confirm
                       against the caller.
            local (bool, optional): Not read by this method.
            layer (Layer): Not read by this method.
        """
        kernel_args = dict(layer=updatebuf, I=inputs, E=deltas, grad_F=out,
                           alpha=1.0, repeat=1)
        self.ng.update_conv(**kernel_args)

    def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape,
                   ifmshape, links, nifm, padding, stride, fpropbuf):
        """
        Forward propagate the inputs of a Pooling network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Only ``out``, ``inputs``, ``op`` and ``fpropbuf`` are read here; the
        remaining arguments are accepted for interface compatibility.

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            op (string): The type of pooling operation to apply.  Matched
                         case-insensitively; only "max" is implemented by
                         this backend.
            ofmshape (tuple): Not read by this method.
            ofmsize (int): Not read by this method.
            ofmlocs (GPUTensor): Not read by this method.
            fshape (tuple): Not read by this method.
            ifmshape (tuple): Not read by this method.
            links (GPUTensor): Not read by this method.
            nifm (int): Not read by this method.
            padding (int): Not read by this method.
            stride (int): Not read by this method.
            fpropbuf: Forwarded as the ``layer`` argument of the kernel call.
                      NOTE(review): presumably the pooling layer descriptor
                      -- confirm against the caller.

        Raises:
            AttributeError: if ``op`` is anything other than "max".
        """
        op = op.lower()
        if op != "max":
            # Interpolate the op into the message; passing it as a second
            # exception argument would leave the "%s" placeholder unformatted.
            raise AttributeError("unexpected pooling op type: %s" % op)
        self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1)

    def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize,
                   ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding,
                   stride, bpropbuf):
        """
        Backward propagate the error through a pooling network layer.

        Only ``out``, ``inputs``, ``deltas``, ``op`` and ``bpropbuf`` are read
        here; the remaining arguments are accepted for interface
        compatibility.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors
                             (forwarded as ``grad_I``).
            fouts (GPUTensor): Not read by this method.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer
                                (forwarded as ``I``).
            deltas (GPUTensor): The error values for this layer (forwarded as
                                ``E``).
            op (string): The type of pooling operation to apply.  Matched
                         case-insensitively; only "max" is implemented by
                         this backend.
            ofmshape (tuple): Not read by this method.
            ofmsize (int): Not read by this method.
            ofmlocs (GPUTensor): Not read by this method.
            fshape (tuple): Not read by this method.
            fpsize (int): Not read by this method.
            ifmshape (tuple): Not read by this method.
            links (GPUTensor): Not read by this method.
            nifm (int): Not read by this method.
            padding (int): Not read by this method.
            stride (int): Not read by this method.
            bpropbuf: Forwarded as the ``layer`` argument of the kernel call.
                      NOTE(review): presumably the pooling layer descriptor
                      -- confirm against the caller.

        Raises:
            AttributeError: if ``op`` is anything other than "max".
        """
        op = op.lower()
        if op != "max":
            # Interpolate the op into the message; passing it as a second
            # exception argument would leave the "%s" placeholder unformatted.
            raise AttributeError("unexpected pooling op type: %s" % op)
        self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas, grad_I=out,
                           repeat=1)

    def logistic(self, x, out):
        """
        Apply the logistic sigmoid, 1/(1+exp(-x)), elementwise.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Tensor that receives the result

        Returns:
            GPUTensor: reference to out
        """
        sigmoid = self.ng.sig
        sigmoid(x, out=out)
        return out

    def rectlin(self, x, out):
        """
        Apply the rectified linear nonlinearity, max(x, 0), elementwise.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Tensor that receives the result

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.maximum(x, 0., out=out)
        return out

    def rectleaky(self, x, slope, out):
        """
        Leaky rectified linear nonlinearity: elementwise max(x, x * slope).

        Arguments:
            x (GPUTensor): Input tensor
            slope (float): Factor applied to x for the second operand
            out (GPUTensor): Tensor that receives the result
        """
        scaled = x * slope
        out[:] = self.ng.maximum(x, scaled)

    def rectleaky_derivative(self, x, slope, out):
        """
        Derivative of the leaky rectifier: 1 where x > 0, slope elsewhere.

        Arguments:
            x (GPUTensor): Input tensor
            slope (float): Value written where x is not greater than zero
            out (GPUTensor): Tensor that receives the result
        """
        is_positive = self.ng.greater(x, 0)
        out[:] = is_positive * (1.0 - slope) + slope

    def sum(self, tsr, axes, out):
        """
        Sum the elements of a tensor along an axis, or over all of it.

        Arguments:
            tsr  (GPUTensor): Input tensor
            axes (int): Axis to reduce over.  When None, the tensor is
                        reshaped to a single column (using its first two
                        dimensions) and reduced to one value.
            out (GPUTensor): Tensor that receives the result

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.sum(tsr, axis=axes, out=out)
            return out
        # No axis given: view the tensor as one long column and reduce it.
        n_elems = tsr.shape[0] * tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.sum(column, axis=0, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Arithmetic mean of a tensor along an axis, or over all of it.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis to reduce over.  When None, the tensor is
                        reshaped to a single column (using its first two
                        dimensions) and reduced to one value.
            out (GPUTensor): Tensor that receives the result

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.mean(tsr, axis=axes, out=out)
            return out
        # No axis given: view the tensor as one long column and reduce it.
        n_elems = tsr.shape[0] * tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.mean(column, axis=0, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Minimum of a tensor along an axis, or over all of it.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis to reduce over.  When None, the tensor is
                        reshaped to a single column (using its first two
                        dimensions) and reduced to one value.
            out (GPUTensor): Tensor that receives the result

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.min(tsr, axis=axes, out=out)
            return out
        # No axis given: view the tensor as one long column and reduce it.
        n_elems = tsr.shape[0] * tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.min(column, axis=0, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Maximum of a tensor along an axis, or over all of it.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis to reduce over.  When None, the tensor is
                        reshaped to a single column (using its first two
                        dimensions) and reduced to one value.
            out (GPUTensor): Tensor that receives the result

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.max(tsr, axis=axes, out=out)
            return out
        # No axis given: view the tensor as one long column and reduce it.
        n_elems = tsr.shape[0] * tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.max(column, axis=0, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Variance of a tensor along the given axes, using a supplied mean.

        The result is the mean of the squared deviations ``tsr - mean``.

        Arguments:
            tsr (GPUTensor): the tensor whose variance is computed
            axes (int, list, optional): passed through as the ``axis`` of the
                                        mean reduction.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the mean of tsr.  Although it defaults to None
                              in the signature, it must be supplied.

        Returns:
            GPUTensor: reference to out

        Raises:
            ValueError: if mean is None.
        """
        if mean is None:
            logger.error("GPUTensor requires mean to be specified.")
            raise ValueError("mean not specified")
        squared_dev = self.ng.square(tsr - mean)
        self.ng.mean(squared_dev, axis=axes, out=out)
        return out

    def fabs(self, x, out):
        """
        Elementwise absolute value.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Tensor that receives the result

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.fabs(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Elementwise square root.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Tensor that receives the result

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.sqrt(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Create a GPUTensor of the requested shape filled with zeros.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; it is not
                                             forwarded to the underlying
                                             allocation call.

        Returns:
            GPUTensor: output
        """
        tensor = self.ng.zeros(shape, dtype=dtype)
        return tensor

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Create a GPUTensor of the requested shape filled with ones.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; it is not
                                             forwarded to the underlying
                                             allocation call.

        Returns:
            GPUTensor: output
        """
        tensor = self.ng.ones(shape, dtype=dtype)
        return tensor

    def empty(self, shape, dtype=default_dtype, persist_values=True):
        """
        Create a GPUTensor of the requested shape without filling it.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; it is not
                                             forwarded to the underlying
                                             allocation call.

        Returns:
            GPUTensor: output
        """
        tensor = self.ng.empty(shape, dtype=dtype)
        return tensor

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Create a GPUTensor holding the contents of a numpy array.

        Only ``ary``, ``dtype`` and ``name`` are forwarded to the underlying
        ``self.ng.array`` call.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype, optional): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; not forwarded.
            name (string): Name for the GPUTensor
            allocator (pycuda): Pycuda memory allocator.  Accepted for
                                interface compatibility; not forwarded.

        Returns:
            GPUTensor: output
        """
        tensor = self.ng.array(ary, dtype=dtype, name=name)
        return tensor

    def add(self, left, right, out):
        """
        Elementwise sum of two operands.

        Arguments:
            left (GPUTensor, numeric): first operand.
            right (GPUTensor, numeric): second operand.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.add(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Elementwise difference, left minus right.

        Arguments:
            left (GPUTensor, numeric): operand subtracted from.
            right (GPUTensor, numeric): operand being subtracted.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.subtract(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Elementwise product of two operands.

        Arguments:
            left (GPUTensor, numeric): first operand.
            right (GPUTensor, numeric): second operand.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.multiply(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Elementwise quotient, left divided by right.

        Arguments:
            left (GPUTensor, numeric): numerator operand.
            right (GPUTensor, numeric): denominator operand.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.divide(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Elementwise test of left > right.

        Arguments:
            left (GPUTensor, numeric): first operand of the comparison.
            right (GPUTensor, numeric): second operand of the comparison.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.greater(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Elementwise test of left == right, written into out.  The operands
        are expected to share a shape (or be broadcastable to one).

        Arguments:
            left (GPUTensor, numeric): first operand of the comparison.
            right (GPUTensor, numeric): second operand of the comparison.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.equal(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Elementwise test of left != right.

        Arguments:
            left (GPUTensor, numeric): first operand of the comparison.
            right (GPUTensor, numeric): second operand of the comparison.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.not_equal(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Limit every element of a tensor to the range [a_min, a_max].

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): lower bound.
            a_max (float): upper bound.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.clip(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Elementwise natural (base-e) logarithm.

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Elementwise hyperbolic tangent.

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Find the indices of the largest values of a tensor along an axis.

        Arguments:
            a (GPUTensor): The GPUTensor in which to look for maxima
            out (GPUTensor): Where to store the result.  Should be of the
                             appropriate type and expected shape
            axis (int): The dimension along which to search.  Defaults to 0
                        and is passed to the backend unchanged.

        Returns:
            GPUTensor: reference to out
        """
        backend = self.ng
        backend.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity along axis 0:
        exp(x - max(x)) / sum_i exp(x_i - max(x_i)).

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: reference to out
        """
        ng = self.ng
        # The shifted exponential is built once for the normaliser and once
        # for the numerator, as two independent expressions.
        inv_total = ng.reciprocal(
            ng.sum(ng.exp(x - ng.max(x, axis=0)), axis=0))
        shifted_exp = ng.exp(x - ng.max(x, axis=0))
        out[:] = inv_total * shifted_exp
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        This backend does not implement it: the call always raises, and
        nothing is ever written to ``out`` or returned.

        Arguments:
            y (GPUTensor): input tensor (unused).
            err (GPUTensor): backpropagated error (unused).
            out (GPUTensor): where the result would be stored (unused).

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Softmax gradient should use shortcut")

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Fill a tensor with a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): Output tensor; the mask is written into it
            keepthresh (float): fraction of ones
            dtype (dtype): Accepted for interface compatibility; not read
                           by this method.
        """
        backend = self.ng
        backend.dropout(out=tsr, keep=keepthresh)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Gradient descent step with momentum, applied in place.

        The velocity becomes ``vs * momentum_coef - us * learning_rate`` and
        is then added to the parameters.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix),
                                 updated in place.
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor, updated in place.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch; not read by this method.
        """
        carried = vs_item * momentum_coef
        step = us_item * learning_rate
        vs_item[:] = carried - step
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Gradient descent step with momentum and weight decay, in place.

        The velocity becomes
        ``vs * momentum_coef - us * learning_rate - learning_rate * wd * ps``
        and is then added to the parameters.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch; not read by this method.

        Outputs:
            ps_item, the updated weights.
            vs_item, the updated velocity.
            us_item is only read; this method does not write to it.
        """
        carried = vs_item * momentum_coef
        step = us_item * learning_rate
        decay = learning_rate * wd * ps_item
        vs_item[:] = carried - step - decay
        ps_item[:] = ps_item + vs_item

    def exp_mavg(self, mavg, newval, rho):
        """
        Update an exponential moving average in place.

        Arguments:
            mavg:  Running value of the moving average; overwritten with
                   ``rho * mavg + (1 - rho) * newval``
            newval:  New sample folded into the average
            rho:  Weight kept by the previous average
        """
        retained = rho * mavg
        incoming = (1.0 - rho) * newval
        mavg[:] = retained + incoming

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item, ss_item,
                   rho, epsilon):
        """
        AdaDelta update rule (Zeiler, http://arxiv.org/abs/1212.5701),
        applied in place.

        Arguments:
            ps_item: weight / parameter (will be updated)
            us_item: update (gradient); only read
            gs_item: running average of the squared gradient (will be updated)
            ds_item: running average of the squared step (will be updated)
            ls_item: receives the step added to the parameters (will be
                     updated)
            ss_item: scratch space; not read by this method
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        keep = 1.0 - rho

        # Running average of the squared gradient.
        grad_sq = keep * us_item * us_item
        gs_item[:] = gs_item * rho + grad_sq

        # Step: minus the gradient, scaled by the ratio of the two RMS terms.
        rms_ratio = self.ng.sqrt((ds_item + epsilon) / (gs_item + epsilon))
        ls_item[:] = rms_ratio * (-1.0) * us_item

        # Running average of the squared step.
        step_sq = keep * ls_item * ls_item
        ds_item[:] = ds_item * rho + step_sq

        # Apply the step to the parameters.
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity, scratch_space,
                   gamma, epsilon, learning_rate, momentum_coef):
        """
        RMSProp-style update, applied in place, with optional momentum.

        Arguments:
            params: parameter tensor (will be updated)
            updates: gradient tensor; only read
            run_squares: running average of the squared gradient (will be
                         updated)
            velocity: velocity tensor; updated only when momentum_coef is
                      non-zero
            scratch_space: not read by this method
            gamma: decay constant of the running average
            epsilon: small constant added to the RMS for stability
            learning_rate: learning rate
            momentum_coef: momentum coefficient; 0 selects the plain update
        """
        # Refresh the running average of the squared gradient.
        run_squares[:] = gamma * run_squares + (1. - gamma) * updates * updates

        # Gradient scaled by lr / (rms(grad) + epsilon).
        inv_rms = self.ng.reciprocal(self.ng.sqrt(run_squares) + epsilon)
        scaled_step = learning_rate * updates * inv_rms

        if momentum_coef == 0:
            params[:] = params - scaled_step
            return

        velocity[:] = velocity * momentum_coef - scaled_step
        params[:] = params + velocity

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xhat,
                          xmean, xvar, gmean, gvar, rho, out):
        """
        Batch normalization forward pass, compounded to run in 3 kernel calls.

        NOTE(review): the "3 kernel calls" figure is carried over from the
        original summary; how the assignments below map onto kernels depends
        on the backend -- confirm.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xhat: normalized input (updated)
            xmean: mean of inputs along axis 1 (updated)
            xvar: first receives the variance of inputs along axis 1, then is
                  overwritten with 1 / sqrt(variance + eps), which is the
                  value it holds on return (updated)
            gmean: running average of xmean, blended with weight rho (updated)
            gvar: running average of the variance, blended with weight rho
                  (updated)
            rho: weight kept by the previous running averages; the new batch
                 statistics get (1 - rho)
            out: normalized and rescaled input (updated)

        Returns:
            reference to out
        """
        # Batch statistics along axis 1.
        xvar[:] = self.ng.var(inputs, axis=1)
        xmean[:] = self.ng.mean(inputs, axis=1)
        # Fold the batch statistics into the running averages.  This must
        # happen before xvar is overwritten below.
        gmean[:] = gmean * rho + (1.0 - rho) * xmean
        gvar[:] = gvar * rho + (1.0 - rho) * xvar

        # From here on xvar holds 1 / sqrt(variance + eps), not the variance.
        xvar[:] = self.ng.reciprocal(self.ng.sqrt(xvar + eps))
        xhat[:] = xvar * (inputs - xmean)
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Batch normalization backward pass, compounded to run with 4 kernel
        calls.

        Arguments:
            xhat: normalized input data.  Overwritten: on return it holds an
                  intermediate term, not the normalized input (updated)
            error: backpropagated deltas (updated)
            xvar: precomputed scaling term.  NOTE(review): presumably the
                  1 / sqrt(variance + eps) value that fprop_bn_compound
                  leaves in xvar, rather than the raw variance -- confirm.
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        # Parameter gradients: reductions along axis 1.
        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)
        # xhat is reused as scratch; the divisor is the size of axis 1.
        xhat[:] = (xhat * gamma_updates + beta_updates) / float(xhat.shape[1])
        # Uses the overwritten xhat, so this line must come last.
        error[:] = xvar * gamma * (error - xhat)
Esempio n. 17
0
                                  1)).astype(np.float16).astype(np.float32)

# Fill the extra last row of the cpu input with a padding value: the most
# negative finite value of the dtype for "max" pooling, zero for any other op.
if pool.op == "max":
    cpuI[-1, :] = np.finfo(cpuI.dtype).min
else:
    cpuI[-1, :] = 0

# cpu output arrays (float32): cpuO for the forward result, cpuB zero-filled
# for the backward result
cpuO = np.empty(dimO, dtype=np.float32)
cpuB = np.zeros(slicable(dimI, 1), dtype=np.float32)

# give the gpu the input array without the padding row (not needed there),
# reshaped to dimI
devI = ng.array(cpuI[:-1, :].reshape(dimI), dtype=dtype)
devO = ng.zeros(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)

# forward pooling: devI -> devO
ng.fprop_pool(pool, devI, devO, repeat=repeat)

# backward pooling, with devB as the last tensor argument
ng.bprop_pool(pool, devI, devO, devB, repeat=repeat)


def pixel_indices(kj, mt, pr, qs):

    C = pool.C
    J, T, R, S = pool.JTRS
    D, H, W = pool.DHW
    HW = H * W
    DHW = D * H * W
    imax = C * D * H * W
    idx = []
                inception = True

            # find the size of the largest buffers so they can be shared
            if layer.sizeF > max_weights:
                max_weights = layer.sizeF
                max_weight_layer = layer

            if layer.sizeI > max_deltas and type(prev_layer) is not DataLayer:
                max_deltas = layer.sizeI
                max_delta_layer = layer

            prev_layer = layer
            layers.append(layer)

        # Init shared buffers (assumes consistent dtype for now)
        shared_deltas.append(ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))
        shared_deltas.append(ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))
        if inception:
            shared_deltas.append(ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))
            shared_deltas.append(ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))

        shared_updates = ng.empty(max_weight_layer.dimF, dtype=np.float32)

        for i, layer in enumerate(layers):
            print(layer)

            # Initialize buffers.  Alternate shared delta buffer.
            # One layer can't have the same buffer for both error in and error out.
            layer.init_activations()
            layer.init_weights(shared=shared_updates, zeros=zeros)
            if i > 1:
Esempio n. 19
0
# Sweep the element types used for the primary device buffers.
for dtype in (
        np.float16,
        np.float32,
):

    # Problem sizes (K, C, N) from which the matrix shapes below are built.
    for K, C, N in ((32, 4096, 1512), ):

        # (alpha, beta) pairs to sweep.
        # NOTE(review): presumably GEMM scaling factors used further down,
        # outside this excerpt -- confirm.
        for alpha, beta in ((1.0, 0.0), (0.5, 0.5)):

            # Each entry: transpose code, then the shapes of A, B and C.
            for op, dimA, dimB, dimC in (
                ("nn", (K, C), (C, N), (K, N)),  # fprop
                ("tn", (K, C), (K, N), (C, N)),  # bprop
                ("nt", (K, N), (C, N), (K, C)),  # update
            ):

                # buffers in the dtype under test
                devA1 = ng.empty(dimA, dtype=dtype)
                devB1 = ng.empty(dimB, dtype=dtype)
                devC1 = ng.empty(dimC, dtype=dtype)

                # fill with uniform randoms from -1 to 1
                # (assumes ng.rand() draws from [0, 1) -- TODO confirm)
                devA1[:] = 2 * (.5 - ng.rand())
                devB1[:] = 2 * (.5 - ng.rand())
                devC1[:] = 2 * (.5 - ng.rand())

                # float32 already: the second set of names just aliases the
                # first set of buffers
                if dtype is np.float32:
                    devA2 = devA1
                    devB2 = devB1
                # otherwise allocate a separate float32 buffer
                # NOTE(review): only the allocation is visible here; the copy
                # from the first set presumably follows -- confirm.
                else:
                    devA2 = ng.empty(dimA, dtype=np.float32)
Esempio n. 20
0
# bprop(nn): NK   x KC   = NC
# updat(tn): NK^T x NC   = KC

repeat = 2000


for K, C, N in ((3072,3072,32),):

    total  = 0

    for op,  dimA,  dimB,  dimC in (
      ("nn", (K,C), (C,N), (K,N) ),   # fprop
      ("tn", (K,C), (K,N), (C,N) ),   # bprop
      ("nt", (K,N), (C,N), (K,C) ),): # update

        devA = ng.empty(dimA, dtype=np.float32)
        devB = ng.empty(dimB, dtype=np.float32)
        devC = ng.empty(dimC, dtype=np.float32)

        # fill with uniform randoms from -1 to 1
        devA[:] = 2 * (.5 - ng.rand())
        devB[:] = 2 * (.5 - ng.rand())

        total += cublas_dot(op, devA, devB, devC, repeat=repeat, warmup=True)

    print("N2 Total: ", total)
    total = 0

    for op,  dimA,  dimB,  dimC in (
      ("nt", (N,C), (K,C), (N,K) ),   # fprop
      ("nn", (N,K), (K,C), (N,C) ),   # bprop