Python NervanaGPU.array Examples

Programming Language: Python

Namespace/Package Name: nervanagpu

Class/Type: NervanaGPU

Method/Function: array

Examples at hotexamples.com: 13

Python NervanaGPU.array - 13 examples found. These are the top rated real world Python examples of nervanagpu.NervanaGPU.array extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

empty(10)

dot(8)

NervanaGPU(6)

array(5)

update_conv(3)

rand(3)

bprop_conv(3)

mean(3)

max(3)

fprop_conv(3)

finite(2)

sum(2)

pool_layer(2)

min(2)

add(2)

fprop_pool(2)

zeros(2)

bprop_pool(2)

conv_layer(2)

fabs(1)

var(1)

argmax(1)

tanh(1)

batched_dot(1)

subtract(1)

square(1)

sqrt(1)

sig(1)

reciprocal(1)

bench(1)

power(1)

ones(1)

exp(1)

not_equal(1)

multiply(1)

clip(1)

maximum(1)

divide(1)

log(1)

dropout(1)

empty_like(1)

fprop_cuda_conv(1)

equal(1)

greater(1)

Example #1

Show file

File: pool_test.py Project: KayneWest/nervanagpu

# CPU reference input: uniform samples in [0.0, 9.4) that are rounded through
# float16 and then stored as float32, so the host copy only holds values that
# are exactly representable in half precision.
# NOTE(review): slicable() is defined elsewhere; the indexing below implies it
# yields a 2-D shape with one extra trailing row -- confirm against its source.
cpuI = np.random.uniform(0.0, 9.4, slicable(dimI,1)).astype(np.float16).astype(np.float32)

# Fill the extra last row with a neutral value for the chosen pooling op:
# the most negative finite float for "max", zero for everything else.
# (Presumably out-of-range lookups in the numpy reference land on this row.)
if pool.op == "max":
    cpuI[-1,:] = np.finfo(cpuI.dtype).min
else:
    cpuI[-1,:] = 0

# CPU reference outputs: forward result (uninitialised) and backward result
# (zero-initialised, same padded shape as the input).
cpuO = np.empty(dimO, dtype=np.float32)
cpuB = np.zeros(slicable(dimI,1), dtype=np.float32)

# Device tensors: the input is uploaded without the padding row and reshaped
# to dimI; the forward output starts at zero, the backward output is
# left uninitialised.
devI = ng.array(cpuI[:-1,:].reshape(dimI), dtype=dtype)
devO = ng.zeros(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)

# Forward pooling pass on the device.
ng.fprop_pool(pool, devI, devO, repeat=repeat)

# Backward pooling pass on the device; results go to devB.
ng.bprop_pool(pool, devI, devO, devB, repeat=repeat)

def pixel_indices(kj, mt, pr, qs):

    C       = pool.C
    J,T,R,S = pool.JTRS
    D,H,W = pool.DHW
    HW    = H*W
    DHW   = D*H*W
    imax  = C*D*H*W

Example #2

Show file

File: pool_test.py Project: zky001/nervanagpu

# CPU reference input: uniform samples in [0.0, 9.4) rounded through float16
# and stored as float32, so the host values are exactly representable in
# half precision.
# NOTE(review): slicable() is defined elsewhere; the indexing below implies it
# yields a 2-D shape with one extra trailing row -- confirm against its source.
cpuI = np.random.uniform(0.0, 9.4,
                         slicable(dimI,
                                  1)).astype(np.float16).astype(np.float32)

# Fill the extra last row with a neutral value for the chosen pooling op:
# the most negative finite float for "max", zero for everything else.
if pool.op == "max":
    cpuI[-1, :] = np.finfo(cpuI.dtype).min
else:
    cpuI[-1, :] = 0

# CPU reference outputs: forward result (uninitialised) and backward result
# (zero-initialised, same padded shape as the input).
cpuO = np.empty(dimO, dtype=np.float32)
cpuB = np.zeros(slicable(dimI, 1), dtype=np.float32)

# Device tensors: the input is uploaded without the padding row and reshaped
# to dimI; the forward output starts at zero, the backward output is
# left uninitialised.
devI = ng.array(cpuI[:-1, :].reshape(dimI), dtype=dtype)
devO = ng.zeros(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)

# Forward pooling pass on the device.
ng.fprop_pool(pool, devI, devO, repeat=repeat)

# Backward pooling pass on the device; results go to devB.
ng.bprop_pool(pool, devI, devO, devB, repeat=repeat)


def pixel_indices(kj, mt, pr, qs):

    C = pool.C
    J, T, R, S = pool.JTRS
    D, H, W = pool.DHW
    HW = H * W
    DHW = D * H * W

Example #3

Show file

File: gemm_test.py Project: honorpeter/Caffe2_fpga

        # Sweep N, the transpose mode and K.  M comes from an enclosing loop
        # that is not part of this excerpt.
        for n in (size):
            # op is a two-letter code: op[0] describes A, op[1] describes B;
            # 'n' means the operand is used as stored, 't' means transposed.
            for op in ("tn", "nn", "nt"):  # "tn","nn","nt",
                for k in size:
                    print("op,M,N,K: ", op, m, n, k)

                    # Storage shapes: a transposed operand is allocated with
                    # its dimensions swapped so that .T restores (m,k)/(k,n).
                    dimA = (m, k) if op[0] == 'n' else (k, m)
                    dimB = (k, n) if op[1] == 'n' else (n, k)
                    dimC = (m, n)

                    # Host reference operands, uniform in [-1, 1).
                    cpuA = np.random.uniform(-1.0, 1.0,
                                             dimA).astype(np.float32)
                    cpuB = np.random.uniform(-1.0, 1.0,
                                             dimB).astype(np.float32)
                    #cpuB = np.identity(n, dtype=dtype)

                    # Device copies of the operands plus an uninitialised
                    # output tensor.
                    devA = ng.array(cpuA, dtype=dtype)
                    devB = ng.array(cpuB, dtype=dtype)
                    devC = ng.empty(dimC, dtype=dtype)

                    #repeat = min(int(50.0 * 4096**3 / (m * n * k)), 1000)

                    # Apply the requested transposes to host and device
                    # operands alike so both products use the same layout.
                    if op[0] == 't': cpuA, devA = cpuA.T, devA.T
                    if op[1] == 't': cpuB, devB = cpuB.T, devB.T

                    # Device product, written into devC.
                    ng.dot(devA, devB, devC, repeat=1)

                    #context.synchronize()

                    # Host reference product.
                    cpuC = np.dot(cpuA, cpuB)

                    # Fetch the device result for comparison with cpuC.
                    cpuD = devC.get()

Example #4

Show file

File: gpu.py Project: jcoreyes/neon

class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.
    Note that some functions defined in the generic Backend class, such as
    cross-map pooling and normalization and adaDelta, are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        """
        Build the backend around a freshly constructed NervanaGPU object.

        Arguments:
            rng_seed: Seed stored on the instance and picked up by rng_init.
            stochastic_round (bool, optional): Forwarded unchanged to the
                                               NervanaGPU constructor.
            device_id (int, optional): Device identifier to remember; None is
                                       recorded as 0.
        """
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        # The seed has to be stored before rng_init runs, since rng_init looks
        # it up on the instance.
        self.rng_seed = rng_seed
        self.rng_init()
        if device_id is None:
            device_id = 0
        self.device_id = device_id
        # Two streams are created up front.
        self.streams = [self.create_stream() for _ in range(2)]

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this class.

        The device-resident mem_pool cannot be serialized, so the returned
        state carries a small 'mem_pool_pickle' description in its place and
        has 'mem_pool' set to None.  The state is built on a copy of the
        instance dictionary so that serializing a backend does not discard
        the mem_pool of the live object.

        Returns:
            dict: A copy of the instance attributes with mem_pool replaced as
                  described above.
        """
        state = self.__dict__.copy()
        pool = getattr(self, 'mem_pool', None)
        if pool is not None:
            # NOTE(review): the dtype is recorded as float32 regardless of the
            # dtype the pool was allocated with -- confirm this is intended.
            state['mem_pool_pickle'] = {'shape': pool.shape,
                                        'dtype': np.float32}
            state['mem_pool'] = None

        return state

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        The attributes in state are restored first.  If the state carries a
        'mem_pool_pickle' description, a new mem_pool of that shape and dtype
        is allocated on the device; a state saved before any mem_pool was
        created has no such entry and is restored without allocating one.

        Arguments:
            state (dict): The contents of the backend class instance as
                          produced by __getstate__.
        """
        self.__dict__.update(state)
        pool_info = getattr(self, 'mem_pool_pickle', None)
        if pool_info is not None:
            self.mem_pool = self.ng.empty(pool_info['shape'],
                                          dtype=pool_info['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Create the temporary-storage memory pool and keep it on the instance.

        Arguments:
            shape: Shape handed to the NervanaGPU empty() allocator.
            dtype (dtype, optional): Element type of the pool.
        """
        pool = self.ng.empty(shape, dtype=dtype)
        self.mem_pool = pool

    def alloc_host_mem(self, shape, dtype):
        """
        Allocate a host buffer through drv.pagelocked_empty, using C ordering
        and mem_flags=0.

        Arguments:
            shape: Shape of the buffer.
            dtype: Element type of the buffer.

        Returns:
            The buffer object returned by drv.pagelocked_empty.
        """
        host_buffer = drv.pagelocked_empty(shape, dtype, order="C",
                                           mem_flags=0)
        return host_buffer

    def create_stream(self):
        """
        Create a new driver stream.

        Returns:
            The newly constructed drv.Stream object.
        """
        stream = drv.Stream()
        return stream

    def async_copy(self, dest, src, stream=None):
        """
        Issue an asynchronous host-to-device copy of src into dest.

        Arguments:
            dest: Destination object; its gpudata attribute is used as the
                  copy target.
            src: Host-side source passed straight to drv.memcpy_htod_async.
            stream (optional): Stream argument forwarded to the copy call.
        """
        device_target = dest.gpudata
        drv.memcpy_htod_async(device_target, src, stream)

    def rng_init(self):
        """
        Initialize and seed numpy's pseudo random number generator.

        The seed is taken from the instance's rng_seed attribute when one has
        been set; otherwise numpy is seeded with None.  The uniform and
        normal methods of this class draw their samples from this generator
        on the host.
        """
        if 'rng_seed' not in self.__dict__:
            np.random.seed(None)
            return
        seed = self.rng_seed
        logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Set up FLOP timing: create the start/end events and wrap the requested
        calls with a FlopsDecorator.

        Arguments:
           decorate_fc (list): names of the fully connected layer
                               forward/backward/update calls to time.
           decorate_conv (list): names of the convolutional layer
                                 forward/backward/update calls to time.
           decorate_ew (list): names of the element-wise calls to time.

        Notes:
            Must be called before the first timing start call, because the
            events used by the timing methods are created here.
        """
        self.start, self.end = drv.Event(), drv.Event()
        timer = FlopsDecorator(self)
        self.flop_timer = timer
        timer.decorate(decorate_fc=decorate_fc,
                       decorate_conv=decorate_conv,
                       decorate_ew=decorate_ew)

    def flop_timing_start(self):
        """
        Start a new FLOP timer by recording the start event.

        flop_timing_init must have been called first, since it creates the
        event recorded here.

        Returns:
            The value returned by the start event's record() call.  It is a
            dummy value: flop_timing_finish ignores its start_time argument.
        """
        return self.start.record()

    # The method was originally published under this misspelled name; keep it
    # available so existing callers continue to work.
    flop_timinig_start = flop_timing_start

    def flop_timing_finish(self, start_time):
        """
        Complete current FLOP timing.

        Records the end event, waits for it to complete, and reports the time
        between the start and end events.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: elapsed time since the start event was last recorded.
                   NOTE(review): pycuda documents Event.time_since as
                   returning milliseconds, not seconds -- confirm what
                   callers expect.
        """
        self.end.record()
        # Block until the end event has actually occurred so the measured
        # interval is complete.
        self.end.synchronize()
        return self.end.time_since(self.start)

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                name=None, allocator=drv.mem_alloc):
        """
        Draw uniform random samples with numpy on the host and upload them as
        a GPUTensor.

        Arguments:
            low (float, optional): Lower bound passed to numpy.
            high (float, optional): Upper bound passed to numpy.
            shape (int or tuple, optional): Shape of the sample array.
            dtype (dtype, optional): Element type of the resulting tensor.
                                     Calling with dtype=None is not expected
                                     to work.
            name (string, optional): Name forwarded to NervanaGPU.array.
            allocator (optional): Accepted but not used by this method.

        Returns:
            GPUTensor: tensor holding the generated samples
        """
        samples = np.random.uniform(low=low, high=high, size=shape)
        return self.ng.array(samples, dtype, name)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               name=None, allocator=drv.mem_alloc):
        """
        Draw Gaussian/Normal random samples with numpy on the host and upload
        them as a GPUTensor.

        Arguments:
            loc (float, optional): Mean passed to numpy.
            scale (float, optional): Standard deviation passed to numpy.
            size (int or tuple, optional): Shape of the sample array.
            dtype (dtype, optional): Element type of the resulting tensor.
            name (string, optional): Name forwarded to NervanaGPU.array.
            allocator (optional): Accepted but not used by this method.

        Returns:
            GPUTensor: tensor holding the generated samples
        """
        samples = np.random.normal(loc=loc, scale=scale, size=size)
        return self.ng.array(samples, dtype, name)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward pass of a fully connected layer: the product of weights and
        inputs is written into out as the layer's pre-activations.

        Arguments:
            out (GPUTensor): Destination for the forward propagated results.
            inputs (GPUTensor): Dataset input values (first layer) or the
                                outputs of the previous layer.
            weights (GPUTensor): Weight coefficients of this layer.
            layer (Layer): The layer object.  Not used here.
        """
        matmul = self.ng.dot
        matmul(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward pass of a fully connected layer: the product of the
        transposed weights and deltas is written into out.

        Arguments:
            out (GPUTensor): Destination for the backward propagated errors.
            weights (GPUTensor): Weight coefficients of this layer.
            deltas (GPUTensor): Error values for this layer.
            layer (Layer): The layer object.  Not used here.
        """
        weights_t = weights.T
        self.ng.dot(weights_t, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Gradient computation for a fully connected layer: the product of
        deltas and the transposed inputs is written into out.

        Arguments:
            out (GPUTensor): Destination for the updated gradient value.
            inputs (GPUTensor): Dataset input values (first layer) or the
                                outputs of the previous layer.
            deltas (GPUTensor): Error values for this layer.
            layer (Layer): The layer object.  Not used here.
        """
        inputs_t = inputs.T
        self.ng.dot(deltas, inputs_t, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward pass of a convolutional layer, producing output
        pre-activations in out.

        Only out, inputs, weights and fpropbuf are read by this
        implementation; the other arguments are accepted so the signature
        matches the generic backend interface.

        Arguments:
            out (GPUTensor): Destination for the forward propagated results.
            inputs (GPUTensor): Dataset input values (first layer) or the
                                outputs of the previous layer.
            weights (GPUTensor): Weight coefficients of this layer.
            ofmshape (tuple): Dimensions of each output feature map
                              (typically height and width).  Unused here.
            ofmsize (int): Total size of each output feature map.  Unused
                           here.
            ofmlocs (GPUTensor): Indices locating each element of each output
                                 feature map within out.  Unused here.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width; expected to be square for
                              this backend).  Unused here.
            links (GPUTensor): Input receptive field indices.  Unused here.
            nifm (int): Total number of input feature maps.  Unused here.
            padding (int): Extra elements included along each dimension of
                           each local receptive field.  Unused here.
            stride (int): Number of neurons the filter shifts per step.
                          Unused here.
            ngroups (int): Number of groups.  Unused here.
            fpropbuf: Passed to NervanaGPU.fprop_conv as its ``layer``
                      argument.
            local (bool, optional): Local filtering (True) versus convolution
                                    (False, the default).  Unused here.
        """
        # Dimension letter legend:
        #   N: Number of images in mini-batch
        #   C: Number of input feature maps
        #   K: Number of output feature maps
        #
        #   D: Depth  of input image
        #   H: Height of input image
        #   W: Width  of input image
        #
        #   T: Depth  of filter kernel
        #   R: Height of filter kernel
        #   S: Width  of filter kernel
        conv_args = dict(layer=fpropbuf, I=inputs, F=weights, O=out,
                         alpha=1.0, repeat=1)
        self.ng.fprop_conv(**conv_args)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward pass of a convolutional layer, writing the propagated error
        into out.

        Only out, weights, deltas and bpropbuf are read by this
        implementation; the other arguments are accepted so the signature
        matches the generic backend interface.

        Arguments:
            out (GPUTensor): Destination for the backward propagated errors.
            weights (GPUTensor): Weight coefficients of this layer.
            deltas (GPUTensor): Error values for this layer.
            ofmshape (tuple): Dimensions of each output feature map
                              (typically height and width).  Unused here.
            ofmsize (int): Total size of each output feature map.  Unused
                           here.
            ofmlocs (GPUTensor): Indices locating each element of each output
                                 feature map within out.  Unused here.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).  Unused here.
            links (GPUTensor): Input receptive field indices.  Unused here.
            padding (int): Extra elements included along each dimension of
                           each local receptive field.  Unused here.
            stride (int): Number of neurons the filter shifts per step.
                          Unused here.
            nifm (int): Total number of input feature maps.  Unused here.
            ngroups (int): Number of groups.  Unused here.
            bpropbuf: Passed to NervanaGPU.bprop_conv as its ``layer``
                      argument.
            local (bool, optional): Local filtering (True) versus convolution
                                    (False, the default).  Unused here.
        """
        conv_args = dict(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                         alpha=1.0, repeat=1)
        self.ng.bprop_conv(**conv_args)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Gradient computation for a convolutional layer, writing the result
        into out.

        Only out, inputs, deltas and updatebuf are read by this
        implementation; the other arguments are accepted so the signature
        matches the generic backend interface.

        Arguments:
            out (GPUTensor): Destination for the updated gradient value.
            inputs (GPUTensor): Dataset input values (first layer) or the
                                outputs of the previous layer.
            weights (GPUTensor): Weight coefficients of this layer.  Unused
                                 here.
            deltas (GPUTensor): Error values for this layer.
            ofmshape (tuple): Dimensions of each output feature map
                              (typically height and width).  Unused here.
            ofmsize (int): Total size of each output feature map.  Unused
                           here.
            ofmlocs (GPUTensor): Indices locating each element of each output
                                 feature map within out.  Unused here.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).  Unused here.
            links (GPUTensor): Input receptive field indices.  Unused here.
            nifm (int): Total number of input feature maps.  Unused here.
            padding (int): Extra elements included along each dimension of
                           each local receptive field.  Unused here.
            stride (int): Number of neurons the filter shifts per step.
                          Unused here.
            ngroups (int): Number of groups.  Unused here.
            fwidth (int): Filter width.  Unused here.
            updatebuf: Passed to NervanaGPU.update_conv as its ``layer``
                       argument.
            local (bool, optional): Local filtering (True) versus convolution
                                    (False, the default).  Unused here.
            layer (Layer): The layer object.  Unused here.
        """
        conv_args = dict(layer=updatebuf, I=inputs, E=deltas, grad_F=out,
                         alpha=1.0, repeat=1)
        self.ng.update_conv(**conv_args)

    def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape,
                   ifmshape, links, nifm, padding, stride, fpropbuf):
        """
        Forward propagate the inputs of a Pooling network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Only out, inputs, op and fpropbuf are read by this implementation;
        the other arguments are accepted so the signature matches the generic
        backend interface.

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            op (string): The type of pooling operation to apply, compared
                         case-insensitively.  Only "max" is implemented by
                         this backend.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            fpropbuf: Passed to NervanaGPU.fprop_pool as its ``layer``
                      argument.

        Raises:
            AttributeError: if op is anything other than "max".
        """
        op = op.lower()
        if op != "max":
            # Interpolate the op into the message; passing it as a second
            # exception argument would leave the "%s" placeholder unformatted.
            raise AttributeError("unexpected pooling op type: %s" % op)
        self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1)

    def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize,
                   ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding,
                   stride, bpropbuf):
        """
        Backward propagate the error through a pooling network layer.

        Only out, inputs, deltas, op and bpropbuf are read by this
        implementation; the other arguments are accepted so the signature
        matches the generic backend interface.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            fouts (GPUTensor): Forward propagated outputs from the previous
                               layer.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            op (string): The type of pooling operation to apply, compared
                         case-insensitively.  Only "max" is implemented by
                         this backend.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            fpsize (int): The size of each filter.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            bpropbuf: Passed to NervanaGPU.bprop_pool as its ``layer``
                      argument.

        Raises:
            AttributeError: if op is anything other than "max".
        """
        op = op.lower()
        if op != "max":
            # Interpolate the op into the message; passing it as a second
            # exception argument would leave the "%s" placeholder unformatted.
            raise AttributeError("unexpected pooling op type: %s" % op)
        self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas, grad_I=out,
                           repeat=1)

    def logistic(self, x, out):
        """
        Apply the logistic sigmoid, 1/(1+exp(-x)), element by element.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor that receives the result

        Returns:
            GPUTensor: the same out object that was passed in
        """
        sigmoid = self.ng.sig
        sigmoid(x, out=out)
        return out

    def rectlin(self, x, out):
        """
        Apply the rectified linear nonlinearity: the element-wise maximum of
        x and zero.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor that receives the result

        Returns:
            GPUTensor: the same out object that was passed in
        """
        floor = 0.
        self.ng.maximum(x, floor, out=out)
        return out

    def rectleaky(self, x, slope, out):
        """
        Leaky rectified linear nonlinearity: the element-wise maximum of x
        and x*slope.

        Arguments:
            x (GPUTensor): Input tensor
            slope (float): Factor applied to x for the second operand of the
                           maximum.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out (returned for consistency with the
                       other activation methods of this class)
        """
        out[:] = self.ng.maximum(x, x*slope)
        return out

    def rectleaky_derivative(self, x, slope, out):
        """
        Derivative of the leaky rectified linear nonlinearity, computed as
        greater(x, 0) * (1 - slope) + slope.

        Arguments:
            x (GPUTensor): Input tensor
            slope (float): Slope used by the leaky rectifier.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out (returned for consistency with the
                       other activation methods of this class)
        """
        out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope
        return out

    def sum(self, tsr, axes, out):
        """
        Sum the elements of a tensor.

        Arguments:
            tsr  (GPUTensor): Input tensor
            axes (int): Axis to reduce over.  When None, the tensor is
                        reshaped to a single column (shape[0]*shape[1] rows)
                        and reduced along axis 0, i.e. over both dimensions.
            out (GPUTensor): Output tensor that receives the result

        Returns:
            GPUTensor: the same out object that was passed in
        """
        if axes is not None:
            self.ng.sum(tsr, axis=axes, out=out)
            return out
        n_elems = tsr.shape[0]*tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.sum(column, axis=0, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Compute the arithmetic mean of the elements of a tensor.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis to reduce over.  When None, the tensor is
                        reshaped to a single column (shape[0]*shape[1] rows)
                        and reduced along axis 0, i.e. over both dimensions.
            out (GPUTensor): Output tensor that receives the result

        Returns:
            GPUTensor: the same out object that was passed in
        """
        if axes is not None:
            self.ng.mean(tsr, axis=axes, out=out)
            return out
        n_elems = tsr.shape[0]*tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.mean(column, axis=0, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Compute the minimum of the elements of a tensor.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis to reduce over.  When None, the tensor is
                        reshaped to a single column (shape[0]*shape[1] rows)
                        and reduced along axis 0, i.e. over both dimensions.
            out (GPUTensor): Output tensor that receives the result

        Returns:
            GPUTensor: the same out object that was passed in
        """
        if axes is not None:
            self.ng.min(tsr, axis=axes, out=out)
            return out
        n_elems = tsr.shape[0]*tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.min(column, axis=0, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Compute the maximum of the elements of a tensor.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis to reduce over.  When None, the tensor is
                        reshaped to a single column (shape[0]*shape[1] rows)
                        and reduced along axis 0, i.e. over both dimensions.
            out (GPUTensor): Output tensor that receives the result

        Returns:
            GPUTensor: the same out object that was passed in
        """
        if axes is not None:
            self.ng.max(tsr, axis=axes, out=out)
            return out
        n_elems = tsr.shape[0]*tsr.shape[1]
        column = tsr.reshape(n_elems, 1)
        self.ng.max(column, axis=0, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Compute the variance of a tensor as the mean of the squared
        deviations from a caller-supplied mean.

        Arguments:
            tsr (GPUTensor): the tensor on which to compute the variance
            axes (int, list, optional): the dimension(s) to reduce over.  The
                                        value is handed unchanged to the
                                        NervanaGPU mean reduction.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the tensor containing mean of tsr.  Although it
                              has a default of None, it must be supplied.

        Returns:
            GPUTensor: reference to out

        Raises:
            ValueError: if mean is None.
        """
        if mean is not None:
            squared_dev = self.ng.square(tsr-mean)
            self.ng.mean(squared_dev,  axis=axes, out=out)
            return out
        logger.error("GPUTensor requires mean to be specified.")
        raise ValueError("mean not specified")

    def fabs(self, x, out):
        """
        Take the absolute value of every element of a tensor.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor that receives the result

        Returns:
            GPUTensor: the same out object that was passed in
        """
        kernel = self.ng.fabs
        kernel(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Take the square root of every element of a tensor.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor that receives the result

        Returns:
            GPUTensor: the same out object that was passed in
        """
        kernel = self.ng.sqrt
        kernel(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype):
        """
        Create a new GPUTensor whose elements are all zero.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype

        Returns:
            GPUTensor: the newly allocated tensor
        """
        tensor = self.ng.zeros(shape, dtype=dtype)
        return tensor

    def ones(self, shape, dtype=default_dtype):
        """
        Create a new GPUTensor whose elements are all one.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype

        Returns:
            GPUTensor: the newly allocated tensor
        """
        tensor = self.ng.ones(shape, dtype=dtype)
        return tensor

    def empty(self, shape, dtype=default_dtype):
        """
        Create a new GPUTensor without initializing its contents.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype

        Returns:
            GPUTensor: the newly allocated tensor
        """
        tensor = self.ng.empty(shape, dtype=dtype)
        return tensor

    def array(self, ary, dtype=default_dtype, name=None,
              allocator=drv.mem_alloc):
        """
        Allocate a new GPUTensor and fill it with supplied numpy array.

        The name is forwarded to NervanaGPU.array in the same way the uniform
        and normal methods of this class do, instead of being dropped.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype): Optional datatype
            name (string): Name for the GPUTensor
            allocator (pycuda): Pycuda memory allocator.  Accepted for
                                interface compatibility; not forwarded by
                                this method.

        Returns:
            GPUTensor: output
        """
        return self.ng.array(ary, dtype, name)

    def add(self, left, right, out):
        """
        Add left and right element by element, storing the sum in out.

        Arguments:
            left (GPUTensor, numeric): first operand.
            right (GPUTensor, numeric): second operand.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same out object that was passed in
        """
        kernel = self.ng.add
        kernel(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Subtract right from left element by element, storing the difference
        in out.

        Arguments:
            left (GPUTensor, numeric): operand to subtract from.
            right (GPUTensor, numeric): operand to subtract.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same out object that was passed in
        """
        kernel = self.ng.subtract
        kernel(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Multiply left and right element by element, storing the product in
        out.

        Arguments:
            left (GPUTensor, numeric): first operand.
            right (GPUTensor, numeric): second operand.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same out object that was passed in
        """
        kernel = self.ng.multiply
        kernel(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Divide left by right element by element, storing the quotient in out.

        Arguments:
            left (GPUTensor, numeric): numerator operand.
            right (GPUTensor, numeric): denominator operand.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same out object that was passed in
        """
        kernel = self.ng.divide
        kernel(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Test element by element whether left is greater than right, storing
        the outcome in out.

        Arguments:
            left (GPUTensor, numeric): left-hand side of the comparison.
            right (GPUTensor, numeric): right-hand side of the comparison.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same out object that was passed in
        """
        kernel = self.ng.greater
        kernel(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Test element by element whether left equals right, storing the
        outcome in out.  The operands are assumed to have the same shape (or
        to be broadcastable to one).

        Arguments:
            left (GPUTensor, numeric): left-hand side of the comparison.
            right (GPUTensor, numeric): right-hand side of the comparison.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same out object that was passed in
        """
        kernel = self.ng.equal
        kernel(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Test element by element whether left differs from right, storing the
        outcome in out.

        Arguments:
            left (GPUTensor, numeric): left-hand side of the comparison.
            right (GPUTensor, numeric): right-hand side of the comparison.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same out object that was passed in
        """
        kernel = self.ng.not_equal
        kernel(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Clip every element of a to the range given by a_min and a_max,
        storing the result in out.

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): floor value.
            a_max (float): ceiling value.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same out object that was passed in
        """
        kernel = self.ng.clip
        kernel(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Natural (base-e) logarithm of every element of a.

        Arguments:
            a (GPUTensor): tensor to take the logarithm of.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same object that was passed in as out
        """
        backend = self.ng
        backend.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Hyperbolic tangent of every element of a.

        Arguments:
            a (GPUTensor): tensor to transform.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same object that was passed in as out
        """
        backend = self.ng
        backend.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Find the indices of the largest values of a along the given axis.
        If several elements share the maximum, only the first one is
        reported.

        Arguments:
            a (GPUTensor): tensor in which to locate the maximum indices.
            out (GPUTensor): destination for the indices.  It should already
                             have the appropriate type and expected shape.
            axis (int): dimension along which to search (default 0).

        Returns:
            GPUTensor: the same object that was passed in as out
        """
        backend = self.ng
        backend.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity along axis 0:
        exp(x - max(x)) / sum_i exp(x_i - max(x_i)).

        Arguments:
            x (GPUTensor): tensor to normalize.
            out (GPUTensor): destination for the result.

        Returns:
            GPUTensor: the same object that was passed in as out
        """
        ng = self.ng

        def shifted_exp():
            # exp of x with the per-column maximum subtracted first
            return ng.exp(x - ng.max(x, axis=0))

        # Build the normalizer first, then the numerator, in a single
        # expression assigned into out.
        out[:] = ng.reciprocal(ng.sum(shifted_exp(), axis=0)) * shifted_exp()
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        This backend does not compute the gradient explicitly; the method
        always raises to signal that the shortcut should be used instead.

        Arguments:
            y (GPUTensor): input tensor (unused).
            err (GPUTensor): backpropagated error (unused).
            out (GPUTensor): where the result would be stored (unused).

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Softmax gradient should use shortcut")

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Fill tsr with a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): tensor that receives the mask.
            keepthresh (float): fraction of ones, forwarded as `keep`.
            dtype: accepted but not used by this implementation.
        """
        mask_args = {'keep': keepthresh, 'out': tsr}
        self.ng.dropout(**mask_args)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Gradient descent step with momentum.

        Computes velocity = momentum_coef * velocity - learning_rate * update
        and then adds the new velocity to the parameters.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix),
                                 overwritten with the updated weights.
            us_item (GPUTensor): update tensor holding the gradient wrt. the
                                 weights.
            vs_item (GPUTensor): velocity tensor, overwritten with the updated
                                 velocity.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch; not used in this method.
        """
        ng = self.ng
        decayed_velocity = ng.multiply(vs_item, momentum_coef)
        scaled_gradient = ng.multiply(us_item, learning_rate)
        ng.subtract(decayed_velocity, scaled_gradient, out=vs_item)
        ng.add(ps_item, vs_item, out=ps_item)

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Gradient descent step with momentum and weight decay.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix).
            us_item (GPUTensor): update tensor holding the gradient wrt. the
                                 weights.
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch; not used in this method.

        Outputs:
            ps_item receives the updated weights.
            vs_item receives the updated velocity.
            us_item is overwritten (it doubles as a temp buffer).
        """
        ng = self.ng

        # momentum step: v = momentum_coef * v - learning_rate * gradient
        decayed_velocity = ng.multiply(vs_item, momentum_coef)
        scaled_gradient = ng.multiply(us_item, learning_rate)
        ng.subtract(decayed_velocity, scaled_gradient, out=vs_item)

        # weight decay term wd * learning_rate * weights, staged in us_item
        decay_base = ng.multiply(ps_item, wd)
        ng.multiply(decay_base, learning_rate, out=us_item)
        ng.subtract(vs_item, us_item, out=vs_item)

        # apply the velocity to the weights
        ng.add(ps_item, vs_item, out=ps_item)

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item, ss_item,
                   rho, epsilon):
        """
        AdaDelta update rule (Zeiler, http://arxiv.org/abs/1212.5701)

        Arguments:
            ps_item: weight / parameter (updated in place)
            us_item: update (gradient)
            gs_item: running average of the squared gradient (updated in
                     place)
            ds_item: running average of the squared delta (updated in place)
            ls_item: per-element step that gets added to the parameters
                     (updated in place)
            ss_item: scratch space; not used in this method
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        fresh_weight = 1.0 - rho

        # E[Grad^2] <- rho * E[Grad^2] + (1 - rho) * grad^2
        gs_item[:] = gs_item * rho + fresh_weight * us_item * us_item

        # step = -sqrt((E[Delt^2] + eps) / (E[Grad^2] + eps)) * grad
        rms_ratio = (ds_item + epsilon) / (gs_item + epsilon)
        ls_item[:] = self.ng.sqrt(rms_ratio) * (-1.0) * us_item

        # E[Delt^2] <- rho * E[Delt^2] + (1 - rho) * step^2
        ds_item[:] = ds_item * rho + fresh_weight * ls_item * ls_item

        # move the parameters by the step
        ps_item[:] = ps_item + ls_item

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xvar, xhat, out):
        """
        Forward pass of batch normalization, expressed as three tensor
        assignments.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xvar: receives 1 / sqrt(var(inputs, axis=1) + eps) (updated)
            xhat: receives the normalized input (updated)
            out: receives the normalized and rescaled input (updated)

        Returns:
            reference to out
        """
        ng = self.ng

        # inverse standard deviation along axis 1
        stabilized_var = ng.var(inputs, axis=1) + eps
        xvar[:] = ng.reciprocal(ng.sqrt(stabilized_var))

        # center and scale to unit variance
        centered = inputs - ng.mean(inputs, axis=1)
        xhat[:] = xvar * centered

        # learned rescale and shift
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Backward pass of batch normalization, expressed as four tensor
        assignments.

        Arguments:
            xhat: normalized input data (overwritten)
            error: backpropagated deltas (overwritten)
            xvar: precomputed scaling term from the forward pass
            gamma: scale parameter
            beta_updates: receives the gradient update for beta
            gamma_updates: receives the gradient update for gamma
        """
        ng = self.ng
        width = float(xhat.shape[1])  # number of entries along axis 1

        # parameter gradients: reductions along axis 1
        gamma_updates[:] = ng.sum(xhat * error, axis=1)
        beta_updates[:] = ng.sum(error, axis=1)

        # xhat is reused as scratch for the averaged correction term
        xhat[:] = (xhat * gamma_updates + beta_updates) / width
        error[:] = xvar * gamma * (error - xhat)

Example #5

Show file

File: gemm_test2.py Project: KayneWest/nervanagpu

        m, n, k = size
        for op in ("tn","nn","nt"): #"tn","nn","nt"

            dimA = (m,k) if op[0] == 'n' else (k,m)
            dimB = (k,n) if op[1] == 'n' else (n,k)
            dimC = (m,n)

            if data_type == "All Ones":
                cpuA = np.ones(dimA, dtype=dtype).astype(np.float32)
                cpuB = np.ones(dimB, dtype=dtype).astype(np.float32)
                #cpuB = np.identity(n, dtype=np.float32)
            else:
                cpuA = np.random.uniform(-1.0, 1.0, dimA).astype(np.float32)
                cpuB = np.random.uniform(-1.0, 1.0, dimB).astype(np.float32)

            devA = ng.array(cpuA, dtype=dtype)
            devB = ng.array(cpuB, dtype=dtype)
            devC = ng.empty(dimC, dtype=dtype)

            if op[0] == 't': cpuA, devA = cpuA.T, devA.T
            if op[1] == 't': cpuB, devB = cpuB.T, devB.T

            ng.dot(devA, devB, devC, repeat=repeat)

            if cpu:

                cpuC = np.dot(cpuA, cpuB)

                cpuD = devC.get()
                diff = np.absolute(cpuC - cpuD)

Example #6

Show file

File: pow_failure.py Project: awni/nervanagpu

# Reproduction script: square a random 200x200 tensor through NervanaGPU and
# check that the result read back to the host contains no NaN values.
import numpy as np
# Imported only for its side effect (presumably sets up the CUDA context
# before NervanaGPU is used -- confirm against the pycuda docs).
import pycuda.autoinit
from nervanagpu import NervanaGPU
nrv = NervanaGPU(default_dtype=np.float32)

# a: random input wrapped as a backend tensor; b: same-layout output buffer
a = nrv.array(np.random.randn(200,200))
b = nrv.empty_like(a)
# elementwise square, evaluated into b by the slice assignment
b[:] = a**2
# b.get() returns the values for inspection with numpy
assert not np.any(np.isnan(b.get())), "Shouldn't have any nan's here"

Example #7

Show file

File: batched_dot_test.py Project: honorpeter/Caffe2_fpga

    dimI = (X, C, N)
    dimO = (X, K, N)
else:
    dimI = (X, N, C)
    dimO = (X, N, K)

if ones:
    cpuI = np.ones(dimI, dtype=np.float32)
    cpuE = np.ones(dimO, dtype=np.float32)
    cpuW = np.ones(dimW, dtype=np.float32)
else:
    cpuI = np.random.uniform(-1.0, 1.0, dimI).astype(dtype).astype(np.float32)
    cpuE = np.random.uniform(-1.0, 1.0, dimO).astype(dtype).astype(np.float32)
    cpuW = np.random.uniform(-1.0, 1.0, dimW).astype(dtype).astype(np.float32)

devI = ng.array(cpuI, dtype=dtype)
devE = ng.array(cpuE, dtype=dtype)
devW = ng.array(cpuW, dtype=dtype)

devO = ng.empty(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)
devU = ng.empty(dimW, dtype=dtype)

if Nin:
    ng.batched_dot(devW, devI, devO, repeat=repeat, size=size)  # fprop
    ng.batched_dot(devW.T, devE, devB, repeat=repeat, size=size)  # bprop
    ng.batched_dot(devE, devI.T, devU, repeat=repeat, size=size)  # update
else:
    ng.batched_dot(devI, devW.T, devO, repeat=repeat, size=size)  # fprop
    ng.batched_dot(devE, devW, devB, repeat=repeat, size=size)  # bprop
    ng.batched_dot(devE.T, devI, devU, repeat=repeat, size=size)  # update

Example #8

Show file

File: gpu.py Project: xiaoyunwu/neon

class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.
    Note that some functions defined in the generic Backend class, such as
    cross-map pooling and normalization, and adaDelta, are not implemented
    for this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        """
        Create the NervanaGPU instance and seed the random number generator.

        Arguments:
            rng_seed: seed stored on the instance and used by rng_init.
            stochastic_round (bool): forwarded to the NervanaGPU constructor.
            device_id (int): stored on the instance; None is treated as 0.
        """
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        self.rng_seed = rng_seed
        self.rng_init()
        if device_id is None:
            device_id = 0
        self.device_id = device_id

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this class.

        Returns:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        if hasattr(self, 'mem_pool') and self.mem_pool is not None:
            self.mem_pool_pickle = {'shape': self.mem_pool.shape,
                                    'dtype': np.float32}
            self.mem_pool = None

        return self.__dict__

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        Arguments:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        self.__dict__.update(state)
        self.mem_pool = self.ng.empty(self.mem_pool_pickle['shape'],
                                      dtype=self.mem_pool_pickle['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocate the memory pool used for temporary storage and keep it on
        the instance as mem_pool.

        Arguments:
            shape: shape of the pool, forwarded to NervanaGPU.empty.
            dtype: element type of the pool (defaults to default_dtype).
        """
        pool = self.ng.empty(shape, dtype=dtype)
        self.mem_pool = pool

    def alloc_host_mem(self, shape, dtype):
        """
        Allocate an uninitialized page-locked host array in C order.

        Arguments:
            shape: shape of the array.
            dtype: element type of the array.

        Returns:
            the array returned by drv.pagelocked_empty
        """
        host_buffer = drv.pagelocked_empty(shape, dtype, order="C",
                                           mem_flags=0)
        return host_buffer

    def create_stream(self):
        """
        Create a new driver stream.

        Returns:
            drv.Stream: the newly created stream
        """
        stream = drv.Stream()
        return stream

    def async_copy(self, dest, src, stream=None):
        """
        Start an asynchronous host-to-device copy of src into dest.

        Arguments:
            dest: destination tensor; its gpudata attribute is the copy
                  target.
            src: source buffer on the host.
            stream: optional stream forwarded to drv.memcpy_htod_async.
        """
        device_ptr = dest.gpudata
        drv.memcpy_htod_async(device_ptr, src, stream)

    def rng_init(self):
        """
        Initialize and seed the pseudo random number generator. Random numbers
        are generated on the host using numpy, then transferred to the device.

        The seed is taken from the instance's rng_seed attribute when it has
        one; otherwise numpy is seeded with None.
        """
        has_seed = 'rng_seed' in self.__dict__
        seed = self.rng_seed if has_seed else None
        if has_seed:
            logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Set up FLOP timing: create the start/end events and wrap the given
        MOP calls with a decorator that records elapsed time and number of
        operations.

        Arguments:
           decorate_fc (list): names of the fully connected layer
                               forward/backward/update calls to time.
           decorate_conv (list): names of the convolutional layer
                                 forward/backward/update calls to time.
           decorate_ew (list): names of the element-wise calls to time.

        Notes:
            Must be called before the first FLOP timer is started.
        """
        self.start, self.end = drv.Event(), drv.Event()
        timer = FlopsDecorator(self)
        self.flop_timer = timer
        timer.decorate(decorate_fc=decorate_fc,
                       decorate_conv=decorate_conv,
                       decorate_ew=decorate_ew)

    def flop_timinig_start(self):
        """
        Start a new FLOP timer by recording the start event created in
        flop_timing_init.

        Returns:
            the result of self.start.record(); a dummy value that callers
            do not use.
        """
        return self.start.record()

    # The method name above is misspelled ("timinig") but is kept for
    # existing callers; expose it under the intended spelling as well.
    flop_timing_start = flop_timinig_start

    def flop_timing_finish(self, start_time):
        """
        Complete the current FLOP timing.

        Records the end event, waits for it, and measures the time since the
        start event.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: elapsed time since the start event was recorded, as
                   reported by the end event's time_since (pycuda documents
                   this value in milliseconds).
        """
        stop_event = self.end
        stop_event.record()
        stop_event.synchronize()
        elapsed = stop_event.time_since(self.start)
        return elapsed

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Draw uniform random numbers with numpy on the host and convert them
        to a GPUTensor.

        Arguments:
            low (float): lower bound passed to np.random.uniform.
            high (float): upper bound passed to np.random.uniform.
            shape: shape of the sample.
            dtype: element type of the resulting tensor; passing None is
                   likely to fail.
            persist_values: accepted but not used by this method.
            name: name given to the resulting tensor.
            allocator: accepted but not used by this method.

        Returns:
            GPUTensor: tensor holding the samples
        """
        host_samples = np.random.uniform(low, high, shape)
        return self.ng.array(host_samples, dtype=dtype, name=name)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Draw Gaussian/Normal random numbers with numpy on the host and convert
        them to a GPUTensor.

        Arguments:
            loc (float): mean passed to np.random.normal.
            scale (float): standard deviation passed to np.random.normal.
            size: shape of the sample.
            dtype: element type of the resulting tensor.
            persist_values: accepted but not used by this method.
            name: name given to the resulting tensor.
            allocator: accepted but not used by this method.

        Returns:
            GPUTensor: tensor holding the samples
        """
        host_samples = np.random.normal(loc, scale, size)
        return self.ng.array(host_samples, dtype=dtype, name=name)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward pass of a fully connected layer: out = weights . inputs.
        The result is the layer's pre-activations, ready for transformation
        by an activation function.

        Arguments:
            out (GPUTensor): destination for the forward propagated results.
            inputs (GPUTensor): either the dataset input values (first layer)
                                or the outputs of the previous layer.
            weights (GPUTensor): weight coefficients of this layer.
            layer (Layer): the layer object; not used by this method.
        """
        backend = self.ng
        backend.dot(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward pass of a fully connected layer:
        out = transpose(weights) . deltas.

        Arguments:
            out (GPUTensor): destination for the backward propagated errors.
            weights (GPUTensor): weight coefficients of this layer.
            deltas (GPUTensor): error values of this layer.
            layer (Layer): the layer object; not used by this method.
        """
        weights_t = weights.T
        self.ng.dot(weights_t, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Gradient of a fully connected layer's weights:
        out = deltas . transpose(inputs).

        Arguments:
            out (GPUTensor): destination for the updated gradient value.
            inputs (GPUTensor): either the dataset input values (first layer)
                                or the outputs of the previous layer.
            deltas (GPUTensor): error values of this layer.
            layer (Layer): the layer object; not used by this method.
        """
        inputs_t = inputs.T
        self.ng.dot(deltas, inputs_t, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward pass of a convolutional layer, producing the output
        pre-activations (ready for transformation by an activation function).

        Only out, inputs, weights and fpropbuf are forwarded to NervanaGPU;
        the remaining arguments are accepted but not read by this method.

        Arguments:
            out (GPUTensor): destination for the forward propagated results.
            inputs (GPUTensor): either the dataset input values (first layer)
                                or the outputs of the previous layer.
            weights (GPUTensor): weight coefficients of this layer.
            ofmshape (tuple): dimensions of each output feature map.
            ofmsize (int): total size of each output feature map.
            ofmlocs (GPUTensor): indices locating each element of each output
                                 feature map stored in out.
            ifmshape (tuple): dimensions of each input feature map.
            links (GPUTensor): input receptive field indices.
            nifm (int): total number of input feature maps.
            padding (int): number of additional elements to include along
                           each dimension of each local receptive field.
            stride (int): number of neurons to shift the filter at each step.
            ngroups (int): number of groups.
            fpropbuf: handed to NervanaGPU.fprop_conv as its `layer` argument
                      (presumably the convolution layer description --
                      confirm against the caller).
            local (bool, optional): whether to do local filtering (True) or
                                    convolution (False, the default).
        """
        # Dimension legend used by the NervanaGPU convolution kernels:
        #   N: number of images in mini-batch
        #   C: number of input feature maps
        #   K: number of output feature maps
        #   D, H, W: depth, height, width of input image
        #   T, R, S: depth, height, width of filter kernel
        conv_args = dict(layer=fpropbuf, I=inputs, F=weights, O=out,
                         alpha=1.0, repeat=1)
        self.ng.fprop_conv(**conv_args)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward pass of the error through a convolutional layer.

        Only out, weights, deltas and bpropbuf are forwarded to NervanaGPU;
        the remaining arguments are accepted but not read by this method.

        Arguments:
            out (GPUTensor): destination for the backward propagated errors.
            weights (GPUTensor): weight coefficients of this layer.
            deltas (GPUTensor): error values of this layer.
            ofmshape (tuple): dimensions of each output feature map.
            ofmsize (int): total size of each output feature map.
            ofmlocs (GPUTensor): indices locating each element of each output
                                 feature map stored in out.
            ifmshape (tuple): dimensions of each input feature map.
            links (GPUTensor): input receptive field indices.
            padding (int): number of additional elements to include along
                           each dimension of each local receptive field.
            stride (int): number of neurons to shift the filter at each step.
            nifm (int): total number of input feature maps.
            ngroups (int): number of groups.
            bpropbuf: handed to NervanaGPU.bprop_conv as its `layer` argument
                      (presumably the convolution layer description --
                      confirm against the caller).
            local (bool, optional): whether to do local filtering (True) or
                                    convolution (False, the default).
        """
        conv_args = dict(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                         alpha=1.0, repeat=1)
        self.ng.bprop_conv(**conv_args)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Gradient of a convolutional layer's weights.

        Only out, inputs, deltas and updatebuf are forwarded to NervanaGPU;
        the remaining arguments are accepted but not read by this method.

        Arguments:
            out (GPUTensor): destination for the updated gradient value.
            inputs (GPUTensor): either the dataset input values (first layer)
                                or the outputs of the previous layer.
            weights (GPUTensor): weight coefficients of this layer.
            deltas (GPUTensor): error values of this layer.
            ofmshape (tuple): dimensions of each output feature map.
            ofmsize (int): total size of each output feature map.
            ofmlocs (GPUTensor): indices locating each element of each output
                                 feature map stored in out.
            ifmshape (tuple): dimensions of each input feature map.
            links (GPUTensor): input receptive field indices.
            nifm (int): total number of input feature maps.
            padding (int): number of additional elements to include along
                           each dimension of each local receptive field.
            stride (int): number of neurons to shift the filter at each step.
            ngroups (int): number of groups.
            fwidth (int): filter width.
            updatebuf: handed to NervanaGPU.update_conv as its `layer`
                       argument (presumably the convolution layer
                       description -- confirm against the caller).
            local (bool, optional): whether to do local filtering (True) or
                                    convolution (False, the default).
            layer (Layer): the layer object.
        """
        conv_args = dict(layer=updatebuf, I=inputs, E=deltas, grad_F=out,
                         alpha=1.0, repeat=1)
        self.ng.update_conv(**conv_args)

    def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape,
                   ifmshape, links, nifm, padding, stride, fpropbuf):
        """
        Forward propagate the inputs of a Pooling network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Only out, inputs, op and fpropbuf are read by this method; the
        remaining arguments are accepted but ignored.

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            op (string): The type of pooling operation to apply (matched
                         case-insensitively).  Only "max" is implemented
                         here.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            fpropbuf: Passed to NervanaGPU.fprop_pool as its `layer` argument.

        Raises:
            AttributeError: if op is anything other than "max".
        """
        op = op.lower()
        if op != "max":
            # Interpolate op into the message here; handing it to the
            # exception as a second argument leaves "%s" unexpanded.
            raise AttributeError("unexpected pooling op type: %s" % op)
        self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1)

    def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize,
                   ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding,
                   stride, bpropbuf):
        """
        Backward propagate the error through a pooling network layer.

        Only out, inputs, deltas, op and bpropbuf are read by this method;
        the remaining arguments are accepted but ignored.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            fouts (GPUTensor): Forward propagated outputs from the previous
                               layer.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            op (string): The type of pooling operation to apply (matched
                         case-insensitively).  Only "max" is implemented
                         here.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                              each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            fpsize (int): The size of each filter.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            bpropbuf: Passed to NervanaGPU.bprop_pool as its `layer` argument.

        Raises:
            AttributeError: if op is anything other than "max".
        """
        op = op.lower()
        if op != "max":
            # Interpolate op into the message here; handing it to the
            # exception as a second argument leaves "%s" unexpanded.
            raise AttributeError("unexpected pooling op type: %s" % op)
        self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas, grad_I=out,
                           repeat=1)

    def logistic(self, x, out):
        """
        Apply the logistic sigmoid nonlinearity, 1/(1+exp(-x)).

        Arguments:
            x (GPUTensor): tensor to transform
            out (GPUTensor): destination for the result

        Returns:
            GPUTensor: the same object that was passed in as out
        """
        backend = self.ng
        backend.sig(x, out=out)
        return out

    def rectlin(self, x, out):
        """
        Apply the rectified linear nonlinearity, max(x, 0).

        Arguments:
            x (GPUTensor): tensor to transform
            out (GPUTensor): destination for the result

        Returns:
            GPUTensor: the same object that was passed in as out
        """
        backend = self.ng
        backend.maximum(x, 0., out=out)
        return out

    def rectleaky(self, x, slope, out):
        """
        Leaky rectified linear nonlinearity, max(x, x * slope)

        Arguments:
            x (GPUTensor): Input tensor
            slope (float): factor applied to x for the second argument of
                           the maximum.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        out[:] = self.ng.maximum(x, x*slope)
        # Return out like the other activation functions of this backend.
        return out

    def rectleaky_derivative(self, x, slope, out):
        """
        Derivative of the leaky rectified linear nonlinearity: 1 where
        x > 0, slope elsewhere.

        Arguments:
            x (GPUTensor): Input tensor
            slope (float): derivative value used where x is not positive.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        # greater(x, 0) is 1 for positive entries and 0 otherwise, so the
        # expression evaluates to 1.0 or slope respectively.
        out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope
        # Return out like the other activation functions of this backend.
        return out

    def sum(self, tsr, axes, out):
        """
        Sum the elements of tsr along the given axis.

        Arguments:
            tsr  (GPUTensor): tensor to reduce
            axes (int): axis along which the reduction is performed. If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0] * shape[1] rows) and reduced along axis 0,
                        i.e. over both dimensions.
            out (GPUTensor): destination for the result

        Returns:
            GPUTensor: the same object that was passed in as out
        """
        if axes is None:
            n_elems = tsr.shape[0]*tsr.shape[1]
            source, axis = tsr.reshape(n_elems, 1), 0
        else:
            source, axis = tsr, axes
        self.ng.sum(source, axis=axis, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Arithmetic mean of the elements of tsr along the given axis.

        Arguments:
            tsr (GPUTensor): tensor to reduce
            axes (int): axis along which the reduction is performed. If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0] * shape[1] rows) and reduced along axis 0,
                        i.e. over both dimensions.
            out (GPUTensor): destination for the result

        Returns:
            GPUTensor: the same object that was passed in as out
        """
        if axes is None:
            n_elems = tsr.shape[0]*tsr.shape[1]
            source, axis = tsr.reshape(n_elems, 1), 0
        else:
            source, axis = tsr, axes
        self.ng.mean(source, axis=axis, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Minimum of the elements of tsr along the given axis.

        Arguments:
            tsr (GPUTensor): tensor to reduce
            axes (int): axis along which the reduction is performed. If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0] * shape[1] rows) and reduced along axis 0,
                        i.e. over both dimensions.
            out (GPUTensor): destination for the result

        Returns:
            GPUTensor: the same object that was passed in as out
        """
        if axes is None:
            n_elems = tsr.shape[0]*tsr.shape[1]
            source, axis = tsr.reshape(n_elems, 1), 0
        else:
            source, axis = tsr, axes
        self.ng.min(source, axis=axis, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Maximum of the elements of tsr along the given axis.

        Arguments:
            tsr (GPUTensor): tensor to reduce
            axes (int): axis along which the reduction is performed. If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0] * shape[1] rows) and reduced along axis 0,
                        i.e. over both dimensions.
            out (GPUTensor): destination for the result

        Returns:
            GPUTensor: the same object that was passed in as out
        """
        if axes is None:
            n_elems = tsr.shape[0]*tsr.shape[1]
            source, axis = tsr.reshape(n_elems, 1), 0
        else:
            source, axis = tsr, axes
        self.ng.max(source, axis=axis, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Variance of the elements of tsr along the given axes, computed as
        the mean of the squared deviation from the supplied mean.

        Arguments:
            tsr (GPUTensor): the tensor whose variance is computed
            axes (int, list, optional): the dimension(s) along which to
                                        take the variance; forwarded to the
                                        backend's mean as `axis`.
            out (GPUTensor): destination for the result.
            mean (GPUTensor): tensor holding the mean of tsr.  Although it
                              defaults to None, it must be supplied.

        Returns:
            GPUTensor: the same object that was passed in as out

        Raises:
            ValueError: if mean is None.
        """
        if mean is None:
            logger.error("GPUTensor requires mean to be specified.")
            raise ValueError("mean not specified")
        squared_deviation = self.ng.square(tsr-mean)
        self.ng.mean(squared_deviation, axis=axes, out=out)
        return out

    def fabs(self, x, out):
        """
        Compute the element-wise absolute value of a tensor.

        Arguments:
            x (GPUTensor): tensor to read from.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.fabs(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Compute the element-wise square root of a tensor.

        Arguments:
            x (GPUTensor): tensor to read from.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.sqrt(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Create a new GPUTensor of the given shape filled with zeros.

        Arguments:
            shape (tuple): shape of the tensor to allocate.
            dtype (dtype, optional): element type of the new tensor.
            persist_values (bool, optional): part of the signature but not
                                             forwarded to the underlying
                                             allocation call.

        Returns:
            GPUTensor: the newly allocated tensor.
        """
        tensor = self.ng.zeros(shape, dtype=dtype)
        return tensor

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Create a new GPUTensor of the given shape filled with ones.

        Arguments:
            shape (tuple): shape of the tensor to allocate.
            dtype (dtype, optional): element type of the new tensor.
            persist_values (bool, optional): part of the signature but not
                                             forwarded to the underlying
                                             allocation call.

        Returns:
            GPUTensor: the newly allocated tensor.
        """
        tensor = self.ng.ones(shape, dtype=dtype)
        return tensor

    def empty(self, shape, dtype=default_dtype, persist_values=True):
        """
        Create a new GPUTensor of the given shape without initializing it.

        Arguments:
            shape (tuple): shape of the tensor to allocate.
            dtype (dtype, optional): element type of the new tensor.
            persist_values (bool, optional): part of the signature but not
                                             forwarded to the underlying
                                             allocation call.

        Returns:
            GPUTensor: the newly allocated tensor.
        """
        tensor = self.ng.empty(shape, dtype=dtype)
        return tensor

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Create a new GPUTensor populated from host data.

        Arguments:
            ary (ndarray): numpy array holding the source data.
            dtype (dtype, optional): element type of the new tensor.
            persist_values (bool, optional): part of the signature but not
                                             forwarded to the underlying
                                             call.
            name (string): name given to the new GPUTensor.
            allocator (pycuda): pycuda memory allocator.  Part of the
                                signature but not forwarded to the
                                underlying call.

        Returns:
            GPUTensor: the newly allocated tensor.
        """
        tensor = self.ng.array(ary, dtype=dtype, name=name)
        return tensor

    def add(self, left, right, out):
        """
        Add two operands element by element.

        Arguments:
            left (GPUTensor, numeric): first operand.
            right (GPUTensor, numeric): second operand.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.add(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Subtract right from left element by element.

        Arguments:
            left (GPUTensor, numeric): operand to subtract from.
            right (GPUTensor, numeric): operand being subtracted.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.subtract(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Multiply two operands element by element.

        Arguments:
            left (GPUTensor, numeric): first operand.
            right (GPUTensor, numeric): second operand.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.multiply(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Divide left by right element by element.

        Arguments:
            left (GPUTensor, numeric): numerator operand.
            right (GPUTensor, numeric): denominator operand.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.divide(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Test left > right element by element.

        Arguments:
            left (GPUTensor, numeric): left-hand side of the comparison.
            right (GPUTensor, numeric): right-hand side of the comparison.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.greater(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Test left == right element by element and store the outcome in out.
        The operands are expected to share a shape or be broadcastable to
        one.

        Arguments:
            left (GPUTensor, numeric): left-hand side of the comparison.
            right (GPUTensor, numeric): right-hand side of the comparison.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.equal(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Test left != right element by element.

        Arguments:
            left (GPUTensor, numeric): left-hand side of the comparison.
            right (GPUTensor, numeric): right-hand side of the comparison.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.not_equal(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Limit every element of a tensor to the range [a_min, a_max].

        Arguments:
            a (GPUTensor): tensor to read from.
            a_min (float): lower bound.
            a_max (float): upper bound.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.clip(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Compute the element-wise natural (base-e) logarithm of a tensor.

        Arguments:
            a (GPUTensor): tensor to read from.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Compute the element-wise hyperbolic tangent of a tensor.

        Arguments:
            a (GPUTensor): tensor to read from.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Find the indices of the maximal values of a tensor along an axis.

        Arguments:
            a (GPUTensor): tensor in which to locate the maxima.
            out (GPUTensor): tensor that receives the indices.  It should
                             already have the appropriate type and shape.
            axis (int): dimension along which to search (default 0).

        Returns:
            GPUTensor: the same object passed in as out.
        """
        backend = self.ng
        backend.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity along axis 0.

        The maximum over axis 0 is subtracted from x before exponentiating,
        and the exponentials are then divided by their sum over axis 0:
        exp(x - max(x)) / sum_i exp(x_i - max(x_i)).

        Arguments:
            x (GPUTensor): tensor to read from.
            out (GPUTensor): tensor that receives the result.

        Returns:
            GPUTensor: the same object passed in as out.
        """
        ng = self.ng
        # The shifted exponential is deliberately built twice (once for the
        # normalizer, once for the numerator), in that order.
        normalizer = ng.sum(ng.exp(x - ng.max(x, axis=0)), axis=0)
        inv_normalizer = ng.reciprocal(normalizer)
        numerator = ng.exp(x - ng.max(x, axis=0))
        out[:] = (inv_normalizer * numerator)
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        This backend does not compute the gradient; the method always
        raises.

        Arguments:
            y (GPUTensor): input tensor (unused).
            err (GPUTensor): backpropagated error (unused).
            out (GPUTensor): where the result would be stored (unused).

        Raises:
            NotImplementedError: always.
        """
        # The unconditional raise made the former trailing `return out`
        # unreachable, so it has been dropped.
        raise NotImplementedError("Softmax gradient should use shortcut")

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Fill a tensor with a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): tensor that receives the mask.
            keepthresh (float): forwarded as the keep argument of the
                                dropout call (documented upstream as the
                                fraction of ones).
            dtype (dtype, optional): part of the signature but not used.
        """
        backend = self.ng
        backend.dropout(keep=keepthresh, out=tsr)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Apply one gradient descent step with momentum, in place.

        Arguments:
            ps_item (GPUTensor): parameters (e.g. a weight matrix); updated.
            us_item (GPUTensor): gradient with respect to the parameters.
            vs_item (GPUTensor): velocity; updated.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch number; not used by this implementation.

        The new velocity is written to vs_item first and then added into
        ps_item.
        """
        decayed_velocity = vs_item * momentum_coef
        gradient_step = us_item * learning_rate
        vs_item[:] = decayed_velocity - gradient_step
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Apply one gradient descent step with momentum and weight decay, in
        place.

        Arguments:
            ps_item (GPUTensor): parameters (e.g. a weight matrix); updated.
            us_item (GPUTensor): gradient with respect to the parameters;
                                 only read.
            vs_item (GPUTensor): velocity; updated.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch number; not used by this implementation.

        Outputs:
            vs_item, the updated velocity.
            ps_item, the updated weights.
        """
        decayed_velocity = vs_item * momentum_coef
        gradient_step = us_item * learning_rate
        weight_decay_step = learning_rate * wd * ps_item
        vs_item[:] = decayed_velocity - gradient_step - weight_decay_step
        ps_item[:] = ps_item + vs_item

    def exp_mavg(self, mavg, newval, rho):
        """
        Fold a new sample into an exponential moving average, in place.

        Arguments:
            mavg:  running value of the moving average (updated)
            newval:  new sample to blend in
            rho:  weight kept on the previous average; the new sample gets
                  weight (1 - rho)
        """
        retained = rho * mavg
        incoming = (1.0 - rho) * newval
        mavg[:] = retained + incoming

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item, ss_item,
                   rho, epsilon):
        """
        AdaDelta update rule (Zeiler, http://arxiv.org/abs/1212.5701),
        applied in place.

        Arguments:
            ps_item: weight / parameter (updated)
            us_item: gradient update (only read)
            gs_item: running average of the squared gradient (updated)
            ds_item: running average of the squared step (updated)
            ls_item: receives the step that is added to the parameters
                     (updated)
            ss_item: scratch space; not used by this implementation
            rho: decay constant of the running averages
            epsilon: small positive constant for numerical stability
        """
        fresh_weight = 1.0 - rho

        # Running average of the squared gradient.
        gs_item[:] = gs_item * rho + fresh_weight * us_item * us_item

        # Step: negative gradient scaled by the ratio of the two running
        # averages.
        scale = self.ng.sqrt((ds_item + epsilon) / (gs_item + epsilon))
        ls_item[:] = scale * (-1.0) * us_item

        # Running average of the squared step.
        ds_item[:] = ds_item * rho + fresh_weight * ls_item * ls_item

        # Apply the step to the parameters.
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity, scratch_space,
                   gamma, epsilon, learning_rate, momentum_coef):
        """
        RMSProp-style parameter update, applied in place.

        Arguments:
            params: parameters (updated)
            updates: gradient update (only read)
            run_squares: running average of the squared updates (updated)
            velocity: velocity; only touched when momentum_coef is non-zero
            scratch_space: not used by this implementation
            gamma: decay constant of the running average
            epsilon: small constant added to the root of the running average
            learning_rate: learning rate
            momentum_coef: momentum coefficient; 0 disables momentum
        """
        # Refresh the running average of the squared gradient first; the
        # step below is scaled by its updated value.
        run_squares[:] = gamma * run_squares + (1. - gamma) * updates * updates

        # Gradient scaled by lr / (rms(grad) + epsilon).
        inverse_rms = self.ng.reciprocal(self.ng.sqrt(run_squares) + epsilon)
        scaled_step = learning_rate * updates * inverse_rms

        if momentum_coef == 0:
            params[:] = params - scaled_step
            return

        velocity[:] = velocity * momentum_coef - scaled_step
        params[:] = params + velocity

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xhat,
                          xmean, xvar, gmean, gvar, rho, out):
        """
        Batch normalization forward pass.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xhat: normalized input (updated)
            xmean: mean of inputs over axis 1 (updated)
            xvar: first holds the variance of inputs over axis 1, then is
                  overwritten with 1 / sqrt(variance + eps) (updated)
            gmean: running average of xmean (updated)
            gvar: running average of the variance (updated)
            rho: weight kept on the previous running averages
            out: normalized and rescaled input (updated)

        Returns:
            reference to out
        """
        ng = self.ng
        fresh_weight = 1.0 - rho

        # Statistics over axis 1, then fold them into the running averages
        # while xvar still holds the raw variance.
        xvar[:] = ng.var(inputs, axis=1)
        xmean[:] = ng.mean(inputs, axis=1)
        gmean[:] = gmean * rho + fresh_weight * xmean
        gvar[:] = gvar * rho + fresh_weight * xvar

        # From here on xvar holds the inverse standard deviation.
        xvar[:] = ng.reciprocal(ng.sqrt(xvar + eps))
        centered = inputs - xmean
        xhat[:] = xvar * centered
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Batch normalization backward pass, performed as four in-place
        tensor assignments.

        Arguments:
            xhat: normalized input data (overwritten with an intermediate)
            error: backpropagated deltas (updated)
            xvar: precomputed scaling term, multiplied into the result
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        ng = self.ng
        n_cols = float(xhat.shape[1])

        # Parameter gradients: sums over axis 1.
        gamma_updates[:] = ng.sum(xhat * error, axis=1)
        beta_updates[:] = ng.sum(error, axis=1)

        # xhat is reused as a buffer for the term subtracted from error.
        xhat[:] = (xhat * gamma_updates + beta_updates) / n_cols
        error[:] = xvar * gamma * (error - xhat)

Example #9

Show file

File: gpu.py Project: neuroidss/neon

class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.
    Note that some functions defined in the generic Backend class such as
    cross-map pooling and normalization and are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        """
        Create a CUDA context on the requested device and set up the
        NervanaGPU instance used by every other method.

        Arguments:
            rng_seed: seed stored on the instance and picked up by rng_init.
            stochastic_round (bool, optional): forwarded to NervanaGPU.
            device_id (int, optional): CUDA device ordinal.  None is treated
                                       as device 0.
        """
        import atexit
        import pycuda.driver as drv
        global ctx

        # Resolve the None fallback before the driver sees the value;
        # previously the raw device_id reached drv.Device() and the
        # fallback was only applied afterwards.
        self.device_id = device_id if device_id is not None else 0

        drv.init()
        ctx = drv.Device(self.device_id).make_context()
        # Pop the context at interpreter exit.
        atexit.register(ctx.pop)
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        self.rng_seed = rng_seed
        self.rng_init()

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this class.

        Returns:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        if hasattr(self, 'mem_pool') and self.mem_pool is not None:
            self.mem_pool_pickle = {'shape': self.mem_pool.shape,
                                    'dtype': np.float32}
            self.mem_pool = None

        return self.__dict__

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        Arguments:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        self.__dict__.update(state)
        self.mem_pool = self.ng.empty(self.mem_pool_pickle['shape'],
                                      dtype=self.mem_pool_pickle['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocate the memory pool used for temporary storage and keep it on
        the instance as mem_pool.

        Arguments:
            shape (tuple): shape of the pool to allocate.
            dtype (dtype, optional): element type of the pool.
        """
        pool = self.ng.empty(shape, dtype=dtype)
        self.mem_pool = pool

    def alloc_host_mem(self, shape, dtype=default_dtype):
        """
        Allocate an uninitialized, C-ordered page-locked host array.

        Arguments:
            shape (tuple): shape of the array to allocate.
            dtype (dtype, optional): element type of the array.

        Returns:
            The array returned by pycuda's pagelocked_empty.
        """
        host_buffer = drv.pagelocked_empty(shape, dtype, order="C",
                                           mem_flags=0)
        return host_buffer

    def create_stream(self):
        """
        Create a new pycuda stream.

        Returns:
            A freshly constructed drv.Stream.
        """
        stream = drv.Stream()
        return stream

    def synchronize(self):
        """
        Placeholder: this backend performs no work here.
        """
        return None

    def async_copy(self, dest, src, stream=None):
        """
        Start an asynchronous host-to-device copy.

        Arguments:
            dest: destination object; its gpudata attribute is the copy
                  target.
            src: host-side source buffer.
            stream (optional): stream handed to the copy call.
        """
        device_target = dest.gpudata
        drv.memcpy_htod_async(device_target, src, stream)

    def rng_init(self):
        """
        Initialize and seed the pseudo random number generator.  Random
        numbers are generated on the host using numpy, then transferred to
        the device.

        The seed is the instance's rng_seed when that attribute has been
        set; otherwise numpy is seeded with None.
        """
        has_seed = 'rng_seed' in self.__dict__
        seed = self.rng_seed if has_seed else None
        if has_seed:
            logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Set up FLOP timing: create the start/end events and wrap the named
        calls with a FlopsDecorator that records elapsed time and number of
        operations.

        Arguments:
           decorate_fc (list): names of the fully connected layer
                               forward/backward/update calls to time.
           decorate_conv (list): names of the convolutional layer
                                 forward/backward/update calls to time.
           decorate_ew (list): names of the element-wise calls to time.

        Notes:
            Must be called before the first timing start/finish call, since
            those use the events created here.
        """
        self.start, self.end = drv.Event(), drv.Event()
        timer = FlopsDecorator(self)
        self.flop_timer = timer
        timer.decorate(decorate_fc=decorate_fc,
                       decorate_conv=decorate_conv,
                       decorate_ew=decorate_ew)

    def flop_timinig_start(self):
        """
        Start a new FLOP timer by recording the start event.

        NOTE(review): the method name is spelled "timinig", while the
        docstrings of flop_timing_init and flop_timing_finish refer to
        flop_timing_start -- confirm which name callers use before renaming.

        Returns:
            Whatever the start event's record() call returns (not used).
        """
        start_event = self.start
        return start_event.record()

    def flop_timing_finish(self, start_time):
        """
        Complete the current FLOP timing: record the end event, wait for it,
        and measure the time elapsed since the start event.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: the value of end.time_since(start).
                   NOTE(review): this was previously documented as seconds,
                   but PyCUDA documents Event.time_since as returning
                   milliseconds -- confirm the unit callers expect.
        """
        end_event = self.end
        end_event.record()
        end_event.synchronize()
        elapsed = end_event.time_since(self.start)
        return elapsed

    def uniform(self, low=0.0, high=1.0, size=1, dtype=default_dtype,
                persist_values=True, name=None):
        """
        Draw uniform random samples on the host with numpy and copy them
        into a new GPUTensor.

        Arguments:
            low (float): lower bound passed to numpy.
            high (float): upper bound passed to numpy.
            size: size argument passed to numpy.
            dtype (dtype, optional): element type of the new tensor.
            persist_values (bool, optional): not used by this
                                             implementation.
            name (string): name given to the new GPUTensor.

        Returns:
            GPUTensor: tensor holding the samples.
        """
        samples = np.random.uniform(low, high, size)
        return self.ng.array(samples, dtype=dtype, name=name)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None):
        """
        Draw Gaussian/normal random samples on the host with numpy and copy
        them into a new GPUTensor.

        Arguments:
            loc (float): mean passed to numpy.
            scale (float): standard deviation passed to numpy.
            size: size argument passed to numpy.
            dtype (dtype, optional): element type of the new tensor.
            persist_values (bool, optional): not used by this
                                             implementation.
            name (string): name given to the new GPUTensor.

        Returns:
            GPUTensor: tensor holding the samples.
        """
        samples = np.random.normal(loc, scale, size)
        return self.ng.array(samples, dtype=dtype, name=name)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward pass of a fully connected layer: the product of weights and
        inputs is written to out, giving the pre-activations.

        Arguments:
            out (GPUTensor): receives the forward propagated results.
            inputs (GPUTensor): dataset input values (first layer) or the
                                outputs of the previous layer.
            weights (GPUTensor): weight coefficients of this layer.
            layer (Layer): the layer object; not used here.
        """
        backend = self.ng
        backend.dot(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward pass of a fully connected layer: the product of the
        transposed weights and deltas is written to out.

        Arguments:
            out (GPUTensor): receives the backward propagated errors.
            weights (GPUTensor): weight coefficients of this layer.
            deltas (GPUTensor): error values of this layer.
            layer (Layer): the layer object; not used here.
        """
        transposed_weights = weights.T
        self.ng.dot(transposed_weights, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Weight gradient of a fully connected layer: the product of deltas
        and the transposed inputs is written to out.

        Arguments:
            out (GPUTensor): receives the gradient.
            inputs (GPUTensor): dataset input values (first layer) or the
                                outputs of the previous layer.
            deltas (GPUTensor): error values of this layer.
            layer (Layer): the layer object; not used here.
        """
        transposed_inputs = inputs.T
        self.ng.dot(deltas, transposed_inputs, out)

    def update_fc_bias(self, err, out):
        """
        Bias gradient of a fully connected layer: the sum of err over
        axis 1 is written to out.

        Arguments:
            err (GPUTensor): backpropagated error.
            out (GPUTensor): receives the gradient.
        """
        backend = self.ng
        backend.sum(err, axis=1, out=out)

    def add_fc_bias(self, inputs, bias):
        """
        Add the bias of a fully connected layer to inputs, in place.

        Arguments:
            inputs (GPUTensor): tensor to increment; also receives the
                                result.
            bias (GPUTensor): amount to add.
        """
        backend = self.ng
        backend.add(inputs, bias, out=inputs)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward pass of a convolutional layer, producing output
        pre-activations.

        Only out, inputs, weights and fpropbuf are used by this backend;
        the remaining arguments are part of the signature but are not read
        here.

        Arguments:
            out (GPUTensor): receives the forward propagated results.
            inputs (GPUTensor): dataset input values (first layer) or the
                                outputs of the previous layer.
            weights (GPUTensor): weight coefficients of this layer.
            ofmshape (tuple): dimensions of each output feature map.
            ofmsize (int): total size of each output feature map.
            ofmlocs (GPUTensor): indices locating each element of each
                                 output feature map in out.
            ifmshape (tuple): dimensions of each input feature map.
            links (GPUTensor): input receptive field indices.
            nifm (int): total number of input feature maps.
            padding (int): padding along each dimension of the receptive
                           field.
            stride (int): filter shift per step.
            ngroups (int): number of groups.
            fpropbuf: handed to the NervanaGPU call as its layer argument.
            local (bool, optional): local filtering (True) or convolution
                                    (False, the default).
        """
        # Dimension naming used by the convolution kernels:
        #   N: Number of images in mini-batch
        #   C: Number of input feature maps
        #   K: Number of output feature maps
        #   D, H, W: Depth, height, width of input image
        #   T, R, S: Depth, height, width of filter kernel
        conv_args = dict(layer=fpropbuf, I=inputs, F=weights, O=out,
                         alpha=1.0, repeat=1)
        self.ng.fprop_conv(**conv_args)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward pass of a convolutional layer.

        Only out, weights, deltas and bpropbuf are used by this backend;
        the remaining arguments are part of the signature but are not read
        here.

        Arguments:
            out (GPUTensor): receives the backward propagated errors.
            weights (GPUTensor): weight coefficients of this layer.
            deltas (GPUTensor): error values of this layer.
            ofmshape (tuple): dimensions of each output feature map.
            ofmsize (int): total size of each output feature map.
            ofmlocs (GPUTensor): indices locating each element of each
                                 output feature map in out.
            ifmshape (tuple): dimensions of each input feature map.
            links (GPUTensor): input receptive field indices.
            padding (int): padding along each dimension of the receptive
                           field.
            stride (int): filter shift per step.
            nifm (int): total number of input feature maps.
            ngroups (int): number of groups.
            bpropbuf: handed to the NervanaGPU call as its layer argument.
            local (bool, optional): local filtering (True) or convolution
                                    (False, the default).
        """
        conv_args = dict(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                         alpha=1.0, repeat=1)
        self.ng.bprop_conv(**conv_args)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Weight gradient of a convolutional layer.

        Only out, inputs, deltas and updatebuf are used by this backend;
        the remaining arguments (including weights and layer) are part of
        the signature but are not read here.

        Arguments:
            out (GPUTensor): receives the gradient.
            inputs (GPUTensor): dataset input values (first layer) or the
                                outputs of the previous layer.
            weights (GPUTensor): weight coefficients of this layer.
            deltas (GPUTensor): error values of this layer.
            ofmshape (tuple): dimensions of each output feature map.
            ofmsize (int): total size of each output feature map.
            ofmlocs (GPUTensor): indices locating each element of each
                                 output feature map in out.
            ifmshape (tuple): dimensions of each input feature map.
            links (GPUTensor): input receptive field indices.
            nifm (int): total number of input feature maps.
            padding (int): padding along each dimension of the receptive
                           field.
            stride (int): filter shift per step.
            ngroups (int): number of groups.
            fwidth (int): filter width.
            updatebuf: handed to the NervanaGPU call as its layer argument.
            local (bool, optional): local filtering (True) or convolution
                                    (False, the default).
            layer (Layer): the layer object.
        """
        conv_args = dict(layer=updatebuf, I=inputs, E=deltas, grad_F=out,
                         alpha=1.0, repeat=1)
        self.ng.update_conv(**conv_args)

    def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape,
                   ifmshape, links, nifm, padding, stride, fpropbuf):
        """
        Forward pass of a pooling layer.

        Only out, inputs, op and fpropbuf are used by this backend; the
        remaining arguments are part of the signature but are not read
        here.

        Arguments:
            out (GPUTensor): receives the forward propagated results.
            inputs (GPUTensor): dataset input values (first layer) or the
                                outputs of the previous layer.
            op (string): the pooling operation, compared case-insensitively.
                         Only "max" is handled by this backend.
            ofmshape (tuple): dimensions of each output feature map.
            ofmsize (int): total size of each output feature map.
            ofmlocs (GPUTensor): indices locating each element of each
                                 output feature map in out.
            fshape (tuple): dimensions of each filter.
            ifmshape (tuple): dimensions of each input feature map.
            links (GPUTensor): input receptive field indices.
            nifm (int): total number of input feature maps.
            padding (int): padding along each dimension of the receptive
                           field.
            stride (int): filter shift per step.
            fpropbuf: handed to the NervanaGPU call as its layer argument.

        Raises:
            AttributeError: if op is anything other than "max".
        """
        op = op.lower()
        if op != "max":
            # Interpolate the op name into the message; it used to be passed
            # as a second exception argument, leaving "%s" unformatted.
            raise AttributeError("unexpected pooling op type: %s" % op)
        self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1)

    def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize,
                   ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding,
                   stride, bpropbuf):
        """
        Backward propagate the error through a pooling layer into out.

        Only max pooling is implemented here; any other op is rejected.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            fouts (GPUTensor): Unused by this implementation.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            op (string): The type of pooling operation to apply.  Compared
                         case-insensitively; only "max" is accepted.
            ofmshape (tuple): Unused by this implementation.
            ofmsize (int): Unused by this implementation.
            ofmlocs (GPUTensor): Unused by this implementation.
            fshape (tuple): Unused by this implementation.
            fpsize (int): Unused by this implementation.
            ifmshape (tuple): Unused by this implementation.
            links (GPUTensor): Unused by this implementation.
            nifm (int): Unused by this implementation.
            padding (int): Unused by this implementation.
            stride (int): Unused by this implementation.
            bpropbuf: Forwarded to the underlying bprop_pool call as its
                      ``layer`` argument.

        Raises:
            AttributeError: if op is anything other than "max".
        """
        op = op.lower()
        if op != "max":
            # Interpolate the op name; passing it as a second exception
            # argument would leave a literal "%s" in the message.
            raise AttributeError("unexpected pooling op type: %s" % op)
        self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas, grad_I=out,
                           repeat=1)

    def logistic(self, x, out):
        """
        Apply the logistic sigmoid, 1/(1+exp(-x)), and store it in out.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        sigmoid = self.ng.sig
        sigmoid(x, out=out)
        return out

    def transpose(self, untransposed, transposed):
        """
        Write the transpose of one tensor into another.

        Arguments:
            untransposed (GPUTensor): Source tensor; its ``.T`` view is read.
            transposed (GPUTensor): Destination, overwritten in full.
        """
        flipped = untransposed.T
        transposed[:] = flipped

    def crossent(self, y, t, partial, out, epsilon, doscale, ismulti=False):
        """
        Compute the cross entropy cost between outputs and targets.

        With ismulti the summed term is ``-t * log(y + epsilon)``; otherwise
        it is ``(t - 1) * log(1 - y + epsilon) - t * log(y + epsilon)``.

        Arguments:
            y (GPUTensor): Model outputs
            t (GPUTensor): Targets
            partial (GPUTensor): temporary buffer handed to the reduction
            out (GPUTensor): Storage for the cross entropy output
            epsilon (float): constant added inside each log for stability
            doscale (boolean): If True, the result is divided by y.shape[1]
            ismulti (boolean): If True, compute multi class cross_entropy

        Returns:
            GPUTensor: reference to out
        """
        # View the scratch buffer as a single column and keep only as many
        # rows as partial originally had.
        keep_rows = partial.shape[0]
        sumbuf = partial.reshape((partial.size, 1))[:keep_rows]
        reduce_all = self.ng.sum
        if ismulti:
            cost = -t * self.ng.log(y + epsilon)
        else:
            cost = ((t - 1) * self.ng.log(1 - y + epsilon) -
                    t * self.ng.log(y + epsilon))
        reduce_all(cost, axis=None, partial=sumbuf, out=out)
        if doscale:
            out[:] = out / y.shape[1]
        return out

    def logistic_compound(self, inputs, outputs):
        """
        Applies logistic function and its derivative to the dataset passed.

        Arguments:
            inputs (GPUTensor): Input data to be transformed. This also
                                 acts as storage for the output of the
                                 derivative function, so it is overwritten.
            outputs (GPUTensor): Storage for the transformed output.
        """
        # Apply the logistic function.
        outputs[:] = self.ng.sig(inputs)
        # The derivative of the logistic function s is s * (1 - s); it must be
        # built from the activations, not from the raw inputs.
        inputs[:] = (1.0 - outputs) * outputs

    def rectlin(self, x, out):
        """
        Rectified linear nonlinearity: elementwise maximum of x and zero.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        floor = 0.
        self.ng.maximum(x, floor, out=out)
        return out

    def rectlin_derivative(self, x, out):
        """
        Rectified linear derivative: elementwise test of x greater than zero.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        threshold = 0
        self.ng.greater(x, threshold, out=out)
        return out

    def rectleaky(self, x, slope, out):
        """
        Leaky rectified linear nonlinearity: elementwise max(x, x * slope).

        Arguments:
            x (GPUTensor): Input tensor
            slope (float): amount of gradient to apply when unit is not active
            out (GPUTensor): Output tensor, overwritten in full
        """
        leaked = x * slope
        out[:] = self.ng.maximum(x, leaked)

    def rectleaky_derivative(self, x, slope, out):
        """
        Leaky rectified linear derivative.

        Writes ``(x > 0) * (1 - slope) + slope`` into out, i.e. 1 where x is
        positive and slope elsewhere.

        Arguments:
            x (GPUTensor): Input tensor
            slope (float): amount of gradient to apply when unit is not active
            out (GPUTensor): Output tensor, overwritten in full
        """
        active = self.ng.greater(x, 0)
        out[:] = active * (1.0 - slope) + slope

    def sum(self, tsr, axes, out):
        """
        Sum the elements of a tensor along an axis, or over everything.

        Arguments:
            tsr  (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0] * shape[1] rows) and reduced along axis 0.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.sum(tsr, axis=axes, out=out)
            return out
        nrows, ncols = tsr.shape[0], tsr.shape[1]
        column = tsr.reshape(nrows * ncols, 1)
        self.ng.sum(column, axis=0, out=out)
        return out

    def norm(self, tsr, order=None, axis=None, out=None):
        r"""
        Calculates and returns the vector p-norms of the GPUTensor along the
        specified axis.  The p-norm is defined on a vector A as
        :math:`||A||_p = (\sum_i |A_i|^p)^{1/p}`.

        Arguments:
            tsr (GPUTensor): the GPUTensor on which to find the norms
            order (int, float): The order or p upon which the norm is
                                calculated.  inf gives the maximum absolute
                                value, -inf the minimum absolute value and 0
                                the count of non-zero entries.  Must be
                                given; None is rejected.
            axis (int): The axis along which to compute vector norms.  Must
                        be a non-negative int smaller than the tensor rank.
            out (GPUTensor): where to write the results to.  Must be
                             of the expected result shape.

        Returns:
            GPUTensor: p-norm of tsr along the specified axis (reference to
                       out).

        Raises:
            IndexError if invalid axis specified
            AttributeError if invalid order specified or out is missing

        See Also:
            `numpy.linalg.norm`
        """
        if not isinstance(axis, int) or axis < 0 or axis >= len(tsr.shape):
            raise IndexError("invalid axis value: %s" % (axis,))
        if not isinstance(order, (int, float)):
            raise AttributeError("invalid order value: %s" % (order,))
        if out is None:
            raise AttributeError("No output tensor specified")
        # Absolute values come from the backend op directly: the wrapper
        # self.fabs requires an explicit out tensor.  Reductions receive axis
        # and out by keyword, matching how they are called elsewhere here.
        if order == float('Inf'):
            self.ng.max(self.ng.fabs(tsr), axis=axis, out=out)
        elif order == float('-Inf'):
            self.ng.min(self.ng.fabs(tsr), axis=axis, out=out)
        elif order == 0:
            # Count non-zero entries: compare against zeros, then sum.
            tmp = self.zeros(tsr.shape)
            self.ng.not_equal(tsr, tmp, out=tmp)
            self.ng.sum(tmp, axis=axis, out=out)
        else:
            tmp = self.empty(tsr.shape)
            self.ng.power(self.ng.fabs(tsr), order, out=tmp)
            self.ng.sum(tmp, axis=axis, out=out)
            self.ng.power(out, (1.0 / order), out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Arithmetic mean of the elements along an axis, or over everything.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0] * shape[1] rows) and reduced along axis 0.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.mean(tsr, axis=axes, out=out)
            return out
        count = tsr.shape[0] * tsr.shape[1]
        column = tsr.reshape(count, 1)
        self.ng.mean(column, axis=0, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Minimum of the elements along an axis, or over everything.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0] * shape[1] rows) and reduced along axis 0.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.min(tsr, axis=axes, out=out)
            return out
        count = tsr.shape[0] * tsr.shape[1]
        column = tsr.reshape(count, 1)
        self.ng.min(column, axis=0, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Maximum of the elements along an axis, or over everything.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is reshaped to a single column
                        (shape[0] * shape[1] rows) and reduced along axis 0.
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        if axes is not None:
            self.ng.max(tsr, axis=axes, out=out)
            return out
        count = tsr.shape[0] * tsr.shape[1]
        column = tsr.reshape(count, 1)
        self.ng.max(column, axis=0, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Variance of the elements along the specified axes, computed as the
        mean of the squared deviations from a caller-supplied mean.

        Arguments:
            tsr (GPUTensor): the tensor on which to compute the variance
            axes (int, list, optional): the dimension(s) along which to
                                        take the variance; passed straight
                                        to the backend mean reduction.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the tensor containing mean of tsr.  Required
                              despite the None default.

        Returns:
            GPUTensor: reference to out

        Raises:
            ValueError: if mean is None (an error is logged first).
        """
        if mean is not None:
            squared_dev = self.ng.square(tsr - mean)
            self.ng.mean(squared_dev, axis=axes, out=out)
            return out
        logger.error("GPUTensor requires mean to be specified.")
        raise ValueError("mean not specified")

    def fabs(self, x, out):
        """
        Elementwise absolute value, written into out.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        absolute = self.ng.fabs
        absolute(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Elementwise square root, written into out.

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        root = self.ng.sqrt
        root(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with zeros.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; it is not
                                             forwarded to the underlying
                                             allocation call.

        Returns:
            GPUTensor: output
        """
        allocate = self.ng.zeros
        return allocate(shape, dtype=dtype)

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with ones.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; it is not
                                             forwarded to the underlying
                                             allocation call.

        Returns:
            GPUTensor: output
        """
        allocate = self.ng.ones
        return allocate(shape, dtype=dtype)

    def zeros_like(self, ary, dtype=default_dtype, persist_values=True,
                   name=None):
        """
        Allocate a zero-filled tensor with the same shape as ary.

        Delegates to this backend's zeros() using ary.shape.

        Arguments:
            ary (tensor object): Tensor to inherit the dimensions of.
            dtype (data-type, optional): If present, specifies the underlying
                                         type to employ for each element.
            persist_values (bool, optional): Passed through to zeros().
            name (string, optional): Accepted but not forwarded to zeros().

        Returns:
            Tensor: array object

        See Also:
            :py:func:`~neon.backends.backend.Backend.empty`,
            :py:func:`~neon.backends.backend.Backend.ones`,
            :py:func:`~neon.backends.backend.Backend.array`
        """
        target_shape = ary.shape
        return self.zeros(target_shape, dtype=dtype,
                          persist_values=persist_values)

    def empty_like(self, ary, dtype=default_dtype, persist_values=True,
                   name=None):
        """
        Allocate an uninitialized tensor with the same shape as ary.

        Delegates to this backend's empty() using ary.shape.

        Arguments:
            ary (tensor object): Tensor to inherit the dimensions of.
            dtype (data-type, optional): If present, specifies the underlying
                                         type to employ for each element.
            persist_values (bool, optional): Passed through to empty().
            name (string, optional): Passed through to empty().

        Returns:
            Tensor: array object

        See Also:
            :py:func:`~neon.backends.backend.Backend.empty`,
            :py:func:`~neon.backends.backend.Backend.ones`,
            :py:func:`~neon.backends.backend.Backend.array`
        """
        target_shape = ary.shape
        return self.empty(target_shape, dtype=dtype,
                          persist_values=persist_values, name=name)

    def empty(self, shape, dtype=default_dtype, persist_values=True,
              name=None):
        """
        Allocate a new GPUTensor without initializing its contents.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; it is not
                                             forwarded to the underlying
                                             allocation call.
            name (string, optional): Accepted but not forwarded either.

        Returns:
            GPUTensor: output
        """
        allocate = self.ng.empty
        return allocate(shape, dtype=dtype)

    def copy(self, ary):
        """
        Return a copy of ary.

        A new tensor is obtained from empty_like(ary) and then filled by
        calling its copy() method with ary as the source.

        Arguments:
            ary (GPUTensor): Tensor to duplicate.

        Returns:
            GPUTensor: the newly allocated copy
        """
        duplicate = self.empty_like(ary)
        duplicate.copy(ary)
        return duplicate

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Allocate a new GPUTensor and fill it with supplied numpy array.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype, optional): Optional datatype
            persist_values (bool, optional): Accepted for interface
                                             compatibility; it is not
                                             forwarded to the underlying
                                             call.
            name (string): Name for the GPUTensor
            allocator (pycuda): Pycuda memory allocator.  Accepted but not
                                forwarded to the underlying call.

        Returns:
            GPUTensor: output
        """
        build = self.ng.array
        return build(ary, dtype=dtype, name=name)

    def add(self, left, right, out):
        """
        Elementwise addition, left + right, written into out.

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.add
        backend_op(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Elementwise subtraction, left - right, written into out.

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.subtract
        backend_op(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Elementwise multiplication, left * right, written into out.

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.multiply
        backend_op(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Elementwise division, left / right, written into out.

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.divide
        backend_op(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Elementwise greater-than test, left > right, written into out.

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.greater
        backend_op(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Elementwise equality test, left == right, written into out.  Each
        operand is assumed to be the same shape (or broadcastable as such).

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.equal
        backend_op(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Elementwise inequality test, left != right, written into out.

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.not_equal
        backend_op(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Elementwise clipping of a into the range [a_min, a_max].

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): floor value.
            a_max (float): ceiling value.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.clip
        backend_op(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Elementwise base-e logarithm, written into out.

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.log
        backend_op(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Elementwise hyperbolic tangent, written into out.

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.tanh
        backend_op(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Calculates the indices of the maximal element value along the
        specified axis, storing them in out.

        Arguments:
            a (GPUTensor): The GPUTensor on which to find the maximum indices
            out (GPUTensor): Where to store the result.  Should be of the
                             appropriate type and expected shape
            axis (int): The dimension along which to find the maximum.
                        Defaults to 0 and is forwarded unchanged to the
                        backend argmax.

        Returns:
            GPUTensor: reference to out
        """
        backend_op = self.ng.argmax
        backend_op(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity along axis 0.

        Computes exp(x - max(x)) / sum(exp(x - max(x))), where both the
        maximum and the sum are taken along axis 0.

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        def shifted_exp():
            # Built afresh on every call so the numerator and denominator
            # each get their own expression, as in a single inline formula.
            return self.ng.exp(x - self.ng.max(x, axis=0))

        inv_total = self.ng.reciprocal(self.ng.sum(shifted_exp(), axis=0))
        out[:] = inv_total * shifted_exp()
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        Not implemented in this backend: the method always raises.

        Arguments:
            y (GPUTensor): input tensor.
            err (GPUTensor): backpropagated error.
            out (GPUTensor): where the result would be stored.

        Raises:
            NotImplementedError: always.
        """
        # The statement that used to follow this raise could never execute
        # and has been removed.
        raise NotImplementedError("Softmax gradient should use shortcut")

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Create a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): Output tensor, filled by the backend dropout op
            keepthresh (float): fraction of ones; forwarded as ``keep``
            dtype (dtype, optional): Accepted but not used here.
        """
        fill_mask = self.ng.dropout
        fill_mask(keep=keepthresh, out=tsr)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Perform gradient descent update with momentum.

        The velocity becomes ``vs * momentum_coef - us * learning_rate`` and
        is then added to the parameters.  Both vs_item and ps_item are
        updated in place.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch; not used by this implementation.
        """
        carried = vs_item * momentum_coef
        step = us_item * learning_rate
        vs_item[:] = carried - step
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Perform gradient descent update with momentum and weight decay.

        The velocity becomes
        ``vs * momentum_coef - us * learning_rate - ps * learning_rate * wd``
        and is then added to the parameters.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch; not used by this implementation.

        Outputs:
            ps_item, the updated weights.
            vs_item, the updated velocity.
        """
        carried = vs_item * momentum_coef
        step = us_item * learning_rate
        decay = ps_item * learning_rate * wd
        vs_item[:] = carried - step - decay
        ps_item[:] = ps_item + vs_item

    def exp_mavg(self, mavg, newval, rho):
        """
        Update an exponential moving average in place.

        Arguments:
            mavg:  The running value of the moving average; overwritten with
                   ``rho * mavg + (1 - rho) * newval``
            newval:  New sample to be added to the moving average
            rho:  Interpolation value (weight kept on the old average)
        """
        retained = rho * mavg
        incoming = (1.0 - rho) * newval
        mavg[:] = retained + incoming

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item, ss_item,
                   rho, epsilon):
        """
        Update rule for AdaDelta (Zeiler, http://arxiv.org/abs/1212.5701)

        Arguments:
            ps_item: weight / parameter (will be updated)
            us_item: update
            gs_item: expected value of Gradient Squared (will be updated)
            ds_item: expected value of Delta Squared (will be updated)
            ls_item: learning rate (will be updated); holds the step that is
                     added to the parameters
            ss_item: Scratch Space; not used by this implementation
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        fresh = 1.0 - rho

        # Running average of the squared gradient.
        gs_item[:] = gs_item * rho + fresh * us_item * us_item

        # Step: negative gradient scaled by the ratio of the two running RMS
        # values.
        rms_ratio = self.ng.sqrt((ds_item + epsilon) / (gs_item + epsilon))
        ls_item[:] = rms_ratio * (-1.0) * us_item

        # Running average of the squared step.
        ds_item[:] = ds_item * rho + fresh * ls_item * ls_item

        # Apply the step to the parameters.
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity, scratch_space,
                   gamma, epsilon, learning_rate, momentum_coef):
        """
        RMS-scaled gradient update, optionally with momentum.

        Arguments:
            params: parameters (will be updated)
            updates: gradient updates
            run_squares: running average of squared updates (will be updated)
            velocity: velocity tensor; only read and updated when
                      momentum_coef is non-zero
            scratch_space: not used by this implementation
            gamma: decay constant for the running average
            epsilon: small constant added to the root of the running average
            learning_rate: learning rate
            momentum_coef: momentum coefficient; 0 selects the plain update
        """
        # Refresh the running average of squared updates.
        run_squares[:] = gamma * run_squares + (1. - gamma) * updates * updates

        # Gradient scaled by lr / (sqrt(running average) + epsilon).
        scaled_step = learning_rate * updates * self.ng.reciprocal(
            self.ng.sqrt(run_squares) + epsilon)

        if momentum_coef != 0:
            velocity[:] = velocity * momentum_coef - scaled_step
            params[:] = params + velocity
        else:
            params[:] = params - scaled_step

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xhat,
                          xmean, xvar, gmean, gvar, rho, out):
        """
        Batch normalization forward pass.

        Statistics are taken along axis 1 of inputs.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xhat: normalized input (updated)
            xmean: mean of inputs along axis 1 (updated)
            xvar: variance along axis 1 (updated); on return it holds
                  1 / sqrt(variance + eps) instead
            gmean: running mean, blended with xmean using rho (updated)
            gvar: running variance, blended with xvar using rho (updated)
            rho: weight kept on the previous running statistics
            out: normalized and rescaled input (updated)

        Returns:
            reference to out
        """
        fresh = 1.0 - rho

        # Batch statistics.
        xvar[:] = self.ng.var(inputs, axis=1)
        xmean[:] = self.ng.mean(inputs, axis=1)

        # Fold them into the running statistics.
        gmean[:] = gmean * rho + fresh * xmean
        gvar[:] = gvar * rho + fresh * xvar

        # Reuse xvar as the inverse standard deviation.
        inv_std = self.ng.reciprocal(self.ng.sqrt(xvar + eps))
        xvar[:] = inv_std
        xhat[:] = xvar * (inputs - xmean)
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Batch normalization backward pass.

        Sums are taken along axis 1, and xhat.shape[1] is used as the
        divisor when averaging.

        Arguments:
            xhat: normalized input data (updated; used as scratch storage)
            error: backpropagated deltas (updated)
            xvar: precomputed variance term that scales the result
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        width = float(xhat.shape[1])

        # Parameter gradients.
        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)

        # Overwrite xhat with the averaged correction term, then rescale the
        # corrected deltas.
        xhat[:] = (xhat * gamma_updates + beta_updates) / width
        error[:] = xvar * gamma * (error - xhat)

Example #10

Show file

File: batched_dot_test.py Project: chagge/nervanagpu

    dimI = (X,C,N)
    dimO = (X,K,N)
else:
    dimI = (X,N,C)
    dimO = (X,N,K)

if ones:
    cpuI = np.ones(dimI, dtype=np.float32)
    cpuE = np.ones(dimO, dtype=np.float32)
    cpuW = np.ones(dimW, dtype=np.float32)
else:
    cpuI = np.random.uniform(-1.0, 1.0, dimI).astype(dtype).astype(np.float32)
    cpuE = np.random.uniform(-1.0, 1.0, dimO).astype(dtype).astype(np.float32)
    cpuW = np.random.uniform(-1.0, 1.0, dimW).astype(dtype).astype(np.float32)

devI = ng.array(cpuI, dtype=dtype)
devE = ng.array(cpuE, dtype=dtype)
devW = ng.array(cpuW, dtype=dtype)

devO = ng.empty(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)
devU = ng.empty(dimW, dtype=dtype)

if Nin:
    ng.batched_dot(devW,   devI,   devO, repeat=repeat, size=size) # fprop
    ng.batched_dot(devW.T, devE,   devB, repeat=repeat, size=size) # bprop
    ng.batched_dot(devE,   devI.T, devU, repeat=repeat, size=size) # update
else:
    ng.batched_dot(devI,   devW.T, devO, repeat=repeat, size=size) # fprop
    ng.batched_dot(devE,   devW,   devB, repeat=repeat, size=size) # bprop
    ng.batched_dot(devE.T, devI,   devU, repeat=repeat, size=size) # update

Example #11

Show file

File: testcudaconv.py Project: jcoreyes/nervanagpu

def run():
    """
    Check that `fprop_cuda_conv` (the "cc2" kernel) and `fprop_conv` (the
    native nervanagpu kernel) produce the same forward convolution output.

    Both kernels are run on the same random images and filters for one fixed
    layer configuration (N=128, C=3, K=64, 64x64 input, 8x8 filter, stride 4,
    no padding). Input and output sums are printed for each kernel, and the
    two outputs are then compared element-wise.

    Raises:
        AssertionError: if any pair of output elements differs by 1e-4 or
            more (NaN in either output also fails the check).
    """
    ng = NervanaGPU(stochastic_round=False)

    dt = np.float32
    # N: Number of images in mini-batch
    # C: Number of input feature maps
    # K: Number of output feature maps

    # D: Depth  of input image
    # H: Height of input image
    # W: Width  of input image

    # T: Depth  of filter kernel
    # R: Height of filter kernel
    # S: Width  of filter kernel
    #
    # * images:      (numColors, imgSizeY, imgSizeX, numImages) with stride given
    # * filters:     (numColors, filterPixels, numFilters) if conv
    # *              (numModules, numColors, filterPixels, numFilters) otherwise
    # *
    # * targets:     (numFilters, numModulesY, numModulesX, numImages)

    N = 128
    C = 3
    K = 64

    D = 1
    H = 64
    W = 64

    T = 1
    R = 8
    S = 8

    pad_h = pad_w = 0
    str_h = str_w = 4

    layer = ng.conv_layer(dt, N, C, K,
            D=D, H=H, W=W,
            T=T, R=R, S=S,
            pad_d=0, pad_h=pad_h, pad_w=pad_w,
            str_d=1, str_h=str_h, str_w=str_w,
            grid_P=0, grid_Q=0, update_size=None)

    numFilters = K

    # Number of filter positions along each spatial axis of the output.
    numModulesY = int(math.ceil(float(H - R + 1 + 2*pad_h) / str_h))
    numModulesX = int(math.ceil(float(W - S + 1 + 2*pad_w) / str_w))

    # Single pre-formatted string arguments keep the printed text identical to
    # the Python 2 print statements while also being valid Python 3.
    print("Num Modules  %s %s" % (numModulesX, numModulesY))

    # Set up images, filters, and outputs
    # Alternative input, loading a real image instead of random data:
    # imgd = np.loadtxt("im1.txt")
    # img = np.zeros((64, 64, 3))
    # print imgd.shape
    # for i in range(3):
    #     img[:, :, i] = imgd[i*64:(i+1)*64, :]
    # hostImages = np.tile(img)

    hostImages = np.random.rand(C, H, W, N)
    hostFilters = np.random.uniform(low=0.0, high=1.0, size=(C, S*R, numFilters)) #np.ones((C, S*R, numFilters)) #
    hostOutputs = np.zeros((numFilters, numModulesY, numModulesX, N))

    print("Input sum %s" % (np.sum(hostImages),))

    # Run cc2 kernel
    devI = ng.array(hostImages, dtype=dt)
    devF = ng.array(hostFilters, dtype=dt)
    devO = ng.array(hostOutputs, dtype=dt)

    ng.fprop_cuda_conv(layer, devI, devF, devO)

    print("CC2 input sum:  %s" % (np.sum(devI.asnumpyarray()),))
    print("CC2 output sum:  %s" % (np.sum(devO.asnumpyarray()),))

    # Run maxwell kernel
    # images: (C * H * W, N)
    # filters:  (C * S * R , numFilters)
    # outputs:  (numFilters * numModulesX * numModulesY, N)
    devI = ng.array(hostImages.reshape((C*H*W, N)), dtype=dt)
    devF = ng.array(hostFilters.reshape((C*S*R, numFilters)), dtype=dt)
    devO2 = ng.array(hostOutputs.reshape(numFilters*numModulesX*numModulesY, N), dtype=dt)

    ng.fprop_conv(layer, devI, devF, devO2)
    print("NG input sum:  %s" % (np.sum(devI.asnumpyarray()),))
    print("NG output sum:  %s" % (np.sum(devO2.asnumpyarray()),))

    # Bring both results into the same 2-D layout before comparing.
    hostOutputs1 = np.reshape(devO.asnumpyarray(), devO2.shape)
    hostOutputs2 = devO2.asnumpyarray()

    # One vectorized check instead of a Python loop over every element.
    # A NaN difference makes the comparison False, so it still fails.
    max_diff = np.abs(hostOutputs1 - hostOutputs2).max()
    assert max_diff < 1e-4, (
        "fprop_cuda_conv and fprop_conv outputs differ: max |diff| = %s"
        % (max_diff,))

Example #12

Show file

File: conv_test.py Project: KayneWest/nervanagpu

else:
    cpuI = np.random.uniform(-127.0, 127.0, slicable(dimI,1)).astype(np.float32) #.astype(np.uint8) .astype(np.int8)
    cpuF = np.random.uniform(0.0, 1.1, slicable(dimF)  ).astype(np.float32)
    cpuE = np.random.uniform(-1.01, 1.01, dimO            ).astype(np.float32)

# Zero the extra last row of the host input. It exists only for the sake of
# the NumPy side; the device copy below is made from cpuI[:-1,:] without it.
cpuI[-1,:] = 0.0

# Host-side float32 result arrays, zero-initialized. cpuB and cpuU use the
# slicable(...) shapes, the same shapes cpuI and cpuF are created with.
cpuO = np.zeros(dimO,             dtype=np.float32)
cpuB = np.zeros(slicable(dimI,1), dtype=np.float32)
cpuU = np.zeros(slicable(dimF),   dtype=np.float32)

# Give the GPU the input array without the zero padding row (not needed),
# reshaping the host arrays to the dimI / dimF shapes.
devI = ng.array(cpuI[:-1,:].reshape(dimI), dtype=dtype)
devF = ng.array(cpuF.reshape(dimF), dtype=dtype)
devE = ng.array(cpuE, dtype=dtype)

# Default every device result to 0; each one is replaced by a device buffer
# only when its operation is listed in `ops`.
devO = devB = devU = 0

# Forward pass: input devI and filters devF, with devO as the last array.
if "fprop"  in ops:
    devO = ng.empty(dimO, dtype=dtype)
    ng.fprop_conv(conv,  devI, devF, devO, alpha=1.0, repeat=repeat)

# Backward pass: filters devF and errors devE, with devB as the last array.
if "bprop"  in ops:
    devB = ng.empty(dimI, dtype=dtype)
    ng.bprop_conv(conv,  devF, devE, devB, alpha=1.0, repeat=repeat)

# NOTE(review): the excerpt ends after allocating devU; the call that would
# fill it is not shown here.
if "update" in ops:
    devU = ng.empty(dimF, dtype=dtype)

Example #13

Show file

File: conv_test.py Project: honorpeter/Caffe2_fpga

else:
    cpuI = np.random.uniform(-127.0, 127.0, slicable(dimI,1)).astype(np.float32) #.astype(np.uint8) .astype(np.int8)
    cpuF = np.random.uniform(0.0, 1.1, slicable(dimF)  ).astype(np.float32)
    cpuE = np.random.uniform(-1.01, 1.01, dimO            ).astype(np.float32)

# Zero the extra last row of the host input. It exists only for the sake of
# the NumPy side; the device copy below is made from cpuI[:-1,:] without it.
cpuI[-1,:] = 0.0

# Host-side float32 result arrays, zero-initialized. cpuB and cpuU use the
# slicable(...) shapes, the same shapes cpuI and cpuF are created with.
cpuO = np.zeros(dimO,             dtype=np.float32)
cpuB = np.zeros(slicable(dimI,1), dtype=np.float32)
cpuU = np.zeros(slicable(dimF),   dtype=np.float32)

# Give the GPU the input array without the zero padding row (not needed),
# reshaping the host arrays to the dimI / dimF shapes.
devI = ng.array(cpuI[:-1,:].reshape(dimI), dtype=dtype)
devF = ng.array(cpuF.reshape(dimF), dtype=dtype)
devE = ng.array(cpuE, dtype=dtype)

# Default every device result to 0; each one is replaced by a device buffer
# only when its operation is listed in `ops`.
devO = devB = devU = 0

# Forward pass: input devI and filters devF, with devO as the last array.
if "fprop"  in ops:
    devO = ng.empty(dimO, dtype=dtype)
    ng.fprop_conv(conv,  devI, devF, devO, alpha=1.0, repeat=repeat)

# Backward pass: filters devF and errors devE, with devB as the last array.
if "bprop"  in ops:
    devB = ng.empty(dimI, dtype=dtype)
    ng.bprop_conv(conv,  devF, devE, devB, alpha=1.0, repeat=repeat)

# NOTE(review): the excerpt ends after allocating devU; the call that would
# fill it is not shown here.
if "update" in ops:
    devU = ng.empty(dimF, dtype=dtype)