Exemple #1
0
 def __init__(self, rng_seed, stochastic_round=False, device_id=0):
     self.ng = NervanaGPU(stochastic_round=stochastic_round)
     logger.info("Initialized NervanaGPU with stochastic_round=%s",
                 stochastic_round)
     self.rng_seed = rng_seed
     self.rng_init()
     self.device_id = device_id if device_id is not None else 0
Exemple #2
0
 def __init__(self, rng_seed, stochastic_round=False, device_id=0):
     import pycuda.driver as drv
     drv.init()
     global ctx
     ctx = drv.Device(device_id).make_context()
     import atexit
     atexit.register(ctx.pop)
     self.ng = NervanaGPU(stochastic_round=stochastic_round)
     logger.info("Initialized NervanaGPU with stochastic_round=%s",
                 stochastic_round)
     self.rng_seed = rng_seed
     self.rng_init()
     self.device_id = device_id if device_id is not None else 0
Exemple #3
0
 def __init__(self, rng_seed, stochastic_round=False, device_id=0):
     self.ng = NervanaGPU(stochastic_round=stochastic_round)
     logger.info("Initialized NervanaGPU with stochastic_round=%s",
                 stochastic_round)
     self.rng_seed = rng_seed
     self.rng_init()
     self.device_id = device_id if device_id is not None else 0
Exemple #4
0
 def __init__(self, rng_seed, stochastic_round=False, device_id=0):
     import pycuda.driver as drv
     drv.init()
     global ctx
     ctx = drv.Device(device_id).make_context()
     import atexit
     atexit.register(ctx.pop)
     self.ng = NervanaGPU(stochastic_round=stochastic_round)
     logger.info("Initialized NervanaGPU with stochastic_round=%s",
                 stochastic_round)
     self.rng_seed = rng_seed
     self.rng_init()
     self.device_id = device_id if device_id is not None else 0
Exemple #5
0
class MGPU(GPU):
    default_dtype = np.float32
    num_dev = 1
    is_dist = True

    def __init__(self,
                 rng_seed,
                 stochastic_round=False,
                 device_id=0,
                 num_dev=2):
        drv.init()
        self.num_dev = num_dev

        if device_id == 0:
            self.dev_list = range(num_dev)
        else:
            self.dev_list = device_id

        assert len(self.dev_list) == self.num_dev
        assert self.num_dev <= drv.Device.count()

        self.ctxs = []
        self.devs = []
        self._strms = []
        self._redstrms = []

        self._events = []
        self._redevents = []

        self. async = True
        self._nostrms = [None for i in self.dev_list]

        for i in self.dev_list:
            self.devs.append(drv.Device(i))

        for dev in self.devs:
            self.ctxs.append(
                dev.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC))
            self._strms.append(drv.Stream())
            self._redstrms.append(drv.Stream())
            self._events.append(drv.Event())
            self._redevents.append(drv.Event())
            drv.Context.pop()

        self.ctxs[0].push()
        atexit.register(drv.Context.pop)
        MGPUTensor.ctxs = self.ctxs
        MGPUTensor.num_dev = num_dev

        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized %d device NervanaGPU, stochastic_round=%s",
                    num_dev, stochastic_round)
        self.ng.block = None
        self.rng_seed = rng_seed
        self.rng_init()

        # Setup the pairwise contexts
        # TODO clean up this code to avoid indexing
        for dev1, ctx1 in zip(self.devs, self.ctxs):
            ctx1.push()
            for dev2, ctx2 in zip(self.devs, self.ctxs):
                if dev1 == dev2:
                    continue
                if dev1.can_access_peer(dev2):
                    ctx1.enable_peer_access(ctx2)
                else:
                    print('Cannot enable peer access between '
                          '{:d} and {:d}'.format(dev1, dev2))
            ctx1.pop()

    def make_events(self):
        evtlist = []
        for ctx in self.ctxs:
            ctx.push()
            evtlist.append(drv.Event())
            ctx.pop()
        return evtlist

    # These definitions are for performing grouped context commands
    # This is experimental and should remove _stack for actual usage
    def begin_stack(self, block, identifier):
        if block == Block.update:
            self.ng.block = Block.update
            self.call_stack = []
        else:
            pass

    def end_stack(self, block, identifier):
        if block == Block.update:
            self.ng.block = None
            for idx, ctx in enumerate(self.ctxs):
                ctx.push()
                self.ng.stream = self.strms[idx]
                for method, args, kwargs in self.call_stack:
                    myargs = [
                        a._tensorlist[idx] if isinstance(a, MGPUTensor) else a
                        for a in args
                    ]
                    mykwargs = {
                        k:
                        v._tensorlist[idx] if isinstance(v, MGPUTensor) else v
                        for k, v in kwargs.iteritems()
                    }
                    getattr(super(MGPU, self), method)(*myargs, **mykwargs)
                self.ng.stream = None
                ctx.pop()
            self.call_stack = None
        else:
            pass

    @property
    def strms(self):
        return self._strms if self. async else self._nostrms

    @property
    def redstrms(self):
        return self._redstrms if self. async else self._nostrms

    def uniform(self,
                low=0.0,
                high=1.0,
                size=1,
                dtype=default_dtype,
                name=None,
                persist_values=True,
                ptype='replica'):
        """
        generate numpy random number and convert to a GPUTensor.
        If called with dtype=None it will probably explode
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.uniform(low, high, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def normal(self,
               loc=0.0,
               scale=1.0,
               size=1,
               dtype=default_dtype,
               name=None,
               persist_values=True,
               ptype='replica'):
        """
        Gaussian/Normal random number sample generation
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.normal(loc, scale, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def synchronize(self):
        if not self. async:
            return
        for s in self.strms:
            s.synchronize()

    def redsynchronize(self):
        if not self. async:
            return
        for s in self.redstrms:
            s.synchronize()

    def allocate_fragment(self,
                          shape,
                          dtype=default_dtype,
                          persist_values=True):
        # TODO: set ptype to be fragment in this case ??
        return self.empty((shape[0], shape[1] / self.num_dev),
                          dtype,
                          persist_values=persist_values)

    def zeros_like(self,
                   ary,
                   dtype=default_dtype,
                   persist_values=True,
                   name=None):
        result = self.zeros(ary.shape,
                            dtype=dtype,
                            persist_values=persist_values)
        result.ptype = ary.ptype
        return result

    def empty_like(self,
                   ary,
                   dtype=default_dtype,
                   persist_values=True,
                   name=None):
        result = self.empty(ary.shape,
                            dtype=dtype,
                            persist_values=persist_values,
                            name=name)
        result.ptype = ary.ptype
        return result

    def set(self, tensor, data):
        assert isinstance(tensor, MGPUTensor)
        if tensor.ptype == 'replica':
            for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
                ctx.push()
                drv.memcpy_htod_async(dest.ptr, data, strm)
                ctx.pop()
            # tensor.copy_from(data)
        else:
            self.scatter(data, tensor)

    def scatter(self, hbuf, dbuf):
        '''
        scatters the array data in hbuf to the mgpu tensor
        assumes that dbuf is a M x N and hbuf is M x (Nxk) where k is the
        number of replicas
        also assumes that dtype of hbuf and dbuf are the same
        '''
        assert hbuf.size == dbuf.size * dbuf.num_dev
        assert isinstance(dbuf, MGPUTensor)
        assert hbuf.dtype == dbuf.dtype
        ndata = dbuf.size
        starts = [i * ndata for i in range(self.num_dev)]

        for dest, strm, ctx, doff in zip(dbuf.tlist, self.strms, self.ctxs,
                                         starts):
            src = hbuf.reshape((hbuf.size))[doff:(doff + ndata)]
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, src, strm)
            ctx.pop()

        self.synchronize()

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        In this case, the weights are shards, the acts are replicas
        ubuf should be of size nout/num_dev x mbsz
        """
        ubuf = layer.mempool[0]
        assert ubuf.shape == (weights.shape[0], inputs.shape[1])

        if layer.use_biases:
            biases = layer.biases.tlist
        else:
            biases = [None for i in range(self.num_dev)]

        for dbuf, ibuf, wt, bs, strm, ctx in zip(ubuf.tlist, inputs.tlist,
                                                 weights.tlist, biases,
                                                 self.strms, self.ctxs):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt, ibuf, dbuf)
            if layer.use_biases:
                self.ng.add(dbuf, bs, out=dbuf)
            ctx.pop()

        # Note, should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        # self.synchronize()
        self.fragment_to_replica(ubuf, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        ubuf = layer.mempool[1]
        wtsz = weights.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]
        assert out.shape == (weights.shape[1], deltas.shape[1])
        assert ubuf.shape == out.shape

        for dbuf, ibuf, wt, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                  weights.tlist, self.strms,
                                                  self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt.T, ibuf[off:(off + wtsz)], dbuf)
            ctx.pop()

        # Note, should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        self.synchronize()
        self.reduce(out, ubuf)

    def update_fc(self, out, inputs, deltas, layer=None):
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for obuf, dbuf, ibuf, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                    inputs.tlist, self.strms,
                                                    self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(dbuf[off:(off + wtsz)], ibuf.T, obuf)
            ctx.pop()

        # self.synchronize()

    def update_fc_bias(self, err, out):
        """
        Compute the updated bias gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            err (GPUTensor): backpropagated error
        """
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for ebuf, obuf, strm, ctx, off in zip(err.tlist, out.tlist, self.strms,
                                              self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.sum(ebuf[off:(off + wtsz)], axis=1, out=obuf)
            ctx.pop()

    def add_fc_bias(self, inputs, bias):
        """
        This is a no-op since we absorb the bias add into the fprop_fc call
        """
        pass

    def reduce_tensor(self, ary, async=True):
        '''
        This is the case for the scalar tensor
        '''
        assert ary.size == 1
        if ary.ptype == 'replica':
            self.ctxs[0].push()
            result = ary.tlist[0].get()
            self.ctxs[0].pop()
            return result

        result = np.zeros((self.num_dev, 1), ary.dtype)
        for i, (ctx, src_buf,
                strm) in enumerate(zip(self.ctxs, ary.tlist, self.strms)):
            ctx.push()
            drv.memcpy_dtoh_async(result[i], src_buf.ptr, strm)
            ctx.pop()
        self.synchronize()
        return result.sum()
Exemple #6
0
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using just cublas compare N as the contiguous dimension verses the non-contiguous dimension.

import numpy as np
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from scikits.cuda import cublas

print(context.get_device().name())

ng = NervanaGPU(stochastic_round=False, bench=True)

handle = cublas.cublasCreate()

start, end = (drv.Event(), drv.Event())

def cublas_dot(op, A, B, C, repeat=1, warmup=False):

    lda = A.shape[0]
    ldb = B.shape[0]
    ldc = C.shape[0]

    m = C.shape[0]
    n = C.shape[1]
    k = A.shape[1] if op[0] == 'n' else A.shape[0]
Exemple #7
0
class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.
    Note that some functions defined in the generic Backend class such as are
    cross-map pooling and normalization and adaDelta are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        self.rng_seed = rng_seed
        self.rng_init()
        self.device_id = device_id if device_id is not None else 0

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this class.

        Returns:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        if hasattr(self, 'mem_pool') and self.mem_pool is not None:
            self.mem_pool_pickle = {'shape': self.mem_pool.shape,
                                    'dtype': np.float32}
            self.mem_pool = None

        return self.__dict__

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        Arguments:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        self.__dict__.update(state)
        self.mem_pool = self.ng.empty(self.mem_pool_pickle['shape'],
                                      dtype=self.mem_pool_pickle['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocates a memory pool for temporary storage
        """
        self.mem_pool = self.ng.empty(shape, dtype=dtype)

    def alloc_host_mem(self, shape, dtype):
        return drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0)

    def create_stream(self):
        return drv.Stream()

    def async_copy(self, dest, src, stream=None):
        drv.memcpy_htod_async(dest.gpudata, src, stream)

    def rng_init(self):
        """
        Initialize and seed the pseudo random number genrator. Random numbers
        are generated on the host using numpy, then transfered to device.
        """
        seed = None
        if 'rng_seed' in self.__dict__:
            seed = self.rng_seed
            logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Initialize FLOP timing.  Wraps the specified MOP calls via a decorator
        to record elapsed time and number of operations.

        Arguments:
           decorate_fc (list): string giving the function names of fully
                               connected layer forward/backward/update calls
                               to time.
           decorate_conv (list): string giving the function names of
                                 convolutional layer forward/backward/update
                                 calls to time.
           decorate_ew (list): string giving the function names of element-wise
                               calls to time.

        Notes:
            Must be called prior to first flop_timing_start call
        """
        self.start = drv.Event()
        self.end = drv.Event()
        self.flop_timer = FlopsDecorator(self)
        self.flop_timer.decorate(decorate_fc=decorate_fc,
                                 decorate_conv=decorate_conv,
                                 decorate_ew=decorate_ew)

    def flop_timinig_start(self):
        """
        Start a new FLOP timer.
        Returns:
            None: dummy value (not used)
        """
        return self.start.record()

    def flop_timing_finish(self, start_time):
        """
        Complete current FLOP timing.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: elapsed time in seconds since prior flop_timing_start call.
        """
        self.end.record()
        self.end.synchronize()
        return self.end.time_since(self.start)

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        generate numpy random number and convert to a GPUTensor.
        If called with dype=None it will probably explode
        """
        ary = np.random.uniform(low, high, shape)
        return self.ng.array(ary, dtype=dtype, name=name)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Gaussian/Normal random number sample generation
        """
        ary = np.random.normal(loc, scale, size)
        return self.ng.array(ary, dtype=dtype, name=name)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward propagate the inputs of a fully connected network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(weights.T, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Compute the updated gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(deltas, inputs.T, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward propagate the inputs of a convolutional network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                             layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                  in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).  For this
                              backend we expect these values to be square.
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  convolved outputs for a single receptive
                                  field.  Not used for this backend.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """

        '''
        N: Number of images in mini-batch
        C: Number of input feature maps
        K: Number of output feature maps

        D: Depth  of input image
        H: Height of input image
        W: Width  of input image

        T: Depth  of filter kernel
        R: Height of filter kernel
        S: Width  of filter kernel
        '''
        self.ng.fprop_conv(layer=fpropbuf, I=inputs, F=weights, O=out,
                           alpha=1.0, repeat=1)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward propagate the error through a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """
        self.ng.bprop_conv(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                           alpha=1.0, repeat=1)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Compute the updated gradient for a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fwidth (int): Filter width.
            updatebuf (GPUTensor): Temporary storage buffer used to hold the
                                   updated gradient for a single receptive
                                   field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
            layer (Layer): The layer object.
        """
        self.ng.update_conv(layer=updatebuf, I=inputs, E=deltas, grad_F=out,
                            alpha=1.0, repeat=1)

    def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape,
                   ifmshape, links, nifm, padding, stride, fpropbuf):
        """
        Forward propagate the inputs of a Pooling network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            op (string): The type of pooling operation to apply.  We support
                         "max", "avg", "l2" currently.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  pooled outputs for a single receptive field.
        """
        op = op.lower()
        if op == "max":
            self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s", op)

    def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize,
                   ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding,
                   stride, bpropbuf):
        """
        Backward propagate the error through a pooling network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            fouts (GPUTensor): Forward propagated outputs from the previous
                               layer.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            op (string): The type of pooling operation to apply.  We support
                         "max", "avg", "l2" currently.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                              each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            fpsize (int): The size of each filter.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
        """
        op = op.lower()
        if op == "max":
            self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas, grad_I=out,
                               repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s", op)

    def logistic(self, x, out):
        """
        Logistic sigmoid nonlinearity, 1/(1+exp(-x))

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        """
        self.ng.sig(x, out=out)

        return out

    def rectlin(self, x, out):
        """
        Rectified Linear nonlinearity

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        """
        self.ng.maximum(x, 0., out=out)
        return out

    def rectleaky(self, x, slope, out):
        out[:] = self.ng.maximum(x, x*slope)

    def rectleaky_derivative(self, x, slope, out):
        out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope

    def sum(self, tsr, axes, out):
        """
        Sum

        Arguments:
            tsr  (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.sum(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.sum(tsr, axis=axes, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Calculates the arithmetic mean of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.mean(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.mean(tsr, axis=axes, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Calculates the minimum of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.min(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.min(tsr, axis=axes, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Calculates the maximum of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.max(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.max(tsr, axis=axes, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Calculates the variance of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): the tensor on which to compute the variance
            axes (int, list, optional): the dimension(s) along which to
                                        variance.  If set to None, we will
                                        variance over all dimensions.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the tensor containing mean of tsr

        Returns:
            GPUTensor: reference to out
        """
        if mean is None:
            logger.error("GPUTensor requires mean to be specified.")
            raise ValueError("mean not specified")
        self.ng.mean(self.ng.square(tsr-mean),  axis=axes, out=out)
        return out

    def fabs(self, x, out):
        """
        Calculates absolute value of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.fabs(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Calculates square root of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.sqrt(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with zeros.

        Arguments:
            shape (tupel): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.zeros(shape, dtype=dtype)

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with ones.

        Arguments:
            shape (tupel): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.ones(shape, dtype=dtype)

    def empty(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor.

        Arguments:
            shape (tupel): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.empty(shape, dtype=dtype)

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Allocate a new GPUTensor and fill it with supplied numpy array.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype, optional): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls
            name (string): Name for the GPUTensor
            allocator (pycuda): Pycuda memory allocator

        Returns:
            GPUTensor: output
        """
        return self.ng.array(ary, dtype=dtype, name=name)

    def add(self, left, right, out):
        """
        Elementwise addition

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.add(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Elementwise subtraction

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.subtract(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Elementwise multiplication

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.multiply(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Elementwise division

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.divide(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Elementwise greater than testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.greater(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Performs element-wise equality testing on each element of left and
        right, storing the result in out.  Each operand is assumed to be the
        same shape (or broadcastable as such).

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.equal(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Elementwise not equal testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.not_equal(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Elementwise clipping between a range of specified values

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): floor value.
            a_max (float): ceiling value.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.clip(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Elementwise base-e logarithm

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Elementwise tanh

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Calculates the indices of the maximal element value along the specified
        axis.  If multiple elements contain the maximum, only the elements of
        the first are returned.

        Arguments:
            tsr (GPUTensor): The GPUTensor on which to find the maximum indices
            axis (int): The dimension along which to find the maximum.  If set
                        to None, find the overall maximum index of a flattened
                        representation of tsr.
            out (GPUTensor): Where to store the result.  Should be of the
                             appropriate type and expected shape

        Returns:
            GPUTensor: reference to out
        """
        self.ng.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity. Computes exp(x-max(x)) / sum_i exp(x_i-max(x_i))

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        out[:] = (self.ng.reciprocal(self.ng.sum(
                  self.ng.exp(x - self.ng.max(x, axis=0)), axis=0)) *
                  self.ng.exp(x - self.ng.max(x, axis=0)))
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        Arguments:
            y (GPUTensor): input tensor.
            err (GPUTensor): backpropagated error.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        raise NotImplementedError("Softmax gradient should use shortcut")
        return out

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Create a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): Output tensor
            keepthresh (float): fraction of ones
        """
        self.ng.dropout(keep=keepthresh, out=tsr)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Perform gradient descent update with momentum.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs are written to vs_item (updated velocity)
        and ps_item (updated weights)
        """
        vs_item[:] = vs_item * momentum_coef - us_item * learning_rate
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Perform gradient descent update with momentum and weight decay.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs:
            ps_item, the updated weights.
            vs_item, the updated velocity.
            us_item, used as a temp buffer.
        """
        vs_item[:] = vs_item * momentum_coef - us_item * \
            learning_rate - learning_rate * wd * ps_item
        ps_item[:] = ps_item + vs_item

    def exp_mavg(self, mavg, newval, rho):
        """
        Calculate the exponential moving average

        Arguments:
            mavg:  The running value of the moving average
            newval:  New sample to be added to the moving average
            rho:  Interpolation value
        """

        mavg[:] = rho * mavg + (1.0 - rho) * newval

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item, ss_item,
                   rho, epsilon):
        """
        Update rule for AdaDelta (Zeiler, http://arxiv.org/abs/1212.5701)

        Arguments:
            ps_item: weight / parameter (will be updated)
            us_item: update
            gs_item: expected value of Gradient Squared (will be updated)
            ds_item: expected value of Delta Squared (will be updated)
            ls_item: learning rate (will be updated)
            ss_item: Scratch Space
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        # Accumulate E[Grad^2]
        gs_item[:] = gs_item * rho + (1.0 - rho) * us_item * us_item

        # Calculate Updates
        ls_item[:] = self.ng.sqrt((ds_item + epsilon) /
                                  (gs_item + epsilon)) * (-1.0) * us_item

        # Accumulate E[Delt^2]
        ds_item[:] = ds_item * rho + (1.0 - rho) * ls_item * ls_item

        # Final update to the params
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity, scratch_space,
                   gamma, epsilon, learning_rate, momentum_coef):

        # Update running squares
        run_squares[:] = gamma * run_squares + (1. - gamma) * updates * updates

        # Now scale the gradient by lr / rms(grad) (with a epsilon term for
        # stability) and use it to update the params
        if momentum_coef == 0:
            params[:] = params - learning_rate * updates * self.ng.reciprocal(
                self.ng.sqrt(run_squares) + epsilon)
        else:
            velocity[:] = velocity * momentum_coef - \
                learning_rate * updates * \
                self.ng.reciprocal(self.ng.sqrt(run_squares) + epsilon)
            params[:] = params + velocity

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xhat,
                          xmean, xvar, gmean, gvar, rho, out):
        """
        Batch normalization forward pass, compounded to run in 3 kernel calls.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xvar: variance (updated)
            xhat: normalized input (updated)
            out: normalized and rescaled input (updated)
        """
        xvar[:] = self.ng.var(inputs, axis=1)
        xmean[:] = self.ng.mean(inputs, axis=1)
        gmean[:] = gmean * rho + (1.0 - rho) * xmean
        gvar[:] = gvar * rho + (1.0 - rho) * xvar

        xvar[:] = self.ng.reciprocal(self.ng.sqrt(xvar + eps))
        xhat[:] = xvar * (inputs - xmean)
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Batch normalization backward pass, compounded to run with 4 kernel
        calls.

        Arguments:
            xhat: normalized input data (updated)
            error: backpropagated deltas (updated)
            xvar: precomputed variance
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)
        xhat[:] = (xhat * gamma_updates + beta_updates) / float(xhat.shape[1])
        error[:] = xvar * gamma * (error - xhat)
Exemple #8
0
    def __init__(self, rng_seed, stochastic_round=False, device_id=0,
                 num_dev=2):
        drv.init()
        self.num_dev = num_dev

        if device_id == 0:
            self.dev_list = range(num_dev)
        else:
            self.dev_list = device_id

        assert len(self.dev_list) == self.num_dev
        assert self.num_dev <= drv.Device.count()

        self.ctxs = []
        self.devs = []
        self._strms = []
        self._redstrms = []

        self._events = []
        self._redevents = []

        self.async = True
        self._nostrms = [None for i in self.dev_list]

        for i in self.dev_list:
            self.devs.append(drv.Device(i))

        for dev in self.devs:
            self.ctxs.append(
                dev.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC))
            self._strms.append(drv.Stream())
            self._redstrms.append(drv.Stream())
            self._events.append(drv.Event())
            self._redevents.append(drv.Event())
            drv.Context.pop()

        self.ctxs[0].push()
        atexit.register(drv.Context.pop)
        MGPUTensor.ctxs = self.ctxs
        MGPUTensor.num_dev = num_dev

        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized %d device NervanaGPU, stochastic_round=%s",
                    num_dev, stochastic_round)
        self.ng.block = None
        self.rng_seed = rng_seed
        self.rng_init()

        # Setup the pairwise contexts
        # TODO clean up this code to avoid indexing
        for dev1, ctx1 in zip(self.devs, self.ctxs):
            ctx1.push()
            for dev2, ctx2 in zip(self.devs, self.ctxs):
                if dev1 == dev2:
                    continue
                if dev1.can_access_peer(dev2):
                    ctx1.enable_peer_access(ctx2)
                else:
                    print('Cannot enable peer access between '
                          '{:d} and {:d}'.format(dev1, dev2))
            ctx1.pop()
Exemple #9
0
    def __init__(self,
                 rng_seed,
                 stochastic_round=False,
                 device_id=0,
                 num_dev=2):
        drv.init()
        self.num_dev = num_dev

        if device_id == 0:
            self.dev_list = range(num_dev)
        else:
            self.dev_list = device_id

        assert len(self.dev_list) == self.num_dev
        assert self.num_dev <= drv.Device.count()

        self.ctxs = []
        self.devs = []
        self._strms = []
        self._redstrms = []

        self._events = []
        self._redevents = []

        self. async = True
        self._nostrms = [None for i in self.dev_list]

        for i in self.dev_list:
            self.devs.append(drv.Device(i))

        for dev in self.devs:
            self.ctxs.append(
                dev.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC))
            self._strms.append(drv.Stream())
            self._redstrms.append(drv.Stream())
            self._events.append(drv.Event())
            self._redevents.append(drv.Event())
            drv.Context.pop()

        self.ctxs[0].push()
        atexit.register(drv.Context.pop)
        MGPUTensor.ctxs = self.ctxs
        MGPUTensor.num_dev = num_dev

        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized %d device NervanaGPU, stochastic_round=%s",
                    num_dev, stochastic_round)
        self.ng.block = None
        self.rng_seed = rng_seed
        self.rng_init()

        # Setup the pairwise contexts
        # TODO clean up this code to avoid indexing
        for dev1, ctx1 in zip(self.devs, self.ctxs):
            ctx1.push()
            for dev2, ctx2 in zip(self.devs, self.ctxs):
                if dev1 == dev2:
                    continue
                if dev1.can_access_peer(dev2):
                    ctx1.enable_peer_access(ctx2)
                else:
                    print('Cannot enable peer access between '
                          '{:d} and {:d}'.format(dev1, dev2))
            ctx1.pop()
Exemple #10
0
    
    # Swap A and B to map from C order to Fortran 
    for r in range(repeat):
        cublas.cublasSgemm(handle, opB, opA, n, m, k, alpha, B.gpudata, ldb, A.gpudata, lda, beta, C.gpudata, ldc)

    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print("%7.3f msecs %4.0f gflops (%s_%s   : %d,%d,%d)" %
          (msecs,gflops,"cublas",op,m,n,k))


np.set_printoptions(threshold=8193, linewidth=600, formatter={'float':lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

repeat = 1

for dtype in (np.float16, np.float32,):
    
    for K, C, N in ((32,4096,1512),):

        for alpha, beta in ((1.0,0.0), (0.5,0.5)):

            for op,  dimA,  dimB,  dimC in (
              ("nn", (K,C), (C,N), (K,N) ),  # fprop
              ("tn", (K,C), (K,N), (C,N) ),  # bprop
              ("nt", (K,N), (C,N), (K,C) ),): # update

                devA1 = ng.empty(dimA, dtype=dtype)
Exemple #11
0
    # Swap A and B to map from C order to Fortran 
    for r in range(repeat):
        cublas.cublasSgemm(handle, opB, opA, n, m, k, alpha, B.gpudata, ldb, A.gpudata, lda, beta, C.gpudata, ldc)

    if repeat > 1:
        end.record()
        end.synchronize()
        msecs = end.time_since(start) / repeat
        gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
        print("%7.3f msecs %4.0f gflops (%s_%s   : %d,%d,%d)" %
              (msecs,gflops,"cublas",op,m,n,k))


np.set_printoptions(threshold=8193, linewidth=600, formatter={'float':lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=0, bench=0)

small_1  = (1,2,3,4,5,6,7,8,9,16,32,64,65,72,120,127,128,192)
medium_1 = (32,64,128,192,778,785,786,787,794)
big_1    = (32,64,128,1532,1535,1536,1537,1540,3073,4095)

small_2  = (8,16,32,64,72,96,120,128,192)
medium_2 = (32,64,128,192,256,786-32,786-16,786,786+16,786+32)
big_2    = (32,64,128,1536-80,1536-64,1536,1536+64,1536+80,3072,4096)

# sharedDim = (4096,4096)
# devA1s = ng.empty(sharedDim, dtype=np.float32)
# devB1s = ng.empty(sharedDim, dtype=np.float32)
# devC1s = ng.empty(sharedDim, dtype=np.float32)
# devA2s = ng.empty(sharedDim, dtype=np.float32)
# devB2s = ng.empty(sharedDim, dtype=np.float32)
Exemple #12
0
                           A.gpudata, lda, beta, C.gpudata, ldc)

    if repeat > 1:
        end.record()
        end.synchronize()
        msecs = end.time_since(start) / repeat
        gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
        print "%7.3f msecs %4.0f gflops (%s_%s   : %d,%d,%d)" % (
            msecs, gflops, "cublas", op, m, n, k)


np.set_printoptions(threshold=8193,
                    linewidth=600,
                    formatter={'float': lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=True, bench=False)

small_1 = (1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 32, 64, 65, 72, 120, 127, 128, 192)
medium_1 = (32, 64, 128, 192, 778, 785, 786, 787, 794)
big_1 = (32, 64, 128, 1532, 1535, 1536, 1537, 1540, 3073, 4095)

small_2 = (8, 16, 32, 64, 72, 96, 120, 128, 192)
medium_2 = (32, 64, 128, 192, 256, 786 - 32, 786 - 16, 786, 786 + 16, 786 + 32)
big_2 = (32, 64, 128, 1536 - 80, 1536 - 64, 1536, 1536 + 64, 1536 + 80, 3072,
         4096)

# sharedDim = (4096,4096)
# devA1s = ng.empty(sharedDim, dtype=np.float32)
# devB1s = ng.empty(sharedDim, dtype=np.float32)
# devC1s = ng.empty(sharedDim, dtype=np.float32)
# devA2s = ng.empty(sharedDim, dtype=np.float32)
Exemple #13
0
#!/usr/bin/python

import numpy         as np
import pycuda.driver as drv
from nervanagpu      import NervanaGPU
from pycuda.autoinit import context
from ipdb            import set_trace

np.set_printoptions(threshold=8192*4, linewidth=600, formatter={'int':lambda x: "%2d" % x,'float':lambda x: "%2.0f" % x})

ng = NervanaGPU(stochastic_round=0, bench=1)

dtype  = np.float32 # np.float16 or np.float32
repeat = 50          # repeat count for benchmarking
ones   = 0          # simpler data for debugging
cpu    = 0          # valdiate against numpy
size   = 32         # 32, 64, 128, None=auto

X = 100   # Batch Size
N = 32   # Minibatch Size
C = 3072  # Input  Features
K = 3072  # Output Features
Nin = True

dimW = (K,C)
if Nin:
    dimI = (X,C,N)
    dimO = (X,K,N)
else:
    dimI = (X,N,C)
    dimO = (X,N,K)
Exemple #14
0
    np.float16,
    np.float32,
)

# number of full iterations
loops = 10
# show bechmark details for each layer
layer_bench = 0
# show layer stats after each operation
print_stats = 0
# run network with all zeros to see speed difference
zeros = 0
# print more stuff
verbose = 0

ng = NervanaGPU(bench=layer_bench)

# common convolutional layer settings
conv11 = {"R": 11, "S": 11, "pad_h": 2, "pad_w": 2, "str_h": 4, "str_w": 4}
conv11p0 = {"R": 11, "S": 11, "pad_h": 0, "pad_w": 0, "str_h": 4, "str_w": 4}
conv7 = {"R": 7, "S": 7, "pad_h": 3, "pad_w": 3, "str_h": 2, "str_w": 2}
conv5 = {"R": 5, "S": 5, "pad_h": 2, "pad_w": 2}
conv5p0 = {"R": 5, "S": 5, "pad_h": 0, "pad_w": 0}
conv3 = {"R": 3, "S": 3, "pad_h": 1, "pad_w": 1}
conv2 = {"R": 2, "S": 2, "pad_h": 0, "pad_w": 0, "str_h": 2, "str_w": 2}
conv1 = {"R": 1, "S": 1, "pad_h": 0, "pad_w": 0}

# traditional pooling
pool2s2p0 = {"R": 2, "S": 2}
pool3s2p0 = {"R": 3, "S": 3, "str_h": 2, "str_w": 2}
pool3s2p1 = {"R": 3, "S": 3, "str_h": 2, "str_w": 2, "pad_h": 1, "pad_w": 1}
Exemple #15
0
def run():
    ng = NervanaGPU(stochastic_round=False)

    dt = np.float32
    # N: Number of images in mini-batch
    # C: Number of input feature maps
    # K: Number of output feature maps

    # D: Depth  of input image
    # H: Height of input image
    # W: Width  of input image

    # T: Depth  of filter kernel
    # R: Height of filter kernel
    # S: Width  of filter kernel
    # 
    # * images:      (numColors, imgSizeY, imgSizeX, numImages) with stride given
    # * filters:     (numColors, filterPixels, numFilters) if conv
    # *              (numModules, numColors, filterPixels, numFilters) otherwise
    # *
    # * targets:     (numFilters, numModulesY, numModulesX, numImages)

    N = 128
    C = 3
    K = 64

    D = 1
    H = 64
    W = 64

    T = 1
    R = 8
    S = 8

    pad_h = pad_w = 0
    str_h = str_w = 4

    layer = ng.conv_layer(dt, N, C, K,
            D=D, H=H, W=W,
            T=T, R=R, S=S,
            pad_d=0, pad_h=pad_h, pad_w=pad_w,
            str_d=1, str_h=str_h, str_w=str_w,
            grid_P=0, grid_Q=0, update_size=None)

    numImages = N 
    numFilters = K

    numModulesY = int(math.ceil(float(H - R + 1 + 2*pad_h) / str_h))
    numModulesX = int(math.ceil(float(W - S + 1 + 2*pad_w) / str_w))

    print "Num Modules ", numModulesX, numModulesY


    # Set up images, filters, and outputs
    # imgd = np.loadtxt("im1.txt")
    # img = np.zeros((64, 64, 3))
    # print imgd.shape
    # for i in range(3):
    #     img[:, :, i] = imgd[i*64:(i+1)*64, :]
    # hostImages = np.tile(img)

    hostImages = np.random.rand(C, H, W, N)
    hostFilters = np.random.uniform(low=0.0, high=1.0, size=(C, S*R, numFilters)) #np.ones((C, S*R, numFilters)) #
    hostOutputs = np.zeros((numFilters, numModulesY, numModulesX, N))

    print "Input sum", np.sum(hostImages)

    # Run cc2 kernel    
    devI = ng.array(hostImages, dtype=dt)
    devF = ng.array(hostFilters, dtype=dt)
    devO = ng.array(hostOutputs, dtype=dt)

    ng.fprop_cuda_conv(layer, devI, devF, devO)

    print "CC2 input sum: ", np.sum(devI.asnumpyarray())
    print "CC2 output sum: ", np.sum(devO.asnumpyarray())

    # Run maxwel kernel
    # images: (C * H * W, N)
    # filters:  (C * S * R , numFilters)
    # outputs:  (numFilters * numModulesX * numModulesY, N)
    devI = ng.array(hostImages.reshape((C*H*W, N)), dtype=dt)
    devF = ng.array(hostFilters.reshape((C*S*R, numFilters)), dtype=dt)
    devO2 = ng.array(hostOutputs.reshape(numFilters*numModulesX*numModulesY, N), dtype=dt)

    ng.fprop_conv(layer, devI, devF, devO2)
    print "NG input sum: ", np.sum(devI.asnumpyarray())
    print "NG output sum: ", np.sum(devO2.asnumpyarray())

    hostOutputs1 = np.reshape(devO.asnumpyarray(), devO2.shape)
    hostOutputs2 = devO2.asnumpyarray()

    for i in xrange(hostOutputs1.shape[0]):
       for j in xrange(hostOutputs1.shape[1]):
           assert(abs(hostOutputs1[i, j] - hostOutputs2[i, j]) < 1e-4)
Exemple #16
0
import pycuda.driver as drv
from nervanagpu      import NervanaGPU
from pycuda.autoinit import context
from operator        import mul

print context.get_device().name()

np.set_printoptions(threshold=8193, linewidth=600, formatter={'int':lambda x: "%10d" % x,'float':lambda x: "% .0f" % x})

ops  = set(("update",)) # "fprop","bprop","update"
ones = 0
cpu  = 0  # Set CPU to 1 to check against CPU
repeat = 1
dtype = np.float32

ng = NervanaGPU(stochastic_round=False, bench=True)

conv = ng.conv_layer(
    dtype,
    16,3,8,    # N,C,K
    1,64,64,   # D,H,W
    1,3,3,     # T,R,S
    0,1,1,     # padding
    1,1,1)     # strides


dimI = conv.dimI
dimF = conv.dimF
dimO = conv.dimO

# colapse outer dimensions into one and preserve inner dimension
Exemple #17
0
start, end = (drv.Event(), drv.Event())


def start_bench():
    start.record()


def end_bench(op):
    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = conv.flops / (msecs * 1000000.0)
    print "%7.3f msecs %8.3f gflops (%s: %s)" % (msecs, gflops, op, conv)


ng = NervanaGPU(stochastic_round=False, bench=True)

# Create a cuDNN context
cudnn = libcudnn.cudnnCreate()

C_desc = libcudnn.cudnnCreateConvolutionDescriptor()
I_desc = libcudnn.cudnnCreateTensorDescriptor()
O_desc = libcudnn.cudnnCreateTensorDescriptor()
E_desc = libcudnn.cudnnCreateTensorDescriptor()
B_desc = libcudnn.cudnnCreateTensorDescriptor()
F_desc = libcudnn.cudnnCreateFilterDescriptor()
U_desc = libcudnn.cudnnCreateFilterDescriptor()

# Set some options and tensor dimensions
NCHW_fmt = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
cu_dtype = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
Exemple #18
0
import sys

if sys.version_info >= (3, 0):
    from functools import reduce

print(context.get_device().name())

np.set_printoptions(threshold=8193, linewidth=600, formatter={'int':lambda x: "%10d" % x,'float':lambda x: "% .0f" % x})

ops  = set(("update",)) # "fprop","bprop","update"
ones = 0
cpu  = 0  # Set CPU to 1 to check against CPU
repeat = 1
dtype = np.float32

ng = NervanaGPU(stochastic_round=False, bench=True)

conv = ng.conv_layer(
    dtype,
    16,3,8,    # N,C,K
    1,64,64,   # D,H,W
    1,3,3,     # T,R,S
    0,1,1,     # padding
    1,1,1)     # strides


dimI = conv.dimI
dimF = conv.dimF
dimO = conv.dimO

# colapse outer dimensions into one and preserve inner dimension
# Note GoogLeNet2 only fits in fp16 currently.  I need to work out delta sharing in inception layers.
nets = ("Alexnet","Overfeat","GoogLeNet1","GoogLeNet2","VGG","VGG_E",)

#Available dtypes: np.float16, np.float32
dtypes = (np.float16,np.float32)

# number of full iterations
loops       = 10
# show bechmark details for each layer
layer_bench = 0
# show layer stats after each operation
print_stats = 0
# run network with all zeros to see speed difference
zeros       = 0

ng = NervanaGPU(bench=layer_bench)

# common convolutional layer settings
conv11    = { "R":11, "S":11, "pad_h":2, "pad_w":2, "str_h":4, "str_w":4 }
conv11p0  = { "R":11, "S":11, "pad_h":0, "pad_w":0, "str_h":4, "str_w":4 }
conv7     = { "R":7,  "S":7,  "pad_h":3, "pad_w":3, "str_h":2, "str_w":2 }
conv5     = { "R":5,  "S":5,  "pad_h":2, "pad_w":2 }
conv5p0   = { "R":5,  "S":5,  "pad_h":0, "pad_w":0 }
conv3     = { "R":3,  "S":3,  "pad_h":1, "pad_w":1 }
conv2     = { "R":2,  "S":2,  "pad_h":0, "pad_w":0, "str_h":2, "str_w":2 }
conv1     = { "R":1,  "S":1,  "pad_h":0, "pad_w":0 }

# traditional pooling
pool2s2p0 = { "R":2, "S":2 }
pool3s2p0 = { "R":3, "S":3, "str_h":2, "str_w":2 }
pool3s2p1 = { "R":3, "S":3, "str_h":2, "str_w":2, "pad_h":1, "pad_w":1 }
Exemple #20
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from time import sleep

np.set_printoptions(threshold=8193,
                    linewidth=600,
                    formatter={'float': lambda x: "% .1f" % x})

dtype = np.float32

ng = NervanaGPU(stochastic_round=False, bench=False)

small = (1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 32, 64, 65, 72, 120, 127, 128, 192)
medium = (1, 64, 192, 778, 785, 786, 787, 794)
big = (1, 64, 192, 1532, 1535, 1536, 1537, 1540)

for size in (small, medium, big):  #  small, medium, big
    for m in size:
        for n in (size):
            for op in ("tn", "nn", "nt"):  # "tn","nn","nt",
                for k in size:
                    print("op,M,N,K: ", op, m, n, k)

                    dimA = (m, k) if op[0] == 'n' else (k, m)
                    dimB = (k, n) if op[1] == 'n' else (n, k)
                    dimC = (m, n)
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using just cublas compare N as the contiguous dimension verses the non-contiguous dimension.

import numpy as np
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from scikits.cuda import cublas

print context.get_device().name()

ng = NervanaGPU(stochastic_round=False, bench=True)

handle = cublas.cublasCreate()

start, end = (drv.Event(), drv.Event())

def cublas_dot(op, A, B, C, repeat=1, warmup=False):

    lda = A.shape[0]
    ldb = B.shape[0]
    ldc = C.shape[0]

    m = C.shape[0]
    n = C.shape[1]
    k = A.shape[1] if op[0] == 'n' else A.shape[0]
Exemple #22
0
    
    # Swap A and B to map from C order to Fortran 
    for r in range(repeat):
        cublas.cublasSgemm(handle, opB, opA, n, m, k, 1.0, B.gpudata, ldb, A.gpudata, lda, 0.0, C.gpudata, ldc)

    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print "%7.3f msecs %4.0f gflops (%s_%s   : %d,%d,%d)" % (msecs,gflops,"cublas",op,m,n,k)

    return gflops

np.set_printoptions(threshold=8193, linewidth=600, formatter={'float':lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

for dtype in (np.float16,np.float32):
    
    for K, C, N in ((3072,3072*1,32),(3072,3072*1,64),(3072,3072*1,96),(3072,3072*1,128),
                    (3072,3072*2,32),(3072,3072*2,64),(3072,3072*2,96),(3072,3072*2,128),
                    (3072,3072*3,32),(3072,3072*3,64),(3072,3072*3,96),(3072,3072*3,128),
                    (3072,3072*4,32),(3072,3072*4,64),(3072,3072*4,96),(3072,3072*4,128),): 
                    #(3072,3072,32+128*0),(3072,3072,64+128*0),(3072,3072,96+128*0),(3072,3072,128+128*0),
                    #(3072,3072,32+128*1),(3072,3072,64+128*1),(3072,3072,96+128*1),(3072,3072,128+128*1),
                    #(3072,3072,32+128*2),(3072,3072,64+128*2),(3072,3072,96+128*2),(3072,3072,128+128*2),
                    #(3072,3072,32+128*3),(3072,3072,64+128*3),(3072,3072,96+128*3),(3072,3072,128+128*3),): 
        for op,  dimA,  dimB,  dimC in (
          ("nn", (K,C), (C,N), (K,N) ),  # fprop
          ("tn", (K,C), (K,N), (C,N) ),  # bprop
          ("nt", (K,N), (C,N), (K,C) )): # update
Exemple #23
0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from nervanagpu import NervanaGPU
from pycuda.autoinit import context

print context.get_device().name()

np.set_printoptions(threshold=8193, linewidth=600, formatter={'float':lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

dtype  = np.float16
repeat = 1
cpu    = 1  # Set CPU to 1 to check against CPU

for data_type in ("All Ones", "Random Data",): #"All Ones", "Random Data"
    print data_type
    for size in ((3072,3072,3072*2),): #(4095,4095,4095) 
        m, n, k = size
        for op in ("tn","nn","nt"): #"tn","nn","nt"

            dimA = (m,k) if op[0] == 'n' else (k,m)
            dimB = (k,n) if op[1] == 'n' else (n,k)
            dimC = (m,n)
Exemple #24
0
from operator import mul

print context.get_device().name()

np.set_printoptions(threshold=8193,
                    linewidth=600,
                    formatter={
                        'int': lambda x: "%10d" % x,
                        'float': lambda x: "% .3f" % x
                    })

dtype = np.float16
cpu = 1
repeat = 1

ng = NervanaGPU(stochastic_round=False, bench=True)

pool = ng.pool_layer(
    "max",
    64,  # N
    64,
    1,
    64,
    64,  # C,D,H,W
    4,
    1,
    2,
    2,  # J,T,R,S
    0,
    0,
    0,
Exemple #25
0
import numpy as np
import pycuda.autoinit
from nervanagpu import NervanaGPU
nrv = NervanaGPU(default_dtype=np.float32)

a = nrv.array(np.random.randn(200,200))
b = nrv.empty_like(a)
b[:] = a**2
assert not np.any(np.isnan(b.get())), "Shouldn't have any nan's here"

Exemple #26
0
class MGPU(GPU):
    default_dtype = np.float32
    num_dev = 1
    is_dist = True

    def __init__(self, rng_seed, stochastic_round=False, device_id=0,
                 num_dev=2):
        drv.init()
        self.num_dev = num_dev

        if device_id == 0:
            self.dev_list = range(num_dev)
        else:
            self.dev_list = device_id

        assert len(self.dev_list) == self.num_dev
        assert self.num_dev <= drv.Device.count()

        self.ctxs = []
        self.devs = []
        self._strms = []
        self._redstrms = []

        self._events = []
        self._redevents = []

        self.async = True
        self._nostrms = [None for i in self.dev_list]

        for i in self.dev_list:
            self.devs.append(drv.Device(i))

        for dev in self.devs:
            self.ctxs.append(
                dev.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC))
            self._strms.append(drv.Stream())
            self._redstrms.append(drv.Stream())
            self._events.append(drv.Event())
            self._redevents.append(drv.Event())
            drv.Context.pop()

        self.ctxs[0].push()
        atexit.register(drv.Context.pop)
        MGPUTensor.ctxs = self.ctxs
        MGPUTensor.num_dev = num_dev

        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized %d device NervanaGPU, stochastic_round=%s",
                    num_dev, stochastic_round)
        self.ng.block = None
        self.rng_seed = rng_seed
        self.rng_init()

        # Setup the pairwise contexts
        # TODO clean up this code to avoid indexing
        for dev1, ctx1 in zip(self.devs, self.ctxs):
            ctx1.push()
            for dev2, ctx2 in zip(self.devs, self.ctxs):
                if dev1 == dev2:
                    continue
                if dev1.can_access_peer(dev2):
                    ctx1.enable_peer_access(ctx2)
                else:
                    print('Cannot enable peer access between '
                          '{:d} and {:d}'.format(dev1, dev2))
            ctx1.pop()

    def make_events(self):
        evtlist = []
        for ctx in self.ctxs:
            ctx.push()
            evtlist.append(drv.Event())
            ctx.pop()
        return evtlist

    # These definitions are for performing grouped context commands
    # This is experimental and should remove _stack for actual usage
    def begin_stack(self, block, identifier):
        if block == Block.update:
            self.ng.block = Block.update
            self.call_stack = []
        else:
            pass

    def end_stack(self, block, identifier):
        if block == Block.update:
            self.ng.block = None
            for idx, ctx in enumerate(self.ctxs):
                ctx.push()
                self.ng.stream = self.strms[idx]
                for method, args, kwargs in self.call_stack:
                    myargs = [a._tensorlist[idx] if isinstance(
                        a, MGPUTensor) else a for a in args]
                    mykwargs = {k: v._tensorlist[idx] if isinstance(
                        v, MGPUTensor) else v for k, v in kwargs.iteritems()}
                    getattr(super(MGPU, self), method)(*myargs, **mykwargs)
                self.ng.stream = None
                ctx.pop()
            self.call_stack = None
        else:
            pass

    @property
    def strms(self):
        return self._strms if self.async else self._nostrms

    @property
    def redstrms(self):
        return self._redstrms if self.async else self._nostrms

    def uniform(self, low=0.0, high=1.0, size=1, dtype=default_dtype,
                name=None, persist_values=True, ptype='replica'):
        """
        generate numpy random number and convert to a GPUTensor.
        If called with dtype=None it will probably explode
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.uniform(low, high, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               name=None, persist_values=True, ptype='replica'):
        """
        Gaussian/Normal random number sample generation
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.normal(loc, scale, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def synchronize(self):
        if not self.async:
            return
        for s in self.strms:
            s.synchronize()

    def redsynchronize(self):
        if not self.async:
            return
        for s in self.redstrms:
            s.synchronize()

    def allocate_fragment(self, shape, dtype=default_dtype,
                          persist_values=True):
        # TODO: set ptype to be fragment in this case ??
        return self.empty((shape[0], shape[1] / self.num_dev), dtype,
                          persist_values=persist_values)

    def zeros_like(self, ary, dtype=default_dtype, persist_values=True,
                   name=None):
        result = self.zeros(ary.shape, dtype=dtype,
                            persist_values=persist_values)
        result.ptype = ary.ptype
        return result

    def empty_like(self, ary, dtype=default_dtype, persist_values=True,
                   name=None):
        result = self.empty(ary.shape, dtype=dtype,
                            persist_values=persist_values, name=name)
        result.ptype = ary.ptype
        return result

    def set(self, tensor, data):
        assert isinstance(tensor, MGPUTensor)
        if tensor.ptype == 'replica':
            for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
                ctx.push()
                drv.memcpy_htod_async(dest.ptr, data, strm)
                ctx.pop()
            # tensor.copy_from(data)
        else:
            self.scatter(data, tensor)

    def scatter(self, hbuf, dbuf):
        '''
        scatters the array data in hbuf to the mgpu tensor
        assumes that dbuf is a M x N and hbuf is M x (Nxk) where k is the
        number of replicas
        also assumes that dtype of hbuf and dbuf are the same
        '''
        assert hbuf.size == dbuf.size * dbuf.num_dev
        assert isinstance(dbuf, MGPUTensor)
        assert hbuf.dtype == dbuf.dtype
        ndata = dbuf.size
        starts = [i * ndata for i in range(self.num_dev)]

        for dest, strm, ctx, doff in zip(dbuf.tlist, self.strms, self.ctxs,
                                         starts):
            src = hbuf.reshape((hbuf.size))[doff:(doff + ndata)]
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, src, strm)
            ctx.pop()

        self.synchronize()

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        In this case, the weights are shards, the acts are replicas
        ubuf should be of size nout/num_dev x mbsz
        """
        ubuf = layer.mempool[0]
        assert ubuf.shape == (weights.shape[0], inputs.shape[1])

        if layer.use_biases:
            biases = layer.biases.tlist
        else:
            biases = [None for i in range(self.num_dev)]

        for dbuf, ibuf, wt, bs, strm, ctx in zip(ubuf.tlist, inputs.tlist,
                                                 weights.tlist, biases,
                                                 self.strms, self.ctxs):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt, ibuf, dbuf)
            if layer.use_biases:
                self.ng.add(dbuf, bs, out=dbuf)
            ctx.pop()

        # Note, should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        # self.synchronize()
        self.fragment_to_replica(ubuf, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        ubuf = layer.mempool[1]
        wtsz = weights.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]
        assert out.shape == (weights.shape[1], deltas.shape[1])
        assert ubuf.shape == out.shape

        for dbuf, ibuf, wt, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                  weights.tlist, self.strms,
                                                  self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt.T, ibuf[off:(off + wtsz)], dbuf)
            ctx.pop()

        # Note, should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        self.synchronize()
        self.reduce(out, ubuf)

    def update_fc(self, out, inputs, deltas, layer=None):
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for obuf, dbuf, ibuf, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                    inputs.tlist, self.strms,
                                                    self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(dbuf[off:(off + wtsz)], ibuf.T, obuf)
            ctx.pop()

        # self.synchronize()

    def update_fc_bias(self, err, out):
        """
        Compute the updated bias gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            err (GPUTensor): backpropagated error
        """
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for ebuf, obuf, strm, ctx, off in zip(err.tlist, out.tlist, self.strms,
                                              self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.sum(ebuf[off:(off + wtsz)], axis=1, out=obuf)
            ctx.pop()

    def add_fc_bias(self, inputs, bias):
        """
        This is a no-op since we absorb the bias add into the fprop_fc call
        """
        pass

    def reduce_tensor(self, ary, async=True):
        '''
        This is the case for the scalar tensor
        '''
        assert ary.size == 1
        if ary.ptype == 'replica':
            self.ctxs[0].push()
            result = ary.tlist[0].get()
            self.ctxs[0].pop()
            return result

        result = np.zeros((self.num_dev, 1), ary.dtype)
        for i, (ctx, src_buf, strm) in enumerate(zip(
                self.ctxs, ary.tlist, self.strms)):
            ctx.push()
            drv.memcpy_dtoh_async(result[i], src_buf.ptr, strm)
            ctx.pop()
        self.synchronize()
        return result.sum()
from pycuda.autoinit import context
from nervanagpu import NervanaGPU
from nervanagpu.layers import DataLayer, ConvLayer, PoolLayer, FullLayer
print context.get_device().name()

# Compare results here:
# https://github.com/soumith/convnet-benchmarks

# number of full iterations
loops = 10
# show bechmark details for each layer
layer_bench = 0
# show layer stats after each operation
print_stats = 0

ng = NervanaGPU(bench=layer_bench)

# don't learn, just benchmark
momentum = 0.0
learning_rate = 0.0

# common convolutional layer settings
conv3 = {"R": 3, "S": 3, "pad_h": 1, "pad_w": 1}
conv1 = {"R": 1, "S": 1, "pad_h": 0, "pad_w": 0}

# traditional pooling
pool2 = {"op": "max", "R": 2, "S": 2}
pool3 = {"op": "max", "R": 3, "S": 3, "str_h": 2, "str_w": 2}

# maxout pooling
pool1j2 = {"op": "max", "J": 2}  # maxout in the fc layers
from pycuda.autoinit   import context
from nervanagpu        import NervanaGPU
from nervanagpu.layers import DataLayer, ConvLayer, PoolLayer, FullLayer
print(context.get_device().name())

# Compare results here:
# https://github.com/soumith/convnet-benchmarks

# number of full iterations
loops       = 10
# show bechmark details for each layer
layer_bench = 0
# show layer stats after each operation
print_stats = 0

ng = NervanaGPU(bench=layer_bench)

# don't learn, just benchmark
momentum      = 0.0
learning_rate = 0.0

# common convolutional layer settings
conv3   = { "R":3, "S":3, "pad_h":1, "pad_w":1 }
conv1   = { "R":1, "S":1, "pad_h":0, "pad_w":0 }

# traditional pooling
pool2   = { "op":"max", "R":2, "S":2 }
pool3   = { "op":"max", "R":3, "S":3, "str_h":2, "str_w":2 }

# maxout pooling
pool1j2 = { "op":"max", "J":2 } # maxout in the fc layers
Exemple #29
0
import numpy         as np
import pycuda.driver as drv
from nervanagpu      import NervanaGPU
from pycuda.autoinit import context
from operator        import mul

print context.get_device().name()

np.set_printoptions(threshold=8193, linewidth=600, formatter={'int':lambda x: "%10d" % x,'float':lambda x: "% .3f" % x})

dtype  = np.float16
cpu    = 1
repeat = 1

ng = NervanaGPU(stochastic_round=False, bench=True)

pool = ng.pool_layer(
    "max",
    64,         # N
    64,1,64,64, # C,D,H,W
    4,1,2,2,    # J,T,R,S
    0,0,0,0,    # padding
    4,1,2,2)    # strides

dimI = pool.dimI
dimO = pool.dimO

# colapse pooling dimensions into one
# this allows for easy cpu pooling in numpy
def slicable(dim, pad=0):
Exemple #30
0
    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print "%7.3f msecs %4.0f gflops (%s_%s   : %d,%d,%d)" % (
        msecs, gflops, "cublas", op, m, n, k)

    return gflops


np.set_printoptions(threshold=8193,
                    linewidth=600,
                    formatter={'float': lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

for dtype in (np.float16, np.float32):

    for K, C, N in (
        (3072, 3072 * 1, 32),
        (3072, 3072 * 1, 64),
        (3072, 3072 * 1, 96),
        (3072, 3072 * 1, 128),
        (3072, 3072 * 2, 32),
        (3072, 3072 * 2, 64),
        (3072, 3072 * 2, 96),
        (3072, 3072 * 2, 128),
        (3072, 3072 * 3, 32),
        (3072, 3072 * 3, 64),
        (3072, 3072 * 3, 96),
Exemple #31
0
class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.
    Note that some functions defined in the generic Backend class such as are
    cross-map pooling and normalization and adaDelta are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        self.rng_seed = rng_seed
        self.rng_init()
        self.device_id = device_id if device_id is not None else 0

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this class.

        Returns:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        if hasattr(self, 'mem_pool') and self.mem_pool is not None:
            self.mem_pool_pickle = {'shape': self.mem_pool.shape,
                                    'dtype': np.float32}
            self.mem_pool = None

        return self.__dict__

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        Arguments:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        self.__dict__.update(state)
        self.mem_pool = self.ng.empty(self.mem_pool_pickle['shape'],
                                      dtype=self.mem_pool_pickle['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocates a memory pool for temporary storage
        """
        self.mem_pool = self.ng.empty(shape, dtype=dtype)

    def alloc_host_mem(self, shape, dtype):
        return drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0)

    def create_stream(self):
        return drv.Stream()

    def async_copy(self, dest, src, stream=None):
        drv.memcpy_htod_async(dest.gpudata, src, stream)

    def rng_init(self):
        """
        Initialize and seed the pseudo random number genrator. Random numbers
        are generated on the host using numpy, then transfered to device.
        """
        seed = None
        if 'rng_seed' in self.__dict__:
            seed = self.rng_seed
            logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Initialize FLOP timing.  Wraps the specified MOP calls via a decorator
        to record elapsed time and number of operations.

        Arguments:
           decorate_fc (list): string giving the function names of fully
                               connected layer forward/backward/update calls
                               to time.
           decorate_conv (list): string giving the function names of
                                 convolutional layer forward/backward/update
                                 calls to time.
           decorate_ew (list): string giving the function names of element-wise
                               calls to time.

        Notes:
            Must be called prior to first flop_timing_start call
        """
        self.start = drv.Event()
        self.end = drv.Event()
        self.flop_timer = FlopsDecorator(self)
        self.flop_timer.decorate(decorate_fc=decorate_fc,
                                 decorate_conv=decorate_conv,
                                 decorate_ew=decorate_ew)

    def flop_timinig_start(self):
        """
        Start a new FLOP timer.
        Returns:
            None: dummy value (not used)
        """
        return self.start.record()

    def flop_timing_finish(self, start_time):
        """
        Complete current FLOP timing.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: elapsed time in seconds since prior flop_timing_start call.
        """
        self.end.record()
        self.end.synchronize()
        return self.end.time_since(self.start)

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        generate numpy random number and convert to a GPUTensor.
        If called with dype=None it will probably explode
        """
        ary = np.random.uniform(low, high, shape)
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Gaussian/Normal random number sample generation
        """
        ary = np.random.normal(loc, scale, size)
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward propagate the inputs of a fully connected network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(weights.T, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Compute the updated gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(deltas, inputs.T, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward propagate the inputs of a convolutional network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                             layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                  in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).  For this
                              backend we expect these values to be square.
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  convolved outputs for a single receptive
                                  field.  Not used for this backend.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """

        '''
        N: Number of images in mini-batch
        C: Number of input feature maps
        K: Number of output feature maps

        D: Depth  of input image
        H: Height of input image
        W: Width  of input image

        T: Depth  of filter kernel
        R: Height of filter kernel
        S: Width  of filter kernel
        '''
        self.ng.fprop_conv(layer=fpropbuf, I=inputs, F=weights, O=out,
                           alpha=1.0, repeat=1)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward propagate the error through a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """
        self.ng.bprop_conv(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                           alpha=1.0, repeat=1)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Compute the updated gradient for a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fwidth (int): Filter width.
            updatebuf (GPUTensor): Temporary storage buffer used to hold the
                                   updated gradient for a single receptive
                                   field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
            layer (Layer): The layer object.
        """
        self.ng.update_conv(layer=updatebuf, I=inputs, E=deltas, grad_F=out,
                            alpha=1.0, repeat=1)

    def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape,
                   ifmshape, links, nifm, padding, stride, fpropbuf):
        """
        Forward propagate the inputs of a Pooling network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            op (string): The type of pooling operation to apply.  We support
                         "max", "avg", "l2" currently.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  pooled outputs for a single receptive field.
        """
        op = op.lower()
        if op == "max":
            self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s", op)

    def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize,
                   ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding,
                   stride, bpropbuf):
        """
        Backward propagate the error through a pooling network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            fouts (GPUTensor): Forward propagated outputs from the previous
                               layer.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            op (string): The type of pooling operation to apply.  We support
                         "max", "avg", "l2" currently.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                              each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            fpsize (int): The size of each filter.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
        """
        op = op.lower()
        if op == "max":
            self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas, grad_I=out,
                               repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s", op)

    def logistic(self, x, out):
        """
        Logistic sigmoid nonlinearity, 1/(1+exp(-x))

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        """
        self.ng.sig(x, out=out)

        return out

    def rectlin(self, x, out):
        """
        Rectified Linear nonlinearity

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        """
        self.ng.maximum(x, 0., out=out)
        return out

    def rectleaky(self, x, slope, out):
        out[:] = self.ng.maximum(x, x*slope)

    def rectleaky_derivative(self, x, slope, out):
        out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope

    def sum(self, tsr, axes, out):
        """
        Sum

        Arguments:
            tsr  (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.sum(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.sum(tsr, axis=axes, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Calculates the arithmetic mean of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.mean(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.mean(tsr, axis=axes, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Calculates the minimum of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.min(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.min(tsr, axis=axes, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Calculates the maximum of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.max(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.max(tsr, axis=axes, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Calculates the variance of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): the tensor on which to compute the variance
            axes (int, list, optional): the dimension(s) along which to
                                        variance.  If set to None, we will
                                        variance over all dimensions.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the tensor containing mean of tsr

        Returns:
            GPUTensor: reference to out
        """
        if mean is None:
            logger.error("GPUTensor requires mean to be specified.")
            raise ValueError("mean not specified")
        self.ng.mean(self.ng.square(tsr-mean),  axis=axes, out=out)
        return out

    def fabs(self, x, out):
        """
        Calculates absolute value of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.fabs(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Calculates square root of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.sqrt(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with zeros.

        Arguments:
            shape (tupel): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.zeros(shape, dtype=dtype)

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with ones.

        Arguments:
            shape (tupel): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.ones(shape, dtype=dtype)

    def empty(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor.

        Arguments:
            shape (tupel): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.empty(shape, dtype=dtype)

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Allocate a new GPUTensor and fill it with supplied numpy array.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype, optional): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls
            name (string): Name for the GPUTensor
            allocator (pycuda): Pycuda memory allocator

        Returns:
            GPUTensor: output
        """
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def add(self, left, right, out):
        """
        Elementwise addition

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.add(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Elementwise subtraction

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.subtract(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Elementwise multiplication

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.multiply(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Elementwise division

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.divide(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Elementwise greater than testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.greater(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Performs element-wise equality testing on each element of left and
        right, storing the result in out.  Each operand is assumed to be the
        same shape (or broadcastable as such).

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.equal(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Elementwise not equal testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.not_equal(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Elementwise clipping between a range of specified values

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): floor value.
            a_max (float): ceiling value.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.clip(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Elementwise base-e logarithm

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Elementwise tanh

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Calculates the indices of the maximal element value along the specified
        axis.  If multiple elements contain the maximum, only the elements of
        the first are returned.

        Arguments:
            tsr (GPUTensor): The GPUTensor on which to find the maximum indices
            axis (int): The dimension along which to find the maximum.  If set
                        to None, find the overall maximum index of a flattened
                        representation of tsr.
            out (GPUTensor): Where to store the result.  Should be of the
                             appropriate type and expected shape

        Returns:
            GPUTensor: reference to out
        """
        self.ng.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity. Computes exp(x-max(x)) / sum_i exp(x_i-max(x_i))

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        out[:] = (self.ng.reciprocal(self.ng.sum(
                  self.ng.exp(x - self.ng.max(x, axis=0)), axis=0)) *
                  self.ng.exp(x - self.ng.max(x, axis=0)))
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        Arguments:
            y (GPUTensor): input tensor.
            err (GPUTensor): backpropagated error.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        raise NotImplementedError("Softmax gradient should use shortcut")
        return out

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Create a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): Output tensor
            keepthresh (float): fraction of ones
        """
        self.ng.dropout(keep=keepthresh, out=tsr)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Perform gradient descent update with momentum.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs are written to vs_item (updated velocity)
        and ps_item (updated weights)
        """
        vs_item[:] = vs_item * momentum_coef - us_item * learning_rate
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Perform gradient descent update with momentum and weight decay.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs:
            ps_item, the updated weights.
            vs_item, the updated velocity.
            us_item, used as a temp buffer.
        """
        vs_item[:] = vs_item * momentum_coef - us_item * \
            learning_rate - learning_rate * wd * ps_item
        ps_item[:] = ps_item + vs_item

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item, ss_item,
                   rho, epsilon):
        """
        Update rule for AdaDelta (Zeiler, http://arxiv.org/abs/1212.5701)

        Arguments:
            ps_item: weight / parameter (will be updated)
            us_item: update
            gs_item: expected value of Gradient Squared (will be updated)
            ds_item: expected value of Delta Squared (will be updated)
            ls_item: learning rate (will be updated)
            ss_item: Scratch Space
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        # Accumulate E[Grad^2]
        gs_item[:] = gs_item * rho + (1.0 - rho) * us_item * us_item

        # Calculate Updates
        ls_item[:] = self.ng.sqrt((ds_item + epsilon) /
                                  (gs_item + epsilon)) * (-1.0) * us_item

        # Accumulate E[Delt^2]
        ds_item[:] = ds_item * rho + (1.0 - rho) * ls_item * ls_item

        # Final update to the params
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity, scratch_space,
                   gamma, epsilon, learning_rate, momentum_coef):

        # Update running squares
        run_squares[:] = gamma * run_squares + (1. - gamma) * updates * updates

        # Now scale the gradient by lr / rms(grad) (with a epsilon term for
        # stability) and use it to update the params
        if momentum_coef == 0:
            params[:] = params - learning_rate * updates * self.ng.reciprocal(
                self.ng.sqrt(run_squares) + epsilon)
        else:
            velocity[:] = velocity * momentum_coef - \
                learning_rate * updates * \
                self.ng.reciprocal(self.ng.sqrt(run_squares) + epsilon)
            params[:] = params + velocity

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xvar, xhat, out):
        """
        Batch normalization forward pass, compounded to run in 3 kernel calls.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xvar: variance (updated)
            xhat: normalized input (updated)
            out: normalized and rescaled input (updated)
        """
        xvar[:] = self.ng.reciprocal(self.ng.sqrt(self.ng.var(inputs, axis=1) +
                                                  eps))
        xhat[:] = xvar * (inputs - self.ng.mean(inputs, axis=1))
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Batch normalization backward pass, compounded to run with 4 kernel
        calls.

        Arguments:
            xhat: normalized input data (updated)
            error: backpropagated deltas (updated)
            xvar: precomputed variance
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)
        xhat[:] = (xhat * gamma_updates + beta_updates) / float(xhat.shape[1])
        error[:] = xvar * gamma * (error - xhat)
Exemple #32
0
dtype  = np.float16
repeat = 20

start, end = (drv.Event(), drv.Event())

def start_bench():
    start.record()

def end_bench(op):
    end.record()
    end.synchronize()
    msecs  = end.time_since(start) / repeat
    gflops = conv.flops / (msecs * 1000000.0)
    print "%7.3f msecs %8.3f gflops (%s: %s)" % (msecs, gflops, op, conv)

ng = NervanaGPU(stochastic_round=False, bench=True)

# Create a cuDNN context
cudnn = libcudnn.cudnnCreate()

C_desc = libcudnn.cudnnCreateConvolutionDescriptor()
I_desc = libcudnn.cudnnCreateTensorDescriptor()
O_desc = libcudnn.cudnnCreateTensorDescriptor()
E_desc = libcudnn.cudnnCreateTensorDescriptor()
B_desc = libcudnn.cudnnCreateTensorDescriptor()
F_desc = libcudnn.cudnnCreateFilterDescriptor()
U_desc = libcudnn.cudnnCreateFilterDescriptor()

# Set some options and tensor dimensions
NCHW_fmt  = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
cu_dtype  = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
Exemple #33
0
        cublas.cublasSgemm(handle, opB, opA, n, m, k, alpha, B.gpudata, ldb,
                           A.gpudata, lda, beta, C.gpudata, ldc)

    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print "%7.3f msecs %4.0f gflops (%s_%s   : %d,%d,%d)" % (
        msecs, gflops, "cublas", op, m, n, k)


np.set_printoptions(threshold=8193,
                    linewidth=600,
                    formatter={'float': lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

repeat = 1

for dtype in (
        np.float16,
        np.float32,
):

    for K, C, N in ((32, 4096, 1512), ):

        for alpha, beta in ((1.0, 0.0), (0.5, 0.5)):

            for op, dimA, dimB, dimC in (
                ("nn", (K, C), (C, N), (K, N)),  # fprop
                ("tn", (K, C), (K, N), (C, N)),  # bprop
#!/usr/bin/python

import numpy as np
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from ipdb import set_trace

np.set_printoptions(threshold=8192 * 4,
                    linewidth=600,
                    formatter={
                        'int': lambda x: "%2d" % x,
                        'float': lambda x: "%2.0f" % x
                    })

ng = NervanaGPU(stochastic_round=0, bench=1)

dtype = np.float32  # np.float16 or np.float32
repeat = 50  # repeat count for benchmarking
ones = 0  # simpler data for debugging
cpu = 0  # valdiate against numpy
size = 32  # 32, 64, 128, None=auto

X = 100  # Batch Size
N = 32  # Minibatch Size
C = 3072  # Input  Features
K = 3072  # Output Features
Nin = True

dimW = (K, C)
if Nin: