def __init__(self, rng_seed, stochastic_round=False, device_id=0):
    import pycuda.driver as drv
    drv.init()
    global ctx
    ctx = drv.Device(device_id).make_context()
    import atexit
    atexit.register(ctx.pop)
    self.ng = NervanaGPU(stochastic_round=stochastic_round)
    logger.info("Initialized NervanaGPU with stochastic_round=%s",
                stochastic_round)
    self.rng_seed = rng_seed
    self.rng_init()
    self.device_id = device_id if device_id is not None else 0
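# Hedged usage sketch (illustrative values, not from the source): constructing
# the single-GPU backend defined later in this section and allocating a
# device buffer with it.
be = GPU(rng_seed=0, stochastic_round=False, device_id=0)
buf = be.empty((128, 128))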
class MGPU(GPU):
    default_dtype = np.float32
    num_dev = 1
    is_dist = True

    def __init__(self, rng_seed, stochastic_round=False, device_id=0,
                 num_dev=2):
        drv.init()
        self.num_dev = num_dev

        if device_id == 0:
            self.dev_list = range(num_dev)
        else:
            self.dev_list = device_id

        assert len(self.dev_list) == self.num_dev
        assert self.num_dev <= drv.Device.count()

        self.ctxs = []
        self.devs = []
        self._strms = []
        self._redstrms = []
        self._events = []
        self._redevents = []
        self.async = True
        self._nostrms = [None for i in self.dev_list]

        for i in self.dev_list:
            self.devs.append(drv.Device(i))

        for dev in self.devs:
            self.ctxs.append(
                dev.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC))
            self._strms.append(drv.Stream())
            self._redstrms.append(drv.Stream())
            self._events.append(drv.Event())
            self._redevents.append(drv.Event())
            drv.Context.pop()

        self.ctxs[0].push()
        atexit.register(drv.Context.pop)

        MGPUTensor.ctxs = self.ctxs
        MGPUTensor.num_dev = num_dev

        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized %d device NervanaGPU, stochastic_round=%s",
                    num_dev, stochastic_round)
        self.ng.block = None
        self.rng_seed = rng_seed
        self.rng_init()

        # Setup the pairwise contexts
        # TODO clean up this code to avoid indexing
        for dev1, ctx1 in zip(self.devs, self.ctxs):
            ctx1.push()
            for dev2, ctx2 in zip(self.devs, self.ctxs):
                if dev1 == dev2:
                    continue
                if dev1.can_access_peer(dev2):
                    ctx1.enable_peer_access(ctx2)
                else:
                    print('Cannot enable peer access between '
                          '{:d} and {:d}'.format(dev1, dev2))
            ctx1.pop()

    def make_events(self):
        evtlist = []
        for ctx in self.ctxs:
            ctx.push()
            evtlist.append(drv.Event())
            ctx.pop()
        return evtlist

    # These definitions are for performing grouped context commands
    # This is experimental and should remove _stack for actual usage
    def begin_stack(self, block, identifier):
        if block == Block.update:
            self.ng.block = Block.update
            self.call_stack = []
        else:
            pass

    def end_stack(self, block, identifier):
        if block == Block.update:
            self.ng.block = None
            for idx, ctx in enumerate(self.ctxs):
                ctx.push()
                self.ng.stream = self.strms[idx]
                for method, args, kwargs in self.call_stack:
                    myargs = [a._tensorlist[idx] if isinstance(a, MGPUTensor)
                              else a for a in args]
                    mykwargs = {k: v._tensorlist[idx] if isinstance(
                        v, MGPUTensor) else v for k, v in kwargs.iteritems()}
                    getattr(super(MGPU, self), method)(*myargs, **mykwargs)
                self.ng.stream = None
                ctx.pop()
            self.call_stack = None
        else:
            pass

    @property
    def strms(self):
        return self._strms if self.async else self._nostrms

    @property
    def redstrms(self):
        return self._redstrms if self.async else self._nostrms

    def uniform(self, low=0.0, high=1.0, size=1, dtype=default_dtype,
                name=None, persist_values=True, ptype='replica'):
        """
        generate numpy random number and convert to a GPUTensor.
        If called with dtype=None it will probably explode
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.uniform(low, high, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               name=None, persist_values=True, ptype='replica'):
        """
        Gaussian/Normal random number sample generation
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.normal(loc, scale, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def synchronize(self):
        if not self.async:
            return
        for s in self.strms:
            s.synchronize()

    def redsynchronize(self):
        if not self.async:
            return
        for s in self.redstrms:
            s.synchronize()

    def allocate_fragment(self, shape, dtype=default_dtype,
                          persist_values=True):
        # TODO: set ptype to be fragment in this case ??
        return self.empty((shape[0], shape[1] / self.num_dev), dtype,
                          persist_values=persist_values)

    def zeros_like(self, ary, dtype=default_dtype, persist_values=True,
                   name=None):
        result = self.zeros(ary.shape, dtype=dtype,
                            persist_values=persist_values)
        result.ptype = ary.ptype
        return result

    def empty_like(self, ary, dtype=default_dtype, persist_values=True,
                   name=None):
        result = self.empty(ary.shape, dtype=dtype,
                            persist_values=persist_values, name=name)
        result.ptype = ary.ptype
        return result

    def set(self, tensor, data):
        assert isinstance(tensor, MGPUTensor)
        if tensor.ptype == 'replica':
            for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
                ctx.push()
                drv.memcpy_htod_async(dest.ptr, data, strm)
                ctx.pop()
            # tensor.copy_from(data)
        else:
            self.scatter(data, tensor)

    def scatter(self, hbuf, dbuf):
        '''
        scatters the array data in hbuf to the mgpu tensor
        assumes that dbuf is a M x N and hbuf is M x (Nxk) where k is the
        number of replicas
        also assumes that dtype of hbuf and dbuf are the same
        '''
        assert hbuf.size == dbuf.size * dbuf.num_dev
        assert isinstance(dbuf, MGPUTensor)
        assert hbuf.dtype == dbuf.dtype
        ndata = dbuf.size
        starts = [i * ndata for i in range(self.num_dev)]

        for dest, strm, ctx, doff in zip(dbuf.tlist, self.strms, self.ctxs,
                                         starts):
            src = hbuf.reshape((hbuf.size))[doff:(doff + ndata)]
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, src, strm)
            ctx.pop()

        self.synchronize()

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        In this case, the weights are shards, the acts are replicas
        ubuf should be of size nout/num_dev x mbsz
        """
        ubuf = layer.mempool[0]
        assert ubuf.shape == (weights.shape[0], inputs.shape[1])

        if layer.use_biases:
            biases = layer.biases.tlist
        else:
            biases = [None for i in range(self.num_dev)]

        for dbuf, ibuf, wt, bs, strm, ctx in zip(ubuf.tlist, inputs.tlist,
                                                 weights.tlist, biases,
                                                 self.strms, self.ctxs):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt, ibuf, dbuf)
            if layer.use_biases:
                self.ng.add(dbuf, bs, out=dbuf)
            ctx.pop()
        # Note, should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        # self.synchronize()
        self.fragment_to_replica(ubuf, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        ubuf = layer.mempool[1]
        wtsz = weights.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]
        assert out.shape == (weights.shape[1], deltas.shape[1])
        assert ubuf.shape == out.shape

        for dbuf, ibuf, wt, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                  weights.tlist, self.strms,
                                                  self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt.T, ibuf[off:(off + wtsz)], dbuf)
            ctx.pop()
        # Note, should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        self.synchronize()
        self.reduce(out, ubuf)

    def update_fc(self, out, inputs, deltas, layer=None):
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for obuf, dbuf, ibuf, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                    inputs.tlist, self.strms,
                                                    self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(dbuf[off:(off + wtsz)], ibuf.T, obuf)
            ctx.pop()
        # self.synchronize()

    def update_fc_bias(self, err, out):
        """
        Compute the updated bias gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            err (GPUTensor): backpropagated error
        """
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for ebuf, obuf, strm, ctx, off in zip(err.tlist, out.tlist,
                                              self.strms, self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.sum(ebuf[off:(off + wtsz)], axis=1, out=obuf)
            ctx.pop()

    def add_fc_bias(self, inputs, bias):
        """
        This is a no-op since we absorb the bias add into the fprop_fc call
        """
        pass

    def reduce_tensor(self, ary, async=True):
        '''
        This is the case for the scalar tensor
        '''
        assert ary.size == 1
        if ary.ptype == 'replica':
            self.ctxs[0].push()
            result = ary.tlist[0].get()
            self.ctxs[0].pop()
            return result

        result = np.zeros((self.num_dev, 1), ary.dtype)
        for i, (ctx, src_buf, strm) in enumerate(zip(self.ctxs, ary.tlist,
                                                     self.strms)):
            ctx.push()
            drv.memcpy_dtoh_async(result[i], src_buf.ptr, strm)
            ctx.pop()
        self.synchronize()
        return result.sum()
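# A hedged, host-only sketch (plain NumPy, not part of the backend above) of
# the buffer layout scatter() assumes: the flattened host buffer is cut into
# num_dev equal contiguous chunks, one per device fragment.
import numpy as np

num_dev = 2
hbuf = np.arange(12, dtype=np.float32)  # stands in for the host array
ndata = hbuf.size // num_dev            # elements per device fragment
starts = [i * ndata for i in range(num_dev)]
fragments = [hbuf[off:off + ndata] for off in starts]
# each fragments[i] is what memcpy_htod_async would copy onto device i
assert sum(f.size for f in fragments) == hbuf.size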
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using just cublas, compare N as the contiguous dimension versus the
# non-contiguous dimension.

import numpy as np
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from scikits.cuda import cublas

print(context.get_device().name())

ng = NervanaGPU(stochastic_round=False, bench=True)

handle = cublas.cublasCreate()

start, end = (drv.Event(), drv.Event())


def cublas_dot(op, A, B, C, repeat=1, warmup=False):

    lda = A.shape[0]
    ldb = B.shape[0]
    ldc = C.shape[0]

    m = C.shape[0]
    n = C.shape[1]
    k = A.shape[1] if op[0] == 'n' else A.shape[0]
class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.

    Note that some functions defined in the generic Backend class, such as
    cross-map pooling and normalization and adaDelta, are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        self.rng_seed = rng_seed
        self.rng_init()
        self.device_id = device_id if device_id is not None else 0

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this
        class.

        Returns:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool, which is on device and
                           cannot be serialized.
        """
        if hasattr(self, 'mem_pool') and self.mem_pool is not None:
            self.mem_pool_pickle = {'shape': self.mem_pool.shape,
                                    'dtype': np.float32}
            self.mem_pool = None
        return self.__dict__

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        Arguments:
            state (dict): The full contents of the backend class instance,
                          except for the mem_pool, which is on device and
                          cannot be serialized.
        """
        self.__dict__.update(state)
        self.mem_pool = self.ng.empty(self.mem_pool_pickle['shape'],
                                      dtype=self.mem_pool_pickle['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocates a memory pool for temporary storage
        """
        self.mem_pool = self.ng.empty(shape, dtype=dtype)

    def alloc_host_mem(self, shape, dtype):
        return drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0)

    def create_stream(self):
        return drv.Stream()

    def async_copy(self, dest, src, stream=None):
        drv.memcpy_htod_async(dest.gpudata, src, stream)

    def rng_init(self):
        """
        Initialize and seed the pseudo random number generator.  Random
        numbers are generated on the host using numpy, then transferred to
        device.
        """
        seed = None
        if 'rng_seed' in self.__dict__:
            seed = self.rng_seed
        logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Initialize FLOP timing.  Wraps the specified MOP calls via a decorator
        to record elapsed time and number of operations.

        Arguments:
            decorate_fc (list): string giving the function names of fully
                                connected layer forward/backward/update calls
                                to time.
            decorate_conv (list): string giving the function names of
                                  convolutional layer forward/backward/update
                                  calls to time.
            decorate_ew (list): string giving the function names of
                                element-wise calls to time.

        Notes:
            Must be called prior to first flop_timing_start call
        """
        self.start = drv.Event()
        self.end = drv.Event()
        self.flop_timer = FlopsDecorator(self)
        self.flop_timer.decorate(decorate_fc=decorate_fc,
                                 decorate_conv=decorate_conv,
                                 decorate_ew=decorate_ew)

    def flop_timing_start(self):
        """
        Start a new FLOP timer.

        Returns:
            None: dummy value (not used)
        """
        return self.start.record()

    def flop_timing_finish(self, start_time):
        """
        Complete current FLOP timing.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: elapsed time in milliseconds since prior flop_timing_start
                   call.
        """
        self.end.record()
        self.end.synchronize()
        return self.end.time_since(self.start)

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        generate numpy random number and convert to a GPUTensor.
        If called with dtype=None it will probably explode
        """
        ary = np.random.uniform(low, high, shape)
        return self.ng.array(ary, dtype=dtype, name=name)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Gaussian/Normal random number sample generation
        """
        ary = np.random.normal(loc, scale, size)
        return self.ng.array(ary, dtype=dtype, name=name)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward propagate the inputs of a fully connected network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values
                                (first layer), or the outputs from the
                                previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(weights.T, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Compute the updated gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values
                                (first layer), or the outputs from the
                                previous layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(deltas, inputs.T, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward propagate the inputs of a convolutional network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values
                                (first layer), or the outputs from the
                                previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).  For this
                              backend we expect these values to be square.
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  convolved outputs for a single receptive
                                  field.  Not used for this backend.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """
        '''
        N: Number of images in mini-batch
        C: Number of input feature maps
        K: Number of output feature maps

        D: Depth of input image
        H: Height of input image
        W: Width of input image

        T: Depth of filter kernel
        R: Height of filter kernel
        S: Width of filter kernel
        '''
        self.ng.fprop_conv(layer=fpropbuf, I=inputs, F=weights, O=out,
                           alpha=1.0, repeat=1)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward propagate the error through a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """
        self.ng.bprop_conv(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                           alpha=1.0, repeat=1)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Compute the updated gradient for a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values
                                (first layer), or the outputs from the
                                previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fwidth (int): Filter width.
            updatebuf (GPUTensor): Temporary storage buffer used to hold the
                                   updated gradient for a single receptive
                                   field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
            layer (Layer): The layer object.
        """
        self.ng.update_conv(layer=updatebuf, I=inputs, E=deltas, grad_F=out,
                            alpha=1.0, repeat=1)

    def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape,
                   ifmshape, links, nifm, padding, stride, fpropbuf):
        """
        Forward propagate the inputs of a Pooling network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous
                                layer.
            op (string): The type of pooling operation to apply.  We support
                         "max", "avg", "l2" currently.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  pooled outputs for a single receptive field.
        """
        op = op.lower()
        if op == "max":
            self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s" % op)

    def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize,
                   ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding,
                   stride, bpropbuf):
        """
        Backward propagate the error through a pooling network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            fouts (GPUTensor): Forward propagated outputs from the previous
                               layer.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous
                                layer.
            deltas (GPUTensor): The error values for this layer
            op (string): The type of pooling operation to apply.  We support
                         "max", "avg", "l2" currently.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            fpsize (int): The size of each filter.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
        """
        op = op.lower()
        if op == "max":
            self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas,
                               grad_I=out, repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s" % op)

    def logistic(self, x, out):
        """
        Logistic sigmoid nonlinearity, 1/(1+exp(-x))

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor
        """
        self.ng.sig(x, out=out)
        return out

    def rectlin(self, x, out):
        """
        Rectified Linear nonlinearity

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor
        """
        self.ng.maximum(x, 0., out=out)
        return out

    def rectleaky(self, x, slope, out):
        out[:] = self.ng.maximum(x, x * slope)

    def rectleaky_derivative(self, x, slope, out):
        out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope

    def sum(self, tsr, axes, out):
        """
        Sum

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed.  If axes
                        is None, the tensor is flattened and reduced over both
                        dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0] * tsr.shape[1]
            self.ng.sum(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.sum(tsr, axis=axes, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Calculates the arithmetic mean of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed.  If axes
                        is None, the tensor is flattened and reduced over both
                        dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0] * tsr.shape[1]
            self.ng.mean(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.mean(tsr, axis=axes, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Calculates the minimum of the elements along the specified axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed.  If axes
                        is None, the tensor is flattened and reduced over both
                        dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0] * tsr.shape[1]
            self.ng.min(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.min(tsr, axis=axes, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Calculates the maximum of the elements along the specified axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed.  If axes
                        is None, the tensor is flattened and reduced over both
                        dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0] * tsr.shape[1]
            self.ng.max(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.max(tsr, axis=axes, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Calculates the variance of the elements along the specified axes.

        Arguments:
            tsr (GPUTensor): the tensor on which to compute the variance
            axes (int, list, optional): the dimension(s) along which to
                                        compute the variance.  If set to None,
                                        the variance is computed over all
                                        dimensions.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the tensor containing mean of tsr

        Returns:
            GPUTensor: reference to out
        """
        if mean is None:
            logger.error("GPUTensor requires mean to be specified.")
            raise ValueError("mean not specified")
        self.ng.mean(self.ng.square(tsr - mean), axis=axes, out=out)
        return out

    def fabs(self, x, out):
        """
        Calculates absolute value of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.fabs(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Calculates square root of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.sqrt(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with zeros.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.zeros(shape, dtype=dtype)

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with ones.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.ones(shape, dtype=dtype)

    def empty(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.empty(shape, dtype=dtype)

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Allocate a new GPUTensor and fill it with supplied numpy array.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype, optional): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls
            name (string): Name for the GPUTensor
            allocator (pycuda): Pycuda memory allocator

        Returns:
            GPUTensor: output
        """
        return self.ng.array(ary, dtype=dtype, name=name)

    def add(self, left, right, out):
        """
        Elementwise addition

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.add(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Elementwise subtraction

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.
        Returns:
            GPUTensor: reference to out
        """
        self.ng.subtract(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Elementwise multiplication

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.multiply(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Elementwise division

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.divide(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Elementwise greater than testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.greater(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Performs element-wise equality testing on each element of left and
        right, storing the result in out.  Each operand is assumed to be the
        same shape (or broadcastable as such).

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.equal(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Elementwise not equal testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.not_equal(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Elementwise clipping between a range of specified values

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): floor value.
            a_max (float): ceiling value.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.clip(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Elementwise base-e logarithm

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Elementwise tanh

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Calculates the indices of the maximal element value along the
        specified axis.  If multiple elements contain the maximum, only the
        index of the first is returned.

        Arguments:
            a (GPUTensor): The tensor on which to find the maximum indices
            out (GPUTensor): Where to store the result.  Should be of the
                             appropriate type and expected shape
            axis (int): The dimension along which to find the maximum.  If set
                        to None, find the overall maximum index of a flattened
                        representation of a.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity.  Computes exp(x-max(x)) / sum_i exp(x_i-max(x_i))

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.
        Returns:
            GPUTensor: reference to out
        """
        out[:] = (self.ng.reciprocal(self.ng.sum(
                  self.ng.exp(x - self.ng.max(x, axis=0)), axis=0)) *
                  self.ng.exp(x - self.ng.max(x, axis=0)))
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        Arguments:
            y (GPUTensor): input tensor.
            err (GPUTensor): backpropagated error.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        raise NotImplementedError("Softmax gradient should use shortcut")
        return out

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Create a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): Output tensor
            keepthresh (float): fraction of ones
        """
        self.ng.dropout(keep=keepthresh, out=tsr)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Perform gradient descent update with momentum.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs are written to vs_item (updated velocity) and ps_item
        (updated weights)
        """
        vs_item[:] = vs_item * momentum_coef - us_item * learning_rate
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Perform gradient descent update with momentum and weight decay.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs:
            ps_item, the updated weights.
            vs_item, the updated velocity.
            us_item, used as a temp buffer.
        """
        vs_item[:] = (vs_item * momentum_coef - us_item * learning_rate -
                      learning_rate * wd * ps_item)
        ps_item[:] = ps_item + vs_item

    def exp_mavg(self, mavg, newval, rho):
        """
        Calculate the exponential moving average

        Arguments:
            mavg:  The running value of the moving average
            newval:  New sample to be added to the moving average
            rho:  Interpolation value
        """
        mavg[:] = rho * mavg + (1.0 - rho) * newval

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item, ss_item,
                   rho, epsilon):
        """
        Update rule for AdaDelta (Zeiler, http://arxiv.org/abs/1212.5701)

        Arguments:
            ps_item: weight / parameter (will be updated)
            us_item: update
            gs_item: expected value of Gradient Squared (will be updated)
            ds_item: expected value of Delta Squared (will be updated)
            ls_item: learning rate (will be updated)
            ss_item: Scratch Space
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        # Accumulate E[Grad^2]
        gs_item[:] = gs_item * rho + (1.0 - rho) * us_item * us_item

        # Calculate Updates
        ls_item[:] = self.ng.sqrt((ds_item + epsilon) /
                                  (gs_item + epsilon)) * (-1.0) * us_item

        # Accumulate E[Delt^2]
        ds_item[:] = ds_item * rho + (1.0 - rho) * ls_item * ls_item

        # Final update to the params
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity,
                   scratch_space, gamma, epsilon, learning_rate,
                   momentum_coef):

        # Update running squares
        run_squares[:] = (gamma * run_squares +
                          (1. - gamma) * updates * updates)

        # Now scale the gradient by lr / rms(grad) (with an epsilon term for
        # stability) and use it to update the params
        if momentum_coef == 0:
            params[:] = params - learning_rate * updates * self.ng.reciprocal(
                self.ng.sqrt(run_squares) + epsilon)
        else:
            velocity[:] = (velocity * momentum_coef -
                           learning_rate * updates *
                           self.ng.reciprocal(self.ng.sqrt(run_squares) +
                                              epsilon))
            params[:] = params + velocity

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xhat, xmean, xvar,
                          gmean, gvar, rho, out):
        """
        Batch normalization forward pass, compounded to run in 3 kernel calls.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xhat: normalized input (updated)
            xmean: batch mean (updated)
            xvar: batch variance (updated)
            gmean: running global mean (updated)
            gvar: running global variance (updated)
            rho: decay constant for the running mean and variance
            out: normalized and rescaled input (updated)
        """
        xvar[:] = self.ng.var(inputs, axis=1)
        xmean[:] = self.ng.mean(inputs, axis=1)
        gmean[:] = gmean * rho + (1.0 - rho) * xmean
        gvar[:] = gvar * rho + (1.0 - rho) * xvar

        xvar[:] = self.ng.reciprocal(self.ng.sqrt(xvar + eps))
        xhat[:] = xvar * (inputs - xmean)
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma, beta_updates,
                          gamma_updates):
        """
        Batch normalization backward pass, compounded to run with 4 kernel
        calls.

        Arguments:
            xhat: normalized input data (updated)
            error: backpropagated deltas (updated)
            xvar: precomputed variance
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)
        xhat[:] = (xhat * gamma_updates + beta_updates) / float(xhat.shape[1])
        error[:] = xvar * gamma * (error - xhat)
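# A minimal host-side sketch (plain NumPy; not part of the backend above) that
# mirrors the AdaDelta rule ada_update() runs on device, handy as a reference
# when validating the compounded GPU kernels.  Names here are illustrative.
import numpy as np


def ada_update_ref(ps, us, gs, ds, rho, epsilon):
    gs[:] = gs * rho + (1.0 - rho) * us * us              # accumulate E[Grad^2]
    ls = -np.sqrt((ds + epsilon) / (gs + epsilon)) * us   # scaled update
    ds[:] = ds * rho + (1.0 - rho) * ls * ls              # accumulate E[Delta^2]
    ps[:] = ps + ls                                       # apply the update
    return ls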
    # Swap A and B to map from C order to Fortran
    for r in range(repeat):
        cublas.cublasSgemm(handle, opB, opA, n, m, k, alpha, B.gpudata, ldb,
                           A.gpudata, lda, beta, C.gpudata, ldc)

    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print("%7.3f msecs %4.0f gflops (%s_%s : %d,%d,%d)" %
          (msecs, gflops, "cublas", op, m, n, k))

np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'float': lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

repeat = 1

for dtype in (np.float16, np.float32,):

    for K, C, N in ((32, 4096, 1512),):

        for alpha, beta in ((1.0, 0.0), (0.5, 0.5)):

            for op, dimA, dimB, dimC in (
                    ("nn", (K, C), (C, N), (K, N)),    # fprop
                    ("tn", (K, C), (K, N), (C, N)),    # bprop
                    ("nt", (K, N), (C, N), (K, C)),):  # update

                devA1 = ng.empty(dimA, dtype=dtype)
    # Swap A and B to map from C order to Fortran
    for r in range(repeat):
        cublas.cublasSgemm(handle, opB, opA, n, m, k, alpha, B.gpudata, ldb,
                           A.gpudata, lda, beta, C.gpudata, ldc)

    if repeat > 1:
        end.record()
        end.synchronize()
        msecs = end.time_since(start) / repeat
        gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
        print("%7.3f msecs %4.0f gflops (%s_%s : %d,%d,%d)" %
              (msecs, gflops, "cublas", op, m, n, k))

np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'float': lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=0, bench=0)

small_1 = (1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 32, 64, 65, 72, 120, 127, 128, 192)
medium_1 = (32, 64, 128, 192, 778, 785, 786, 787, 794)
big_1 = (32, 64, 128, 1532, 1535, 1536, 1537, 1540, 3073, 4095)

small_2 = (8, 16, 32, 64, 72, 96, 120, 128, 192)
medium_2 = (32, 64, 128, 192, 256, 786 - 32, 786 - 16, 786, 786 + 16,
            786 + 32)
big_2 = (32, 64, 128, 1536 - 80, 1536 - 64, 1536, 1536 + 64, 1536 + 80,
         3072, 4096)

# sharedDim = (4096,4096)
# devA1s = ng.empty(sharedDim, dtype=np.float32)
# devB1s = ng.empty(sharedDim, dtype=np.float32)
# devC1s = ng.empty(sharedDim, dtype=np.float32)
# devA2s = ng.empty(sharedDim, dtype=np.float32)
# devB2s = ng.empty(sharedDim, dtype=np.float32)
#!/usr/bin/python

import numpy as np
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from ipdb import set_trace

np.set_printoptions(threshold=8192 * 4, linewidth=600,
                    formatter={'int': lambda x: "%2d" % x,
                               'float': lambda x: "%2.0f" % x})

ng = NervanaGPU(stochastic_round=0, bench=1)

dtype = np.float32  # np.float16 or np.float32
repeat = 50         # repeat count for benchmarking
ones = 0            # simpler data for debugging
cpu = 0             # validate against numpy
size = 32           # 32, 64, 128, None=auto

X = 100   # Batch Size
N = 32    # Minibatch Size
C = 3072  # Input Features
K = 3072  # Output Features

Nin = True

dimW = (K, C)
if Nin:
    dimI = (X, C, N)
    dimO = (X, K, N)
else:
    dimI = (X, N, C)
    dimO = (X, N, K)
def run():
    ng = NervanaGPU(stochastic_round=False)
    dt = np.float32

    # N: Number of images in mini-batch
    # C: Number of input feature maps
    # K: Number of output feature maps
    # D: Depth of input image
    # H: Height of input image
    # W: Width of input image
    # T: Depth of filter kernel
    # R: Height of filter kernel
    # S: Width of filter kernel
    #
    # * images:  (numColors, imgSizeY, imgSizeX, numImages) with stride given
    # * filters: (numColors, filterPixels, numFilters) if conv
    #            (numModules, numColors, filterPixels, numFilters) otherwise
    #
    # * targets: (numFilters, numModulesY, numModulesX, numImages)
    N = 128
    C = 3
    K = 64
    D = 1
    H = 64
    W = 64
    T = 1
    R = 8
    S = 8
    pad_h = pad_w = 0
    str_h = str_w = 4

    layer = ng.conv_layer(dt, N, C, K, D=D, H=H, W=W, T=T, R=R, S=S,
                          pad_d=0, pad_h=pad_h, pad_w=pad_w, str_d=1,
                          str_h=str_h, str_w=str_w, grid_P=0, grid_Q=0,
                          update_size=None)

    numImages = N
    numFilters = K
    numModulesY = int(math.ceil(float(H - R + 1 + 2 * pad_h) / str_h))
    numModulesX = int(math.ceil(float(W - S + 1 + 2 * pad_w) / str_w))
    print("Num Modules %d %d" % (numModulesX, numModulesY))

    # Set up images, filters, and outputs
    # imgd = np.loadtxt("im1.txt")
    # img = np.zeros((64, 64, 3))
    # print imgd.shape
    # for i in range(3):
    #     img[:, :, i] = imgd[i*64:(i+1)*64, :]
    # hostImages = np.tile(img)
    hostImages = np.random.rand(C, H, W, N)
    hostFilters = np.random.uniform(low=0.0, high=1.0,
                                    size=(C, S * R, numFilters))
    # hostFilters = np.ones((C, S*R, numFilters))
    hostOutputs = np.zeros((numFilters, numModulesY, numModulesX, N))

    print("Input sum %f" % np.sum(hostImages))

    # Run cc2 kernel
    devI = ng.array(hostImages, dtype=dt)
    devF = ng.array(hostFilters, dtype=dt)
    devO = ng.array(hostOutputs, dtype=dt)
    ng.fprop_cuda_conv(layer, devI, devF, devO)

    print("CC2 input sum:  %f" % np.sum(devI.asnumpyarray()))
    print("CC2 output sum: %f" % np.sum(devO.asnumpyarray()))

    # Run maxwell kernel
    # images:  (C * H * W, N)
    # filters: (C * S * R, numFilters)
    # outputs: (numFilters * numModulesX * numModulesY, N)
    devI = ng.array(hostImages.reshape((C * H * W, N)), dtype=dt)
    devF = ng.array(hostFilters.reshape((C * S * R, numFilters)), dtype=dt)
    devO2 = ng.array(hostOutputs.reshape(numFilters * numModulesX *
                                         numModulesY, N), dtype=dt)
    ng.fprop_conv(layer, devI, devF, devO2)

    print("NG input sum:  %f" % np.sum(devI.asnumpyarray()))
    print("NG output sum: %f" % np.sum(devO2.asnumpyarray()))

    hostOutputs1 = np.reshape(devO.asnumpyarray(), devO2.shape)
    hostOutputs2 = devO2.asnumpyarray()
    for i in xrange(hostOutputs1.shape[0]):
        for j in xrange(hostOutputs1.shape[1]):
            assert(abs(hostOutputs1[i, j] - hostOutputs2[i, j]) < 1e-4)
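# A quick host-side check (illustrative, not from the source) of the
# output-size arithmetic used in run() above:
# numModules = ceil((dim - filter + 1 + 2*pad) / stride)
import math
H, R, pad_h, str_h = 64, 8, 0, 4
assert int(math.ceil(float(H - R + 1 + 2 * pad_h) / str_h)) == 15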
import sys
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from operator import mul
if sys.version_info >= (3, 0):
    from functools import reduce

print(context.get_device().name())

np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'int': lambda x: "%10d" % x,
                               'float': lambda x: "% .0f" % x})

ops = set(("update",))  # "fprop","bprop","update"
ones = 0
cpu = 0  # Set CPU to 1 to check against CPU
repeat = 1
dtype = np.float32

ng = NervanaGPU(stochastic_round=False, bench=True)

conv = ng.conv_layer(
    dtype,
    16, 3, 8,   # N,C,K
    1, 64, 64,  # D,H,W
    1, 3, 3,    # T,R,S
    0, 1, 1,    # padding
    1, 1, 1)    # strides

dimI = conv.dimI
dimF = conv.dimF
dimO = conv.dimO

# collapse outer dimensions into one and preserve inner dimension
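# A plausible continuation sketch for the comment above (an assumption based
# on the 'from operator import mul' import): flatten all but the innermost
# dimension of each shape.
dimI2 = (reduce(mul, dimI[:-1], 1), dimI[-1])
dimF2 = (reduce(mul, dimF[:-1], 1), dimF[-1])
dimO2 = (reduce(mul, dimO[:-1], 1), dimO[-1])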
start, end = (drv.Event(), drv.Event())


def start_bench():
    start.record()


def end_bench(op):
    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = conv.flops / (msecs * 1000000.0)
    print("%7.3f msecs %8.3f gflops (%s: %s)" % (msecs, gflops, op, conv))

ng = NervanaGPU(stochastic_round=False, bench=True)

# Create a cuDNN context
cudnn = libcudnn.cudnnCreate()

C_desc = libcudnn.cudnnCreateConvolutionDescriptor()
I_desc = libcudnn.cudnnCreateTensorDescriptor()
O_desc = libcudnn.cudnnCreateTensorDescriptor()
E_desc = libcudnn.cudnnCreateTensorDescriptor()
B_desc = libcudnn.cudnnCreateTensorDescriptor()
F_desc = libcudnn.cudnnCreateFilterDescriptor()
U_desc = libcudnn.cudnnCreateFilterDescriptor()

# Set some options and tensor dimensions
NCHW_fmt = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
cu_dtype = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
# Note: GoogLeNet2 only fits in fp16 currently.  I need to work out delta
# sharing in inception layers.
nets = ("Alexnet", "Overfeat", "GoogLeNet1", "GoogLeNet2", "VGG", "VGG_E",)

# Available dtypes: np.float16, np.float32
dtypes = (np.float16, np.float32)

# number of full iterations
loops = 10
# show benchmark details for each layer
layer_bench = 0
# show layer stats after each operation
print_stats = 0
# run network with all zeros to see speed difference
zeros = 0
# print more stuff
verbose = 0

ng = NervanaGPU(bench=layer_bench)

# common convolutional layer settings
conv11 = {"R": 11, "S": 11, "pad_h": 2, "pad_w": 2, "str_h": 4, "str_w": 4}
conv11p0 = {"R": 11, "S": 11, "pad_h": 0, "pad_w": 0, "str_h": 4, "str_w": 4}
conv7 = {"R": 7, "S": 7, "pad_h": 3, "pad_w": 3, "str_h": 2, "str_w": 2}
conv5 = {"R": 5, "S": 5, "pad_h": 2, "pad_w": 2}
conv5p0 = {"R": 5, "S": 5, "pad_h": 0, "pad_w": 0}
conv3 = {"R": 3, "S": 3, "pad_h": 1, "pad_w": 1}
conv2 = {"R": 2, "S": 2, "pad_h": 0, "pad_w": 0, "str_h": 2, "str_w": 2}
conv1 = {"R": 1, "S": 1, "pad_h": 0, "pad_w": 0}

# traditional pooling
pool2s2p0 = {"R": 2, "S": 2}
pool3s2p0 = {"R": 3, "S": 3, "str_h": 2, "str_w": 2}
pool3s2p1 = {"R": 3, "S": 3, "str_h": 2, "str_w": 2, "pad_h": 1, "pad_w": 1}
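# Hedged sketch of how one of these settings dicts would be consumed
# (conv_layer keyword names inferred from its usage elsewhere in this repo;
# the N/C/K/H/W values are illustrative only):
layer = ng.conv_layer(np.float16,
                      128, 3, 64,  # N, C, K
                      H=224, W=224,
                      **conv11)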
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from time import sleep

np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'float': lambda x: "% .1f" % x})

dtype = np.float32

ng = NervanaGPU(stochastic_round=False, bench=False)

small = (1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 32, 64, 65, 72, 120, 127, 128, 192)
medium = (1, 64, 192, 778, 785, 786, 787, 794)
big = (1, 64, 192, 1532, 1535, 1536, 1537, 1540)

for size in (small, medium, big):  # small, medium, big
    for m in size:
        for n in size:
            for op in ("tn", "nn", "nt"):
                for k in size:
                    print("op,M,N,K: ", op, m, n, k)

                    dimA = (m, k) if op[0] == 'n' else (k, m)
                    dimB = (k, n) if op[1] == 'n' else (n, k)
                    dimC = (m, n)
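# For reference (an assumption spelled out, not stated in the source): the op
# string follows BLAS transpose order, so "tn" computes C = A.T.dot(B).  A
# quick NumPy check of the dimension bookkeeping above:
import numpy as np
op, m, n, k = "tn", 65, 72, 120
A = np.zeros((m, k) if op[0] == 'n' else (k, m))
B = np.zeros((k, n) if op[1] == 'n' else (n, k))
C = (A.T if op[0] == 't' else A).dot(B.T if op[1] == 't' else B)
assert C.shape == (m, n)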
    # Swap A and B to map from C order to Fortran
    for r in range(repeat):
        cublas.cublasSgemm(handle, opB, opA, n, m, k, 1.0, B.gpudata, ldb,
                           A.gpudata, lda, 0.0, C.gpudata, ldc)

    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print("%7.3f msecs %4.0f gflops (%s_%s : %d,%d,%d)" %
          (msecs, gflops, "cublas", op, m, n, k))

    return gflops

np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'float': lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

for dtype in (np.float16, np.float32):

    for K, C, N in ((3072, 3072 * 1, 32), (3072, 3072 * 1, 64),
                    (3072, 3072 * 1, 96), (3072, 3072 * 1, 128),
                    (3072, 3072 * 2, 32), (3072, 3072 * 2, 64),
                    (3072, 3072 * 2, 96), (3072, 3072 * 2, 128),
                    (3072, 3072 * 3, 32), (3072, 3072 * 3, 64),
                    (3072, 3072 * 3, 96), (3072, 3072 * 3, 128),
                    (3072, 3072 * 4, 32), (3072, 3072 * 4, 64),
                    (3072, 3072 * 4, 96), (3072, 3072 * 4, 128),):
        # alternate N-sweep (commented out in the source):
        # (3072,3072,32+128*0),(3072,3072,64+128*0),(3072,3072,96+128*0),(3072,3072,128+128*0),
        # (3072,3072,32+128*1),(3072,3072,64+128*1),(3072,3072,96+128*1),(3072,3072,128+128*1),
        # (3072,3072,32+128*2),(3072,3072,64+128*2),(3072,3072,96+128*2),(3072,3072,128+128*2),
        # (3072,3072,32+128*3),(3072,3072,64+128*3),(3072,3072,96+128*3),(3072,3072,128+128*3),):

        for op, dimA, dimB, dimC in (
                ("nn", (K, C), (C, N), (K, N)),   # fprop
                ("tn", (K, C), (K, N), (C, N)),   # bprop
                ("nt", (K, N), (C, N), (K, C))):  # update
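# A quick NumPy sanity check (illustrative, not from the source) that the
# three (dimA, dimB, dimC) rows above match the fully connected layer GEMMs
# for W:(K,C), I:(C,N), E:(K,N):
import numpy as np
K, C, N = 4, 3, 2
W, I, E = np.zeros((K, C)), np.zeros((C, N)), np.zeros((K, N))
assert W.dot(I).shape == (K, N)    # fprop  "nn"
assert W.T.dot(E).shape == (C, N)  # bprop  "tn"
assert E.dot(I.T).shape == (K, C)  # update "nt"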
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from nervanagpu import NervanaGPU
from pycuda.autoinit import context

print(context.get_device().name())

np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'float': lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

dtype = np.float16
repeat = 1
cpu = 1  # Set CPU to 1 to check against CPU

for data_type in ("All Ones", "Random Data",):

    print(data_type)

    for size in ((3072, 3072, 3072 * 2),):  # (4095,4095,4095)

        m, n, k = size

        for op in ("tn", "nn", "nt"):

            dimA = (m, k) if op[0] == 'n' else (k, m)
            dimB = (k, n) if op[1] == 'n' else (n, k)
            dimC = (m, n)
from operator import mul

print(context.get_device().name())

np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'int': lambda x: "%10d" % x,
                               'float': lambda x: "% .3f" % x})

dtype = np.float16
cpu = 1
repeat = 1

ng = NervanaGPU(stochastic_round=False, bench=True)

pool = ng.pool_layer(
    "max",
    64,             # N
    64, 1, 64, 64,  # C,D,H,W
    4, 1, 2, 2,     # J,T,R,S
    0, 0, 0,
import numpy as np
import pycuda.autoinit
from nervanagpu import NervanaGPU

nrv = NervanaGPU(default_dtype=np.float32)

a = nrv.array(np.random.randn(200, 200))
b = nrv.empty_like(a)
b[:] = a ** 2

assert not np.any(np.isnan(b.get())), "Shouldn't have any nan's here"
class MGPU(GPU): default_dtype = np.float32 num_dev = 1 is_dist = True def __init__(self, rng_seed, stochastic_round=False, device_id=0, num_dev=2): drv.init() self.num_dev = num_dev if device_id == 0: self.dev_list = range(num_dev) else: self.dev_list = device_id assert len(self.dev_list) == self.num_dev assert self.num_dev <= drv.Device.count() self.ctxs = [] self.devs = [] self._strms = [] self._redstrms = [] self._events = [] self._redevents = [] self.async = True self._nostrms = [None for i in self.dev_list] for i in self.dev_list: self.devs.append(drv.Device(i)) for dev in self.devs: self.ctxs.append( dev.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC)) self._strms.append(drv.Stream()) self._redstrms.append(drv.Stream()) self._events.append(drv.Event()) self._redevents.append(drv.Event()) drv.Context.pop() self.ctxs[0].push() atexit.register(drv.Context.pop) MGPUTensor.ctxs = self.ctxs MGPUTensor.num_dev = num_dev self.ng = NervanaGPU(stochastic_round=stochastic_round) logger.info("Initialized %d device NervanaGPU, stochastic_round=%s", num_dev, stochastic_round) self.ng.block = None self.rng_seed = rng_seed self.rng_init() # Setup the pairwise contexts # TODO clean up this code to avoid indexing for dev1, ctx1 in zip(self.devs, self.ctxs): ctx1.push() for dev2, ctx2 in zip(self.devs, self.ctxs): if dev1 == dev2: continue if dev1.can_access_peer(dev2): ctx1.enable_peer_access(ctx2) else: print('Cannot enable peer access between ' '{:d} and {:d}'.format(dev1, dev2)) ctx1.pop() def make_events(self): evtlist = [] for ctx in self.ctxs: ctx.push() evtlist.append(drv.Event()) ctx.pop() return evtlist # These definitions are for performing grouped context commands # This is experimental and should remove _stack for actual usage def begin_stack(self, block, identifier): if block == Block.update: self.ng.block = Block.update self.call_stack = [] else: pass def end_stack(self, block, identifier): if block == Block.update: self.ng.block = None for idx, ctx in enumerate(self.ctxs): ctx.push() self.ng.stream = self.strms[idx] for method, args, kwargs in self.call_stack: myargs = [a._tensorlist[idx] if isinstance( a, MGPUTensor) else a for a in args] mykwargs = {k: v._tensorlist[idx] if isinstance( v, MGPUTensor) else v for k, v in kwargs.iteritems()} getattr(super(MGPU, self), method)(*myargs, **mykwargs) self.ng.stream = None ctx.pop() self.call_stack = None else: pass @property def strms(self): return self._strms if self.async else self._nostrms @property def redstrms(self): return self._redstrms if self.async else self._nostrms def uniform(self, low=0.0, high=1.0, size=1, dtype=default_dtype, name=None, persist_values=True, ptype='replica'): """ generate numpy random number and convert to a GPUTensor. 
        dtype must be specified; calling with dtype=None is not supported.
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.uniform(low, high, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               name=None, persist_values=True, ptype='replica'):
        """
        Gaussian/Normal random number sample generation
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.normal(loc, scale, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def synchronize(self):
        if not self.async:
            return
        for s in self.strms:
            s.synchronize()

    def redsynchronize(self):
        if not self.async:
            return
        for s in self.redstrms:
            s.synchronize()

    def allocate_fragment(self, shape, dtype=default_dtype,
                          persist_values=True):
        # TODO: set ptype to be fragment in this case??
        return self.empty((shape[0], shape[1] / self.num_dev), dtype,
                          persist_values=persist_values)

    def zeros_like(self, ary, dtype=default_dtype, persist_values=True,
                   name=None):
        result = self.zeros(ary.shape, dtype=dtype,
                            persist_values=persist_values)
        result.ptype = ary.ptype
        return result

    def empty_like(self, ary, dtype=default_dtype, persist_values=True,
                   name=None):
        result = self.empty(ary.shape, dtype=dtype,
                            persist_values=persist_values, name=name)
        result.ptype = ary.ptype
        return result

    def set(self, tensor, data):
        assert isinstance(tensor, MGPUTensor)
        if tensor.ptype == 'replica':
            for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
                ctx.push()
                drv.memcpy_htod_async(dest.ptr, data, strm)
                ctx.pop()
            # tensor.copy_from(data)
        else:
            self.scatter(data, tensor)

    def scatter(self, hbuf, dbuf):
        '''
        Scatters the array data in hbuf to the mgpu tensor. Assumes that dbuf
        is M x N and hbuf is M x (N x k), where k is the number of replicas.
        Also assumes that the dtypes of hbuf and dbuf are the same.
        '''
        assert hbuf.size == dbuf.size * dbuf.num_dev
        assert isinstance(dbuf, MGPUTensor)
        assert hbuf.dtype == dbuf.dtype
        ndata = dbuf.size
        starts = [i * ndata for i in range(self.num_dev)]

        for dest, strm, ctx, doff in zip(dbuf.tlist, self.strms, self.ctxs,
                                         starts):
            src = hbuf.reshape((hbuf.size))[doff:(doff + ndata)]
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, src, strm)
            ctx.pop()

        self.synchronize()

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        In this case, the weights are shards and the activations are replicas.
        ubuf should be of size nout/num_dev x mbsz.
        """
        ubuf = layer.mempool[0]
        assert ubuf.shape == (weights.shape[0], inputs.shape[1])

        if layer.use_biases:
            biases = layer.biases.tlist
        else:
            biases = [None for i in range(self.num_dev)]

        for dbuf, ibuf, wt, bs, strm, ctx in zip(ubuf.tlist, inputs.tlist,
                                                 weights.tlist, biases,
                                                 self.strms, self.ctxs):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt, ibuf, dbuf)
            if layer.use_biases:
                self.ng.add(dbuf, bs, out=dbuf)
            ctx.pop()

        # Note: should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        # self.synchronize()
        self.fragment_to_replica(ubuf, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer.
            layer (Layer): The layer object.
        """
        ubuf = layer.mempool[1]
        wtsz = weights.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]
        assert out.shape == (weights.shape[1], deltas.shape[1])
        assert ubuf.shape == out.shape

        for dbuf, ibuf, wt, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                  weights.tlist, self.strms,
                                                  self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt.T, ibuf[off:(off + wtsz)], dbuf)
            ctx.pop()

        # Note: should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        self.synchronize()
        self.reduce(out, ubuf)

    def update_fc(self, out, inputs, deltas, layer=None):
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for obuf, dbuf, ibuf, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                    inputs.tlist, self.strms,
                                                    self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(dbuf[off:(off + wtsz)], ibuf.T, obuf)
            ctx.pop()
        # self.synchronize()

    def update_fc_bias(self, err, out):
        """
        Compute the updated bias gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            err (GPUTensor): Backpropagated error.
        """
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for ebuf, obuf, strm, ctx, off in zip(err.tlist, out.tlist,
                                              self.strms, self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.sum(ebuf[off:(off + wtsz)], axis=1, out=obuf)
            ctx.pop()

    def add_fc_bias(self, inputs, bias):
        """
        This is a no-op since we absorb the bias add into the fprop_fc call.
        """
        pass

    def reduce_tensor(self, ary, async=True):
        '''
        Reduce a scalar (size 1) tensor across devices.
        '''
        assert ary.size == 1
        if ary.ptype == 'replica':
            self.ctxs[0].push()
            result = ary.tlist[0].get()
            self.ctxs[0].pop()
            return result

        result = np.zeros((self.num_dev, 1), ary.dtype)
        for i, (ctx, src_buf, strm) in enumerate(zip(
                self.ctxs, ary.tlist, self.strms)):
            ctx.push()
            drv.memcpy_dtoh_async(result[i], src_buf.ptr, strm)
            ctx.pop()
        self.synchronize()
        return result.sum()
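# A host-side sketch of the data layout scatter() above assumes (pure numpy,
# for illustration only; num_dev and the shapes are made-up values). A
# 'replica' tensor holds the same M x N array on every device, while a sharded
# tensor splits the flattened M x (N * k) host buffer into k contiguous
# fragments of M * N elements, one per device, exactly as the
# doff:(doff + ndata) slicing above does.

import numpy as np

num_dev = 2
M, N = 4, 3
hbuf = np.arange(M * N * num_dev, dtype=np.float32).reshape(M, N * num_dev)

ndata = M * N
flat = hbuf.reshape(hbuf.size)
fragments = [flat[i * ndata:(i + 1) * ndata].reshape(M, N)
             for i in range(num_dev)]

assert all(f.shape == (M, N) for f in fragments)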
from pycuda.autoinit import context
from nervanagpu import NervanaGPU
from nervanagpu.layers import DataLayer, ConvLayer, PoolLayer, FullLayer

print context.get_device().name()

# Compare results here:
# https://github.com/soumith/convnet-benchmarks

# number of full iterations
loops = 10
# show benchmark details for each layer
layer_bench = 0
# show layer stats after each operation
print_stats = 0

ng = NervanaGPU(bench=layer_bench)

# don't learn, just benchmark
momentum = 0.0
learning_rate = 0.0

# common convolutional layer settings
conv3 = {"R": 3, "S": 3, "pad_h": 1, "pad_w": 1}
conv1 = {"R": 1, "S": 1, "pad_h": 0, "pad_w": 0}

# traditional pooling
pool2 = {"op": "max", "R": 2, "S": 2}
pool3 = {"op": "max", "R": 3, "S": 3, "str_h": 2, "str_w": 2}

# maxout pooling
pool1j2 = {"op": "max", "J": 2}  # maxout in the fc layers
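# For reference, a sketch of the output size these pooling settings imply,
# using the usual (W + 2*pad - S) // stride + 1 convention. The helper name
# and the W=56 input width are made up for illustration:

def pool_out_dim(W, S, stride, pad=0):
    """Output width of a pooling window of size S moved with the given stride."""
    return (W + 2 * pad - S) // stride + 1

# pool3 above: 3x3 window, stride 2, so a 56-wide input map becomes 27 wide
assert pool_out_dim(56, S=3, stride=2) == 27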
import numpy as np
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from operator import mul

print context.get_device().name()

np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'int': lambda x: "%10d" % x,
                               'float': lambda x: "% .3f" % x})

dtype = np.float16
cpu = 1
repeat = 1

ng = NervanaGPU(stochastic_round=False, bench=True)

pool = ng.pool_layer(
    "max",
    64,             # N
    64, 1, 64, 64,  # C,D,H,W
    4, 1, 2, 2,     # J,T,R,S
    0, 0, 0, 0,     # padding
    4, 1, 2, 2)     # strides

dimI = pool.dimI
dimO = pool.dimO


# collapse pooling dimensions into one
# this allows for easy cpu pooling in numpy
def slicable(dim, pad=0):
    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print "%7.3f msecs %4.0f gflops (%s_%s : %d,%d,%d)" % (
        msecs, gflops, "cublas", op, m, n, k)
    return gflops

np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'float': lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

for dtype in (np.float16, np.float32):
    for K, C, N in (
            (3072, 3072 * 1, 32),
            (3072, 3072 * 1, 64),
            (3072, 3072 * 1, 96),
            (3072, 3072 * 1, 128),
            (3072, 3072 * 2, 32),
            (3072, 3072 * 2, 64),
            (3072, 3072 * 2, 96),
            (3072, 3072 * 2, 128),
            (3072, 3072 * 3, 32),
            (3072, 3072 * 3, 64),
            (3072, 3072 * 3, 96),
class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.
    Note that some functions defined in the generic Backend class, such as
    cross-map pooling, normalization, and AdaDelta, are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        self.rng_seed = rng_seed
        self.rng_init()
        self.device_id = device_id if device_id is not None else 0

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this
        class.

        Returns:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        if hasattr(self, 'mem_pool') and self.mem_pool is not None:
            self.mem_pool_pickle = {'shape': self.mem_pool.shape,
                                    'dtype': np.float32}
            self.mem_pool = None
        return self.__dict__

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        Arguments:
            state (dict): The full contents of the backend class instance,
                          except for the mem_pool which is on device and
                          cannot be serialized.
        """
        self.__dict__.update(state)
        self.mem_pool = self.ng.empty(self.mem_pool_pickle['shape'],
                                      dtype=self.mem_pool_pickle['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocates a memory pool for temporary storage.
        """
        self.mem_pool = self.ng.empty(shape, dtype=dtype)

    def alloc_host_mem(self, shape, dtype):
        return drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0)

    def create_stream(self):
        return drv.Stream()

    def async_copy(self, dest, src, stream=None):
        drv.memcpy_htod_async(dest.gpudata, src, stream)

    def rng_init(self):
        """
        Initialize and seed the pseudo random number generator. Random
        numbers are generated on the host using numpy, then transferred to
        device.
        """
        seed = None
        if 'rng_seed' in self.__dict__:
            seed = self.rng_seed
        logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Initialize FLOP timing. Wraps the specified MOP calls via a decorator
        to record elapsed time and number of operations.

        Arguments:
            decorate_fc (list): string giving the function names of fully
                                connected layer forward/backward/update calls
                                to time.
            decorate_conv (list): string giving the function names of
                                  convolutional layer forward/backward/update
                                  calls to time.
            decorate_ew (list): string giving the function names of
                                element-wise calls to time.

        Notes:
            Must be called prior to first flop_timing_start call.
        """
        self.start = drv.Event()
        self.end = drv.Event()
        self.flop_timer = FlopsDecorator(self)
        self.flop_timer.decorate(decorate_fc=decorate_fc,
                                 decorate_conv=decorate_conv,
                                 decorate_ew=decorate_ew)

    def flop_timing_start(self):
        """
        Start a new FLOP timer.

        Returns:
            None: dummy value (not used)
        """
        return self.start.record()

    def flop_timing_finish(self, start_time):
        """
        Complete current FLOP timing.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: elapsed time in milliseconds since prior
                   flop_timing_start call.
        """
        self.end.record()
        self.end.synchronize()
        return self.end.time_since(self.start)

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Generate numpy random numbers and convert to a GPUTensor.
        dtype must be specified; calling with dtype=None is not supported.
        """
        ary = np.random.uniform(low, high, shape)
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Gaussian/Normal random number sample generation
        """
        ary = np.random.normal(loc, scale, size)
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward propagate the inputs of a fully connected network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values
                                (first layer), or the outputs from the
                                previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(weights.T, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Compute the updated gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values
                                (first layer), or the outputs from the
                                previous layer.
            deltas (GPUTensor): The error values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(deltas, inputs.T, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward propagate the inputs of a convolutional network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values
                                (first layer), or the outputs from the
                                previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons). For this
                              backend we expect these values to be square.
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along
                           each dimension of each local receptive field
                           during the convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  convolved outputs for a single receptive
                                  field. Not used for this backend.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default).
        """
        '''
        N: Number of images in mini-batch
        C: Number of input feature maps
        K: Number of output feature maps

        D: Depth  of input image
        H: Height of input image
        W: Width  of input image

        T: Depth  of filter kernel
        R: Height of filter kernel
        S: Width  of filter kernel
        '''
        self.ng.fprop_conv(layer=fpropbuf, I=inputs, F=weights, O=out,
                           alpha=1.0, repeat=1)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward propagate the error through a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along
                           each dimension of each local receptive field
                           during the convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default).
        """
        self.ng.bprop_conv(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                           alpha=1.0, repeat=1)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Compute the updated gradient for a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values
                                (first layer), or the outputs from the
                                previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along
                           each dimension of each local receptive field
                           during the convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fwidth (int): Filter width.
            updatebuf (GPUTensor): Temporary storage buffer used to hold the
                                   updated gradient for a single receptive
                                   field.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default).
            layer (Layer): The layer object.
""" self.ng.update_conv(layer=updatebuf, I=inputs, E=deltas, grad_F=out, alpha=1.0, repeat=1) def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape, ifmshape, links, nifm, padding, stride, fpropbuf): """ Forward propagate the inputs of a Pooling network layer to produce output pre-activations (ready for transformation by an activation function). Arguments: out (GPUTensor): Where to store the forward propagated results. inputs (GPUTensor): Will be either the dataset input values (first layer), or the outputs from the previous layer. op (string): The type of pooling operation to apply. We support "max", "avg", "l2" currently. ofmshape (tuple): Dimensions of each output feature map (typically number of height and width neurons). ofmsize (int): Total size of each output feature map. ofmlocs (GPUTensor): Indices giving the location of each element in each output feature map stored in out. fshape (tuple): Dimensions of each filter (typically height and width). ifmshape (tuple): Dimensions of each input feature map (typically number of height and width neurons). links (GPUTensor): Input receptive field indices. nifm (int): Total number of input feature maps. padding (int): Number of additional elements to include along each dimension of each local receptive field during the pooling operation. stride (int): Number of neurons to shift the filter at each step. fpropbuf (GPUTensor): Temporary storage buffer used to hold the pooled outputs for a single receptive field. """ op = op.lower() if op == "max": self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1) else: raise AttributeError("unexpected pooling op type: %s", op) def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize, ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding, stride, bpropbuf): """ Backward propagate the error through a pooling network layer. Arguments: out (GPUTensor): Where to store the backward propagated errors. fouts (GPUTensor): Forward propagated outputs from the previous layer. inputs (GPUTensor): Will be either the dataset input values (first layer), or the outputs from the previous layer. deltas (GPUTensor): The error values for this layer op (string): The type of pooling operation to apply. We support "max", "avg", "l2" currently. ofmshape (tuple): Dimensions of each output feature map (typically height and width). ofmsize (int): Total size of each output feature map. ofmlocs (GPUTensor): Indices giving the location of each element in each output feature map stored in out. fshape (tuple): Dimensions of each filter (typically height and width). fpsize (int): The size of each filter. ifmshape (tuple): Dimensions of each input feature map (typically height and width). links (GPUTensor): Input receptive field indices. nifm (int): Total number of input feature maps. padding (int): Number of additional elements to include along each dimension of each local receptive field during the pooling operation. stride (int): Number of neurons to shift the filter at each step. 
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field.
        """
        op = op.lower()
        if op == "max":
            self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas,
                               grad_I=out, repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s" % op)

    def logistic(self, x, out):
        """
        Logistic sigmoid nonlinearity, 1/(1+exp(-x))

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor
        """
        self.ng.sig(x, out=out)
        return out

    def rectlin(self, x, out):
        """
        Rectified Linear nonlinearity

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor
        """
        self.ng.maximum(x, 0., out=out)
        return out

    def rectleaky(self, x, slope, out):
        out[:] = self.ng.maximum(x, x * slope)

    def rectleaky_derivative(self, x, slope, out):
        out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope

    def sum(self, tsr, axes, out):
        """
        Sum

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0] * tsr.shape[1]
            self.ng.sum(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.sum(tsr, axis=axes, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Calculates the arithmetic mean of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0] * tsr.shape[1]
            self.ng.mean(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.mean(tsr, axis=axes, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Calculates the minimum of the elements along the specified axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0] * tsr.shape[1]
            self.ng.min(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.min(tsr, axis=axes, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Calculates the maximum of the elements along the specified axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0] * tsr.shape[1]
            self.ng.max(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.max(tsr, axis=axes, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Calculates the variance of the elements along the specified axes.

        Arguments:
            tsr (GPUTensor): the tensor on which to compute the variance
            axes (int, list, optional): the dimension(s) along which to
                                        compute the variance. If set to None,
                                        the variance is computed over all
                                        dimensions.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the tensor containing the mean of tsr

        Returns:
            GPUTensor: reference to out
        """
        if mean is None:
            logger.error("GPUTensor requires mean to be specified.")
            raise ValueError("mean not specified")
        self.ng.mean(self.ng.square(tsr - mean), axis=axes, out=out)
        return out

    def fabs(self, x, out):
        """
        Calculates absolute value of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.fabs(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Calculates square root of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.sqrt(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with zeros.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls.

        Returns:
            GPUTensor: output
        """
        return self.ng.zeros(shape, dtype=dtype)

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with ones.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls.

        Returns:
            GPUTensor: output
        """
        return self.ng.ones(shape, dtype=dtype)

    def empty(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls.

        Returns:
            GPUTensor: output
        """
        return self.ng.empty(shape, dtype=dtype)

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Allocate a new GPUTensor and fill it with the supplied numpy array.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype, optional): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls.
            name (string): Name for the GPUTensor
            allocator (pycuda): PyCUDA memory allocator

        Returns:
            GPUTensor: output
        """
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def add(self, left, right, out):
        """
        Elementwise addition

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.add(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Elementwise subtraction

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.subtract(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Elementwise multiplication

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.multiply(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Elementwise division

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.divide(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Elementwise greater than testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.greater(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Performs element-wise equality testing on each element of left and
        right, storing the result in out. Each operand is assumed to be the
        same shape (or broadcastable as such).

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.equal(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Elementwise not equal testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.not_equal(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Elementwise clipping between a range of specified values

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): floor value.
            a_max (float): ceiling value.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.clip(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Elementwise base-e logarithm

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Elementwise tanh

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Calculates the indices of the maximal element value along the
        specified axis. If multiple elements contain the maximum, only the
        index of the first is returned.

        Arguments:
            a (GPUTensor): The GPUTensor on which to find the maximum indices.
            out (GPUTensor): Where to store the result. Should be of the
                             appropriate type and expected shape.
            axis (int): The dimension along which to find the maximum. If set
                        to None, find the overall maximum index of a
                        flattened representation of a.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity. Computes exp(x-max(x)) / sum_i exp(x_i-max(x_i))

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.
        Returns:
            GPUTensor: reference to out
        """
        out[:] = (self.ng.reciprocal(self.ng.sum(
                  self.ng.exp(x - self.ng.max(x, axis=0)), axis=0)) *
                  self.ng.exp(x - self.ng.max(x, axis=0)))
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        Arguments:
            y (GPUTensor): input tensor.
            err (GPUTensor): backpropagated error.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        raise NotImplementedError("Softmax gradient should use shortcut")

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Create a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): Output tensor
            keepthresh (float): fraction of ones
        """
        self.ng.dropout(keep=keepthresh, out=tsr)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Perform gradient descent update with momentum.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs are written to vs_item (updated velocity) and ps_item
        (updated weights).
        """
        vs_item[:] = vs_item * momentum_coef - us_item * learning_rate
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Perform gradient descent update with momentum and weight decay.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs:
            ps_item, the updated weights.
            vs_item, the updated velocity.
            us_item, used as a temp buffer.
        """
        vs_item[:] = vs_item * momentum_coef - us_item * \
            learning_rate - learning_rate * wd * ps_item
        ps_item[:] = ps_item + vs_item

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item,
                   ss_item, rho, epsilon):
        """
        Update rule for AdaDelta (Zeiler, http://arxiv.org/abs/1212.5701)

        Arguments:
            ps_item: weight / parameter (will be updated)
            us_item: update
            gs_item: expected value of Gradient Squared (will be updated)
            ds_item: expected value of Delta Squared (will be updated)
            ls_item: learning rate (will be updated)
            ss_item: scratch space
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        # Accumulate E[Grad^2]
        gs_item[:] = gs_item * rho + (1.0 - rho) * us_item * us_item

        # Calculate Updates
        ls_item[:] = self.ng.sqrt((ds_item + epsilon) /
                                  (gs_item + epsilon)) * (-1.0) * us_item

        # Accumulate E[Delt^2]
        ds_item[:] = ds_item * rho + (1.0 - rho) * ls_item * ls_item

        # Final update to the params
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity,
                   scratch_space, gamma, epsilon, learning_rate,
                   momentum_coef):
        # Update running squares
        run_squares[:] = gamma * run_squares + \
            (1. - gamma) * updates * updates

        # Now scale the gradient by lr / rms(grad) (with an epsilon term for
        # stability) and use it to update the params
        if momentum_coef == 0:
            params[:] = params - learning_rate * updates * self.ng.reciprocal(
                self.ng.sqrt(run_squares) + epsilon)
        else:
            velocity[:] = velocity * momentum_coef - \
                learning_rate * updates * \
                self.ng.reciprocal(self.ng.sqrt(run_squares) + epsilon)
            params[:] = params + velocity

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xvar, xhat, out):
        """
        Batch normalization forward pass, compounded to run in 3 kernel calls.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xvar: reciprocal standard deviation, 1/sqrt(var + eps) (updated)
            xhat: normalized input (updated)
            out: normalized and rescaled input (updated)
        """
        xvar[:] = self.ng.reciprocal(self.ng.sqrt(self.ng.var(inputs, axis=1)
                                                  + eps))
        xhat[:] = xvar * (inputs - self.ng.mean(inputs, axis=1))
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma, beta_updates,
                          gamma_updates):
        """
        Batch normalization backward pass, compounded to run with 4 kernel
        calls.

        Arguments:
            xhat: normalized input data (updated)
            error: backpropagated deltas (updated)
            xvar: precomputed reciprocal standard deviation (see
                  fprop_bn_compound)
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)
        xhat[:] = (xhat * gamma_updates + beta_updates) / float(xhat.shape[1])
        error[:] = xvar * gamma * (error - xhat)
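# For reference, the same AdaDelta update as ada_update() above, expressed
# with plain numpy arrays. A minimal host-side sketch for checking the math;
# the array names mirror the method's arguments and the rho/epsilon values
# are illustrative, not values the backend prescribes.

import numpy as np

rho, epsilon = 0.95, 1e-6
ps = np.zeros(4, np.float32)       # parameters
us = np.full(4, 0.1, np.float32)   # gradient
gs = np.zeros(4, np.float32)       # running E[grad^2]
ds = np.zeros(4, np.float32)       # running E[delta^2]

gs = gs * rho + (1.0 - rho) * us * us                 # accumulate E[Grad^2]
ls = -np.sqrt((ds + epsilon) / (gs + epsilon)) * us   # scaled update
ds = ds * rho + (1.0 - rho) * ls * ls                 # accumulate E[Delta^2]
ps = ps + ls                                          # apply update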
dtype = np.float16
repeat = 20

start, end = (drv.Event(), drv.Event())

def start_bench():
    start.record()

def end_bench(op):
    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = conv.flops / (msecs * 1000000.0)
    print "%7.3f msecs %8.3f gflops (%s: %s)" % (msecs, gflops, op, conv)

ng = NervanaGPU(stochastic_round=False, bench=True)

# Create a cuDNN context
cudnn = libcudnn.cudnnCreate()

C_desc = libcudnn.cudnnCreateConvolutionDescriptor()
I_desc = libcudnn.cudnnCreateTensorDescriptor()
O_desc = libcudnn.cudnnCreateTensorDescriptor()
E_desc = libcudnn.cudnnCreateTensorDescriptor()
B_desc = libcudnn.cudnnCreateTensorDescriptor()
F_desc = libcudnn.cudnnCreateFilterDescriptor()
U_desc = libcudnn.cudnnCreateFilterDescriptor()

# Set some options and tensor dimensions
NCHW_fmt = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
cu_dtype = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
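# end_bench() above converts elapsed milliseconds into GFLOPS using the
# layer's flops attribute. For a direct convolution that count is
# conventionally 2 * N * K * P * Q * C * R * S (one multiply and one add per
# multiply-accumulate); a sketch with made-up dimensions for illustration:

N, C, K = 64, 64, 128   # batch size, input maps, output maps
P, Q = 32, 32           # output feature map height/width
R, S = 3, 3             # filter height/width

flops = 2.0 * N * K * P * Q * C * R * S
msecs = 5.0             # pretend elapsed time per iteration
gflops = flops / (msecs * 1000000.0)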
    cublas.cublasSgemm(handle, opB, opA, n, m, k, alpha, B.gpudata, ldb,
                       A.gpudata, lda, beta, C.gpudata, ldc)

    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print "%7.3f msecs %4.0f gflops (%s_%s : %d,%d,%d)" % (
        msecs, gflops, "cublas", op, m, n, k)

np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'float': lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

repeat = 1

for dtype in (np.float16, np.float32,):
    for K, C, N in ((32, 4096, 1512),):
        for alpha, beta in ((1.0, 0.0), (0.5, 0.5)):
            for op, dimA, dimB, dimC in (
                    ("nn", (K, C), (C, N), (K, N)),  # fprop
                    ("tn", (K, C), (K, N), (C, N)),  # bprop
#!/usr/bin/python

import numpy as np
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from ipdb import set_trace

np.set_printoptions(threshold=8192 * 4, linewidth=600,
                    formatter={'int': lambda x: "%2d" % x,
                               'float': lambda x: "%2.0f" % x})

ng = NervanaGPU(stochastic_round=0, bench=1)

dtype = np.float32  # np.float16 or np.float32
repeat = 50         # repeat count for benchmarking
ones = 0            # simpler data for debugging
cpu = 0             # validate against numpy
size = 32           # 32, 64, 128, None=auto

X = 100    # Batch Size
N = 32     # Minibatch Size
C = 3072   # Input Features
K = 3072   # Output Features
Nin = True

dimW = (K, C)
if Nin: