def svm(scores, labels, mode, error=None, allocator=memPool):
    """Launch the SVM cost kernel chosen by ``mode`` and return ``(error, grad)``.

    :param scores: float32 GPU array of scores.
    :param labels: int32 GPU array of labels.
    :param mode: ``"l1"`` or ``"l2"``; any other key raises ``KeyError``.
    :param error: optional scalar GPU array to receive the error; allocated when ``None``.
    :param allocator: allocator used for the arrays created here.
    :return: tuple ``(error, grad)`` where ``grad`` has the shape of ``scores``.
    """
    assert scores.dtype == np.float32
    assert labels.dtype == np.int32

    dims = scores.shape
    grad = GPUArray.empty(dims, dtype=np.float32, allocator=allocator)

    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    # The accumulator is reset before the launch, whether supplied or freshly allocated.
    error.fill(0.0)

    total = prod(dims)
    spatialDim = prod(dims[2:])
    mapStride = spatialDim * dims[1]

    launch = {"block": (nthreads, 1, 1), "grid": (roundUpDiv(total, nthreads), 1, 1)}

    costModule = {"l1": svmL1Mod, "l2": svmL2Mod}[mode]
    costModule.cost(
        scores, labels, np.int32(total), np.int32(mapStride), np.int32(spatialDim),
        np.int32(dims[1]), np.int32(dims[0]), error, grad, **launch
    )

    return error, grad
def reflectpad(data, pad, allocator=memPool):
    """Pad a 3-d or 4-d GPU array using the reflect-padding kernels.

    :param data: GPU array of rank 3 ``(batch, maps, size)`` or rank 4 ``(batch, maps, h, w)``.
    :param pad: ``(lpad, rpad)`` for rank 3, ``(upad, bpad, lpad, rpad)`` for rank 4.
    :param allocator: allocator used for the output array.
    :return: newly allocated padded array with the same dtype as ``data``.
    :raises NotImplementedError: for any other rank.
    """
    rank = data.ndim
    if rank not in (3, 4):
        raise NotImplementedError(rank)

    block = (warpSize, 1, 1)

    if rank == 3:
        batchsize, maps, insize = data.shape
        lpad, rpad = pad

        # Each pad must be strictly smaller than the input extent.
        assert insize >= max(lpad, rpad) + 1
        outsize = insize + lpad + rpad

        grid = (roundUpDiv(outsize, warpSize), maps, batchsize)
        outdata = GPUArray.empty((batchsize, maps, outsize), dtype=data.dtype, allocator=allocator)

        if data.dtype == np.float32:
            kernel = mod.reflectpad1d
        else:
            kernel = mod.reflectpad1dFP16

        kernel(outdata, data, np.int32(insize), np.int32(lpad), np.int32(rpad), block=block, grid=grid)
        return outdata

    batchsize, maps, inh, inw = data.shape
    upad, bpad, lpad, rpad = pad

    assert inh >= max(upad, bpad) + 1 and inw >= max(lpad, rpad) + 1
    outh = inh + upad + bpad
    outw = inw + lpad + rpad

    grid = (roundUpDiv(outh * outw, warpSize), maps, batchsize)
    outdata = GPUArray.empty((batchsize, maps, outh, outw), dtype=data.dtype, allocator=allocator)

    if data.dtype == np.float32:
        kernel = mod.reflectpad2d
    else:
        kernel = mod.reflectpad2dFP16

    kernel(
        outdata, data, np.int32(inh), np.int32(inw),
        np.int32(upad), np.int32(bpad), np.int32(lpad), np.int32(rpad),
        block=block, grid=grid
    )
    return outdata
def segmentSeq(data):
    """Run the ``segmentSeq`` kernel on a 1-d int32 GPU array.

    :param data: 1-d int32 GPU array with at most ``NV`` elements.
    :return: tuple ``(segments, indices)``; ``segments`` has shape ``(length, 3)`` and
        ``indices`` has the shape of ``data``, both int32.
    """
    assert data.dtype == np.int32

    (length, ) = data.shape
    # The kernel is launched as a single block, so the whole input must fit into it.
    assert length <= NV

    indices = GPUArray.empty(data.shape, dtype=np.int32, allocator=memPool) if False else None
    segments = GPUArray.empty((length, 3), dtype=np.int32, allocator=memPool)
    indices = GPUArray.empty(data.shape, dtype=np.int32, allocator=memPool)

    segmentMod.segmentSeq(
        segments, indices, data, np.int32(length),
        block=(NT, 1, 1), grid=(1, 1, 1)
    )
    return segments, indices
def radixSort(keys, values):
    """Run the single-block ``radixSort`` kernel on a key/value pair of int32 arrays.

    :param keys: 1-d int32 GPU array with at most ``NV`` elements.
    :param values: int32 GPU array with the same shape as ``keys``.
    :return: tuple ``(outkeys, outvalues)`` of newly allocated arrays.
    """
    assert keys.dtype == np.int32
    assert values.dtype == np.int32
    assert keys.shape == values.shape

    (length, ) = keys.shape
    # Only one block is launched, so the input must fit into it.
    assert length <= NV

    outkeys = GPUArray.empty(keys.shape, dtype=keys.dtype, allocator=memPool)
    outvalues = GPUArray.empty(values.shape, dtype=values.dtype, allocator=memPool)

    radixMod.radixSort(
        outkeys, outvalues, keys, values, np.int32(length),
        block=(NT, 1, 1), grid=(1, 1, 1)
    )
    return outkeys, outvalues
def crossEntropy(scores, labels, weights=None, error=None, allocator=memPool):
    """Apply softmax to ``scores`` and launch the (optionally weighted) cross-entropy cost kernel.

    :param scores: float32 GPU array; inputs of rank below 4 are reshaped with trailing unit axes.
    :param labels: int32 GPU array of labels.
    :param weights: optional weights array; selects the weighted kernel when given.
    :param error: optional scalar GPU array to receive the error; allocated when ``None``.
    :param allocator: allocator used for the arrays created here.
    :return: tuple ``(error, grad)`` where ``grad`` has the original shape of ``scores``.
    """
    assert scores.dtype == np.float32
    assert labels.dtype == np.int32

    shape = scores.shape

    missing = 4 - scores.ndim
    if missing > 0:
        # The softmax call below is fed a tensor of rank at least 4.
        scores = scores.reshape(*shape, *((1, ) * missing))

    softmax = cudnn.softmaxNd(scores, mode=SoftMaxMode.spatial.value, allocator=allocator)
    grad = GPUArray.empty(shape, dtype=np.float32, allocator=allocator)

    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    # The accumulator is reset before the launch, whether supplied or freshly allocated.
    error.fill(0.0)

    extShape = scores.shape
    size = prod(extShape)
    spatialDim = prod(extShape[2:])
    mapStride = spatialDim * extShape[1]

    launch = {"block": (nthreads, 1, 1), "grid": (roundUpDiv(size, nthreads), 1, 1)}

    if weights is not None:
        wceMod.cost(
            softmax, labels, weights, np.int32(size), np.int32(mapStride), np.int32(spatialDim),
            np.int32(shape[1]), np.int32(shape[0]), error, grad, **launch
        )
    else:
        ceMod.cost(
            softmax, labels, np.int32(size), np.int32(mapStride), np.int32(spatialDim),
            np.int32(extShape[1]), np.int32(extShape[0]), error, grad, **launch
        )

    return error, grad
def concatenate(tup, axis, out=None, allocator=memoryPool):
    """Join GPU arrays along ``axis`` using 2-d device copies.

    :param tup: sequence of GPU arrays sharing dtype and all extents except along ``axis``.
    :param axis: non-negative axis index to concatenate along.
    :param out: optional preallocated result; must match the resulting shape and dtype.
    :param allocator: allocator used when ``out`` is not given.
    :return: the concatenated array (``out`` when it was supplied).
    """
    def withoutAxis(dims):
        return dims[:axis] + dims[axis + 1:]

    head = tup[0]
    dtype = head.dtype
    reducedShape = withoutAxis(head.shape)

    assert all(a.dtype == dtype and withoutAxis(a.shape) == reducedShape for a in tup[1:])

    concatDim = sum(a.dimAt(axis) for a in tup)
    shape = reducedShape[:axis] + (concatDim, ) + reducedShape[axis:]

    if out is not None:
        assert out.shape == shape and out.dtype == dtype
    else:
        out = GPUArray.empty(shape, dtype=dtype, allocator=allocator)

    if axis > 0:
        dstPitch = out.strideAt(axis - 1)
    else:
        dstPitch = out.nbytes

    height = prod(shape[:axis])

    # Byte offset inside a destination row at which the next source is written.
    offset = 0
    for a in tup:
        if axis > 0:
            width = a.strideAt(axis - 1)
        else:
            width = a.nbytes

        Driver.memcpy2D(width, height, a.gpudata, width, out.gpudata, dstPitch, dstX=offset)
        offset += width

    return out
def split(ary, sections, axis, allocator=memoryPool):
    """Split a GPU array along ``axis`` into pieces of the given sizes using 2-d device copies.

    :param ary: source GPU array.
    :param sections: sizes of the pieces along ``axis``; must sum to ``ary.shape[axis]``.
    :param axis: non-negative axis index to split along.
    :param allocator: allocator used for the output arrays.
    :return: list of newly allocated arrays, one per section.
    """
    shape = ary.shape
    assert sum(sections) == shape[axis]

    leading, trailing = shape[:axis], shape[axis + 1:]

    # All outputs are allocated up front, before any copy is issued.
    outs = []
    for sec in sections:
        outs.append(GPUArray.empty(leading + (sec, ) + trailing, dtype=ary.dtype, allocator=allocator))

    if axis > 0:
        srcPitch = ary.strideAt(axis - 1)
    else:
        srcPitch = ary.nbytes

    height = prod(leading)

    # Byte offset inside a source row from which the next piece is read.
    offset = 0
    for out in outs:
        if axis > 0:
            width = out.strideAt(axis - 1)
        else:
            width = out.nbytes

        Driver.memcpy2D(width, height, ary.gpudata, srcPitch, out.gpudata, width, srcX=offset)
        offset += width

    return outs
def rescale(data, scale, memoryType, interpolation=InterpolationMode.nn, outdata=None, allocator=memPool):
    """Resize an image on the GPU through ``libnpp.nppiResizeSqrPixel``.

    :param data: 2-d array (grayscale memory type only) or 3-d array.
    :param scale: a single int/float used for both axes, or a ``(hscale, wscale)`` pair.
    :param memoryType: memory layout descriptor of ``data``.
    :param interpolation: interpolation mode enum; its ``value`` is passed to NPP.
    :param outdata: optional preallocated output; must have the computed output shape.
    :param allocator: allocator used when ``outdata`` is not given.
    :return: the resized array.
    """
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3

    if isinstance(scale, (int, float)):
        hscale = wscale = scale
    else:
        hscale, wscale = scale

    inrect = getDataRect(data, memoryType)
    insize = (inrect[2], inrect[3])
    inline = getMemoryTypeLineSize(inrect[2], data.dtype, memoryType)

    outrect = libnpp.nppiGetResizeRect(inrect, wscale, hscale, 0, 0, interpolation.value)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)
    outshape = getOutDataShape(data, outrect, memoryType)

    if outdata is not None:
        assert outdata.shape == outshape
    else:
        outdata = GPUArray.empty(outshape, dtype=data.dtype, allocator=allocator)

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)

    libnpp.nppiResizeSqrPixel(
        getDataType(data).value, memoryType.value,
        dataPtr, insize, inline, inrect,
        outdataPtr, outline, outrect,
        wscale, hscale, 0, 0, interpolation.value
    )
    return outdata
def maxunpool2dBackward(grad, poolshape, mask, allocator=memPool):
    """Launch the backward kernel of 2-d max-unpooling.

    :param grad: float32 GPU array of shape ``(batch, maps, outh, outw)``.
    :param poolshape: shape whose entries 2 and 3 give the height and width of the result.
    :param mask: int32 GPU array passed through to the kernel.
    :param allocator: allocator used for the result.
    :return: float32 array of shape ``(batch, maps, poolshape[2], poolshape[3])``.
    """
    assert grad.dtype == np.float32
    assert mask.dtype == np.int32

    batchsize, maps, outh, outw = grad.shape
    inh = poolshape[2]
    inw = poolshape[3]

    ingrad = GPUArray.empty((batchsize, maps, inh, inw), dtype=np.float32, allocator=allocator)
    total = prod(ingrad.shape)

    mod.maxunpool2dBackward(
        ingrad, grad, mask,
        np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw),
        np.int32(maps), np.int32(total),
        block=(nthreads, 1, 1), grid=(roundUpDiv(total, nthreads), 1, 1)
    )
    return ingrad
def upsample3d(data, scale, mode="nearest", allocator=memPool):
    """Upsample a 5-d GPU array by integer factors along depth, height and width.

    :param data: GPU array of shape ``(batch, maps, depth, height, width)``.
    :param scale: a single int applied to all three axes, or a ``(dscale, hscale, wscale)`` triple.
    :param mode: ``"nearest"`` or ``"linear"``.
    :param allocator: allocator used for the output array.
    :return: newly allocated array of shape ``(batch, maps, depth * dscale, height * hscale, width * wscale)``.
    :raises NotImplementedError: for any other ``mode``.
    """
    batchsize, maps, ind, inh, inw = data.shape
    dscale, hscale, wscale = (scale, scale, scale) if isinstance(scale, int) else scale
    outd, outh, outw = dscale * ind, hscale * inh, wscale * inw

    outdata = GPUArray.empty((batchsize, maps, outd, outh, outw), dtype=data.dtype, allocator=allocator)

    if mode == "nearest":
        block = (wblocksize, hblocksize, 1)
        grid = (roundUpDiv(inw, block[0]), roundUpDiv(inh, block[1]), batchsize * maps * ind)

        nearestMod.upsample3dNearest(
            outdata, data, np.int32(ind), np.int32(inh), np.int32(inw),
            np.int32(outd), np.int32(outh), np.int32(outw),
            np.int32(dscale), np.int32(hscale), np.int32(wscale),
            block=block, grid=grid
        )

    elif mode == "linear":
        block = (warpSize, nthreads // warpSize, 1)
        grid = (roundUpDiv(outw, block[0]), roundUpDiv(outh, block[1]), outd)

        # Input/output coordinate ratios. An output extent of 1 (input extent 1 with scale 1)
        # would divide by zero; the only output coordinate is then 0, so a ratio of 0 is used.
        rd = (ind - 1) / (outd - 1) if outd > 1 else 0.0
        rh = (inh - 1) / (outh - 1) if outh > 1 else 0.0
        rw = (inw - 1) / (outw - 1) if outw > 1 else 0.0

        linearMod.upsample3dLinear(
            outdata, data, np.int32(batchsize), np.int32(maps),
            np.int32(ind), np.int32(inh), np.int32(inw),
            np.int32(outd), np.int32(outh), np.int32(outw),
            np.float32(rd), np.float32(rh), np.float32(rw),
            block=block, grid=grid
        )

    else:
        raise NotImplementedError(mode)

    return outdata
def warpAffinePoints(data, inpoints, outpoints, memoryType, outshape=None, interpolation=InterpolationMode.nn,
                     cval=0, clip=True, allocator=memPool):
    """Warp an image with the affine map defined by point correspondences, via ``libnpp.nppiWarpAffineQuad``.

    :param data: 2-d array (grayscale memory type only) or 3-d array.
    :param inpoints: source points handed to ``genAffineQuads``.
    :param outpoints: destination points handed to ``genAffineQuads``.
    :param memoryType: memory layout descriptor of ``data``.
    :param outshape: shape of the result; defaults to ``data.shape``.
    :param interpolation: interpolation mode enum; its ``value`` is passed to NPP.
    :param cval: value the output is filled with before warping.
    :param clip: flag forwarded to ``genAffineQuads``.
    :param allocator: allocator used for the output array.
    :return: newly allocated warped array.
    """
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3

    inrect = getDataRect(data, memoryType)
    insize = (inrect[2], inrect[3])
    inline = getMemoryTypeLineSize(inrect[2], data.dtype, memoryType)

    if outshape is None:
        outshape = data.shape

    outrect = getOutDataRect(data, outshape, memoryType)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    outdata = GPUArray.empty(outshape, dtype=data.dtype, allocator=allocator)
    # Pre-fill so the output holds cval before the warp writes into it.
    outdata.fill(cval)

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)
    srcQuad, dstQuad = genAffineQuads(inpoints, outpoints, clip, inrect)

    libnpp.nppiWarpAffineQuad(
        getDataType(data).value, memoryType.value,
        dataPtr, insize, inline, inrect, srcQuad,
        outdataPtr, outline, outrect, dstQuad,
        interpolation.value
    )
    return outdata
def upsample2dBackward(grad, scale, mode="nearest", allocator=memPool):
    """Compute the input gradient of 2-d upsampling.

    :param grad: GPU array of shape ``(batch, maps, outh, outw)``.
    :param scale: a single int applied to both axes, or a ``(hscale, wscale)`` pair.
    :param mode: ``"nearest"`` or ``"linear"``.
    :param allocator: allocator used for the result.
    :return: array of shape ``(batch, maps, outh // hscale, outw // wscale)`` with the dtype of ``grad``.
    :raises NotImplementedError: for any other ``mode``.
    """
    batchsize, maps, outh, outw = grad.shape
    hscale, wscale = (scale, scale) if isinstance(scale, int) else scale
    inh, inw = outh // hscale, outw // wscale

    if mode == "nearest":
        ingrad = GPUArray.empty((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

        blk = warpSize * 8
        block = (blk, 1, 1)
        grid = (roundUpDiv(ingrad.size, blk), 1, 1)

        nearestMod.upsample2dNearestBackward(
            ingrad, grad, np.int32(inw), np.int32(outw), np.int32(hscale), np.int32(wscale),
            np.int32(ingrad.size), block=block, grid=grid
        )

    elif mode == "linear":
        # Allocated zero-filled, unlike the nearest branch which uses an uninitialised buffer.
        ingrad = GPUArray.zeros((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

        block = (warpSize, nthreads // warpSize, 1)
        grid = (roundUpDiv(outw, block[0]), roundUpDiv(outh, block[1]), 1)

        # Input/output coordinate ratios. A gradient extent of 1 would divide by zero;
        # the only output coordinate is then 0, so a ratio of 0 is used.
        rh = (inh - 1) / (outh - 1) if outh > 1 else 0.0
        rw = (inw - 1) / (outw - 1) if outw > 1 else 0.0

        linearMod.upsample2dLinearBackward(
            ingrad, grad, np.int32(batchsize), np.int32(maps),
            np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw),
            np.float32(rh), np.float32(rw),
            block=block, grid=grid
        )

    else:
        raise NotImplementedError(mode)

    return ingrad
def warpAffine(data, coeffs, memoryType, outshape=None, interpolation=InterpolationMode.nn, cval=0,
               backward=False, allocator=memPool):
    """Warp an image with affine coefficients through NPP.

    :param data: 2-d array (grayscale memory type only) or 3-d array.
    :param coeffs: affine coefficients forwarded to the NPP routine.
    :param memoryType: memory layout descriptor of ``data``.
    :param outshape: shape of the result; defaults to ``data.shape``.
    :param interpolation: interpolation mode enum; its ``value`` is passed to NPP.
    :param cval: value the output is filled with before warping.
    :param backward: when true ``libnpp.nppiWarpAffineBack`` is used instead of ``libnpp.nppiWarpAffine``.
    :param allocator: allocator used for the output array.
    :return: newly allocated warped array.
    """
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3

    inrect = getDataRect(data, memoryType)
    insize = (inrect[2], inrect[3])
    inline = getMemoryTypeLineSize(inrect[2], data.dtype, memoryType)

    if outshape is None:
        outshape = data.shape

    outrect = getOutDataRect(data, outshape, memoryType)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    outdata = GPUArray.empty(outshape, dtype=data.dtype, allocator=allocator)
    # Pre-fill so the output holds cval before the warp writes into it.
    outdata.fill(cval)

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)

    warpMethod = libnpp.nppiWarpAffine
    if backward:
        warpMethod = libnpp.nppiWarpAffineBack

    warpMethod(
        getDataType(data).value, memoryType.value,
        dataPtr, insize, inline, inrect,
        outdataPtr, outline, outrect,
        coeffs, interpolation.value
    )
    return outdata
def wrapAddVectorToVector(x, y, out=None, alpha=1.0, beta=1.0, allocator=memoryPool):
    """Invoke the elementwise add kernel for ``x`` and ``y`` with coefficients ``alpha`` and ``beta``.

    The kernel is obtained from ``addKer(out.dtype)`` and called as ``(out, x, alpha, y, beta)``;
    presumably it writes ``alpha * x + beta * y`` into ``out`` - confirm against the kernel source.

    :param x: first GPU array.
    :param y: second GPU array.
    :param out: optional preallocated output with the shape of ``x``.
    :param alpha: coefficient passed alongside ``x``.
    :param beta: coefficient passed alongside ``y``.
    :param allocator: allocator used when ``out`` is not given.
    :return: the output array.
    """
    if out is not None:
        assert out.shape == x.shape
    else:
        out = GPUArray.empty(x.shape, dtype=x.dtype, allocator=allocator)

    kernel = addKer(out.dtype)
    kernel(out, x, alpha, y, beta)

    return out
def instanceNorm2d(data, scale, bias, epsilon=1e-5, out=None, allocator=None):
    """Instance normalisation implemented on top of ``context.batchNormNd``.

    The batch and map axes are folded into one map axis of size ``batch * maps`` with a batch
    of one, so the batch-norm routine sees every (sample, map) pair as its own map.

    :param data: GPU array of shape ``(batch, maps, height, width)``.
    :param scale: scale array; tiled ``batch`` times along axis 0 when ``batch > 1``.
    :param bias: bias array; tiled the same way as ``scale``.
    :param epsilon: numerical stabiliser forwarded to ``batchNormNd``.
    :param out: optional output buffer forwarded to ``batchNormNd``.
    :param allocator: allocator for temporaries and the result.
    :return: tuple ``(outdata, savemean, saveinvvar, scale)`` where ``outdata`` is reshaped back to
        ``data.shape`` and ``scale`` is the (possibly tiled) scale actually used.
    """
    batchsize, maps, height, width = data.shape
    extmaps = batchsize * maps

    folded = data.reshape(1, extmaps, height, width)

    mean = GPUArray.empty((extmaps, ), dtype=np.float32, allocator=allocator)
    var = GPUArray.empty((extmaps, ), dtype=np.float32, allocator=allocator)

    if batchsize > 1:
        scale = tile(scale, batchsize, axis=0, allocator=allocator)
        bias = tile(bias, batchsize, axis=0, allocator=allocator)

    result = context.batchNormNd(
        folded, mean, var, scale, bias, epsilon, test=False, out=out, allocator=allocator
    )
    outdata, savemean, saveinvvar = result

    return outdata.reshape(data.shape), savemean, saveinvvar, scale
def scanSum(data):
    """Run the single-block ``scanSum`` kernel on a 1-d uint32 GPU array.

    :param data: 1-d uint32 GPU array with at most ``NT`` elements.
    :return: newly allocated array with the shape and dtype of ``data``.
    """
    assert data.dtype == np.uint32

    (length, ) = data.shape
    # Only one block of NT threads is launched, so the input must fit into it.
    assert length <= NT

    outdata = GPUArray.empty(data.shape, dtype=data.dtype, allocator=memPool)
    scanMod.scanSum(
        outdata, data, np.int32(length),
        block=(NT, 1, 1), grid=(1, 1, 1)
    )
    return outdata
def maxpool2d(data, size, stride, pad, allocator=memPool):
    """Launch the 2-d max-pooling kernel.

    :param data: float32 GPU array of shape ``(batch, maps, inh, inw)``.
    :param size: ``(fh, fw)`` window size.
    :param stride: ``(hstride, wstride)``.
    :param pad: ``(hpad, wpad)``.
    :param allocator: allocator used for the results.
    :return: tuple ``(outdata, mask)``; both have shape ``(batch, maps, outh, outw)``, with
        ``outdata`` float32 and ``mask`` int32.
    """
    assert data.dtype == np.float32

    batchsize, maps, inh, inw = data.shape
    fh, fw = size
    hstride, wstride = stride
    hpad, wpad = pad

    outh = (inh - fh + 2 * hpad) // hstride + 1
    outw = (inw - fw + 2 * wpad) // wstride + 1
    outshape = (batchsize, maps, outh, outw)

    outdata = GPUArray.empty(outshape, dtype=np.float32, allocator=allocator)
    mask = GPUArray.empty(outshape, dtype=np.int32, allocator=allocator)

    # One kernel thread per output element.
    total = prod(outdata.shape)

    mod.maxpool2d(
        outdata, data, mask,
        np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw), np.int32(maps),
        np.int32(hstride), np.int32(wstride), np.int32(hpad), np.int32(wpad),
        np.int32(fh), np.int32(fw), np.int32(total),
        block=(nthreads, 1, 1), grid=(roundUpDiv(total, nthreads), 1, 1)
    )
    return outdata, mask
def ctcLoss(data, datalen, labels, lengths, blank, error=None, normalized=False, returnAlphas=False):
    """Run the CTC ``calcAlphas`` and ``calcBetas`` kernels and return the error and gradient.

    :param data: GPU array of shape ``(T, batch, vocab)``; softmax is applied first unless ``normalized``.
    :param datalen: per-sample data lengths, forwarded to the kernels.
    :param labels: concatenated labels, forwarded to the kernels.
    :param lengths: host array of per-sample label lengths.
    :param blank: index of the blank symbol.
    :param error: optional scalar GPU array; a zero-filled one is created when ``None``.
    :param normalized: set when ``data`` already holds softmax outputs.
    :param returnAlphas: when true the alphas buffer is returned as a third element.
    :return: ``(error, grad)`` or ``(error, grad, alphas)``.
    :raises ValueError: from ``min`` when no kernel configuration fits the longest label.
    """
    T, batchsize, vocabsize = data.shape

    # Extended label length (label with blanks interleaved) of the longest sequence.
    mx = 2 * np.max(lengths) + 1

    # Smallest configuration index whose capacity covers the longest extended label.
    config = min(idx for idx, (threads, perThread) in enumerate(configs) if mx <= threads * perThread)
    kernels = modules[config]
    blockThreads = configs[config][0]

    if not normalized:
        flat = data.reshape(T * batchsize, vocabsize, 1, 1)
        data = cudnn.softmaxNd(flat, allocator=memPool).reshape(T, batchsize, vocabsize)

    cumLengths = np.cumsum(lengths, dtype=np.int32)

    # Exclusive-prefix offsets: a leading zero followed by the running totals.
    extOffsets = np.empty(shape=(batchsize + 1, ), dtype=np.int32)
    extOffsets[0] = 0
    extOffsets[1:] = cumLengths

    alphas = GPUArray.empty((T * (2 * int(cumLengths[-1]) + batchsize), ), dtype=np.float32, allocator=memPool)
    offsets = GPUArray.toGpu(extOffsets, allocator=memPool)
    nll = GPUArray.empty((batchsize, ), dtype=np.float32, allocator=memPool)

    if error is None:
        error = GPUArray.zeros((), dtype=np.float32, allocator=memPool)

    grad = GPUArray.zeros(data.shape, dtype=np.float32, allocator=memPool)

    # One block per batch sample for both passes.
    launch = {"block": (blockThreads, 1, 1), "grid": (batchsize, 1, 1)}

    kernels.calcAlphas(
        data, datalen, np.int32(T), np.int32(vocabsize), labels, offsets, alphas,
        np.int32(blank), nll, error, **launch
    )
    kernels.calcBetas(
        data, datalen, np.int32(T), np.int32(vocabsize), labels, offsets, alphas,
        np.int32(blank), nll, grad, **launch
    )

    if returnAlphas:
        return error, grad, alphas

    return error, grad
def argminmax(tensor, axis, mode, allocator):
    """Compute arg-min or arg-max indices of a GPU tensor along ``axis``.

    :param tensor: float32 or float16 GPU array.
    :param axis: axis index in ``[0, tensor.ndim)``.
    :param mode: ``"max"`` or ``"min"``; any other key raises ``KeyError``.
    :param allocator: allocator used for the index array.
    :return: int32 array whose shape is ``tensor.shape`` with ``axis`` removed.
    """
    assert tensor.dtype == np.float32 or tensor.dtype == np.float16
    assert 0 <= axis < tensor.ndim

    kernels = {"max": maxmod, "min": minmod}[mode]
    shape = tensor.shape

    if axis == tensor.ndim - 1:
        # Reduction over the last axis: one warp-sized block per row.
        rows = shape[:-1]
        idx = GPUArray.empty(rows, dtype=np.int32, allocator=allocator)

        if tensor.dtype == np.float32:
            fn = kernels.minMaxOnRow
        else:
            fn = kernels.minMaxOnRowFP16

        fn(idx, tensor, np.int32(tensor.dimAt(-1)), block=(warpSize, 1, 1), grid=(prod(rows), 1, 1))
        return idx

    # Reduction over an inner axis: the tensor is viewed as (z, axis extent, width).
    z = prod(shape[:axis])
    width = prod(shape[axis + 1:])

    block = (NT, 1, 1)
    grid = (roundUpDiv(width, block[0]), 1, z)

    idx = GPUArray.empty(shape[:axis] + shape[axis + 1:], dtype=np.int32, allocator=allocator)

    if tensor.dtype == np.float32:
        fn = kernels.minMaxOnCol
    else:
        fn = kernels.minMaxOnColFP16

    fn(idx, tensor, np.int32(width), np.int32(tensor.dimAt(axis)), block=block, grid=grid)
    return idx
def createRnn(insize, hsize, dtype, layers=1, algo=RNNAlgo.standard, mode=RNNMode.lstm,
              direction=DirectionMode.uni, dropout=0.0, seed=0, batchsize=0):
    """Create a cuDNN RNN descriptor together with its flat weight buffer and parameter views.

    :param insize: input size.
    :param hsize: hidden size.
    :param dtype: data type; converted with ``np.dtype`` for the descriptor.
    :param layers: number of layers.
    :param algo: RNN algorithm enum; its ``value`` is passed on.
    :param mode: RNN mode enum; its ``value`` is passed on.
    :param direction: direction enum; its ``value`` is passed on.
    :param dropout: dropout value passed on.
    :param seed: seed value passed on.
    :param batchsize: batch size passed on.
    :return: tuple ``(rnn, W, params)`` where ``W`` is a 1-d array of ``rnn.wsize`` elements and
        ``params`` comes from ``acquireRnnParams(rnn, W)``.
    """
    rnn = CuDnn.Rnn(
        context, insize, hsize, np.dtype(dtype), layers,
        algo.value, mode.value, direction.value,
        dropout, seed, batchsize
    )

    # Single flat buffer holding all of the weights.
    W = GPUArray.empty((rnn.wsize, ), dtype=dtype)

    return rnn, W, acquireRnnParams(rnn, W)
def build(self):
    """Allocate one backing array and replace every registered block with a view into it.

    ``self.blocks`` is expected to map names to ``(shape, nbytes)`` pairs on entry. Afterwards
    it maps the same names, in the same order, to GPU arrays that slice ``self.ary.gpudata``.
    """
    pending = self.blocks

    totalbytes = 0
    for _, nbytes in pending.values():
        totalbytes += self.align(nbytes)

    self.ary = GPUArray.empty(
        shape=(totalbytes // self.dtype.itemsize, ), dtype=self.dtype, allocator=self.allocator
    )

    views = OrderedDict()
    cursor = 0

    for name, (shape, nbytes) in pending.items():
        region = self.ary.gpudata[cursor:cursor + nbytes]
        views[name] = GPUArray(shape=shape, dtype=self.dtype, gpudata=region)

        # Advance by the aligned size, so each block starts on an aligned offset.
        cursor += self.align(nbytes)

    self.blocks = views
def eltwiseTest():
    """Check ``ElementwiseKernel`` on a squaring kernel, both over a full array and over a strided slice."""
    length = 1 << 18

    hostInData = np.random.randint(0, 1000, size=(length, ), dtype=np.int32)
    indata = GPUArray.toGpu(hostInData)
    outdata = GPUArray.empty((length, ), dtype=np.int32)

    square = ElementwiseKernel(
        [(int_t.ptr, "outdata"), (int_t.const.ptr, "indata")],
        "outdata[i] = indata[i] * indata[i]",
        "square"
    )

    # Full-array pass.
    square(outdata, indata)
    hostOutData = hostInData**2
    assert np.allclose(hostOutData, outdata.get())

    # In-place pass restricted to every tenth element.
    square(outdata, outdata, slice=slice(None, None, 10))
    hostOutData[::10] = hostOutData[::10]**2
    assert np.allclose(hostOutData, outdata.get())
def preluBackwardParams(indata, outgrad, sharedMaps=False, allocator=memPool):
    """Compute the PReLU slope gradient.

    The kernel writes a per-element gradient of shape ``outgrad.shape[1:]``, which is then
    collapsed by ``matsum`` along axis 1 of a 2-d view.

    :param indata: float32 GPU array that was fed to the PReLU.
    :param outgrad: float32 GPU array with the same shape as ``indata``.
    :param sharedMaps: when true the whole gradient is summed as a single row, otherwise one
        row per entry of axis 0 of the per-element gradient.
    :param allocator: allocator used for the intermediate array.
    :return: result of ``matsum`` over the reshaped per-element gradient.
    """
    assert indata.dtype == outgrad.dtype
    assert outgrad.dtype == np.float32
    assert indata.shape == outgrad.shape

    sampleShape = outgrad.shape[1:]

    # Element count of one sample; also used as the stride between samples.
    size = stride = prod(sampleShape)

    slopegrad = GPUArray.empty(sampleShape, dtype=np.float32, allocator=allocator)

    mod.preluBackwardParams(
        slopegrad, outgrad, indata,
        np.int32(outgrad.shape[0]), np.int32(stride), np.int32(size),
        block=(nthreads, 1, 1), grid=(roundUpDiv(size, nthreads), 1, 1)
    )

    if sharedMaps:
        shape = (1, prod(slopegrad.shape))
    else:
        shape = (slopegrad.shape[0], prod(slopegrad.shape[1:]))

    return matsum(slopegrad.reshape(shape), axis=1)
def reduce(self, stage, allocator, *args):
    """Run one reduction stage and recurse with stage 2 until one value remains.

    :param stage: stage identifier used to pick the kernel ``"<name>_stage<stage>"``.
    :param allocator: allocator used for the partial results.
    :param args: kernel arguments, preprocessed by ``self.prepareArguments``.
    :return: GPU array of dtype ``self.outtype`` with shape ``()``.
    """
    size, args = self.prepareArguments(args)

    blockbit = 9
    blocksize = 1 << blockbit

    # Number of blocks needed to cover the input, capped at the block size.
    needed = (size + blocksize - 1) >> blockbit
    blocks = min(needed, blocksize)
    multi = blocks > 1

    outshape = (blocks, ) if multi else ()
    partials = GPUArray.empty(outshape, dtype=self.outtype, allocator=allocator)

    kernel = self.module.getFunction("%s_stage%s" % (self.name, stage))
    kernel(*args, partials, np.int32(size), block=(blocksize, 1, 1), grid=(blocks, 1, 1))

    if multi:
        # More than one partial remains: fold them with the second-stage kernel.
        return self.reduce(2, allocator, partials)

    return partials
def maxpool2dBackward(grad, origshape, mask, size, stride, pad, allocator=memPool):
    """Launch the backward kernel of 2-d max-pooling.

    :param grad: float32 GPU array of shape ``(batch, maps, outh, outw)``.
    :param origshape: shape whose entries 2 and 3 give the height and width of the result.
    :param mask: int32 GPU array passed through to the kernel.
    :param size: ``(fh, fw)`` window size.
    :param stride: ``(hstride, wstride)``.
    :param pad: ``(hpad, wpad)``.
    :param allocator: allocator used for the result.
    :return: float32 array of shape ``(batch, maps, origshape[2], origshape[3])``.
    """
    assert grad.dtype == np.float32
    assert mask.dtype == np.int32

    batchsize, maps, outh, outw = grad.shape
    fh, fw = size
    hstride, wstride = stride
    hpad, wpad = pad

    inh = origshape[2]
    inw = origshape[3]

    ingrad = GPUArray.empty((batchsize, maps, inh, inw), dtype=np.float32, allocator=allocator)

    # One kernel thread per element of the input gradient.
    total = prod(ingrad.shape)

    mod.maxpool2dBackward(
        ingrad, grad, mask,
        np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw), np.int32(maps),
        np.int32(hstride), np.int32(wstride), np.int32(hpad), np.int32(wpad),
        np.int32(fh), np.int32(fw), np.int32(total),
        block=(nthreads, 1, 1), grid=(roundUpDiv(total, nthreads), 1, 1)
    )
    return ingrad
def prelu(data, slopes, inplace=False, sharedMaps=False, allocator=memPool):
    """Launch the PReLU forward kernel.

    :param data: float32 GPU array.
    :param slopes: float32 slopes; shape ``(1, )`` when ``sharedMaps``, otherwise its first
        extent must equal ``data.shape[1]``.
    :param inplace: when true the result is written into ``data``.
    :param sharedMaps: whether a single slope is used for all maps.
    :param allocator: allocator used for the output when not in place.
    :return: the output array (``data`` itself when ``inplace``).
    """
    assert data.dtype == slopes.dtype
    assert slopes.dtype == np.float32

    if sharedMaps:
        assert slopes.shape == (1, )
    else:
        assert data.shape[1] == slopes.shape[0]

    if inplace:
        outdata = data
    else:
        outdata = GPUArray.empty(data.shape, dtype=np.float32, allocator=allocator)

    maps = data.shape[1]
    mapsize = prod(data.shape[2:])
    size = prod(data.shape)

    divFactor = maps if sharedMaps else 1

    mod.prelu(
        outdata, data, slopes,
        np.int32(divFactor), np.int32(mapsize), np.int32(maps), np.int32(size),
        block=(nthreads, 1, 1), grid=(roundUpDiv(size, nthreads), 1, 1)
    )
    return outdata
def preluBackwardData(grad, slopes, indata, sharedMaps=False, allocator=memPool):
    """Launch the PReLU backward-data kernel.

    :param grad: float32 GPU array of output gradients.
    :param slopes: float32 slopes; shape ``(1, )`` when ``sharedMaps``, otherwise its first
        extent must equal ``grad.shape[1]``.
    :param indata: float32 GPU array with the shape of ``grad``.
    :param sharedMaps: whether a single slope is used for all maps.
    :param allocator: allocator used for the result.
    :return: newly allocated float32 array with the shape of ``grad``.
    """
    assert grad.dtype == slopes.dtype
    assert slopes.dtype == indata.dtype
    assert indata.dtype == np.float32
    assert grad.shape == indata.shape

    if sharedMaps:
        assert slopes.shape == (1, )
    else:
        assert grad.shape[1] == slopes.shape[0]

    ingrad = GPUArray.empty(grad.shape, dtype=np.float32, allocator=allocator)

    maps = grad.shape[1]
    mapsize = prod(grad.shape[2:])
    size = prod(grad.shape)

    divFactor = maps if sharedMaps else 1

    mod.preluBackwardData(
        ingrad, grad, slopes, indata,
        np.int32(divFactor), np.int32(mapsize), np.int32(maps), np.int32(size),
        block=(nthreads, 1, 1), grid=(roundUpDiv(size, nthreads), 1, 1)
    )
    return ingrad
def addVecToMat(vec, mat, axis=0, out=None, allocator=memPool):
    """Add a vector to a matrix (or a batch of them) along rows or columns.

    :param vec: GPU array with one dimension fewer than ``mat`` and the same dtype.
    :param mat: float32 or float16 GPU array; leading (batch) extents must match those of ``vec``.
    :param axis: ``1`` selects the row-vector kernels, ``0`` the column-vector kernel.
    :param out: optional preallocated output.
    :param allocator: allocator used when ``out`` is not given.
    :return: the output array.
    """
    assert vec.dtype == mat.dtype and (mat.dtype == np.float32 or mat.dtype == np.float16)
    assert vec.ndim == mat.ndim - 1 and 0 <= axis < 2
    assert mat.shape[:-2] == vec.shape[:-1]

    if out is None:
        out = GPUArray.empty(mat.shape, dtype=mat.dtype, allocator=allocator)

    z = prod(mat.shape[:-2])
    n, m = mat.shape[-2:]

    block = (warpSize, warpSize, 1)
    grid = (roundUpDiv(m, block[0]), roundUpDiv(n, block[1]), z)

    if axis != 1:
        if mat.dtype == np.float32:
            fn = addmod.opColVecToMat
        else:
            fn = addmod.opColVecToMatFP16

        fn(out, vec, mat, np.int32(n), np.int32(m), block=block, grid=grid)
        return out

    if mat.dimAt(-1) == vec.dimAt(-1):
        if mat.dtype == np.float32:
            fn = addmod.opRowVecToMat
        else:
            fn = addmod.opRowVecToMatFP16

        fn(out, vec, mat, np.int32(n), np.int32(m), block=block, grid=grid)
        return out

    # A shorter vector is accepted when its length divides the row length.
    assert mat.dimAt(-1) % vec.dimAt(-1) == 0

    if mat.dtype == np.float32:
        fn = addmod.opRowOneVecToMat
    else:
        fn = addmod.opRowOneVecToMatFP16

    fn(out, vec, mat, np.int32(n), np.int32(m), np.int32(vec.dimAt(-1)), block=block, grid=grid)
    return out
def randomTest():
    """Smoke test: fill one float32 GPU buffer with ``fillUniform`` and then with ``fillNormal``."""
    buffer = GPUArray.empty((100, ), dtype=np.float32)

    # No values are checked; the test only verifies that both fills run.
    fillUniform(buffer, minval=-1.0, maxval=1.0)
    fillNormal(buffer, mean=1.0, stddev=0.1)