Example 1
def svm(scores, labels, mode, error=None, allocator=memPool):
    assert scores.dtype == np.float32 and labels.dtype == np.int32
    shape = scores.shape

    grad = GPUArray.empty(shape, dtype=np.float32, allocator=allocator)
    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    error.fill(0.0)

    size = prod(scores.shape)
    spatialDim = prod(scores.shape[2:])
    mapStride = spatialDim * scores.shape[1]

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    mod = {"l1": svmL1Mod, "l2": svmL2Mod}[mode]

    mod.cost(scores,
             labels,
             np.int32(size),
             np.int32(mapStride),
             np.int32(spatialDim),
             np.int32(shape[1]),
             np.int32(shape[0]),
             error,
             grad,
             block=block,
             grid=grid)

    return error, grad
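
A minimal usage sketch with hypothetical shapes, assuming the same module-level helpers (GPUArray.toGpu, memPool, np) that Examples 18 and 22 use:

scores = GPUArray.toGpu(np.random.randn(32, 10).astype(np.float32))
labels = GPUArray.toGpu(np.random.randint(0, 10, size=(32, ), dtype=np.int32))

error, grad = svm(scores, labels, mode="l1")  # "l1"/"l2" selects the hinge-loss kernel module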
Example 2
def reflectpad(data, pad, allocator=memPool):
    if data.ndim == 3:
        batchsize, maps, insize = data.shape
        lpad, rpad = pad

        assert insize >= max(lpad, rpad) + 1
        outsize = insize + lpad + rpad

        block = (warpSize, 1, 1)
        grid = (roundUpDiv(outsize, warpSize), maps, batchsize)

        outdata = GPUArray.empty((batchsize, maps, outsize),
                                 dtype=data.dtype,
                                 allocator=allocator)
        fn = mod.reflectpad1d if data.dtype == np.float32 else mod.reflectpad1dFP16

        fn(outdata,
           data,
           np.int32(insize),
           np.int32(lpad),
           np.int32(rpad),
           block=block,
           grid=grid)

    elif data.ndim == 4:
        batchsize, maps, inh, inw = data.shape
        upad, bpad, lpad, rpad = pad

        assert inh >= max(upad, bpad) + 1 and inw >= max(lpad, rpad) + 1
        outh, outw = inh + upad + bpad, inw + lpad + rpad

        block = (warpSize, 1, 1)
        grid = (roundUpDiv(outh * outw, warpSize), maps, batchsize)

        outdata = GPUArray.empty((batchsize, maps, outh, outw),
                                 dtype=data.dtype,
                                 allocator=allocator)
        fn = mod.reflectpad2d if data.dtype == np.float32 else mod.reflectpad2dFP16

        fn(outdata,
           data,
           np.int32(inh),
           np.int32(inw),
           np.int32(upad),
           np.int32(bpad),
           np.int32(lpad),
           np.int32(rpad),
           block=block,
           grid=grid)

    else:
        raise NotImplementedError(data.ndim)

    return outdata
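
A hypothetical 3D call (a batch of 1D sequences); the assert insize >= max(lpad, rpad) + 1 holds for these numbers:

data = GPUArray.toGpu(np.random.randn(16, 4, 32).astype(np.float32))
padded = reflectpad(data, pad=(2, 3))  # output shape (16, 4, 32 + 2 + 3) == (16, 4, 37)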
Example 3
def segmentSeq(data):
	assert data.dtype == np.int32

	length, = data.shape
	assert length <= NV

	segments = GPUArray.empty((length, 3), dtype=np.int32, allocator=memPool)
	indices = GPUArray.empty(data.shape, dtype=np.int32, allocator=memPool)

	segmentMod.segmentSeq(segments, indices, data, np.int32(length), block=(NT, 1, 1), grid=(1, 1, 1))
	return segments, indices
Example 4
def radixSort(keys, values):
	assert keys.dtype == np.int32 and values.dtype == np.int32
	assert keys.shape == values.shape

	length, = keys.shape
	assert length <= NV

	outkeys = GPUArray.empty(keys.shape, dtype=keys.dtype, allocator=memPool)
	outvalues = GPUArray.empty(values.shape, dtype=values.dtype, allocator=memPool)

	radixMod.radixSort(outkeys, outvalues, keys, values, np.int32(length), block=(NT, 1, 1), grid=(1, 1, 1))
	return outkeys, outvalues
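
A usage sketch with hypothetical data, assuming length 256 is within the NV capacity limit the assert enforces:

keys = GPUArray.toGpu(np.random.randint(0, 100, size=(256, ), dtype=np.int32))
values = GPUArray.toGpu(np.arange(256, dtype=np.int32))

outkeys, outvalues = radixSort(keys, values)  # values are permuted together with their keys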
Example 5
def crossEntropy(scores, labels, weights=None, error=None, allocator=memPool):
    assert scores.dtype == np.float32 and labels.dtype == np.int32

    shape = scores.shape
    if scores.ndim < 4:
        scores = scores.reshape(*shape, *(1 for _ in range(4 - scores.ndim)))

    softmax = cudnn.softmaxNd(scores,
                              mode=SoftMaxMode.spatial.value,
                              allocator=allocator)

    grad = GPUArray.empty(shape, dtype=np.float32, allocator=allocator)
    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    error.fill(0.0)

    size = prod(scores.shape)
    spatialDim = prod(scores.shape[2:])
    mapStride = spatialDim * scores.shape[1]

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    if weights is None:
        ceMod.cost(softmax,
                   labels,
                   np.int32(size),
                   np.int32(mapStride),
                   np.int32(spatialDim),
                   np.int32(scores.shape[1]),
                   np.int32(scores.shape[0]),
                   error,
                   grad,
                   block=block,
                   grid=grid)

    else:
        wceMod.cost(softmax,
                    labels,
                    weights,
                    np.int32(size),
                    np.int32(mapStride),
                    np.int32(spatialDim),
                    np.int32(scores.shape[1]),
                    np.int32(scores.shape[0]),
                    error,
                    grad,
                    block=block,
                    grid=grid)

    return error, grad
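
A minimal sketch of the unweighted path with hypothetical 2D scores; inputs with ndim < 4 are padded with trailing unit axes before the softmax:

scores = GPUArray.toGpu(np.random.randn(32, 10).astype(np.float32))
labels = GPUArray.toGpu(np.random.randint(0, 10, size=(32, ), dtype=np.int32))

error, grad = crossEntropy(scores, labels)  # scalar error plus a gradient of the original shape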
Example 6
def concatenate(tup, axis, out=None, allocator=memoryPool):
    ary = tup[0]

    dtype, reducedShape = ary.dtype, ary.shape
    reducedShape = reducedShape[:axis] + reducedShape[axis + 1:]

    assert all(a.dtype == dtype and a.shape[:axis] +
               a.shape[axis + 1:] == reducedShape for a in tup[1:])

    concatDim = sum(a.dimAt(axis) for a in tup)
    shape = reducedShape[:axis] + (concatDim, ) + reducedShape[axis:]

    if out is None:
        out = GPUArray.empty(shape, dtype=dtype, allocator=allocator)
    else:
        assert out.shape == shape and out.dtype == dtype

    dstPitch = out.strideAt(axis - 1) if axis > 0 else out.nbytes
    height = prod(shape[:axis])

    stride = 0

    for a in tup:
        srcPitch = width = a.strideAt(axis - 1) if axis > 0 else a.nbytes

        Driver.memcpy2D(width,
                        height,
                        a.gpudata,
                        srcPitch,
                        out.gpudata,
                        dstPitch,
                        dstX=stride)
        stride += width

    return out
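
A hypothetical example along axis 1; the inputs must agree on every axis except the concatenation axis:

a = GPUArray.toGpu(np.zeros((4, 3), dtype=np.float32))
b = GPUArray.toGpu(np.ones((4, 5), dtype=np.float32))

out = concatenate((a, b), axis=1)  # out.shape == (4, 8)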
Example 7
def split(ary, sections, axis, allocator=memoryPool):
    shape = ary.shape
    assert sum(sections) == shape[axis]

    outs = [
        GPUArray.empty(shape[:axis] + (sec, ) + shape[axis + 1:],
                       dtype=ary.dtype,
                       allocator=allocator) for sec in sections
    ]

    srcPitch = ary.strideAt(axis - 1) if axis > 0 else ary.nbytes
    height = prod(shape[:axis])

    stride = 0

    for out in outs:
        dstPitch = width = out.strideAt(axis - 1) if axis > 0 else out.nbytes

        Driver.memcpy2D(width,
                        height,
                        ary.gpudata,
                        srcPitch,
                        out.gpudata,
                        dstPitch,
                        srcX=stride)
        stride += width

    return outs
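
The inverse of concatenate, sketched with hypothetical sections that must sum to shape[axis]:

ary = GPUArray.toGpu(np.arange(24, dtype=np.float32).reshape(4, 6))
left, right = split(ary, sections=(2, 4), axis=1)  # shapes (4, 2) and (4, 4)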
Example 8
def rescale(data,
            scale,
            memoryType,
            interpolation=InterpolationMode.nn,
            outdata=None,
            allocator=memPool):
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3
    hscale, wscale = (scale, scale) if isinstance(scale,
                                                  (int, float)) else scale

    inrect = getDataRect(data, memoryType)
    insize, inline = (inrect[2], inrect[3]), getMemoryTypeLineSize(
        inrect[2], data.dtype, memoryType)

    outrect = libnpp.nppiGetResizeRect(inrect, wscale, hscale, 0, 0,
                                       interpolation.value)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    outshape = getOutDataShape(data, outrect, memoryType)

    if outdata is None:
        outdata = GPUArray.empty(outshape,
                                 dtype=data.dtype,
                                 allocator=allocator)
    else:
        assert outdata.shape == outshape

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)

    libnpp.nppiResizeSqrPixel(
        getDataType(data).value, memoryType.value, dataPtr, insize, inline,
        inrect, outdataPtr, outline, outrect, wscale, hscale, 0, 0,
        interpolation.value)

    return outdata
Example 9
def maxunpool2dBackward(grad, poolshape, mask, allocator=memPool):
    assert grad.dtype == np.float32 and mask.dtype == np.int32
    batchsize, maps, outh, outw = grad.shape

    inh, inw = poolshape[2], poolshape[3]
    ingrad = GPUArray.empty((batchsize, maps, inh, inw),
                            dtype=np.float32,
                            allocator=allocator)

    size = prod(ingrad.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    mod.maxunpool2dBackward(ingrad,
                            grad,
                            mask,
                            np.int32(inh),
                            np.int32(inw),
                            np.int32(outh),
                            np.int32(outw),
                            np.int32(maps),
                            np.int32(size),
                            block=block,
                            grid=grid)

    return ingrad
Example 10
def upsample3d(data, scale, mode="nearest", allocator=memPool):
	batchsize, maps, ind, inh, inw = data.shape
	dscale, hscale, wscale = (scale, scale, scale) if isinstance(scale, int) else scale

	outd, outh, outw = dscale * ind, hscale * inh, wscale * inw
	outdata = GPUArray.empty((batchsize, maps, outd, outh, outw), dtype=data.dtype, allocator=allocator)

	if mode == "nearest":
		block = (wblocksize, hblocksize, 1)
		grid = (roundUpDiv(inw, block[0]), roundUpDiv(inh, block[1]), batchsize * maps * ind)

		nearestMod.upsample3dNearest(
			outdata, data, np.int32(ind), np.int32(inh), np.int32(inw),
			np.int32(outd), np.int32(outh), np.int32(outw), np.int32(dscale), np.int32(hscale), np.int32(wscale),
			block=block, grid=grid
		)

	elif mode == "linear":
		block = (warpSize, nthreads // warpSize, 1)
		grid = (roundUpDiv(outw, block[0]), roundUpDiv(outh, block[1]), outd)

		rd, rh, rw = (ind - 1) / (outd - 1), (inh - 1) / (outh - 1), (inw - 1) / (outw - 1)

		linearMod.upsample3dLinear(
			outdata, data, np.int32(batchsize), np.int32(maps), np.int32(ind), np.int32(inh), np.int32(inw),
			np.int32(outd), np.int32(outh), np.int32(outw), np.float32(rd), np.float32(rh), np.float32(rw),
			block=block, grid=grid
		)

	else:
		raise NotImplementedError(mode)

	return outdata
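
Hypothetical calls for both modes; an integer scale is broadcast to all three spatial axes:

data = GPUArray.toGpu(np.random.randn(2, 3, 4, 5, 6).astype(np.float32))

out = upsample3d(data, scale=2, mode="nearest")         # shape (2, 3, 8, 10, 12)
out = upsample3d(data, scale=(1, 2, 2), mode="linear")  # per-axis scale factors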
Example 11
def warpAffinePoints(data,
                     inpoints,
                     outpoints,
                     memoryType,
                     outshape=None,
                     interpolation=InterpolationMode.nn,
                     cval=0,
                     clip=True,
                     allocator=memPool):
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3

    inrect = getDataRect(data, memoryType)
    insize, inline = (inrect[2], inrect[3]), getMemoryTypeLineSize(
        inrect[2], data.dtype, memoryType)

    if outshape is None:
        outshape = data.shape

    outrect = getOutDataRect(data, outshape, memoryType)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    outdata = GPUArray.empty(outshape, dtype=data.dtype, allocator=allocator)
    outdata.fill(cval)

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)
    srcQuad, dstQuad = genAffineQuads(inpoints, outpoints, clip, inrect)

    libnpp.nppiWarpAffineQuad(
        getDataType(data).value, memoryType.value, dataPtr, insize, inline,
        inrect, srcQuad, outdataPtr, outline, outrect, dstQuad,
        interpolation.value)

    return outdata
Example 12
def upsample2dBackward(grad, scale, mode="nearest", allocator=memPool):
	batchsize, maps, outh, outw = grad.shape
	hscale, wscale = (scale, scale) if isinstance(scale, int) else scale

	inh, inw = outh // hscale, outw // wscale

	if mode == "nearest":
		ingrad = GPUArray.empty((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

		blk = warpSize * 8
		block = (blk, 1, 1)
		grid = (roundUpDiv(ingrad.size, blk), 1, 1)

		nearestMod.upsample2dNearestBackward(
			ingrad, grad, np.int32(inw), np.int32(outw), np.int32(hscale), np.int32(wscale), np.int32(ingrad.size),
			block=block, grid=grid
		)

	elif mode == "linear":
		ingrad = GPUArray.zeros((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

		block = (warpSize, nthreads // warpSize, 1)
		grid = (roundUpDiv(outw, block[0]), roundUpDiv(outh, block[1]), 1)

		rh, rw = (inh - 1) / (outh - 1), (inw - 1) / (outw - 1)

		linearMod.upsample2dLinearBackward(
			ingrad, grad, np.int32(batchsize), np.int32(maps), np.int32(inh), np.int32(inw),
			np.int32(outh), np.int32(outw), np.float32(rh), np.float32(rw), block=block, grid=grid
		)

	else:
		raise NotImplementedError(mode)

	return ingrad
Example 13
def warpAffine(data,
               coeffs,
               memoryType,
               outshape=None,
               interpolation=InterpolationMode.nn,
               cval=0,
               backward=False,
               allocator=memPool):
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3

    inrect = getDataRect(data, memoryType)
    insize, inline = (inrect[2], inrect[3]), getMemoryTypeLineSize(
        inrect[2], data.dtype, memoryType)

    if outshape is None:
        outshape = data.shape

    outrect = getOutDataRect(data, outshape, memoryType)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    outdata = GPUArray.empty(outshape, dtype=data.dtype, allocator=allocator)
    outdata.fill(cval)

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)

    warpMethod = libnpp.nppiWarpAffine
    if backward:
        warpMethod = libnpp.nppiWarpAffineBack

    warpMethod(
        getDataType(data).value, memoryType.value, dataPtr, insize, inline,
        inrect, outdataPtr, outline, outrect, coeffs, interpolation.value)

    return outdata
Example 14
	def wrapAddVectorToVector(x, y, out=None, alpha=1.0, beta=1.0, allocator=memoryPool):
		if out is None:
			out = GPUArray.empty(x.shape, dtype=x.dtype, allocator=allocator)
		else:
			assert out.shape == x.shape

		addKer(out.dtype)(out, x, alpha, y, beta)
		return out
Example 15
def instanceNorm2d(data, scale, bias, epsilon=1e-5, out=None, allocator=None):
	batchsize, maps, height, width = data.shape
	extmaps = batchsize * maps

	indata = data.reshape(1, extmaps, height, width)

	mean = GPUArray.empty((extmaps, ), dtype=np.float32, allocator=allocator)
	var = GPUArray.empty((extmaps, ), dtype=np.float32, allocator=allocator)

	if batchsize > 1:
		scale = tile(scale, batchsize, axis=0, allocator=allocator)
		bias = tile(bias, batchsize, axis=0, allocator=allocator)

	outdata, savemean, saveinvvar = context.batchNormNd(
		indata, mean, var, scale, bias, epsilon, test=False, out=out, allocator=allocator
	)
	return outdata.reshape(data.shape), savemean, saveinvvar, scale
Example 16
def scanSum(data):
	assert data.dtype == np.uint32

	length, = data.shape
	assert length <= NT

	outdata = GPUArray.empty(data.shape, dtype=data.dtype, allocator=memPool)

	scanMod.scanSum(outdata, data, np.int32(length), block=(NT, 1, 1), grid=(1, 1, 1))
	return outdata
Example 17
def maxpool2d(data, size, stride, pad, allocator=memPool):
    assert data.dtype == np.float32
    batchsize, maps, inh, inw = data.shape

    fh, fw = size
    hstride, wstride = stride
    hpad, wpad = pad

    outh = (inh - fh + 2 * hpad) // hstride + 1
    outw = (inw - fw + 2 * wpad) // wstride + 1

    outdata = GPUArray.empty((batchsize, maps, outh, outw),
                             dtype=np.float32,
                             allocator=allocator)
    mask = GPUArray.empty((batchsize, maps, outh, outw),
                          dtype=np.int32,
                          allocator=allocator)

    size = prod(outdata.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    mod.maxpool2d(outdata,
                  data,
                  mask,
                  np.int32(inh),
                  np.int32(inw),
                  np.int32(outh),
                  np.int32(outw),
                  np.int32(maps),
                  np.int32(hstride),
                  np.int32(wstride),
                  np.int32(hpad),
                  np.int32(wpad),
                  np.int32(fh),
                  np.int32(fw),
                  np.int32(size),
                  block=block,
                  grid=grid)

    return outdata, mask
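
A worked example of the output-size arithmetic, with hypothetical settings:

data = GPUArray.toGpu(np.random.randn(8, 16, 32, 32).astype(np.float32))
outdata, mask = maxpool2d(data, size=(3, 3), stride=(2, 2), pad=(1, 1))

# outh = (32 - 3 + 2 * 1) // 2 + 1 == 16, so outdata.shape == (8, 16, 16, 16);
# mask stores the argmax positions that maxpool2dBackward (Example 25) consumes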
Example 18
def ctcLoss(data, datalen, labels, lengths, blank, error=None, normalized=False, returnAlphas=False):
	T, batchsize, vocabsize = data.shape
	mx = 2 * np.max(lengths) + 1  # extended label length: a blank interleaved around every symbol

	config = min(i for i, (NT, VT) in enumerate(configs) if mx <= NT * VT)
	mod, NT = modules[config], configs[config][0]

	if not normalized:
		data = cudnn.softmaxNd(data.reshape(T * batchsize, vocabsize, 1, 1), allocator=memPool).reshape(
			T, batchsize, vocabsize
		)

	offsets = np.cumsum(lengths, dtype=np.int32)
	extOffsets = np.empty(shape=(batchsize + 1, ), dtype=np.int32)

	extOffsets[0] = 0
	extOffsets[1:] = offsets

	alphas = GPUArray.empty((T * (2 * int(offsets[-1]) + batchsize), ), dtype=np.float32, allocator=memPool)
	offsets = GPUArray.toGpu(extOffsets, allocator=memPool)

	nll = GPUArray.empty((batchsize, ), dtype=np.float32, allocator=memPool)

	error = GPUArray.zeros((), dtype=np.float32, allocator=memPool) if error is None else error
	grad = GPUArray.zeros(data.shape, dtype=np.float32, allocator=memPool)

	mod.calcAlphas(
		data, datalen, np.int32(T), np.int32(vocabsize), labels, offsets, alphas, np.int32(blank),
		nll, error, block=(NT, 1, 1), grid=(batchsize, 1, 1)
	)

	mod.calcBetas(
		data, datalen, np.int32(T), np.int32(vocabsize), labels, offsets, alphas, np.int32(blank),
		nll, grad, block=(NT, 1, 1), grid=(batchsize, 1, 1)
	)

	return (error, grad) if not returnAlphas else (error, grad, alphas)
Example 19
def argminmax(tensor, axis, mode, allocator):
    assert tensor.dtype == np.float32 or tensor.dtype == np.float16
    assert 0 <= axis < tensor.ndim

    mod = {"max": maxmod, "min": minmod}[mode]

    if axis == tensor.ndim - 1:
        block = (warpSize, 1, 1)
        grid = (prod(tensor.shape[:-1]), 1, 1)

        idx = GPUArray.empty(tensor.shape[:-1],
                             dtype=np.int32,
                             allocator=allocator)
        fn = mod.minMaxOnRow if tensor.dtype == np.float32 else mod.minMaxOnRowFP16

        fn(idx, tensor, np.int32(tensor.dimAt(-1)), block=block, grid=grid)

    else:
        z, width = prod(tensor.shape[:axis]), prod(tensor.shape[axis + 1:])

        block = (NT, 1, 1)
        grid = (roundUpDiv(width, block[0]), 1, z)

        idx = GPUArray.empty(tensor.shape[:axis] + tensor.shape[axis + 1:],
                             dtype=np.int32,
                             allocator=allocator)
        fn = mod.minMaxOnCol if tensor.dtype == np.float32 else mod.minMaxOnColFP16

        fn(idx,
           tensor,
           np.int32(width),
           np.int32(tensor.dimAt(axis)),
           block=block,
           grid=grid)

    return idx
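
Hypothetical calls hitting both kernel paths (reduction over the last axis vs. any other axis):

tensor = GPUArray.toGpu(np.random.randn(8, 10).astype(np.float32))

rows = argminmax(tensor, axis=1, mode="max", allocator=memPool)  # per-row argmax, shape (8, )
cols = argminmax(tensor, axis=0, mode="min", allocator=memPool)  # per-column argmin, shape (10, )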
Example 20
def createRnn(insize,
              hsize,
              dtype,
              layers=1,
              algo=RNNAlgo.standard,
              mode=RNNMode.lstm,
              direction=DirectionMode.uni,
              dropout=0.0,
              seed=0,
              batchsize=0):
    rnn = CuDnn.Rnn(context, insize, hsize, np.dtype(dtype), layers,
                    algo.value, mode.value, direction.value, dropout, seed,
                    batchsize)

    W = GPUArray.empty((rnn.wsize, ), dtype=dtype)
    params = acquireRnnParams(rnn, W)

    return rnn, W, params
Example 21
    def build(self):
        # total footprint, with every block padded up to the alignment boundary
        totalbytes = sum(
            self.align(nbytes) for _, nbytes in self.blocks.values())

        self.ary = GPUArray.empty(shape=(totalbytes // self.dtype.itemsize, ),
                                  dtype=self.dtype,
                                  allocator=self.allocator)

        blocks = OrderedDict()
        offset = 0

        # carve each named block out of the single backing array as a view
        for name, (shape, nbytes) in self.blocks.items():
            blocks[name] = GPUArray(shape=shape,
                                    dtype=self.dtype,
                                    gpudata=self.ary.gpudata[offset:offset +
                                                             nbytes])
            offset += self.align(nbytes)

        self.blocks = blocks
Example 22
def eltwiseTest():
    hostInData = np.random.randint(0, 1000, size=(1 << 18, ), dtype=np.int32)

    indata = GPUArray.toGpu(hostInData)
    outdata = GPUArray.empty((1 << 18, ), dtype=np.int32)

    square = ElementwiseKernel([(int_t.ptr, "outdata"),
                                (int_t.const.ptr, "indata")],
                               "outdata[i] = indata[i] * indata[i]", "square")

    square(outdata, indata)

    hostOutData = hostInData**2
    assert np.allclose(hostOutData, outdata.get())

    square(outdata, outdata, slice=slice(None, None, 10))

    hostOutData[::10] = hostOutData[::10]**2
    assert np.allclose(hostOutData, outdata.get())
Example 23
def preluBackwardParams(indata, outgrad, sharedMaps=False, allocator=memPool):
	assert indata.dtype == outgrad.dtype and outgrad.dtype == np.float32
	assert indata.shape == outgrad.shape

	size = prod(outgrad.shape[1:])    # number of per-map slope-gradient elements
	stride = prod(outgrad.shape[1:])  # batch stride: elements per sample (same value, different role)

	block = (nthreads, 1, 1)
	grid = (roundUpDiv(size, nthreads), 1, 1)

	slopegrad = GPUArray.empty(outgrad.shape[1:], dtype=np.float32, allocator=allocator)

	mod.preluBackwardParams(
		slopegrad, outgrad, indata, np.int32(outgrad.shape[0]), np.int32(stride), np.int32(size),
		block=block, grid=grid
	)

	shape = (1, prod(slopegrad.shape)) if sharedMaps else (slopegrad.shape[0], prod(slopegrad.shape[1:]))
	return matsum(slopegrad.reshape(shape), axis=1)
Example 24
    def reduce(self, stage, allocator, *args):
        size, args = self.prepareArguments(args)

        blockbit = 9
        blocksize = 1 << blockbit  # 512 threads per block

        # one partial result per block; the shape collapses to a scalar once a single block suffices
        blocks = min((size + blocksize - 1) >> blockbit, blocksize)
        partials = GPUArray.empty((blocks, ) if blocks > 1 else (),
                                  dtype=self.outtype,
                                  allocator=allocator)

        kernel = self.module.getFunction("%s_stage%s" % (self.name, stage))
        kernel(*args,
               partials,
               np.int32(size),
               block=(blocksize, 1, 1),
               grid=(blocks, 1, 1))

        # with several partials left, recurse with the stage-2 kernel until a scalar remains
        return self.reduce(2, allocator, partials) if blocks > 1 else partials
Example 25
def maxpool2dBackward(grad,
                      origshape,
                      mask,
                      size,
                      stride,
                      pad,
                      allocator=memPool):
    assert grad.dtype == np.float32 and mask.dtype == np.int32
    batchsize, maps, outh, outw = grad.shape

    fh, fw = size
    hstride, wstride = stride
    hpad, wpad = pad

    inh, inw = origshape[2], origshape[3]
    ingrad = GPUArray.empty((batchsize, maps, inh, inw),
                            dtype=np.float32,
                            allocator=allocator)

    size = prod(ingrad.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    mod.maxpool2dBackward(ingrad,
                          grad,
                          mask,
                          np.int32(inh),
                          np.int32(inw),
                          np.int32(outh),
                          np.int32(outw),
                          np.int32(maps),
                          np.int32(hstride),
                          np.int32(wstride),
                          np.int32(hpad),
                          np.int32(wpad),
                          np.int32(fh),
                          np.int32(fw),
                          np.int32(size),
                          block=block,
                          grid=grid)

    return ingrad
Example 26
def prelu(data, slopes, inplace=False, sharedMaps=False, allocator=memPool):
	assert data.dtype == slopes.dtype and slopes.dtype == np.float32
	assert slopes.shape == (1, ) if sharedMaps else data.shape[1] == slopes.shape[0]

	outdata = data if inplace else GPUArray.empty(data.shape, dtype=np.float32, allocator=allocator)

	mapsize = prod(data.shape[2:])
	size = prod(data.shape)

	block = (nthreads, 1, 1)
	grid = (roundUpDiv(size, nthreads), 1, 1)

	divFactor = data.shape[1] if sharedMaps else 1

	mod.prelu(
		outdata, data, slopes, np.int32(divFactor), np.int32(mapsize), np.int32(data.shape[1]), np.int32(size),
		block=block, grid=grid
	)

	return outdata
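
A usage sketch with one hypothetical slope per feature map; with sharedMaps=True a single slope of shape (1, ) would be broadcast instead:

data = GPUArray.toGpu(np.random.randn(4, 8, 16, 16).astype(np.float32))
slopes = GPUArray.toGpu(np.full((8, ), 0.25, dtype=np.float32))

outdata = prelu(data, slopes)  # positives pass through, negatives are scaled by their map's slope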
Example 27
def preluBackwardData(grad, slopes, indata, sharedMaps=False, allocator=memPool):
	assert grad.dtype == slopes.dtype and slopes.dtype == indata.dtype and indata.dtype == np.float32
	assert grad.shape == indata.shape
	assert slopes.shape == (1, ) if sharedMaps else grad.shape[1] == slopes.shape[0]

	ingrad = GPUArray.empty(grad.shape, dtype=np.float32, allocator=allocator)

	mapsize = prod(grad.shape[2:])
	size = prod(grad.shape)

	block = (nthreads, 1, 1)
	grid = (roundUpDiv(size, nthreads), 1, 1)

	divFactor = grad.shape[1] if sharedMaps else 1

	mod.preluBackwardData(
		ingrad, grad, slopes, indata, np.int32(divFactor), np.int32(mapsize), np.int32(grad.shape[1]),
		np.int32(size), block=block, grid=grid
	)

	return ingrad
Example 28
def addVecToMat(vec, mat, axis=0, out=None, allocator=memPool):
    assert vec.dtype == mat.dtype and (mat.dtype == np.float32
                                       or mat.dtype == np.float16)
    assert vec.ndim == mat.ndim - 1 and 0 <= axis < 2

    assert mat.shape[:-2] == vec.shape[:-1]
    out = GPUArray.empty(mat.shape, dtype=mat.dtype,
                         allocator=allocator) if out is None else out

    z = prod(mat.shape[:-2])
    n, m = mat.shape[-2:]

    block = (warpSize, warpSize, 1)
    grid = (roundUpDiv(m, block[0]), roundUpDiv(n, block[1]), z)

    if axis == 1:
        if mat.dimAt(-1) == vec.dimAt(-1):
            fn = addmod.opRowVecToMat if mat.dtype == np.float32 else addmod.opRowVecToMatFP16
            fn(out, vec, mat, np.int32(n), np.int32(m), block=block, grid=grid)

        else:
            assert mat.dimAt(-1) % vec.dimAt(-1) == 0

            fn = addmod.opRowOneVecToMat if mat.dtype == np.float32 else addmod.opRowOneVecToMatFP16
            fn(out,
               vec,
               mat,
               np.int32(n),
               np.int32(m),
               np.int32(vec.dimAt(-1)),
               block=block,
               grid=grid)

    else:
        fn = addmod.opColVecToMat if mat.dtype == np.float32 else addmod.opColVecToMatFP16
        fn(out, vec, mat, np.int32(n), np.int32(m), block=block, grid=grid)

    return out
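
Hypothetical calls for both broadcast directions:

mat = GPUArray.toGpu(np.random.randn(4, 5).astype(np.float32))
row = GPUArray.toGpu(np.random.randn(5).astype(np.float32))
col = GPUArray.toGpu(np.random.randn(4).astype(np.float32))

out = addVecToMat(row, mat, axis=1)  # length-m row vector added to every row
out = addVecToMat(col, mat, axis=0)  # length-n column vector, one value per row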
Example 29
def randomTest():
    data = GPUArray.empty((100, ), dtype=np.float32)

    fillUniform(data, minval=-1.0, maxval=1.0)
    fillNormal(data, mean=1.0, stddev=0.1)