def reflectpad1dTest(dtype):
    """Validate 1D reflection padding and its gradient against NumPy slicing.

    :param dtype: NumPy dtype used for the forward input tensor.
    """
    batchsize, maps, insize = 4, 8, 48
    lpad, rpad = 2, 3

    hostData = np.random.randn(batchsize, maps, insize).astype(dtype)
    data = GPUArray.toGpu(hostData)

    outdata = reflectpad(data, pad=(lpad, rpad))
    hostOutData = outdata.get()
    outsize = hostOutData.shape[2]

    # Forward pass: the centre must be a copy of the input, while each border,
    # read backwards, must equal the samples next to the edge (edge excluded).
    centre = hostOutData[:, :, lpad:insize + lpad]
    leftBorder = hostOutData[:, :, :lpad]
    rightBorder = hostOutData[:, :, insize + lpad:]

    assert np.allclose(centre, hostData)
    assert np.allclose(leftBorder[:, :, ::-1], hostData[:, :, 1:lpad + 1])
    assert np.allclose(rightBorder[:, :, ::-1], hostData[:, :, insize - 1 - rpad:insize - 1])

    # NOTE(review): the gradient is always float32, regardless of `dtype` -
    # confirm that is intended for the non-float32 runs.
    hostGrad = np.random.randn(batchsize, maps, outsize).astype(np.float32)
    grad = GPUArray.toGpu(hostGrad)

    ingrad = reflectpadBackward(grad, pad=(lpad, rpad))
    hostInGrad = ingrad.get()

    # Backward pass: positions untouched by the reflection receive the plain
    # gradient ...
    interior = hostGrad[:, :, 2 * lpad + 1:outsize - 2 * rpad - 1]
    assert np.allclose(hostInGrad[:, :, lpad + 1:insize - rpad - 1], interior)

    # ... and mirrored positions receive their own gradient plus the reversed
    # gradient of the border they were copied into.
    leftExpected = hostGrad[:, :, :lpad][:, :, ::-1] + hostGrad[:, :, lpad + 1:2 * lpad + 1]
    assert np.allclose(hostInGrad[:, :, 1:lpad + 1], leftExpected)

    rightExpected = (
        hostGrad[:, :, outsize - rpad:][:, :, ::-1] +
        hostGrad[:, :, outsize - 2 * rpad - 1:outsize - rpad - 1]
    )
    assert np.allclose(hostInGrad[:, :, insize - rpad - 1:insize - 1], rightExpected)
def upsample3dNearestTest():
    """Compare nearest-neighbour 3D upsampling (forward and backward) with a host reference."""
    batchsize, maps, ind, inh, inw = 4, 2, 3, 5, 3
    scale = 2

    hostData = np.random.randn(batchsize, maps, ind, inh, inw).astype(np.float32)
    data = GPUArray.toGpu(hostData)

    outdata = upsample3d(data, scale, mode="nearest")

    # Reference forward: every input voxel is replicated into a scale^3 cube.
    hostOutData = np.empty(outdata.shape, dtype=np.float32)
    for b, c, z, y, x in np.ndindex(batchsize, maps, ind, inh, inw):
        cube = (
            b, c,
            slice(z * scale, (z + 1) * scale),
            slice(y * scale, (y + 1) * scale),
            slice(x * scale, (x + 1) * scale),
        )
        hostOutData[cube] = hostData[b, c, z, y, x]

    assert np.allclose(hostOutData, outdata.get())

    hostGrad = np.random.randn(*outdata.shape).astype(np.float32)
    grad = GPUArray.toGpu(hostGrad)

    ingrad = upsample3dBackward(grad, scale)

    # Reference backward: each input voxel accumulates the gradient of its cube.
    hostInGrad = np.zeros(data.shape, dtype=np.float32)
    for b, c, z, y, x in np.ndindex(batchsize, maps, ind, inh, inw):
        for dz, dy, dx in itertools.product(range(scale), range(scale), range(scale)):
            hostInGrad[b, c, z, y, x] += hostGrad[b, c, z * scale + dz, y * scale + dy, x * scale + dx]

    assert np.allclose(hostInGrad, ingrad.get())
def softmax2dTest(dtype, atol):
    """Check ``context.softmaxNd`` and its backward pass on a 4D tensor.

    The host reference applies softmax over the maps axis independently for
    every (batch, y, x) position.

    :param dtype: NumPy dtype of the tensors under test.
    :param atol: absolute tolerance passed to ``np.allclose``.
    """
    batchsize, maps, h, w = 5, 8, 2, 3

    hostData = np.random.randn(batchsize, maps, h, w).astype(dtype)
    data = GPUArray.toGpu(hostData)

    outdata = context.softmaxNd(data)

    def hostSoftmax(tensor):
        # Shift by the maximum before exponentiating.
        e = np.exp(tensor - np.amax(tensor))
        return e / np.sum(e)

    hostOutData = np.empty(outdata.shape, dtype=dtype)
    for b, y, x in np.ndindex(batchsize, h, w):
        hostOutData[b, :, y, x] = hostSoftmax(hostData[b, :, y, x])

    assert np.allclose(hostOutData, outdata.get(), atol=atol)

    hostGrad = np.random.randn(*outdata.shape).astype(dtype)
    grad = GPUArray.toGpu(hostGrad)

    ingrad = context.softmaxNdBackward(grad, outdata)
    hostInGrad = np.empty(ingrad.shape, dtype=dtype)

    def hostSoftmaxBackward(d, gr):
        return d * (gr - np.dot(d, gr))

    for b, y, x in np.ndindex(batchsize, h, w):
        hostInGrad[b, :, y, x] = hostSoftmaxBackward(hostOutData[b, :, y, x], hostGrad[b, :, y, x])

    assert np.allclose(hostInGrad, ingrad.get(), atol=atol)
def svmTest():
    """Compare the L1 ``svm`` cost and gradient with a per-element host reference."""
    batchsize, size = 20, 4

    hostScores = np.random.randn(batchsize, size).astype(np.float32)
    hostLabels = np.random.randint(low=0, high=size, size=(batchsize, ), dtype=np.int32)

    scores = GPUArray.toGpu(hostScores)
    labels = GPUArray.toGpu(hostLabels)
    error, grad = svm(scores, labels, mode="l1")

    hostGrad = np.empty(grad.shape, dtype=np.float32)
    hostError = 0.0

    for b, n in np.ndindex(batchsize, size):
        # +1 for the labelled class, -1 for every other class.
        cls = 2 * (hostLabels[b] == n) - 1
        val = hostScores[b, n] * cls

        if val < 1:
            hostGrad[b, n] = cls / batchsize / size
        else:
            hostGrad[b, n] = 0.0

        hostError += max(0.0, 1.0 - val) / batchsize / size

    assert np.allclose(hostGrad, grad.get())
    assert np.isclose(hostError, error.get() / scores.shape[0])
def svm(scores, labels, mode, error=None, allocator=memPool):
    """Launch the SVM cost kernel selected by ``mode`` and return error and gradient.

    :param scores: float32 GPU tensor of class scores.
    :param labels: int32 GPU tensor of labels.
    :param mode: ``"l1"`` or ``"l2"``; any other key raises ``KeyError``.
    :param error: optional scalar GPU array to receive the error; it is
        zero-filled before the launch. Allocated when omitted.
    :param allocator: allocator used for the arrays created here.
    :return: ``(error, grad)`` where ``grad`` has the shape of ``scores``.
    """
    assert scores.dtype == np.float32 and labels.dtype == np.int32

    shape = scores.shape
    grad = GPUArray.empty(shape, dtype=np.float32, allocator=allocator)

    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    error.fill(0.0)

    total = prod(scores.shape)
    spatialDim = prod(scores.shape[2:])
    mapStride = spatialDim * scores.shape[1]

    # One-dimensional launch sized to cover every score element.
    block = (nthreads, 1, 1)
    grid = (roundUpDiv(total, nthreads), 1, 1)

    kernels = {"l1": svmL1Mod, "l2": svmL2Mod}
    mod = kernels[mode]

    mod.cost(
        scores, labels, np.int32(total), np.int32(mapStride), np.int32(spatialDim),
        np.int32(shape[1]), np.int32(shape[0]), error, grad, block=block, grid=grid
    )

    return error, grad
def upsample2dBackward(grad, scale, mode="nearest", allocator=memPool):
    """Compute the input gradient of a 2D upsampling operation.

    :param grad: 4D GPU gradient of the upsampled output, unpacked as
        ``(batchsize, maps, outh, outw)``.
    :param scale: a single ``int`` applied to both spatial axes, or a pair
        ``(hscale, wscale)``.
    :param mode: ``"nearest"`` or ``"linear"``.
    :param allocator: allocator used for the returned array.
    :return: GPU array of shape ``(batchsize, maps, outh // hscale, outw // wscale)``.
    :raises NotImplementedError: for any other ``mode``.
    """
    batchsize, maps, outh, outw = grad.shape
    hscale, wscale = (scale, scale) if isinstance(scale, int) else scale

    inh, inw = outh // hscale, outw // wscale

    if mode == "nearest":
        ingrad = GPUArray.empty((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

        # Flat launch sized to cover every element of the input gradient.
        blk = warpSize * 8
        block = (blk, 1, 1)
        grid = (roundUpDiv(ingrad.size, blk), 1, 1)

        nearestMod.upsample2dNearestBackward(
            ingrad, grad, np.int32(inw), np.int32(outw), np.int32(hscale), np.int32(wscale),
            np.int32(ingrad.size), block=block, grid=grid
        )

    elif mode == "linear":
        # Zero-initialised, unlike the nearest branch - presumably because the
        # kernel accumulates into it; verify against the kernel source.
        ingrad = GPUArray.zeros((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

        block = (warpSize, nthreads // warpSize, 1)
        grid = (roundUpDiv(outw, block[0]), roundUpDiv(outh, block[1]), 1)

        # Input/output coordinate ratios. An output extent of 1 would divide
        # by zero (the input extent is then 1 as well), so use ratio 0 there.
        rh = (inh - 1) / (outh - 1) if outh > 1 else 0.0
        rw = (inw - 1) / (outw - 1) if outw > 1 else 0.0

        linearMod.upsample2dLinearBackward(
            ingrad, grad, np.int32(batchsize), np.int32(maps), np.int32(inh), np.int32(inw),
            np.int32(outh), np.int32(outw), np.float32(rh), np.float32(rw), block=block, grid=grid
        )

    else:
        raise NotImplementedError(mode)

    return ingrad
def upsample2dNearestTest():
    """Compare nearest-neighbour 2D upsampling (forward and backward) with a host reference."""
    batchsize, maps, inh, inw = 1, 2, 16, 15
    scale = 2

    hostData = np.random.uniform(low=-1.0, high=1.0, size=(batchsize, maps, inh, inw)).astype(np.float32)
    data = GPUArray.toGpu(hostData)

    outdata = upsample2d(data, scale, mode="nearest")

    # Reference forward: every input pixel fills a scale x scale tile.
    hostOutData = np.empty(outdata.shape, dtype=np.float32)
    for b, c, y, x in np.ndindex(batchsize, maps, inh, inw):
        rows = slice(y * scale, (y + 1) * scale)
        cols = slice(x * scale, (x + 1) * scale)
        hostOutData[b, c, rows, cols] = hostData[b, c, y, x]

    assert np.allclose(hostOutData, outdata.get())

    hostGrad = np.random.randn(*outdata.shape).astype(np.float32)
    grad = GPUArray.toGpu(hostGrad)

    ingrad = upsample2dBackward(grad, scale)

    # Reference backward: every input pixel sums the gradient over its tile.
    hostInGrad = np.zeros(data.shape, dtype=np.float32)
    for b, c, y, x in np.ndindex(batchsize, maps, inh, inw):
        for dy, dx in itertools.product(range(scale), range(scale)):
            hostInGrad[b, c, y, x] += hostGrad[b, c, y * scale + dy, x * scale + dx]

    assert np.allclose(hostInGrad, ingrad.get(), atol=1e-5)
def vectorTest():
    """Check ``context.dot``, ``context.l1norm`` and ``context.l2norm`` against NumPy."""
    hostX = np.random.randn(5).astype(np.float32)
    hostY = np.random.randn(5).astype(np.float32)

    x = GPUArray.toGpu(hostX)
    y = GPUArray.toGpu(hostY)

    expectedDot = np.dot(hostX, hostY)
    assert np.isclose(context.dot(x, y), expectedDot)

    expectedL1 = np.linalg.norm(hostX, ord=1)
    assert np.isclose(context.l1norm(x), expectedL1)

    expectedL2 = np.linalg.norm(hostX, ord=2)
    assert np.isclose(context.l2norm(x), expectedL2)
def radixSortTest():
    """Sort random int32 keys on the GPU and compare with ``np.sort`` / ``np.argsort``."""
    hostKeys = np.random.randint(0, (1 << 31) - 1, size=(250, ), dtype=np.int32)
    # Values are the original positions, so the sorted values form a permutation.
    hostValues = np.arange(0, hostKeys.shape[0], dtype=np.int32)

    keys = GPUArray.toGpu(hostKeys)
    values = GPUArray.toGpu(hostValues)
    outkeys, outvalues = radixSort(keys, values)

    keysMatch = outkeys.get() == np.sort(hostKeys)
    assert keysMatch.all()

    valuesMatch = outvalues.get() == np.argsort(hostKeys)
    assert valuesMatch.all()
def maxpool3dTest(dtype, atol):
    """Check 3D max pooling (forward and backward) of ``context.poolNd`` on the host.

    :param dtype: NumPy dtype of the tensors under test.
    :param atol: absolute tolerance for the gradient comparison.
    """
    batchsize, maps, d, h, w = 1, 1, 6, 6, 6
    size, s, pad = 3, 2, 1

    # Index selecting the unpadded interior of the padded host volume.
    inner = (slice(None), slice(None)) + (slice(pad, -pad), ) * 3

    # The host copy carries an explicit border filled with the smallest finite
    # value of dtype, so border cells never win a window maximum.
    paddedShape = (batchsize, maps, d + 2 * pad, h + 2 * pad, w + 2 * pad)
    hostData = np.full(shape=paddedShape, fill_value=np.finfo(dtype).min, dtype=dtype)
    hostData[inner] = np.random.randn(batchsize, maps, d, h, w).astype(dtype)

    data = GPUArray.toGpu(np.ascontiguousarray(hostData[inner]))
    outdata = context.poolNd(data, size=size, stride=s, pad=pad, mode=CuDnn.POOL_MODE_MAX)

    hostOutData = np.empty(outdata.shape, dtype=dtype)
    outDims = hostOutData.shape[2:5]

    for b, c, z, y, x in np.ndindex(batchsize, maps, *outDims):
        z0, y0, x0 = z * s, y * s, x * s
        window = hostData[b, c, z0:z0 + size, y0:y0 + size, x0:x0 + size]
        hostOutData[b, c, z, y, x] = np.max(window)

    assert np.allclose(hostOutData, outdata.get())

    hostGrad = np.random.randn(*outdata.shape).astype(dtype)
    grad = GPUArray.toGpu(hostGrad)

    ingrad = context.poolNdBackward(grad, data, outdata, size=size, stride=s, pad=pad, mode=CuDnn.POOL_MODE_MAX)

    # Reference backward, accumulated in float32: every cell equal to its
    # window maximum receives that window's output gradient.
    hostInGrad = np.zeros(hostData.shape, dtype=np.float32)

    for b, c, z, y, x in np.ndindex(batchsize, maps, *outDims):
        for dz, dy, dx in itertools.product(range(size), range(size), range(size)):
            src = (b, c, z * s + dz, y * s + dy, x * s + dx)
            if hostData[src] == hostOutData[b, c, z, y, x]:
                hostInGrad[src] += hostGrad[b, c, z, y, x]

    hostInGrad = hostInGrad[inner].astype(dtype)
    assert np.allclose(hostInGrad, ingrad.get(), atol=atol)
def matvec(mat, vec, axis=0, out=None, alpha=1.0, beta=0.0, allocator=memPool):
    """Launch a (batched) matrix-vector kernel along the rows or columns of ``mat``.

    :param mat: float32 or float16 GPU array; the last two axes are the matrix.
    :param vec: GPU array of the same dtype with one axis fewer than ``mat``.
    :param axis: ``1`` selects the on-row kernel (result shape ``mat.shape[:-1]``),
        ``0`` the on-column kernel (result shape ``mat.shape[:-2] + (w,)``).
    :param out: optional pre-allocated result; its shape is asserted.
    :param alpha: scalar forwarded to the kernel as float32.
    :param beta: scalar forwarded to the kernel as float32.
    :param allocator: allocator used when ``out`` is created here.
    :return: the result array.
    """
    assert vec.dtype == mat.dtype and (mat.dtype == np.float32 or mat.dtype == np.float16)
    assert vec.ndim == mat.ndim - 1 and 0 <= axis < 2

    rows, cols = mat.shape[-2:]
    lead = mat.shape[:-2]

    if axis == 1:
        assert mat.dimAt(-1) == vec.dimAt(-1)
        outshape = mat.shape[:-1]

        # One warp-sized block per matrix row; grid z covers the batch.
        block = (warpSize, 1, 1)
        grid = (rows, 1, prod(lead))

        if out is None:
            out = GPUArray.zeros(outshape, dtype=mat.dtype, allocator=allocator)
        else:
            assert out.shape == outshape

        kernel = mulmod.vecMulOnRow if mat.dtype == np.float32 else mulmod.vecMulOnRowFP16

    else:
        # NOTE(review): unlike the row branch, the vector length is not
        # checked against the matrix here - confirm whether that is intended.
        outshape = lead + (cols, )

        block = (NT, 1, 1)
        grid = (roundUpDiv(cols, block[0]), 1, prod(lead))

        if out is None:
            out = GPUArray.zeros(outshape, dtype=mat.dtype, allocator=allocator)
        else:
            assert out.shape == outshape

        kernel = mulmod.vecMulOnCol if mat.dtype == np.float32 else mulmod.vecMulOnColFP16

    kernel(
        out, mat, vec, np.int32(cols), np.int32(rows), np.float32(alpha), np.float32(beta),
        block=block, grid=grid
    )
    return out
def segmentSeq(data):
    """Run the ``segmentSeq`` kernel over a 1D int32 GPU array.

    :param data: 1D int32 GPU array with at most ``NV`` elements.
    :return: ``(segments, indices)`` - an int32 array of shape ``(length, 3)``
        and an int32 array with the shape of ``data``, both filled by the kernel.
    """
    assert data.dtype == np.int32

    (length, ) = data.shape
    assert length <= NV

    segments = GPUArray.empty((length, 3), dtype=np.int32, allocator=memPool)
    indices = GPUArray.empty(data.shape, dtype=np.int32, allocator=memPool)

    # The whole sequence is handled by a single block of NT threads.
    segmentMod.segmentSeq(
        segments, indices, data, np.int32(length),
        block=(NT, 1, 1), grid=(1, 1, 1)
    )

    return segments, indices
def reflectpad(data, pad, allocator=memPool):
    """Apply reflection padding to a 3D or 4D GPU tensor.

    :param data: GPU array unpacked as ``(batchsize, maps, insize)`` or
        ``(batchsize, maps, inh, inw)``.
    :param pad: ``(lpad, rpad)`` for 3D input, ``(upad, bpad, lpad, rpad)`` for 4D.
    :param allocator: allocator used for the returned array.
    :return: the padded GPU array.
    :raises NotImplementedError: when ``data.ndim`` is neither 3 nor 4.
    """
    ndim = data.ndim
    if ndim not in (3, 4):
        raise NotImplementedError(ndim)

    if ndim == 3:
        batchsize, maps, insize = data.shape
        lpad, rpad = pad

        # Each pad must be strictly smaller than the axis it pads.
        assert insize >= max(lpad, rpad) + 1
        outsize = insize + lpad + rpad

        block = (warpSize, 1, 1)
        grid = (roundUpDiv(outsize, warpSize), maps, batchsize)

        outdata = GPUArray.empty((batchsize, maps, outsize), dtype=data.dtype, allocator=allocator)

        # Any dtype other than float32 is routed to the FP16 kernel.
        kernel = mod.reflectpad1d if data.dtype == np.float32 else mod.reflectpad1dFP16
        kernel(outdata, data, np.int32(insize), np.int32(lpad), np.int32(rpad), block=block, grid=grid)

        return outdata

    batchsize, maps, inh, inw = data.shape
    upad, bpad, lpad, rpad = pad

    assert inh >= max(upad, bpad) + 1 and inw >= max(lpad, rpad) + 1
    outh, outw = inh + upad + bpad, inw + lpad + rpad

    block = (warpSize, 1, 1)
    grid = (roundUpDiv(outh * outw, warpSize), maps, batchsize)

    outdata = GPUArray.empty((batchsize, maps, outh, outw), dtype=data.dtype, allocator=allocator)

    kernel = mod.reflectpad2d if data.dtype == np.float32 else mod.reflectpad2dFP16
    kernel(
        outdata, data, np.int32(inh), np.int32(inw), np.int32(upad), np.int32(bpad),
        np.int32(lpad), np.int32(rpad), block=block, grid=grid
    )

    return outdata
def radixSort(keys, values):
    """Run the ``radixSort`` kernel over 1D int32 keys and their values.

    :param keys: 1D int32 GPU array with at most ``NV`` elements.
    :param values: int32 GPU array of the same shape as ``keys``.
    :return: ``(outkeys, outvalues)`` - newly allocated arrays filled by the kernel.
    """
    assert keys.dtype == np.int32 and values.dtype == np.int32
    assert keys.shape == values.shape

    (length, ) = keys.shape
    assert length <= NV

    outkeys = GPUArray.empty(keys.shape, dtype=keys.dtype, allocator=memPool)
    outvalues = GPUArray.empty(values.shape, dtype=values.dtype, allocator=memPool)

    # The whole sort is performed by a single block of NT threads.
    radixMod.radixSort(
        outkeys, outvalues, keys, values, np.int32(length),
        block=(NT, 1, 1), grid=(1, 1, 1)
    )

    return outkeys, outvalues
def crossEntropy(scores, labels, weights=None, error=None, allocator=memPool):
    """Compute softmax cross-entropy error and gradient on the GPU.

    :param scores: float32 GPU tensor; inputs with fewer than 4 axes are
        reshaped with trailing singleton axes before the softmax.
    :param labels: int32 GPU tensor of labels.
    :param weights: optional weights; when given, the weighted kernel is used.
    :param error: optional scalar GPU array to receive the error; it is
        zero-filled before the launch. Allocated when omitted.
    :param allocator: allocator used for the arrays created here.
    :return: ``(error, grad)`` where ``grad`` has the original shape of ``scores``.
    """
    assert scores.dtype == np.float32 and labels.dtype == np.int32

    shape = scores.shape
    if scores.ndim < 4:
        missing = 4 - scores.ndim
        scores = scores.reshape(*shape, *([1] * missing))

    softmax = cudnn.softmaxNd(scores, mode=SoftMaxMode.spatial.value, allocator=allocator)

    # The gradient keeps the caller's shape, not the padded 4D one.
    grad = GPUArray.empty(shape, dtype=np.float32, allocator=allocator)

    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    error.fill(0.0)

    total = prod(scores.shape)
    spatialDim = prod(scores.shape[2:])
    mapStride = spatialDim * scores.shape[1]

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(total, nthreads), 1, 1)

    if weights is not None:
        wceMod.cost(
            softmax, labels, weights, np.int32(total), np.int32(mapStride), np.int32(spatialDim),
            np.int32(shape[1]), np.int32(shape[0]), error, grad, block=block, grid=grid
        )
    else:
        ceMod.cost(
            softmax, labels, np.int32(total), np.int32(mapStride), np.int32(spatialDim),
            np.int32(scores.shape[1]), np.int32(scores.shape[0]), error, grad, block=block, grid=grid
        )

    return error, grad
def reflectpadBackward(grad, pad, allocator=memPool):
    """Compute the input gradient of reflection padding for 3D or 4D tensors.

    :param grad: GPU gradient of the padded output, unpacked as
        ``(batchsize, maps, outsize)`` or ``(batchsize, maps, outh, outw)``.
    :param pad: ``(lpad, rpad)`` for 3D input, ``(upad, bpad, lpad, rpad)`` for 4D.
    :param allocator: allocator used for the returned array.
    :return: zero-initialised GPU array of the unpadded shape, filled by the kernel.
    :raises NotImplementedError: when ``grad.ndim`` is neither 3 nor 4.
    """
    ndim = grad.ndim
    if ndim not in (3, 4):
        raise NotImplementedError(ndim)

    if ndim == 3:
        batchsize, maps, outsize = grad.shape
        lpad, rpad = pad

        # The launch covers the padded extent.
        block = (warpSize, 1, 1)
        grid = (roundUpDiv(outsize, warpSize), maps, batchsize)

        insize = outsize - lpad - rpad
        ingrad = GPUArray.zeros((batchsize, maps, insize), dtype=grad.dtype, allocator=allocator)

        # Any dtype other than float32 is routed to the FP16 kernel.
        kernel = mod.reflectpad1dBackward if grad.dtype == np.float32 else mod.reflectpad1dBackwardFP16
        kernel(ingrad, grad, np.int32(insize), np.int32(lpad), np.int32(rpad), block=block, grid=grid)

        return ingrad

    batchsize, maps, outh, outw = grad.shape
    upad, bpad, lpad, rpad = pad

    inh, inw = outh - upad - bpad, outw - lpad - rpad

    block = (warpSize, 1, 1)
    grid = (roundUpDiv(outh * outw, warpSize), maps, batchsize)

    ingrad = GPUArray.zeros((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

    kernel = mod.reflectpad2dBackward if grad.dtype == np.float32 else mod.reflectpad2dBackwardFP16
    kernel(
        ingrad, grad, np.int32(inh), np.int32(inw), np.int32(upad), np.int32(bpad),
        np.int32(lpad), np.int32(rpad), block=block, grid=grid
    )

    return ingrad
def unittest():
    """Compare ``ctcLoss`` (error, gradient and alphas) with the host ``ctcLossTest``."""
    times, batchsize, vocabsize = 20, 3, 6
    blank = 0

    hostData, hostDataLen, hostLabels, lengths = createData(times, batchsize, vocabsize)

    data = GPUArray.toGpu(hostData)
    datalen = GPUArray.toGpu(hostDataLen)
    labels = GPUArray.toGpu(hostLabels)

    error, grad, alphas = ctcLoss(data, datalen, labels, lengths, blank, returnAlphas=True)
    hostError, hostGrad, hostAlphas = ctcLossTest(hostData, hostDataLen, hostLabels, lengths, blank)

    assert np.allclose(hostAlphas, alphas.get())
    assert np.isclose(hostError, error.get())
    assert np.allclose(hostGrad, grad.get(), atol=1e-5)
def batchNorm3dTest(dtype, atol):
    """Check 3D batch normalisation: training forward, backward and test-mode forward.

    :param dtype: NumPy dtype of the data and gradient tensors.
    :param atol: absolute tolerance passed to ``np.allclose``.
    """
    batchsize, maps, d, h, w = 2, 5, 2, 3, 2
    epsilon, norm = 1e-5, batchsize * d * h * w

    # Statistics are reduced over every axis except the maps axis.
    axes = (0, 2, 3, 4)

    hostData = np.random.randn(batchsize, maps, d, h, w).astype(dtype)
    hostScale = np.random.randn(1, maps, 1, 1, 1).astype(np.float32)
    hostBias = np.random.randn(1, maps, 1, 1, 1).astype(np.float32)

    data = GPUArray.toGpu(hostData)
    scale = GPUArray.toGpu(hostScale.ravel())
    bias = GPUArray.toGpu(hostBias.ravel())

    mean = GPUArray.zeros(scale.shape, dtype=np.float32)
    var = GPUArray.toGpu(np.ones(scale.shape, dtype=np.float32))

    # Training-mode forward, written in place into `data`.
    outdata, savemean, saveinvvar = context.batchNormNd(data, mean, var, scale, bias, epsilon=epsilon, out=data)

    hostMean = np.sum(hostData, axis=axes, dtype=np.float32, keepdims=True) / norm
    centered = hostData - hostMean

    hostInvVar = np.sum(centered ** 2, axis=axes, dtype=np.float32, keepdims=True) / norm
    hostInvVar = 1.0 / np.sqrt(hostInvVar + epsilon)

    hostNormData = centered * hostInvVar
    hostOutData = (hostNormData * hostScale + hostBias).astype(dtype)

    assert np.allclose(hostMean.ravel(), mean.get(), atol=atol)
    assert np.allclose(hostInvVar.ravel(), saveinvvar.get(), atol=atol)
    assert np.allclose(hostOutData, outdata.get(), atol=atol)

    hostGrad = np.random.randn(*outdata.shape).astype(dtype)

    # `data` was overwritten by the in-place forward, so upload the input again.
    grad = GPUArray.toGpu(hostGrad)
    data = GPUArray.toGpu(hostData)

    ingrad, scalegrad, biasgrad = context.batchNormNdBackward(
        grad, data, scale, savemean, saveinvvar, epsilon=epsilon
    )

    hostScaleGrad = np.sum(hostGrad * hostNormData, axis=axes, dtype=np.float32, keepdims=True)
    hostBiasGrad = np.sum(hostGrad, axis=axes, dtype=np.float32, keepdims=True)

    hostMeanGrad = -hostInvVar * hostBiasGrad * hostScale

    hostVarGrad = np.sum(hostGrad * centered, axis=axes, dtype=np.float32, keepdims=True)
    hostVarGrad = -0.5 * hostVarGrad * hostScale * hostInvVar**3

    hostInGrad = hostGrad * hostScale * hostInvVar + (2 * hostVarGrad * centered + hostMeanGrad) / norm
    hostInGrad = hostInGrad.astype(dtype)

    assert np.allclose(hostInGrad, ingrad.get(), atol=atol)
    assert np.allclose(hostScaleGrad.ravel(), scalegrad.get(), atol=atol)
    assert np.allclose(hostBiasGrad.ravel(), biasgrad.get(), atol=atol)

    # Test-mode forward with externally supplied statistics (variance >= 1).
    hostMean = np.random.randn(*hostMean.shape).astype(np.float32)
    hostVar = 1.0 + np.random.randn(*hostInvVar.shape).astype(np.float32)**2

    mean = GPUArray.toGpu(hostMean.ravel())
    var = GPUArray.toGpu(hostVar.ravel())

    outdata = context.batchNormNd(data, mean, var, scale, bias, test=True)

    hostOutData = ((hostData - hostMean) / np.sqrt(hostVar + epsilon) * hostScale + hostBias).astype(dtype)
    assert np.allclose(hostOutData, outdata.get(), atol=atol)
def gbpGbpTest(dtype, atol):
    """Check ``context.gemmBatched`` with every operand in ``GROUPFORMAT_GBP`` layout.

    Covers the plain product and the ``transpA`` / ``transpB`` variants, each
    compared group by group with ``np.dot``.

    :param dtype: NumPy dtype of the matrices.
    :param atol: absolute tolerance passed to ``np.allclose``.
    """
    formatA = formatB = formatOut = CuBlas.GROUPFORMAT_GBP
    groups = 3

    hostA = np.random.randn(groups, 4, 3).astype(dtype)
    hostB = np.random.randn(groups, hostA.shape[2], 5).astype(dtype)
    hostC = np.random.randn(groups, hostA.shape[1], 6).astype(dtype)
    hostD = np.random.randn(groups, 8, hostC.shape[2]).astype(dtype)

    A = GPUArray.toGpu(hostA)
    B = GPUArray.toGpu(hostB)
    C = GPUArray.toGpu(hostC)
    D = GPUArray.toGpu(hostD)

    # A @ B
    out = context.gemmBatched(A, B, formatA=formatA, formatB=formatB, formatOut=formatOut)

    hostOut = np.empty(out.shape, dtype=dtype)
    for i, (a, b) in enumerate(zip(hostA, hostB)):
        np.dot(a, b, out=hostOut[i])

    assert np.allclose(hostOut, out.get(), atol=atol)

    # C^T @ A
    out = context.gemmBatched(C, A, formatA=formatA, formatB=formatB, formatOut=formatOut, transpA=True)

    hostOut = np.empty(out.shape, dtype=dtype)
    for i, (c, a) in enumerate(zip(hostC, hostA)):
        np.dot(c.T, a, out=hostOut[i])

    assert np.allclose(hostOut, out.get(), atol=atol)

    # C @ D^T
    out = context.gemmBatched(C, D, formatA=formatA, formatB=formatB, formatOut=formatOut, transpB=True)

    hostOut = np.empty(out.shape, dtype=dtype)
    for i, (c, dmat) in enumerate(zip(hostC, hostD)):
        np.dot(c, dmat.T, out=hostOut[i])

    assert np.allclose(hostOut, out.get(), atol=atol)
def bgpBgpTest(dtype, atol):
    """Check ``context.gemmBatched`` with BGP-layout inputs and a GBP-layout output.

    Covers the plain product and the ``transpA`` / ``transpB`` variants, each
    compared group by group with ``np.dot``.

    :param dtype: NumPy dtype of the matrices.
    :param atol: absolute tolerance passed to ``np.allclose``.
    """
    formatA, formatB = CuBlas.GROUPFORMAT_BGP, CuBlas.GROUPFORMAT_BGP
    formatOut = CuBlas.GROUPFORMAT_GBP
    groups = 3

    # Host operands keep the group index on the middle axis.
    hostA = np.random.randn(4, groups, 7).astype(dtype)
    hostB = np.random.randn(hostA.shape[2], groups, 5).astype(dtype)
    hostC = np.random.randn(hostA.shape[0], groups, hostB.shape[2]).astype(dtype)

    A = GPUArray.toGpu(hostA)
    B = GPUArray.toGpu(hostB)
    C = GPUArray.toGpu(hostC)

    # A @ B
    out = context.gemmBatched(A, B, formatA=formatA, formatB=formatB, formatOut=formatOut)

    hostOut = np.empty(out.shape, dtype=dtype)
    for g in range(groups):
        lhs, rhs = hostA[:, g, :], hostB[:, g, :]
        np.dot(lhs, rhs, out=hostOut[g])

    assert np.allclose(hostOut, out.get(), atol=atol)

    # A^T @ C
    out = context.gemmBatched(A, C, formatA=formatA, formatB=formatB, formatOut=formatOut, transpA=True)

    hostOut = np.empty(out.shape, dtype=dtype)
    for g in range(groups):
        lhs, rhs = hostA[:, g, :].T, hostC[:, g, :]
        np.dot(lhs, rhs, out=hostOut[g])

    assert np.allclose(hostOut, out.get(), atol=atol)

    # B @ C^T
    out = context.gemmBatched(B, C, formatA=formatA, formatB=formatB, formatOut=formatOut, transpB=True)

    hostOut = np.empty(out.shape, dtype=dtype)
    for g in range(groups):
        lhs, rhs = hostB[:, g, :], hostC[:, g, :].T
        np.dot(lhs, rhs, out=hostOut[g])

    assert np.allclose(hostOut, out.get(), atol=atol)
def matsum(tensor, axis=0, out=None, alpha=1.0, beta=0.0, allocator=memPool):
    """Launch a summation kernel that removes one axis of ``tensor``.

    :param tensor: float32 or float16 GPU array.
    :param axis: axis to reduce; the last axis uses the on-row kernel, any
        other axis the on-column kernel.
    :param out: optional pre-allocated result; its shape is asserted to be
        the shape of ``tensor`` with ``axis`` removed.
    :param alpha: scalar forwarded to the kernel as float32.
    :param beta: scalar forwarded to the kernel as float32.
    :param allocator: allocator used when ``out`` is created here.
    :return: the result array.
    """
    assert tensor.dtype == np.float32 or tensor.dtype == np.float16
    assert 0 <= axis < tensor.ndim

    if axis == tensor.ndim - 1:
        outshape = tensor.shape[:-1]

        # One warp-sized block per row of the flattened leading axes.
        block = (warpSize, 1, 1)
        grid = (prod(outshape), 1, 1)

        if out is None:
            out = GPUArray.zeros(outshape, dtype=tensor.dtype, allocator=allocator)
        else:
            assert out.shape == outshape

        kernel = summod.sumOnRow if tensor.dtype == np.float32 else summod.sumOnRowFP16
        kernel(
            out, tensor, np.int32(tensor.dimAt(-1)), np.float32(alpha), np.float32(beta),
            block=block, grid=grid
        )

    else:
        # Collapse the axes before `axis` into z and those after it into width.
        z, width = prod(tensor.shape[:axis]), prod(tensor.shape[axis + 1:])
        outshape = tensor.shape[:axis] + tensor.shape[axis + 1:]

        block = (NT, 1, 1)
        grid = (roundUpDiv(width, block[0]), 1, z)

        if out is None:
            out = GPUArray.zeros(outshape, dtype=tensor.dtype, allocator=allocator)
        else:
            assert out.shape == outshape

        kernel = summod.sumOnCol if tensor.dtype == np.float32 else summod.sumOnColFP16
        kernel(
            out, tensor, np.int32(width), np.int32(tensor.dimAt(axis)), np.float32(alpha), np.float32(beta),
            block=block, grid=grid
        )

    return out
def rescale(data, scale, memoryType, interpolation=InterpolationMode.nn, outdata=None, allocator=memPool):
    """Resize an image on the GPU with ``libnpp.nppiResizeSqrPixel``.

    :param data: 3D GPU image, or 2D when ``memoryType`` is grayscale.
    :param scale: one number applied to both axes, or a pair ``(hscale, wscale)``.
    :param memoryType: ``MemoryType`` member describing the image layout.
    :param interpolation: ``InterpolationMode`` member; its ``value`` is passed to NPP.
    :param outdata: optional pre-allocated output; its shape is asserted.
    :param allocator: allocator used when ``outdata`` is created here.
    :return: the resized GPU image.
    """
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3

    if isinstance(scale, (int, float)):
        hscale, wscale = scale, scale
    else:
        hscale, wscale = scale

    inrect = getDataRect(data, memoryType)
    insize = (inrect[2], inrect[3])
    inline = getMemoryTypeLineSize(inrect[2], data.dtype, memoryType)

    # NPP takes the horizontal factor first.
    outrect = libnpp.nppiGetResizeRect(inrect, wscale, hscale, 0, 0, interpolation.value)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    outshape = getOutDataShape(data, outrect, memoryType)

    if outdata is not None:
        assert outdata.shape == outshape
    else:
        outdata = GPUArray.empty(outshape, dtype=data.dtype, allocator=allocator)

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)

    libnpp.nppiResizeSqrPixel(
        getDataType(data).value, memoryType.value, dataPtr, insize, inline, inrect,
        outdataPtr, outline, outrect, wscale, hscale, 0, 0, interpolation.value
    )

    return outdata
def batchSpeedTest(dtype):
    """Time the batched ``addVecToMat``, ``argmax`` and ``matsum`` kernels.

    :param dtype: NumPy dtype of the benchmark tensors; also used in the log names.
    """
    from PuzzleLib.Cuda.Benchmarks.Utils import timeKernel

    A = GPUArray.toGpu(np.random.randn(32, 128, 128).astype(dtype))
    v = GPUArray.toGpu(np.random.randn(32, 128).astype(dtype))

    # (kernel, positional arguments, log-name template), timed in this order.
    benchmarks = (
        (addVecToMat, (v, A, 1, A), "%s batched addVecToMat on rows"),
        (addVecToMat, (v, A, 0, A), "%s batched addVecToMat on cols"),
        (argmax, (A, 2), "%s batched argmax on rows"),
        (argmax, (A, 1), "%s batched argmax on cols"),
        (matsum, (A, 2), "%s batched matsum on rows"),
        (matsum, (A, 1), "%s batched matsum on cols"),
    )

    for kernel, args, template in benchmarks:
        timeKernel(kernel, args, logname=template % dtype)
def concatenate(tup, axis, out=None, allocator=memoryPool):
    """Concatenate GPU arrays along ``axis`` using pitched 2D device copies.

    :param tup: non-empty sequence of GPU arrays sharing dtype and every
        dimension except ``axis``.
    :param axis: axis to join along; negative values count from the last axis.
    :param out: optional pre-allocated destination; shape and dtype are asserted.
    :param allocator: allocator used when ``out`` is created here.
    :return: the concatenated GPU array.
    """
    ary = tup[0]
    ndim = len(ary.shape)

    # Normalise a negative axis. Left negative, the slice arithmetic below
    # (`shape[:axis] + shape[axis + 1:]`) and the `axis > 0` pitch selection
    # would silently build a wrong shape and copy layout.
    if axis < 0:
        axis += ndim

    assert 0 <= axis < ndim

    dtype, reducedShape = ary.dtype, ary.shape
    reducedShape = reducedShape[:axis] + reducedShape[axis + 1:]

    assert all(a.dtype == dtype and a.shape[:axis] + a.shape[axis + 1:] == reducedShape for a in tup[1:])

    concatDim = sum(a.dimAt(axis) for a in tup)
    shape = reducedShape[:axis] + (concatDim, ) + reducedShape[axis:]

    if out is None:
        out = GPUArray.empty(shape, dtype=dtype, allocator=allocator)
    else:
        assert out.shape == shape and out.dtype == dtype

    # View every array as `height` rows: one row per index combination of the
    # axes before `axis`. For axis 0 the whole buffer is a single row.
    dstPitch = out.strideAt(axis - 1) if axis > 0 else out.nbytes
    height = prod(shape[:axis])

    # Running offset inside a destination row (presumably in bytes, as it is
    # accumulated from strides / nbytes - confirm against Driver.memcpy2D).
    stride = 0

    for a in tup:
        srcPitch = width = a.strideAt(axis - 1) if axis > 0 else a.nbytes

        Driver.memcpy2D(width, height, a.gpudata, srcPitch, out.gpudata, dstPitch, dstX=stride)
        stride += width

    return out
def split(ary, sections, axis, allocator=memoryPool):
    """Split a GPU array along ``axis`` into pieces of the given sizes.

    :param ary: source GPU array.
    :param sections: sizes of the pieces along ``axis``; they must sum to
        ``ary.shape[axis]``.
    :param axis: axis to split along; negative values count from the last axis.
    :param allocator: allocator used for the returned arrays.
    :return: list of newly allocated GPU arrays, one per section.
    """
    shape = ary.shape
    ndim = len(shape)

    # Normalise a negative axis. Left negative, the slice arithmetic below
    # (`shape[:axis] + (sec,) + shape[axis + 1:]`) and the `axis > 0` pitch
    # selection would silently build wrong shapes and copy layouts.
    if axis < 0:
        axis += ndim

    assert 0 <= axis < ndim
    assert sum(sections) == shape[axis]

    outs = [
        GPUArray.empty(shape[:axis] + (sec, ) + shape[axis + 1:], dtype=ary.dtype, allocator=allocator)
        for sec in sections
    ]

    # View the source as `height` rows: one row per index combination of the
    # axes before `axis`. For axis 0 the whole buffer is a single row.
    srcPitch = ary.strideAt(axis - 1) if axis > 0 else ary.nbytes
    height = prod(shape[:axis])

    # Running offset inside a source row (presumably in bytes, as it is
    # accumulated from strides / nbytes - confirm against Driver.memcpy2D).
    stride = 0

    for out in outs:
        dstPitch = width = out.strideAt(axis - 1) if axis > 0 else out.nbytes

        Driver.memcpy2D(width, height, ary.gpudata, srcPitch, out.gpudata, dstPitch, srcX=stride)
        stride += width

    return outs
def maxunpool2d(data, origshape, mask, allocator=memPool):
    """Launch the ``maxunpool2d`` kernel to scatter pooled values into a larger map.

    :param data: float32 GPU tensor unpacked as ``(batchsize, maps, inh, inw)``.
    :param origshape: shape whose entries 2 and 3 give the output height and width.
    :param mask: GPU array forwarded to the kernel (its dtype is not checked
        here - presumably the int32 pooling mask; verify against the caller).
    :param allocator: allocator used for the returned array.
    :return: zero-initialised float32 GPU array of shape
        ``(batchsize, maps, outh, outw)``, filled by the kernel.
    """
    assert data.dtype == np.float32

    batchsize, maps, inh, inw = data.shape
    outh, outw = origshape[2], origshape[3]

    outdata = GPUArray.zeros((batchsize, maps, outh, outw), dtype=np.float32, allocator=allocator)

    # Flat launch sized to cover every element of the pooled input.
    total = prod(data.shape)
    block = (nthreads, 1, 1)
    grid = (roundUpDiv(total, nthreads), 1, 1)

    mod.maxunpool2d(
        outdata, data, mask, np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw),
        np.int32(maps), np.int32(total), block=block, grid=grid
    )

    return outdata
def warpAffinePoints(data, inpoints, outpoints, memoryType, outshape=None, interpolation=InterpolationMode.nn,
                     cval=0, clip=True, allocator=memPool):
    """Warp an image with ``libnpp.nppiWarpAffineQuad`` using point correspondences.

    :param data: 3D GPU image, or 2D when ``memoryType`` is grayscale.
    :param inpoints: source points passed to ``genAffineQuads``.
    :param outpoints: destination points passed to ``genAffineQuads``.
    :param memoryType: ``MemoryType`` member describing the image layout.
    :param outshape: shape of the result; defaults to ``data.shape``.
    :param interpolation: ``InterpolationMode`` member; its ``value`` is passed to NPP.
    :param cval: value the output is filled with before warping.
    :param clip: flag forwarded to ``genAffineQuads``.
    :param allocator: allocator used for the returned array.
    :return: the warped GPU image.
    """
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3

    inrect = getDataRect(data, memoryType)
    insize = (inrect[2], inrect[3])
    inline = getMemoryTypeLineSize(inrect[2], data.dtype, memoryType)

    if outshape is None:
        outshape = data.shape

    outrect = getOutDataRect(data, outshape, memoryType)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    # Pre-fill the output with the constant value before the warp is applied.
    outdata = GPUArray.empty(outshape, dtype=data.dtype, allocator=allocator)
    outdata.fill(cval)

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)
    srcQuad, dstQuad = genAffineQuads(inpoints, outpoints, clip, inrect)

    libnpp.nppiWarpAffineQuad(
        getDataType(data).value, memoryType.value, dataPtr, insize, inline, inrect, srcQuad,
        outdataPtr, outline, outrect, dstQuad, interpolation.value
    )

    return outdata
def upsample3d(data, scale, mode="nearest", allocator=memPool):
    """Upsample a 5D GPU tensor along its three spatial axes.

    :param data: GPU tensor unpacked as ``(batchsize, maps, ind, inh, inw)``.
    :param scale: a single ``int`` applied to all spatial axes, or a triple
        ``(dscale, hscale, wscale)``.
    :param mode: ``"nearest"`` or ``"linear"``.
    :param allocator: allocator used for the returned array.
    :return: GPU array of shape
        ``(batchsize, maps, dscale * ind, hscale * inh, wscale * inw)``.
    :raises NotImplementedError: for any other ``mode``.
    """
    batchsize, maps, ind, inh, inw = data.shape
    dscale, hscale, wscale = (scale, scale, scale) if isinstance(scale, int) else scale

    outd, outh, outw = dscale * ind, hscale * inh, wscale * inw
    outdata = GPUArray.empty((batchsize, maps, outd, outh, outw), dtype=data.dtype, allocator=allocator)

    if mode == "nearest":
        # The launch covers the input extent; grid z spans batch * maps * depth.
        block = (wblocksize, hblocksize, 1)
        grid = (roundUpDiv(inw, block[0]), roundUpDiv(inh, block[1]), batchsize * maps * ind)

        nearestMod.upsample3dNearest(
            outdata, data, np.int32(ind), np.int32(inh), np.int32(inw),
            np.int32(outd), np.int32(outh), np.int32(outw),
            np.int32(dscale), np.int32(hscale), np.int32(wscale), block=block, grid=grid
        )

    elif mode == "linear":
        # The launch covers the output extent.
        block = (warpSize, nthreads // warpSize, 1)
        grid = (roundUpDiv(outw, block[0]), roundUpDiv(outh, block[1]), outd)

        # Input/output coordinate ratios. An output extent of 1 would divide
        # by zero (the input extent is then 1 as well), so use ratio 0 there.
        rd = (ind - 1) / (outd - 1) if outd > 1 else 0.0
        rh = (inh - 1) / (outh - 1) if outh > 1 else 0.0
        rw = (inw - 1) / (outw - 1) if outw > 1 else 0.0

        linearMod.upsample3dLinear(
            outdata, data, np.int32(batchsize), np.int32(maps), np.int32(ind), np.int32(inh), np.int32(inw),
            np.int32(outd), np.int32(outh), np.int32(outw),
            np.float32(rd), np.float32(rh), np.float32(rw), block=block, grid=grid
        )

    else:
        raise NotImplementedError(mode)

    return outdata
def maxunpool2dBackward(grad, poolshape, mask, allocator=memPool):
    """Launch the ``maxunpool2dBackward`` kernel to gather the unpooling gradient.

    :param grad: float32 GPU gradient unpacked as ``(batchsize, maps, outh, outw)``.
    :param poolshape: shape whose entries 2 and 3 give the pooled height and width.
    :param mask: int32 GPU array forwarded to the kernel.
    :param allocator: allocator used for the returned array.
    :return: float32 GPU array of shape ``(batchsize, maps, inh, inw)``.
    """
    assert grad.dtype == np.float32 and mask.dtype == np.int32

    batchsize, maps, outh, outw = grad.shape
    inh, inw = poolshape[2], poolshape[3]

    ingrad = GPUArray.empty((batchsize, maps, inh, inw), dtype=np.float32, allocator=allocator)

    # Flat launch sized to cover every element of the input gradient.
    total = prod(ingrad.shape)
    block = (nthreads, 1, 1)
    grid = (roundUpDiv(total, nthreads), 1, 1)

    mod.maxunpool2dBackward(
        ingrad, grad, mask, np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw),
        np.int32(maps), np.int32(total), block=block, grid=grid
    )

    return ingrad
def warpAffine(data, coeffs, memoryType, outshape=None, interpolation=InterpolationMode.nn, cval=0,
               backward=False, allocator=memPool):
    """Warp an image with ``libnpp.nppiWarpAffine`` or ``libnpp.nppiWarpAffineBack``.

    :param data: 3D GPU image, or 2D when ``memoryType`` is grayscale.
    :param coeffs: affine coefficients forwarded to NPP.
    :param memoryType: ``MemoryType`` member describing the image layout.
    :param outshape: shape of the result; defaults to ``data.shape``.
    :param interpolation: ``InterpolationMode`` member; its ``value`` is passed to NPP.
    :param cval: value the output is filled with before warping.
    :param backward: when true, ``nppiWarpAffineBack`` is called instead.
    :param allocator: allocator used for the returned array.
    :return: the warped GPU image.
    """
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3

    inrect = getDataRect(data, memoryType)
    insize = (inrect[2], inrect[3])
    inline = getMemoryTypeLineSize(inrect[2], data.dtype, memoryType)

    if outshape is None:
        outshape = data.shape

    outrect = getOutDataRect(data, outshape, memoryType)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    # Pre-fill the output with the constant value before the warp is applied.
    outdata = GPUArray.empty(outshape, dtype=data.dtype, allocator=allocator)
    outdata.fill(cval)

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)

    warpMethod = libnpp.nppiWarpAffineBack if backward else libnpp.nppiWarpAffine

    warpMethod(
        getDataType(data).value, memoryType.value, dataPtr, insize, inline, inrect,
        outdataPtr, outline, outrect, coeffs, interpolation.value
    )

    return outdata