Code Example #1
File: Pad.py | Project: rsarbaev/PuzzleLib
def reflectpad1dTest(dtype):
    batchsize, maps, insize = 4, 8, 48
    lpad, rpad = 2, 3

    hostData = np.random.randn(batchsize, maps, insize).astype(dtype)

    data = GPUArray.toGpu(hostData)
    outdata = reflectpad(data, pad=(lpad, rpad))

    hostOutData = outdata.get()
    outsize = hostOutData.shape[2]

    assert np.allclose(hostOutData[:, :, lpad:insize + lpad], hostData)
    assert np.allclose(hostOutData[:, :, :lpad][:, :, ::-1],
                       hostData[:, :, 1:lpad + 1])
    assert np.allclose(hostOutData[:, :, insize + lpad:][:, :, ::-1],
                       hostData[:, :, insize - 1 - rpad:insize - 1])

    hostGrad = np.random.randn(batchsize, maps, outsize).astype(dtype)

    grad = GPUArray.toGpu(hostGrad)
    ingrad = reflectpadBackward(grad, pad=(lpad, rpad))

    hostInGrad = ingrad.get()

    assert np.allclose(hostInGrad[:, :, lpad + 1:insize - rpad - 1],
                       hostGrad[:, :, 2 * lpad + 1:outsize - 2 * rpad - 1])
    assert np.allclose(
        hostInGrad[:, :, 1:lpad + 1], hostGrad[:, :, :lpad][:, :, ::-1] +
        hostGrad[:, :, lpad + 1:2 * lpad + 1])
    assert np.allclose(
        hostInGrad[:, :, insize - rpad - 1:insize - 1],
        hostGrad[:, :, outsize - rpad:][:, :, ::-1] +
        hostGrad[:, :, outsize - 2 * rpad - 1:outsize - rpad - 1])
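Since np.pad implements the same reflection rule, the forward-pass invariants checked above can be reproduced without a GPU. A minimal NumPy-only sketch, independent of PuzzleLib:

import numpy as np

batchsize, maps, insize = 4, 8, 48
lpad, rpad = 2, 3

data = np.random.randn(batchsize, maps, insize).astype(np.float32)
out = np.pad(data, ((0, 0), (0, 0), (lpad, rpad)), mode="reflect")

# Same invariants as the GPU test: the interior is copied verbatim and the
# borders mirror the input without repeating the edge element.
assert np.allclose(out[:, :, lpad:insize + lpad], data)
assert np.allclose(out[:, :, :lpad][:, :, ::-1], data[:, :, 1:lpad + 1])
assert np.allclose(out[:, :, insize + lpad:][:, :, ::-1],
                   data[:, :, insize - 1 - rpad:insize - 1])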
Code Example #2
def upsample3dNearestTest():
	batchsize, maps, ind, inh, inw = 4, 2, 3, 5, 3
	scale = 2

	hostData = np.random.randn(batchsize, maps, ind, inh, inw).astype(np.float32)

	data = GPUArray.toGpu(hostData)
	outdata = upsample3d(data, scale, mode="nearest")

	hostOutData = np.empty(outdata.shape, dtype=np.float32)

	for b, c, z, y, x in itertools.product(range(batchsize), range(maps), range(ind), range(inh), range(inw)):
		hostOutData[b, c, z * scale:(z + 1) * scale, y * scale:(y + 1) * scale, x * scale:(x + 1) * scale] = \
			hostData[b, c, z, y, x]

	assert np.allclose(hostOutData, outdata.get())

	hostGrad = np.random.randn(*outdata.shape).astype(np.float32)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = upsample3dBackward(grad, scale)

	hostInGrad = np.zeros(data.shape, dtype=np.float32)

	for b, c, z, y, x, dz, dy, dx in itertools.product(
		range(batchsize), range(maps), range(ind), range(inh), range(inw), range(scale), range(scale), range(scale)
	):
		hostInGrad[b, c, z, y, x] += hostGrad[b, c, z * scale + dz, y * scale + dy, x * scale + dx]

	assert np.allclose(hostInGrad, ingrad.get())
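The element-wise loops above have compact vectorized equivalents: nearest-neighbour upsampling is np.repeat along each spatial axis, and its backward is a block-wise sum via reshape. A host-side NumPy sketch of the same references:

import numpy as np

def upsample3dNearestHost(data, scale):
    # repeat every voxel `scale` times along depth, height and width
    out = np.repeat(data, scale, axis=2)
    out = np.repeat(out, scale, axis=3)
    return np.repeat(out, scale, axis=4)

def upsample3dNearestBackwardHost(grad, scale):
    # each input voxel collects the sum of its scale**3 output gradients
    b, c, d, h, w = grad.shape
    blocks = grad.reshape(b, c, d // scale, scale, h // scale, scale, w // scale, scale)
    return blocks.sum(axis=(3, 5, 7))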
Code Example #3
File: CuDnn.py | Project: rsarbaev/PuzzleLib
def softmax2dTest(dtype, atol):
    batchsize, maps, h, w = 5, 8, 2, 3
    hostData = np.random.randn(batchsize, maps, h, w).astype(dtype)

    data = GPUArray.toGpu(hostData)
    outdata = context.softmaxNd(data)

    def hostSoftmax(tensor):
        e = np.exp(tensor - np.amax(tensor))
        return e / np.sum(e)

    hostOutData = np.empty(outdata.shape, dtype=dtype)

    for b, y, x in itertools.product(range(batchsize), range(h), range(w)):
        hostOutData[b, :, y, x] = hostSoftmax(hostData[b, :, y, x])

    assert np.allclose(hostOutData, outdata.get(), atol=atol)

    hostGrad = np.random.randn(*outdata.shape).astype(dtype)

    grad = GPUArray.toGpu(hostGrad)
    ingrad = context.softmaxNdBackward(grad, outdata)

    hostInGrad = np.empty(ingrad.shape, dtype=dtype)

    def hostSoftmaxBackward(d, gr):
        return d * (gr - np.dot(d, gr))

    for b, y, x in itertools.product(range(batchsize), range(h), range(w)):
        hostInGrad[b, :, y, x] = hostSoftmaxBackward(hostOutData[b, :, y, x],
                                                     hostGrad[b, :, y, x])

    assert np.allclose(hostInGrad, ingrad.get(), atol=atol)
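The backward rule used here is the softmax Jacobian-vector product: for an output vector y and upstream gradient g, the input gradient is y * (g - <y, g>). Both per-pixel loops vectorize over the channel axis; a host-side sketch:

import numpy as np

def softmaxNdHost(x):
    # numerically stabilised softmax over the channel axis (axis 1)
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def softmaxNdBackwardHost(y, grad):
    # Jacobian-vector product y * (grad - <y, grad>) at every spatial location
    return y * (grad - (y * grad).sum(axis=1, keepdims=True))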
Code Example #4
File: Costs.py | Project: rsarbaev/PuzzleLib
def svmTest():
    batchsize, size = 20, 4

    hostScores = np.random.randn(batchsize, size).astype(np.float32)
    hostLabels = np.random.randint(low=0,
                                   high=size,
                                   size=(batchsize, ),
                                   dtype=np.int32)

    scores, labels = GPUArray.toGpu(hostScores), GPUArray.toGpu(hostLabels)
    error, grad = svm(scores, labels, mode="l1")

    hostGrad = np.empty(grad.shape, dtype=np.float32)
    hostError = 0.0

    for b in range(batchsize):
        for n in range(size):
            cls = 2 * (hostLabels[b] == n) - 1
            val = hostScores[b, n] * cls

            hostGrad[b, n] = cls / batchsize / size if val < 1 else 0.0
            hostError += max(0.0, 1.0 - val) / batchsize / size

    assert np.allclose(hostGrad, grad.get())
    assert np.isclose(hostError, error.get() / scores.shape[0])
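The double loop builds an L1 hinge loss over one-vs-rest targets: cls is +1 for the labelled class and -1 otherwise, and every margin violation (val < 1) contributes to both the error and the gradient. The same host reference, vectorized:

import numpy as np

def svmL1Host(scores, labels):
    batchsize, size = scores.shape
    cls = 2.0 * (labels[:, None] == np.arange(size)) - 1.0  # +1 / -1 targets
    val = scores * cls

    grad = np.where(val < 1, cls, 0.0) / (batchsize * size)
    error = np.maximum(0.0, 1.0 - val).sum() / (batchsize * size)
    return error, grad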
Code Example #5
File: Costs.py | Project: rsarbaev/PuzzleLib
def svm(scores, labels, mode, error=None, allocator=memPool):
    assert scores.dtype == np.float32 and labels.dtype == np.int32
    shape = scores.shape

    grad = GPUArray.empty(shape, dtype=np.float32, allocator=allocator)
    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    error.fill(0.0)

    size = prod(scores.shape)
    spatialDim = prod(scores.shape[2:])
    mapStride = spatialDim * scores.shape[1]

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    mod = {"l1": svmL1Mod, "l2": svmL2Mod}[mode]

    mod.cost(scores,
             labels,
             np.int32(size),
             np.int32(mapStride),
             np.int32(spatialDim),
             np.int32(shape[1]),
             np.int32(shape[0]),
             error,
             grad,
             block=block,
             grid=grid)

    return error, grad
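The launch geometry follows the usual flat-kernel idiom: one thread per tensor element, nthreads threads per block. roundUpDiv is defined elsewhere in the project; presumably it is plain ceiling division, as in this sketch:

def roundUpDiv(a, b):
    # smallest number of size-b blocks covering a elements
    return (a + b - 1) // b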
Code Example #6
def upsample2dBackward(grad, scale, mode="nearest", allocator=memPool):
	batchsize, maps, outh, outw = grad.shape
	hscale, wscale = (scale, scale) if isinstance(scale, int) else scale

	inh, inw = outh // hscale, outw // wscale

	if mode == "nearest":
		ingrad = GPUArray.empty((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

		blk = warpSize * 8
		block = (blk, 1, 1)
		grid = (roundUpDiv(ingrad.size, blk), 1, 1)

		nearestMod.upsample2dNearestBackward(
			ingrad, grad, np.int32(inw), np.int32(outw), np.int32(hscale), np.int32(wscale), np.int32(ingrad.size),
			block=block, grid=grid
		)

	elif mode == "linear":
		ingrad = GPUArray.zeros((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

		block = (warpSize, nthreads // warpSize, 1)
		grid = (roundUpDiv(outw, block[0]), roundUpDiv(outh, block[1]), 1)

		rh, rw = (inh - 1) / (outh - 1), (inw - 1) / (outw - 1)

		linearMod.upsample2dLinearBackward(
			ingrad, grad, np.int32(batchsize), np.int32(maps), np.int32(inh), np.int32(inw),
			np.int32(outh), np.int32(outw), np.float32(rh), np.float32(rw), block=block, grid=grid
		)

	else:
		raise NotImplementedError(mode)

	return ingrad
Code Example #7
def upsample2dNearestTest():
	batchsize, maps, inh, inw = 1, 2, 16, 15
	scale = 2

	hostData = np.random.uniform(low=-1.0, high=1.0, size=(batchsize, maps, inh, inw)).astype(np.float32)

	data = GPUArray.toGpu(hostData)
	outdata = upsample2d(data, scale, mode="nearest")

	hostOutData = np.empty(outdata.shape, dtype=np.float32)

	for b, c, y, x in itertools.product(range(batchsize), range(maps), range(inh), range(inw)):
		hostOutData[b, c, y * scale:(y + 1) * scale, x * scale:(x + 1) * scale] = hostData[b, c, y, x]

	assert np.allclose(hostOutData, outdata.get())

	hostGrad = np.random.randn(*outdata.shape).astype(np.float32)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = upsample2dBackward(grad, scale)

	hostInGrad = np.zeros(data.shape, dtype=np.float32)

	for b, c, y, x, dy, dx in itertools.product(
		range(batchsize), range(maps), range(inh), range(inw), range(scale), range(scale)
	):
		hostInGrad[b, c, y, x] += hostGrad[b, c, y * scale + dy, x * scale + dx]

	assert np.allclose(hostInGrad, ingrad.get(), atol=1e-5)
Code Example #8
File: CuBlas.py | Project: rsarbaev/PuzzleLib
def vectorTest():
    hostX, hostY = np.random.randn(5).astype(
        np.float32), np.random.randn(5).astype(np.float32)
    x, y = GPUArray.toGpu(hostX), GPUArray.toGpu(hostY)

    assert np.isclose(context.dot(x, y), np.dot(hostX, hostY))
    assert np.isclose(context.l1norm(x), np.linalg.norm(hostX, ord=1))
    assert np.isclose(context.l2norm(x), np.linalg.norm(hostX, ord=2))
Code Example #9
File: RadixSort.py | Project: rsarbaev/PuzzleLib
def radixSortTest():
	hostKeys = np.random.randint(0, (1 << 31) - 1, size=(250, ), dtype=np.int32)
	hostValues = np.arange(0, hostKeys.shape[0], dtype=np.int32)

	outkeys, outvalues = radixSort(GPUArray.toGpu(hostKeys), GPUArray.toGpu(hostValues))

	assert (outkeys.get() == np.sort(hostKeys)).all()
	assert (outvalues.get() == np.argsort(hostKeys)).all()
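One caveat: np.argsort is not stable by default, so the value comparison relies on the 250 random 31-bit keys containing no duplicates, which is a near-certainty here. Assuming the GPU radix sort is stable, as radix sorts typically are, a duplicate-safe host reference would pin the tie order explicitly:

import numpy as np

hostKeys = np.random.randint(0, (1 << 31) - 1, size=(250, ), dtype=np.int32)
order = np.argsort(hostKeys, kind="stable")  # ties keep their original order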
Code Example #10
File: CuDnn.py | Project: rsarbaev/PuzzleLib
def maxpool3dTest(dtype, atol):
    batchsize, maps, d, h, w = 1, 1, 6, 6, 6
    size, s, pad = 3, 2, 1

    hostData = np.full(shape=(batchsize, maps, d + 2 * pad, h + 2 * pad,
                              w + 2 * pad),
                       fill_value=np.finfo(dtype).min,
                       dtype=dtype)
    hostData[:, :, pad:-pad, pad:-pad,
             pad:-pad] = np.random.randn(batchsize, maps, d, h,
                                         w).astype(dtype)

    data = GPUArray.toGpu(
        np.ascontiguousarray(hostData[:, :, pad:-pad, pad:-pad, pad:-pad]))
    outdata = context.poolNd(data,
                             size=size,
                             stride=s,
                             pad=pad,
                             mode=CuDnn.POOL_MODE_MAX)

    hostOutData = np.empty(outdata.shape, dtype=dtype)

    for b, c, z, y, x in itertools.product(range(batchsize), range(maps),
                                           range(hostOutData.shape[2]),
                                           range(hostOutData.shape[3]),
                                           range(hostOutData.shape[4])):
        hostOutData[b, c, z, y, x] = np.max(hostData[b, c, z * s:z * s + size,
                                                     y * s:y * s + size,
                                                     x * s:x * s + size])

    assert np.allclose(hostOutData, outdata.get())

    hostGrad = np.random.randn(*outdata.shape).astype(dtype)

    grad = GPUArray.toGpu(hostGrad)
    ingrad = context.poolNdBackward(grad,
                                    data,
                                    outdata,
                                    size=size,
                                    stride=s,
                                    pad=pad,
                                    mode=CuDnn.POOL_MODE_MAX)

    hostInGrad = np.zeros(hostData.shape, dtype=np.float32)

    for b, c, z, y, x, dz, dy, dx in itertools.product(
            range(batchsize), range(maps), range(hostOutData.shape[2]),
            range(hostOutData.shape[3]), range(hostOutData.shape[4]),
            range(size), range(size), range(size)):
        if hostData[b, c, z * s + dz, y * s + dy,
                    x * s + dx] == hostOutData[b, c, z, y, x]:
            hostInGrad[b, c, z * s + dz, y * s + dy,
                       x * s + dx] += hostGrad[b, c, z, y, x]

    hostInGrad = hostInGrad[:, :, pad:-pad, pad:-pad, pad:-pad].astype(dtype)
    assert np.allclose(hostInGrad, ingrad.get(), atol=atol)
Code Example #11
def matvec(mat, vec, axis=0, out=None, alpha=1.0, beta=0.0, allocator=memPool):
    assert vec.dtype == mat.dtype and (mat.dtype == np.float32
                                       or mat.dtype == np.float16)
    assert vec.ndim == mat.ndim - 1 and 0 <= axis < 2

    h, w = mat.shape[-2:]

    if axis == 1:
        assert mat.dimAt(-1) == vec.dimAt(-1)

        block = (warpSize, 1, 1)
        grid = (h, 1, prod(mat.shape[:-2]))

        if out is None:
            out = GPUArray.zeros(mat.shape[:-1],
                                 dtype=mat.dtype,
                                 allocator=allocator)
        else:
            assert out.shape == mat.shape[:-1]

        fn = mulmod.vecMulOnRow if mat.dtype == np.float32 else mulmod.vecMulOnRowFP16
        fn(out,
           mat,
           vec,
           np.int32(w),
           np.int32(h),
           np.float32(alpha),
           np.float32(beta),
           block=block,
           grid=grid)

    else:
        block = (NT, 1, 1)
        grid = (roundUpDiv(w, block[0]), 1, prod(mat.shape[:-2]))

        if out is None:
            out = GPUArray.zeros(mat.shape[:-2] + (w, ),
                                 dtype=mat.dtype,
                                 allocator=allocator)
        else:
            assert out.shape == mat.shape[:-2] + (w, )

        fn = mulmod.vecMulOnCol if mat.dtype == np.float32 else mulmod.vecMulOnColFP16
        fn(out,
           mat,
           vec,
           np.int32(w),
           np.int32(h),
           np.float32(alpha),
           np.float32(beta),
           block=block,
           grid=grid)

    return out
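Judging by the kernel names and shape checks, matvec contracts the matrix with the vector along rows (axis=1) or columns (axis=0) and scales the result as alpha * product + beta * out. A host-side sketch of that assumed semantics for the plain 2-D case:

import numpy as np

def matvecHost(mat, vec, axis, alpha=1.0, beta=0.0, out=None):
    # axis=1: out[i] = sum_j mat[i, j] * vec[j]   (length-h result)
    # axis=0: out[j] = sum_i vec[i] * mat[i, j]   (length-w result)
    prod = mat @ vec if axis == 1 else vec @ mat
    return alpha * prod + (beta * out if out is not None else 0.0)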
Code Example #12
File: RadixSort.py | Project: rsarbaev/PuzzleLib
def segmentSeq(data):
	assert data.dtype == np.int32

	length, = data.shape
	assert length <= NV

	segments = GPUArray.empty((length, 3), dtype=np.int32, allocator=memPool)
	indices = GPUArray.empty(data.shape, dtype=np.int32, allocator=memPool)

	segmentMod.segmentSeq(segments, indices, data, np.int32(length), block=(NT, 1, 1), grid=(1, 1, 1))
	return segments, indices
Code Example #13
File: Pad.py | Project: rsarbaev/PuzzleLib
def reflectpad(data, pad, allocator=memPool):
    if data.ndim == 3:
        batchsize, maps, insize = data.shape
        lpad, rpad = pad

        assert insize >= max(lpad, rpad) + 1
        outsize = insize + lpad + rpad

        block = (warpSize, 1, 1)
        grid = (roundUpDiv(outsize, warpSize), maps, batchsize)

        outdata = GPUArray.empty((batchsize, maps, outsize),
                                 dtype=data.dtype,
                                 allocator=allocator)
        fn = mod.reflectpad1d if data.dtype == np.float32 else mod.reflectpad1dFP16

        fn(outdata,
           data,
           np.int32(insize),
           np.int32(lpad),
           np.int32(rpad),
           block=block,
           grid=grid)

    elif data.ndim == 4:
        batchsize, maps, inh, inw = data.shape
        upad, bpad, lpad, rpad = pad

        assert inh >= max(upad, bpad) + 1 and inw >= max(lpad, rpad) + 1
        outh, outw = inh + upad + bpad, inw + lpad + rpad

        block = (warpSize, 1, 1)
        grid = (roundUpDiv(outh * outw, warpSize), maps, batchsize)

        outdata = GPUArray.empty((batchsize, maps, outh, outw),
                                 dtype=data.dtype,
                                 allocator=allocator)
        fn = mod.reflectpad2d if data.dtype == np.float32 else mod.reflectpad2dFP16

        fn(outdata,
           data,
           np.int32(inh),
           np.int32(inw),
           np.int32(upad),
           np.int32(bpad),
           np.int32(lpad),
           np.int32(rpad),
           block=block,
           grid=grid)

    else:
        raise NotImplementedError(data.ndim)

    return outdata
Code Example #14
File: RadixSort.py | Project: rsarbaev/PuzzleLib
def radixSort(keys, values):
	assert keys.dtype == np.int32 and values.dtype == np.int32
	assert keys.shape == values.shape

	length, = keys.shape
	assert length <= NV

	outkeys = GPUArray.empty(keys.shape, dtype=keys.dtype, allocator=memPool)
	outvalues = GPUArray.empty(values.shape, dtype=values.dtype, allocator=memPool)

	radixMod.radixSort(outkeys, outvalues, keys, values, np.int32(length), block=(NT, 1, 1), grid=(1, 1, 1))
	return outkeys, outvalues
Code Example #15
File: Costs.py | Project: rsarbaev/PuzzleLib
def crossEntropy(scores, labels, weights=None, error=None, allocator=memPool):
    assert scores.dtype == np.float32 and labels.dtype == np.int32

    shape = scores.shape
    if scores.ndim < 4:
        scores = scores.reshape(*shape, *(1 for _ in range(4 - scores.ndim)))

    softmax = cudnn.softmaxNd(scores,
                              mode=SoftMaxMode.spatial.value,
                              allocator=allocator)

    grad = GPUArray.empty(shape, dtype=np.float32, allocator=allocator)
    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    error.fill(0.0)

    size = prod(scores.shape)
    spatialDim = prod(scores.shape[2:])
    mapStride = spatialDim * scores.shape[1]

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    if weights is None:
        ceMod.cost(softmax,
                   labels,
                   np.int32(size),
                   np.int32(mapStride),
                   np.int32(spatialDim),
                   np.int32(scores.shape[1]),
                   np.int32(scores.shape[0]),
                   error,
                   grad,
                   block=block,
                   grid=grid)

    else:
        wceMod.cost(softmax,
                    labels,
                    weights,
                    np.int32(size),
                    np.int32(mapStride),
                    np.int32(spatialDim),
                    np.int32(shape[1]),
                    np.int32(shape[0]),
                    error,
                    grad,
                    block=block,
                    grid=grid)

    return error, grad
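For intuition, here is a host-side sketch of the plain case this kernel generalises (no spatial dimensions, no class weights): softmax cross-entropy with error and gradient normalised by batch size. The exact normalisation and gradient sign used by the CUDA kernel are assumptions here:

import numpy as np

def crossEntropyHost(scores, labels):
    # scores: (batchsize, classes), labels: (batchsize,) integer class ids
    e = np.exp(scores - scores.max(axis=1, keepdims=True))
    softmax = e / e.sum(axis=1, keepdims=True)

    batchsize = scores.shape[0]
    error = -np.log(softmax[np.arange(batchsize), labels]).sum() / batchsize

    grad = softmax.copy()
    grad[np.arange(batchsize), labels] -= 1.0  # softmax minus one-hot target
    return error, grad / batchsize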
Code Example #16
File: Pad.py | Project: rsarbaev/PuzzleLib
def reflectpadBackward(grad, pad, allocator=memPool):
    if grad.ndim == 3:
        batchsize, maps, outsize = grad.shape
        lpad, rpad = pad

        block = (warpSize, 1, 1)
        grid = (roundUpDiv(outsize, warpSize), maps, batchsize)

        insize = outsize - lpad - rpad
        ingrad = GPUArray.zeros((batchsize, maps, insize),
                                dtype=grad.dtype,
                                allocator=allocator)
        fn = mod.reflectpad1dBackward if grad.dtype == np.float32 else mod.reflectpad1dBackwardFP16

        fn(ingrad,
           grad,
           np.int32(insize),
           np.int32(lpad),
           np.int32(rpad),
           block=block,
           grid=grid)

    elif grad.ndim == 4:
        batchsize, maps, outh, outw = grad.shape
        upad, bpad, lpad, rpad = pad

        inh, inw = outh - upad - bpad, outw - lpad - rpad

        block = (warpSize, 1, 1)
        grid = (roundUpDiv(outh * outw, warpSize), maps, batchsize)

        ingrad = GPUArray.zeros((batchsize, maps, inh, inw),
                                dtype=grad.dtype,
                                allocator=allocator)
        fn = mod.reflectpad2dBackward if grad.dtype == np.float32 else mod.reflectpad2dBackwardFP16

        fn(ingrad,
           grad,
           np.int32(inh),
           np.int32(inw),
           np.int32(upad),
           np.int32(bpad),
           np.int32(lpad),
           np.int32(rpad),
           block=block,
           grid=grid)

    else:
        raise NotImplementedError(grad.ndim)

    return ingrad
Code Example #17
File: CTC.py | Project: rsarbaev/PuzzleLib
def unittest():
	times, batchsize, vocabsize = 20, 3, 6
	hostData, hostDataLen, hostLabels, lengths = createData(times, batchsize, vocabsize)

	data, datalen, labels = GPUArray.toGpu(hostData), GPUArray.toGpu(hostDataLen), GPUArray.toGpu(hostLabels)
	blank = 0

	error, grad, alphas = ctcLoss(data, datalen, labels, lengths, blank, returnAlphas=True)
	hostError, hostGrad, hostAlphas = ctcLossTest(hostData, hostDataLen, hostLabels, lengths, blank)

	assert np.allclose(hostAlphas, alphas.get())

	assert np.isclose(hostError, error.get())
	assert np.allclose(hostGrad, grad.get(), atol=1e-5)
Code Example #18
File: CuDnnNorm.py | Project: rsarbaev/PuzzleLib
def batchNorm3dTest(dtype, atol):
	batchsize, maps, d, h, w = 2, 5, 2, 3, 2
	epsilon, norm = 1e-5, batchsize * d * h * w

	hostData = np.random.randn(batchsize, maps, d, h, w).astype(dtype)

	hostScale = np.random.randn(1, maps, 1, 1, 1).astype(np.float32)
	hostBias = np.random.randn(1, maps, 1, 1, 1).astype(np.float32)

	data, scale, bias = GPUArray.toGpu(hostData), GPUArray.toGpu(hostScale.ravel()), GPUArray.toGpu(hostBias.ravel())
	mean, var = GPUArray.zeros(scale.shape, dtype=np.float32), GPUArray.toGpu(np.ones(scale.shape, dtype=np.float32))

	outdata, savemean, saveinvvar = context.batchNormNd(data, mean, var, scale, bias, epsilon=epsilon, out=data)

	hostMean = np.sum(hostData, axis=(0, 2, 3, 4), dtype=np.float32, keepdims=True) / norm

	hostInvVar = np.sum((hostData - hostMean) ** 2, axis=(0, 2, 3, 4), dtype=np.float32, keepdims=True) / norm
	hostInvVar = 1.0 / np.sqrt(hostInvVar + epsilon)

	hostNormData = (hostData - hostMean) * hostInvVar
	hostOutData = (hostNormData * hostScale + hostBias).astype(dtype)

	assert np.allclose(hostMean.ravel(), mean.get(), atol=atol)
	assert np.allclose(hostInvVar.ravel(), saveinvvar.get(), atol=atol)
	assert np.allclose(hostOutData, outdata.get(), atol=atol)

	hostGrad = np.random.randn(*outdata.shape).astype(dtype)

	grad, data = GPUArray.toGpu(hostGrad), GPUArray.toGpu(hostData)
	ingrad, scalegrad, biasgrad = context.batchNormNdBackward(grad, data, scale, savemean, saveinvvar, epsilon=epsilon)

	hostScaleGrad = np.sum(hostGrad * hostNormData, axis=(0, 2, 3, 4), dtype=np.float32, keepdims=True)
	hostBiasGrad = np.sum(hostGrad, axis=(0, 2, 3, 4), dtype=np.float32, keepdims=True)

	hostMeanGrad = -hostInvVar * hostBiasGrad * hostScale

	hostVarGrad = np.sum(hostGrad * (hostData - hostMean), axis=(0, 2, 3, 4), dtype=np.float32, keepdims=True)
	hostVarGrad = -0.5 * hostVarGrad * hostScale * hostInvVar**3

	hostInGrad = hostGrad * hostScale * hostInvVar + (2 * hostVarGrad * (hostData - hostMean) + hostMeanGrad) / norm
	hostInGrad = hostInGrad.astype(dtype)

	assert np.allclose(hostInGrad, ingrad.get(), atol=atol)
	assert np.allclose(hostScaleGrad.ravel(), scalegrad.get(), atol=atol)
	assert np.allclose(hostBiasGrad.ravel(), biasgrad.get(), atol=atol)

	hostMean = np.random.randn(*hostMean.shape).astype(np.float32)
	hostVar = 1.0 + np.random.randn(*hostInvVar.shape).astype(np.float32)**2

	mean, var = GPUArray.toGpu(hostMean.ravel()), GPUArray.toGpu(hostVar.ravel())
	outdata = context.batchNormNd(data, mean, var, scale, bias, test=True)

	hostOutData = ((hostData - hostMean) / np.sqrt(hostVar + epsilon) * hostScale + hostBias).astype(dtype)
	assert np.allclose(hostOutData, outdata.get(), atol=atol)
Code Example #19
File: CuBlas.py | Project: rsarbaev/PuzzleLib
def gbpGbpTest(dtype, atol):
    formatA, formatB, formatOut = CuBlas.GROUPFORMAT_GBP, CuBlas.GROUPFORMAT_GBP, CuBlas.GROUPFORMAT_GBP
    groups = 3

    hostA = np.random.randn(groups, 4, 3).astype(dtype)
    hostB = np.random.randn(groups, hostA.shape[2], 5).astype(dtype)
    hostC = np.random.randn(groups, hostA.shape[1], 6).astype(dtype)
    hostD = np.random.randn(groups, 8, hostC.shape[2]).astype(dtype)

    A, B, C, D = GPUArray.toGpu(hostA), GPUArray.toGpu(hostB), GPUArray.toGpu(
        hostC), GPUArray.toGpu(hostD)
    out = context.gemmBatched(A,
                              B,
                              formatA=formatA,
                              formatB=formatB,
                              formatOut=formatOut)

    hostOut = np.empty(out.shape, dtype=dtype)
    for i in range(groups):
        np.dot(hostA[i], hostB[i], out=hostOut[i])

    assert np.allclose(hostOut, out.get(), atol=atol)

    out = context.gemmBatched(C,
                              A,
                              formatA=formatA,
                              formatB=formatB,
                              formatOut=formatOut,
                              transpA=True)

    hostOut = np.empty(out.shape, dtype=dtype)
    for i in range(groups):
        np.dot(hostC[i].T, hostA[i], out=hostOut[i])

    assert np.allclose(hostOut, out.get(), atol=atol)

    out = context.gemmBatched(C,
                              D,
                              formatA=formatA,
                              formatB=formatB,
                              formatOut=formatOut,
                              transpB=True)

    hostOut = np.empty(out.shape, dtype=dtype)
    for i in range(groups):
        np.dot(hostC[i], hostD[i].T, out=hostOut[i])

    assert np.allclose(hostOut, out.get(), atol=atol)
Code Example #20
File: CuBlas.py | Project: rsarbaev/PuzzleLib
def bgpBgpTest(dtype, atol):
    formatA, formatB, formatOut = CuBlas.GROUPFORMAT_BGP, CuBlas.GROUPFORMAT_BGP, CuBlas.GROUPFORMAT_GBP
    groups = 3

    hostA = np.random.randn(4, groups, 7).astype(dtype)
    hostB = np.random.randn(hostA.shape[2], groups, 5).astype(dtype)
    hostC = np.random.randn(hostA.shape[0], groups,
                            hostB.shape[2]).astype(dtype)

    A, B, C = GPUArray.toGpu(hostA), GPUArray.toGpu(hostB), GPUArray.toGpu(
        hostC)
    out = context.gemmBatched(A,
                              B,
                              formatA=formatA,
                              formatB=formatB,
                              formatOut=formatOut)

    hostOut = np.empty(out.shape, dtype=dtype)
    for i in range(groups):
        np.dot(hostA[:, i, :], hostB[:, i, :], out=hostOut[i])

    assert np.allclose(hostOut, out.get(), atol=atol)

    out = context.gemmBatched(A,
                              C,
                              formatA=formatA,
                              formatB=formatB,
                              formatOut=formatOut,
                              transpA=True)

    hostOut = np.empty(out.shape, dtype=dtype)
    for i in range(groups):
        np.dot(hostA[:, i, :].T, hostC[:, i, :], out=hostOut[i])

    assert np.allclose(hostOut, out.get(), atol=atol)

    out = context.gemmBatched(B,
                              C,
                              formatA=formatA,
                              formatB=formatB,
                              formatOut=formatOut,
                              transpB=True)

    hostOut = np.empty(out.shape, dtype=dtype)
    for i in range(groups):
        np.dot(hostB[:, i, :], hostC[:, i, :].T, out=hostOut[i])

    assert np.allclose(hostOut, out.get(), atol=atol)
Code Example #21
def matsum(tensor, axis=0, out=None, alpha=1.0, beta=0.0, allocator=memPool):
    assert tensor.dtype == np.float32 or tensor.dtype == np.float16
    assert 0 <= axis < tensor.ndim

    if axis == tensor.ndim - 1:
        block = (warpSize, 1, 1)
        grid = (prod(tensor.shape[:-1]), 1, 1)

        if out is None:
            out = GPUArray.zeros(tensor.shape[:-1],
                                 dtype=tensor.dtype,
                                 allocator=allocator)
        else:
            assert out.shape == tensor.shape[:-1]

        fn = summod.sumOnRow if tensor.dtype == np.float32 else summod.sumOnRowFP16
        fn(out,
           tensor,
           np.int32(tensor.dimAt(-1)),
           np.float32(alpha),
           np.float32(beta),
           block=block,
           grid=grid)

    else:
        z, width = prod(tensor.shape[:axis]), prod(tensor.shape[axis + 1:])

        block = (NT, 1, 1)
        grid = (roundUpDiv(width, block[0]), 1, z)

        if out is None:
            out = GPUArray.zeros(tensor.shape[:axis] + tensor.shape[axis + 1:],
                                 dtype=tensor.dtype,
                                 allocator=allocator)
        else:
            assert out.shape == tensor.shape[:axis] + tensor.shape[axis + 1:]

        fn = summod.sumOnCol if tensor.dtype == np.float32 else summod.sumOnColFP16
        fn(out,
           tensor,
           np.int32(width),
           np.int32(tensor.dimAt(axis)),
           np.float32(alpha),
           np.float32(beta),
           block=block,
           grid=grid)

    return out
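In host terms, matsum reduces one axis of the tensor and blends the result into the output. A NumPy sketch of the assumed alpha/beta semantics:

import numpy as np

def matsumHost(tensor, axis, alpha=1.0, beta=0.0, out=None):
    s = tensor.sum(axis=axis)
    return alpha * s + (beta * out if out is not None else 0.0)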
Code Example #22
File: NPP.py | Project: rsarbaev/PuzzleLib
def rescale(data,
            scale,
            memoryType,
            interpolation=InterpolationMode.nn,
            outdata=None,
            allocator=memPool):
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3
    hscale, wscale = (scale, scale) if isinstance(scale,
                                                  (int, float)) else scale

    inrect = getDataRect(data, memoryType)
    insize, inline = (inrect[2], inrect[3]), getMemoryTypeLineSize(
        inrect[2], data.dtype, memoryType)

    outrect = libnpp.nppiGetResizeRect(inrect, wscale, hscale, 0, 0,
                                       interpolation.value)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    outshape = getOutDataShape(data, outrect, memoryType)

    if outdata is None:
        outdata = GPUArray.empty(outshape,
                                 dtype=data.dtype,
                                 allocator=allocator)
    else:
        assert outdata.shape == outshape

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)

    libnpp.nppiResizeSqrPixel(
        getDataType(data).value, memoryType.value, dataPtr, insize, inline,
        inrect, outdataPtr, outline, outrect, wscale, hscale, 0, 0,
        interpolation.value)

    return outdata
Code Example #23
def batchSpeedTest(dtype):
    from PuzzleLib.Cuda.Benchmarks.Utils import timeKernel

    A = GPUArray.toGpu(np.random.randn(32, 128, 128).astype(dtype))
    v = GPUArray.toGpu(np.random.randn(32, 128).astype(dtype))

    timeKernel(addVecToMat, (v, A, 1, A),
               logname="%s batched addVecToMat on rows" % dtype)
    timeKernel(addVecToMat, (v, A, 0, A),
               logname="%s batched addVecToMat on cols" % dtype)

    timeKernel(argmax, (A, 2), logname="%s batched argmax on rows" % dtype)
    timeKernel(argmax, (A, 1), logname="%s batched argmax on cols" % dtype)

    timeKernel(matsum, (A, 2), logname="%s batched matsum on rows" % dtype)
    timeKernel(matsum, (A, 1), logname="%s batched matsum on cols" % dtype)
Code Example #24
def concatenate(tup, axis, out=None, allocator=memoryPool):
    ary = tup[0]

    dtype, reducedShape = ary.dtype, ary.shape
    reducedShape = reducedShape[:axis] + reducedShape[axis + 1:]

    assert all(a.dtype == dtype and a.shape[:axis] +
               a.shape[axis + 1:] == reducedShape for a in tup[1:])

    concatDim = sum(a.dimAt(axis) for a in tup)
    shape = reducedShape[:axis] + (concatDim, ) + reducedShape[axis:]

    if out is None:
        out = GPUArray.empty(shape, dtype=dtype, allocator=allocator)
    else:
        assert out.shape == shape and out.dtype == dtype

    dstPitch = out.strideAt(axis - 1) if axis > 0 else out.nbytes
    height = prod(shape[:axis])

    stride = 0

    for a in tup:
        srcPitch = width = a.strideAt(axis - 1) if axis > 0 else a.nbytes

        Driver.memcpy2D(width,
                        height,
                        a.gpudata,
                        srcPitch,
                        out.gpudata,
                        dstPitch,
                        dstX=stride)
        stride += width

    return out
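The pitched-copy arithmetic becomes clearer when mirrored on the host: each array is viewed as height = prod(shape[:axis]) rows whose payload is everything from axis onward, and every memcpy2D interleaves one source's rows into the destination at a growing column offset. A NumPy emulation of that layout reasoning:

import numpy as np

def concatenateHost(tup, axis):
    height = int(np.prod(tup[0].shape[:axis], dtype=np.int64))
    rows = [a.reshape(height, -1) for a in tup]  # one flat payload per outer row

    out = np.empty((height, sum(r.shape[1] for r in rows)), dtype=tup[0].dtype)
    offset = 0
    for r in rows:
        out[:, offset:offset + r.shape[1]] = r  # plays the role of one memcpy2D
        offset += r.shape[1]

    concatDim = sum(a.shape[axis] for a in tup)
    return out.reshape(tup[0].shape[:axis] + (concatDim, ) + tup[0].shape[axis + 1:])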
Code Example #25
def split(ary, sections, axis, allocator=memoryPool):
    shape = ary.shape
    assert sum(sections) == shape[axis]

    outs = [
        GPUArray.empty(shape[:axis] + (sec, ) + shape[axis + 1:],
                       dtype=ary.dtype,
                       allocator=allocator) for sec in sections
    ]

    srcPitch = ary.strideAt(axis - 1) if axis > 0 else ary.nbytes
    height = prod(shape[:axis])

    stride = 0

    for out in outs:
        dstPitch = width = out.strideAt(axis - 1) if axis > 0 else out.nbytes

        Driver.memcpy2D(width,
                        height,
                        ary.gpudata,
                        srcPitch,
                        out.gpudata,
                        dstPitch,
                        srcX=stride)
        stride += width

    return outs
Code Example #26
def maxunpool2d(data, origshape, mask, allocator=memPool):
    assert data.dtype == np.float32
    batchsize, maps, inh, inw = data.shape

    outh, outw = origshape[2], origshape[3]
    outdata = GPUArray.zeros((batchsize, maps, outh, outw),
                             dtype=np.float32,
                             allocator=allocator)

    size = prod(data.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    mod.maxunpool2d(outdata,
                    data,
                    mask,
                    np.int32(inh),
                    np.int32(inw),
                    np.int32(outh),
                    np.int32(outw),
                    np.int32(maps),
                    np.int32(size),
                    block=block,
                    grid=grid)

    return outdata
Code Example #27
File: NPP.py | Project: rsarbaev/PuzzleLib
def warpAffinePoints(data,
                     inpoints,
                     outpoints,
                     memoryType,
                     outshape=None,
                     interpolation=InterpolationMode.nn,
                     cval=0,
                     clip=True,
                     allocator=memPool):
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3

    inrect = getDataRect(data, memoryType)
    insize, inline = (inrect[2], inrect[3]), getMemoryTypeLineSize(
        inrect[2], data.dtype, memoryType)

    if outshape is None:
        outshape = data.shape

    outrect = getOutDataRect(data, outshape, memoryType)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    outdata = GPUArray.empty(outshape, dtype=data.dtype, allocator=allocator)
    outdata.fill(cval)

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)
    srcQuad, dstQuad = genAffineQuads(inpoints, outpoints, clip, inrect)

    libnpp.nppiWarpAffineQuad(
        getDataType(data).value, memoryType.value, dataPtr, insize, inline,
        inrect, srcQuad, outdataPtr, outline, outrect, dstQuad,
        interpolation.value)

    return outdata
Code Example #28
def upsample3d(data, scale, mode="nearest", allocator=memPool):
	batchsize, maps, ind, inh, inw = data.shape
	dscale, hscale, wscale = (scale, scale, scale) if isinstance(scale, int) else scale

	outd, outh, outw = dscale * ind, hscale * inh, wscale * inw
	outdata = GPUArray.empty((batchsize, maps, outd, outh, outw), dtype=data.dtype, allocator=allocator)

	if mode == "nearest":
		block = (wblocksize, hblocksize, 1)
		grid = (roundUpDiv(inw, block[0]), roundUpDiv(inh, block[1]), batchsize * maps * ind)

		nearestMod.upsample3dNearest(
			outdata, data, np.int32(ind), np.int32(inh), np.int32(inw),
			np.int32(outd), np.int32(outh), np.int32(outw), np.int32(dscale), np.int32(hscale), np.int32(wscale),
			block=block, grid=grid
		)

	elif mode == "linear":
		block = (warpSize, nthreads // warpSize, 1)
		grid = (roundUpDiv(outw, block[0]), roundUpDiv(outh, block[1]), outd)

		rd, rh, rw = (ind - 1) / (outd - 1), (inh - 1) / (outh - 1), (inw - 1) / (outw - 1)

		linearMod.upsample3dLinear(
			outdata, data, np.int32(batchsize), np.int32(maps), np.int32(ind), np.int32(inh), np.int32(inw),
			np.int32(outd), np.int32(outh), np.int32(outw), np.float32(rd), np.float32(rh), np.float32(rw),
			block=block, grid=grid
		)

	else:
		raise NotImplementedError(mode)

	return outdata
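In the linear branch, the ratios rd, rh, rw = (in - 1) / (out - 1) map output corners exactly onto input corners (align-corners style interpolation). A 1-D NumPy sketch of the sampling rule those ratios imply:

import numpy as np

def upsample1dLinearHost(x, outsize):
    insize = x.shape[-1]
    r = (insize - 1) / (outsize - 1)  # same ratio as rd/rh/rw above
    pos = r * np.arange(outsize)      # fractional source position per output
    lo = np.floor(pos).astype(int)
    hi = np.minimum(lo + 1, insize - 1)
    frac = pos - lo
    return x[..., lo] * (1 - frac) + x[..., hi] * frac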
Code Example #29
def maxunpool2dBackward(grad, poolshape, mask, allocator=memPool):
    assert grad.dtype == np.float32 and mask.dtype == np.int32
    batchsize, maps, outh, outw = grad.shape

    inh, inw = poolshape[2], poolshape[3]
    ingrad = GPUArray.empty((batchsize, maps, inh, inw),
                            dtype=np.float32,
                            allocator=allocator)

    size = prod(ingrad.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    mod.maxunpool2dBackward(ingrad,
                            grad,
                            mask,
                            np.int32(inh),
                            np.int32(inw),
                            np.int32(outh),
                            np.int32(outw),
                            np.int32(maps),
                            np.int32(size),
                            block=block,
                            grid=grid)

    return ingrad
Code Example #30
File: NPP.py | Project: rsarbaev/PuzzleLib
def warpAffine(data,
               coeffs,
               memoryType,
               outshape=None,
               interpolation=InterpolationMode.nn,
               cval=0,
               backward=False,
               allocator=memPool):
    assert (data.ndim == 2 and memoryType == MemoryType.grayscale) or data.ndim == 3

    inrect = getDataRect(data, memoryType)
    insize, inline = (inrect[2], inrect[3]), getMemoryTypeLineSize(
        inrect[2], data.dtype, memoryType)

    if outshape is None:
        outshape = data.shape

    outrect = getOutDataRect(data, outshape, memoryType)
    outline = getMemoryTypeLineSize(outrect[2], data.dtype, memoryType)

    outdata = GPUArray.empty(outshape, dtype=data.dtype, allocator=allocator)
    outdata.fill(cval)

    dataPtr, outdataPtr = getDataPointers(data, outdata, memoryType)

    warpMethod = libnpp.nppiWarpAffine
    if backward:
        warpMethod = libnpp.nppiWarpAffineBack

    warpMethod(
        getDataType(data).value, memoryType.value, dataPtr, insize, inline,
        inrect, outdataPtr, outline, outrect, coeffs, interpolation.value)

    return outdata