def reflectpad1dTest(dtype):
	batchsize, maps, insize = 4, 8, 48
	lpad, rpad = 2, 3

	hostData = np.random.randn(batchsize, maps, insize).astype(dtype)

	data = GPUArray.toGpu(hostData)
	outdata = reflectpad(data, pad=(lpad, rpad))

	hostOutData = outdata.get()
	outsize = hostOutData.shape[2]

	# forward: interior is copied, borders are reflected without repeating the edge element
	assert np.allclose(hostOutData[:, :, lpad:insize + lpad], hostData)
	assert np.allclose(hostOutData[:, :, :lpad][:, :, ::-1], hostData[:, :, 1:lpad + 1])
	assert np.allclose(hostOutData[:, :, insize + lpad:][:, :, ::-1], hostData[:, :, insize - 1 - rpad:insize - 1])

	hostGrad = np.random.randn(batchsize, maps, outsize).astype(dtype)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = reflectpadBackward(grad, pad=(lpad, rpad))

	hostInGrad = ingrad.get()

	# backward: reflected positions accumulate their gradient back onto the source elements
	assert np.allclose(
		hostInGrad[:, :, lpad + 1:insize - rpad - 1], hostGrad[:, :, 2 * lpad + 1:outsize - 2 * rpad - 1]
	)
	assert np.allclose(
		hostInGrad[:, :, 1:lpad + 1], hostGrad[:, :, :lpad][:, :, ::-1] + hostGrad[:, :, lpad + 1:2 * lpad + 1]
	)
	assert np.allclose(
		hostInGrad[:, :, insize - rpad - 1:insize - 1],
		hostGrad[:, :, outsize - rpad:][:, :, ::-1] + hostGrad[:, :, outsize - 2 * rpad - 1:outsize - rpad - 1]
	)

def upsample2dNearestTest():
	batchsize, maps, inh, inw = 1, 2, 16, 15
	scale = 2

	hostData = np.random.uniform(low=-1.0, high=1.0, size=(batchsize, maps, inh, inw)).astype(np.float32)

	data = GPUArray.toGpu(hostData)
	outdata = upsample2d(data, scale, mode="nearest")

	hostOutData = np.empty(outdata.shape, dtype=np.float32)

	for b, c, y, x in itertools.product(range(batchsize), range(maps), range(inh), range(inw)):
		hostOutData[b, c, y * scale:(y + 1) * scale, x * scale:(x + 1) * scale] = hostData[b, c, y, x]

	assert np.allclose(hostOutData, outdata.get())

	hostGrad = np.random.randn(*outdata.shape).astype(np.float32)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = upsample2dBackward(grad, scale)

	hostInGrad = np.zeros(data.shape, dtype=np.float32)

	for b, c, y, x, dy, dx in itertools.product(
		range(batchsize), range(maps), range(inh), range(inw), range(scale), range(scale)
	):
		hostInGrad[b, c, y, x] += hostGrad[b, c, y * scale + dy, x * scale + dx]

	assert np.allclose(hostInGrad, ingrad.get(), atol=1e-5)

def svmTest():
	batchsize, size = 20, 4

	hostScores = np.random.randn(batchsize, size).astype(np.float32)
	hostLabels = np.random.randint(low=0, high=size, size=(batchsize, ), dtype=np.int32)

	scores, labels = GPUArray.toGpu(hostScores), GPUArray.toGpu(hostLabels)
	error, grad = svm(scores, labels, mode="l1")

	hostGrad = np.empty(grad.shape, dtype=np.float32)
	hostError = 0.0

	for b in range(batchsize):
		for n in range(size):
			cls = 2 * (hostLabels[b] == n) - 1
			val = hostScores[b, n] * cls

			hostGrad[b, n] = cls / batchsize / size if val < 1 else 0.0
			hostError += max(0.0, 1.0 - val) / batchsize / size

	assert np.allclose(hostGrad, grad.get())
	assert np.isclose(hostError, error.get() / scores.shape[0])

def upsample3dNearestTest():
	batchsize, maps, ind, inh, inw = 4, 2, 3, 5, 3
	scale = 2

	hostData = np.random.randn(batchsize, maps, ind, inh, inw).astype(np.float32)

	data = GPUArray.toGpu(hostData)
	outdata = upsample3d(data, scale, mode="nearest")

	hostOutData = np.empty(outdata.shape, dtype=np.float32)

	for b, c, z, y, x in itertools.product(range(batchsize), range(maps), range(ind), range(inh), range(inw)):
		hostOutData[b, c, z * scale:(z + 1) * scale, y * scale:(y + 1) * scale, x * scale:(x + 1) * scale] = \
			hostData[b, c, z, y, x]

	assert np.allclose(hostOutData, outdata.get())

	hostGrad = np.random.randn(*outdata.shape).astype(np.float32)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = upsample3dBackward(grad, scale)

	hostInGrad = np.zeros(data.shape, dtype=np.float32)

	for b, c, z, y, x, dz, dy, dx in itertools.product(
		range(batchsize), range(maps), range(ind), range(inh), range(inw), range(scale), range(scale), range(scale)
	):
		hostInGrad[b, c, z, y, x] += hostGrad[b, c, z * scale + dz, y * scale + dy, x * scale + dx]

	assert np.allclose(hostInGrad, ingrad.get())

def softmax2dTest(dtype, atol):
	batchsize, maps, h, w = 5, 8, 2, 3

	hostData = np.random.randn(batchsize, maps, h, w).astype(dtype)

	data = GPUArray.toGpu(hostData)
	outdata = context.softmaxNd(data)

	def hostSoftmax(tensor):
		e = np.exp(tensor - np.amax(tensor))
		return e / np.sum(e)

	hostOutData = np.empty(outdata.shape, dtype=dtype)

	for b, y, x in itertools.product(range(batchsize), range(h), range(w)):
		hostOutData[b, :, y, x] = hostSoftmax(hostData[b, :, y, x])

	assert np.allclose(hostOutData, outdata.get(), atol=atol)

	hostGrad = np.random.randn(*outdata.shape).astype(dtype)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = context.softmaxNdBackward(grad, outdata)

	hostInGrad = np.empty(ingrad.shape, dtype=dtype)

	def hostSoftmaxBackward(d, gr):
		return d * (gr - np.dot(d, gr))

	for b, y, x in itertools.product(range(batchsize), range(h), range(w)):
		hostInGrad[b, :, y, x] = hostSoftmaxBackward(hostOutData[b, :, y, x], hostGrad[b, :, y, x])

	assert np.allclose(hostInGrad, ingrad.get(), atol=atol)

def vectorTest():
	hostX, hostY = np.random.randn(5).astype(np.float32), np.random.randn(5).astype(np.float32)

	x, y = GPUArray.toGpu(hostX), GPUArray.toGpu(hostY)

	assert np.isclose(context.dot(x, y), np.dot(hostX, hostY))
	assert np.isclose(context.l1norm(x), np.linalg.norm(hostX, ord=1))
	assert np.isclose(context.l2norm(x), np.linalg.norm(hostX, ord=2))

def radixSortTest():
	hostKeys = np.random.randint(0, (1 << 31) - 1, size=(250, ), dtype=np.int32)
	hostValues = np.arange(0, hostKeys.shape[0], dtype=np.int32)

	outkeys, outvalues = radixSort(GPUArray.toGpu(hostKeys), GPUArray.toGpu(hostValues))

	assert (outkeys.get() == np.sort(hostKeys)).all()
	# radix sort is stable, so compare against a stable argsort in case of duplicate keys
	assert (outvalues.get() == np.argsort(hostKeys, kind="stable")).all()

def maxpool3dTest(dtype, atol):
	batchsize, maps, d, h, w = 1, 1, 6, 6, 6
	size, s, pad = 3, 2, 1

	# pad the host tensor with the dtype minimum so that padding never wins the max
	hostData = np.full(
		shape=(batchsize, maps, d + 2 * pad, h + 2 * pad, w + 2 * pad), fill_value=np.finfo(dtype).min, dtype=dtype
	)
	hostData[:, :, pad:-pad, pad:-pad, pad:-pad] = np.random.randn(batchsize, maps, d, h, w).astype(dtype)

	data = GPUArray.toGpu(np.ascontiguousarray(hostData[:, :, pad:-pad, pad:-pad, pad:-pad]))
	outdata = context.poolNd(data, size=size, stride=s, pad=pad, mode=CuDnn.POOL_MODE_MAX)

	hostOutData = np.empty(outdata.shape, dtype=dtype)

	for b, c, z, y, x in itertools.product(
		range(batchsize), range(maps), range(hostOutData.shape[2]), range(hostOutData.shape[3]),
		range(hostOutData.shape[4])
	):
		hostOutData[b, c, z, y, x] = np.max(
			hostData[b, c, z * s:z * s + size, y * s:y * s + size, x * s:x * s + size]
		)

	assert np.allclose(hostOutData, outdata.get())

	hostGrad = np.random.randn(*outdata.shape).astype(dtype)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = context.poolNdBackward(grad, data, outdata, size=size, stride=s, pad=pad, mode=CuDnn.POOL_MODE_MAX)

	hostInGrad = np.zeros(hostData.shape, dtype=np.float32)

	for b, c, z, y, x, dz, dy, dx in itertools.product(
		range(batchsize), range(maps), range(hostOutData.shape[2]), range(hostOutData.shape[3]),
		range(hostOutData.shape[4]), range(size), range(size), range(size)
	):
		if hostData[b, c, z * s + dz, y * s + dy, x * s + dx] == hostOutData[b, c, z, y, x]:
			hostInGrad[b, c, z * s + dz, y * s + dy, x * s + dx] += hostGrad[b, c, z, y, x]

	hostInGrad = hostInGrad[:, :, pad:-pad, pad:-pad, pad:-pad].astype(dtype)
	assert np.allclose(hostInGrad, ingrad.get(), atol=atol)

def unittest():
	times, batchsize, vocabsize = 20, 3, 6
	hostData, hostDataLen, hostLabels, lengths = createData(times, batchsize, vocabsize)

	data, datalen, labels = GPUArray.toGpu(hostData), GPUArray.toGpu(hostDataLen), GPUArray.toGpu(hostLabels)
	blank = 0

	error, grad, alphas = ctcLoss(data, datalen, labels, lengths, blank, returnAlphas=True)
	hostError, hostGrad, hostAlphas = ctcLossTest(hostData, hostDataLen, hostLabels, lengths, blank)

	assert np.allclose(hostAlphas, alphas.get())

	assert np.isclose(hostError, error.get())
	assert np.allclose(hostGrad, grad.get(), atol=1e-5)

def gbpGbpTest(dtype, atol):
	formatA, formatB, formatOut = CuBlas.GROUPFORMAT_GBP, CuBlas.GROUPFORMAT_GBP, CuBlas.GROUPFORMAT_GBP
	groups = 3

	hostA = np.random.randn(groups, 4, 3).astype(dtype)
	hostB = np.random.randn(groups, hostA.shape[2], 5).astype(dtype)
	hostC = np.random.randn(groups, hostA.shape[1], 6).astype(dtype)
	hostD = np.random.randn(groups, 8, hostC.shape[2]).astype(dtype)

	A, B, C, D = GPUArray.toGpu(hostA), GPUArray.toGpu(hostB), GPUArray.toGpu(hostC), GPUArray.toGpu(hostD)

	out = context.gemmBatched(A, B, formatA=formatA, formatB=formatB, formatOut=formatOut)

	hostOut = np.empty(out.shape, dtype=dtype)

	for i in range(groups):
		np.dot(hostA[i], hostB[i], out=hostOut[i])

	assert np.allclose(hostOut, out.get(), atol=atol)

	out = context.gemmBatched(C, A, formatA=formatA, formatB=formatB, formatOut=formatOut, transpA=True)

	hostOut = np.empty(out.shape, dtype=dtype)

	for i in range(groups):
		np.dot(hostC[i].T, hostA[i], out=hostOut[i])

	assert np.allclose(hostOut, out.get(), atol=atol)

	out = context.gemmBatched(C, D, formatA=formatA, formatB=formatB, formatOut=formatOut, transpB=True)

	hostOut = np.empty(out.shape, dtype=dtype)

	for i in range(groups):
		np.dot(hostC[i], hostD[i].T, out=hostOut[i])

	assert np.allclose(hostOut, out.get(), atol=atol)

def bgpBgpTest(dtype, atol):
	formatA, formatB, formatOut = CuBlas.GROUPFORMAT_BGP, CuBlas.GROUPFORMAT_BGP, CuBlas.GROUPFORMAT_GBP
	groups = 3

	hostA = np.random.randn(4, groups, 7).astype(dtype)
	hostB = np.random.randn(hostA.shape[2], groups, 5).astype(dtype)
	hostC = np.random.randn(hostA.shape[0], groups, hostB.shape[2]).astype(dtype)

	A, B, C = GPUArray.toGpu(hostA), GPUArray.toGpu(hostB), GPUArray.toGpu(hostC)

	out = context.gemmBatched(A, B, formatA=formatA, formatB=formatB, formatOut=formatOut)

	hostOut = np.empty(out.shape, dtype=dtype)

	for i in range(groups):
		np.dot(hostA[:, i, :], hostB[:, i, :], out=hostOut[i])

	assert np.allclose(hostOut, out.get(), atol=atol)

	out = context.gemmBatched(A, C, formatA=formatA, formatB=formatB, formatOut=formatOut, transpA=True)

	hostOut = np.empty(out.shape, dtype=dtype)

	for i in range(groups):
		np.dot(hostA[:, i, :].T, hostC[:, i, :], out=hostOut[i])

	assert np.allclose(hostOut, out.get(), atol=atol)

	out = context.gemmBatched(B, C, formatA=formatA, formatB=formatB, formatOut=formatOut, transpB=True)

	hostOut = np.empty(out.shape, dtype=dtype)

	for i in range(groups):
		np.dot(hostB[:, i, :], hostC[:, i, :].T, out=hostOut[i])

	assert np.allclose(hostOut, out.get(), atol=atol)

def batchSpeedTest(dtype):
	from PuzzleLib.Cuda.Benchmarks.Utils import timeKernel

	A = GPUArray.toGpu(np.random.randn(32, 128, 128).astype(dtype))
	v = GPUArray.toGpu(np.random.randn(32, 128).astype(dtype))

	timeKernel(addVecToMat, (v, A, 1, A), logname="%s batched addVecToMat on rows" % dtype)
	timeKernel(addVecToMat, (v, A, 0, A), logname="%s batched addVecToMat on cols" % dtype)

	timeKernel(argmax, (A, 2), logname="%s batched argmax on rows" % dtype)
	timeKernel(argmax, (A, 1), logname="%s batched argmax on cols" % dtype)

	timeKernel(matsum, (A, 2), logname="%s batched matsum on rows" % dtype)
	timeKernel(matsum, (A, 1), logname="%s batched matsum on cols" % dtype)

def speedTest(dtype):
	from PuzzleLib.Cuda.Benchmarks.Utils import timeKernel

	A = GPUArray.toGpu(np.random.randn(1024, 1024).astype(dtype))
	v = GPUArray.toGpu(np.random.randn(1024).astype(dtype))

	timeKernel(addVecToMat, (v, A, 1, A), logname="%s addVecToMat on rows" % dtype)
	timeKernel(addVecToMat, (v, A, 0, A), logname="%s addVecToMat on cols" % dtype)

	timeKernel(argmax, (A, 1), logname="%s argmax on rows" % dtype)
	timeKernel(argmax, (A, 0), logname="%s argmax on cols" % dtype)

	timeKernel(matsum, (A, 1), logname="%s matsum on rows" % dtype)
	timeKernel(matsum, (A, 0), logname="%s matsum on cols" % dtype)

def matrixTest(dtype, atol):
	hostA, hostB = np.random.randn(5, 3).astype(dtype), np.random.randn(3, 4).astype(dtype)

	A, B = GPUArray.toGpu(hostA), GPUArray.toGpu(hostB)

	C = context.gemm(A, B)
	hostC = C.get()

	assert np.allclose(np.dot(hostA, hostB), hostC)

	D = context.gemm(B, C, transpB=True)
	hostD = D.get()

	assert np.allclose(np.dot(hostB, hostC.T), hostD)

	E = context.gemm(D, B, transpA=True)
	assert np.allclose(np.dot(hostD.T, hostB), E.get(), atol=atol)

def reflectpad2dTest(dtype):
	batchsize, maps, inh, inw = 4, 8, 12, 15
	upad, bpad, lpad, rpad = 2, 3, 2, 3

	hostData = np.random.randn(batchsize, maps, inh, inw).astype(dtype)

	data = GPUArray.toGpu(hostData)
	outdata = reflectpad(data, pad=(upad, bpad, lpad, rpad))

	hostOutData = outdata.get()
	outh, outw = hostOutData.shape[2:]

	assert np.allclose(hostOutData[:, :, upad:inh + upad, lpad:inw + lpad], hostData)
	assert np.allclose(hostOutData[:, :, :upad, :lpad][:, :, ::-1, ::-1], hostData[:, :, 1:upad + 1, 1:lpad + 1])
	assert np.allclose(
		hostOutData[:, :, inh + upad:, inw + lpad:][:, :, ::-1, ::-1],
		hostData[:, :, inh - 1 - bpad:inh - 1, inw - 1 - rpad:inw - 1]
	)

	hostGrad = np.random.randn(batchsize, maps, outh, outw).astype(dtype)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = reflectpadBackward(grad, pad=(upad, bpad, lpad, rpad))

	hostInGrad = ingrad.get()

	assert np.allclose(
		hostInGrad[:, :, upad + 1:inh - bpad - 1, lpad + 1:inw - rpad - 1],
		hostGrad[:, :, 2 * upad + 1:outh - 2 * bpad - 1, 2 * lpad + 1:outw - 2 * rpad - 1]
	)
	assert np.allclose(
		hostInGrad[:, :, 1:upad + 1, 1:lpad + 1],
		hostGrad[:, :, :upad, :lpad][:, :, ::-1, ::-1] +
		hostGrad[:, :, upad + 1:2 * upad + 1, lpad + 1:2 * lpad + 1] +
		hostGrad[:, :, :upad, lpad + 1:2 * lpad + 1][:, :, ::-1, :] +
		hostGrad[:, :, upad + 1:2 * upad + 1, :lpad][:, :, :, ::-1]
	)
	assert np.allclose(
		hostInGrad[:, :, inh - bpad - 1:inh - 1, inw - rpad - 1:inw - 1],
		hostGrad[:, :, outh - bpad:, outw - rpad:][:, :, ::-1, ::-1] +
		hostGrad[:, :, outh - 2 * bpad - 1:outh - bpad - 1, outw - 2 * rpad - 1:outw - rpad - 1] +
		hostGrad[:, :, outh - bpad:, outw - 2 * rpad - 1:outw - rpad - 1][:, :, ::-1, :] +
		hostGrad[:, :, outh - 2 * bpad - 1:outh - bpad - 1, outw - rpad:][:, :, :, ::-1]
	)

def batchCalcTest(dtype, atol):
	hostA = np.random.randn(8, 32, 64).astype(dtype)
	hostV = np.random.randn(8, 64).astype(dtype)
	hostW = np.random.randn(8, 32).astype(dtype)

	A = GPUArray.toGpu(hostA)
	v, w = GPUArray.toGpu(hostV), GPUArray.toGpu(hostW)

	assert np.allclose(addVecToMat(w, A, axis=0).get(), hostA + hostW[:, :, np.newaxis])
	assert np.allclose(addVecToMat(v, A, axis=1).get(), hostA + hostV[:, np.newaxis, :])

	assert np.allclose(matsum(A, axis=1).get(), np.sum(hostA.astype(np.float32), axis=1).astype(dtype), atol=atol)
	assert np.allclose(matsum(A, axis=2).get(), np.sum(hostA.astype(np.float32), axis=2).astype(dtype), atol=atol)

	out = matvec(A, v, axis=1)
	hostOut = np.empty(out.shape, dtype=np.float32)

	for i in range(hostA.shape[0]):
		np.dot(hostA[i].astype(np.float32), hostV[i].astype(np.float32), out=hostOut[i])

	assert np.allclose(out.get(), hostOut.astype(dtype), atol=atol)

	out = matvec(A, w, axis=0)
	hostOut = np.empty(out.shape, dtype=np.float32)

	for i in range(hostA.shape[0]):
		np.dot(hostA[i].T.astype(np.float32), hostW[i].astype(np.float32), out=hostOut[i])

	assert np.allclose(out.get(), hostOut.astype(dtype), atol=atol)

	hostA = np.random.normal(scale=16.0, size=(9, 33, 65)).astype(dtype)

	A = GPUArray.toGpu(hostA)

	assert np.allclose(argmax(A, axis=1).get(), np.argmax(hostA, axis=1))
	assert np.allclose(argmax(A, axis=2).get(), np.argmax(hostA, axis=2))

def calcTest(dtype, atol):
	hostA = np.random.randn(128, 500).astype(dtype)
	hostU = np.random.randn(500).astype(dtype)
	hostV = np.random.randn(128).astype(dtype)
	hostW = np.random.randn(125).astype(dtype)

	A = GPUArray.toGpu(hostA)
	u, v, w = GPUArray.toGpu(hostU), GPUArray.toGpu(hostV), GPUArray.toGpu(hostW)

	assert np.allclose(addVecToMat(u, A, axis=1).get(), hostA + hostU[np.newaxis, :], atol=atol)
	assert np.allclose(addVecToMat(v, A, axis=0).get(), hostA + hostV[:, np.newaxis], atol=atol)

	# a vector shorter than the row length is tiled across it (125 * 4 = 500)
	assert np.allclose(addVecToMat(w, A, axis=1).get(), hostA + np.tile(hostW, 4)[np.newaxis, :], atol=atol)

	assert np.allclose(matsum(A, axis=1).get(), np.sum(hostA.astype(np.float32), axis=1).astype(dtype), atol=atol)
	assert np.allclose(matsum(A, axis=0).get(), np.sum(hostA.astype(np.float32), axis=0).astype(dtype), atol=atol)

	out = matvec(A, u, axis=1)
	assert np.allclose(out.get(), np.dot(hostA.astype(np.float32), hostU.astype(np.float32)).astype(dtype), atol=atol)

	out = matvec(A, v, axis=0)
	assert np.allclose(out.get(), np.dot(hostA.T.astype(np.float32), hostV.astype(np.float32)).astype(dtype), atol=atol)

	hostA = 16.0 * np.random.randn(129, 501).astype(dtype)

	A = GPUArray.toGpu(hostA)

	assert np.allclose(argmax(A, axis=1).get(), np.argmax(hostA, axis=1))
	assert np.allclose(argmax(A, axis=0).get(), np.argmax(hostA, axis=0))

def instanceNorm2dTest(dtype, atol):
	batchsize, maps, h, w = 3, 4, 5, 5
	epsilon, norm = 1e-5, h * w

	hostData = np.random.randn(batchsize, maps, h, w).astype(dtype)
	hostScale = np.random.randn(1, maps, 1, 1).astype(np.float32)
	hostBias = np.random.randn(1, maps, 1, 1).astype(np.float32)

	data, scale, bias = GPUArray.toGpu(hostData), GPUArray.toGpu(hostScale.ravel()), GPUArray.toGpu(hostBias.ravel())
	outdata, savemean, saveinvvar, extscale = instanceNorm2d(data, scale, bias, epsilon=epsilon)

	hostExtScale, hostExtBias = np.tile(hostScale, (batchsize, 1, 1, 1)), np.tile(hostBias, (batchsize, 1, 1, 1))

	hostMean = np.mean(hostData, axis=(2, 3), keepdims=True)
	hostInvVar = 1.0 / np.sqrt(np.var(hostData, axis=(2, 3), keepdims=True) + epsilon)

	hostNormData = (hostData - hostMean) * hostInvVar
	hostOutData = hostNormData * hostExtScale + hostExtBias

	assert np.allclose(hostMean.ravel(), savemean.get(), atol=atol)
	assert np.allclose(hostInvVar.ravel(), saveinvvar.get(), atol=atol)
	assert np.allclose(hostOutData, outdata.get(), atol=atol)

	hostGrad = np.random.randn(*outdata.shape).astype(dtype)

	grad = GPUArray.toGpu(hostGrad)
	ingrad, scalegrad, bgrad = instanceNorm2dBackward(grad, data, extscale, savemean, saveinvvar, epsilon=epsilon)

	hostScaleGrad = np.sum(hostGrad * hostNormData, axis=(0, 2, 3), dtype=np.float32, keepdims=True)
	hostBiasGrad = np.sum(hostGrad, axis=(0, 2, 3), dtype=np.float32, keepdims=True)

	hostScGrad = hostGrad * hostExtScale
	hostCorrs = np.empty(hostInvVar.shape, dtype=np.float32)

	for b, c in itertools.product(range(batchsize), range(maps)):
		hostCorrs[b, c] = np.dot(hostScGrad[b, c].ravel(), hostNormData[b, c].ravel()) / norm

	hostInGrad = (hostScGrad - np.mean(hostScGrad, axis=(2, 3), keepdims=True) - hostCorrs * hostNormData) * hostInvVar
	hostInGrad = hostInGrad.astype(dtype)

	assert np.allclose(hostInGrad, ingrad.get(), atol=atol)
	assert np.allclose(hostScaleGrad.ravel(), scalegrad.get(), atol=atol)
	assert np.allclose(hostBiasGrad.ravel(), bgrad.get(), atol=atol)

def batchNorm3dTest(dtype, atol):
	batchsize, maps, d, h, w = 2, 5, 2, 3, 2
	epsilon, norm = 1e-5, batchsize * d * h * w

	hostData = np.random.randn(batchsize, maps, d, h, w).astype(dtype)
	hostScale = np.random.randn(1, maps, 1, 1, 1).astype(np.float32)
	hostBias = np.random.randn(1, maps, 1, 1, 1).astype(np.float32)

	data, scale, bias = GPUArray.toGpu(hostData), GPUArray.toGpu(hostScale.ravel()), GPUArray.toGpu(hostBias.ravel())
	mean, var = GPUArray.zeros(scale.shape, dtype=np.float32), GPUArray.toGpu(np.ones(scale.shape, dtype=np.float32))

	outdata, savemean, saveinvvar = context.batchNormNd(data, mean, var, scale, bias, epsilon=epsilon, out=data)

	hostMean = np.sum(hostData, axis=(0, 2, 3, 4), dtype=np.float32, keepdims=True) / norm

	hostInvVar = np.sum((hostData - hostMean) ** 2, axis=(0, 2, 3, 4), dtype=np.float32, keepdims=True) / norm
	hostInvVar = 1.0 / np.sqrt(hostInvVar + epsilon)

	hostNormData = (hostData - hostMean) * hostInvVar
	hostOutData = (hostNormData * hostScale + hostBias).astype(dtype)

	assert np.allclose(hostMean.ravel(), mean.get(), atol=atol)
	assert np.allclose(hostInvVar.ravel(), saveinvvar.get(), atol=atol)
	assert np.allclose(hostOutData, outdata.get(), atol=atol)

	hostGrad = np.random.randn(*outdata.shape).astype(dtype)

	# the forward pass ran in-place (out=data), so reupload the original input
	grad, data = GPUArray.toGpu(hostGrad), GPUArray.toGpu(hostData)
	ingrad, scalegrad, biasgrad = context.batchNormNdBackward(grad, data, scale, savemean, saveinvvar, epsilon=epsilon)

	hostScaleGrad = np.sum(hostGrad * hostNormData, axis=(0, 2, 3, 4), dtype=np.float32, keepdims=True)
	hostBiasGrad = np.sum(hostGrad, axis=(0, 2, 3, 4), dtype=np.float32, keepdims=True)
	hostMeanGrad = -hostInvVar * hostBiasGrad * hostScale

	hostVarGrad = np.sum(hostGrad * (hostData - hostMean), axis=(0, 2, 3, 4), dtype=np.float32, keepdims=True)
	hostVarGrad = -0.5 * hostVarGrad * hostScale * hostInvVar**3

	hostInGrad = hostGrad * hostScale * hostInvVar + (2 * hostVarGrad * (hostData - hostMean) + hostMeanGrad) / norm
	hostInGrad = hostInGrad.astype(dtype)

	assert np.allclose(hostInGrad, ingrad.get(), atol=atol)
	assert np.allclose(hostScaleGrad.ravel(), scalegrad.get(), atol=atol)
	assert np.allclose(hostBiasGrad.ravel(), biasgrad.get(), atol=atol)

	# inference mode: normalize with the provided running mean and variance
	hostMean = np.random.randn(*hostMean.shape).astype(np.float32)
	hostVar = 1.0 + np.random.randn(*hostInvVar.shape).astype(np.float32)**2

	mean, var = GPUArray.toGpu(hostMean.ravel()), GPUArray.toGpu(hostVar.ravel())
	outdata = context.batchNormNd(data, mean, var, scale, bias, test=True)

	hostOutData = ((hostData - hostMean) / np.sqrt(hostVar + epsilon) * hostScale + hostBias).astype(dtype)
	assert np.allclose(hostOutData, outdata.get(), atol=atol)

def upsample3dSpeedTest():
	from PuzzleLib.Cuda.Benchmarks.Utils import timeKernel

	batchsize, maps, ind, inh, inw = 32, 16, 4, 32, 32
	scale = 2

	data = GPUArray.toGpu(np.random.randn(batchsize, maps, ind, inh, inw).astype(np.float32))

	timeKernel(upsample3d, args=(data, scale, "nearest", memPool), logname="nearest 3d mode")
	timeKernel(upsample3d, args=(data, scale, "linear", memPool), logname="linear 3d mode")

def mapLRN2dTest(dtype, atol):
	batchsize, maps, h, w = 2, 2, 9, 10
	N, alpha, beta, K = 5, 1.0, 0.5, 2.0

	lookBehind = int((N - 1) / 2)
	lookAhead = N - lookBehind

	hostData = np.random.randn(batchsize, maps, h, w).astype(dtype)

	data = GPUArray.toGpu(hostData)
	outdata = context.mapLRN(data, N=N, alpha=alpha, beta=beta, K=K)

	norms = np.empty(hostData.shape, dtype=np.float32)

	for b, c, y, x in itertools.product(range(batchsize), range(maps), range(h), range(w)):
		slcy = slice(max(0, y - lookBehind), min(h, y + lookAhead))
		slcx = slice(max(0, x - lookBehind), min(w, x + lookAhead))

		slc = hostData[b, c, slcy, slcx].ravel()
		norms[b, c, y, x] = K + np.dot(slc, slc) * alpha / N**2

	hostOutData = (hostData / norms**beta).astype(dtype)
	assert np.allclose(hostOutData, outdata.get(), atol=atol)

	hostGrad = np.random.randn(*outdata.shape).astype(dtype)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = context.mapLRNBackward(data, grad, N=N, alpha=alpha, beta=beta, K=K)

	hostInGrad = hostGrad / norms**beta
	k = 2.0 * alpha * beta / N**2

	for b, c, y, x in itertools.product(range(batchsize), range(maps), range(h), range(w)):
		slcy = slice(max(0, y - lookBehind), min(h, y + lookAhead))
		slcx = slice(max(0, x - lookBehind), min(w, x + lookAhead))

		slcdata, slcgrad = hostData[b, c, slcy, slcx].ravel(), hostGrad[b, c, slcy, slcx].ravel()
		slcnorms = norms[b, c, slcy, slcx].ravel()

		hostInGrad[b, c, y, x] -= k * hostData[b, c, y, x] * np.dot(slcgrad, slcdata / slcnorms**(beta + 1))

	hostInGrad = hostInGrad.astype(dtype)
	assert np.allclose(hostInGrad, ingrad.get(), atol=atol)

def scanSumTest():
	hostData = np.random.randint(0, 1000, size=(120, ), dtype=np.uint32)

	outdata = scanSum(GPUArray.toGpu(hostData))

	# scanSum is an exclusive prefix sum: out[0] = 0, out[i] = sum(data[:i])
	hostOutData = np.empty_like(hostData)
	hostOutData[0] = 0
	hostOutData[1:] = np.cumsum(hostData)[:-1]

	assert np.allclose(outdata.get(), hostOutData)

def upsample2dLinearTest():
	batchsize, maps, inh, inw = 3, 2, 4, 4
	hscale, wscale = 2, 3

	hostData = np.random.randn(batchsize, maps, inh, inw).astype(np.float32)

	data = GPUArray.toGpu(hostData)
	outdata = upsample2d(data, (hscale, wscale), mode="linear")

	hostOutData = np.zeros(outdata.shape, dtype=np.float32)
	rh, rw = (inh - 1) / (inh * hscale - 1), (inw - 1) / (inw * wscale - 1)

	for b, c, y, x in itertools.product(range(batchsize), range(maps), range(inh * hscale), range(inw * wscale)):
		iny, inx = int(rh * y), int(rw * x)
		dy, dx = 1.0 - (rh * y - iny), 1.0 - (rw * x - inx)

		yi, xi = 1 if y < inh * hscale - 1 else 0, 1 if x < inw * wscale - 1 else 0

		hostOutData[b, c, y, x] = dy * (dx * hostData[b, c, iny, inx] + (1 - dx) * hostData[b, c, iny, inx + xi]) + \
			(1 - dy) * (dx * hostData[b, c, iny + yi, inx] + (1 - dx) * hostData[b, c, iny + yi, inx + xi])

	assert np.allclose(hostOutData, outdata.get())

	hostGrad = np.random.randn(*outdata.shape).astype(np.float32)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = upsample2dBackward(grad, (hscale, wscale), mode="linear")

	hostInGrad = np.zeros(data.shape, dtype=np.float32)

	for b, c, y, x in itertools.product(range(batchsize), range(maps), range(inh * hscale), range(inw * wscale)):
		iny, inx = int(rh * y), int(rw * x)
		dy, dx = 1.0 - (rh * y - iny), 1.0 - (rw * x - inx)

		yi, xi = 1 if y < inh * hscale - 1 else 0, 1 if x < inw * wscale - 1 else 0

		val = hostGrad[b, c, y, x]

		hostInGrad[b, c, iny, inx] += dy * dx * val
		hostInGrad[b, c, iny, inx + xi] += dy * (1 - dx) * val
		hostInGrad[b, c, iny + yi, inx] += (1 - dy) * dx * val
		hostInGrad[b, c, iny + yi, inx + xi] += (1 - dy) * (1 - dx) * val

	assert np.allclose(hostInGrad, ingrad.get(), atol=1e-5)

def unpoolTest():
	batchsize, maps, h, w = 10, 4, 6, 6
	size, stride, pad = 2, 2, 1

	indata = GPUArray.toGpu(np.random.randn(batchsize, maps, h, w).astype(np.float32))

	pooldata, mask = maxpool2d(indata, [size, size], [stride, stride], [pad, pad])
	unpooldata = maxunpool2d(pooldata, indata.shape, mask)

	hostPoolData = pooldata.get()
	hostMask = mask.get()

	hostUnpoolData = np.zeros(unpooldata.shape, dtype=np.float32)

	for b, c, y, x in itertools.product(
		range(batchsize), range(maps), range(pooldata.shape[2]), range(pooldata.shape[3])
	):
		maxidx = hostMask[b, c, y, x]
		hostUnpoolData[b, c].ravel()[maxidx] = hostPoolData[b, c, y, x]

	assert np.allclose(hostUnpoolData, unpooldata.get())

	hostGrad = np.random.randn(*unpooldata.shape).astype(np.float32)

	grad = GPUArray.toGpu(hostGrad)
	ingrad = maxunpool2dBackward(grad, pooldata.shape, mask)

	hostInGrad = np.empty(ingrad.shape, dtype=np.float32)

	for b, c, y, x in itertools.product(
		range(batchsize), range(maps), range(pooldata.shape[2]), range(pooldata.shape[3])
	):
		maxidx = hostMask[b, c, y, x]
		hostInGrad[b, c, y, x] = hostGrad[b, c].ravel()[maxidx]

	assert np.allclose(hostInGrad, ingrad.get())

def transposeTest(dtype):
	shapes = [(10, ), (10, 3), (10, 3, 5, 4, 2)]

	for shape in shapes:
		for axes in itertools.permutations(range(len(shape))):
			hostData = np.random.randn(*shape).astype(dtype)

			data = GPUArray.toGpu(hostData)
			outdata = context.transpose(data, axes=axes)

			hostOutData = np.transpose(hostData, axes=axes)
			assert np.allclose(hostOutData, outdata.get())

def unittest():
	batchsize, sentlen, embsize = 10, 5, 20
	vocabsize = 1000

	hostInData = np.random.randint(low=-1, high=vocabsize, size=(batchsize, sentlen), dtype=np.int32)
	hostW = np.random.randn(vocabsize, embsize).astype(np.float32)

	indata, W = GPUArray.toGpu(hostInData), GPUArray.toGpu(hostW)
	outdata = embed(indata, W)

	hostOutData = np.zeros(outdata.shape, dtype=np.float32)

	for b in range(batchsize):
		for s in range(sentlen):
			wordidx = int(hostInData[b, s])

			# index -1 marks padding and produces a zero embedding
			if wordidx != -1:
				hostOutData[b, s] = hostW[wordidx]

	assert np.allclose(hostOutData, outdata.get())

	learnRate = 0.1

	hostGrad = np.random.randn(*outdata.shape).astype(np.float32)
	grad = GPUArray.toGpu(hostGrad)

	embedBackwardParams(indata, grad, W, learnRate)

	hostGrad = grad.get()

	for b in range(batchsize):
		for s in range(sentlen):
			wordidx = int(hostInData[b, s])

			if wordidx != -1:
				hostW[wordidx] += learnRate * hostGrad[b, s]

	assert np.allclose(hostW, W.get())

def moveAxisTest(dtype):
	shapes = [(10, ), (10, 3), (10, 3, 5, 4, 2)]

	for shape in shapes:
		for src, dst in itertools.product(range(len(shape)), range(len(shape))):
			hostData = np.random.randn(*shape).astype(dtype)

			data = GPUArray.toGpu(hostData)
			outdata = context.moveaxis(data, src=src, dst=dst)

			hostOutData = np.moveaxis(hostData, source=src, destination=dst)
			assert np.allclose(hostOutData, outdata.get())

def swapAxesTest(dtype):
	shapes = [(10, ), (10, 3), (10, 3, 5, 4, 2)]

	for shape in shapes:
		for axis1, axis2 in itertools.product(range(len(shape)), range(len(shape))):
			hostData = np.random.randn(*shape).astype(dtype)

			data = GPUArray.toGpu(hostData)
			outdata = context.swapaxes(data, axis1=axis1, axis2=axis2)

			hostOutData = np.swapaxes(hostData, axis1=axis1, axis2=axis2)
			assert np.allclose(hostOutData, outdata.get())

def crossEntropyTest():
	hostScores = np.random.randn(20, 10, 3).astype(np.float32)
	hostLabels = np.random.randint(low=0, high=10, size=(20, 3)).astype(np.int32)

	scores, labels = GPUArray.toGpu(hostScores), GPUArray.toGpu(hostLabels)
	error, grad = crossEntropy(scores, labels)

	def softmax(w):
		e = np.exp(w - np.amax(w))
		return e / np.sum(e)

	def hostCrossEntropy(smax, target):
		# move the class axis last, then flatten to (samples, classes)
		nclasses = smax.shape[1]
		smax = np.moveaxis(smax, 1, -1).reshape(-1, nclasses)

		target = target.ravel()
		err = np.sum(np.log(np.array([smax[i, target[i]] for i in range(smax.shape[0])])))

		return -err / target.size

	def hostCrossEntropyGrad(target, smax):
		return np.array([(target == i) - smax[i] for i in range(smax.shape[0])])

	hostSoftmax = np.apply_along_axis(softmax, 1, hostScores)

	hostGrad = np.vstack([
		hostCrossEntropyGrad(hostLabels[i], hostSoftmax[i]) / scores.shape[0] for i in range(scores.shape[0])
	]).reshape(*hostSoftmax.shape)

	assert np.allclose(hostGrad, grad.get())

	hostError = hostCrossEntropy(hostSoftmax, hostLabels)
	assert np.isclose(hostError, error.get() / scores.shape[0])

def depthConcatTest(dtype):
	hostData1 = np.random.randn(3, 4, 3, 3).astype(dtype)
	hostData2 = np.random.randn(3, 2, 6, 6).astype(dtype)
	hostData3 = np.random.randn(3, 5, 4, 4).astype(dtype)

	allHostData = [hostData1, hostData2, hostData3]
	allData = [GPUArray.toGpu(data) for data in allHostData]

	outdata = context.depthConcat(allData)

	depth, h, w = 0, 0, 0
	for data in allHostData:
		depth += data.shape[1]
		h, w = max(h, data.shape[2]), max(w, data.shape[3])

	# inputs are concatenated along the map axis and centered inside the largest spatial size
	hostOutData = np.zeros(shape=(allHostData[0].shape[0], depth, h, w), dtype=dtype)

	hostOutData[:, :4, 1:4, 1:4] = hostData1
	hostOutData[:, 4:6, :, :] = hostData2
	hostOutData[:, 6:, 1:5, 1:5] = hostData3

	assert np.allclose(hostOutData, outdata.get())

	hostGrad = np.random.randn(*hostOutData.shape).astype(dtype)
	grad = GPUArray.toGpu(hostGrad)

	ingrads = context.depthSplit(grad, allData)

	hostInGrads = [hostGrad[:, :4, 1:4, 1:4], hostGrad[:, 4:6, :, :], hostGrad[:, 6:, 1:5, 1:5]]
	assert all(np.allclose(hostInGrad, ingrads[i].get()) for i, hostInGrad in enumerate(hostInGrads))