def generateModule(NT, VT):
	NV = NT * VT

	scanSum = scanSumTmpl.substitute(warpSize=warpSize, NT=NT)
	radixSort = radixSortTmpl.substitute(scanSum=scanSum, warpSize=warpSize, NT=NT, VT=VT, NV=NV)
	segmentSeq = segmentSeqTmpl.substitute(radixSort=radixSort, NT=NT, VT=VT, NV=NV)

	return SourceModule(ctcTmpl.substitute(segmentSeq=segmentSeq, NT=NT, VT=VT, NV=NV))
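# Usage sketch (an assumption, not part of the original file): the CTC module can presumably be
# specialized the same way the test code further down configures its tiles, e.g.
#
#   if device is not None:
#       mod = generateModule(NT=4 * warpSize, VT=2)
#
# where NT appears to be the thread count per block and VT the values handled per thread,
# so each block processes NV = NT * VT elements.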
	for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < size; index += blockDim.x * gridDim.x)
	{
		int c = (index / inw / inh) % maps;
		int n = index / inw / inh / maps;

		const float *slice = outgrad + (n * maps + c) * outh * outw;

		int maxind = mask[index];
		ingrad[index] = slice[maxind];
	}
}
""")


if device is not None:
	mod = SourceModule(poolTmpl.substitute(initVal=str(np.finfo(np.float32).min)))


def maxpool2d(data, size, stride, pad, allocator=memPool):
	assert data.dtype == np.float32

	batchsize, maps, inh, inw = data.shape
	fh, fw = size
	hstride, wstride = stride
	hpad, wpad = pad

	outh = (inh - fh + 2 * hpad) // hstride + 1
	outw = (inw - fw + 2 * wpad) // wstride + 1

	outdata = GPUArray.empty((batchsize, maps, outh, outw), dtype=np.float32,
	int idy = blockIdx.y * blockDim.y + threadIdx.y;
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	if (idy >= size || idx >= embsize) return;

	int wordidx = indata[idy];
	if (wordidx == -1) return;

	float gr = scale * outgrad[embsize * idy + idx];
	atomicAdd(&vocabulary[embsize * wordidx + idx], gr);
}
"""


if device is not None:
	mod = SourceModule(embedTmpl)


def embed(data, W, allocator=memPool):
	assert data.dtype == np.int32 and W.dtype == np.float32

	batchsize, sentlen = data.shape
	_, embsize = W.shape

	outdata = GPUArray.zeros((batchsize, sentlen, embsize), dtype=np.float32, allocator=allocator)

	size = batchsize * sentlen

	block = (warpSize, warpSize, 1)
	grid = (roundUpDiv(embsize, warpSize), roundUpDiv(size, warpSize), 1)
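	# Launch layout sketch (inferred from the bounds checks in the kernel above): one thread per
	# (token, embedding column) pair, with the y dimension covering the batchsize * sentlen
	# flattened tokens and the x dimension covering the embsize columns of W.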
			indices[j] = shIndices[j];
		}
	}
}
""")


NT, VT = 4 * warpSize, 2
NV = NT * VT

if device is not None:
	scanSum = scanSumTmpl.substitute(warpSize=warpSize, NT=NT)
	scanMod = SourceModule(scanSumTestTmpl.substitute(scanSum=scanSum))

	radixSort = radixSortTmpl.substitute(scanSum=scanSum, warpSize=warpSize, NT=NT, VT=VT, NV=NV)
	radixMod = SourceModule(radixSortTestTmpl.substitute(radixSort=radixSort, NT=NT, VT=VT, NV=NV))

	segmentSeq = segmentSeqTmpl.substitute(radixSort=radixSort, NT=NT, VT=VT, NV=NV)
	segmentMod = SourceModule(segmentTestTmpl.substitute(segmentSeq=segmentSeq, NT=NT, VT=VT, NV=NV))


def scanSum(data):
	assert data.dtype == np.uint32

	length, = data.shape
	assert length <= NT

	outdata = GPUArray.empty(data.shape, dtype=data.dtype, allocator=memPool)
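	# Note (inference, not from the original source): the generated scan kernel is templated on NT
	# and, judging by the assert above, this helper only drives a single block of NT threads,
	# so it handles at most NT input elements per call.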
	int outh = inh + upad + bpad, outw = inw + lpad + rpad;

	if (index < outh * outw)
	{
		int inindex = 0, outindex = 0;
		map2d(inh, inw, outh, outw, index, upad, lpad, &inindex, &outindex);

		gpuAtomicAdd(&ingrad[inindex], outgrad[outindex]);
	}
}
""")


if device is not None:
	mod = SourceModule(
		"%s%s%s" % (mapTmpl, padTmpl.substitute(T=half_t, ext="FP16"), padTmpl.substitute(T=float_t, ext=""))
	)


def reflectpad(data, pad, allocator=memPool):
	if data.ndim == 3:
		batchsize, maps, insize = data.shape
		lpad, rpad = pad
		assert insize >= max(lpad, rpad) + 1

		outsize = insize + lpad + rpad

		block = (warpSize, 1, 1)
		grid = (roundUpDiv(outsize, warpSize), maps, batchsize)

		outdata = GPUArray.empty((batchsize, maps, outsize),
			atomicAdd(&ingrad[ibstride+icstride + (d1+d1p) * inh*inw + h1 * inw + w1], dd1 * dh0 * dw0 * val);
			atomicAdd(&ingrad[ibstride+icstride + (d1+d1p) * inh*inw + h1 * inw + w1+w1p], dd1 * dh0 * dw1 * val);
			atomicAdd(&ingrad[ibstride+icstride + (d1+d1p) * inh*inw + (h1+h1p) * inw + w1], dd1 * dh1 * dw0 * val);
			atomicAdd(&ingrad[ibstride+icstride + (d1+d1p) * inh*inw + (h1+h1p) * inw + w1+w1p], dd1 * dh1 * dw1 * val);
		}
	}
}
""")


hblocksize, wblocksize = 4, warpSize

if device is not None:
	nearestMod = SourceModule(upsampleNearestTmpl.substitute(hBlockSize=hblocksize, wBlockSize=wblocksize))
	linearMod = SourceModule(upsampleLinearTmpl.substitute())


def upsample2d(data, scale, mode="nearest", allocator=memPool):
	batchsize, maps, inh, inw = data.shape

	hscale, wscale = (scale, scale) if isinstance(scale, int) else scale
	outh, outw = hscale * inh, wscale * inw

	outdata = GPUArray.empty((batchsize, maps, outh, outw), dtype=data.dtype, allocator=allocator)

	if mode == "nearest":
		block = (wblocksize, hblocksize, 1)
		grid = (roundUpDiv(inw, block[0]), roundUpDiv(inh, block[1]), batchsize * maps)

		nearestMod.upsample2dNearest(
{
	for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < size; index += blockDim.x * gridDim.x)
	{
		slopegrad[index] = outgrad[index] * indata[index] * (indata[index] <= 0.0f);

		for (int b = 1; b < batchsize; b++)
			slopegrad[index] += outgrad[index + b * stride] * indata[index + b * stride] * (indata[index + b * stride] <= 0.0f);
	}
}
"""


if device is not None:
	mod = SourceModule(preluTmpl)


def prelu(data, slopes, inplace=False, sharedMaps=False, allocator=memPool):
	assert data.dtype == slopes.dtype and slopes.dtype == np.float32
	assert slopes.shape == (1, ) if sharedMaps else data.shape[1] == slopes.shape[0]

	outdata = data if inplace else GPUArray.empty(data.shape, dtype=np.float32, allocator=allocator)

	mapsize = prod(data.shape[2:])
	size = prod(data.shape)

	block = (nthreads, 1, 1)
	grid = (roundUpDiv(size, nthreads), 1, 1)

	divFactor = data.shape[1] if sharedMaps else 1
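	# divFactor is presumably consumed by the kernel when mapping a flat element index to its slope:
	# with sharedMaps=True the per-map index is divided by the number of maps, collapsing every
	# element onto the single shared slope, while with per-map slopes it stays 1 and has no effect.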
	if (tidy < n && tidx < m)
		out[offset] = (float)mat[offset] $op (float)vec[tidz * m + tidx % p];
}
""")


NT = warpSize * 8

if device is not None:
	maxmod = SourceModule("#include <cuda_fp16.h>\n\n%s%s" % (
		minMaxTmpl.substitute(warpSize=warpSize, NT=NT, initVal=np.finfo(np.float32).min, cmpOp=">", T=half_t, ext="FP16"),
		minMaxTmpl.substitute(warpSize=warpSize, NT=NT, initVal=np.finfo(np.float32).min, cmpOp=">", T=float_t, ext="")
	))

	minmod = SourceModule("#include <cuda_fp16.h>\n\n%s%s" % (
		minMaxTmpl.substitute(warpSize=warpSize, NT=NT, initVal=np.finfo(np.float32).max, cmpOp="<", T=half_t, ext="FP16"),
		int label = labels[b * spatialDim + m];
		float weight = weights[c];

		grad[index] = weight * ((c == label) - score) / numSamples;

		if (c == label)
		{
			float error = -weight * log(score) / spatialDim;
			atomicAdd(totalError, error);
		}
	}
}
""")


if device is not None:
	ceMod = SourceModule(costLblTmpl.substitute(logic=crossEntropyLogic))
	wceMod = SourceModule(wceTmpl.substitute())

	svmL1Mod = SourceModule(costLblTmpl.substitute(logic=svmL1Logic))
	svmL2Mod = SourceModule(costLblTmpl.substitute(logic=svmL2Logic))


def crossEntropy(scores, labels, weights=None, error=None, allocator=memPool):
	assert scores.dtype == np.float32 and labels.dtype == np.int32
	shape = scores.shape

	if scores.ndim < 4:
		scores = scores.reshape(*shape, *(1 for _ in range(4 - scores.ndim)))

	softmax = cudnn.softmaxNd(scores, mode=SoftMaxMode.spatial.value,