Esempio n. 1
0
def generateModule(NT, VT):
	NV = NT * VT

	scanSum = scanSumTmpl.substitute(warpSize=warpSize, NT=NT)
	radixSort = radixSortTmpl.substitute(scanSum=scanSum, warpSize=warpSize, NT=NT, VT=VT, NV=NV)
	segmentSeq = segmentSeqTmpl.substitute(radixSort=radixSort, NT=NT, VT=VT, NV=NV)

	return SourceModule(ctcTmpl.substitute(segmentSeq=segmentSeq, NT=NT, VT=VT, NV=NV))
Esempio n. 2
0
	for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < size; index += blockDim.x * gridDim.x)
	{
		int c = (index / inw / inh) % maps;
		int n = index / inw / inh / maps;

		const float *slice = outgrad + (n * maps + c) * outh * outw;
		int maxind = mask[index];

		ingrad[index] = slice[maxind];
	}
}

""")

if device is not None:
    mod = SourceModule(
        poolTmpl.substitute(initVal=str(np.finfo(np.float32).min)))


def maxpool2d(data, size, stride, pad, allocator=memPool):
    assert data.dtype == np.float32
    batchsize, maps, inh, inw = data.shape

    fh, fw = size
    hstride, wstride = stride
    hpad, wpad = pad

    outh = (inh - fh + 2 * hpad) // hstride + 1
    outw = (inw - fw + 2 * wpad) // wstride + 1

    outdata = GPUArray.empty((batchsize, maps, outh, outw),
                             dtype=np.float32,
Esempio n. 3
0
	int idy = blockIdx.y * blockDim.y + threadIdx.y;
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	if (idy >= size || idx >= embsize) return;

	int wordidx = indata[idy];
	if (wordidx == -1) return;

	float gr = scale * outgrad[embsize * idy + idx];
	atomicAdd(&vocabulary[embsize * wordidx + idx], gr);
}

"""

if device is not None:
    mod = SourceModule(embedTmpl)


def embed(data, W, allocator=memPool):
    assert data.dtype == np.int32 and W.dtype == np.float32

    batchsize, sentlen = data.shape
    _, embsize = W.shape

    outdata = GPUArray.zeros((batchsize, sentlen, embsize),
                             dtype=np.float32,
                             allocator=allocator)
    size = batchsize * sentlen

    block = (warpSize, warpSize, 1)
    grid = (roundUpDiv(embsize, warpSize), roundUpDiv(size, warpSize), 1)
Esempio n. 4
0
			indices[j] = shIndices[j];
		}
	}
}

""")


NT, VT = 4 * warpSize, 2
NV = NT * VT


if device is not None:
	scanSum = scanSumTmpl.substitute(warpSize=warpSize, NT=NT)
	scanMod = SourceModule(scanSumTestTmpl.substitute(scanSum=scanSum))

	radixSort = radixSortTmpl.substitute(scanSum=scanSum, warpSize=warpSize, NT=NT, VT=VT, NV=NV)
	radixMod = SourceModule(radixSortTestTmpl.substitute(radixSort=radixSort, NT=NT, VT=VT, NV=NV))

	segmentSeq = segmentSeqTmpl.substitute(radixSort=radixSort, NT=NT, VT=VT, NV=NV)
	segmentMod = SourceModule(segmentTestTmpl.substitute(segmentSeq=segmentSeq, NT=NT, VT=VT, NV=NV))


def scanSum(data):
	assert data.dtype == np.uint32

	length, = data.shape
	assert length <= NT

	outdata = GPUArray.empty(data.shape, dtype=data.dtype, allocator=memPool)
Esempio n. 5
0
	int outh = inh + upad + bpad, outw = inw + lpad + rpad;

	if (index < outh * outw)
	{
		int inindex = 0, outindex = 0;
		map2d(inh, inw, outh, outw, index, upad, lpad, &inindex, &outindex);

		gpuAtomicAdd(&ingrad[inindex], outgrad[outindex]);
	}
}

""")

if device is not None:
    mod = SourceModule(
        "%s%s%s" % (mapTmpl, padTmpl.substitute(
            T=half_t, ext="FP16"), padTmpl.substitute(T=float_t, ext="")))


def reflectpad(data, pad, allocator=memPool):
    if data.ndim == 3:
        batchsize, maps, insize = data.shape
        lpad, rpad = pad

        assert insize >= max(lpad, rpad) + 1
        outsize = insize + lpad + rpad

        block = (warpSize, 1, 1)
        grid = (roundUpDiv(outsize, warpSize), maps, batchsize)

        outdata = GPUArray.empty((batchsize, maps, outsize),
Esempio n. 6
0
			atomicAdd(&ingrad[ibstride+icstride + (d1+d1p) * inh*inw + h1 * inw + w1], dd1 * dh0 * dw0 * val);
			atomicAdd(&ingrad[ibstride+icstride + (d1+d1p) * inh*inw + h1 * inw + w1+w1p], dd1 * dh0 * dw1 * val);
			atomicAdd(&ingrad[ibstride+icstride + (d1+d1p) * inh*inw + (h1+h1p) * inw + w1], dd1 * dh1 * dw0 * val);
			atomicAdd(&ingrad[ibstride+icstride + (d1+d1p) * inh*inw + (h1+h1p) * inw + w1+w1p], dd1 * dh1 * dw1 * val);
		}
	}
}

""")


hblocksize, wblocksize = 4, warpSize


if device is not None:
	nearestMod = SourceModule(upsampleNearestTmpl.substitute(hBlockSize=hblocksize, wBlockSize=wblocksize))
	linearMod = SourceModule(upsampleLinearTmpl.substitute())


def upsample2d(data, scale, mode="nearest", allocator=memPool):
	batchsize, maps, inh, inw = data.shape
	hscale, wscale = (scale, scale) if isinstance(scale, int) else scale

	outh, outw = hscale * inh, wscale * inw
	outdata = GPUArray.empty((batchsize, maps, outh, outw), dtype=data.dtype, allocator=allocator)

	if mode == "nearest":
		block = (wblocksize, hblocksize, 1)
		grid = (roundUpDiv(inw, block[0]), roundUpDiv(inh, block[1]), batchsize * maps)

		nearestMod.upsample2dNearest(
Esempio n. 7
0
{
	for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < size; index += blockDim.x * gridDim.x)
	{
		slopegrad[index] = outgrad[index] * indata[index] * (indata[index] <= 0.0f);

		for (int b = 1; b < batchsize; b++)
			slopegrad[index] += outgrad[index + b * stride] * indata[index + b * stride] *
								(indata[index + b * stride] <= 0.0f);
	}
}

"""


if device is not None:
	mod = SourceModule(preluTmpl)


def prelu(data, slopes, inplace=False, sharedMaps=False, allocator=memPool):
	assert data.dtype == slopes.dtype and slopes.dtype == np.float32
	assert slopes.shape == (1, ) if sharedMaps else data.shape[1] == slopes.shape[0]

	outdata = data if inplace else GPUArray.empty(data.shape, dtype=np.float32, allocator=allocator)

	mapsize = prod(data.shape[2:])
	size = prod(data.shape)

	block = (nthreads, 1, 1)
	grid = (roundUpDiv(size, nthreads), 1, 1)

	divFactor = data.shape[1] if sharedMaps else 1
Esempio n. 8
0
	if (tidy < n && tidx < m)
		out[offset] = (float)mat[offset] $op (float)vec[tidz * m + tidx % p];
}

""")

NT = warpSize * 8

if device is not None:
    maxmod = SourceModule(
        "#include <cuda_fp16.h>\n\n%s%s" %
        (minMaxTmpl.substitute(warpSize=warpSize,
                               NT=NT,
                               initVal=np.finfo(np.float32).min,
                               cmpOp=">",
                               T=half_t,
                               ext="FP16"),
         minMaxTmpl.substitute(warpSize=warpSize,
                               NT=NT,
                               initVal=np.finfo(np.float32).min,
                               cmpOp=">",
                               T=float_t,
                               ext="")))

    minmod = SourceModule(
        "#include <cuda_fp16.h>\n\n%s%s" %
        (minMaxTmpl.substitute(warpSize=warpSize,
                               NT=NT,
                               initVal=np.finfo(np.float32).max,
                               cmpOp="<",
                               T=half_t,
                               ext="FP16"),
Esempio n. 9
0
		int label = labels[b * spatialDim + m];
		float weight = weights[c];
		grad[index] = weight * ((c == label) - score) / numSamples;

		if (c == label)
		{
			float error = -weight * log(score) / spatialDim;
			atomicAdd(totalError, error);
		}
	}
}

""")

if device is not None:
    ceMod = SourceModule(costLblTmpl.substitute(logic=crossEntropyLogic))
    wceMod = SourceModule(wceTmpl.substitute())

    svmL1Mod = SourceModule(costLblTmpl.substitute(logic=svmL1Logic))
    svmL2Mod = SourceModule(costLblTmpl.substitute(logic=svmL2Logic))


def crossEntropy(scores, labels, weights=None, error=None, allocator=memPool):
    assert scores.dtype == np.float32 and labels.dtype == np.int32

    shape = scores.shape
    if scores.ndim < 4:
        scores = scores.reshape(*shape, *(1 for _ in range(4 - scores.ndim)))

    softmax = cudnn.softmaxNd(scores,
                              mode=SoftMaxMode.spatial.value,