Ejemplo n.º 1
0
def svm(scores, labels, mode, error=None, allocator=memPool):
    """Launch the SVM cost kernel selected by ``mode``.

    Args:
        scores: float32 GPU array. The sizes of axis 0 and axis 1 are passed
            to the kernel; all axes from 2 onwards are folded into a single
            extent.
        labels: int32 GPU array, handed to the kernel unchanged.
        mode: ``"l1"`` picks ``svmL1Mod``, ``"l2"`` picks ``svmL2Mod``. Any
            other key raises ``KeyError``.
        error: optional array receiving the error value; a 0-d float32 array
            is allocated when omitted. It is zero-filled before the launch
            in both cases.
        allocator: allocator used for every array created here.

    Returns:
        Tuple ``(error, grad)`` where ``grad`` is a freshly allocated float32
        array with the same shape as ``scores``.
    """
    assert scores.dtype == np.float32 and labels.dtype == np.int32
    dims = scores.shape

    grad = GPUArray.empty(dims, dtype=np.float32, allocator=allocator)
    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    # NOTE(review): presumably the kernel accumulates into `error`, which is
    # why a caller-supplied buffer is reset as well -- confirm in the kernel.
    error.fill(0.0)

    total = prod(dims)
    spatial = prod(dims[2:])
    sampleStride = spatial * dims[1]

    # One thread per element of `scores`.
    launch = {
        "block": (nthreads, 1, 1),
        "grid": (roundUpDiv(total, nthreads), 1, 1),
    }

    kernels = {"l1": svmL1Mod, "l2": svmL2Mod}
    kernels[mode].cost(
        scores, labels,
        np.int32(total), np.int32(sampleStride), np.int32(spatial),
        np.int32(dims[1]), np.int32(dims[0]),
        error, grad,
        **launch
    )

    return error, grad
Ejemplo n.º 2
0
def matvec(mat, vec, axis=0, out=None, alpha=1.0, beta=0.0, allocator=memPool):
    """Launch a matrix-by-vector kernel over the last two axes of ``mat``.

    Args:
        mat: float32 or float16 GPU array; its last two axes are read as
            ``(h, w)`` and all leading axes are flattened into the grid's
            z dimension.
        vec: GPU array of the same dtype with exactly one axis fewer.
        axis: 1 uses the ``vecMulOnRow`` kernels and produces an array shaped
            ``mat.shape[:-1]``; 0 uses the ``vecMulOnCol`` kernels and
            produces ``mat.shape[:-2] + (w,)``.
        out: optional destination; must already have the result shape. A
            zero-initialised array is allocated when omitted.
        alpha, beta: scalars forwarded to the kernel as float32.
            NOTE(review): presumably ``out = alpha * product + beta * out``
            -- confirm against the kernel source.
        allocator: allocator used when ``out`` has to be created.

    Returns:
        The array the kernel wrote into.
    """
    assert vec.dtype == mat.dtype and (mat.dtype == np.float32
                                       or mat.dtype == np.float16)
    assert vec.ndim == mat.ndim - 1 and 0 <= axis < 2

    rows, cols = mat.shape[-2:]
    batch = prod(mat.shape[:-2])
    isFP32 = mat.dtype == np.float32

    if axis == 1:
        assert mat.dimAt(-1) == vec.dimAt(-1)

        # One warp-sized block per matrix row.
        block = (warpSize, 1, 1)
        grid = (rows, 1, batch)
        outshape = mat.shape[:-1]
        kernel = mulmod.vecMulOnRow if isFP32 else mulmod.vecMulOnRowFP16
    else:
        # Blocks of NT threads tile the columns.
        block = (NT, 1, 1)
        grid = (roundUpDiv(cols, NT), 1, batch)
        outshape = mat.shape[:-2] + (cols, )
        kernel = mulmod.vecMulOnCol if isFP32 else mulmod.vecMulOnColFP16

    if out is None:
        out = GPUArray.zeros(outshape, dtype=mat.dtype, allocator=allocator)
    else:
        assert out.shape == outshape

    kernel(out, mat, vec,
           np.int32(cols), np.int32(rows),
           np.float32(alpha), np.float32(beta),
           block=block, grid=grid)

    return out
Ejemplo n.º 3
0
def crossEntropy(scores, labels, weights=None, error=None, allocator=memPool):
    """Run softmax on ``scores`` and launch the cross-entropy cost kernel.

    Args:
        scores: float32 GPU array. Arrays with fewer than four axes are
            reshaped with trailing singleton axes up to rank 4 before the
            softmax call.
        labels: int32 GPU array, handed to the kernel unchanged.
        weights: optional array; when given, ``wceMod.cost`` is launched with
            it as an extra argument, otherwise ``ceMod.cost`` is used.
        error: optional array receiving the error value; a 0-d float32 array
            is allocated when omitted. It is zero-filled before the launch
            in both cases.
        allocator: allocator used for every array created here.

    Returns:
        Tuple ``(error, grad)``. ``grad`` is a new float32 array carrying the
        shape ``scores`` had on entry (before any padding to rank 4).
    """
    assert scores.dtype == np.float32 and labels.dtype == np.int32

    shape = scores.shape
    missingAxes = 4 - scores.ndim
    if missingAxes > 0:
        scores = scores.reshape(*shape, *([1] * missingAxes))

    softmax = cudnn.softmaxNd(scores,
                              mode=SoftMaxMode.spatial.value,
                              allocator=allocator)

    grad = GPUArray.empty(shape, dtype=np.float32, allocator=allocator)
    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    error.fill(0.0)

    total = prod(scores.shape)
    spatial = prod(scores.shape[2:])
    sampleStride = spatial * scores.shape[1]

    # One thread per element of the (possibly padded) scores array.
    launch = {
        "block": (nthreads, 1, 1),
        "grid": (roundUpDiv(total, nthreads), 1, 1),
    }
    extents = (np.int32(total), np.int32(sampleStride), np.int32(spatial))

    if weights is None:
        ceMod.cost(softmax, labels,
                   *extents,
                   np.int32(scores.shape[1]), np.int32(scores.shape[0]),
                   error, grad,
                   **launch)
    else:
        # This branch reads the axis sizes from the shape captured on entry.
        wceMod.cost(softmax, labels, weights,
                    *extents,
                    np.int32(shape[1]), np.int32(shape[0]),
                    error, grad,
                    **launch)

    return error, grad
Ejemplo n.º 4
0
def matsum(tensor, axis=0, out=None, alpha=1.0, beta=0.0, allocator=memPool):
    """Launch a sum kernel that removes ``axis`` from ``tensor``.

    Args:
        tensor: float32 or float16 GPU array.
        axis: axis to reduce, ``0 <= axis < tensor.ndim``. The last axis is
            handled by the ``sumOnRow`` kernels, any other axis by the
            ``sumOnCol`` kernels.
        out: optional destination; must have the shape of ``tensor`` with
            ``axis`` removed. A zero-initialised array is allocated when
            omitted.
        alpha, beta: scalars forwarded to the kernel as float32.
            NOTE(review): presumably ``out = alpha * sum + beta * out`` --
            confirm against the kernel source.
        allocator: allocator used when ``out`` has to be created.

    Returns:
        The array the kernel wrote into.
    """
    assert tensor.dtype == np.float32 or tensor.dtype == np.float16
    assert 0 <= axis < tensor.ndim

    isFP32 = tensor.dtype == np.float32

    if axis == tensor.ndim - 1:
        outshape = tensor.shape[:-1]

        # One warp-sized block per row of the flattened leading axes.
        block = (warpSize, 1, 1)
        grid = (prod(outshape), 1, 1)

        kernel = summod.sumOnRow if isFP32 else summod.sumOnRowFP16
        extents = (np.int32(tensor.dimAt(-1)), )
    else:
        leading, trailing = tensor.shape[:axis], tensor.shape[axis + 1:]
        depth, width = prod(leading), prod(trailing)
        outshape = leading + trailing

        # Blocks of NT threads tile the flattened trailing axes; the
        # flattened leading axes go to the grid's z dimension.
        block = (NT, 1, 1)
        grid = (roundUpDiv(width, NT), 1, depth)

        kernel = summod.sumOnCol if isFP32 else summod.sumOnColFP16
        extents = (np.int32(width), np.int32(tensor.dimAt(axis)))

    if out is None:
        out = GPUArray.zeros(outshape, dtype=tensor.dtype, allocator=allocator)
    else:
        assert out.shape == outshape

    kernel(out, tensor,
           *extents,
           np.float32(alpha), np.float32(beta),
           block=block, grid=grid)

    return out
Ejemplo n.º 5
0
def maxunpool2dBackward(grad, poolshape, mask, allocator=memPool):
    """Launch the backward kernel of 2D max-unpooling.

    Args:
        grad: float32 GPU array unpacked as ``(batch, maps, outh, outw)``.
        poolshape: sequence whose items 2 and 3 give the height and width of
            the returned gradient.
        mask: int32 GPU array, handed to the kernel unchanged.
        allocator: allocator used for the returned array.

    Returns:
        New float32 array of shape ``(batch, maps, poolshape[2], poolshape[3])``.
    """
    assert grad.dtype == np.float32 and mask.dtype == np.int32
    batchsize, maps, outh, outw = grad.shape

    inh = poolshape[2]
    inw = poolshape[3]

    ingrad = GPUArray.empty((batchsize, maps, inh, inw),
                            dtype=np.float32,
                            allocator=allocator)

    # One thread per element of the result.
    total = prod(ingrad.shape)
    launch = {
        "block": (nthreads, 1, 1),
        "grid": (roundUpDiv(total, nthreads), 1, 1),
    }

    mod.maxunpool2dBackward(
        ingrad, grad, mask,
        np.int32(inh), np.int32(inw),
        np.int32(outh), np.int32(outw),
        np.int32(maps), np.int32(total),
        **launch
    )

    return ingrad
Ejemplo n.º 6
0
def maxunpool2d(data, origshape, mask, allocator=memPool):
    """Launch the forward kernel of 2D max-unpooling.

    Args:
        data: float32 GPU array unpacked as ``(batch, maps, inh, inw)``.
        origshape: sequence whose items 2 and 3 give the height and width of
            the returned array.
        mask: int32 GPU array, handed to the kernel unchanged.
            NOTE(review): presumably the index mask produced by the matching
            max-pooling call -- confirm against callers.
        allocator: allocator used for the returned array.

    Returns:
        New float32 array of shape ``(batch, maps, origshape[2], origshape[3])``.
        It is zero-initialised before the kernel runs.
    """
    # The mask dtype is checked here just as the backward wrapper does, so a
    # wrongly typed mask fails loudly instead of being reinterpreted on the
    # device.
    assert data.dtype == np.float32 and mask.dtype == np.int32
    batchsize, maps, inh, inw = data.shape

    outh, outw = origshape[2], origshape[3]
    outdata = GPUArray.zeros((batchsize, maps, outh, outw),
                             dtype=np.float32,
                             allocator=allocator)

    # One thread per element of the input.
    size = prod(data.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    mod.maxunpool2d(outdata,
                    data,
                    mask,
                    np.int32(inh),
                    np.int32(inw),
                    np.int32(outh),
                    np.int32(outw),
                    np.int32(maps),
                    np.int32(size),
                    block=block,
                    grid=grid)

    return outdata
Ejemplo n.º 7
0
def preluBackwardParams(indata, outgrad, sharedMaps=False, allocator=memPool):
	"""Launch the PReLU parameter-gradient kernel and sum its output.

	Args:
		indata: float32 GPU array with the same shape as ``outgrad``.
		outgrad: float32 GPU array; axis 0 is passed to the kernel as a
			separate extent and the remaining axes are flattened.
		sharedMaps: when true the per-element result is summed into a single
			value, otherwise into one value per entry of axis 1 of
			``outgrad``.
		allocator: allocator used for the intermediate buffer and for the
			array returned by ``matsum``.

	Returns:
		The array produced by ``matsum`` over the reshaped intermediate
		buffer: shape ``(1,)`` when ``sharedMaps`` is true, otherwise
		``(outgrad.shape[1],)``.
	"""
	assert indata.dtype == outgrad.dtype and outgrad.dtype == np.float32
	assert indata.shape == outgrad.shape

	# The kernel takes the flattened per-sample element count twice, once as
	# the stride and once as the total size.
	size = prod(outgrad.shape[1:])
	stride = size

	block = (nthreads, 1, 1)
	grid = (roundUpDiv(size, nthreads), 1, 1)

	slopegrad = GPUArray.empty(outgrad.shape[1:], dtype=np.float32, allocator=allocator)

	mod.preluBackwardParams(
		slopegrad, outgrad, indata, np.int32(outgrad.shape[0]), np.int32(stride), np.int32(size),
		block=block, grid=grid
	)

	shape = (1, prod(slopegrad.shape)) if sharedMaps else (slopegrad.shape[0], prod(slopegrad.shape[1:]))

	# Forward the caller's allocator so the result is not silently taken
	# from the default pool.
	return matsum(slopegrad.reshape(shape), axis=1, allocator=allocator)
Ejemplo n.º 8
0
def prelu(data, slopes, inplace=False, sharedMaps=False, allocator=memPool):
	"""Launch the PReLU forward kernel.

	Args:
		data: float32 GPU array; axis 1 is compared against ``slopes`` and
			axes from 2 onwards are flattened into one extent.
		slopes: float32 GPU array. Must have shape ``(1,)`` when
			``sharedMaps`` is true, otherwise its first axis must equal
			``data.shape[1]``.
		inplace: when true the kernel writes back into ``data``.
		sharedMaps: selects the slope layout described above.
		allocator: allocator used for the output when not running in place.

	Returns:
		``data`` itself when ``inplace`` is true, otherwise a new float32
		array of the same shape.
	"""
	assert data.dtype == slopes.dtype and slopes.dtype == np.float32
	if sharedMaps:
		assert slopes.shape == (1, )
	else:
		assert data.shape[1] == slopes.shape[0]

	if inplace:
		outdata = data
	else:
		outdata = GPUArray.empty(data.shape, dtype=np.float32, allocator=allocator)

	maps = data.shape[1]
	mapsize = prod(data.shape[2:])
	total = prod(data.shape)

	# NOTE(review): presumably the kernel divides the map index by this
	# factor so that every map resolves to slope 0 when slopes are shared --
	# confirm in the kernel source.
	divFactor = maps if sharedMaps else 1

	mod.prelu(
		outdata, data, slopes,
		np.int32(divFactor), np.int32(mapsize), np.int32(maps), np.int32(total),
		block=(nthreads, 1, 1),
		grid=(roundUpDiv(total, nthreads), 1, 1)
	)

	return outdata
Ejemplo n.º 9
0
def preluBackwardData(grad, slopes, indata, sharedMaps=False, allocator=memPool):
	"""Launch the PReLU data-gradient kernel.

	Args:
		grad: float32 GPU array with the same shape as ``indata``.
		slopes: float32 GPU array. Must have shape ``(1,)`` when
			``sharedMaps`` is true, otherwise its first axis must equal
			``grad.shape[1]``.
		indata: float32 GPU array, handed to the kernel unchanged.
		sharedMaps: selects the slope layout described above.
		allocator: allocator used for the returned array.

	Returns:
		New float32 array with the shape of ``grad``.
	"""
	sameDtype = grad.dtype == slopes.dtype and slopes.dtype == indata.dtype
	assert sameDtype and indata.dtype == np.float32
	assert grad.shape == indata.shape
	if sharedMaps:
		assert slopes.shape == (1, )
	else:
		assert grad.shape[1] == slopes.shape[0]

	ingrad = GPUArray.empty(grad.shape, dtype=np.float32, allocator=allocator)

	maps = grad.shape[1]
	mapsize = prod(grad.shape[2:])
	total = prod(grad.shape)

	# NOTE(review): presumably the kernel divides the map index by this
	# factor so that every map resolves to slope 0 when slopes are shared --
	# confirm in the kernel source.
	divFactor = maps if sharedMaps else 1

	mod.preluBackwardData(
		ingrad, grad, slopes, indata,
		np.int32(divFactor), np.int32(mapsize), np.int32(maps), np.int32(total),
		block=(nthreads, 1, 1),
		grid=(roundUpDiv(total, nthreads), 1, 1)
	)

	return ingrad
Ejemplo n.º 10
0
def argminmax(tensor, axis, mode, allocator):
    """Launch an index-of-extremum kernel along ``axis``.

    Args:
        tensor: float32 or float16 GPU array.
        axis: axis to reduce, ``0 <= axis < tensor.ndim``. The last axis is
            handled by the ``minMaxOnRow`` kernels, any other axis by the
            ``minMaxOnCol`` kernels.
        mode: ``"max"`` picks ``maxmod``, ``"min"`` picks ``minmod``. Any
            other key raises ``KeyError``.
        allocator: allocator used for the returned array.

    Returns:
        New int32 array with the shape of ``tensor`` minus ``axis``.
    """
    assert tensor.dtype == np.float32 or tensor.dtype == np.float16
    assert 0 <= axis < tensor.ndim

    kernels = {"max": maxmod, "min": minmod}[mode]
    isFP32 = tensor.dtype == np.float32

    if axis == tensor.ndim - 1:
        idxshape = tensor.shape[:-1]

        # One warp-sized block per row of the flattened leading axes.
        block = (warpSize, 1, 1)
        grid = (prod(idxshape), 1, 1)
    else:
        leading, trailing = tensor.shape[:axis], tensor.shape[axis + 1:]
        depth, width = prod(leading), prod(trailing)
        idxshape = leading + trailing

        # Blocks of NT threads tile the flattened trailing axes; the
        # flattened leading axes go to the grid's z dimension.
        block = (NT, 1, 1)
        grid = (roundUpDiv(width, NT), 1, depth)

    idx = GPUArray.empty(idxshape, dtype=np.int32, allocator=allocator)

    if axis == tensor.ndim - 1:
        kernel = kernels.minMaxOnRow if isFP32 else kernels.minMaxOnRowFP16
        kernel(idx, tensor, np.int32(tensor.dimAt(-1)), block=block, grid=grid)
    else:
        kernel = kernels.minMaxOnCol if isFP32 else kernels.minMaxOnColFP16
        kernel(idx, tensor,
               np.int32(width), np.int32(tensor.dimAt(axis)),
               block=block, grid=grid)

    return idx
Ejemplo n.º 11
0
def getRnnParam(rnn, W, layer, linLayer, Wshape):
    """Build weight and bias arrays that wrap slices of ``W``'s GPU buffer.

    Args:
        rnn: object whose ``getParam(W, layer, linLayer)`` returns two
            ``(offset, size)`` pairs, the first for the weights and the
            second for the bias.
        W: GPU array supplying the dtype and the underlying ``gpudata``.
        layer, linLayer: forwarded to ``rnn.getParam`` unchanged.
        Wshape: shape for the weight array; its element count must equal the
            weight size reported by ``rnn.getParam``.

    Returns:
        Tuple ``(w, bias)`` of ``GPUArray`` objects constructed over slices
        of ``W.gpudata``; ``bias`` is one-dimensional.
    """
    (Woffset, wsize), (biasOffset, biasSize) = rnn.getParam(W, layer, linLayer)
    assert prod(Wshape) == wsize

    dtype, gpudata = W.dtype, W.gpudata
    itemsize = dtype.itemsize

    # Slice ends are the reported offsets plus the byte length of each
    # region (element count times item size).
    Wend = Woffset + wsize * itemsize
    biasEnd = biasOffset + biasSize * itemsize

    w = GPUArray(Wshape, dtype=dtype, gpudata=gpudata[Woffset:Wend])
    bias = GPUArray((biasSize, ), dtype=dtype, gpudata=gpudata[biasOffset:biasEnd])

    return w, bias
Ejemplo n.º 12
0
def maxpool2dBackward(grad,
                      origshape,
                      mask,
                      size,
                      stride,
                      pad,
                      allocator=memPool):
    """Launch the backward kernel of 2D max-pooling.

    Args:
        grad: float32 GPU array unpacked as ``(batch, maps, outh, outw)``.
        origshape: sequence whose items 2 and 3 give the height and width of
            the returned gradient.
        mask: int32 GPU array, handed to the kernel unchanged.
        size: pair ``(fh, fw)`` with the pooling window extents.
        stride: pair ``(hstride, wstride)``.
        pad: pair ``(hpad, wpad)``.
        allocator: allocator used for the returned array.

    Returns:
        New float32 array of shape ``(batch, maps, origshape[2], origshape[3])``.
    """
    assert grad.dtype == np.float32 and mask.dtype == np.int32
    batchsize, maps, outh, outw = grad.shape

    fh, fw = size
    hstride, wstride = stride
    hpad, wpad = pad

    inh = origshape[2]
    inw = origshape[3]

    ingrad = GPUArray.empty((batchsize, maps, inh, inw),
                            dtype=np.float32,
                            allocator=allocator)

    # One thread per element of the result.
    total = prod(ingrad.shape)
    launch = {
        "block": (nthreads, 1, 1),
        "grid": (roundUpDiv(total, nthreads), 1, 1),
    }

    mod.maxpool2dBackward(
        ingrad, grad, mask,
        np.int32(inh), np.int32(inw),
        np.int32(outh), np.int32(outw),
        np.int32(maps),
        np.int32(hstride), np.int32(wstride),
        np.int32(hpad), np.int32(wpad),
        np.int32(fh), np.int32(fw),
        np.int32(total),
        **launch
    )

    return ingrad
Ejemplo n.º 13
0
def maxpool2d(data, size, stride, pad, allocator=memPool):
    """Launch the forward kernel of 2D max-pooling.

    Args:
        data: float32 GPU array unpacked as ``(batch, maps, inh, inw)``.
        size: pair ``(fh, fw)`` with the pooling window extents.
        stride: pair ``(hstride, wstride)``.
        pad: pair ``(hpad, wpad)``.
        allocator: allocator used for both returned arrays.

    Returns:
        Tuple ``(outdata, mask)``: a float32 array and an int32 array, both
        of shape ``(batch, maps, outh, outw)`` where
        ``outh = (inh - fh + 2 * hpad) // hstride + 1`` and ``outw`` is
        computed the same way from the width values.
    """
    assert data.dtype == np.float32
    batchsize, maps, inh, inw = data.shape

    fh, fw = size
    hstride, wstride = stride
    hpad, wpad = pad

    outh = (inh - fh + 2 * hpad) // hstride + 1
    outw = (inw - fw + 2 * wpad) // wstride + 1
    outshape = (batchsize, maps, outh, outw)

    outdata = GPUArray.empty(outshape, dtype=np.float32, allocator=allocator)
    mask = GPUArray.empty(outshape, dtype=np.int32, allocator=allocator)

    # One thread per element of the output.
    total = prod(outdata.shape)
    launch = {
        "block": (nthreads, 1, 1),
        "grid": (roundUpDiv(total, nthreads), 1, 1),
    }

    mod.maxpool2d(
        outdata, data, mask,
        np.int32(inh), np.int32(inw),
        np.int32(outh), np.int32(outw),
        np.int32(maps),
        np.int32(hstride), np.int32(wstride),
        np.int32(hpad), np.int32(wpad),
        np.int32(fh), np.int32(fw),
        np.int32(total),
        **launch
    )

    return outdata, mask
Ejemplo n.º 14
0
def addVecToMat(vec, mat, axis=0, out=None, allocator=memPool):
    """Launch a kernel combining ``vec`` with the last two axes of ``mat``.

    Args:
        vec: GPU array with the dtype of ``mat`` and one axis fewer; its
            leading axes must equal ``mat.shape[:-2]``.
        mat: float32 or float16 GPU array; its last two axes are read as
            ``(n, m)``.
        axis: 0 uses the ``opColVecToMat`` kernels. 1 uses ``opRowVecToMat``
            when the last axes of ``vec`` and ``mat`` have equal length, and
            ``opRowOneVecToMat`` otherwise, in which case the last axis of
            ``mat`` must be a multiple of the last axis of ``vec``.
        out: optional destination; an uninitialised array shaped like
            ``mat`` is allocated when omitted.
        allocator: allocator used when ``out`` has to be created.

    Returns:
        The array the kernel wrote into.
    """
    assert vec.dtype == mat.dtype and (mat.dtype == np.float32
                                       or mat.dtype == np.float16)
    assert vec.ndim == mat.ndim - 1 and 0 <= axis < 2

    assert mat.shape[:-2] == vec.shape[:-1]
    if out is None:
        out = GPUArray.empty(mat.shape, dtype=mat.dtype, allocator=allocator)

    batch = prod(mat.shape[:-2])
    rows, cols = mat.shape[-2:]

    # Square warpSize x warpSize blocks tile each matrix; the flattened
    # leading axes go to the grid's z dimension.
    block = (warpSize, warpSize, 1)
    grid = (roundUpDiv(cols, warpSize), roundUpDiv(rows, warpSize), batch)

    isFP32 = mat.dtype == np.float32
    extra = ()

    if axis != 1:
        kernel = addmod.opColVecToMat if isFP32 else addmod.opColVecToMatFP16
    elif mat.dimAt(-1) == vec.dimAt(-1):
        kernel = addmod.opRowVecToMat if isFP32 else addmod.opRowVecToMatFP16
    else:
        assert mat.dimAt(-1) % vec.dimAt(-1) == 0

        kernel = addmod.opRowOneVecToMat if isFP32 else addmod.opRowOneVecToMatFP16
        # Only this kernel also receives the length of the vector's last axis.
        extra = (np.int32(vec.dimAt(-1)), )

    kernel(out, vec, mat,
           np.int32(rows), np.int32(cols),
           *extra,
           block=block, grid=grid)

    return out