Example #1
def bgpBgpTest():
    groups = 3

    A = Driver.to_device(queue,
                         np.random.randn(4, groups, 7).astype(np.float32))
    B = Driver.to_device(
        queue,
        np.random.randn(A.shape[2], groups, 5).astype(np.float32))
    C = Driver.to_device(
        queue,
        np.random.randn(A.shape[0], groups, B.shape[2]).astype(np.float32))

    out = mulTensorBatch(A, B, formatA="bgp", formatB="bgp")

    hostOut = np.empty(out.shape, dtype=np.float32)
    for i in range(groups):
        hostOut[:, i, :] = np.dot(A.get()[:, i, :], B.get()[:, i, :])

    assert np.allclose(hostOut, out.get())

    out = mulTensorBatch(A, C, formatA="bgp", formatB="bgp", transpA=True)

    hostOut = np.empty(out.shape, dtype=np.float32)
    for i in range(groups):
        hostOut[:, i, :] = np.dot(A.get()[:, i, :].T, C.get()[:, i, :])

    assert np.allclose(hostOut, out.get())

    out = mulTensorBatch(B, C, formatA="bgp", formatB="bgp", transpB=True)

    hostOut = np.empty(out.shape, dtype=np.float32)
    for i in range(groups):
        hostOut[:, i, :] = np.dot(B.get()[:, i, :], C.get()[:, i, :].T)

    assert np.allclose(hostOut, out.get())
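
A note on the layout strings, inferred from the slicing used in the host-side checks (an assumption based on these tests, not library documentation):

# "bgp": the group axis is dimension 1, so group i of a tensor A is the slice A[:, i, :]
#        (here A has shape (4, groups, 7) and B has shape (7, groups, 5)).
# "gbp": the group axis is dimension 0, so group i is simply A[i] (see gbpGbpTest further below).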
Example #2
def depthConcatTest():
	data1 = Driver.to_device(queue, np.random.randn(3, 4, 3, 3).astype(np.float32))
	data2 = Driver.to_device(queue, np.random.randn(3, 2, 6, 6).astype(np.float32))
	data3 = Driver.to_device(queue, np.random.randn(3, 5, 4, 4).astype(np.float32))

	alldata = [data1, data2, data3]
	outdata = depthConcat(alldata)

	depth, h, w = 0, 0, 0
	for data in alldata:
		depth += data.shape[1]
		h, w = max(h, data.shape[2]), max(w, data.shape[3])

	hostOutData = np.zeros(shape=(data1.shape[0], depth, h, w), dtype=np.float32)

	hostOutData[:, :4, 1:4, 1:4] = data1.get()
	hostOutData[:, 4:6, :, :] = data2.get()
	hostOutData[:, 6:, 1:5, 1:5] = data3.get()

	assert np.allclose(hostOutData, outdata.get())

	grad = Driver.to_device(queue, np.random.randn(*hostOutData.shape).astype(np.float32))
	ingrads = depthSplit(grad, alldata)

	hostInGrads = [np.empty(data.shape, dtype=np.float32) for data in alldata]

	hostInGrads[0] = grad.get()[:, :4, 1:4, 1:4]
	hostInGrads[1] = grad.get()[:, 4:6, :, :]
	hostInGrads[2] = grad.get()[:, 6:, 1:5, 1:5]

	assert all(np.allclose(hostInGrad, ingrads[i].get()) for i, hostInGrad in enumerate(hostInGrads))
Example #3
def instanceNorm2d(data, scale, bias, epsilon=1e-5):
    batchsize = data.shape[0]
    if batchsize > 1:
        extscale = Utils.tile(scale, batchsize, axis=1)
        extbias = Utils.tile(bias, batchsize, axis=1)

    else:
        extscale = scale
        extbias = bias

    indata = data.reshape(1, batchsize * data.shape[1], data.shape[2],
                          data.shape[3])
    mean = Driver.empty(queue, (1, indata.shape[1], 1, 1),
                        dtype=np.float32,
                        allocator=memPool)
    var = Driver.empty(queue, (1, indata.shape[1], 1, 1),
                       dtype=np.float32,
                       allocator=memPool)

    outdata, savemean, saveinvvar = MIOpen.batchNorm2d(indata,
                                                       extscale,
                                                       extbias,
                                                       mean,
                                                       var,
                                                       epsilon,
                                                       test=False)
    return outdata.reshape(data.shape), savemean, saveinvvar, extscale
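
For reference, a minimal host-side sketch of the instance-normalization math that the reshape-into-batchNorm2d trick above computes (plain NumPy; instanceNorm2dHost is a hypothetical helper name, and the unittest example further below verifies exactly this computation):

import numpy as np

def instanceNorm2dHost(data, scale, bias, epsilon=1e-5):
    # Normalize every (sample, map) plane separately, then apply the per-map scale and bias.
    b, c, h, w = data.shape
    flat = data.reshape(b * c, h * w)
    mean = flat.mean(axis=1, keepdims=True)
    invvar = 1.0 / np.sqrt(flat.var(axis=1, keepdims=True) + epsilon)
    normed = ((flat - mean) * invvar).reshape(data.shape)
    return normed * scale + bias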
Example #4
def maxpool2dTest():
	batchsize, maps, h, w = 1, 1, 8, 8
	data = Driver.to_device(queue, np.random.randn(batchsize, maps, h, w).astype(np.float32))

	outdata, workspace = pool2d(data, test=False)

	def maxDownSample2d(dat, factor):
		trimrows = dat.shape[0] // factor * factor
		trimcols = dat.shape[1] // factor * factor

		maxSoFar = None
		first = True

		for coff in range(factor):
			for roff in range(factor):
				hopped = dat[roff:trimrows:factor, coff:trimcols:factor]
				if first:
					maxSoFar = hopped
					first = False
				else:
					maxSoFar = np.maximum(maxSoFar, hopped)

		return maxSoFar

	hostOutData = maxDownSample2d(data.get()[0, 0], 2)
	assert np.allclose(hostOutData, outdata.get())

	grad = Driver.to_device(queue, np.random.randn(*outdata.shape).astype(np.float32))
	pool2dBackward(data, outdata, grad, workspace)
Example #5
def batchNorm2d(data, scale, bias, mean, var, epsilon=1e-5, factor=1.0, test=False, mode=BatchNormMode.spatial,
				out=None):
	assert data.ndim == scale.ndim and scale.ndim == bias.ndim and bias.ndim == mean.ndim and mean.ndim == var.ndim
	checkOffsets(bias, mean, var)

	descData = createDescribed4dTensor(data, allowOffset=True)
	descScale = createDescribed4dTensor(scale)

	if out is None:
		descOutData = createDescribed4dTensor(Driver.empty(queue, data.shape, dtype=data.dtype, allocator=memPool))
	else:
		descOutData = createDescribed4dTensor(out)

	if test:
		savemean, saveinvvar = None, None
		libmiopen.miopenBatchNormalizationForwardInference(context, mode.value, 1.0, 0.0, descData.desc, descData.ptr,
														   descOutData.desc, descOutData.ptr, descScale.desc,
														   descScale.ptr, bias.int_ptr, mean.int_ptr, var.int_ptr,
														   epsilon)
	else:
		savemean = Driver.empty(queue, mean.shape, dtype=data.dtype, allocator=memPool)
		saveinvvar = Driver.empty(queue, var.shape, dtype=data.dtype, allocator=memPool)

		libmiopen.miopenBatchNormalizationForwardTraining(context, mode.value, 1.0, 0.0, descData.desc, descData.ptr,
														  descOutData.desc, descOutData.ptr, descScale.desc,
														  descScale.ptr, bias.int_ptr, factor,
														  mean.int_ptr, var.int_ptr, epsilon,
														  savemean.int_ptr, saveinvvar.int_ptr)

	destroyDescribedTensors(descData, descScale, descOutData)

	if test:
		return descOutData.tensor
	else:
		return descOutData.tensor, savemean, saveinvvar
Example #6
def batchNorm2dBackward(data, grad, scale, savemean=None, saveinvvar=None, epsilon=1e-5, mode=BatchNormMode.spatial):
	assert data.ndim == grad.ndim and grad.ndim == scale.ndim

	if savemean is not None:
		assert scale.ndim == savemean.ndim
		checkOffsets(savemean)
		savemean = savemean.int_ptr

	if saveinvvar is not None:
		assert scale.ndim == saveinvvar.ndim
		checkOffsets(saveinvvar)
		saveinvvar = saveinvvar.int_ptr

	descData = createDescribed4dTensor(data, allowOffset=True)
	descGrad = createDescribed4dTensor(grad)
	descScale = createDescribed4dTensor(scale)

	descInGrad = createDescribed4dTensor(Driver.empty(queue, grad.shape, dtype=grad.dtype, allocator=memPool))
	scalegrad = Driver.empty(queue, scale.shape, dtype=scale.dtype, allocator=memPool)
	bgrad = Driver.empty(queue, scale.shape, dtype=scale.dtype, allocator=memPool)

	libmiopen.miopenBatchNormalizationBackward(context, mode.value, 1.0, 0.0, 1.0, 0.0, descData.desc, descData.ptr,
											   descGrad.desc, descGrad.ptr, descInGrad.desc, descInGrad.ptr,
											   descScale.desc, descScale.ptr, scalegrad.int_ptr, bgrad.int_ptr,
											   epsilon, savemean, saveinvvar)

	destroyDescribedTensors(descData, descGrad, descInGrad, descScale)
	return descInGrad.tensor, scalegrad, bgrad
Example #7
def lrn(data, mode=LRNMode.map, N=5, alpha=1e-4, beta=0.75, K=2.0, test=False):
	descData = createDescribed4dTensor(data, allowOffset=True)
	descOutData = createDescribed4dTensor(Driver.empty(queue, data.shape, dtype=data.dtype, allocator=memPool))

	descLRN = createDescribedLRN(mode, N, alpha, beta, K)

	workspace, ptr = None, None
	if not test:
		size = libmiopen.miopenLRNGetWorkSpaceSize(descOutData.desc)
		workspace = Driver.empty(queue, (size, ), dtype=np.uint8, allocator=memPool)

		ptr = workspace.int_ptr

	libmiopen.miopenLRNForward(context, descLRN.desc, 1.0, descData.desc, descData.ptr, 0.0,
							   descOutData.desc, descOutData.ptr, not test, ptr)

	if mode == LRNMode.cross:
		signKer(descOutData.tensor, descOutData.tensor, descData.tensor)

	destroyDescribedTensors(descData, descOutData)
	destroyDescribedLRN(descLRN)

	if test:
		return descOutData.tensor
	else:
		return descOutData.tensor, workspace
Example #8
def pool2d(data, size=2, stride=2, pad=0, mode=PoolMode.max, test=False):
	descData = createDescribed4dTensor(data, allowOffset=True)

	descPool = createDescribedPool2d(size, stride, pad, mode)
	outshape = getPool2dOutShape(descPool, descData)

	descOutData = createDescribed4dTensor(Driver.empty(queue, outshape, dtype=data.dtype, allocator=memPool))

	workspace, ptr, size = None, None, 0
	if not test:
		size = libmiopen.miopenPoolingGetWorkSpaceSize(descOutData.desc)
		workspace = Driver.empty(queue, (size, ), dtype=np.uint8, allocator=memPool)

		ptr = workspace.int_ptr

	libmiopen.miopenPoolingForward(context, descPool.desc, 1.0, descData.desc, descData.ptr, 0.0, descOutData.desc,
								   descOutData.ptr, not test, ptr, size)

	destroyDescribedTensors(descData, descOutData)
	destroyDescribedPool(descPool)

	if test:
		return descOutData.tensor
	else:
		return descOutData.tensor, workspace
Example #9
def argminmax(mat, axis, mode):
	assert mat.ndim == 2 and mat.dtype == np.float32

	if mode == "max":
		mod = maxmod
	else:
		mod = minmod

	if axis == 0:
		colKernel = mod.minMaxOnCol

		block = (NT2, 1, 1)
		grid = (roundUp(mat.shape[1], block[0]), 1, 1)

		target = Driver.empty(queue, (mat.shape[1], ), dtype=np.float32, allocator=memPool)
		idx = Driver.empty(queue, (mat.shape[1], ), dtype=np.int32, allocator=memPool)

		colKernel(queue, grid, block, mat.data, target.data, idx.data, np.int32(mat.shape[1]), np.int32(mat.shape[0]))

	elif axis == 1:
		rowKernel = mod.minMaxOnRow

		block = (NT1, 1, 1)
		grid = (mat.shape[0] * block[0], 1, 1)

		target = Driver.empty(queue, (mat.shape[0], ), dtype=np.float32, allocator=memPool)
		idx = Driver.empty(queue, (mat.shape[0], ), dtype=np.int32, allocator=memPool)

		rowKernel(queue, grid, block, mat.data, target.data, idx.data, np.int32(mat.shape[1]), np.int32(mat.shape[0]))

	else:
		raise NotImplementedError()

	return idx
Example #10
def softmaxTest():
	batchsize, maps = 5, 8
	data = Driver.to_device(queue, np.random.randn(batchsize, maps, 1, 1).astype(np.float32))

	outdata = softmax2d(data)

	def hostSoftmax(w):
		e = np.exp(w - np.amax(w))
		p = e / np.sum(e)
		return p

	hostData = data.get().reshape(batchsize, maps)
	hostOutData = np.vstack([hostSoftmax(hostData[i]) for i in range(batchsize)])
	assert np.allclose(hostOutData, outdata.get().reshape(batchsize, maps))

	grad = Driver.to_device(queue, np.random.randn(batchsize, maps, 1, 1).astype(np.float32))
	ingrad = softmax2dBackward(outdata, grad)

	def hostSoftmaxBackward(outdat, gr):
		ingr = np.zeros(outdat.shape, dtype=np.float32)
		for i in range(ingr.shape[0]):
			ingr[i] += outdat[i] * gr[i]

			for j in range(outdat.shape[0]):
				ingr[i] -= outdat[i] * outdat[j] * gr[j]
		return ingr

	hostGrad = grad.get().reshape(batchsize, maps)
	hostInGrad = np.vstack([hostSoftmaxBackward(hostOutData[i], hostGrad[i]) for i in range(batchsize)])
	assert np.allclose(hostInGrad, ingrad.get().reshape(batchsize, maps))
Example #11
def crossMapLRNTest():
	maps = 10
	N, alpha, beta, K = 5, 1.0, 0.5, 2.0

	lookBehind = int((N - 1) / 2)
	lookAhead = N - lookBehind

	data = Driver.to_device(queue, np.random.randn(1, maps, 1, 1).astype(np.float32))
	outdata, workspace = lrn(data, mode=LRNMode.cross, N=N, alpha=alpha, beta=beta, K=K)

	hostData = data.get().reshape(maps, ).astype(np.float32)
	norms = np.empty((maps, ), dtype=np.float32)
	for i in range(maps):
		norm = 0.0
		for j in range(max(0, i - lookBehind), min(maps, i + lookAhead)):
			norm += hostData[j]**2
		norms[i] = K + norm * alpha / N

	hostOutData = hostData / norms**beta
	assert np.allclose(hostOutData, outdata.reshape(maps, ).get())

	grad = Driver.to_device(queue, np.random.randn(1, maps, 1, 1).astype(np.float32))
	ingrad = lrnBackward(data, outdata, grad, workspace, mode=LRNMode.cross, N=N, alpha=alpha, beta=beta, K=K)

	hostGrad = grad.get().reshape(maps, ).astype(np.float32)
	hostInGrad = np.zeros((maps, ), dtype=np.float32)
	k = 2.0 * alpha * beta / N
	for i in range(maps):
		hostInGrad[i] += hostGrad[i] / norms[i]**beta

		for j in range(max(0, i - lookBehind), min(maps, i + lookAhead)):
			hostInGrad[j] -= hostGrad[i] * k * hostData[i] * hostData[j] / norms[i]**(beta + 1)

	assert np.allclose(hostInGrad, ingrad.reshape(maps, ).get())
Example #12
    def __call__(self, *args, **kwargs):
        slc = kwargs.get("slice", None)
        kernel = self.get_kernel(slc is not None)

        if slc is not None:
            start, step, stop = slc[:]

            args, size = rewriteArgs(args)
            if stop is None:
                stop = size

            gridsize, blocks = Driver.splay(self.context,
                                            (stop - start) // step)
            block = (blocks, 1, 1)
            grid = (gridsize, 1, 1)

            kernel(self.queue, grid, block, *args, start, step, stop)

        else:
            args, size = rewriteArgs(args)

            gridsize, blocks = Driver.splay(self.context, size)
            block = (blocks, 1, 1)
            grid = (gridsize, 1, 1)

            kernel(self.queue, grid, block, *args, size)
Example #13
def svmTest():
    batchsize, size = 20, 4

    scores = Driver.to_device(
        queue,
        np.random.randn(batchsize, size).astype(np.float32))
    labels = Driver.to_device(
        queue,
        np.random.randint(low=0, high=size, size=(batchsize, ),
                          dtype=np.int32))

    error, grad = svm(scores, labels, mode="l1")

    hostScores, hostLabels = scores.get(), labels.get()

    hostGrad = np.empty(grad.shape, dtype=np.float32)
    hostError = 0.0

    for b in range(batchsize):
        for n in range(size):
            cls = 2 * (hostLabels[b] == n) - 1
            val = hostScores[b, n] * cls

            hostGrad[b, n] = cls / batchsize / size if val < 1 else 0.0
            hostError += max(0.0, 1.0 - val) / batchsize / size

    assert np.allclose(hostGrad, grad.get())
    assert np.isclose(hostError, error.get() / scores.shape[0])
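
A vectorized NumPy restatement of the host-side reference above (a sketch only; svmL1Host is a hypothetical name, not a library function):

import numpy as np

def svmL1Host(scores, labels):
    # One-vs-all hinge loss: cls is +1 for the labeled class and -1 otherwise.
    batchsize, size = scores.shape
    cls = 2 * (labels[:, None] == np.arange(size)[None, :]) - 1
    val = scores * cls
    grad = np.where(val < 1, cls / (batchsize * size), 0.0).astype(np.float32)
    error = np.maximum(0.0, 1.0 - val).sum() / (batchsize * size)
    return error, grad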
Example #14
def split(ary, sections, axis):
    assert np.sum(sections) == ary.shape[axis]

    outs = []
    for sec in sections:
        shape = ary.shape[:axis] + (sec, ) + ary.shape[axis + 1:]
        outs.append(
            Driver.empty(queue, shape, dtype=ary.dtype, allocator=memoryPool))

    ary = ary.reshape(int(np.prod(ary.shape[:axis])),
                      int(np.prod(ary.shape[axis:])))

    stride = 0
    for i, out in enumerate(outs):
        out = out.reshape(int(np.prod(out.shape[:axis])),
                          int(np.prod(out.shape[axis:])))

        Driver.enqueue_copy_3d(queue,
                               out.base_data,
                               ary.base_data,
                               dest_origin=(out.offset, 0, 0),
                               src_origin=(ary.offset + stride, 0, 0),
                               region=(out.strides[0], ary.shape[0], 1),
                               dest_pitches=(out.strides[0], 0),
                               src_pitches=(ary.strides[0], 0))

        stride += out.strides[0]

    return outs
Example #15
def maxpool2d(data, size, stride, pad):
    assert data.dtype == np.float32
    batchsize, maps, inh, inw = data.shape

    fh, fw = size
    hstride, wstride = stride
    hpad, wpad = pad

    outh = (inh - fh + 2 * hpad) // hstride + 1
    outw = (inw - fw + 2 * wpad) // wstride + 1

    outdata = Driver.empty(queue, (batchsize, maps, outh, outw),
                           dtype=np.float32,
                           allocator=memPool)
    mask = Driver.empty(queue, (batchsize, maps, outh, outw),
                        dtype=np.int32,
                        allocator=memPool)

    kernel = mod.maxpool2d

    size = int(np.prod(outdata.shape))

    block = (nthreads, 1, 1)
    grid = (roundUp(size, nthreads), 1, 1)

    kernel(queue, grid, block,
           outdata.data, data.data, mask.data, np.int32(inh), np.int32(inw),
           np.int32(outh), np.int32(outw), np.int32(maps), np.int32(hstride),
           np.int32(wstride), np.int32(hpad), np.int32(wpad), np.int32(fh),
           np.int32(fw), np.int32(size))

    return outdata, mask
Example #16
def svm(scores, labels, mode, error=None):
    assert scores.dtype == np.float32 and labels.dtype == np.int32
    shape = scores.shape

    grad = Driver.empty(queue, shape, dtype=np.float32, allocator=memPool)
    if error is None:
        error = Driver.empty(queue, (), dtype=np.float32, allocator=memPool)

    error.fill(0.0)

    size = int(np.prod(scores.shape))
    spatialDim = int(np.prod(scores.shape[2:]))
    mapStride = spatialDim * scores.shape[1]

    block = (nthreads, 1, 1)
    grid = (roundUp(size, nthreads), 1, 1)

    if mode == "l1":
        krl = svmL1Mod.cost
    elif mode == "l2":
        krl = svmL2Mod.cost
    else:
        raise ValueError()

    krl(queue, grid, block, scores.data, labels.base_data,
        np.int32(labels.offset // labels.dtype.itemsize), np.int32(size),
        np.int32(mapStride), np.int32(spatialDim), np.int32(shape[1]),
        np.int32(shape[0]), error.data, grad.data)

    return error, grad
Example #17
def vecBgpTest():
    groups = 5

    tensor = Driver.to_device(queue,
                              np.random.randn(7, groups, 4).astype(np.float32))
    x = Driver.to_device(
        queue,
        np.random.randn(groups, tensor.shape[2]).astype(np.float32))
    y = Driver.to_device(
        queue,
        np.random.randn(groups, tensor.shape[0]).astype(np.float32))

    out = mulTensorOnVecGroup(tensor, x)

    hostOut = np.empty(out.shape, dtype=np.float32)
    for i in range(groups):
        hostOut[i] = np.dot(tensor.get()[:, i, :], x.get()[i])

    assert np.allclose(hostOut, out.get())

    out = mulTensorOnVecGroup(tensor, y, transpT=True)

    hostOut = np.empty(out.shape, dtype=np.float32)
    for i in range(groups):
        hostOut[i] = np.dot(tensor.get()[:, i, :].T, y.get()[i])

    assert np.allclose(hostOut, out.get())
Example #18
    def __call__(self, shape, dtype, constant=1.0):
        tup = (shape, dtype, constant)
        fills = self.pool.get(tup, None)

        if fills is None:
            if constant == 1.0:
                fills = Driver.to_device(queue,
                                         np.ones(shape, dtype=dtype),
                                         allocator=memoryPool)
            elif constant == 0.0:
                fills = Driver.zeros(queue,
                                     shape,
                                     dtype=dtype,
                                     allocator=memoryPool)
            else:
                if isinstance(constant, (tuple, list, range)):
                    fills = []
                    for i in range(shape[0]):
                        fills.extend([constant[i]] * shape[1])
                    fills = np.array(fills, dtype=dtype)
                else:
                    fills = np.full(shape, constant, dtype=dtype)

                fills = Driver.to_device(queue, fills, allocator=memoryPool)

            self.pool[tup] = fills

        return fills
Example #19
def backwardParamsRnn(data,
                      outdata,
                      w,
                      trainReserve,
                      descRnn,
                      inithidden=None):
    assert data.ndim == 3 and data.dtype == np.float32 and descRnn.insize == data.shape[2]
    assert outdata.ndim == 3 and outdata.dtype == data.dtype
    assert w.ndim == 1 and w.dtype == np.float32

    seqlen, batchsize, _ = data.shape

    if descRnn.dir == DirectionMode.uni:
        assert outdata.shape[2] == descRnn.hsize
        dims, strides = (descRnn.layers, batchsize,
                         descRnn.hsize), (batchsize * descRnn.hsize,
                                          descRnn.hsize, 1)
    else:
        assert outdata.shape[2] == 2 * descRnn.hsize
        dims, strides = (2 * descRnn.layers, batchsize,
                         descRnn.hsize), (batchsize * descRnn.hsize,
                                          descRnn.hsize, 1)

    if inithidden is not None:
        assert inithidden.dtype == np.float32 and inithidden.shape == dims
    else:
        inithidden = Driver.zeros(queue,
                                  dims,
                                  dtype=np.float32,
                                  allocator=memPool)

    descHx = createDescribedNdTensor(dims, strides, inithidden)

    descDatas = []
    descOutDatas = []

    # every timestep has the same shape, so identical per-step descriptors are built from the first slice
    for _ in range(seqlen):
        descDatas.append(createDescribedNdTensor(None, None, data[0]))
        descOutDatas.append(createDescribedNdTensor(None, None, outdata[0]))

    indescs = [d.desc for d in descDatas]
    outdescs = [d.desc for d in descOutDatas]

    dw = Driver.zeros(queue, w.shape, dtype=np.float32, allocator=memPool)
    descDw = createDescribedNdTensor(None, None, dw)

    workspace, reserveSpace = trainReserve

    libmiopen.miopenRNNBackwardWeights(context, descRnn.desc, seqlen, indescs,
                                       data.int_ptr, descHx.desc, descHx.ptr,
                                       outdescs, outdata.int_ptr, descDw.desc,
                                       descDw.ptr, workspace.int_ptr,
                                       workspace.nbytes, reserveSpace.int_ptr,
                                       reserveSpace.nbytes)

    destroyDescribedTensors(*descDatas, *descOutDatas, descHx, descDw)

    return dw
Example #20
def unittest():
    batchsize, maps, h, w = 3, 4, 5, 5
    epsilon = 1e-5

    data = Driver.to_device(
        queue,
        np.random.randn(batchsize, maps, h, w).astype(np.float32))
    scale = Driver.to_device(queue,
                             np.random.randn(1, maps, 1, 1).astype(np.float32))
    bias = Driver.to_device(queue,
                            np.random.randn(1, maps, 1, 1).astype(np.float32))

    outdata, savemean, saveinvvar, extscale = instanceNorm2d(
        data, scale, bias, epsilon)

    hostData = data.get().reshape(data.shape[0] * data.shape[1], -1)
    hostScale, hostBias = scale.get().reshape(maps,
                                              1), bias.get().reshape(maps, 1)
    hostExtScale, hostExtBias = np.tile(hostScale, (batchsize, 1)), np.tile(
        hostBias, (batchsize, 1))

    hostMean = np.mean(hostData, axis=1, keepdims=True)
    hostInvVar = 1.0 / np.sqrt(np.var(hostData, axis=1) + epsilon)
    hostOutData = (hostData - hostMean) * hostInvVar[:, np.newaxis]
    hostOutScData = hostOutData * hostExtScale + hostExtBias

    assert np.allclose(hostOutScData.reshape(data.shape), outdata.get())
    assert np.allclose(hostMean.reshape(savemean.shape), savemean.get())
    assert np.allclose(hostInvVar.reshape(saveinvvar.shape), saveinvvar.get())

    grad = Driver.to_device(
        queue,
        np.random.randn(batchsize, maps, h, w).astype(np.float32))
    ingrad, scalegrad, bgrad = instanceNorm2dBackward(grad, data, extscale,
                                                      savemean, saveinvvar,
                                                      epsilon)

    hostGrad = grad.get().reshape(grad.shape[0] * grad.shape[1], -1)
    hostScGrad = hostGrad * hostExtScale
    hostCorrs = np.empty(hostInvVar.shape, dtype=np.float32)
    for i in range(hostCorrs.shape[0]):
        hostCorrs[i] = np.dot(hostScGrad[i],
                              hostOutData[i]) / hostScGrad.shape[1]
    hostInGrad = hostScGrad - np.mean(
        hostScGrad, axis=1,
        keepdims=True) - hostCorrs[:, np.newaxis] * hostOutData
    hostInGrad *= hostInvVar[:, np.newaxis]

    hostScaleGrad = np.sum(np.sum(hostOutData * hostGrad,
                                  axis=1).reshape(batchsize, -1),
                           axis=0)
    hostBiasGrad = np.sum(np.sum(hostGrad, axis=1).reshape(batchsize, -1),
                          axis=0)

    assert np.allclose(hostInGrad.reshape(grad.shape), ingrad.get())
    assert np.allclose(hostScaleGrad.reshape(1, maps, 1, 1), scalegrad.get())
    assert np.allclose(hostBiasGrad.reshape(1, maps, 1, 1), bgrad.get())
Example #21
def speedTest():
	from PuzzleLib.OpenCL.Benchmarks.Utils import timeKernel

	A = Driver.to_device(queue, np.random.randn(1024, 1024).astype(np.float32))
	v = Driver.to_device(queue, np.random.randn(1024).astype(np.float32))

	timeKernel(addVecToMat, (v, A, 1, True), logname="addVecToMat on rows")
	timeKernel(addVecToMat, (v, A, 0, True), logname="addVecToMat on cols")

	timeKernel(argmax, (A, 1), logname="argmax on rows")
	timeKernel(argmax, (A, 0), logname="argmax on cols")
Example #22
def upsample2dLinearTest():
    batchsize, maps, inh, inw = 3, 2, 4, 4
    hscale, wscale = 2, 3

    data = Driver.to_device(
        queue,
        np.random.randn(batchsize, maps, inh, inw).astype(np.float32))

    outdata = upsample2d(data, (hscale, wscale), mode="linear")

    hostData = data.get()
    hostOutData = np.zeros(outdata.shape, dtype=np.float32)
    rh, rw = (inh - 1) / (inh * hscale - 1), (inw - 1) / (inw * wscale - 1)

    for b in range(batchsize):
        for c in range(maps):
            for y in range(inh * hscale):
                for x in range(inw * wscale):
                    iny, inx = int(rh * y), int(rw * x)
                    dy, dx = 1.0 - (rh * y - iny), 1.0 - (rw * x - inx)

                    xi = 1 if x < inw * wscale - 1 else 0
                    yi = 1 if y < inh * hscale - 1 else 0

                    hostOutData[b, c, y, x] = \
                    dy * (dx * hostData[b, c, iny, inx] + (1 - dx) * hostData[b, c, iny, inx + xi]) + \
                    (1 - dy) * (dx * hostData[b, c, iny + yi, inx] + (1 - dx) * hostData[b, c, iny + yi, inx + xi])

    assert np.allclose(hostOutData, outdata.get())

    grad = Driver.to_device(queue,
                            np.random.randn(*outdata.shape).astype(np.float32))
    ingrad = upsample2dBackward(grad, (hscale, wscale), mode="linear")

    hostGrad = grad.get()
    hostInGrad = np.zeros(data.shape, dtype=np.float32)

    for b in range(batchsize):
        for c in range(maps):
            for y in range(inh * hscale):
                for x in range(inw * wscale):
                    iny, inx = int(rh * y), int(rw * x)
                    dy, dx = 1.0 - (rh * y - iny), 1.0 - (rw * x - inx)

                    xi = 1 if x < inw * wscale - 1 else 0
                    yi = 1 if y < inh * hscale - 1 else 0

                    val = hostGrad[b, c, y, x]

                    hostInGrad[b, c, iny, inx] += dy * dx * val
                    hostInGrad[b, c, iny, inx + xi] += dy * (1 - dx) * val
                    hostInGrad[b, c, iny + yi, inx] += (1 - dy) * dx * val
                    hostInGrad[b, c, iny + yi,
                               inx + xi] += (1 - dy) * (1 - dx) * val

    assert np.allclose(hostInGrad, ingrad.get())
Example #23
def calcTest():
	A = Driver.to_device(queue, np.random.randn(128, 500).astype(np.float32))

	v = Driver.to_device(queue, np.random.randn(500).astype(np.float32))
	w = Driver.to_device(queue, np.random.randn(128).astype(np.float32))
	m = Driver.to_device(queue, np.random.randn(125).astype(np.float32))

	assert np.allclose(A.get() + v.get()[np.newaxis, :], addVecToMat(v, A, axis=1).get())
	assert np.allclose(A.get() + w.get()[:, np.newaxis], addVecToMat(w, A, axis=0).get())
	assert np.allclose(A.get() + np.tile(m.get(), 4)[np.newaxis, :], addVecToMat(m, A, axis=1).get())

	assert np.allclose(argmax(A, axis=1).get(), np.argmax(A.get(), axis=1))
	assert np.allclose(argmax(A, axis=0).get(), np.argmax(A.get(), axis=0))
Example #24
def eltwiseTest(context, queue):
    outdata = Driver.empty(queue, (1 << 18, ), dtype=np.int32)
    indata = Driver.to_device(
        queue, np.random.randint(0, 1000, size=(1 << 18, ), dtype=np.int32))

    krl = ElementwiseKernel("int *outdata, const int *indata",
                            "outdata[i] = indata[i] * indata[i]", "krl",
                            context, queue)

    krl(outdata, indata)

    hostOutData = indata.get() * indata.get()
    assert np.allclose(hostOutData, outdata.get())
Example #25
def conv2dBackwardParams(data, grad, W, bias=None, stride=1, pad=0, wgrad=None, bgrad=None, scale=1.0, momentum=0.0,
						 mode=ConvMode.conv, algo=None):
	assert data.ndim == grad.ndim
	if mode == ConvMode.conv:
		assert grad.shape[1] == W.shape[0] and data.shape[1] == W.shape[1]
	else:
		assert grad.shape[1] == W.shape[1] and data.shape[1] == W.shape[0]

	descData = createDescribed4dTensor(data, allowOffset=True)
	descGrad = createDescribed4dTensor(grad)

	descConv = createDescribedConv2d(stride, pad, 1, mode)

	if wgrad is not None and scale == 1.0 and momentum == 0.0:
		descWGrad = createDescribed4dTensor(wgrad)
	else:
		descWGrad = createDescribed4dTensor(Driver.zeros(queue, W.shape, dtype=W.dtype, allocator=memPool))

	_, ptr, size = conv2dBackwardParamsWorkspace(descGrad, descData, descConv, descWGrad)
	algo = cacheConv2dParamsAlgo(descGrad, descData, descConv, descWGrad, (ptr, size), algo)

	libmiopen.miopenConvolutionBackwardWeights(context, 1.0, descGrad.desc, descGrad.ptr, descData.desc, descData.ptr,
											   descConv.desc, algo.value, 0.0, descWGrad.desc, descWGrad.ptr,
											   ptr, size)

	if wgrad is not None and (scale != 1.0 or momentum != 0.0):
		CLBlas.addVectorToVector(descWGrad.tensor.ravel(), wgrad.ravel(), out=wgrad.ravel(), alpha=scale, beta=momentum)

	tup = (descWGrad.tensor, )
	if bias is not None:
		assert bias.ndim == data.ndim

		if bgrad is not None and scale == 1.0 and momentum == 0.0:
			descBGrad = createDescribed4dTensor(bgrad)
		else:
			descBGrad = createDescribed4dTensor(Driver.empty(queue, bias.shape, dtype=bias.dtype, allocator=memPool))

		libmiopen.miopenConvolutionBackwardBias(context, 1.0, descGrad.desc, descGrad.ptr, 0.0, descBGrad.desc,
												descBGrad.ptr)

		if bgrad is not None and (scale != 1.0 or momentum != 0.0):
			CLBlas.addVectorToVector(descBGrad.tensor.ravel(), bgrad.ravel(), out=bgrad.ravel(),
									 alpha=scale, beta=momentum)

		tup = (descWGrad.tensor, descBGrad.tensor)
		destroyDescribedTensors(descBGrad)

	destroyDescribedConv(descConv)
	destroyDescribedTensors(descData, descGrad, descWGrad)

	return tup
Example #26
def argminmaxBatch(mats, axis, mode):
    assert mats.ndim == 3 and mats.dtype == np.float32

    block = (warpSize, 1, 1)

    if axis == 1:
        if mode == "max":
            mod = maxBatchMod
        else:
            mod = minBatchMod

        colKernel = mod.minMaxBatchOnCol
        target = Driver.empty(queue, (mats.shape[0], mats.shape[2]),
                              dtype=np.float32,
                              allocator=memPool)
        idx = Driver.empty(queue, (mats.shape[0], mats.shape[2]),
                           dtype=np.int32,
                           allocator=memPool)

        grid = (mats.shape[2] * warpSize, 1, mats.shape[0])

        colKernel(queue, grid, block, mats.data, target.data, idx.data,
                  np.int32(mats.shape[2]), np.int32(mats.shape[1]))

    elif axis == 2:
        if mode == "max":
            mod = maxmod
        else:
            mod = minmod

        rowKernel = mod.minMaxOnRow

        target = Driver.empty(queue,
                              mats.shape[:2],
                              dtype=np.float32,
                              allocator=memPool)
        idx = Driver.empty(queue,
                           mats.shape[:2],
                           dtype=np.int32,
                           allocator=memPool)

        grid = (mats.shape[0] * mats.shape[1] * block[0], 1, 1)

        rowKernel(queue, grid, block, mats.data, target.data, idx.data,
                  np.int32(mats.shape[2]), np.int32(mats.shape[1]))

    else:
        raise ValueError("Unsupported axis %s was given" % axis)

    return idx
Example #27
def initOpenCL():
    from PuzzleLib.OpenCL.Driver import Driver
    from PuzzleLib.OpenCL.Kernels import Templates
    from PuzzleLib.OpenCL.Utils import context, queue

    global GPUArray, to_gpu, empty, zeros
    GPUArray = Driver.Array
    to_gpu = lambda *args, **kwargs: Driver.to_device(queue, *args, **kwargs)
    empty = lambda *args, **kwargs: Driver.empty(queue, *args, **kwargs)
    zeros = lambda *args, **kwargs: Driver.zeros(queue, *args, **kwargs)

    global minimum, maximum
    minimum = lambda ary: Templates.minimum(context, queue, ary)
    maximum = lambda ary: Templates.maximum(context, queue, ary)
Example #28
def create_some_context():
    platforms = Driver.get_platforms()

    for vendor in ["AMD Accelerated Parallel Processing"]:
        platform = next((pl for pl in platforms if pl.name == vendor), None)
        if platform is not None:
            break

    assert platform is not None
    device = platform.get_devices(type=Driver.device_type.GPU)[0]

    context = Driver.Context([device])
    queue = Driver.CommandQueue(context, profiling=True)

    return context, queue
Example #29
def gbpGbpTest():
    groups = 3

    A = Driver.to_device(queue,
                         np.random.randn(groups, 4, 3).astype(np.float32))
    B = Driver.to_device(
        queue,
        np.random.randn(groups, A.shape[2], 4).astype(np.float32))
    C = Driver.to_device(
        queue,
        np.random.randn(groups, A.shape[1], 6).astype(np.float32))
    D = Driver.to_device(
        queue,
        np.random.randn(groups, 8, C.shape[2]).astype(np.float32))

    out = mulTensorBatch(A, B, formatA="gbp", formatB="gbp", formatOut="gbp")

    hostOut = np.empty(out.shape, dtype=np.float32)
    for i in range(groups):
        hostOut[i] = np.dot(A.get()[i], B.get()[i])

    assert np.allclose(hostOut, out.get())

    out = mulTensorBatch(C,
                         A,
                         formatA="gbp",
                         formatB="gbp",
                         formatOut="gbp",
                         transpA=True)

    hostOut = np.empty(out.shape, dtype=np.float32)
    for i in range(groups):
        hostOut[i] = np.dot(C.get()[i].T, A.get()[i])

    assert np.allclose(hostOut, out.get())

    out = mulTensorBatch(C,
                         D,
                         formatA="gbp",
                         formatB="gbp",
                         formatOut="gbp",
                         transpB=True)

    hostOut = np.empty(out.shape, dtype=np.float32)
    for i in range(groups):
        hostOut[i] = np.dot(C.get()[i], D.get()[i].T)

    assert np.allclose(hostOut, out.get())
Example #30
def upsample3dBackward(grad, scale, mode="nearest"):
    batchsize, maps, outd, outh, outw = grad.shape

    if isinstance(scale, int):
        dscale, hscale, wscale = scale, scale, scale
    else:
        dscale, hscale, wscale = scale

    ind, inh, inw = outd // dscale, outh // hscale, outw // wscale

    if mode == "nearest":
        ingrad = Driver.empty(queue, (batchsize, maps, ind, inh, inw),
                              dtype=grad.dtype,
                              allocator=memPool)

        blk = warpSize * 4
        block = (blk, 1, 1)

        grid = (roundUp(ingrad.size, blk), 1, 1)

        kernel = nearestMod.upsample3dNearestBackward
        kernel(queue, grid, block, ingrad.data, grad.data, np.int32(inh),
               np.int32(inw), np.int32(outh), np.int32(outw), np.int32(dscale),
               np.int32(hscale), np.int32(wscale), np.int32(ingrad.size))

    elif mode == "linear":
        ingrad = Driver.zeros(queue, (batchsize, maps, ind, inh, inw),
                              dtype=grad.dtype,
                              allocator=memPool)

        block = (warpSize // 4, warpSize // 4, 1)
        grid = (roundUp(outw, block[0]), roundUp(outh, block[1]), outd)

        rd = (ind - 1) / (outd - 1)
        rh = (inh - 1) / (outh - 1)
        rw = (inw - 1) / (outw - 1)

        kernel = linearMod.upsample3dLinearBackward

        kernel(queue, grid, block, ingrad.data, grad.data, np.int32(batchsize),
               np.int32(maps), np.int32(ind), np.int32(inh), np.int32(inw),
               np.int32(outd), np.int32(outh), np.int32(outw), np.float32(rd),
               np.float32(rh), np.float32(rw))

    else:
        raise ValueError("Unrecognized sampling mode")

    return ingrad