def bgpBgpTest():
	"""Validates mulTensorBatch in the "bgp" layout (group as the middle axis) against numpy."""
	groups = 3

	A = Driver.to_device(queue, np.random.randn(4, groups, 7).astype(np.float32))
	B = Driver.to_device(queue, np.random.randn(A.shape[2], groups, 5).astype(np.float32))
	C = Driver.to_device(queue, np.random.randn(A.shape[0], groups, B.shape[2]).astype(np.float32))

	# Plain product: out[:, g, :] = A[:, g, :] @ B[:, g, :]
	out = mulTensorBatch(A, B, formatA="bgp", formatB="bgp")
	assert np.allclose(np.einsum("agk,kgb->agb", A.get(), B.get()), out.get())

	# Transposed first operand: out[:, g, :] = A[:, g, :].T @ C[:, g, :]
	out = mulTensorBatch(A, C, formatA="bgp", formatB="bgp", transpA=True)
	assert np.allclose(np.einsum("kga,kgb->agb", A.get(), C.get()), out.get())

	# Transposed second operand: out[:, g, :] = B[:, g, :] @ C[:, g, :].T
	out = mulTensorBatch(B, C, formatA="bgp", formatB="bgp", transpB=True)
	assert np.allclose(np.einsum("agk,bgk->agb", B.get(), C.get()), out.get())
def depthConcatTest():
	"""Validates depthConcat/depthSplit against a host-side reference on three mismatched-size maps."""
	data1 = Driver.to_device(queue, np.random.randn(3, 4, 3, 3).astype(np.float32))
	data2 = Driver.to_device(queue, np.random.randn(3, 2, 6, 6).astype(np.float32))
	data3 = Driver.to_device(queue, np.random.randn(3, 5, 4, 4).astype(np.float32))
	alldata = [data1, data2, data3]

	outdata = depthConcat(alldata)

	# Output holds all maps stacked depthwise, centered inside the largest spatial extent
	depth = sum(data.shape[1] for data in alldata)
	h = max(data.shape[2] for data in alldata)
	w = max(data.shape[3] for data in alldata)

	hostOutData = np.zeros(shape=(data1.shape[0], depth, h, w), dtype=np.float32)

	hostOutData[:, :4, 1:4, 1:4] = data1.get()
	hostOutData[:, 4:6, :, :] = data2.get()
	hostOutData[:, 6:, 1:5, 1:5] = data3.get()

	assert np.allclose(hostOutData, outdata.get())

	grad = Driver.to_device(queue, np.random.randn(*hostOutData.shape).astype(np.float32))
	ingrads = depthSplit(grad, alldata)

	# Splitting the gradient must recover exactly the slices each input occupied
	hostInGrads = [
		grad.get()[:, :4, 1:4, 1:4],
		grad.get()[:, 4:6, :, :],
		grad.get()[:, 6:, 1:5, 1:5]
	]

	assert all(np.allclose(hostInGrads[i], ingrads[i].get()) for i in range(len(hostInGrads)))
def instanceNorm2d(data, scale, bias, epsilon=1e-5):
	"""
	Instance normalization of a 4d tensor, built on top of per-map batch normalization.

	Folds the batch dimension into the maps dimension so that batchnorm's per-map statistics
	normalize every (sample, map) slice independently.

	:param data: 4d input tensor (batch, maps, h, w).
	:param scale: per-map scale of shape (1, maps, 1, 1); tiled across the batch when batchsize > 1.
	:param bias: per-map bias, same shape as scale.
	:param epsilon: numerical stabilizer added to the variance.
	:return: (output reshaped back to data.shape, saved mean, saved inverse variance, extended scale).
	"""
	batchsize = data.shape[0]
	if batchsize > 1:
		# Replicate the parameters per sample, since the reshape below multiplies the map count by batchsize
		extscale = Utils.tile(scale, batchsize, axis=1)
		extbias = Utils.tile(bias, batchsize, axis=1)
	else:
		extscale = scale
		extbias = bias

	# View (batch, maps, h, w) as (1, batch * maps, h, w): one "map" per (sample, map) pair
	indata = data.reshape(1, batchsize * data.shape[1], data.shape[2], data.shape[3])

	mean = Driver.empty(queue, (1, indata.shape[1], 1, 1), dtype=np.float32, allocator=memPool)
	var = Driver.empty(queue, (1, indata.shape[1], 1, 1), dtype=np.float32, allocator=memPool)

	outdata, savemean, saveinvvar = MIOpen.batchNorm2d(indata, extscale, extbias, mean, var, epsilon, test=False)

	return outdata.reshape(data.shape), savemean, saveinvvar, extscale
def maxpool2dTest():
	"""Validates pool2d max-pooling forward on a single 8x8 map; backward is smoke-tested only."""
	batchsize, maps, h, w = 1, 1, 8, 8
	data = Driver.to_device(queue, np.random.randn(batchsize, maps, h, w).astype(np.float32))

	outdata, workspace = pool2d(data, test=False)

	def maxDownSample2d(dat, factor):
		# One strided view per (row, col) offset inside the pooling window; elementwise max of them all
		trimrows = dat.shape[0] // factor * factor
		trimcols = dat.shape[1] // factor * factor

		hops = [
			dat[roff:trimrows:factor, coff:trimcols:factor]
			for coff in range(factor) for roff in range(factor)
		]
		return np.maximum.reduce(hops)

	hostOutData = maxDownSample2d(data.get()[0, 0], 2)
	assert np.allclose(hostOutData, outdata.get())

	grad = Driver.to_device(queue, np.random.randn(*outdata.shape).astype(np.float32))
	pool2dBackward(data, outdata, grad, workspace)
def batchNorm2d(data, scale, bias, mean, var, epsilon=1e-5, factor=1.0, test=False, mode=BatchNormMode.spatial, out=None):
	"""
	2d batch normalization via MIOpen.

	:param data: 4d input tensor.
	:param scale: learned scale, same rank as data with singleton spatial dims.
	:param bias: learned bias, same rank as scale.
	:param mean: running mean; read in test mode, updated in training mode.
	:param var: running variance; read in test mode, updated in training mode.
	:param epsilon: numerical stabilizer added to the variance.
	:param factor: exponential-average factor for the running statistics (training only).
	:param test: inference (True) vs training (False) path.
	:param mode: per-map (spatial) or per-activation normalization.
	:param out: optional preallocated output tensor.
	:return: outdata in test mode; (outdata, savemean, saveinvvar) in training mode.
	"""
	assert data.ndim == scale.ndim and scale.ndim == bias.ndim and bias.ndim == mean.ndim and mean.ndim == var.ndim
	checkOffsets(bias, mean, var)

	descData = createDescribed4dTensor(data, allowOffset=True)
	descScale = createDescribed4dTensor(scale)

	if out is None:
		descOutData = createDescribed4dTensor(Driver.empty(queue, data.shape, dtype=data.dtype, allocator=memPool))
	else:
		descOutData = createDescribed4dTensor(out)

	if test:
		# Inference consumes the running statistics and saves nothing
		savemean, saveinvvar = None, None
		libmiopen.miopenBatchNormalizationForwardInference(context, mode.value, 1.0, 0.0, descData.desc,
			descData.ptr, descOutData.desc, descOutData.ptr, descScale.desc, descScale.ptr, bias.int_ptr,
			mean.int_ptr, var.int_ptr, epsilon)
	else:
		# Training saves the batch mean / inverse variance for the backward pass
		savemean = Driver.empty(queue, mean.shape, dtype=data.dtype, allocator=memPool)
		saveinvvar = Driver.empty(queue, var.shape, dtype=data.dtype, allocator=memPool)

		libmiopen.miopenBatchNormalizationForwardTraining(context, mode.value, 1.0, 0.0, descData.desc,
			descData.ptr, descOutData.desc, descOutData.ptr, descScale.desc, descScale.ptr, bias.int_ptr,
			factor, mean.int_ptr, var.int_ptr, epsilon, savemean.int_ptr, saveinvvar.int_ptr)

	destroyDescribedTensors(descData, descScale, descOutData)

	if test:
		return descOutData.tensor
	else:
		return descOutData.tensor, savemean, saveinvvar
def batchNorm2dBackward(data, grad, scale, savemean=None, saveinvvar=None, epsilon=1e-5, mode=BatchNormMode.spatial):
	"""
	Backward pass of 2d batch normalization via MIOpen.

	:param data: 4d forward-pass input.
	:param grad: 4d gradient w.r.t. the output.
	:param scale: scale parameter used in the forward pass.
	:param savemean: batch mean saved by the training forward pass, or None.
	:param saveinvvar: inverse variance saved by the training forward pass, or None.
	:param epsilon: stabilizer (relevant when saved statistics are absent).
	:param mode: per-map (spatial) or per-activation normalization.
	:return: (ingrad, scalegrad, biasgrad).
	"""
	assert data.ndim == grad.ndim and grad.ndim == scale.ndim

	# Replace tensors by raw pointers; None is passed through (presumably MIOpen then
	# recomputes the statistics itself — confirm against the library docs)
	if savemean is not None:
		assert scale.ndim == savemean.ndim
		checkOffsets(savemean)
		savemean = savemean.int_ptr

	if saveinvvar is not None:
		assert scale.ndim == saveinvvar.ndim
		checkOffsets(saveinvvar)
		saveinvvar = saveinvvar.int_ptr

	descData = createDescribed4dTensor(data, allowOffset=True)
	descGrad = createDescribed4dTensor(grad)
	descScale = createDescribed4dTensor(scale)
	descInGrad = createDescribed4dTensor(Driver.empty(queue, grad.shape, dtype=grad.dtype, allocator=memPool))

	scalegrad = Driver.empty(queue, scale.shape, dtype=scale.dtype, allocator=memPool)
	bgrad = Driver.empty(queue, scale.shape, dtype=scale.dtype, allocator=memPool)

	libmiopen.miopenBatchNormalizationBackward(context, mode.value, 1.0, 0.0, 1.0, 0.0, descData.desc,
		descData.ptr, descGrad.desc, descGrad.ptr, descInGrad.desc, descInGrad.ptr, descScale.desc,
		descScale.ptr, scalegrad.int_ptr, bgrad.int_ptr, epsilon, savemean, saveinvvar)

	destroyDescribedTensors(descData, descGrad, descInGrad, descScale)
	return descInGrad.tensor, scalegrad, bgrad
def lrn(data, mode=LRNMode.map, N=5, alpha=1e-4, beta=0.75, K=2.0, test=False):
	"""
	Local response normalization via MIOpen.

	:param data: 4d input tensor.
	:param mode: within-map (map) or cross-map (cross) normalization.
	:param N: normalization window size.
	:param alpha: scaling coefficient of the squared sum.
	:param beta: exponent of the norm.
	:param K: additive constant of the norm.
	:param test: inference (True) vs training (False); training keeps a workspace for the backward pass.
	:return: outdata in test mode; (outdata, workspace) in training mode.
	"""
	descData = createDescribed4dTensor(data, allowOffset=True)
	descOutData = createDescribed4dTensor(Driver.empty(queue, data.shape, dtype=data.dtype, allocator=memPool))
	descLRN = createDescribedLRN(mode, N, alpha, beta, K)

	workspace, ptr = None, None
	if not test:
		# Backward needs the intermediate values, so allocate the library-reported workspace
		size = libmiopen.miopenLRNGetWorkSpaceSize(descOutData.desc)
		workspace = Driver.empty(queue, (size, ), dtype=np.uint8, allocator=memPool)
		ptr = workspace.int_ptr

	libmiopen.miopenLRNForward(context, descLRN.desc, 1.0, descData.desc, descData.ptr, 0.0, descOutData.desc,
		descOutData.ptr, not test, ptr)

	if mode == LRNMode.cross:
		# NOTE(review): post-processing with signKer suggests the cross-map output loses the
		# input's sign and it is reapplied here — confirm against the kernel source
		signKer(descOutData.tensor, descOutData.tensor, descData.tensor)

	destroyDescribedTensors(descData, descOutData)
	destroyDescribedLRN(descLRN)

	if test:
		return descOutData.tensor
	else:
		return descOutData.tensor, workspace
def pool2d(data, size=2, stride=2, pad=0, mode=PoolMode.max, test=False):
	"""
	2d pooling via MIOpen.

	:param data: 4d input tensor.
	:param size: pooling window size.
	:param stride: pooling stride.
	:param pad: input padding.
	:param mode: pooling operator (e.g. max).
	:param test: inference (True) vs training (False); training keeps a workspace for the backward pass.
	:return: outdata in test mode; (outdata, workspace) in training mode.
	"""
	descData = createDescribed4dTensor(data, allowOffset=True)
	descPool = createDescribedPool2d(size, stride, pad, mode)

	outshape = getPool2dOutShape(descPool, descData)
	descOutData = createDescribed4dTensor(Driver.empty(queue, outshape, dtype=data.dtype, allocator=memPool))

	# NOTE: `size` is rebound to the workspace byte size from here on (the window size has
	# already been captured by the pooling descriptor above)
	workspace, ptr, size = None, None, 0
	if not test:
		size = libmiopen.miopenPoolingGetWorkSpaceSize(descOutData.desc)
		workspace = Driver.empty(queue, (size, ), dtype=np.uint8, allocator=memPool)
		ptr = workspace.int_ptr

	libmiopen.miopenPoolingForward(context, descPool.desc, 1.0, descData.desc, descData.ptr, 0.0,
		descOutData.desc, descOutData.ptr, not test, ptr, size)

	destroyDescribedTensors(descData, descOutData)
	destroyDescribedPool(descPool)

	if test:
		return descOutData.tensor
	else:
		return descOutData.tensor, workspace
def argminmax(mat, axis, mode):
	"""
	Computes per-column (axis=0) or per-row (axis=1) argmin/argmax of a 2d float32 matrix.

	:param mat: 2d float32 device matrix.
	:param axis: reduction axis, 0 or 1.
	:param mode: "max" or "min".
	:return: int32 device vector of winning indices (the extrema values themselves are discarded).
	"""
	assert mat.ndim == 2 and mat.dtype == np.float32

	mod = maxmod if mode == "max" else minmod
	rows, cols = mat.shape

	if axis == 0:
		block = (NT2, 1, 1)
		grid = (roundUp(cols, block[0]), 1, 1)

		target = Driver.empty(queue, (cols, ), dtype=np.float32, allocator=memPool)
		idx = Driver.empty(queue, (cols, ), dtype=np.int32, allocator=memPool)

		mod.minMaxOnCol(queue, grid, block, mat.data, target.data, idx.data, np.int32(cols), np.int32(rows))

	elif axis == 1:
		block = (NT1, 1, 1)
		# One block per row
		grid = (rows * block[0], 1, 1)

		target = Driver.empty(queue, (rows, ), dtype=np.float32, allocator=memPool)
		idx = Driver.empty(queue, (rows, ), dtype=np.int32, allocator=memPool)

		mod.minMaxOnRow(queue, grid, block, mat.data, target.data, idx.data, np.int32(cols), np.int32(rows))

	else:
		raise NotImplementedError()

	return idx
def softmaxTest():
	"""Validates softmax2d forward/backward on (batch, maps, 1, 1) tensors against a host reference."""
	batchsize, maps = 5, 8
	data = Driver.to_device(queue, np.random.randn(batchsize, maps, 1, 1).astype(np.float32))

	outdata = softmax2d(data)

	def hostSoftmax(w):
		# Shift by the maximum for numerical stability
		e = np.exp(w - np.amax(w))
		return e / np.sum(e)

	hostData = data.get().reshape(batchsize, maps)
	hostOutData = np.vstack([hostSoftmax(row) for row in hostData])

	assert np.allclose(hostOutData, outdata.get().reshape(batchsize, maps))

	grad = Driver.to_device(queue, np.random.randn(batchsize, maps, 1, 1).astype(np.float32))
	ingrad = softmax2dBackward(outdata, grad)

	def hostSoftmaxBackward(outdat, gr):
		# Softmax Jacobian-vector product in closed form: p * (g - <p, g>)
		return outdat * (gr - np.dot(outdat, gr))

	hostGrad = grad.get().reshape(batchsize, maps)
	hostInGrad = np.vstack([hostSoftmaxBackward(hostOutData[i], hostGrad[i]) for i in range(batchsize)])

	assert np.allclose(hostInGrad, ingrad.get().reshape(batchsize, maps))
def crossMapLRNTest():
	"""Validates cross-map LRN forward/backward on a (1, maps, 1, 1) tensor against a host reference."""
	maps = 10
	N, alpha, beta, K = 5, 1.0, 0.5, 2.0

	# Window around map i: [i - lookBehind, i + lookAhead)
	lookBehind = int((N - 1) / 2)
	lookAhead = N - lookBehind

	data = Driver.to_device(queue, np.random.randn(1, maps, 1, 1).astype(np.float32))
	outdata, workspace = lrn(data, mode=LRNMode.cross, N=N, alpha=alpha, beta=beta, K=K)

	hostData = data.get().reshape(maps, ).astype(np.float32)

	norms = np.empty((maps, ), dtype=np.float32)
	for i in range(maps):
		window = hostData[max(0, i - lookBehind):min(maps, i + lookAhead)]
		norms[i] = K + np.sum(window**2) * alpha / N

	hostOutData = hostData / norms**beta
	assert np.allclose(hostOutData, outdata.reshape(maps, ).get())

	grad = Driver.to_device(queue, np.random.randn(1, maps, 1, 1).astype(np.float32))
	ingrad = lrnBackward(data, outdata, grad, workspace, mode=LRNMode.cross, N=N, alpha=alpha, beta=beta, K=K)

	hostGrad = grad.get().reshape(maps, ).astype(np.float32)
	hostInGrad = np.zeros((maps, ), dtype=np.float32)

	c = 2.0 * alpha * beta / N
	for i in range(maps):
		hostInGrad[i] += hostGrad[i] / norms[i]**beta

		# Each map's gradient also flows into every neighbor inside its window
		for j in range(max(0, i - lookBehind), min(maps, i + lookAhead)):
			hostInGrad[j] -= hostGrad[i] * c * hostData[i] * hostData[j] / norms[i]**(beta + 1)

	assert np.allclose(hostInGrad, ingrad.reshape(maps, ).get())
def __call__(self, *args, **kwargs):
	"""
	Launches the elementwise kernel over its arguments.

	:param args: kernel arguments; rewriteArgs replaces array arguments with raw buffers and
		derives the common element count.
	:param kwargs: optional "slice" — a (start, step, stop) triple restricting the index range;
		stop may be None, meaning the full size.
	"""
	slc = kwargs.get("slice", None)
	# Sliced and full launches use different kernel specializations
	kernel = self.get_kernel(slc is not None)

	if slc is not None:
		start, step, stop = slc[:]
		args, size = rewriteArgs(args)

		if stop is None:
			stop = size

		# Size the launch by the number of indices actually visited
		gridsize, blocks = Driver.splay(self.context, (stop - start) // step)
		block = (blocks, 1, 1)
		grid = (gridsize, 1, 1)

		kernel(self.queue, grid, block, *args, start, step, stop)
	else:
		args, size = rewriteArgs(args)

		gridsize, blocks = Driver.splay(self.context, size)
		block = (blocks, 1, 1)
		grid = (gridsize, 1, 1)

		kernel(self.queue, grid, block, *args, size)
def svmTest():
	"""Validates the L1-SVM cost and gradient against a host-side reference."""
	batchsize, size = 20, 4

	scores = Driver.to_device(queue, np.random.randn(batchsize, size).astype(np.float32))
	labels = Driver.to_device(queue, np.random.randint(low=0, high=size, size=(batchsize, ), dtype=np.int32))

	error, grad = svm(scores, labels, mode="l1")

	hostScores, hostLabels = scores.get(), labels.get()
	hostGrad = np.empty(grad.shape, dtype=np.float32)
	hostError = 0.0

	for b in range(batchsize):
		for n in range(size):
			# +1 for the target class, -1 for every other class
			sign = 2 * (hostLabels[b] == n) - 1
			margin = hostScores[b, n] * sign

			hostGrad[b, n] = sign / batchsize / size if margin < 1 else 0.0
			hostError += max(0.0, 1.0 - margin) / batchsize / size

	assert np.allclose(hostGrad, grad.get())
	assert np.isclose(hostError, error.get() / scores.shape[0])
def split(ary, sections, axis):
	"""
	Splits a device array into consecutive chunks along the given axis.

	:param ary: source device array.
	:param sections: chunk sizes along `axis`; must sum to ary.shape[axis].
	:param axis: split axis.
	:return: list of freshly-allocated device arrays, one per section.
	"""
	assert np.sum(sections) == ary.shape[axis]

	outs = []
	for sec in sections:
		shape = ary.shape[:axis] + (sec, ) + ary.shape[axis + 1:]
		outs.append(
			Driver.empty(queue, shape, dtype=ary.dtype, allocator=memoryPool))

	# Collapse to 2d — rows span the dims before the axis, columns the axis and after —
	# so each chunk becomes a contiguous column band copyable with a single pitched copy
	ary = ary.reshape(int(np.prod(ary.shape[:axis])), int(np.prod(ary.shape[axis:])))

	stride = 0
	for i, out in enumerate(outs):
		out = out.reshape(int(np.prod(out.shape[:axis])), int(np.prod(out.shape[axis:])))

		# Byte offsets (origins) account for views into larger base buffers;
		# `stride` walks over previously-copied column bands of the source
		Driver.enqueue_copy_3d(queue, out.base_data, ary.base_data, dest_origin=(out.offset, 0, 0),
			src_origin=(ary.offset + stride, 0, 0), region=(out.strides[0], ary.shape[0], 1),
			dest_pitches=(out.strides[0], 0), src_pitches=(ary.strides[0], 0))

		stride += out.strides[0]

	return outs
def maxpool2d(data, size, stride, pad):
	"""
	Max pooling over a 4d float32 tensor with a custom kernel.

	:param data: 4d float32 device tensor (batch, maps, h, w).
	:param size: (fh, fw) pooling window.
	:param stride: (hstride, wstride) steps.
	:param pad: (hpad, wpad) input padding.
	:return: (outdata, mask) — pooled values and int32 argmax positions for the backward pass.
	"""
	assert data.dtype == np.float32

	batchsize, maps, inh, inw = data.shape
	fh, fw = size
	hstride, wstride = stride
	hpad, wpad = pad

	outh = (inh - fh + 2 * hpad) // hstride + 1
	outw = (inw - fw + 2 * wpad) // wstride + 1
	outshape = (batchsize, maps, outh, outw)

	outdata = Driver.empty(queue, outshape, dtype=np.float32, allocator=memPool)
	mask = Driver.empty(queue, outshape, dtype=np.int32, allocator=memPool)

	# One thread per output element
	nelems = int(np.prod(outshape))
	block = (nthreads, 1, 1)
	grid = (roundUp(nelems, nthreads), 1, 1)

	mod.maxpool2d(
		queue, grid, block, outdata.data, data.data, mask.data,
		np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw), np.int32(maps),
		np.int32(hstride), np.int32(wstride), np.int32(hpad), np.int32(wpad),
		np.int32(fh), np.int32(fw), np.int32(nelems)
	)

	return outdata, mask
def svm(scores, labels, mode, error=None):
	"""
	SVM (hinge) loss over class scores.

	:param scores: float32 score tensor, (batch, classes, ...).
	:param labels: int32 target labels.
	:param mode: "l1" or "l2" hinge penalty.
	:param error: optional scalar error accumulator; allocated and zeroed when omitted.
	:return: (error, grad) — scalar loss and gradient w.r.t. scores.
	"""
	assert scores.dtype == np.float32 and labels.dtype == np.int32
	shape = scores.shape

	grad = Driver.empty(queue, shape, dtype=np.float32, allocator=memPool)

	if error is None:
		error = Driver.empty(queue, (), dtype=np.float32, allocator=memPool)
		error.fill(0.0)

	# One thread per score element; strides locate the (sample, map, position) of each index
	size = int(np.prod(scores.shape))
	spatialDim = int(np.prod(scores.shape[2:]))
	mapStride = spatialDim * scores.shape[1]

	block = (nthreads, 1, 1)
	grid = (roundUp(size, nthreads), 1, 1)

	if mode == "l1":
		krl = svmL1Mod.cost
	elif mode == "l2":
		krl = svmL2Mod.cost
	else:
		raise ValueError()

	# Labels may be a non-zero-offset view, hence base buffer plus element offset
	krl(queue, grid, block, scores.data, labels.base_data, np.int32(labels.offset // labels.dtype.itemsize),
		np.int32(size), np.int32(mapStride), np.int32(spatialDim), np.int32(shape[1]), np.int32(shape[0]),
		error.data, grad.data)

	return error, grad
def vecBgpTest():
	"""Validates grouped tensor-vector products (mulTensorOnVecGroup) against numpy."""
	groups = 5

	tensor = Driver.to_device(queue, np.random.randn(7, groups, 4).astype(np.float32))
	x = Driver.to_device(queue, np.random.randn(groups, tensor.shape[2]).astype(np.float32))
	y = Driver.to_device(queue, np.random.randn(groups, tensor.shape[0]).astype(np.float32))

	# out[g] = tensor[:, g, :] @ x[g]
	out = mulTensorOnVecGroup(tensor, x)
	assert np.allclose(np.einsum("agk,gk->ga", tensor.get(), x.get()), out.get())

	# out[g] = tensor[:, g, :].T @ y[g]
	out = mulTensorOnVecGroup(tensor, y, transpT=True)
	assert np.allclose(np.einsum("agk,ga->gk", tensor.get(), y.get()), out.get())
def __call__(self, shape, dtype, constant=1.0):
	"""
	Returns a device tensor of the given shape filled with `constant`, cached per
	(shape, dtype, constant) so repeated requests reuse the same tensor.

	:param shape: tensor shape.
	:param dtype: numpy dtype of the tensor.
	:param constant: scalar fill value, or a sequence of per-row values for a 2d shape.
	:return: cached device tensor.
	"""
	# Bug fix: sequence constants must be folded into a tuple — a list inside the key made the
	# key unhashable and crashed the cache lookup with TypeError
	if isinstance(constant, (tuple, list, range)):
		key = (shape, dtype, tuple(constant))
	else:
		key = (shape, dtype, constant)

	fills = self.pool.get(key, None)

	if fills is None:
		if constant == 1.0:
			fills = Driver.to_device(queue, np.ones(shape, dtype=dtype), allocator=memoryPool)
		elif constant == 0.0:
			fills = Driver.zeros(queue, shape, dtype=dtype, allocator=memoryPool)
		else:
			if isinstance(constant, (tuple, list, range)):
				# Row i of the 2d shape is filled with constant[i]
				values = []
				for i in range(shape[0]):
					values.extend([constant[i]] * shape[1])
				fills = np.array(values, dtype=dtype)
			else:
				fills = np.full(shape, constant, dtype=dtype)
			fills = Driver.to_device(queue, fills, allocator=memoryPool)

		self.pool[key] = fills

	return fills
def backwardParamsRnn(data, outdata, w, trainReserve, descRnn, inithidden=None):
	"""
	Computes the RNN weight gradient from the saved training reserve via MIOpen.

	:param data: 3d float32 input, (seqlen, batchsize, insize).
	:param outdata: 3d float32 forward output, (seqlen, batchsize, hsize or 2 * hsize).
	:param w: flat float32 weight tensor; defines the gradient's shape.
	:param trainReserve: (workspace, reserveSpace) pair kept from the forward training pass.
	:param descRnn: RNN descriptor (layers, hsize, insize, direction).
	:param inithidden: optional initial hidden state; zeros are used when omitted.
	:return: flat weight-gradient tensor of w's shape.
	"""
	assert data.ndim == 3 and data.dtype == np.float32 and descRnn.insize == data.shape[
		2]
	assert outdata.ndim == 3 and outdata.dtype == data.dtype
	assert w.ndim == 1 and w.dtype == np.float32

	seqlen, batchsize, _ = data.shape

	# Bidirectional RNNs double both the hidden-state layer count and the output width
	if descRnn.dir == DirectionMode.uni:
		assert outdata.shape[2] == descRnn.hsize
		dims, strides = (descRnn.layers, batchsize, descRnn.hsize), (batchsize * descRnn.hsize, descRnn.hsize, 1)
	else:
		assert outdata.shape[2] == 2 * descRnn.hsize
		dims, strides = (2 * descRnn.layers, batchsize, descRnn.hsize), (batchsize * descRnn.hsize, descRnn.hsize, 1)

	if inithidden is not None:
		assert inithidden.dtype == np.float32 and inithidden.shape == dims
	else:
		inithidden = Driver.zeros(queue, dims, dtype=np.float32, allocator=memPool)

	descHx = createDescribedNdTensor(dims, strides, inithidden)

	# Per-timestep descriptors: every step shares the same shape/strides, so the slice-0 tensor
	# serves as the template for all of them; the real data pointers are passed separately below
	descDatas = []
	descOutDatas = []

	for d in range(data.shape[0]):
		descDatas.append(createDescribedNdTensor(None, None, data[0]))
		descOutDatas.append(createDescribedNdTensor(None, None, outdata[0]))

	indescs, outdescs = [d.desc for d in descDatas], [d.desc for d in descOutDatas]

	# Zero-initialized since the gradient tensor is an accumulation target
	dw = Driver.zeros(queue, w.shape, dtype=np.float32, allocator=memPool)
	descDw = createDescribedNdTensor(None, None, dw)

	workspace, reserveSpace = trainReserve

	libmiopen.miopenRNNBackwardWeights(context, descRnn.desc, seqlen, indescs, data.int_ptr, descHx.desc,
		descHx.ptr, outdescs, outdata.int_ptr, descDw.desc, descDw.ptr, workspace.int_ptr, workspace.nbytes,
		reserveSpace.int_ptr, reserveSpace.nbytes)

	destroyDescribedTensors(*descDatas, *descOutDatas, descHx, descDw)
	return dw
def unittest():
	"""Validates instanceNorm2d forward/backward against a host-side reference implementation."""
	batchsize, maps, h, w = 3, 4, 5, 5
	epsilon = 1e-5

	data = Driver.to_device(queue, np.random.randn(batchsize, maps, h, w).astype(np.float32))
	scale = Driver.to_device(queue, np.random.randn(1, maps, 1, 1).astype(np.float32))
	bias = Driver.to_device(queue, np.random.randn(1, maps, 1, 1).astype(np.float32))

	outdata, savemean, saveinvvar, extscale = instanceNorm2d(data, scale, bias, epsilon)

	# Host reference: flatten each (sample, map) slice to a row and normalize per row
	hostData = data.get().reshape(data.shape[0] * data.shape[1], -1)
	hostScale, hostBias = scale.get().reshape(maps, 1), bias.get().reshape(maps, 1)
	# Parameters tiled across the batch, mirroring the tiling inside instanceNorm2d
	hostExtScale, hostExtBias = np.tile(hostScale, (batchsize, 1)), np.tile(hostBias, (batchsize, 1))

	hostMean = np.mean(hostData, axis=1, keepdims=True)
	hostInvVar = 1.0 / np.sqrt(np.var(hostData, axis=1) + epsilon)
	hostOutData = (hostData - hostMean) * hostInvVar[:, np.newaxis]
	hostOutScData = hostOutData * hostExtScale + hostExtBias

	assert np.allclose(hostOutScData.reshape(data.shape), outdata.get())
	assert np.allclose(hostMean.reshape(savemean.shape), savemean.get())
	assert np.allclose(hostInvVar.reshape(saveinvvar.shape), saveinvvar.get())

	grad = Driver.to_device(queue, np.random.randn(batchsize, maps, h, w).astype(np.float32))
	ingrad, scalegrad, bgrad = instanceNorm2dBackward(grad, data, extscale, savemean, saveinvvar, epsilon)

	hostGrad = grad.get().reshape(grad.shape[0] * grad.shape[1], -1)
	hostScGrad = hostGrad * hostExtScale

	# Per-row correlation between the scaled gradient and the normalized output
	hostCorrs = np.empty(hostInvVar.shape, dtype=np.float32)
	for i in range(hostCorrs.shape[0]):
		hostCorrs[i] = np.dot(hostScGrad[i], hostOutData[i]) / hostScGrad.shape[1]

	hostInGrad = hostScGrad - np.mean(hostScGrad, axis=1, keepdims=True) - hostCorrs[:, np.newaxis] * hostOutData
	hostInGrad *= hostInvVar[:, np.newaxis]

	# Parameter gradients are summed over the batch back to per-map shape
	hostScaleGrad = np.sum(np.sum(hostOutData * hostGrad, axis=1).reshape(batchsize, -1), axis=0)
	hostBiasGrad = np.sum(np.sum(hostGrad, axis=1).reshape(batchsize, -1), axis=0)

	assert np.allclose(hostInGrad.reshape(grad.shape), ingrad.get())
	assert np.allclose(hostScaleGrad.reshape(1, maps, 1, 1), scalegrad.get())
	assert np.allclose(hostBiasGrad.reshape(1, maps, 1, 1), bgrad.get())
def speedTest():
	"""Benchmarks the addVecToMat and argmax kernels on a 1024x1024 float32 matrix."""
	from PuzzleLib.OpenCL.Benchmarks.Utils import timeKernel

	A = Driver.to_device(queue, np.random.randn(1024, 1024).astype(np.float32))
	v = Driver.to_device(queue, np.random.randn(1024).astype(np.float32))

	benchmarks = (
		(addVecToMat, (v, A, 1, True), "addVecToMat on rows"),
		(addVecToMat, (v, A, 0, True), "addVecToMat on cols"),
		(argmax, (A, 1), "argmax on rows"),
		(argmax, (A, 0), "argmax on cols")
	)

	for func, args, logname in benchmarks:
		timeKernel(func, args, logname=logname)
def upsample2dLinearTest():
	"""Validates bilinear upsample2d forward/backward against a straightforward host implementation."""
	batchsize, maps, inh, inw = 3, 2, 4, 4
	hscale, wscale = 2, 3

	data = Driver.to_device(queue, np.random.randn(batchsize, maps, inh, inw).astype(np.float32))
	outdata = upsample2d(data, (hscale, wscale), mode="linear")

	hostData = data.get()
	hostOutData = np.zeros(outdata.shape, dtype=np.float32)

	# Ratios mapping output pixel coordinates back onto the input grid (align-corners style)
	rh, rw = (inh - 1) / (inh * hscale - 1), (inw - 1) / (inw * wscale - 1)

	for b in range(batchsize):
		for c in range(maps):
			for y in range(inh * hscale):
				for x in range(inw * wscale):
					iny, inx = int(rh * y), int(rw * x)
					dy, dx = 1.0 - (rh * y - iny), 1.0 - (rw * x - inx)

					# Clamp the interpolation neighbor at the right/bottom border
					xi = 1 if x < inw * wscale - 1 else 0
					yi = 1 if y < inh * hscale - 1 else 0

					hostOutData[b, c, y, x] = \
						dy * (dx * hostData[b, c, iny, inx] + (1 - dx) * hostData[b, c, iny, inx + xi]) + \
						(1 - dy) * (dx * hostData[b, c, iny + yi, inx] + (1 - dx) * hostData[b, c, iny + yi, inx + xi])

	# Bug fix: the forward reference was computed but never checked — the forward pass went untested
	assert np.allclose(hostOutData, outdata.get())

	grad = Driver.to_device(queue, np.random.randn(*outdata.shape).astype(np.float32))
	ingrad = upsample2dBackward(grad, (hscale, wscale), mode="linear")

	hostGrad = grad.get()
	hostInGrad = np.zeros(data.shape, dtype=np.float32)

	for b in range(batchsize):
		for c in range(maps):
			for y in range(inh * hscale):
				for x in range(inw * wscale):
					iny, inx = int(rh * y), int(rw * x)
					dy, dx = 1.0 - (rh * y - iny), 1.0 - (rw * x - inx)

					xi = 1 if x < inw * wscale - 1 else 0
					yi = 1 if y < inh * hscale - 1 else 0

					# Scatter the output gradient back with the same bilinear weights
					val = hostGrad[b, c, y, x]
					hostInGrad[b, c, iny, inx] += dy * dx * val
					hostInGrad[b, c, iny, inx + xi] += dy * (1 - dx) * val
					hostInGrad[b, c, iny + yi, inx] += (1 - dy) * dx * val
					hostInGrad[b, c, iny + yi, inx + xi] += (1 - dy) * (1 - dx) * val

	assert np.allclose(hostInGrad, ingrad.get())
def calcTest():
	"""Checks addVecToMat broadcasting (rows, cols, tiled vector) and argmax against numpy."""
	A = Driver.to_device(queue, np.random.randn(128, 500).astype(np.float32))
	v = Driver.to_device(queue, np.random.randn(500).astype(np.float32))
	w = Driver.to_device(queue, np.random.randn(128).astype(np.float32))
	m = Driver.to_device(queue, np.random.randn(125).astype(np.float32))

	assert np.allclose(A.get() + v.get()[np.newaxis, :], addVecToMat(v, A, axis=1).get())
	assert np.allclose(A.get() + w.get()[:, np.newaxis], addVecToMat(w, A, axis=0).get())

	# A 125-vector tiled 4 times covers the 500 columns
	assert np.allclose(A.get() + np.tile(m.get(), 4)[np.newaxis, :], addVecToMat(m, A, axis=1).get())

	assert np.allclose(argmax(A, axis=1).get(), np.argmax(A.get(), axis=1))
	assert np.allclose(argmax(A, axis=0).get(), np.argmax(A.get(), axis=0))
def eltwiseTest(context, queue):
	"""Checks ElementwiseKernel by squaring a 256K-element int32 vector on device."""
	outdata = Driver.empty(queue, (1 << 18, ), dtype=np.int32)
	indata = Driver.to_device(queue, np.random.randint(0, 1000, size=(1 << 18, ), dtype=np.int32))

	krl = ElementwiseKernel(
		"int *outdata, const int *indata", "outdata[i] = indata[i] * indata[i]", "krl", context, queue
	)
	krl(outdata, indata)

	hostInData = indata.get()
	assert np.allclose(hostInData * hostInData, outdata.get())
def conv2dBackwardParams(data, grad, W, bias=None, stride=1, pad=0, wgrad=None, bgrad=None, scale=1.0, momentum=0.0,
						 mode=ConvMode.conv, algo=None):
	"""
	Computes 2d convolution gradients w.r.t. the filter (and optionally the bias) via MIOpen.

	:param data: 4d forward-pass input tensor.
	:param grad: 4d gradient w.r.t. the convolution output.
	:param W: filter tensor (used for shape checks and gradient allocation).
	:param bias: forward-pass bias; when given, its gradient is also computed.
	:param stride: convolution stride.
	:param pad: input padding.
	:param wgrad: optional filter-gradient accumulator; written in place when scale == 1.0 and
		momentum == 0.0, otherwise blended as scale * new + momentum * old.
	:param bgrad: optional bias-gradient accumulator, same semantics as wgrad.
	:param scale: blending coefficient for the freshly computed gradient.
	:param momentum: blending coefficient for the accumulator's previous contents.
	:param mode: convolution or cross-correlation.
	:param algo: optional backward-weights algorithm override; cached per-configuration otherwise.
	:return: (wgrad, ) or (wgrad, bgrad) tuple of gradient tensors.
	"""
	assert data.ndim == grad.ndim
	if mode == ConvMode.conv:
		assert grad.shape[1] == W.shape[0] and data.shape[1] == W.shape[1]
	else:
		assert grad.shape[1] == W.shape[1] and data.shape[1] == W.shape[0]

	descData = createDescribed4dTensor(data, allowOffset=True)
	descGrad = createDescribed4dTensor(grad)
	descConv = createDescribedConv2d(stride, pad, 1, mode)

	# Write straight into wgrad only when no blending is requested; otherwise use a temp
	if wgrad is not None and scale == 1.0 and momentum == 0.0:
		descWGrad = createDescribed4dTensor(wgrad)
	else:
		descWGrad = createDescribed4dTensor(Driver.zeros(queue, W.shape, dtype=W.dtype, allocator=memPool))

	_, ptr, size = conv2dBackwardParamsWorkspace(descGrad, descData, descConv, descWGrad)
	algo = cacheConv2dParamsAlgo(descGrad, descData, descConv, descWGrad, (ptr, size), algo)

	libmiopen.miopenConvolutionBackwardWeights(context, 1.0, descGrad.desc, descGrad.ptr, descData.desc,
		descData.ptr, descConv.desc, algo.value, 0.0, descWGrad.desc, descWGrad.ptr, ptr, size)

	# Bug fix: was `wgrad is not None and scale != 1.0 or momentum != 0.0` — `and` binds tighter
	# than `or`, so wgrad=None with non-zero momentum crashed on None.ravel()
	if wgrad is not None and (scale != 1.0 or momentum != 0.0):
		CLBlas.addVectorToVector(descWGrad.tensor.ravel(), wgrad.ravel(), out=wgrad.ravel(), alpha=scale,
			beta=momentum)

	tup = (descWGrad.tensor, )

	if bias is not None:
		assert bias.ndim == data.ndim

		if bgrad is not None and scale == 1.0 and momentum == 0.0:
			descBGrad = createDescribed4dTensor(bgrad)
		else:
			descBGrad = createDescribed4dTensor(Driver.empty(queue, bias.shape, dtype=bias.dtype, allocator=memPool))

		libmiopen.miopenConvolutionBackwardBias(context, 1.0, descGrad.desc, descGrad.ptr, 0.0, descBGrad.desc,
			descBGrad.ptr)

		# Same precedence fix as for wgrad above
		if bgrad is not None and (scale != 1.0 or momentum != 0.0):
			CLBlas.addVectorToVector(descBGrad.tensor.ravel(), bgrad.ravel(), out=bgrad.ravel(), alpha=scale,
				beta=momentum)

		tup = (descWGrad.tensor, descBGrad.tensor)
		destroyDescribedTensors(descBGrad)

	destroyDescribedConv(descConv)
	destroyDescribedTensors(descData, descGrad, descWGrad)

	return tup
def argminmaxBatch(mats, axis, mode):
	"""
	Batched argmin/argmax over a 3d float32 tensor.

	:param mats: device tensor of shape (batch, rows, cols).
	:param axis: 1 — reduce over rows (per column), 2 — reduce over columns (per row).
	:param mode: "max" or "min".
	:return: int32 index tensor (the extrema values themselves are discarded).
	"""
	assert mats.ndim == 3 and mats.dtype == np.float32

	block = (warpSize, 1, 1)

	if axis == 1:
		if mode == "max":
			mod = maxBatchMod
		else:
			mod = minBatchMod
		colKernel = mod.minMaxBatchOnCol

		target = Driver.empty(queue, (mats.shape[0], mats.shape[2]), dtype=np.float32, allocator=memPool)
		idx = Driver.empty(queue, (mats.shape[0], mats.shape[2]), dtype=np.int32, allocator=memPool)

		# One warp per column, with the batch index in the third grid dimension
		grid = (mats.shape[2] * warpSize, 1, mats.shape[0])

		colKernel(queue, grid, block, mats.data, target.data, idx.data, np.int32(mats.shape[2]),
			np.int32(mats.shape[1]))

	elif axis == 2:
		if mode == "max":
			mod = maxmod
		else:
			mod = minmod
		rowKernel = mod.minMaxOnRow

		target = Driver.empty(queue, mats.shape[:2], dtype=np.float32, allocator=memPool)
		idx = Driver.empty(queue, mats.shape[:2], dtype=np.int32, allocator=memPool)

		# Rows of all batches are laid out consecutively: one block per (batch, row) pair,
		# reusing the non-batched row kernel
		grid = (mats.shape[0] * mats.shape[1] * block[0], 1, 1)

		rowKernel(queue, grid, block, mats.data, target.data, idx.data, np.int32(mats.shape[2]),
			np.int32(mats.shape[1]))

	else:
		raise ValueError("Unsupported axis %s was given" % axis)

	return idx
def initOpenCL():
	"""
	Binds the OpenCL backend: publishes module-level aliases (GPUArray, to_gpu, empty, zeros,
	minimum, maximum) that close over the backend's shared context and queue, giving callers
	CUDA-like signatures without the queue argument.
	"""
	from PuzzleLib.OpenCL.Driver import Driver
	from PuzzleLib.OpenCL.Kernels import Templates
	from PuzzleLib.OpenCL.Utils import context, queue

	global GPUArray, to_gpu, empty, zeros
	GPUArray = Driver.Array

	# Pre-bind the shared queue so the aliases match the queue-less CUDA-style API
	to_gpu = lambda *args, **kwargs: Driver.to_device(queue, *args, **kwargs)
	empty = lambda *args, **kwargs: Driver.empty(queue, *args, **kwargs)
	zeros = lambda *args, **kwargs: Driver.zeros(queue, *args, **kwargs)

	global minimum, maximum
	minimum = lambda ary: Templates.minimum(context, queue, ary)
	maximum = lambda ary: Templates.maximum(context, queue, ary)
def create_some_context():
	"""Selects the first known AMD platform, takes its first GPU and returns (context, profiling queue)."""
	platforms = Driver.get_platforms()

	platform = None
	for vendor in ["AMD Accelerated Parallel Processing"]:
		platform = next((pl for pl in platforms if pl.name == vendor), None)
		if platform is not None:
			break

	assert platform is not None

	device = platform.get_devices(type=Driver.device_type.GPU)[0]

	context = Driver.Context([device])
	queue = Driver.CommandQueue(context, profiling=True)

	return context, queue
def gbpGbpTest():
	"""Validates mulTensorBatch in the "gbp" layout (group as the leading axis) against numpy."""
	groups = 3

	A = Driver.to_device(queue, np.random.randn(groups, 4, 3).astype(np.float32))
	B = Driver.to_device(queue, np.random.randn(groups, A.shape[2], 4).astype(np.float32))
	C = Driver.to_device(queue, np.random.randn(groups, A.shape[1], 6).astype(np.float32))
	D = Driver.to_device(queue, np.random.randn(groups, 8, C.shape[2]).astype(np.float32))

	# Plain product: out[g] = A[g] @ B[g]
	out = mulTensorBatch(A, B, formatA="gbp", formatB="gbp", formatOut="gbp")
	assert np.allclose(np.matmul(A.get(), B.get()), out.get())

	# Transposed first operand: out[g] = C[g].T @ A[g]
	out = mulTensorBatch(C, A, formatA="gbp", formatB="gbp", formatOut="gbp", transpA=True)
	assert np.allclose(np.matmul(C.get().transpose(0, 2, 1), A.get()), out.get())

	# Transposed second operand: out[g] = C[g] @ D[g].T
	out = mulTensorBatch(C, D, formatA="gbp", formatB="gbp", formatOut="gbp", transpB=True)
	assert np.allclose(np.matmul(C.get(), D.get().transpose(0, 2, 1)), out.get())
def upsample3dBackward(grad, scale, mode="nearest"):
	"""
	Backward pass of 3d upsampling: folds the output gradient back to input resolution.

	:param grad: 5d gradient tensor (batch, maps, outd, outh, outw).
	:param scale: int or (dscale, hscale, wscale) tuple of upsampling factors.
	:param mode: "nearest" or "linear".
	:return: 5d input gradient (batch, maps, ind, inh, inw).
	"""
	batchsize, maps, outd, outh, outw = grad.shape

	if isinstance(scale, int):
		dscale, hscale, wscale = scale, scale, scale
	else:
		dscale, hscale, wscale = scale

	ind, inh, inw = outd // dscale, outh // hscale, outw // wscale

	if mode == "nearest":
		# Uninitialized buffer: the nearest kernel writes every input cell
		# (unlike the scatter-style linear path below, which needs zeros)
		ingrad = Driver.empty(queue, (batchsize, maps, ind, inh, inw), dtype=grad.dtype, allocator=memPool)

		blk = warpSize * 4
		block = (blk, 1, 1)
		grid = (roundUp(ingrad.size, blk), 1, 1)

		kernel = nearestMod.upsample3dNearestBackward
		kernel(queue, grid, block, ingrad.data, grad.data, np.int32(inh), np.int32(inw), np.int32(outh),
			np.int32(outw), np.int32(dscale), np.int32(hscale), np.int32(wscale), np.int32(ingrad.size))

	elif mode == "linear":
		# Zero-initialized: contributions are accumulated into the input gradient
		ingrad = Driver.zeros(queue, (batchsize, maps, ind, inh, inw), dtype=grad.dtype, allocator=memPool)

		block = (warpSize // 4, warpSize // 4, 1)
		grid = (roundUp(outw, block[0]), roundUp(outh, block[1]), outd)

		# Ratios mapping output grid positions back onto the input grid
		rd = (ind - 1) / (outd - 1)
		rh = (inh - 1) / (outh - 1)
		rw = (inw - 1) / (outw - 1)

		kernel = linearMod.upsample3dLinearBackward
		kernel(queue, grid, block, ingrad.data, grad.data, np.int32(batchsize), np.int32(maps), np.int32(ind),
			np.int32(inh), np.int32(inw), np.int32(outd), np.int32(outh), np.int32(outw), np.float32(rd),
			np.float32(rh), np.float32(rw))

	else:
		raise ValueError("Unrecognized sampling mode")

	return ingrad