def test_saved(precision):
    a = np.load("a%d.npy" % precision)
    b = np.load("b%d.npy" % precision)
    print("NANS:", np.isnan(a).any() or np.isnan(b).any())
    c = np.dot(a, b)
    print(b)
    print("A:", a.shape, a.dtype)
    print("B:", b.shape, b.dtype)
    print("C:", c.shape, c.dtype)
    # ad = gpuarray.to_gpu(a)
    # bd = gpuarray.to_gpu(b)
    ad = GPUTensor(a)
    bd = GPUTensor(b)
    cd = gpu_tensor_gemm(context.cublas, ad, bd)
    print("A:", ad.shape, ad.strides, ad.size, ad.mem_size, str(ad.flags.c_contiguous))
    print("B:", bd.shape, bd.strides, bd.size, bd.mem_size, str(bd.flags.c_contiguous))
    print("C:", cd.shape, cd.strides, cd.size, cd.mem_size, str(cd.flags.c_contiguous))
    c2 = cd.get()
    # print("C2:", c2.shape)
    check_results(c, c2)
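# The helpers used above (GPUTensor, gpu_tensor_gemm, context, check_results) are
# defined elsewhere in this repo. As a rough, hypothetical stand-in, a result check
# could compare the GPU GEMM against the NumPy reference with loose tolerances:
import numpy as np

def check_results_reference(expected, actual, rtol=1e-2, atol=1e-3):
    # loose tolerances, since the fp16 path accumulates noticeably more error
    diff = np.abs(expected.astype(np.float64) - actual.astype(np.float64))
    ok = np.allclose(expected, actual, rtol=rtol, atol=atol)
    print("MAX ABS DIFF:", diff.max())
    print("MATCH:", ok)
    return ok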
class BatchNormalization(Layer):
    def __init__(self, config):
        super().__init__("BatchNormalization")
        assert(config["affine"])
        self.eps = config["eps"]
        variance = np.load(os.path.join(config["baseDir"], config["parameterFiles"][3]))
        # variance = self.load_tensor(config, 3, dtype=np.float32)
        nelem = variance.shape[0]
        if config["varianceFormat"] == "variance" and libcudnn.cudnnGetVersion() < 5000:
            # print("FIXING variance format")
            variance += self.eps
            variance = np.reciprocal(np.sqrt(variance))
        self.variance = GPUTensor(variance, dtype=np.float32, shape=(1, nelem, 1, 1))
        self.W = self.load_tensor(config, 0, dtype=np.float32, shape=(1, nelem, 1, 1))
        self.bias = self.load_tensor(config, 1, dtype=np.float32, shape=(1, nelem, 1, 1))
        # shape=(1, self.W.shape[0], 1, 1))
        self.average = self.load_tensor(config, 2, dtype=np.float32, shape=(1, nelem, 1, 1))
        self.param_desc = self.average.get_cudnn_tensor_desc()
        self.in_desc = None
        self.out_desc = None

    def configure(self, input):
        self.output = GPUTensor(input.shape, input.dtype)
        if self.in_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.in_desc.ptr)
        if self.out_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.out_desc.ptr)
        self.in_desc = input.get_cudnn_tensor_desc()
        self.out_desc = self.output.get_cudnn_tensor_desc()
        # print("BatchNormalization:configure() input=", input.shape, self.W.shape[0])

    def fprop(self, input):
        # The transformation performed by this call blends the result into the
        # destination tensor as:
        # y := alpha*(bnScale*(x - estimatedMean)/sqrt(epsilon + estimatedVariance) + bnBias) + beta*y
        # print("IN:", self.in_desc)
        # print("OUT:", self.out_desc)
        # print("PARAM:", self.param_desc)
        # print("EPSILON:", self.eps)
        # print("VARP:", self.variance.get_gpu_voidp())
        libcudnn.cudnnBatchNormalizationForwardInference(
            context.cudnn,
            libcudnn.cudnnBatchNormMode['CUDNN_BATCHNORM_SPATIAL'],
            1.0, 0.0,
            self.in_desc.ptr, input.get_gpu_voidp(),
            self.out_desc.ptr, self.output.get_gpu_voidp(),
            self.param_desc.ptr,
            self.W.get_gpu_voidp(), self.bias.get_gpu_voidp(),
            self.average.get_gpu_voidp(), self.variance.get_gpu_voidp(),
            self.eps)
        self.check_truth()

    def __str__(self):
        return "BatchNormalization: %dx%d" % (self.W.shape[0], self.bias.shape[0])
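# Reference sketch (not part of the layer): what the cuDNN inference call above
# computes, written per channel in NumPy. Assumes the parameters are stored in
# plain "variance" format (cuDNN >= 5); on older cuDNN the constructor above
# pre-packs 1/sqrt(variance + eps) into the variance tensor instead.
import numpy as np

def batchnorm_inference_reference(x, scale, bias, mean, var, eps):
    # x: (N, C, H, W); scale/bias/mean/var: (1, C, 1, 1)
    return scale * (x - mean) / np.sqrt(var + eps) + bias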
class Linear(Layer):
    def __init__(self, config):
        super().__init__("Linear")
        self.W = self.load_tensor(config, 0)
        self.bias = self.load_tensor(config, 1, shape=(1, self.W.shape[0], 1, 1))
        # self.bias = GPUTensor(os.path.join(config["baseDir"], config["parameterFiles"][1]))
        self.b_desc = self.bias.get_cudnn_tensor_desc()
        # print(self.W.shape)

    def configure(self, input):
        # print("Linear::configure: input shape =", input.shape)
        # print("Linear::configure: W shape =", self.W.shape)
        # print("Linear::configure: b shape =", self.bias.shape)
        elems_per_image = np.prod(input.shape)
        # print(elems_per_image, self.W.shape[1])
        assert(elems_per_image == self.W.shape[1])
        self.output = GPUTensor((1, self.W.shape[0], 1, 1), dtype=input.dtype)
        self.output_desc = self.output.get_cudnn_tensor_desc()
        if self.truth is not None:
            print("OUTPUT TRUTH SHAPE:", self.truth.shape, self.output.shape)

    def fprop(self, input):
        input_2d = input.reshape((self.W.shape[1], 1))
        output_2d = self.output.reshape(self.W.shape[0], 1)
        # print(input_2d.flags.c_contiguous)
        # print(output_2d.flags.c_contiguous)
        # test_cublas()
        # np.save("a16.npy", self.W.get())
        # np.save("b16.npy", input_2d.get())
        # exit(0)
        # ad = self.W
        # print("A:", ad.shape, ad.strides, ad.size, ad.mem_size, str(ad.flags.c_contiguous))
        # print("B:", input.shape, input.strides, input.size, input.mem_size, str(input.flags.c_contiguous))
        # print("B':", input_2d.shape, input_2d.strides, input_2d.size, input_2d.mem_size, str(input_2d.flags.c_contiguous))
        # print("C:", output_2d.shape, output_2d.strides, output_2d.size, output_2d.mem_size, str(output_2d.flags.c_contiguous))
        # print("Linear::fprop()", self.W.shape, input_2d.shape, output_2d.shape)
        cublas_dot.cublas_gemm(context.cublas, self.W, input_2d, output_2d)
        # print("Linear::fprop()", self.output.shape)
        libcudnn.cudnnAddTensor(context.cudnn, 1.0, self.b_desc.ptr, self.bias.get_gpu_voidp(),
                                1.0, self.output_desc.ptr, self.output.get_gpu_voidp())
        self.check_truth()

    def __str__(self):
        return "Linear: %dx%d" % (self.W.shape[0], self.W.shape[1])
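# Reference sketch of what Linear.fprop computes, assuming cublas_gemm performs
# output_2d = W @ input_2d: the (1, C, H, W) input is flattened to a column
# vector, multiplied by W (out_features x in_features) and the bias is added.
import numpy as np

def linear_reference(x, W, b):
    # x: (1, C, H, W) with C*H*W == W.shape[1]; b: (out_features,)
    y = W @ x.reshape(-1, 1) + b.reshape(-1, 1)   # (out_features, 1)
    return y.reshape(1, -1, 1, 1)                 # back to NCHW layout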
def benchmark(datasrc, model):
    start = time.time()
    label, data = datasrc.get_item()
    print("Data load time: %.2fms" % ((time.time() - start) * 1000.0))

    start = time.time()
    data = np.ascontiguousarray(np.expand_dims(np.rollaxis(data, 2), 0)).astype(model.dtype)
    data = model.normalize(data)
    print("Data prep time: %.2fms" % ((time.time() - start) * 1000.0))

    input_tensor = GPUTensor(data)

    # warmup...
    for i in range(1):
        model.evaluate(input_tensor)

    start = time.time()
    num_iterations = 100
    print("Timing %d iterations..." % num_iterations)
    for i in range(num_iterations):
        if i == num_iterations - 1:
            drv.start_profiler()
        y = model.evaluate(input_tensor)
    print(y)
    drv.stop_profiler()
    et = (time.time() - start) * 1000 / num_iterations
    print("Model eval time: %.2fms = %.1ffps" % (et, 1000.0 / et))
def load_tensor(self, config, index, dtype=None, shape=None):
    filename = os.path.join(config["baseDir"], config["parameterFiles"][index])
    if dtype is None:
        dtype = config["dtype"]
    return GPUTensor(filename, dtype, shape)
import math
import numpy as np
import pycuda.autoinit
import libcudnn
from gputensor import GPUTensor

dt = np.float16
xh = np.ones((1, 1, 4, 4), dtype=dt) * 2.0
# print(xh)

cudnn_context = libcudnn.cudnnCreate()
print("CUDNN Version: %d" % libcudnn.cudnnGetVersion())

x = GPUTensor(xh)
y = GPUTensor(xh.shape, dtype=dt)

pdt = np.float32
w = GPUTensor(np.ones(1).reshape(1, 1, 1, 1), dtype=pdt)
bias = GPUTensor(np.zeros(1).reshape(1, 1, 1, 1), dtype=pdt)
mean = GPUTensor(np.ones(1).reshape(1, 1, 1, 1), dtype=pdt)
var = GPUTensor(np.ones(1).reshape(1, 1, 1, 1) * 0.5, dtype=pdt)

x_desc = x.get_cudnn_tensor_desc()
y_desc = y.get_cudnn_tensor_desc()
print(x_desc)
print(y_desc)
param_desc = var.get_cudnn_tensor_desc()
print(param_desc)
class Convolution(SlidingLayer):
    convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
    # convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CONVOLUTION']
    convolution_fwd_pref = libcudnn.cudnnConvolutionFwdPreference['CUDNN_CONVOLUTION_FWD_PREFER_FASTEST']

    def __init__(self, config, name="Convolution"):
        super().__init__(config, name)
        self.output = None
        self.W = self.load_tensor(config, 0)
        self.alpha = 1.0
        self.beta = 0.0
        self.in_desc = None
        self.out_desc = None
        self.num_filter_maps = self.W.shape[0]
        self.num_filter_channels = self.W.shape[1]
        self.bias = self.load_tensor(config, 1, shape=(1, self.num_filter_maps, 1, 1))
        # assert(self.bias.shape[0] == self.num_filter_maps)
        # self.bias = self.bias.reshape((1, self.num_filter_maps, 1, 1))
        # print(self.bias.shape)
        self.b_desc = self.bias.get_cudnn_tensor_desc()
        self.filt_desc = libcudnn.cudnnCreateFilterDescriptor()
        print("FILT:", self.W.dtype, gputensor.np_2_cudnn_dtype[self.W.dtype])
        print("FILT:", self.W.shape, self.num_filter_maps, self.num_filter_channels, self.kH, self.kW)
        libcudnn.cudnnSetFilter4dDescriptor(self.filt_desc,
                                            gputensor.np_2_cudnn_dtype[self.W.dtype],
                                            self.num_filter_maps, self.num_filter_channels,
                                            self.kH, self.kW)
        # print("B:", self.bias.shape)
        # self.bias_desc =
        self.conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
        libcudnn.cudnnSetConvolution2dDescriptor(self.conv_desc, self.padH, self.padW,
                                                 self.dH, self.dW, 1, 1,
                                                 self.convolution_mode)

    def __del__(self):
        pass
        # if self.filt_desc:
        #     libcudnn.cudnnDestroyFilterDescriptor(self.filt_desc)
        # if self.conv_desc:
        #     libcudnn.cudnnDestroyConvolutionDescriptor(self.conv_desc)

    def configure(self, input):
        # print("Convolution::configure: input shape =", input.shape)
        in_images = input.shape[0]
        in_channels = input.shape[1]
        in_height = input.shape[2]
        in_width = input.shape[3]
        assert(in_channels == self.num_filter_channels)
        out_width = int((1.0 * in_width + 2 * self.padW - self.kW) / self.dW + 1)
        out_height = int((1.0 * in_height + 2 * self.padH - self.kH) / self.dH + 1)
        self.output = GPUTensor((in_images, self.num_filter_maps, out_height, out_width), input.dtype)
        # print("ONCV:", input.dtype, self.output.dtype)
        # print("Convolution::configure: output shape =", self.output.shape)

        # initialize cudnn descriptors
        if self.in_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.in_desc.ptr)
        if self.out_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.out_desc.ptr)
        self.in_desc = input.get_cudnn_tensor_desc()

        # Get output dimensions (first two values are n_input and filters_out)
        _, _, out_height2, out_width2 = libcudnn.cudnnGetConvolution2dForwardOutputDim(
            self.conv_desc, self.in_desc.ptr, self.filt_desc)
        assert(out_width == out_width2)
        assert(out_height == out_height2)
        self.out_desc = self.output.get_cudnn_tensor_desc()

        # find best convolution algorithm
        self.algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            context.cudnn, self.in_desc.ptr, self.filt_desc, self.conv_desc,
            self.out_desc.ptr, self.convolution_fwd_pref, 0)
        print("Convolution::configure: algo=%s" % str(self.algo.value))
        self.ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            context.cudnn, self.in_desc.ptr, self.filt_desc, self.conv_desc,
            self.out_desc.ptr, self.algo)
        self.ws_ptr = drv.mem_alloc(self.ws_size.value) if self.ws_size.value > 0 else 0
        print("Convolution::configure: workspace size=%d" % self.ws_size.value)

    def fprop(self, input):
        # print("\nConvolution::fprop: alpha=%f, beta=%f" % (self.alpha, self.beta))
        ws_data = ctypes.c_void_p(int(self.ws_ptr))
        self.start.record()
        libcudnn.cudnnConvolutionForward(context.cudnn, self.alpha,
                                         self.in_desc.ptr, input.get_gpu_voidp(),
                                         self.filt_desc, self.W.get_gpu_voidp(),
                                         self.conv_desc, self.algo,
                                         ws_data, self.ws_size.value,
                                         self.beta,
                                         self.out_desc.ptr, self.output.get_gpu_voidp())
        libcudnn.cudnnAddTensor(context.cudnn, 1.0, self.b_desc.ptr, self.bias.get_gpu_voidp(),
                                1.0, self.out_desc.ptr, self.output.get_gpu_voidp())
        self.check_truth()

    def __str__(self):
        return "%s, W=%s, b=%s" % (SlidingLayer.__str__(self), self.W.shape, self.bias.shape)
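# Sanity-check sketch for the output-size arithmetic used in configure() above:
# out = floor((in + 2*pad - kernel) / stride) + 1 for a zero-padded
# cross-correlation. The numbers below are illustrative only.
def conv_out_dim(in_dim, kernel, pad, stride):
    return (in_dim + 2 * pad - kernel) // stride + 1

assert conv_out_dim(224, 7, 3, 2) == 112   # e.g. a 7x7/stride-2 stem on a 224 input
assert conv_out_dim(4, 2, 0, 1) == 3       # matches the 4x4, 2x2-kernel toy case in the test scripts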
import math
import numpy as np
import pycuda.autoinit
import libcudnn
from gputensor import GPUTensor

xh = np.array([[[[1 + m * x for x in list(range(4))] for m in range(4)]]], dtype=np.float32)
print(xh)
print(xh.shape)

cudnn_context = libcudnn.cudnnCreate()
x = GPUTensor(xh)
x_desc = x.get_cudnn_tensor_desc()
print(x_desc)

kW = 2
kH = 2
dW = 1
dH = 1
padW = 0
padH = 0
in_width = x.shape[3]
in_height = x.shape[2]
out_width = int((math.floor(1.0 * in_width - kW + 2 * padW) / dW) + 1)
out_height = int((math.floor(1.0 * in_height - kH + 2 * padH) / dH) + 1)
print("Ot:", out_width, out_height)
import math
import numpy as np
import pycuda.autoinit
import libcudnn
from gputensor import GPUTensor

xh = np.array([[[[1, 2, 3], [2, 3, 4], [3, 4, 5]],
                [[1, 2, 3], [2, 3, 4], [3, 4, 5]],
                [[1, 2, 3], [2, 3, 4], [3, 4, 5]]]],
              dtype=np.float32)
print(xh.shape)
print(xh)

cudnn_context = libcudnn.cudnnCreate()
x = GPUTensor(xh)
x_desc = x.get_cudnn_tensor_desc()
print(x_desc)
print("X:\n", x.get())

b = GPUTensor(np.array([1, 2, 3], dtype=np.float32).reshape((1, 3, 1, 1)))
b_desc = b.get_cudnn_tensor_desc()
print(x.shape)
print(b.shape)
# print(b.get())

libcudnn.cudnnAddTensor(cudnn_context, 1.0, b_desc.ptr, b.get_gpu_voidp(),
                        1.0, x_desc.ptr, x.get_gpu_voidp())
print("X2:\n", x.get())
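# What the cudnnAddTensor call above does, expressed in NumPy: the (1, 3, 1, 1)
# bias is broadcast across the H and W axes of the (1, 3, 3, 3) tensor (alpha and
# beta are both 1.0 here, so it is a plain accumulate into x).
import numpy as np

x_ref = np.array([[[[1, 2, 3], [2, 3, 4], [3, 4, 5]]] * 3], dtype=np.float32)  # same (1, 3, 3, 3) input
b_ref = np.array([1, 2, 3], dtype=np.float32).reshape((1, 3, 1, 1))
x_ref += b_ref   # channel c gets b[c] added at every spatial position
print(x_ref)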
class Pooling(SlidingLayer):
    class Mode(enum.IntEnum):
        MAX = 1
        AVG = 2

    def __init__(self, mode, config, name="Pooling"):
        super().__init__(config, name)
        self.mode = mode
        assert(config["ceil_mode"] == False)
        self.alpha = 1.0
        self.beta = 0.0
        self.pool_desc = None
        self.in_desc = None
        self.out_desc = None

    def configure(self, input):
        in_images = input.shape[0]
        in_channels = input.shape[1]
        in_height = input.shape[2]
        in_width = input.shape[3]
        assert(in_width >= self.kW)
        assert(in_height >= self.kH)
        out_width = int((math.floor(1.0 * in_width - self.kW + 2 * self.padW) / self.dW) + 1)
        out_height = int((math.floor(1.0 * in_height - self.kH + 2 * self.padH) / self.dH) + 1)
        self.output = GPUTensor((in_images, in_channels, out_height, out_width), input.dtype)
        if self.pool_desc:
            libcudnn.cudnnDestroyPoolingDescriptor(self.pool_desc)
        if self.in_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.in_desc)
        if self.out_desc:
            libcudnn.cudnnDestroyTensorDescriptor(self.out_desc)
        self.in_desc = input.get_cudnn_tensor_desc()
        self.out_desc = self.output.get_cudnn_tensor_desc()
        self.pool_desc = libcudnn.cudnnCreatePoolingDescriptor()
        # NOTE: self.mode is not consulted here; the descriptor is always created
        # with CUDNN_POOLING_MAX.
        libcudnn.cudnnSetPooling2dDescriptor(self.pool_desc,
                                             libcudnn.cudnnPoolingMode["CUDNN_POOLING_MAX"],
                                             # libcudnn.cudnnNanPropagation["CUDNN_NOT_PROPAGATE_NAN"],
                                             self.kH, self.kW,
                                             self.padH, self.padW,
                                             self.dH, self.dW)

    def fprop(self, input):
        in_data = ctypes.c_void_p(int(input.gpudata))
        out_data = ctypes.c_void_p(int(self.output.gpudata))
        # print("Pooling::fprop()")
        # print("in_data:", input.ptr)
        # print("out_data:", self.output.ptr)
        libcudnn.cudnnPoolingForward(context.cudnn, self.pool_desc, self.alpha,
                                     self.in_desc.ptr, input.get_gpu_voidp(),
                                     self.beta,
                                     self.out_desc.ptr, self.output.get_gpu_voidp())
        self.check_truth()
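# Reference sketch of the pooling this layer configures: a deliberately naive
# NumPy max pool. It assumes padH == padW == 0 (padding is not modelled here)
# and floor-mode output sizes, matching the ceil_mode == False assertion above.
import numpy as np

def maxpool2d_reference(x, kH, kW, dH, dW):
    # x: (N, C, H, W); returns (N, C, outH, outW) with out = floor((in - k) / stride) + 1
    n, c, h, w = x.shape
    out_h = (h - kH) // dH + 1
    out_w = (w - kW) // dW + 1
    y = np.empty((n, c, out_h, out_w), dtype=x.dtype)
    for i in range(out_h):
        for j in range(out_w):
            y[:, :, i, j] = x[:, :, i * dH:i * dH + kH, j * dW:j * dW + kW].max(axis=(2, 3))
    return y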
num_errors = 0
num = datasrc.num_items() if args.num_images == 0 else args.num_images
# inputs = np.load("truth/input.npy")
# results = [["n01986214", "n04252225"],
#            ["n03938244", "n02840245"],
#            ["n01644900", "n01770393"],
#            ["n04019541", "n04019541"]]
for i in range(num):
    yt, data = datasrc.get_item()
    data = np.ascontiguousarray(np.expand_dims(np.rollaxis(data, 2), 0)).astype(model.dtype)
    data = model.normalize(data)
    # yt = results[i][0]
    # data = np.expand_dims(inputs[i], 0).astype(input_dtype)
    # print(data.shape, data.dtype)
    # print(data2.shape, data2.dtype)
    # print(np.allclose(data, data2))
    # continue
    # exit(0)
    input_tensor = GPUTensor(data)
    # print(data.shape)
    # model.configure(input_tensor)
    y = model.evaluate(input_tensor)
    print(y, yt)
    if y != yt:
        num_errors += 1

print("DONE: %d images classified, error rate=%.4f" % (num, 1.0 * num_errors / num))
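# Sketch of the preprocessing step above, assuming datasrc.get_item() returns an
# HWC image: np.rollaxis(data, 2) moves the channel axis to the front (HWC -> CHW)
# and expand_dims adds the batch axis. The transpose form below is equivalent.
import numpy as np

hwc = np.zeros((224, 224, 3), dtype=np.float32)               # illustrative size only
nchw = np.ascontiguousarray(hwc.transpose(2, 0, 1)[None, ...])
assert nchw.shape == (1, 3, 224, 224)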
import math
import numpy as np
import pycuda.autoinit
import libcudnn
from gputensor import GPUTensor

xh = np.array([[[[1 + m * x for x in list(range(4))] for m in range(4)]]], dtype=np.float16)
yh = xh + 0.5
print(xh)
print(yh)
print(xh.shape)

cudnn_context = libcudnn.cudnnCreate()
x = GPUTensor(xh)
y = GPUTensor(yh)
print(x.dtype, y.dtype)

x_desc = x.get_cudnn_tensor_desc()
y_desc = y.get_cudnn_tensor_desc()
print(x_desc)

libcudnn.cudnnAddTensor(cudnn_context, 1.0, x_desc.ptr, x.get_gpu_voidp(),
                        1.0, y_desc.ptr, y.get_gpu_voidp())
yh2 = y.get()
print(yh2)
print(y.dtype)
import numpy as np
import pycuda.autoinit
import libcudnn, ctypes
from gputensor import GPUTensor
from scipy.misc import logsumexp  # in newer SciPy this lives in scipy.special

xo = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32)
print(np.log(np.exp(xo) / np.sum(np.exp(xo))))

xn = xo.reshape((1, 8, 1, 1))
print("LOGSUMEXP", logsumexp(xn))
print(xn.shape)

x = GPUTensor(xn)
y = GPUTensor((1, 8, 1, 1), dtype=np.float32)
print(x.shape, x.dtype)
print(y.shape, y.dtype)

cudnn_context = libcudnn.cudnnCreate()
x_desc = x.get_cudnn_tensor_desc()
y_desc = y.get_cudnn_tensor_desc()
# print(libcudnn.cudnnGetTensor4dDescriptor(x_desc))
# exit(0)

algo = libcudnn.cudnnSoftmaxAlgorithm["CUDNN_SOFTMAX_LOG"]
mode = libcudnn.cudnnSoftmaxMode['CUDNN_SOFTMAX_MODE_CHANNEL']
# mode = libcudnn.cudnnSoftmaxMode['CUDNN_SOFTMAX_MODE_INSTANCE']
alpha = 1.0
beta = 0.0
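# The CUDNN_SOFTMAX_LOG result over the channel axis should match the NumPy
# log-softmax computed two ways below: the direct form printed at the top of the
# script, and the numerically stabler x - logsumexp(x) form.
import numpy as np

x_ref = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32)
log_softmax_direct = np.log(np.exp(x_ref) / np.sum(np.exp(x_ref)))
log_softmax_stable = x_ref - (np.max(x_ref) + np.log(np.sum(np.exp(x_ref - np.max(x_ref)))))
assert np.allclose(log_softmax_direct, log_softmax_stable, atol=1e-6)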