def configure(self, input):
    in_images = input.shape[0]
    in_channels = input.shape[1]
    in_height = input.shape[2]
    in_width = input.shape[3]

    assert(in_width >= self.kW)
    assert(in_height >= self.kH)

    out_width = int((math.floor(1.0 * in_width - self.kW + 2 * self.padW) / self.dW) + 1)
    out_height = int((math.floor(1.0 * in_height - self.kH + 2 * self.padH) / self.dH) + 1)

    self.output = GPUTensor((in_images, in_channels, out_height, out_width), input.dtype)

    if self.pool_desc:
        libcudnn.cudnnDestroyPoolingDescriptor(self.pool_desc)
    if self.in_desc:
        libcudnn.cudnnDestroyTensorDescriptor(self.in_desc)
    if self.out_desc:
        libcudnn.cudnnDestroyTensorDescriptor(self.out_desc)

    self.in_desc = input.get_cudnn_tensor_desc()
    self.out_desc = self.output.get_cudnn_tensor_desc()

    self.pool_desc = libcudnn.cudnnCreatePoolingDescriptor()
    libcudnn.cudnnSetPooling2dDescriptor(
        self.pool_desc,
        libcudnn.cudnnPoolingMode["CUDNN_POOLING_MAX"],
        # libcudnn.cudnnNanPropagation["CUDNN_NOT_PROPAGATE_NAN"],
        self.kH, self.kW,
        self.padH, self.padW,
        self.dH, self.dW)

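# A minimal fprop() sketch to accompany the pooling configure() above, assuming the
# descriptors it sets up and a cuDNN handle on context.cudnn. The get_gpu_voidp()
# data-pointer accessor on GPUTensor is a hypothetical helper used only for
# illustration, and the .ptr descriptor access follows the convolution layer's
# convention below; adjust both to whatever the tensor class actually exposes.
def fprop(self, input):
    alpha, beta = 1.0, 0.0
    libcudnn.cudnnPoolingForward(
        context.cudnn, self.pool_desc, alpha,
        self.in_desc.ptr, input.get_gpu_voidp(),        # hypothetical accessor
        beta,
        self.out_desc.ptr, self.output.get_gpu_voidp())
    return self.output
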
def configure(self, input):
    self.output = GPUTensor(input.shape, input.dtype)

    if self.in_desc:
        libcudnn.cudnnDestroyTensorDescriptor(self.in_desc.ptr)
    if self.out_desc:
        libcudnn.cudnnDestroyTensorDescriptor(self.out_desc.ptr)

    self.in_desc = input.get_cudnn_tensor_desc()
    self.out_desc = self.output.get_cudnn_tensor_desc()

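# If this element-wise layer is meant to use cuDNN's activation routine, a possible
# fprop() sketch is below. It assumes the v2-style wrapper signature
# cudnnActivationForward(handle, mode, alpha, srcDesc, srcData, beta, destDesc, destData);
# the ReLU mode and the get_gpu_voidp() accessor are assumptions for illustration only.
def fprop(self, input):
    alpha, beta = 1.0, 0.0
    libcudnn.cudnnActivationForward(
        context.cudnn,
        libcudnn.cudnnActivationMode["CUDNN_ACTIVATION_RELU"],  # assumed mode
        alpha, self.in_desc.ptr, input.get_gpu_voidp(),
        beta, self.out_desc.ptr, self.output.get_gpu_voidp())
    return self.output
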
def configure(self, input):
    # print("Convolution::configure: input shape =", input.shape)
    in_images = input.shape[0]
    in_channels = input.shape[1]
    in_height = input.shape[2]
    in_width = input.shape[3]

    assert(in_channels == self.num_filter_channels)

    out_width = int((1.0 * in_width + 2 * self.padW - self.kW) / self.dW + 1)
    out_height = int((1.0 * in_height + 2 * self.padH - self.kH) / self.dH + 1)

    self.output = GPUTensor((in_images, self.num_filter_maps, out_height, out_width),
                            input.dtype)
    # print("ONCV:", input.dtype, self.output.dtype)
    # print("Convolution::configure: output shape =", self.output.shape)

    # initialize cudnn descriptors
    if self.in_desc:
        libcudnn.cudnnDestroyTensorDescriptor(self.in_desc.ptr)
    if self.out_desc:
        libcudnn.cudnnDestroyTensorDescriptor(self.out_desc.ptr)

    self.in_desc = input.get_cudnn_tensor_desc()

    # Get output dimensions (first two values are n_input and filters_out)
    _, _, out_height2, out_width2 = libcudnn.cudnnGetConvolution2dForwardOutputDim(
        self.conv_desc, self.in_desc.ptr, self.filt_desc)
    assert(out_width == out_width2)
    assert(out_height == out_height2)

    self.out_desc = self.output.get_cudnn_tensor_desc()

    # find best convolution algorithm
    self.algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
        context.cudnn, self.in_desc.ptr, self.filt_desc, self.conv_desc,
        self.out_desc.ptr, self.convolution_fwd_pref, 0)
    print("Convolution::configure: algo=%s" % str(self.algo.value))

    self.ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
        context.cudnn, self.in_desc.ptr, self.filt_desc, self.conv_desc,
        self.out_desc.ptr, self.algo)
    self.ws_ptr = drv.mem_alloc(self.ws_size.value) if self.ws_size.value > 0 else 0
    print("Convolution::configure: workspace size=%d" % self.ws_size.value)

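# A minimal fprop() sketch using the descriptors, algorithm and workspace prepared in
# configure() above. The weight tensor self.weights and the get_gpu_voidp() data-pointer
# accessor are assumptions for illustration; they are not shown in the snippets here.
def fprop(self, input):
    alpha, beta = 1.0, 0.0
    ws_data = ctypes.c_void_p(int(self.ws_ptr)) if self.ws_size.value > 0 else 0
    libcudnn.cudnnConvolutionForward(
        context.cudnn, alpha,
        self.in_desc.ptr, input.get_gpu_voidp(),        # hypothetical accessor
        self.filt_desc, self.weights.get_gpu_voidp(),   # hypothetical weight tensor
        self.conv_desc, self.algo,
        ws_data, self.ws_size.value,
        beta, self.out_desc.ptr, self.output.get_gpu_voidp())
    return self.output
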
def benchmark_conv(kw, kh, bsz):
    # Assumes module-level cudnn_context, tensor_format, data_type, convolution_mode
    # and convolution_fwd_pref have been initialized elsewhere.
    start, end = (drv.Event(), drv.Event())

    def start_bench():
        start.record()

    def end_bench():
        end.record()
        end.synchronize()
        return end.time_since(start)

    n_input = bsz
    filters_in = 3
    filters_out = 64
    height_in = 224
    width_in = 224
    height_filter = kh
    width_filter = kw
    pad_h = 3
    pad_w = 3
    vertical_stride = 1
    horizontal_stride = 1
    upscalex = 1
    upscaley = 1
    alpha = 1.0
    beta = 1.0

    # Input tensor
    X = gpuarray.to_gpu(np.random.rand(n_input, filters_in, height_in, width_in)
                        .astype(np.float32))
    # Filter tensor
    filters = gpuarray.to_gpu(np.random.rand(filters_out, filters_in, height_filter,
                                             width_filter).astype(np.float32))

    # Descriptor for input
    X_desc = libcudnn.cudnnCreateTensorDescriptor()
    libcudnn.cudnnSetTensor4dDescriptor(X_desc, tensor_format, data_type,
                                        n_input, filters_in, height_in, width_in)

    # Filter descriptor
    filters_desc = libcudnn.cudnnCreateFilterDescriptor()
    libcudnn.cudnnSetFilter4dDescriptor(filters_desc, data_type, filters_out,
                                        filters_in, height_filter, width_filter)

    # Convolution descriptor
    conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
    libcudnn.cudnnSetConvolution2dDescriptor(conv_desc, pad_h, pad_w,
                                             vertical_stride, horizontal_stride,
                                             upscalex, upscaley, convolution_mode)

    # Get output dimensions (first two values are n_input and filters_out)
    _, _, height_output, width_output = libcudnn.cudnnGetConvolution2dForwardOutputDim(
        conv_desc, X_desc, filters_desc)

    # Output tensor
    Y = gpuarray.empty((n_input, filters_out, height_output, width_output), np.float32)
    Y_desc = libcudnn.cudnnCreateTensorDescriptor()
    libcudnn.cudnnSetTensor4dDescriptor(Y_desc, tensor_format, data_type, n_input,
                                        filters_out, height_output, width_output)

    # Get pointers to GPU memory
    X_data = ctypes.c_void_p(int(X.gpudata))
    filters_data = ctypes.c_void_p(int(filters.gpudata))
    Y_data = ctypes.c_void_p(int(Y.gpudata))

    # Perform convolution
    algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(cudnn_context, X_desc,
                                                        filters_desc, conv_desc, Y_desc,
                                                        convolution_fwd_pref, 0)
    # print("Cudnn algorithm = %d" % algo.value)

    ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(cudnn_context, X_desc,
                                                               filters_desc, conv_desc,
                                                               Y_desc, algo)
    ws_ptr = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0
    ws_data = ctypes.c_void_p(int(ws_ptr))

    # warm-up run before timing
    libcudnn.cudnnConvolutionForward(cudnn_context, alpha, X_desc, X_data,
                                     filters_desc, filters_data, conv_desc, algo,
                                     ws_data, ws_size.value, beta, Y_desc, Y_data)

    start_bench()
    for i in range(10):
        libcudnn.cudnnConvolutionForward(cudnn_context, alpha, X_desc, X_data,
                                         filters_desc, filters_data, conv_desc, algo,
                                         ws_data, ws_size.value, beta, Y_desc, Y_data)
    ms = end_bench()

    ws_ptr = None
    libcudnn.cudnnDestroyTensorDescriptor(X_desc)
    libcudnn.cudnnDestroyTensorDescriptor(Y_desc)
    libcudnn.cudnnDestroyFilterDescriptor(filters_desc)
    libcudnn.cudnnDestroyConvolutionDescriptor(conv_desc)

    return ms / 10

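# A small driver sketch for benchmark_conv() above. The context setup uses the enum
# dictionaries exposed by the cudnn-python-wrappers these snippets appear to rely on;
# the swept kernel and batch sizes are illustrative choices, not taken from the
# original code.
if __name__ == "__main__":
    cudnn_context = libcudnn.cudnnCreate()
    tensor_format = libcudnn.cudnnTensorFormat["CUDNN_TENSOR_NCHW"]
    data_type = libcudnn.cudnnDataType["CUDNN_DATA_FLOAT"]
    convolution_mode = libcudnn.cudnnConvolutionMode["CUDNN_CROSS_CORRELATION"]
    convolution_fwd_pref = libcudnn.cudnnConvolutionFwdPreference[
        "CUDNN_CONVOLUTION_FWD_PREFER_FASTEST"]

    for bsz in (32, 64, 128):
        for k in (3, 5, 7):
            ms = benchmark_conv(k, k, bsz)
            print("conv %dx%d, batch %d: %.3f ms/iteration" % (k, k, bsz, ms))

    libcudnn.cudnnDestroy(cudnn_context)
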
X_data = ctypes.c_void_p(int(X.gpudata))
filters_data = ctypes.c_void_p(int(filters.gpudata))
Y_data = ctypes.c_void_p(int(Y.gpudata))

# Perform convolution
algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(cudnn_context, X_desc, filters_desc,
                                                    conv_desc, Y_desc,
                                                    convolution_fwd_pref, 0)
print("Cudnn algorithm = %d" % algo.value)

ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(cudnn_context, X_desc,
                                                           filters_desc, conv_desc,
                                                           Y_desc, algo)
ws_ptr = drv.mem_alloc(ws_size.value) if ws_size.value > 0 else 0
ws_data = ctypes.c_void_p(int(ws_ptr))

start_bench()
libcudnn.cudnnConvolutionForward(cudnn_context, alpha, X_desc, X_data, filters_desc,
                                 filters_data, conv_desc, algo, ws_data, ws_size.value,
                                 beta, Y_desc, Y_data)
end_bench("fprop")
ws_ptr = None

# Clean up
libcudnn.cudnnDestroyTensorDescriptor(X_desc)
libcudnn.cudnnDestroyTensorDescriptor(Y_desc)
libcudnn.cudnnDestroyFilterDescriptor(filters_desc)
libcudnn.cudnnDestroyConvolutionDescriptor(conv_desc)
libcudnn.cudnnDestroy(cudnn_context)

maxB = parB[0:1, 0:1]
maxU = parU[0:1, 0:1]

maxo = ng.max(abs(cuO - nlO.T), partial=parO, out=maxO).get()[0, 0]
maxb = ng.max(abs(cuB - nlB.T), partial=parB, out=maxB).get()[0, 0]
maxu = ng.max(abs(cuU - nlU.T), partial=parU, out=maxU).get()[0, 0]

meano = ng.mean(abs(cuO), partial=parO, out=maxO).get()[0, 0]
meanb = ng.mean(abs(cuB), partial=parB, out=maxB).get()[0, 0]
meanu = ng.mean(abs(cuU), partial=parU, out=maxU).get()[0, 0]

print " maxerr mean pct"
print "fprop: %7.5f %6.2f %5.3f" % (maxo, meano, 100 * maxo / meano)
print "bprop: %7.5f %6.2f %5.3f" % (maxb, meanb, 100 * maxb / meanb)
print "updat: %7.5f %6.2f %5.3f" % (maxu, meanu, 100 * maxu / meanu)

# free up memory from this layer before proceeding
cuB = cuU = cuO = None
nlB = nlU = nlO = None
parO = parB = parU = maxO = maxB = maxU = None

libcudnn.cudnnDestroyTensorDescriptor(I_desc)
libcudnn.cudnnDestroyTensorDescriptor(O_desc)
libcudnn.cudnnDestroyFilterDescriptor(F_desc)
libcudnn.cudnnDestroyTensorDescriptor(E_desc)
libcudnn.cudnnDestroyTensorDescriptor(B_desc)
libcudnn.cudnnDestroyFilterDescriptor(U_desc)
libcudnn.cudnnDestroyConvolutionDescriptor(C_desc)
libcudnn.cudnnDestroy(cudnn)