def get_default_handle():
    """Return the cuDNN handle cached for the current CUDA device.

    Handles are kept in the module-level ``_handles`` mapping, keyed by the
    object returned from ``cuda.Context.get_device()``. A handle is created
    with ``libcudnn.cudnnCreate()`` the first time a device is seen and is
    reused on every later call for that device.

    The cache is tied to the process id stored in ``_pid``: whenever the
    current pid differs from the stored one, the cache is replaced by an
    empty dict, ``shutdown`` is registered with ``atexit`` and the new pid
    is recorded.

    Returns:
        The cuDNN handle associated with the current device.
    """
    global _handles, _pid

    current_pid = os.getpid()
    if current_pid != _pid:
        # Nothing has been set up for this process yet (first call, or
        # presumably a forked child -- confirm): start from an empty cache
        # and arrange for `shutdown` to run at interpreter exit.
        _handles = {}
        atexit.register(shutdown)
        _pid = current_pid

    device = cuda.Context.get_device()
    if device not in _handles:
        # Cache miss: create the handle once and remember it for this device.
        _handles[device] = libcudnn.cudnnCreate()
    return _handles[device]
import math
import numpy as np
import pycuda.autoinit
import libcudnn
from gputensor import GPUTensor

# Element types: half precision for the data tensors, single precision for
# the per-channel parameter tensors.
dt = np.float16
pdt = np.float32

# Host-side input: a 1x1x4x4 array filled with 2.0.
xh = np.full((1, 1, 4, 4), 2.0, dtype=dt)

cudnn_context = libcudnn.cudnnCreate()
print("CUDNN Version: %d" % libcudnn.cudnnGetVersion())

# `x` is built from the host array; `y` is built from a shape and dtype only
# (presumably an uninitialised output buffer -- confirm against GPUTensor).
x = GPUTensor(xh)
y = GPUTensor(xh.shape, dtype=dt)

# Single-element 1x1x1x1 parameter tensors. The names suggest scale, bias,
# mean and variance of a normalisation layer -- confirm against the code
# that consumes them.
w = GPUTensor(np.ones((1, 1, 1, 1)), dtype=pdt)
bias = GPUTensor(np.zeros((1, 1, 1, 1)), dtype=pdt)
mean = GPUTensor(np.ones((1, 1, 1, 1)), dtype=pdt)
var = GPUTensor(np.full((1, 1, 1, 1), 0.5), dtype=pdt)

# cuDNN tensor descriptors for the input and output, printed for inspection.
x_desc = x.get_cudnn_tensor_desc()
y_desc = y.get_cudnn_tensor_desc()
print(x_desc)
print(y_desc)

# One descriptor is taken from `var` for the parameter tensors.
param_desc = var.get_cudnn_tensor_desc()
# CUDA events that bracket the region being timed.
start, end = (drv.Event(), drv.Event())


def start_bench():
    """Record the ``start`` event, marking the beginning of a timed region."""
    start.record()


def end_bench(op):
    """Record the ``end`` event, wait for it, and print the timing for ``op``.

    The elapsed time between ``start`` and ``end`` is divided by the
    module-level ``repeat`` and combined with ``conv.flops`` to report a
    throughput figure. ``repeat`` and ``conv`` are defined outside this
    block.

    Args:
        op: Label for the benchmarked operation; only used in the printed
            line.
    """
    end.record()
    end.synchronize()
    # Average time per repetition (pycuda reports event deltas in
    # milliseconds).
    msecs = end.time_since(start) / repeat
    # flops / (ms * 1e6) == flops / (s * 1e9), i.e. GFLOP/s.
    gflops = conv.flops / (msecs * 1000000.0)
    # A single parenthesised argument prints identically on Python 2 and 3,
    # unlike the Python 2-only print statement.
    print("%7.3f msecs %8.3f gflops (%s: %s)" % (msecs, gflops, op, conv))


ng = NervanaGPU(stochastic_round=False, bench=True)

# Create a cuDNN context
cudnn = libcudnn.cudnnCreate()

# One convolution descriptor, four tensor descriptors and two filter
# descriptors. The single-letter prefixes presumably name the benchmark's
# operands (input, output, error, ...) -- confirm where they are configured.
C_desc = libcudnn.cudnnCreateConvolutionDescriptor()
I_desc = libcudnn.cudnnCreateTensorDescriptor()
O_desc = libcudnn.cudnnCreateTensorDescriptor()
E_desc = libcudnn.cudnnCreateTensorDescriptor()
B_desc = libcudnn.cudnnCreateTensorDescriptor()
F_desc = libcudnn.cudnnCreateFilterDescriptor()
U_desc = libcudnn.cudnnCreateFilterDescriptor()

# Set some options and tensor dimensions
NCHW_fmt = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
cu_dtype = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
conv_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
# Forward-algorithm preference: CUDNN_CONVOLUTION_FWD_NO_WORKSPACE
fwd_pref = libcudnn.cudnnConvolutionFwdPreference['CUDNN_CONVOLUTION_FWD_NO_WORKSPACE']
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
import libcudnn
import ctypes
import numpy as np

# Create a cuDNN context
cudnn_context = libcudnn.cudnnCreate()

# Enum values selected from the wrapper's lookup tables: NCHW layout,
# 32-bit float data, cross-correlation mode, and the "prefer fastest"
# forward-algorithm preference.
tensor_format = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
data_type = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
convolution_fwd_pref = libcudnn.cudnnConvolutionFwdPreference['CUDNN_CONVOLUTION_FWD_PREFER_FASTEST']

# CUDA events that bracket the region being timed.
start = drv.Event()
end = drv.Event()


def start_bench():
    """Record the ``start`` event, marking the beginning of a timed region."""
    start.record()


def end_bench(op):
    """Record the ``end`` event, wait for it, and print the elapsed time.

    Args:
        op: Accepted but not used by this function.
    """
    end.record()
    end.synchronize()
    elapsed = end.time_since(start)
    print("%7.3f msecs" % elapsed)


# Problem dimensions (presumably batch size, input/output channel counts and
# spatial size of the input -- confirm where they are consumed).
n_input = 64
filters_in = 128
filters_out = 128
height_in = 112
width_in = 112
def start_bench():
    """Record the module-level ``start`` event to open a timed region."""
    start.record()


def end_bench(op):
    """Close the timed region and print its timing and throughput.

    Records and synchronises on the module-level ``end`` event, then reports
    the time elapsed since ``start`` divided by ``repeat``, together with a
    throughput figure derived from ``conv.flops``. ``start``, ``end``,
    ``repeat`` and ``conv`` are all defined outside this block.

    Args:
        op: Label for the benchmarked operation; it only appears in the
            printed line.
    """
    end.record()
    end.synchronize()
    # Average time per repetition (pycuda reports event deltas in
    # milliseconds).
    msecs = end.time_since(start) / repeat
    # flops / (ms * 1e6) == flops / (s * 1e9), i.e. GFLOP/s.
    gflops = conv.flops / (msecs * 1000000.0)
    # Function-call form with one argument: same output on Python 2, and no
    # longer a SyntaxError on Python 3.
    print("%7.3f msecs %8.3f gflops (%s: %s)" % (msecs, gflops, op, conv))


ng = NervanaGPU(stochastic_round=False, bench=True)

# Create a cuDNN context
cudnn = libcudnn.cudnnCreate()

# Descriptors used by the benchmark: one convolution descriptor, four tensor
# descriptors and two filter descriptors. What each letter stands for is not
# visible here -- confirm where the descriptors are configured.
C_desc = libcudnn.cudnnCreateConvolutionDescriptor()
I_desc = libcudnn.cudnnCreateTensorDescriptor()
O_desc = libcudnn.cudnnCreateTensorDescriptor()
E_desc = libcudnn.cudnnCreateTensorDescriptor()
B_desc = libcudnn.cudnnCreateTensorDescriptor()
F_desc = libcudnn.cudnnCreateFilterDescriptor()
U_desc = libcudnn.cudnnCreateFilterDescriptor()

# Set some options and tensor dimensions
NCHW_fmt = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
cu_dtype = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
conv_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
fwd_pref = libcudnn.cudnnConvolutionFwdPreference[
    'CUDNN_CONVOLUTION_FWD_NO_WORKSPACE']
import pycuda.driver as drv
from pycuda import gpuarray
import libcudnn
import ctypes
import numpy as np

# Problem sizes for the RNN being described.
inputsize = 100
hiddensize = 200
seqlength = 50
minibatch = 8
numlayers = 2

# Raw integer enum values passed straight to cuDNN. NOTE(review): 0 is
# presumably the first member of each corresponding cuDNN enum -- confirm
# against the cuDNN headers before relying on a specific meaning.
inputmode = 0
direction = 0
mode = 0
datatype = 0

handle = libcudnn.cudnnCreate()
rnndesc = libcudnn.cudnnCreateRNNDescriptor()
dropoutdesc = libcudnn.cudnnCreateDropoutDescriptor()

# The setter lives in the libcudnn module like every other call here; the
# bare name is not imported and would raise NameError.
libcudnn.cudnnSetDropoutDescriptor(dropoutdesc, handle, 0, 0, 0, 0)
libcudnn.cudnnSetRNNDescriptor(rnndesc, hiddensize, seqlength, numlayers,
                               dropoutdesc, inputmode, direction, mode,
                               datatype)

# One input tensor descriptor per time step. `range` works on both Python 2
# and 3, and a plain loop is used because the setter is called only for its
# side effect.
xdescs = [libcudnn.cudnnCreateTensorDescriptor() for _ in range(seqlength)]
for xdesc in xdescs:
    libcudnn.cudnnSetTensorNdDescriptor(xdesc, 0, 3,
                                        [inputsize, minibatch, seqlength])

# Descriptor for the initial hidden state.
hxdesc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensorNdDescriptor(hxdesc, 0, 3,
                                    [hiddensize, minibatch, numlayers])

# Descriptor for the initial cell state (presumably configured further on).
cxdesc = libcudnn.cudnnCreateTensorDescriptor()