def __call__(self, input):
    if not self.jit or not isinstance(input, torch.cuda.FloatTensor):
        norm = input.norm(2, input.dim() - 1)
        return torch.cat([norm, norm.new(norm.size()).zero_()], input.dim() - 1)
    out = input.new(input.size())
    input = input.contiguous()
    if not iscomplex(input):
        raise TypeError('The input and outputs should be complex')
    if self.modulus_cache[input.get_device()] is None:
        kernel = b"""
        extern "C"
        __global__ void abs_complex_value(const float * x, float2 * z, int n)
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i >= n)
                return;
            z[i] = make_float2(normf(2, x + 2*i), 0);
        }
        """
        prog = Program(kernel, b'modulus.cu')
        ptx = prog.compile(['-arch=' + get_compute_arch(input)])
        module = Module()
        module.load(bytes(ptx.encode()))
        self.modulus_cache[input.get_device()] = module
    fabs = self.modulus_cache[input.get_device()].get_function('abs_complex_value')
    fabs(grid=(self.GET_BLOCKS(int(out.nelement()) // 2), 1, 1),
         block=(self.CUDA_NUM_THREADS, 1, 1),
         args=[input.data_ptr(), out.data_ptr(), out.numel() // 2],
         stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return out
def compile_with_nvrtc(cuda_text):
    c = Program(cuda_text)
    device = deviceinfo()
    cp = str(device.cc_major.value * 10 + device.cc_minor.value)
    ptx = c.compile(['-arch=compute_' + cp])
    with open(PTX_PATH, 'w+') as f:
        f.write(ptx)
class SingleDeviceKernel():
    def __init__(self, kernel_name, device_id, func_name=None):
        self.func_name = func_name or kernel_name
        self.name = kernel_name + ".cu"
        with open(path.join(kernel_dir, self.name), 'r') as cu_f:
            self.kernel_source = cu_f.read().encode()
        self.prog = Program(self.kernel_source, self.name.encode())
        ptx = self.prog.compile([self.get_compute_arch_arg(device_id)])
        self.module = Module()
        self.module.load(ptx.encode())

    def get_compute_arch_arg(self, device_id):
        return "-arch=compute_{0}".format(
            device.Device(device_id).compute_capability).encode()

    def prep_args(self, kwargs):
        # Tensors are passed by device pointer; everything else is passed as-is.
        args = []
        for k, v in kwargs.items():
            try:
                args.append(v.data_ptr())
            except:
                args.append(v)
        return args

    def linear_launch(self, num_threads, **kwargs):
        kernel_func = self.module.get_function(self.func_name)
        kernel_func.linear_launch(
            num_threads,
            args=self.prep_args(kwargs),
            stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
def __call__(self, points, indices):
    size = points.size()
    if size in self.cached_functions:
        func = self.cached_functions[size]
    else:
        with open('utils/cuda/distance.cu', 'r') as f:
            kernel = f.read()
        kernel = Template(kernel).substitute(BATCH_SIZE=size[0], NUM_ROWS=size[1])
        program = Program(kernel, 'distance.cu')
        ptx = program.compile()
        m = function.Module()
        m.load(bytes(ptx.encode()))
        func = m.get_function('distance')
        self.cached_functions[size] = func
    indices = indices.contiguous()
    col_ind = indices.new(indices.size(1))
    col_ptr = indices.new(size[0], size[1] + 1)
    col_ptr.fill_(0)
    grid = ((indices.size(1) + 1024 - 1) // 1024, 1, 1)
    block = (1024, 1, 1)
    func(grid=grid, block=block,
         args=[indices.data_ptr(), col_ind.data_ptr(), col_ptr.data_ptr(),
               indices.size(1)],
         stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return col_ind, col_ptr
def compile(self):
    if self.ptx is None:
        from pynvrtc.compiler import Program
        program = Program(kernel, 'recurrent_forget_mult.cu')
        GPUForgetMult.ptx = program.compile()
    if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
        from cupy.cuda import function
        m = function.Module()
        m.load(bytes(self.ptx.encode()))
        # Forward ForgetMult
        self.forget_mult = m.get_function('recurrent_forget_mult')
        self.forget_mult_bwd = m.get_function('recurrent_forget_mult_bwd')
        # Backward ForgetMult
        self.bwd_forget_mult = m.get_function('recurrent_bwd_forget_mult')
        self.bwd_forget_mult_bwd = m.get_function('recurrent_bwd_forget_mult_bwd')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
            self.forget_mult, self.forget_mult_bwd,
            self.bwd_forget_mult, self.bwd_forget_mult_bwd, self.stream)
    (self.forget_mult, self.forget_mult_bwd, self.bwd_forget_mult,
     self.bwd_forget_mult_bwd, self.stream) = GPUForgetMult.configured_gpus[
        torch.cuda.current_device()]
def compile_kernel(kernel, filename, functioname):
    program = Program(bytes(kernel, 'ascii'), filename)
    ptx = program.compile()
    m = function.Module()
    m.load(bytes(ptx.encode()))
    f = m.get_function(functioname)
    return f
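# A minimal usage sketch for the compile_kernel helper above, assuming pynvrtc,
# cupy, and a CUDA-enabled PyTorch build are available. The scale_inplace kernel,
# the 256-thread block size, and the tensor size are illustrative choices, not
# part of the original code.
from collections import namedtuple
import numpy as np
import torch

Stream = namedtuple('Stream', ['ptr'])

# Illustrative elementwise kernel: x[i] *= a for i < n.
scale_src = r'''
extern "C" __global__
void scale_inplace(float *x, float a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= a;
}
'''

x = torch.randn(1 << 20, device='cuda')
scale = compile_kernel(scale_src, 'scale_inplace.cu', 'scale_inplace')
n = x.numel()
scale(grid=((n + 255) // 256, 1, 1), block=(256, 1, 1),
      args=[x.data_ptr(), np.float32(2.0), np.int32(n)],
      stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))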
def _initial_cupy(name="indices_slice"):
    program = Program(kernel.encode(), (name + '.cu').encode())
    ptx = program.compile()
    m = function.Module()
    m.load(bytes(ptx.encode()))
    kernel_function = m.get_function(name)
    Stream = namedtuple('Stream', ['ptr'])
    s = Stream(ptr=torch.cuda.current_stream().cuda_stream)
    return kernel_function, s
def backward(ctx, grad_h):
    if ctx.ptx is None:
        program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
        GPUForgetMult.ptx = program.compile()
    if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
        m = function.Module()
        m.load(bytes(ctx.ptx.encode()))
        ctx.forget_mult = m.get_function('recurrent_forget_mult')
        ctx.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')
        Stream = namedtuple('Stream', ['ptr'])
        ctx.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
            ctx.forget_mult, ctx.bwd_forget_mult, ctx.stream)
    ctx.forget_mult, ctx.bwd_forget_mult, ctx.stream = GPUForgetMult.configured_gpus[
        torch.cuda.current_device()]

    f, x, hidden_init = ctx.saved_tensors
    h = ctx.result
    seq_size, batch_size, hidden_size = f.size()
    # Zeroing is not necessary as these will be overwritten
    grad_f = f.new(*f.size())
    grad_x = f.new(*f.size())
    grad_h_init = f.new(batch_size, hidden_size)

    grid_hidden_size = min(hidden_size, 512)
    grid = (math.ceil(hidden_size / grid_hidden_size), batch_size)
    ctx.bwd_forget_mult(grid=grid, block=(grid_hidden_size, 1),
                        args=[h.data_ptr(), f.data_ptr(), x.data_ptr(),
                              grad_h.data_ptr(), grad_f.data_ptr(),
                              grad_x.data_ptr(), grad_h_init.data_ptr(),
                              seq_size, batch_size, hidden_size],
                        stream=ctx.stream)
    if hidden_init is not None:
        return grad_f, grad_x, grad_h_init
    return grad_f, grad_x
def compile():
    program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
    ptx = program.compile()
    m = function.Module()
    m.load(bytes(ptx.encode()))
    GPUForgetMult.forget_mult = m.get_function('recurrent_forget_mult')
    GPUForgetMult.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')
    Stream = namedtuple('Stream', ['ptr'])
    GPUForgetMult.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
def compile(self):
    # Create program
    program = Program(self.kernel, self.title)

    # Compile program
    arch = "-arch={0}".format(cupyKernel.get_compute_arch())
    ptx = program.compile([arch])

    # Load Program
    m = function.Module()
    m.load(bytes(ptx.encode()))

    # Get Function Pointer
    self.func = m.get_function(self.func_name)
    self.compiled = True
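# Once compile() has run, the cached self.func can be launched like any other
# cupy kernel function. A sketch of such a launch; the 1-D grid/block shapes and
# the helper name launch_1d are illustrative assumptions, not part of the class.
from collections import namedtuple
import torch

Stream = namedtuple('Stream', ['ptr'])

def launch_1d(func, n, args):
    # One thread per element, 256 threads per block (illustrative choice).
    func(grid=((n + 255) // 256, 1, 1),
         block=(256, 1, 1),
         args=args,
         stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))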
def smooth_local_affine(output_cpu, input_cpu, epsilon, patch, h, w, f_r, f_e):
    # program = Program(src.encode('utf-8'), 'best_local_affine_kernel.cu'.encode('utf-8'))
    # ptx = program.compile(['-I/usr/local/cuda/include'.encode('utf-8')])
    program = Program(src, 'best_local_affine_kernel.cu')
    ptx = program.compile(['-I/home/jsy/software/cuda/cuda-10.1/include'])
    # ptx = program.compile(['-I/usr/local/cuda/include'])
    m = function.Module()
    m.load(bytes(ptx.encode()))

    _reconstruction_best_kernel = m.get_function('reconstruction_best_kernel')
    _bilateral_smooth_kernel = m.get_function('bilateral_smooth_kernel')
    _best_local_affine_kernel = m.get_function('best_local_affine_kernel')

    Stream = namedtuple('Stream', ['ptr'])
    s = Stream(ptr=torch.cuda.current_stream().cuda_stream)

    filter_radius = f_r
    sigma1 = filter_radius / 3
    sigma2 = f_e
    radius = (patch - 1) / 2

    filtered_best_output = torch.zeros(np.shape(input_cpu)).cuda()
    affine_model = torch.zeros((h * w, 12)).cuda()
    filtered_affine_model = torch.zeros((h * w, 12)).cuda()

    input_ = torch.from_numpy(input_cpu).cuda()
    output_ = torch.from_numpy(output_cpu).cuda()
    _best_local_affine_kernel(
        grid=(int((h * w) / 256 + 1), 1), block=(256, 1, 1),
        args=[output_.data_ptr(), input_.data_ptr(), affine_model.data_ptr(),
              np.int32(h), np.int32(w), np.float32(epsilon), np.int32(radius)],
        stream=s)
    _bilateral_smooth_kernel(
        grid=(int((h * w) / 256 + 1), 1), block=(256, 1, 1),
        args=[affine_model.data_ptr(), filtered_affine_model.data_ptr(),
              input_.data_ptr(), np.int32(h), np.int32(w), np.int32(f_r),
              np.float32(sigma1), np.float32(sigma2)],
        stream=s)
    _reconstruction_best_kernel(
        grid=(int((h * w) / 256 + 1), 1), block=(256, 1, 1),
        args=[input_.data_ptr(), filtered_affine_model.data_ptr(),
              filtered_best_output.data_ptr(), np.int32(h), np.int32(w)],
        stream=s)
    numpy_filtered_best_output = filtered_best_output.cpu().numpy()
    return numpy_filtered_best_output
def forward(ctx, f, x, hidden_init=None):
    ctx.ptx = GPUForgetMult.ptx
    ctx.configured_gpus = GPUForgetMult.configured_gpus
    # the self.compile function
    if ctx.ptx is None:
        program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
        GPUForgetMult.ptx = program.compile()
        ctx.ptx = GPUForgetMult.ptx
    if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
        m = function.Module()
        m.load(bytes(ctx.ptx.encode()))
        ctx.forget_mult = m.get_function('recurrent_forget_mult')
        ctx.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')
        Stream = namedtuple('Stream', ['ptr'])
        ctx.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
            ctx.forget_mult, ctx.bwd_forget_mult, ctx.stream)
    ctx.forget_mult, ctx.bwd_forget_mult, ctx.stream = GPUForgetMult.configured_gpus[
        torch.cuda.current_device()]

    seq_size, batch_size, hidden_size = f.size()
    result = f.new(seq_size + 1, batch_size, hidden_size)
    # We only zero the result array (result[0]) if we don't set a hidden initial state
    # All other values (result[1:]) are overwritten by default
    if hidden_init is not None:
        result[0, :, :] = hidden_init
    else:
        result = result.zero_()

    grid_hidden_size = min(hidden_size, 512)
    grid = (math.ceil(hidden_size / grid_hidden_size), batch_size)
    ctx.forget_mult(grid=grid, block=(grid_hidden_size, 1),
                    args=[result.data_ptr(), f.data_ptr(), x.data_ptr(),
                          seq_size, batch_size, hidden_size],
                    stream=ctx.stream)
    ctx.save_for_backward(f, x, hidden_init)
    ctx.result = result
    return result[1:, :, :]
def get_kernel_func(kname, ksrc, dtype):
    if kname + dtype not in modules:
        ksrc = ksrc.replace('DTYPE', dtype)
        prog = Program(ksrc, kname + dtype + '.cu')
        ptx = prog.compile()
        log = prog._interface.nvrtcGetProgramLog(prog._program)
        if len(log.strip()) > 0:
            print(log)
        module = cupy.cuda.function.Module()
        module.load(bytes(ptx.encode()))
        modules[kname + dtype] = module
    else:
        module = modules[kname + dtype]
    Stream = namedtuple('Stream', ['ptr'])
    s = Stream(ptr=torch.cuda.current_stream().cuda_stream)
    return module.get_function(kname), s
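# A sketch of how a caller might use get_kernel_func, assuming the module-level
# `modules` cache and imports exist. The DTYPE-templated axpy source and the
# tensor shapes are illustrative, not part of the original helper.
import numpy as np
import torch

axpy_src = r'''
extern "C" __global__
void axpy(DTYPE *y, const DTYPE *x, DTYPE a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        y[i] += a * x[i];
}
'''

x = torch.randn(4096, device='cuda')
y = torch.zeros_like(x)
func, stream = get_kernel_func('axpy', axpy_src, 'float')
n = x.numel()
func(grid=((n + 255) // 256, 1, 1), block=(256, 1, 1),
     args=[y.data_ptr(), x.data_ptr(), np.float32(2.0), np.int32(n)],
     stream=stream)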
def compile(self):
    if self.ptx is None:
        program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
        GPUReverseForgetMult.ptx = program.compile()
    if torch.cuda.current_device() not in GPUReverseForgetMult.configured_gpus:
        m = function.Module()
        m.load(bytes(self.ptx.encode()))
        self.forget_mult = m.get_function('recurrent_reverse_forget_mult')
        self.bwd_forget_mult = m.get_function('bwd_recurrent_reverse_forget_mult')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUReverseForgetMult.configured_gpus[torch.cuda.current_device()] = (
            self.forget_mult, self.bwd_forget_mult, self.stream)
    self.forget_mult, self.bwd_forget_mult, self.stream = GPUReverseForgetMult.configured_gpus[
        torch.cuda.current_device()]
def compile(self):
    if self.ptx is None:
        program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
        GPUForgetMult.ptx = program.compile()
    if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
        m = function.Module()
        m.load(bytes(self.ptx.encode()))
        self.forget_mult = m.get_function('recurrent_forget_mult')
        self.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
            self.forget_mult, self.bwd_forget_mult, self.stream)
    self.forget_mult, self.bwd_forget_mult, self.stream = GPUForgetMult.configured_gpus[
        torch.cuda.current_device()]
def compile(cls):
    """Compiles forward and backward GPU kernels for uni- and bi-directional SRU.

    Assumes there is only one GPU.
    """
    if cls._STREAM is not None:
        return
    prog = Program(SRU_CODE.encode(), 'sru_prog.cu'.encode())
    ptx = prog.compile()
    mod = function.Module()
    mod.load(bytes(ptx.encode()))
    cls._FWD_FUNC = mod.get_function('sru_fwd')
    cls._BWD_FUNC = mod.get_function('sru_bwd')
    cls._BiFWD_FUNC = mod.get_function('sru_bi_fwd')
    cls._BiBWD_FUNC = mod.get_function('sru_bi_bwd')
    Stream = namedtuple('Stream', ['ptr'])
    cls._STREAM = Stream(ptr=torch.cuda.current_stream().cuda_stream)
def compile_cuda(cuda_file):
    with open(cuda_file, 'rb') as f:
        src = f.read()
    from pynvrtc.compiler import Program
    prog = Program(src.decode(), cuda_file)
    ptx = prog.compile([
        '-use_fast_math',
        '-lineinfo',
        '-default-device',
        '-std=c++11',
        '-rdc',
        'true',
        #'-IC:\\ProgramData\\NVIDIA Corporation\OptiX SDK 7.2.0\include',
        #'-IC:\\Program Files\\NVIDIA GPU Computing Toolkit\CUDA\\v11.1\include',
        '-I/usr/local/cuda/include',
        '-I/home/kmorley/Code/support/NVIDIA-OptiX-SDK-7.2.0-linux64-x86_64/include/',
    ])
    return ptx
def get_kernel_func(kname, ksrc, dtype):
    if kname + dtype not in modules:
        ksrc = ksrc.replace('DTYPE', dtype)
        # prog = Program(ksrc.encode('utf-8'), (kname + dtype + '.cu').encode('utf-8'))
        # Uncomment the line above and comment the line below if it causes the
        # following error: AttributeError: 'Program' object has no attribute '_program'
        prog = Program(ksrc, kname + dtype + '.cu')
        ptx = prog.compile()
        log = prog._interface.nvrtcGetProgramLog(prog._program)
        if len(log.strip()) > 0:
            print(log)
        module = cupy.cuda.function.Module()
        module.load(bytes(ptx.encode()))
        modules[kname + dtype] = module
    else:
        module = modules[kname + dtype]
    Stream = namedtuple('Stream', ['ptr'])
    s = Stream(ptr=torch.cuda.current_stream().cuda_stream)
    return module.get_function(kname), s
def compile():
    if GPUForgetMult.ptx is None:
        program = Program(kernel, 'recurrent_forget_mult.cu')
        GPUForgetMult.ptx = program.compile()
    if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
        m = function.Module()
        m.load(bytes(GPUForgetMult.ptx.encode()))
        forget_mult = m.get_function('recurrent_forget_mult')
        bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')
        Stream = namedtuple('Stream', ['ptr'])
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
            forget_mult, bwd_forget_mult, stream)
    return GPUForgetMult.configured_gpus[torch.cuda.current_device()]
def compile(self):
    if self.ptx is None:
        program = Program(
            kernel, 'recurrent_forget_mult.cu',
            lib_name="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.1\\bin\\nvrtc64_101_0.dll")
        GPUForgetMult.ptx = program.compile()
    if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
        m = function.Module()
        m.load(bytes(self.ptx.encode()))
        self.forget_mult = m.get_function('recurrent_forget_mult')
        self.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
            self.forget_mult, self.bwd_forget_mult, self.stream)
    self.forget_mult, self.bwd_forget_mult, self.stream = GPUForgetMult.configured_gpus[
        torch.cuda.current_device()]
def compile(self):
    if GPULayerNorm.ptx is None:
        program = Program(kernel.encode(), 'layer_norm.cu'.encode())
        GPULayerNorm.ptx = program.compile()
    if torch.cuda.current_device() not in GPULayerNorm.configured_gpus:
        m = function.Module()
        m.load(bytes(GPULayerNorm.ptx.encode()))
        self.layer_norm = m.get_function('layer_norm')
        self.bwd_layer_norm = m.get_function('bwd_layer_norm')
        self.bwd_mults = m.get_function('bwd_multiplications')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPULayerNorm.configured_gpus[torch.cuda.current_device()] = (
            self.layer_norm, self.bwd_layer_norm, self.bwd_mults, self.stream)
    (self.layer_norm, self.bwd_layer_norm, self.bwd_mults,
     self.stream) = GPULayerNorm.configured_gpus[torch.cuda.current_device()]
def __call__(self, values, col_ind, col_ptr, size, dense):
    func_id = (size[0], size[1], size[2], dense.size(1), dense.size(2))
    if func_id in self.cached_functions:
        func = self.cached_functions[func_id]
    else:
        with open('utils/cuda/sparse_bmm.cu', 'r') as f:
            kernel = f.read()
        kernel = Template(kernel).substitute(BATCH_SIZE=size[0],
                                             SPARSE_NUM_ROWS=size[1],
                                             SPARSE_NUM_COLS=size[2],
                                             DENSE_NUM_ROWS=dense.size(1),
                                             DENSE_NUM_COLS=dense.size(2))
        program = Program(kernel, 'sparse_bmm.cu')
        ptx = program.compile()
        m = function.Module()
        m.load(bytes(ptx.encode()))
        func = m.get_function('sparse_bmm')
        self.cached_functions[func_id] = func
    values = values.contiguous()
    col_ind = col_ind.contiguous()
    col_ptr = col_ptr.contiguous()
    dense = dense.contiguous()
    result = values.new(size[0], size[1], dense.size(2))
    block = (8, 8, 8)
    grid = tuple([(result.size(i) + block[i] - 1) // block[i] for i in range(3)])
    func(grid=grid, block=block,
         args=[result.data_ptr(), values.data_ptr(), col_ind.data_ptr(),
               col_ptr.data_ptr(), dense.data_ptr()],
         stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return result
def compile(self):
    if self.ptx is None:
        program = Program(kernel, 'relu.cu')
        GPUReLUF.ptx = program.compile()
    if torch.cuda.current_device() not in GPUReLUF.configured_gpus:
        m = function.Module()
        # The PTX string must be encoded to bytes before loading.
        m.load(bytes(self.ptx.encode()))
        self.relu_forward = m.get_function('relu_forward')
        self.relu_backward = m.get_function('relu_backward')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUReLUF.configured_gpus[torch.cuda.current_device()] = (
            self.relu_forward, self.relu_backward, self.stream)
    self.relu_forward, self.relu_backward, self.stream = GPUReLUF.configured_gpus[
        torch.cuda.current_device()]
def compile(self):
    if self.ptx is None:
        # program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
        # https://github.com/salesforce/pytorch-qrnn/issues/26
        program = Program(kernel, 'recurrent_forget_mult.cu')
        GPUForgetMult.ptx = program.compile()
    if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
        m = function.Module()
        m.load(bytes(self.ptx.encode()))
        self.forget_mult = m.get_function('recurrent_forget_mult')
        self.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
            self.forget_mult, self.bwd_forget_mult, self.stream)
    self.forget_mult, self.bwd_forget_mult, self.stream = GPUForgetMult.configured_gpus[
        torch.cuda.current_device()]
def _build_cuda_kernels():
    global _apply_disparity_func_pos
    global _apply_disparity_func_neg
    _apply_disparity_pos_kernel = '''
    extern "C" {
        __global__ void apply_disparity_pos(
            float *dst, const float *src, const int *disp,
            int h, int w, int c, int total_l)
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i >= total_l) return;
            int dbase = (i/h/c*h+i%h)*w;
            for (int j = w - 1; j >= 0; j--) {
                int idx = j + disp[dbase+j];
                if (idx < w) dst[i*w+idx] = src[i*w+j];
            }
        }

        __global__ void apply_disparity_neg(
            float *dst, const float *src, const int *disp,
            int h, int w, int c, int total_l)
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i >= total_l) return;
            int dbase = (i/h/c*h+i%h)*w;
            for (int j = 0; j < w; j++) {
                int idx = j + disp[dbase+j];
                if (idx > -1) dst[i*w+idx] = src[i*w+j];
            }
        }
    }
    '''
    program = Program(_apply_disparity_pos_kernel, 'apply_disparity.cu')
    m = function.Module()
    m.load(bytes(program.compile().encode()))
    _apply_disparity_func_pos = m.get_function('apply_disparity_pos')
    _apply_disparity_func_neg = m.get_function('apply_disparity_neg')
def load_sru_mod():
    global SRU_FWD_FUNC, SRU_BWD_FUNC, SRU_BiFWD_FUNC, SRU_BiBWD_FUNC
    global SRU_STREAM
    if check_sru_requirement():
        from cupy.cuda import function
        from pynvrtc.compiler import Program

        # This sets up device to use.
        device = torch.device("cuda")
        tmp_ = torch.rand(1, 1).to(device)

        sru_prog = Program(SRU_CODE.encode('utf-8'), 'sru_prog.cu'.encode('utf-8'))
        sru_ptx = sru_prog.compile()
        sru_mod = function.Module()
        sru_mod.load(bytes(sru_ptx.encode()))
        SRU_FWD_FUNC = sru_mod.get_function('sru_fwd')
        SRU_BWD_FUNC = sru_mod.get_function('sru_bwd')
        SRU_BiFWD_FUNC = sru_mod.get_function('sru_bi_fwd')
        SRU_BiBWD_FUNC = sru_mod.get_function('sru_bi_bwd')
        stream = namedtuple('Stream', ['ptr'])
        SRU_STREAM = stream(ptr=torch.cuda.current_stream().cuda_stream)
class KernelCache():
    def __init__(self, kernel_name, func_name=None):
        self.func_name = func_name or kernel_name
        self.name = kernel_name + ".cu"
        with open(path.join(kernel_dir, self.name), 'r') as cu_f:
            self.kernel_source = cu_f.read().encode()
        self.prog = Program(self.kernel_source, self.name.encode())
        self.cache = {}

    def cached(self, device_id):
        try:
            kernel_func = self.cache[device_id].get_function(self.func_name)
        except KeyError:
            self.cache[device_id] = self.compile_and_prep_kernel(device_id)
            kernel_func = self.cache[device_id].get_function(self.func_name)
        return kernel_func

    def compile_and_prep_kernel(self, device_id):
        ptx = self.prog.compile([self.get_compute_arch_arg(device_id)])
        module = Module()
        module.load(ptx.encode())
        return module

    def get_compute_arch_arg(self, device_id):
        return "-arch=compute_{0}".format(
            device.Device(device_id).compute_capability).encode()

    def prep_args(self, kwargs):
        args = []
        for k, v in kwargs.items():
            try:
                args.append(v.data_ptr())
            except:
                args.append(v)
        return args
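# A hedged usage sketch for KernelCache: it assumes kernel_dir contains a file
# fill.cu defining extern "C" __global__ void fill(float *x, float v, int n).
# The file name, kernel name, and sizes are illustrative, not part of the
# original class.
import numpy as np
import torch

fill_cache = KernelCache('fill')
x = torch.empty(1024, device='cuda')
fill = fill_cache.cached(x.get_device())   # compiled once per device, then reused
args = fill_cache.prep_args({'x': x, 'v': np.float32(1.0), 'n': np.int32(x.numel())})
fill.linear_launch(x.numel(), args=args)   # one thread per element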
        *(grad_init + col) = cur;
    }
}
"""

if check_sru_requirement():
    from cupy.cuda import function
    from pynvrtc.compiler import Program

    # This cuda() is important, it sets up device to use.
    tmp_ = torch.rand(1, 1).cuda()

    sru_prog = Program(SRU_CODE.encode('utf-8'), 'sru_prog.cu'.encode('utf-8'))
    sru_ptx = sru_prog.compile()
    sru_mod = function.Module()
    sru_mod.load(bytes(sru_ptx.encode()))
    SRU_FWD_FUNC = sru_mod.get_function('sru_fwd')
    SRU_BWD_FUNC = sru_mod.get_function('sru_bwd')
    SRU_BiFWD_FUNC = sru_mod.get_function('sru_bi_fwd')
    SRU_BiBWD_FUNC = sru_mod.get_function('sru_bi_bwd')
    stream = namedtuple('Stream', ['ptr'])
    SRU_STREAM = stream(ptr=torch.cuda.current_stream().cuda_stream)


class SRU_Compute(Function):
    def __init__(self, activation_type, d_out, bidirectional=False):
            xp -= ncols_x_;
            cp -= ncols_;
            gup -= ncols_u_;
            gxp -= ncols_x_;
            ghp -= ncols_;
        }
        *(grad_bias + col) = gbias1;
        *(grad_bias + col + ncols) = gbias2;
        *(grad_init + col) = cur;
    }
}
"""

SRU_PROG = Program(SRU_CODE.encode('utf-8'), 'sru_prog.cu'.encode('utf-8'))
SRU_PTX = SRU_PROG.compile()
SRU_MOD = function.Module()
SRU_MOD.load(bytes(SRU_PTX.encode()))
SRU_FWD_FUNC = SRU_MOD.get_function('sru_fwd')
SRU_BWD_FUNC = SRU_MOD.get_function('sru_bwd')
SRU_BiFWD_FUNC = SRU_MOD.get_function('sru_bi_fwd')
SRU_BiBWD_FUNC = SRU_MOD.get_function('sru_bi_bwd')

Stream = namedtuple('Stream', ['ptr'])
SRU_STREAM = Stream(ptr=torch.cuda.current_stream().cuda_stream)


class SRU_Compute(Function):
    def __init__(self, activation_type, d_out, bidirectional=False):
        super(SRU_Compute, self).__init__()
        self.activation_type = activation_type
def __call__(self, input, k):
    out = input.new(input.size(0), input.size(1),
                    input.size(2) // k, input.size(3) // k, 2)
    if not self.jit or isinstance(input, (torch.FloatTensor, torch.DoubleTensor)):
        y = input.view(input.size(0), input.size(1),
                       input.size(2) // out.size(2), out.size(2),
                       input.size(3) // out.size(3), out.size(3), 2)
        out = y.mean(4).squeeze(4).mean(2).squeeze(2)
        return out
    if not iscomplex(input):
        raise TypeError('The input and outputs should be complex')
    input = input.contiguous()
    if self.periodize_cache[(input.size(), out.size(), input.get_device())] is None:
        kernel = '''
        #define NW ${W} / ${k}
        #define NH ${H} / ${k}
        extern "C"
        __global__ void periodize(const ${Dtype}2 *input, ${Dtype}2 *output)
        {
            int tx = blockIdx.x * blockDim.x + threadIdx.x;
            int ty = blockIdx.y * blockDim.y + threadIdx.y;
            int tz = blockIdx.z * blockDim.z + threadIdx.z;
            if(tx >= NW || ty >= NH || tz >= ${B})
                return;
            input += tz * ${H} * ${W} + ty * ${W} + tx;
            ${Dtype}2 res = make_${Dtype}2(0.f, 0.f);
            for (int j=0; j<${k}; ++j)
                for (int i=0; i<${k}; ++i)
                {
                    const ${Dtype}2 &c = input[j * NH * ${W} + i * NW];
                    res.x += c.x;
                    res.y += c.y;
                }
            res.x /= ${k} * ${k};
            res.y /= ${k} * ${k};
            output[tz * NH * NW + ty * NW + tx] = res;
        }
        '''
        B = input.nelement() // (2 * input.size(-2) * input.size(-3))
        W = input.size(-2)
        H = input.size(-3)
        k = input.size(-2) // out.size(-2)
        kernel = Template(kernel).substitute(B=B, H=H, W=W, k=k, Dtype=getDtype(input))
        name = (str(input.get_device()) + '-' + str(B) + '-' + str(k) + '-'
                + str(H) + '-' + str(W) + '-periodize.cu')
        prog = Program(kernel, name.encode())
        ptx = prog.compile(['-arch=' + get_compute_arch(input)])
        module = Module()
        module.load(bytes(ptx.encode()))
        self.periodize_cache[(input.size(), out.size(), input.get_device())] = module
    grid = (self.GET_BLOCKS(out.size(-3), self.block[0]),
            self.GET_BLOCKS(out.size(-2), self.block[1]),
            self.GET_BLOCKS(out.nelement() // (2 * out.size(-2) * out.size(-3)),
                            self.block[2]))
    periodize = self.periodize_cache[(input.size(), out.size(),
                                      input.get_device())].get_function('periodize')
    periodize(grid=grid, block=self.block,
              args=[input.data_ptr(), out.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return out
    sys.exit(1)

try:
    src = None
    options = []

    # Parse all options
    for a in sys.argv[1:]:
        if a.startswith('-'):
            # Treat as compiler option
            options.append(a)
        else:
            # Treat as compiler input
            with open(a, 'rb') as f:
                src = f.read()

    # Create program object
    p = Program(src)

    # Run the compile
    ptx = p.compile(options)

    # Dump the output to stdout
    print(ptx)
    sys.exit(0)

except ProgramException as e:
    # An error occurred, dump it to stdout
    print('ERROR:\n%s\n' % repr(e))