def flatten_parameters(self):
    """Resets parameter data pointers so that they can use faster code paths.

    Right now, this works only if the module is on the GPU and cuDNN is enabled.
    Otherwise, it's a no-op.
    """
    any_param = next(self.parameters()).data
    if not any_param.is_cuda or not torch.backends.cudnn.is_acceptable(any_param):
        self._data_ptrs = []
        return

    with torch.cuda.device_of(any_param):
        # This is quite ugly, but it allows us to reuse the cuDNN code without larger
        # modifications. It's really a low-level API that doesn't belong in here, but
        # let's make this exception.
        from torch.backends.cudnn import rnn
        from torch.backends import cudnn
        from torch.nn._functions.rnn import CudnnRNN
        handle = cudnn.get_handle()
        with warnings.catch_warnings(record=True):
            fn = CudnnRNN(
                self.mode,
                self.input_size,
                self.hidden_size,
                num_layers=self.num_layers,
                batch_first=self.batch_first,
                dropout=self.dropout,
                train=self.training,
                bidirectional=self.bidirectional,
                dropout_state=self.dropout_state,
            )

        # Initialize descriptors
        fn.datatype = cudnn._typemap[any_param.type()]
        fn.x_descs = cudnn.descriptor(any_param.new(1, self.input_size), 1)
        fn.rnn_desc = rnn.init_rnn_descriptor(fn, handle)

        # Allocate buffer to hold the weights
        self._param_buf_size = rnn.get_num_weights(
            handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
        fn.weight_buf = any_param.new(self._param_buf_size).zero_()
        fn.w_desc = rnn.init_weight_descriptor(fn, fn.weight_buf)

        # Slice off views into weight_buf
        params = rnn.get_parameters(fn, handle, fn.weight_buf)
        all_weights = [[p.data for p in l] for l in self.all_weights]

        # Copy weights and update their storage
        rnn._copyParams(all_weights, params)
        for orig_layer_param, new_layer_param in zip(all_weights, params):
            for orig_param, new_param in zip(orig_layer_param, new_layer_param):
                orig_param.set_(new_param.view_as(orig_param))

        self._data_ptrs = list(p.data.data_ptr() for p in self.parameters())
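# For context, flatten_parameters() is normally called on an already-constructed
# recurrent module after it has been moved to the GPU, so that the per-layer weight
# tensors become views into one contiguous cuDNN weight buffer. A minimal usage
# sketch with the public nn.LSTM API; the layer and batch sizes below are arbitrary
# examples, and on CPU (or without cuDNN) the call is simply a no-op.
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=128, hidden_size=256, num_layers=2, batch_first=True)

x = torch.randn(4, 10, 128)  # (batch, seq, feature) because batch_first=True
if torch.cuda.is_available():
    lstm = lstm.cuda()
    x = x.cuda()
    # Re-pack the (now CUDA) weights into a single contiguous buffer so the
    # cuDNN kernels can take their fast path.
    lstm.flatten_parameters()

output, (h_n, c_n) = lstm(x)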
def forward(fn, input, hx, weight, output, hy):
    with torch.cuda.device_of(input):
        lib = cudnn.lib
        handle = cudnn.get_handle()
        fn.datatype = cudnn._typemap[input.type()]

        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
            hy, cy = hy
        else:
            cx, cy = None, None

        if fn.batch_first:
            input = input.transpose(0, 1)

        if input.dim() != 3:
            raise RuntimeError('input must have 3 dimensions, got {}'.format(input.dim()))
        if fn.input_size != input.size(2):
            raise RuntimeError('input.size(2) must be equal to input_size. Expected {}, got {}'.format(
                fn.input_size, input.size(2)))
        if fn.dropout != 0 and cudnn.lib.version < 5103:
            raise RuntimeError('dropout supported only in cudnn v5.1 and above')

        fn.seq_length, fn.mini_batch, fn.input_size = input.size()
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn)

        x = input.contiguous()
        output.resize_(*output_size)
        hy.resize_(*hidden_size).zero_()
        if cy is not None:
            cy.resize_(*hidden_size).zero_()
        y = output

        # init descriptors
        fn.dropout_desc = init_dropout_descriptor(fn, handle)
        fn.rnn_desc = init_rnn_descriptor(fn)
        fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
        fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
        fn.hx_desc = cudnn.descriptor(hx)
        fn.hy_desc = cudnn.descriptor(hx)
        fn.cx_desc = cudnn.descriptor(cx) if cx is not None else None
        fn.cy_desc = cudnn.descriptor(cx) if cx is not None else None

        # create the weight buffer and copy the weights into it
        num_weights = get_num_weights(handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
        fn.weight_buf = input.new(num_weights)
        fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
        w = fn.weight_buf
        # this zero might not seem necessary, but it is in the case
        # where biases are disabled; then they won't be copied and must be zero'd.
        # Alternatively, _copyParams could be written more carefully.
        w.zero_()
        params = get_parameters(fn, handle, w)
        _copyParams(weight, params)

        if tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, tuple(hx.size())))
        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, tuple(cx.size())))

        workspace_size = ctypes.c_long()
        check_error(lib.cudnnGetRNNWorkspaceSize(
            handle,
            fn.rnn_desc,
            fn.seq_length,
            fn.x_descs,
            ctypes.byref(workspace_size)
        ))
        fn.workspace = torch.cuda.ByteTensor(workspace_size.value)
        if fn.train:
            reserve_size = ctypes.c_long()
            check_error(lib.cudnnGetRNNTrainingReserveSize(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs,
                ctypes.byref(reserve_size)
            ))
            fn.reserve = torch.cuda.ByteTensor(reserve_size.value)

            check_error(lib.cudnnRNNForwardTraining(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(fn.workspace.data_ptr()), fn.workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
            ))
        else:  # inference
            check_error(lib.cudnnRNNForwardInference(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                # inference has no reserve space; only the workspace is passed
                ctypes.c_void_p(fn.workspace.data_ptr()), fn.workspace.size(0)
            ))

    if fn.batch_first:
        output = output.transpose(0, 1)
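# For reference: the _hidden_size / _output_size helpers used above are not shown
# in this excerpt. The sketch below is an assumption about what they most likely
# return, based on the standard cuDNN RNN layout, not necessarily the exact
# library implementation. It assumes fn carries num_layers, num_directions
# (2 if bidirectional else 1), mini_batch, hidden_size, and seq_length.

def _hidden_size(fn):
    # shape of hx/hy/cx/cy: one slab per layer and direction
    return fn.num_layers * fn.num_directions, fn.mini_batch, fn.hidden_size


def _output_size(fn):
    # time-major output; directions are concatenated along the feature dim
    return fn.seq_length, fn.mini_batch, fn.hidden_size * fn.num_directions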
def forward(fn, input, hx, weight, output, hy):
    with torch.cuda.device_of(input):
        lib = cudnn.lib
        handle = cudnn.get_handle()
        fn.datatype = cudnn._typemap[input.type()]
        is_input_packed = fn.batch_sizes is not None

        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
            hy, cy = hy
        else:
            cx, cy = None, None

        if fn.batch_first and not is_input_packed:
            input = input.transpose(0, 1)

        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError('dropout supported only in cudnn v5.1 and above')

        if is_input_packed:
            # packed input: batch_sizes gives the number of active sequences at each step
            fn.seq_length = len(fn.batch_sizes)
            fn.mini_batch = fn.batch_sizes[0]
            fn.input_size = input.size(-1)
        else:
            fn.seq_length, fn.mini_batch, fn.input_size = input.size()
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn, input)

        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        output.resize_(*output_size)
        hy.resize_(*hidden_size)
        if cy is not None:
            cy.resize_(*hidden_size)
        y = output

        # init descriptors
        fn.rnn_desc = init_rnn_descriptor(fn, handle)
        if is_input_packed:
            fn.x_descs = cudnn.descriptor_sequence(x, fn.batch_sizes)
            fn.y_descs = cudnn.descriptor_sequence(y, fn.batch_sizes)
        else:
            fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
            fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
        fn.hx_desc = cudnn.descriptor(hx)
        fn.hy_desc = cudnn.descriptor(hx)
        fn.cx_desc = cudnn.descriptor(cx) if cx is not None else None
        fn.cy_desc = cudnn.descriptor(cx) if cx is not None else None

        # create the weight buffer and copy the weights into it
        if fn.weight_buf is None:
            num_weights = get_num_weights(
                handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
            fn.weight_buf = x.new(num_weights)
            fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
            w = fn.weight_buf
            # this zero might not seem necessary, but it is in the case
            # where biases are disabled; then they won't be copied and must be zero'd.
            # Alternatively, _copyParams could be written more carefully.
            w.zero_()
            params = get_parameters(fn, handle, w)
            _copyParams(weight, params)
        else:
            # the weights were already flattened into a contiguous buffer
            # (e.g. by flatten_parameters), so just reuse it
            fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
            w = fn.weight_buf

        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, tuple(cx.size())))

        workspace_size = ctypes.c_long()
        check_error(lib.cudnnGetRNNWorkspaceSize(
            handle,
            fn.rnn_desc,
            fn.seq_length,
            fn.x_descs,
            ctypes.byref(workspace_size)
        ))
        fn.workspace_size = workspace_size.value
        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
        if fn.requires_grad:
            reserve_size = ctypes.c_long()
            check_error(lib.cudnnGetRNNTrainingReserveSize(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs,
                ctypes.byref(reserve_size)
            ))
            fn.reserve = torch.cuda.ByteTensor(reserve_size.value)

            check_error(lib.cudnnRNNForwardTraining(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
            ))
        else:  # inference
            check_error(lib.cudnnRNNForwardInference(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0)
            ))

        if fn.batch_first and not is_input_packed:
            output.transpose_(0, 1)
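# Usage sketch for the packed-sequence path handled above: when the input is a
# PackedSequence, fn.batch_sizes is set and the descriptors are built per time
# step via cudnn.descriptor_sequence. This illustration uses the public PyTorch
# API; the layer sizes and sequence lengths are arbitrary examples.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

rnn = nn.LSTM(input_size=8, hidden_size=16, num_layers=1)

# Three sequences of lengths 5, 3, 2, padded to length 5 (time-major layout).
padded = torch.randn(5, 3, 8)
lengths = [5, 3, 2]
if torch.cuda.is_available():
    rnn = rnn.cuda()
    padded = padded.cuda()

packed = pack_padded_sequence(padded, lengths)         # packed.batch_sizes == [3, 3, 2, 1, 1]
packed_out, (h_n, c_n) = rnn(packed)                   # takes the packed branch on GPU with cuDNN
output, out_lengths = pad_packed_sequence(packed_out)  # back to a padded (seq, batch, feature) tensor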