def forward(input, weight, hx, batch_sizes):
    """Run the cuDNN RNN forward pass and return ``(output, hidden)``.

    For LSTM mode the hidden argument is an ``(h, c)`` pair and the returned
    hidden is also a pair; for other modes it is a single tensor.
    NOTE(review): relies on closed-over configuration (mode, dropout, train,
    dropout_seed, dropout_state, flat_weight, hidden_size, num_layers,
    batch_first, bidirectional, variable_length) from the enclosing scope.
    """
    cx = None
    if mode == cudnn.CUDNN_LSTM:
        hx, cx = hx
    # Make sure a cuDNN handle exists before touching the dropout state.
    handle = cudnn.get_handle()
    with torch.cuda.device(input.get_device()):
        dropout_ts = cudnn.rnn.init_dropout_state(
            dropout, train, dropout_seed, dropout_state)
    # cuDNN wants one flat list of parameter tensors plus the number of
    # tensors that make up a single layer (the "stride" along dim 0).
    flat_params = list(itertools.chain.from_iterable(weight))
    params_per_layer = len(weight[0])
    output, hy, cy, reserve, new_weight_buf = torch._cudnn_rnn(
        input, flat_params, params_per_layer, flat_weight, hx, cx, mode,
        hidden_size, num_layers, batch_first, dropout, train,
        bool(bidirectional),
        list(batch_sizes.data) if variable_length else (),
        dropout_ts)
    # LSTM callers get the hidden state back as an (h, c) pair.
    return (output, (hy, cy)) if cx is not None else (output, hy)
def forward(input, weight, hx, batch_sizes):
    # Dispatch the cuDNN RNN forward pass through torch._cudnn_rnn.
    # NOTE(review): mode, dropout, train, dropout_seed, dropout_state,
    # flat_weight, hidden_size, num_layers, batch_first, bidirectional and
    # variable_length are free variables captured from the enclosing scope.
    if mode == cudnn.CUDNN_LSTM:
        # LSTM carries a (hidden, cell) pair; other modes have no cell state.
        hx, cx = hx
    else:
        cx = None
    # Ensure the cuDNN handle is initialized before building dropout state.
    handle = cudnn.get_handle()
    dropout_ts = cudnn.rnn.init_dropout_state(torch.cuda.uint8, dropout, train,
                                              dropout_seed, dropout_state)
    # Flatten per-layer weight lists into one list; weight_stride0 tells
    # cuDNN how many tensors belong to a single layer.
    weight_arr = list(itertools.chain.from_iterable(weight))
    weight_stride0 = len(weight[0])
    output, hy, cy, reserve, new_weight_buf = torch._cudnn_rnn(
        input, weight_arr, weight_stride0,
        Variable(flat_weight) if flat_weight is not None else None,
        hx, cx, mode, hidden_size, num_layers, batch_first, dropout, train,
        bool(bidirectional),
        list(batch_sizes.data) if variable_length else (),
        dropout_ts)
    # LSTM returns hidden state as an (h, c) pair.
    if cx is not None:
        return (output, (hy, cy))
    else:
        return (output, hy)
def forward(input, weight, hx, batch_sizes):
    """Forward an RNN through the cuDNN backend.

    Returns ``(output, hidden)``; for LSTM mode ``hidden`` is an ``(hy, cy)``
    pair, otherwise a single tensor.
    NOTE(review): configuration (mode, dropout, train, dropout_seed,
    dropout_state, flat_weight, hidden_size, num_layers, batch_first,
    bidirectional, variable_length) is captured from the enclosing scope.
    """
    cx = None
    if mode == cudnn.CUDNN_LSTM:
        hx, cx = hx
    handle = cudnn.get_handle()
    dropout_desc = cudnn.rnn.init_dropout_descriptor(
        handle, dropout, train, dropout_seed, dropout_state)
    # One flat list of parameter tensors, plus how many tensors form a layer.
    all_params = list(itertools.chain.from_iterable(weight))
    per_layer = len(weight[0])
    state = dropout_desc.state
    dropout_state_var = Variable(state) if state is not None else None
    flat = Variable(flat_weight) if flat_weight is not None else None
    output, hy, cy, reserve, new_weight_buf = \
        torch._C._VariableFunctions._cudnn_rnn(
            input, all_params, per_layer, flat, hx, cx, mode, hidden_size,
            num_layers, batch_first, dropout, train, bool(bidirectional),
            list(batch_sizes.data) if variable_length else (),
            dropout_state_var)
    return (output, (hy, cy)) if cx is not None else (output, hy)
def flatten_parameters(self):
    """Resets parameter data pointer so that they can use faster code paths.

    Right now, this works only if the module is on the GPU and cuDNN is
    enabled. Otherwise, it's a no-op.
    """
    any_param = next(self.parameters()).data
    if not any_param.is_cuda or not torch.backends.cudnn.is_acceptable(
            any_param):
        # Not eligible for the flattened cuDNN path; clear the cached data
        # pointers so callers know the weights are not flattened.
        self._data_ptrs = []
        return
    with torch.cuda.device_of(any_param):
        # This is quite ugly, but it allows us to reuse the cuDNN code without larger
        # modifications. It's really a low-level API that doesn't belong in here, but
        # let's make this exception.
        from torch.backends.cudnn import rnn
        from torch.backends import cudnn
        from torch.nn._functions.rnn import CudnnRNN
        handle = cudnn.get_handle()
        with warnings.catch_warnings(record=True):
            # Throwaway CudnnRNN used only to drive descriptor creation;
            # any warnings it emits are intentionally swallowed.
            fn = CudnnRNN(
                self.mode,
                self.input_size,
                self.hidden_size,
                num_layers=self.num_layers,
                batch_first=self.batch_first,
                dropout=self.dropout,
                train=self.training,
                bidirectional=self.bidirectional,
                dropout_state=self.dropout_state,
            )
        # Initialize descriptors
        fn.datatype = cudnn._typemap[any_param.type()]
        fn.x_descs = cudnn.descriptor(any_param.new(1, self.input_size), 1)
        fn.rnn_desc = rnn.init_rnn_descriptor(fn, handle)
        # Allocate buffer to hold the weights
        self._param_buf_size = rnn.get_num_weights(handle, fn.rnn_desc,
                                                   fn.x_descs[0], fn.datatype)
        fn.weight_buf = any_param.new(self._param_buf_size).zero_()
        fn.w_desc = rnn.init_weight_descriptor(fn, fn.weight_buf)
        # Slice off views into weight_buf
        params = rnn.get_parameters(fn, handle, fn.weight_buf)
        all_weights = [[p.data for p in l] for l in self.all_weights]
        # Copy weights and update their storage
        rnn._copyParams(all_weights, params)
        for orig_layer_param, new_layer_param in zip(all_weights, params):
            for orig_param, new_param in zip(orig_layer_param,
                                             new_layer_param):
                # Repoint each parameter at its slice of the shared buffer.
                orig_param.set_(new_param.view_as(orig_param))
        # Cache pointers so forward() can detect buffer invalidation.
        self._data_ptrs = list(p.data.data_ptr() for p in self.parameters())
def flatten_parameters(self):
    """Repoint every parameter into one contiguous cuDNN weight buffer.

    Enables the fast fused cuDNN code path. Only effective when the module
    lives on the GPU with a cuDNN-acceptable dtype; otherwise a no-op.
    """
    first = next(self.parameters()).data
    eligible = first.is_cuda and torch.backends.cudnn.is_acceptable(first)
    if not eligible:
        self._data_ptrs = []
        return
    with torch.cuda.device_of(first):
        # This is quite ugly, but it allows us to reuse the cuDNN code without larger
        # modifications. It's really a low-level API that doesn't belong in here, but
        # let's make this exception.
        from torch.backends.cudnn import rnn
        from torch.backends import cudnn
        from torch.nn._functions.rnn import CudnnRNN
        handle = cudnn.get_handle()
        with warnings.catch_warnings(record=True):
            # Throwaway function object used only for descriptor setup;
            # its warnings are deliberately suppressed.
            fn = CudnnRNN(
                self.mode,
                self.input_size,
                self.hidden_size,
                num_layers=self.num_layers,
                batch_first=self.batch_first,
                dropout=self.dropout,
                train=self.training,
                bidirectional=self.bidirectional,
                dropout_state=self.dropout_state,
            )
        # Descriptors cuDNN needs in order to size the flat weight buffer.
        fn.datatype = cudnn._typemap[first.type()]
        fn.x_descs = cudnn.descriptor(first.new(1, self.input_size), 1)
        fn.rnn_desc = rnn.init_rnn_descriptor(fn, handle)
        # One flat buffer that will hold every layer's weights and biases.
        self._param_buf_size = rnn.get_num_weights(
            handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
        fn.weight_buf = first.new(self._param_buf_size).zero_()
        fn.w_desc = rnn.init_weight_descriptor(fn, fn.weight_buf)
        # Views into weight_buf, one list of tensors per layer.
        buf_views = rnn.get_parameters(fn, handle, fn.weight_buf)
        current = [[p.data for p in layer] for layer in self.all_weights]
        # Copy existing values into the buffer, then make each parameter
        # alias its slice of the shared storage.
        rnn._copyParams(current, buf_views)
        for layer_params, layer_views in zip(current, buf_views):
            for param, view in zip(layer_params, layer_views):
                param.set_(view.view_as(param))
        self._data_ptrs = [p.data.data_ptr() for p in self.parameters()]
def backward_weight(fn, input, hx, output, weight, grad_weight):
    # Compute RNN weight gradients via cudnnRNNBackwardWeights and scatter
    # them into the per-layer grad_weight tensors, which are returned.
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()
        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM hidden state is a (hidden, cell) pair.
            hx, cx = hx
        else:
            cx = None
        if fn.batch_first and not is_input_packed:
            # cuDNN expects seq-major (seq, batch, feature) layout.
            input = input.transpose(0, 1)
            output = output.transpose(0, 1)
        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)
        if not fn.requires_grad:
            raise RuntimeError(
                'backward_weight can only be called when the function requires grad!'
            )
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v 5.1 and above')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        # NOTE(review): this check validates hx, but the message says
        # "Expected input size" — looks like a copy-paste slip; confirm.
        if tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                hidden_size, hx.size()))
        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        y = output
        # Gradients accumulate into a zeroed clone of the flat weight buffer.
        dw = fn.weight_buf.new().resize_as_(fn.weight_buf).zero_()
        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
            check_error(
                cudnn.lib.cudnnRNNBackwardWeights(
                    handle, fn.rnn_desc, fn.seq_length, fn.x_descs,
                    ctypes.c_void_p(x.data_ptr()), fn.hx_desc,
                    ctypes.c_void_p(hx.data_ptr()), fn.y_descs,
                    ctypes.c_void_p(y.data_ptr()),
                    ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                    fn.w_desc, ctypes.c_void_p(dw.data_ptr()),
                    ctypes.c_void_p(fn.reserve.data_ptr()),
                    fn.reserve.size(0)))
        # copy the weights from the weight_buf into grad_weight
        grad_params = get_parameters(fn, handle, dw)
        _copyParams(grad_params, grad_weight)
        return grad_weight
def backward_weight(fn, input, hx, output, weight, grad_weight):
    """Compute RNN weight gradients with ``cudnnRNNBackwardWeights``.

    Gradients accumulate into a zeroed flat buffer ``dw`` and are then
    scattered into the per-layer ``grad_weight`` tensors, which are
    returned. Requires the forward pass to have run with grad enabled so
    ``fn.reserve`` exists.
    """
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()
        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM hidden state is a (hidden, cell) pair.
            hx, cx = hx
        else:
            cx = None
        if fn.batch_first and not is_input_packed:
            # cuDNN expects seq-major (seq, batch, feature) layout.
            input = input.transpose(0, 1)
            output = output.transpose(0, 1)
        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)
        if not fn.requires_grad:
            raise RuntimeError('backward_weight can only be called when the function requires grad!')
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        # BUG FIX: this check validates the hidden state, but the original
        # message read "Expected input size", which misreported the failure.
        if tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, hx.size()))
        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        y = output
        # Accumulate into a zeroed clone of the flat weight buffer.
        dw = fn.weight_buf.new().resize_as_(fn.weight_buf).zero_()
        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
            check_error(cudnn.lib.cudnnRNNBackwardWeights(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                fn.w_desc, ctypes.c_void_p(dw.data_ptr()),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
            ))
        # copy the weights from the weight_buf into grad_weight
        grad_params = get_parameters(fn, handle, dw)
        _copyParams(grad_params, grad_weight)
        return grad_weight
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy,
                  grad_input, grad_hx):
    """Compute input/hidden gradients with ``cudnnRNNBackwardData``.

    Resizes and fills ``grad_input`` and ``grad_hx`` in place. For LSTM
    mode, ``hx``/``grad_hx``/``grad_hy`` are (hidden, cell) pairs.

    Fixes over the previous revision:
    - tensor truthiness (``if cx:``/``if grad_cy``) replaced with explicit
      ``is not None`` checks — truth-testing a multi-element tensor is
      ambiguous and raises;
    - the final batch-first transpose now uses in-place ``transpose_`` so
    the caller-visible ``grad_input`` is actually restored to
      batch-major layout (plain ``transpose`` returned a discarded copy);
    - ``lib.version`` referenced an unbound name; use ``cudnn.lib.version``
      as the sibling forward function does.
    """
    with torch.cuda.device_of(input):
        handle = cudnn.get_handle()
        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
            grad_hx, grad_cx = grad_hx
            grad_hy, grad_cy = grad_hy
        else:
            cx, grad_cx, grad_cy = None, None, None
        if fn.batch_first:
            # cuDNN expects seq-major layout.
            input = input.transpose(0, 1)
            grad_output = grad_output.transpose(0, 1)
            output = output.transpose(0, 1)
        input_size = _input_size(fn)
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn)
        x = input.contiguous()
        dy = grad_output.contiguous()
        y = output
        w = fn.weight_buf
        dx = grad_input.resize_as_(input)
        dhy = grad_hy.resize_(*hidden_size)
        dcy = grad_cy.resize_(*hidden_size) if grad_cy is not None else None
        dhx = grad_hx.resize_(*hidden_size)
        dcx = grad_cx.resize_(*hidden_size) if grad_cx is not None else None
        if fn.dropout != 0 and cudnn.lib.version < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v 5.1 and above')
        if not fn.train:
            raise RuntimeError(
                'backward_grad can only be called when training!')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(output.size()) != _output_size(fn):
            raise RuntimeError('Expected output size {}, got {}'.format(
                output_size, output.size()))
        if hx is not None and tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, hx.size()))
        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, cx.size()))
        if dhy is not None and tuple(dhy.size()) != hidden_size:
            raise RuntimeError('Expected d_hidden size {}, got {}'.format(
                hidden_size, dhy.size()))
        if dcy is not None and tuple(dcy.size()) != hidden_size:
            raise RuntimeError('Expected d_cell size {}, got {}'.format(
                hidden_size, dcy.size()))
        check_error(
            cudnn.lib.cudnnRNNBackwardData(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.y_descs, ctypes.c_void_p(dy.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(dhy.data_ptr()),
                fn.cy_desc,
                ctypes.c_void_p(dcy.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc,
                ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.x_descs, ctypes.c_void_p(dx.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(dhx.data_ptr()),
                fn.cx_desc,
                ctypes.c_void_p(dcx.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(fn.workspace.data_ptr()),
                fn.workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)))
        if fn.batch_first:
            # In-place so the caller's grad_input is batch-major again.
            grad_input = grad_input.transpose_(0, 1)
def forward(fn, input, hx, weight, output, hy):
    """Run the low-level cuDNN RNN forward pass, filling ``output``/``hy``
    in place.

    Caches descriptors, the flat weight buffer and the workspace on ``fn``
    for reuse by the backward functions.

    Fixes over the previous revision:
    - the "input.size(2) must be equal to input_size" message had two
      placeholders but was given one ``format`` argument, so raising it
      crashed with IndexError instead of the intended error;
    - the inference branch passed ``fn.reserve``, which is only allocated
      on the training path (AttributeError in eval mode);
      ``cudnnRNNForwardInference`` takes the workspace instead;
    - tensor truthiness (``if cy:``/``if cx else None``) replaced with
      explicit ``is not None`` checks;
    - the final batch-first restore now uses in-place ``transpose_`` so the
      caller-visible ``output`` is actually batch-major (plain
      ``transpose`` returned a discarded copy).
    """
    with torch.cuda.device_of(input):
        lib = cudnn.lib
        handle = cudnn.get_handle()
        fn.datatype = cudnn._typemap[input.type()]
        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM carries (hidden, cell) pairs.
            hx, cx = hx
            hy, cy = hy
        else:
            cx, cy = None, None
        if fn.batch_first:
            # cuDNN expects seq-major (seq, batch, feature) layout.
            input = input.transpose(0, 1)
        if input.dim() != 3:
            raise RuntimeError('input must have 3 dimensions, got {}'.format(
                input.dim()))
        if fn.input_size != input.size(2):
            raise RuntimeError(
                'input.size(2) must be equal to input_size. Expected {}, got {}'
                .format(fn.input_size, input.size(2)))
        if fn.dropout != 0 and cudnn.lib.version < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v5.1 and above')
        fn.seq_length, fn.mini_batch, fn.input_size = input.size()
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn)
        x = input.contiguous()
        output.resize_(*output_size)
        hy.resize_(*hidden_size).zero_()
        if cy is not None:
            cy.resize_(*hidden_size).zero_()
        y = output

        # init descriptors
        # NOTE(review): init_rnn_descriptor is called without the handle
        # here, unlike sibling variants — confirm against its signature.
        fn.dropout_desc = init_dropout_descriptor(fn, handle)
        fn.rnn_desc = init_rnn_descriptor(fn)
        fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
        fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
        fn.hx_desc = cudnn.descriptor(hx)
        fn.hy_desc = cudnn.descriptor(hx)
        fn.cx_desc = cudnn.descriptor(cx) if cx is not None else None
        fn.cy_desc = cudnn.descriptor(cx) if cx is not None else None

        # create the weight buffer and copy the weights into it
        num_weights = get_num_weights(handle, fn.rnn_desc, fn.x_descs[0],
                                      fn.datatype)
        fn.weight_buf = input.new(num_weights)
        fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
        w = fn.weight_buf
        # this zero might not seem necessary, but it is in the case
        # where biases are disabled; then they won't be copied and must be zero'd.
        # Alternatively, _copyParams could be written more carefully.
        w.zero_()
        params = get_parameters(fn, handle, w)
        _copyParams(weight, params)

        if tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, tuple(hx.size())))
        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, tuple(cx.size())))

        workspace_size = ctypes.c_long()
        check_error(
            lib.cudnnGetRNNWorkspaceSize(handle, fn.rnn_desc, fn.seq_length,
                                         fn.x_descs,
                                         ctypes.byref(workspace_size)))
        fn.workspace = torch.cuda.ByteTensor(workspace_size.value)
        if fn.train:
            # Training also needs the reserve buffer kept for backward.
            reserve_size = ctypes.c_long()
            check_error(
                lib.cudnnGetRNNTrainingReserveSize(handle, fn.rnn_desc,
                                                   fn.seq_length, fn.x_descs,
                                                   ctypes.byref(reserve_size)))
            fn.reserve = torch.cuda.ByteTensor(reserve_size.value)
            check_error(
                lib.cudnnRNNForwardTraining(
                    handle, fn.rnn_desc, fn.seq_length,
                    fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                    fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                    fn.cx_desc,
                    ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                    fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                    fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                    fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                    fn.cy_desc,
                    ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                    ctypes.c_void_p(fn.workspace.data_ptr()),
                    fn.workspace.size(0),
                    ctypes.c_void_p(fn.reserve.data_ptr()),
                    fn.reserve.size(0)))
        else:  # inference
            check_error(
                lib.cudnnRNNForwardInference(
                    handle, fn.rnn_desc, fn.seq_length,
                    fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                    fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                    fn.cx_desc,
                    ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                    fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                    fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                    fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                    fn.cy_desc,
                    ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                    ctypes.c_void_p(fn.workspace.data_ptr()),
                    fn.workspace.size(0)))
        if fn.batch_first:
            # In-place so the caller's output tensor is batch-major again.
            output.transpose_(0, 1)
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy,
                  grad_input, grad_hx):
    # Compute input/hidden gradients with cudnnRNNBackwardData, resizing and
    # filling grad_input/grad_hx in place. Handles both padded and packed
    # (fn.batch_sizes is not None) inputs. For LSTM mode, hx/grad_hx/grad_hy
    # are (hidden, cell) pairs.
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()
        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
            grad_hx, grad_cx = grad_hx
            grad_hy, grad_cy = grad_hy
        else:
            cx, grad_cx, grad_cy = None, None, None
        if fn.batch_first and not is_input_packed:
            # cuDNN expects seq-major layout; packed input is already flat.
            input = input.transpose(0, 1)
            grad_output = grad_output.transpose(0, 1)
            output = output.transpose(0, 1)
        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn, input)
        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        dy = grad_output.contiguous()
        y = output
        w = fn.weight_buf
        dx = grad_input.resize_as_(input)
        dhy = grad_hy.contiguous().view(*hidden_size)
        dcy = grad_cy.contiguous().view(
            *hidden_size) if grad_cy is not None else None
        dhx = grad_hx.resize_(*hidden_size)
        dcx = grad_cx.resize_(*hidden_size) if grad_cx is not None else None
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
        if not fn.requires_grad:
            raise RuntimeError('backward_grad can only be called when the function requires grad!')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(output.size()) != output_size:
            raise RuntimeError('Expected output size {}, got {}'.format(
                output_size, output.size()))
        if hx is not None and tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, hx.size()))
        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, cx.size()))
        if dhy is not None and tuple(dhy.size()) != hidden_size:
            raise RuntimeError('Expected d_hidden size {}, got {}'.format(
                hidden_size, dhy.size()))
        if dcy is not None and tuple(dcy.size()) != hidden_size:
            raise RuntimeError('Expected d_cell size {}, got {}'.format(
                hidden_size, dcy.size()))
        if not dhy.is_cuda or not dy.is_cuda or (dcy is not None and
                                                 not dcy.is_cuda):
            raise RuntimeError('Gradients aren\'t CUDA tensors')
        with torch.cuda.device_of(input):
            # Workspace is allocated fresh here (sized during forward).
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
            check_error(cudnn.lib.cudnnRNNBackwardData(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.y_descs, ctypes.c_void_p(dy.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(dhy.data_ptr()),
                fn.cy_desc,
                ctypes.c_void_p(dcy.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc,
                ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.x_descs, ctypes.c_void_p(dx.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(dhx.data_ptr()),
                fn.cx_desc,
                ctypes.c_void_p(dcx.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
            ))
        if fn.batch_first and not is_input_packed:
            # In-place so the caller's grad_input is batch-major again.
            grad_input = grad_input.transpose_(0, 1)
def forward(fn, input, hx, weight, output, hy):
    # Run the cuDNN RNN forward pass, filling `output` and `hy` in place.
    # Supports both padded (seq, batch, feature) inputs and packed sequences
    # (when fn.batch_sizes is set). Descriptors, the flat weight buffer and
    # buffer sizes are cached on `fn` for reuse by the backward functions.
    with torch.cuda.device_of(input):
        lib = cudnn.lib
        handle = cudnn.get_handle()
        fn.datatype = cudnn._typemap[input.type()]
        is_input_packed = fn.batch_sizes is not None
        if fn.mode == cudnn.CUDNN_LSTM:
            # LSTM carries (hidden, cell) pairs.
            hx, cx = hx
            hy, cy = hy
        else:
            cx, cy = None, None
        if fn.batch_first and not is_input_packed:
            # cuDNN expects seq-major layout; packed input is already flat.
            input = input.transpose(0, 1)
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError('dropout supported only in cudnn v5.1 and above')
        if is_input_packed:
            fn.seq_length = len(fn.batch_sizes)
            fn.mini_batch = fn.batch_sizes[0]
            fn.input_size = input.size(-1)
        else:
            fn.seq_length, fn.mini_batch, fn.input_size = input.size()
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn, input)
        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        output.resize_(*output_size)
        hy.resize_(*hidden_size)
        if cy is not None:
            cy.resize_(*hidden_size)
        y = output

        # init descriptors
        fn.rnn_desc = init_rnn_descriptor(fn, handle)
        if is_input_packed:
            fn.x_descs = cudnn.descriptor_sequence(x, fn.batch_sizes)
            fn.y_descs = cudnn.descriptor_sequence(y, fn.batch_sizes)
        else:
            fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
            fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
        fn.hx_desc = cudnn.descriptor(hx)
        fn.hy_desc = cudnn.descriptor(hx)
        fn.cx_desc = cudnn.descriptor(cx) if cx is not None else None
        fn.cy_desc = cudnn.descriptor(cx) if cx is not None else None

        # create the weight buffer and copy the weights into it
        if fn.weight_buf is None:
            num_weights = get_num_weights(
                handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
            fn.weight_buf = x.new(num_weights)
            fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
            w = fn.weight_buf
            # this zero might not seem necessary, but it is in the case
            # where biases are disabled; then they won't be copied and must be zero'd.
            # Alternatively, _copyParams could be written more carefully.
            w.zero_()
            params = get_parameters(fn, handle, w)
            _copyParams(weight, params)
        else:
            # Reuse the previously flattened weights (fast path).
            fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
            w = fn.weight_buf

        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, tuple(cx.size())))

        workspace_size = ctypes.c_long()
        check_error(lib.cudnnGetRNNWorkspaceSize(
            handle,
            fn.rnn_desc,
            fn.seq_length,
            fn.x_descs,
            ctypes.byref(workspace_size)
        ))
        # Remember the size so backward can allocate a matching workspace.
        fn.workspace_size = workspace_size.value
        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
        if fn.requires_grad:
            # Training path: also allocate the reserve buffer that backward
            # will need, and keep it on fn.
            reserve_size = ctypes.c_long()
            check_error(lib.cudnnGetRNNTrainingReserveSize(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs,
                ctypes.byref(reserve_size)
            ))
            fn.reserve = torch.cuda.ByteTensor(reserve_size.value)

            check_error(lib.cudnnRNNForwardTraining(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc,
                ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                fn.cy_desc,
                ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
            ))
        else:  # inference
            check_error(lib.cudnnRNNForwardInference(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc,
                ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                fn.cy_desc,
                ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0)
            ))
        if fn.batch_first and not is_input_packed:
            # In-place so the caller sees batch-major output again.
            output.transpose_(0, 1)
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy,
                  grad_input, grad_hx):
    # Compute input/hidden gradients with cudnnRNNBackwardData, resizing and
    # filling grad_input/grad_hx in place. Uses the workspace cached on `fn`
    # by the forward pass (unlike the sibling variant that reallocates one).
    # For LSTM mode, hx/grad_hx/grad_hy are (hidden, cell) pairs.
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()
        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
            grad_hx, grad_cx = grad_hx
            grad_hy, grad_cy = grad_hy
        else:
            cx, grad_cx, grad_cy = None, None, None
        if fn.batch_first and not is_input_packed:
            # cuDNN expects seq-major layout; packed input is already flat.
            input = input.transpose(0, 1)
            grad_output = grad_output.transpose(0, 1)
            output = output.transpose(0, 1)
        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn, input)
        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        dy = grad_output.contiguous()
        y = output
        w = fn.weight_buf
        dx = grad_input.resize_as_(input)
        dhy = grad_hy.contiguous().view(*hidden_size)
        dcy = grad_cy.contiguous().view(
            *hidden_size) if grad_cy is not None else None
        dhx = grad_hx.resize_(*hidden_size)
        dcx = grad_cx.resize_(*hidden_size) if grad_cx is not None else None
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v 5.1 and above')
        if not fn.requires_grad:
            raise RuntimeError(
                'backward_grad can only be called when the function requires grad!'
            )
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(output.size()) != output_size:
            raise RuntimeError('Expected output size {}, got {}'.format(
                output_size, output.size()))
        if hx is not None and tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, hx.size()))
        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, cx.size()))
        if dhy is not None and tuple(dhy.size()) != hidden_size:
            raise RuntimeError('Expected d_hidden size {}, got {}'.format(
                hidden_size, dhy.size()))
        if dcy is not None and tuple(dcy.size()) != hidden_size:
            raise RuntimeError('Expected d_cell size {}, got {}'.format(
                hidden_size, dcy.size()))
        if not dhy.is_cuda or not dy.is_cuda or (dcy is not None and
                                                 not dcy.is_cuda):
            raise RuntimeError('Gradients aren\'t CUDA tensors')
        check_error(
            cudnn.lib.cudnnRNNBackwardData(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.y_descs, ctypes.c_void_p(dy.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(dhy.data_ptr()),
                fn.cy_desc,
                ctypes.c_void_p(dcy.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc,
                ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.x_descs, ctypes.c_void_p(dx.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(dhx.data_ptr()),
                fn.cx_desc,
                ctypes.c_void_p(dcx.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(fn.workspace.data_ptr()),
                fn.workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)))
        if fn.batch_first and not is_input_packed:
            # In-place so the caller's grad_input is batch-major again.
            grad_input = grad_input.transpose_(0, 1)