def test_reduce_list_tuples(self):
    tensors = [
        (torch.tensor([[3, 4, 5]]), torch.tensor([[0, 1, 2]])),
        (torch.tensor([[3, 4, 5]]), torch.tensor([[0, 1, 2]])),
    ]
    reduced = _reduce_list(tensors)
    assertTensorAlmostEqual(self, reduced[0], [[3, 4, 5], [3, 4, 5]])
    assertTensorAlmostEqual(self, reduced[1], [[0, 1, 2], [0, 1, 2]])
def _gather_distributed_tensors(
    saved_layer: Dict[device, Tuple[Tensor, ...]],
    device_ids: Union[None, List[int]] = None,
    key_list: Union[None, List[device]] = None,
) -> Tuple[Tensor, ...]:
    r"""
    A helper function to concatenate intermediate layer results stored on
    different devices in `saved_layer`. `saved_layer` is a dictionary that
    contains `device_id` as a key and intermediate layer results (either
    the input or the output of the layer) stored on the device corresponding
    to the key.

    `key_list` is a list of devices in appropriate ordering for concatenation;
    if not provided, keys are sorted based on device ids.

    If only one key exists (standard model), the key list simply has one
    element.
    """
    if key_list is None:
        key_list = _sort_key_list(list(saved_layer.keys()), device_ids)
    return _reduce_list([saved_layer[device_id] for device_id in key_list])
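# A minimal usage sketch for `_gather_distributed_tensors` (hypothetical
# example, not part of the library): with a single device there is one key and
# an explicit `key_list`, so gathering simply returns the stored tuple; with a
# DataParallel model the per-device tuples would instead be concatenated along
# the batch dimension in device order.
def _example_gather_single_device() -> Tuple[Tensor, ...]:
    dev = torch.device("cpu")
    saved_layer = {dev: (torch.zeros(2, 3), torch.ones(2, 3))}
    return _gather_distributed_tensors(saved_layer, key_list=[dev])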
def _neuron_gradients(
    inputs: Union[Tensor, Tuple[Tensor, ...]],
    saved_layer: Dict[device, Tuple[Tensor, ...]],
    key_list: List[device],
    gradient_neuron_index: Union[int, Tuple[int, ...]],
) -> Tuple[Tensor, ...]:
    with torch.autograd.set_grad_enabled(True):
        gradient_tensors = []
        for key in key_list:
            assert (
                len(saved_layer[key]) == 1
            ), "Cannot compute neuron gradients for layer with multiple tensors."
            current_out_tensor = saved_layer[key][0]
            gradient_tensors.append(
                torch.autograd.grad(
                    torch.unbind(
                        _verify_select_column(
                            current_out_tensor, gradient_neuron_index
                        )
                    ),
                    inputs,
                )
            )
        _total_gradients = _reduce_list(gradient_tensors, sum)
    return _total_gradients
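# A simplified sketch (hypothetical, for illustration only) of the neuron
# gradient computed above for a single saved layer tensor: select one neuron
# column across the batch and differentiate each batch element's value with
# respect to the input. `_verify_select_column` is assumed to perform the
# column selection; plain indexing stands in for it here.
def _example_neuron_gradient() -> Tuple[Tensor, ...]:
    inp = torch.randn(4, 3, requires_grad=True)
    layer_out = inp * 2.0            # stand-in for the saved layer output
    neuron_column = layer_out[:, 1]  # one neuron, all batch elements
    # Unbinding yields one scalar per batch element; grad then accumulates
    # their gradients with respect to `inp`.
    return torch.autograd.grad(torch.unbind(neuron_column), inp)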
def test_reduce_list_tensors(self):
    tensors = [torch.tensor([[3, 4, 5]]), torch.tensor([[0, 1, 2]])]
    reduced = _reduce_list(tensors)
    assertTensorAlmostEqual(self, reduced, [[3, 4, 5], [0, 1, 2]])
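# A simplified reimplementation sketch of the `_reduce_list` behaviour
# exercised by the two tests above (the real helper lives in captum's
# utilities; this illustrative version is not its actual code): tensors are
# reduced with `red_func` (concatenation by default), while lists of tuples
# are reduced position-wise, which is what the tuple test checks.
def _example_reduce_list(val_list, red_func=torch.cat):
    if isinstance(val_list[0], torch.Tensor):
        return red_func(val_list)
    # Tuple case: reduce each position across the list and rebuild the tuple.
    return tuple(
        _example_reduce_list([val[i] for val in val_list], red_func)
        for i in range(len(val_list[0]))
    )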
def compute_layer_gradients_and_eval(
    forward_fn: Callable,
    layer: Module,
    inputs: Union[Tensor, Tuple[Tensor, ...]],
    model=None,
    pre_hook=None,
    target: TargetType = None,
    target_ind: TargetType = None,
    additional_forward_args: Any = None,
    gradient_neuron_index: Union[None, int, Tuple[int, ...]] = None,
    device_ids: Union[None, List[int]] = None,
    attribute_to_layer_input: bool = False,
    output_fn: Union[None, Callable] = None,
) -> Union[
    Tuple[Tuple[Tensor, ...], Tuple[Tensor, ...], bool],
    Tuple[Tuple[Tensor, ...], Tuple[Tensor, ...], Tuple[Tensor, ...], bool],
]:
    r"""
    Computes gradients of the output with respect to a given layer as well
    as the output evaluation of the layer for an arbitrary forward function
    and given input.

    For data parallel models, hooks are executed once per device, so we
    need to internally combine the separated tensors from devices by
    concatenating based on device_ids. Any necessary gradients must be taken
    with respect to each independent batched tensor, so the gradients are
    computed and combined appropriately.

    More information regarding the behavior of forward hooks with DataParallel
    models can be found in the PyTorch data parallel documentation. We maintain
    the separate inputs in a dictionary protected by a lock, analogous to the
    gather implementation for the core PyTorch DataParallel implementation.

    NOTE: To properly handle inplace operations, a clone of the layer output
    is stored. This structure inhibits execution of a backward hook on the last
    module for the layer output when computing the gradient with respect to
    the input, since we store an intermediate clone, as opposed to the true
    module output. If backward module hooks are necessary for the final module
    when computing input gradients, utilize
    _forward_layer_eval_with_neuron_grads instead.

    Args:

        forward_fn: Forward function. This can be, for example, the model's
                    forward function.
        layer:      Layer for which gradients / output will be evaluated.
        inputs:     Input at which gradients are evaluated; will be passed to
                    forward_fn.
        target_ind: Index of the target class for which gradients must be
                    computed (classification only).
        output_fn:  An optional function that is applied to the layer inputs or
                    outputs depending on whether `attribute_to_layer_input` is
                    set to `True` or `False`.
        additional_forward_args: Additional input arguments that the forward
                    function requires. It takes an empty tuple (no additional
                    arguments) if no additional arguments are required.

    Returns:
        Tuple of **gradients**, **evals** (and, if `gradient_neuron_index` is
        provided, the neuron input gradients), followed by the layer-tuple
        flag:

        - **gradients**: Gradients of output with respect to target layer
          output. If `target` is None, a list with one gradient tuple per node
          is returned instead.
        - **evals**: Target layer output for given input.
    """
    with torch.autograd.set_grad_enabled(True):
        # saved_layer is a dictionary mapping device to a tuple of
        # layer evaluations on that device.
        saved_layer, output, is_layer_tuple = _forward_layer_distributed_eval(
            forward_fn,
            inputs,
            layer,
            target_ind=target_ind,
            additional_forward_args=None,
            attribute_to_layer_input=attribute_to_layer_input,
            forward_hook_with_return=True,
        )
        device_ids = _extract_device_ids(forward_fn, saved_layer, device_ids)

        # Identifies correct device ordering based on device ids.
        # key_list is a list of devices in appropriate ordering for
        # concatenation. If only one key exists (standard model), the key
        # list simply has one element.
        key_list = _sort_key_list(list(saved_layer.keys()), device_ids)

        all_outputs = _reduce_list(
            [
                saved_layer[device_id]
                if output_fn is None
                else output_fn(saved_layer[device_id])
                for device_id in key_list
            ]
        )
        num_tensors = len(saved_layer[next(iter(saved_layer))])
        grad_inputs = [
            layer_tensor
            for device_id in key_list
            for layer_tensor in saved_layer[device_id]
        ]
        # --- add for require_grad ---
        # Gradients can only be taken with respect to tensors that require
        # grad, so drop the others and adjust the per-device tensor count.
        kept_grad_inputs = [t for t in grad_inputs if t.requires_grad]
        num_tensors -= (len(grad_inputs) - len(kept_grad_inputs)) // len(key_list)
        grad_inputs = tuple(kept_grad_inputs)

        if target is None:
            # Per-node gradients: `output` is expected to stack two groups of
            # num_node rows, so each node at node_idx is paired with the row
            # at node_idx + num_node below.
            num_node = len(inputs[0])
            num_feature = inputs[0].shape[1]
            grad_node = []
            import time
            import numpy as np

            total_tik = time.time()
            back_time = []
            for node_idx in range(num_node):
                # for feature_idx in range(num_feature):
                # --- a batched formulation might compute these in parallel ---
                # chosen_output = torch.cat(
                #     [output[node_idx, feature_idx: feature_idx + 1],
                #      output[node_idx + num_node, feature_idx: feature_idx + 1]])
                chosen_output = torch.cat(
                    [output[node_idx], output[node_idx + num_node]]
                )
                # chosen_output = output[node_idx]
                back_tik = time.time()
                # pre_hook.remove()
                # jaco = torch.autograd.functional.jacobian(
                #     model, (grad_inputs, additional_forward_args[0]))
                saved_grads = torch.autograd.grad(
                    torch.unbind(chosen_output), grad_inputs, retain_graph=True
                )
                back_tok = time.time()
                back_time.append(back_tok - back_tik)
                # Regroup the flat gradient tuple into per-device chunks of
                # num_tensors entries before reducing across devices.
                saved_grads = [
                    saved_grads[i : i + num_tensors]
                    for i in range(0, len(saved_grads), num_tensors)
                ]
                if output_fn is not None:
                    saved_grads = [
                        output_fn(saved_grad) for saved_grad in saved_grads
                    ]
                all_grads = _reduce_list(saved_grads)
                grad_node.append(all_grads)
            if gradient_neuron_index is not None:
                inp_grads = _neuron_gradients(
                    inputs, saved_layer, key_list, gradient_neuron_index
                )
                return all_grads, all_outputs, inp_grads, is_layer_tuple
            # print(f'#D#total time: {time.time() - total_tik}\n'
            #       f'backward time: {np.sum(back_time)}')
            return grad_node, all_outputs, is_layer_tuple

        # assert output[0].numel() == 1, (
        #     "Target not provided when necessary, cannot"
        #     " take gradient with respect to multiple outputs."
        # )
        saved_grads = torch.autograd.grad(torch.unbind(output), grad_inputs)
        saved_grads = [
            saved_grads[i : i + num_tensors]
            for i in range(0, len(saved_grads), num_tensors)
        ]
        if output_fn is not None:
            saved_grads = [output_fn(saved_grad) for saved_grad in saved_grads]
        all_grads = _reduce_list(saved_grads)
        if gradient_neuron_index is not None:
            inp_grads = _neuron_gradients(
                inputs, saved_layer, key_list, gradient_neuron_index
            )
            return all_grads, all_outputs, inp_grads, is_layer_tuple
    return all_grads, all_outputs, is_layer_tuple
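# A standalone toy sketch (hypothetical) of the `target is None` branch in
# compute_layer_gradients_and_eval: for each "node", its output row is
# concatenated with the row of the paired node at offset `num_node`, and
# gradients of that slice are taken with respect to the saved layer tensor,
# retaining the graph so the next node can reuse it. A plain elementwise head
# stands in for the real forward function and layer.
def _example_per_node_gradients() -> List[Tuple[Tensor, ...]]:
    num_node = 2
    layer_out = torch.randn(2 * num_node, 3, requires_grad=True)  # saved layer output
    output = layer_out * 1.5  # stand-in for the model head applied to the layer
    grad_node = []
    for node_idx in range(num_node):
        chosen_output = torch.cat([output[node_idx], output[node_idx + num_node]])
        grad_node.append(
            torch.autograd.grad(
                torch.unbind(chosen_output), layer_out, retain_graph=True
            )
        )
    return grad_node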