Example #1
def test_sort_key_list(self):
    key_list = [
        torch.device("cuda:13"),
        torch.device("cuda:17"),
        torch.device("cuda:10"),
        torch.device("cuda:0"),
    ]
    device_index_list = [0, 10, 13, 17]
    sorted_keys = _sort_key_list(key_list, device_index_list)
    for i in range(len(key_list)):
        self.assertEqual(sorted_keys[i].index, device_index_list[i])
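The test above pins down the expected contract of `_sort_key_list`: the device keys come back ordered by the position of their index in `device_index_list`. The snippet below is a minimal stand-alone sketch of that contract (an illustration written for this note, not the library's actual implementation; `sort_key_list_sketch` is a hypothetical name):

from typing import List

import torch


def sort_key_list_sketch(
    keys: List[torch.device], device_ids: List[int]
) -> List[torch.device]:
    # Order device keys by the position of their index in device_ids.
    position = {device_id: pos for pos, device_id in enumerate(device_ids)}
    return sorted(keys, key=lambda d: position[d.index])


keys = [torch.device(f"cuda:{i}") for i in (13, 17, 10, 0)]
print(sort_key_list_sketch(keys, [0, 10, 13, 17]))
# [device(type='cuda', index=0), device(type='cuda', index=10),
#  device(type='cuda', index=13), device(type='cuda', index=17)]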
Example #2
def _forward_layer_eval_with_neuron_grads(
    forward_fn: Callable,
    inputs: Union[Tensor, Tuple[Tensor, ...]],
    layer: Module,
    additional_forward_args: Any = None,
    gradient_neuron_index: Union[None, int, Tuple[int, ...]] = None,
    device_ids: Union[None, List[int]] = None,
    attribute_to_layer_input: bool = False,
) -> Union[
    Tuple[Tuple[Tensor, ...], Tuple[Tensor, ...], bool], Tuple[Tuple[Tensor, ...], bool]
]:
    """
    This method computes forward evaluation for a particular layer using a
    forward hook. If a gradient_neuron_index is provided, then gradients with
    respect to that neuron in the layer output are also returned.

    These functionalities are combined due to the behavior of DataParallel models
    with hooks, in which hooks are executed once per device. We need to internally
    combine the separated tensors from devices by concatenating based on device_ids.
    Any necessary gradients must be taken with respect to each independent batched
    tensor, so the gradients are computed and combined appropriately.

    More information regarding the behavior of forward hooks with DataParallel models
    can be found in the PyTorch data parallel documentation. We maintain the separate
    evals in a dictionary protected by a lock, analogous to the gather implementation
    for the core PyTorch DataParallel implementation.
    """
    saved_layer, is_layer_tuple = _forward_layer_distributed_eval(
        forward_fn,
        inputs,
        layer,
        additional_forward_args=additional_forward_args,
        attribute_to_layer_input=attribute_to_layer_input,
    )
    device_ids = _extract_device_ids(forward_fn, saved_layer, device_ids)
    # Identifies correct device ordering based on device ids.
    # key_list is a list of devices in appropriate ordering for concatenation.
    # If only one key exists (standard model), key list simply has one element.
    key_list = _sort_key_list(list(saved_layer.keys()), device_ids)
    if gradient_neuron_index is not None:
        inp_grads = _neuron_gradients(
            inputs, saved_layer, key_list, gradient_neuron_index
        )
        return (
            _gather_distributed_tensors(saved_layer, key_list=key_list),
            inp_grads,
            is_layer_tuple,
        )
    else:
        return (
            _gather_distributed_tensors(saved_layer, key_list=key_list),
            is_layer_tuple,
        )
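A hedged usage sketch for `_forward_layer_eval_with_neuron_grads`, assuming the helpers above are importable from the module that defines them (shown here under the hypothetical name `gradient_utils`) and that the model runs on a single device:

import torch
import torch.nn as nn

from gradient_utils import _forward_layer_eval_with_neuron_grads  # hypothetical module path

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
inputs = torch.randn(3, 4, requires_grad=True)

# Layer evaluations only: returns the first Linear layer's outputs together
# with the flag indicating whether the layer output is a tuple.
layer_evals, is_layer_tuple = _forward_layer_eval_with_neuron_grads(
    model, inputs, layer=model[0]
)

# With gradient_neuron_index, gradients of the selected layer neuron with
# respect to the inputs are returned as well.
layer_evals, inp_grads, is_layer_tuple = _forward_layer_eval_with_neuron_grads(
    model, inputs, layer=model[0], gradient_neuron_index=0
)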
Example #3
def _gather_distributed_tensors(
    saved_layer: Dict[device, Tuple[Tensor, ...]],
    device_ids: Union[None, List[int]] = None,
    key_list: Union[None, List[device]] = None,
) -> Tuple[Tensor, ...]:
    r"""
    A helper function to concatenate intermediate layer results stored on
    different devices in `saved_layer`. `saved_layer` is a dictionary that
    contains `device_id` as a key and intermediate layer results (either
    the input or the output of the layer) stored on the device corresponding to
    the key.
    `key_list` is a list of devices in appropriate ordering for concatenation
    and if not provided, keys are sorted based on device ids.

    If only one key exists (standard model), key list simply has one element.
    """
    if key_list is None:
        key_list = _sort_key_list(list(saved_layer.keys()), device_ids)
    return _reduce_list([saved_layer[device_id] for device_id in key_list])
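To make the gather step concrete, the sketch below builds a `saved_layer` dictionary by hand and concatenates the per-device shards in an explicit `key_list` order. CPU tensors stand in for the per-device results, and the `gradient_utils` import path is again a hypothetical assumption:

import torch

from gradient_utils import _gather_distributed_tensors  # hypothetical module path

dev0, dev1 = torch.device("cuda:0"), torch.device("cuda:1")
saved_layer = {
    dev0: (torch.zeros(2, 4),),  # first half of the batch, "device 0"
    dev1: (torch.ones(2, 4),),   # second half of the batch, "device 1"
}
gathered = _gather_distributed_tensors(saved_layer, key_list=[dev0, dev1])
print(gathered[0].shape)  # torch.Size([4, 4]) -- shards concatenated along dim 0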
Example #4
def compute_layer_gradients_and_eval(
    forward_fn: Callable,
    layer: Module,
    inputs: Union[Tensor, Tuple[Tensor, ...]],
    model=None,
    pre_hook=None,
    target: TargetType = None,
    target_ind: TargetType = None,
    additional_forward_args: Any = None,
    gradient_neuron_index: Union[None, int, Tuple[int, ...]] = None,
    device_ids: Union[None, List[int]] = None,
    attribute_to_layer_input: bool = False,
    output_fn: Union[None, Callable] = None,
) -> Union[
    Tuple[Tuple[Tensor, ...], Tuple[Tensor, ...], bool],
    Tuple[Tuple[Tensor, ...], Tuple[Tensor, ...], Tuple[Tensor, ...], bool],
]:
    r"""
        Computes gradients of the output with respect to a given layer as well
        as the output evaluation of the layer for an arbitrary forward function
        and given input.

        For data parallel models, hooks are executed once per device, so we
        need to internally combine the separated tensors from devices by
        concatenating based on device_ids. Any necessary gradients must be taken
        with respect to each independent batched tensor, so the gradients are
        computed and combined appropriately.

        More information regarding the behavior of forward hooks with DataParallel
        models can be found in the PyTorch data parallel documentation. We maintain
        the separate inputs in a dictionary protected by a lock, analogous to the
        gather implementation for the core PyTorch DataParallel implementation.

        NOTE: To properly handle inplace operations, a clone of the layer output
        is stored. This structure inhibits execution of a backward hook on the last
        module for the layer output when computing the gradient with respect to
        the input, since we store an intermediate clone, as
        opposed to the true module output. If backward module hooks are necessary
        for the final module when computing input gradients, utilize
        _forward_layer_eval_with_neuron_grads instead.

        Args:

            forward_fn: forward function. This can be, for example, the model's
                        forward function.
            layer:      Layer for which gradients / output will be evaluated.
            inputs:     Input at which gradients are evaluated,
                        will be passed to forward_fn.
            target_ind: Index of the target class for which gradients
                        must be computed (classification only).
            output_fn:  An optional function that is applied to the layer inputs or
                        outputs, depending on whether `attribute_to_layer_input` is
                        set to `True` or `False`.
            args:       Additional input arguments that the forward function
                        requires. Pass an empty tuple if no additional arguments
                        are required.


        Returns:
            Tuple of **gradients**, **evals**, and a flag indicating whether the
            layer output is a tuple (plus **neuron gradients** when
            `gradient_neuron_index` is provided):
            - **gradients**:
                Gradients of output with respect to target layer output.
            - **evals**:
                Target layer output for given input.
    """
    with torch.autograd.set_grad_enabled(True):
        # saved_layer is a dictionary mapping device to a tuple of
        # layer evaluations on that device.
        saved_layer, output, is_layer_tuple = _forward_layer_distributed_eval(
            forward_fn,
            inputs,
            layer,
            target_ind=target_ind,
            additional_forward_args=additional_forward_args,
            attribute_to_layer_input=attribute_to_layer_input,
            forward_hook_with_return=True,
        )

        device_ids = _extract_device_ids(forward_fn, saved_layer, device_ids)

        # Identifies correct device ordering based on device ids.
        # key_list is a list of devices in appropriate ordering for concatenation.
        # If only one key exists (standard model), key list simply has one element.
        key_list = _sort_key_list(list(saved_layer.keys()), device_ids)

        all_outputs = _reduce_list([
            saved_layer[device_id]
            if output_fn is None else output_fn(saved_layer[device_id])
            for device_id in key_list
        ])
        num_tensors = len(saved_layer[next(iter(saved_layer))])
        grad_inputs = [
            layer_tensor for device_id in key_list
            for layer_tensor in saved_layer[device_id]
        ]

        # Keep only tensors that require grad. Popping from the list while
        # iterating over it would skip elements, so build a filtered list
        # instead and adjust num_tensors by the number of dropped tensors.
        filtered_grad_inputs = [t for t in grad_inputs if t.requires_grad]
        num_tensors -= len(grad_inputs) - len(filtered_grad_inputs)
        grad_inputs = tuple(filtered_grad_inputs)

        if target is None:
            num_node = len(inputs[0])
            num_feature = inputs[0].shape[1]
            grad_node = []
            import time, numpy as np
            total_tik = time.time()
            back_time = []
            for node_idx in range(num_node):
                # for feature_idx in range(num_feature):
                # --- Could potentially batch the nodes to compute this in parallel ---

                # chosen_output = torch.cat([output[node_idx, feature_idx: feature_idx + 1], output[node_idx + num_node, feature_idx: feature_idx + 1]])
                chosen_output = torch.cat(
                    [output[node_idx], output[node_idx + num_node]])
                # chosen_output = output[node_idx]
                back_tik = time.time()
                # pre_hook.remove()
                # jaco = torch.autograd.functional.jacobian(model, (grad_inputs, additional_forward_args[0]))
                saved_grads = torch.autograd.grad(torch.unbind(chosen_output),
                                                  grad_inputs,
                                                  retain_graph=True)
                back_tok = time.time()
                back_time.append(back_tok - back_tik)
                saved_grads = [
                    saved_grads[i:i + num_tensors]
                    for i in range(0, len(saved_grads), num_tensors)
                ]
                if output_fn is not None:
                    saved_grads = [
                        output_fn(saved_grad) for saved_grad in saved_grads
                    ]

                all_grads = _reduce_list(saved_grads)
                grad_node.append(all_grads)
                if gradient_neuron_index is not None:
                    inp_grads = _neuron_gradients(inputs, saved_layer,
                                                  key_list,
                                                  gradient_neuron_index)
                    return all_grads, all_outputs, inp_grads, is_layer_tuple
            # print(f'#D#total time: {time.time() - total_tik}\n'
            #       f'back ward time: {np.sum(back_time)}')
            return grad_node, all_outputs, is_layer_tuple

        # assert output[0].numel() == 1, (
        #     "Target not provided when necessary, cannot"
        #     " take gradient with respect to multiple outputs."
        # )

        saved_grads = torch.autograd.grad(torch.unbind(output), grad_inputs)
        saved_grads = [
            saved_grads[i:i + num_tensors]
            for i in range(0, len(saved_grads), num_tensors)
        ]
        if output_fn is not None:
            saved_grads = [output_fn(saved_grad) for saved_grad in saved_grads]

        all_grads = _reduce_list(saved_grads)
        if gradient_neuron_index is not None:
            inp_grads = _neuron_gradients(inputs, saved_layer, key_list,
                                          gradient_neuron_index)
            return all_grads, all_outputs, inp_grads, is_layer_tuple
    return all_grads, all_outputs, is_layer_tuple
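Finally, a hedged usage sketch for `compute_layer_gradients_and_eval` on the standard path (a non-None `target` skips the per-node loop). The `gradient_utils` module path is a hypothetical assumption, and the call pattern below is inferred from the signature and code above rather than confirmed library usage:

import torch
import torch.nn as nn

from gradient_utils import compute_layer_gradients_and_eval  # hypothetical module path

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 3))
inputs = torch.randn(5, 4)
target_class = 1

layer_grads, layer_evals, is_layer_tuple = compute_layer_gradients_and_eval(
    model,       # forward_fn
    model[0],    # layer whose gradients and output are evaluated
    inputs,
    target=target_class,      # non-None target selects the standard gradient path
    target_ind=target_class,  # index used to select the model output for the backward pass
)
# layer_grads and layer_evals are tuples of tensors, each of shape (5, 8) here.
print(layer_grads[0].shape, layer_evals[0].shape)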