Example #1
    def _make_params(self):
        """Create the internal attributes needed by this module."""

        w = getattr(self.module, self.name)

        height = w.data.shape[0]
        width = w.view(height, -1).data.shape[1]

        # Create u and v as torch.Parameters from a standard normal distribution
        # with the height and width from the chosen attribute in the internal module.
        u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
        v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)

        # Normalize u and v using L2 and create torch.Parameter for the chosen attribute of the internal module.
        u.data = l2normalize(u.data)
        v.data = l2normalize(v.data)
        w_bar = Parameter(w.data)

        # Delete the chosen attribute from the internal module; it is tracked by this module instead.
        del self.module._parameters[self.name]

        # Register the new parameters.
        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)
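The examples on this page call an l2normalize helper that is defined elsewhere in each project. A minimal sketch of what that helper typically looks like in spectral-normalization code (the epsilon value is an assumption, not taken from any example):

import torch

def l2normalize(v: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    # Divide by the L2 norm; eps guards against division by zero.
    return v / (v.norm() + eps)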
Example #2
 def _make_params(self):
     w = getattr(self.module, self.name)
     height = w.data.shape[0]
     width = w.view(height, -1).data.shape[1]
     u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=True)
     v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=True)
     u.data = l2normalize(u.data)
     v.data = l2normalize(v.data)
     w_bar = Parameter(w.data)
     del self.module._parameters[self.name]
     self.module.register_parameter(self.name + "_u", u)
     self.module.register_parameter(self.name + "_v", v)
     self.module.register_parameter(self.name + "_bar", w_bar)
Example #3
    def __call__(self, mu: Parameter, rho: Parameter) -> TwoParameters:
        """Call

        Arguments:
            mu (nn.Parameter): mu parameter to be initialized
            rho (nn.Parameter): rho parameter to be initialized

        Returns:
            (nn.Parameter): mu initialized
            (nn.Parameter): rho initialized
        """
        mu.data = mu.data.uniform_(*self.mu_range)
        rho.data = rho.data.uniform_(*self.rho_range)
        return mu, rho
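A minimal, self-contained usage sketch of this kind of initializer; the ranges below are assumptions for illustration, the real ones come from the object's constructor (self.mu_range / self.rho_range):

import torch
from torch.nn import Parameter

mu_range = (-0.2, 0.2)     # hypothetical value
rho_range = (-5.0, -4.0)   # hypothetical value

mu = Parameter(torch.empty(64, 32))
rho = Parameter(torch.empty(64, 32))

# The same in-place initialization the __call__ above performs.
mu.data = mu.data.uniform_(*mu_range)
rho.data = rho.data.uniform_(*rho_range)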
Example #4
 def fixup(p: Parameter, is_sharded: bool,
           size: torch.Size) -> Parameter:
     assert isinstance(p, Parameter)
     p.data = p.data.clone()  # move tensors out of shared memory
     p._is_sharded = is_sharded
     p._orig_size = size
     return p
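A hedged usage sketch of fixup: flatten a weight, keep the first half as the local shard, and record the sharding metadata. The names below are assumptions for illustration, and fixup above is assumed to be in scope:

import torch
from torch.nn import Parameter

full_weight = torch.randn(8, 4)
shard = Parameter(full_weight.view(-1)[: full_weight.numel() // 2].clone())
shard = fixup(shard, is_sharded=True, size=full_weight.size())
print(shard._is_sharded, shard._orig_size)  # True torch.Size([8, 4])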
Example #5
    def _make_params(self):
        w = getattr(self.module, self.name)

        height = w.data.shape[0]
        width = w.view(height, -1).data.shape[1]

        u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
        v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
        u.data = l2normalize(u.data)
        v.data = l2normalize(v.data)
        w_bar = Parameter(w.data)

        del self.module._parameters[self.name]

        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)
Example #6
    def _make_params(self):
        """
            Set the parameters from scratch
        """
        # Create the parameters
        w = getattr(self.module, self.name)
        height = w.data.shape[0]
        width = w.view(height, -1).data.shape[1]
        u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
        v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
        u.data = self.l2normalize(u.data)
        v.data = self.l2normalize(v.data)
        w_bar = Parameter(w.data)

        # Register the parameters on the module
        del self.module._parameters[self.name]
        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)
Example #7
    def _make_params(self):
        w = getattr(self.module, self.name)  # get the weight first 100x512x4x4

        height = w.data.shape[0]  # nn.ConvTranspose2d(z_dim, conv_dim * mult, 4) -- 100
        width = w.view(height, -1).data.shape[1]  # 8192

        u = Parameter(w.data.new(height).normal_(0, 1),
                      requires_grad=False)  # u is a random vector
        v = Parameter(w.data.new(width).normal_(0, 1),
                      requires_grad=False)  # v is a random vector
        u.data = l2normalize(u.data)  # normalize
        v.data = l2normalize(v.data)  # normalize
        w_bar = Parameter(w.data)

        del self.module._parameters[self.name]

        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)
Example #8
	def _make_params(self):
		w = getattr(self.module, self.name)
		# height = dout , width = din * w * h
		height = w.data.shape[0]
		width = w.view(height, -1).data.shape[1]
		# Using type_as would make the new tensor's dtype and device (CUDA/CPU) match that variable
		u = Parameter(t.randn(height), requires_grad=False)
		v = Parameter(t.randn(width), requires_grad=False)
		
		u.data = self.l2normalize(u.data)
		v.data = self.l2normalize(v.data)
		# Here w is a Parameter while .data is a plain Tensor; a Parameter carries gradients by default
		w_real = Parameter(w.data)
		# Because of Python's reference semantics, deleting this entry only removes one reference;
		# w still points at the original tensor and keeps being updated by backprop
		del self.module._parameters[self.name]
		# Adds a parameter to the module.
		self.module.register_parameter(self.name + '_u', u)
		self.module.register_parameter(self.name + '_v', v)
		self.module.register_parameter(self.name + '_matrices', w_real)
Example #9
    def _make_params(self):
        weight = getattr(self.module, self.name)
        # This variant assumes a 2-D weight (e.g. from nn.Linear).
        height, width = weight.size()

        u = Parameter(weight.data.new(height).normal_(0, 1), requires_grad=False)
        v = Parameter(weight.data.new(width).normal_(0, 1), requires_grad=False)
        u.data = l2normalize(u.data)
        v.data = l2normalize(v.data)
        w_bar = Parameter(weight.data)

        del self.module._parameters[self.name]

        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)
Example #10
    def _make_params(self, w_init):
        w = getattr(self.module, self.name)

        # Optional re-initialization: Kaiming (He) initialization, suited to ReLU networks
        if w_init:
            w.data.normal_(0.0, 0.1)
            w.data = init.kaiming_normal_(w.data)

        height = w.data.shape[0]
        width = w.view(height, -1).data.shape[1]

        u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
        v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
        u.data = l2normalize(u.data)
        v.data = l2normalize(v.data)
        w_bar = Parameter(w.data)

        del self.module._parameters[self.name]

        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)
Example #11
    def _make_params(self):
        #get the weight from the conv layer (module)
        w = getattr(self.module, self.name)

        height = w.data.shape[0]
        # flatten the weight matrix, keeping the first (output-channel) axis
        width = w.view(height, -1).data.shape[1]

        #initialize random vectors from isotropic distribution
        u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
        v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
        u.data = tl.l2normalize(u.data)
        v.data = tl.l2normalize(v.data)
        w_bar = Parameter(w.data)

        #delete the original weight
        del self.module._parameters[self.name]

        #store the vectors into the module as parameters
        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)
Example #12
    def _make_params(self):
        """
        No need to change. Initialize parameters.
        v: Initialize v with a random vector (sampled from an isotropic distribution).
        u: Initialize u with a random vector (sampled from an isotropic distribution).
        w: Weight of the current layer.
        """
        w = getattr(self.module, self.name)

        height = w.data.shape[0]
        width = w.view(height, -1).data.shape[1]

        u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
        v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
        u.data = l2normalize(u.data)
        v.data = l2normalize(v.data)
        w_bar = Parameter(w.data)

        del self.module._parameters[self.name]

        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)
Example #13
 def orthonormal_init(param: nn.Parameter, n_blocks: int):
     size0, size1 = param.size()
     size0 //= n_blocks
     size_min = min(size0, size1)
     init_values = []
     for _ in range(n_blocks):
         m1 = torch.randn(size0, size0, dtype=param.dtype)
         m2 = torch.randn(size1, size1, dtype=param.dtype)
         q1, r1 = torch.qr(m1)
         q2, r2 = torch.qr(m2)
         q1 *= torch.sign(torch.diag(r1))
         q2 *= torch.sign(torch.diag(r2))
         value = torch.mm(q1[:, :size_min], q2[:size_min, :])
         init_values.append(value)
     param.data = torch.cat(init_values, dim=0)
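A hedged usage sketch of the block-orthonormal initializer above, applied to a Linear weight (layer sizes are assumptions for illustration). Note that recent PyTorch versions prefer torch.linalg.qr over the deprecated torch.qr used here:

import torch
from torch import nn

layer = nn.Linear(in_features=128, out_features=96)
orthonormal_init(layer.weight, n_blocks=2)

# Each 48x128 block now has (approximately) orthonormal rows.
block = layer.weight.data[:48]
print(torch.allclose(block @ block.t(), torch.eye(48), atol=1e-4))  # True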
Example #14
 def assign(param: nn.Parameter,
            weight: Union[str, torch.Tensor],
            trans_fn: Optional[TransFn] = None,
            allow_fail: bool = False):
     param_key = next(k for k, v in to_params.items() if v is param)
     del to_params[param_key]  # delete regardless of whether weight exists
     if isinstance(weight, str):
         try:
             weight = get_weight(weight)
         except KeyError:
             if allow_fail:
                 print(f"Weight {weight} not found in checkpoint")
                 return
             else:
                 raise
     if trans_fn is not None:
         weight = trans_fn(weight).contiguous()
     if param.size() != weight.size():
         raise ValueError(f"Expected size {param.size()}, "
                          f"actual size {weight.size()}")
     param.data = weight
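assign relies on two names it does not define itself: to_params (a dict mapping parameter names to the model's Parameters, consumed as weights are assigned) and get_weight (a checkpoint lookup). A hedged, self-contained sketch of that surrounding context; the model, checkpoint keys, and transpose are assumptions for illustration, and assign above (with its typing imports and TransFn alias) is assumed to be in scope:

import torch
from torch import nn

model = nn.Linear(4, 3)
to_params = dict(model.named_parameters())
checkpoint = {"dense/kernel": torch.randn(4, 3), "dense/bias": torch.randn(3)}

def get_weight(name: str) -> torch.Tensor:
    return checkpoint[name]  # raises KeyError if the weight is missing

assign(model.weight, "dense/kernel", trans_fn=lambda w: w.t())
assign(model.bias, "dense/bias")
assert not to_params  # every parameter was assigned exactly once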
Example #15
    def split_it(self, A, n_to_fix, n_to_learn, zero_fixed_part=False):
        """Splits a weight matrix to two parts of given sizes and returns each part as a Parameter."""
        A_fixed = None
        A_learn = None
        if n_to_fix == 0:  # all learnable
            A_learn = Parameter(A)
            A_learn.requires_grad = True
            A = [A_learn]

        elif n_to_learn == 0:  # all fixed
            A_fixed = Parameter(A)
            A_fixed.requires_grad = False
            A = [A_fixed]
        else:
            A_fixed = Parameter(A[:n_to_fix])
            A_learn = Parameter(A[n_to_fix:])
            A_learn.requires_grad = True
            A = [A_fixed, A_learn]

        if zero_fixed_part and n_to_fix > 0:
            A_fixed.data = 0 * A_fixed.data
        if A_fixed is not None:
            A_fixed.requires_grad = False
        return A, A_fixed, A_learn
Example #16
    def _init_param_attributes(self, p: Parameter) -> None:
        """
        We manage several attributes on each Parameter instance. The first two
        are set by :func:`_shard_parameters_`:

            ``_is_sharded``: ``True`` if the Parameter is sharded or ``False``
                if the Parameter is intentionally not sharded (in which case we
                will all-reduce grads for this param).
            ``_orig_size``: the size of the original Parameter (before sharding)

        The remaining attributes are set here:
            ``_fp32_shard``: a single shard of the parameters in full precision
                (typically FP32, but this is dependent on the dtype of the model
                as it's passed in by the user). This can be on CPU or GPU
                depending on the value of *``cpu_offload``*.
            ``_fp16_shard``: if *``mixed_precision``* is ``True``, this will be
                a single shard of the parameters in FP16, used for all-gather.
            ``_full_param_padded``: the full weight (padded to be evenly
                divisible by ``world_size``), used for computation in the
                forward and backward pass. This will be resized in place and
                only materialized (via all-gather) as needed.
        """
        assert hasattr(p, "_is_sharded") and hasattr(p, "_orig_size")
        if hasattr(p, "_fp32_shard"):
            return

        # Compute device defaults to CUDA when *cpu_offload* is enabled, or the
        # param's current device otherwise (could be CPU).
        compute_device = torch.device("cuda") if self.cpu_offload else p.device

        # A single shard of the parameters in full precision.
        p._fp32_shard = p.data

        if self.mixed_precision:
            assert p._fp32_shard.dtype == torch.float32

            if self.cpu_offload:
                assert p._fp32_shard.device == torch.device("cpu")
                # If we plan to keep the FP32 parameters on CPU, then pinning
                # memory allows us to later use non-blocking transfers when moving
                # the FP32 param shard to compute_device.
                p._fp32_shard = p._fp32_shard.pin_memory()
                p.data = p._fp32_shard

            # In mixed precision mode, we maintain a reduced precision
            # (typically FP16) parameter shard on compute_device for performing
            # the computation in the forward/backward pass. We resize the
            # storage to size 0 at init (here) and re-materialize (by copying
            # from _fp32_shard) as needed.
            p._fp16_shard = torch.zeros_like(p._fp32_shard,
                                             device=compute_device,
                                             dtype=self.compute_dtype)
            free_storage_(p._fp16_shard)
        else:
            p._fp16_shard = None  # use _fp32_shard

        # We also maintain a full-sized parameter of type self.compute_dtype
        # (FP16 for mixed_precision or FP32 otherwise). We resize the
        # storage to size 0 at init (here) and only materialize as needed. The
        # storage may contain padding elements so that it is evenly divisible by
        # world_size, although these padding elements will be removed before the
        # relevant computation.
        if p._is_sharded:
            p._full_param_padded = torch.zeros(p.data.numel() *
                                               self.world_size,
                                               device=compute_device,
                                               dtype=self.compute_dtype)
            free_storage_(p._full_param_padded)

        if self.move_grads_to_cpu:
            # We can optionally move the grad shard to CPU during the backward
            # pass. In this case, it's important to pre-allocate the CPU grad
            # shard in pinned memory so that we can do a non-blocking transfer.
            p._cpu_grad = torch.zeros_like(p.data, device="cpu").pin_memory()
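free_storage_ is not shown in this example; a hedged sketch of what such a helper typically does (release a tensor's backing storage so it occupies no memory until it is re-materialized):

import torch

def free_storage_(data: torch.Tensor) -> None:
    # Resize the underlying storage to zero elements; the tensor keeps its
    # metadata and can be re-materialized later (e.g. via all-gather).
    if data.storage().size() > 0:
        data.storage().resize_(0)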
Example #17
def attack(model,
           criterion,
           img,
           label,
           eps,
           attack_type,
           iters,
           clean_clean_img=None):
    assert not model.training

    adv = img.clone().detach()
    adv = Parameter(adv, requires_grad=True)

    if attack_type == 'fgsm':
        iterations = 1
    else:
        iterations = iters

    if attack_type == 'pgd':
        step = 2 / 255
    else:
        step = eps / iterations

    noise = 0

    for j in range(iterations):
        outputs = None
        # aug_test, aug_test_lambda and normalize are expected to be defined in the enclosing scope.
        if aug_test is None:
            out_adv = model(normalize(adv.clone()))
            loss = criterion(out_adv, label)
            loss.backward()
        else:
            adv_aux = adv * (1.0 - aug_test_lambda)
            for i in range(
                    aug_test
            ):  # TODO Check why this uses so much memory... it ain't normal fam
                adv_aux = adv_aux + aug_test_lambda * clean_clean_img[
                    torch.randperm(label.size(0))]
                out = model(normalize(adv_aux))
                if outputs is None:
                    outputs = out
                else:
                    outputs += out
            out_adv = outputs / aug_test

            loss = criterion(out_adv, label)
            loss.backward()

        if attack_type == 'mim':
            adv_mean = torch.mean(torch.abs(adv.grad), dim=1, keepdim=True)
            adv_mean = torch.mean(torch.abs(adv_mean), dim=2, keepdim=True)
            adv_mean = torch.mean(torch.abs(adv_mean), dim=3, keepdim=True)
            adv.grad = adv.grad / adv_mean
            noise = noise + adv.grad
        else:
            assert adv.grad is not None
            noise = adv.grad

        # Optimization step
        adv.data = adv.data + step * noise.sign()
        #        adv.data = adv.data + step * adv.grad.sign()

        if attack_type == 'pgd':
            adv.data = torch.where(adv.data > img.data + eps, img.data + eps,
                                   adv.data)
            adv.data = torch.where(adv.data < img.data - eps, img.data - eps,
                                   adv.data)
        adv.data.clamp_(0.0, 1.0)

        adv.grad.data.zero_()

    return adv.detach()
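A hedged usage sketch of attack. The function reads aug_test, aug_test_lambda and normalize from the enclosing scope, so they are stubbed out here; the model and data are placeholders for illustration:

import torch
from torch import nn

aug_test, aug_test_lambda = None, 0.0
normalize = lambda x: x  # stand-in for the dataset's normalization transform

model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10)).eval()
images = torch.rand(8, 3, 32, 32)
labels = torch.randint(0, 10, (8,))

adv = attack(model, nn.functional.cross_entropy, images, labels,
             eps=8 / 255, attack_type='pgd', iters=10)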
Example #18
def _compress_module_param_dim(
    param: Parameter,
    target_dim: int,
    idxs_to_keep: Tensor,
    module: Optional[Module] = None,
    optimizer: Optional[Optimizer] = None,
):
    if param.dim() == 1:
        target_dim = 0

    if param.size(target_dim) == 1 and idxs_to_keep.numel() > 1:
        # DW Conv
        return

    if param.size(target_dim) % idxs_to_keep.size(0) != 0:
        _LOGGER.debug(
            "skipping compression of parameter due to shape incompatibility")
        return

    stride = param.data.size(target_dim) // idxs_to_keep.size(0)
    if stride > 1:
        idxs_to_keep = idxs_to_keep.reshape(-1, 1).expand(-1,
                                                          stride).reshape(-1)

    param.data = (param.data[idxs_to_keep, ...]
                  if target_dim == 0 else param.data[:, idxs_to_keep, ...])

    if param.grad is not None:
        param.grad = (param.grad[idxs_to_keep, ...]
                      if target_dim == 0 else param.grad[:, idxs_to_keep, ...])

    if (optimizer is not None and param in optimizer.state
            and ("momentum_buffer" in optimizer.state[param])):
        optimizer.state[param]["momentum_buffer"] = (
            optimizer.state[param]["momentum_buffer"][idxs_to_keep,
                                                      ...] if target_dim == 0
            else optimizer.state[param]["momentum_buffer"][:, idxs_to_keep,
                                                           ...])

    # update module attrs
    if module is not None:
        # Batch Norm
        if param.dim() == 1:
            if hasattr(module, "num_features"):
                module.num_features = param.size(0)
            # BN running mean and var are not stored as Parameters so we must
            # update them here
            if hasattr(module, "running_mean") and (module.running_mean.size(0)
                                                    == idxs_to_keep.size(0)):
                module.running_mean = module.running_mean[idxs_to_keep]
            if hasattr(module, "running_var") and (module.running_var.size(0)
                                                   == idxs_to_keep.size(0)):
                module.running_var = module.running_var[idxs_to_keep]

        # Linear
        elif target_dim == 0 and hasattr(module, "out_features"):
            module.out_features = param.size(0)
        elif target_dim == 1 and hasattr(module, "in_features"):
            module.in_features = param.size(1)
        # Conv
        elif target_dim == 0 and hasattr(module, "out_channels"):
            module.out_channels = param.size(0)
        elif target_dim == 1 and hasattr(module, "in_channels"):
            module.in_channels = param.size(1)

        if (hasattr(module, "groups") and module.groups > 1
                and (hasattr(module, "out_channels")
                     and hasattr(module, "in_channels"))):
            module.groups = param.size(0) // param.size(1)
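A hedged usage sketch: prune 8 of the 32 output channels of a Conv2d by passing a boolean keep-mask over the output dimension. The layer sizes and mask are assumptions for illustration, and the function above (with its imports) is assumed to be in scope:

import torch
from torch import nn

conv = nn.Conv2d(16, 32, kernel_size=3)
keep_mask = torch.ones(32, dtype=torch.bool)
keep_mask[24:] = False  # drop the last 8 output channels

_compress_module_param_dim(conv.weight, target_dim=0,
                           idxs_to_keep=keep_mask, module=conv)
_compress_module_param_dim(conv.bias, target_dim=0,
                           idxs_to_keep=keep_mask, module=conv)

print(conv.weight.shape, conv.out_channels)  # torch.Size([24, 16, 3, 3]) 24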
Example #19
def clamp_min_parameter(parameter: nn.Parameter, min_value: float) -> None:
    parameter.data = parameter.data.clamp_min(min_value)
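A hedged usage sketch: keep a learnable scale strictly positive after an update pushes it below zero (the parameter and bound are assumptions for illustration):

import torch
from torch import nn

scale = nn.Parameter(torch.tensor(0.05))
scale.data -= 0.1                       # pretend an update made it negative
clamp_min_parameter(scale, min_value=1e-6)
print(scale.item())                     # ~1e-06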