Example #1
class cnn_THS(clstm):
    def __init__(self, vocab_size, max_num_hidden_layers, embedding_dim, n_classes,
                 n_filters, filter_size, dropout, batch_size, b=0.99, n=0.01, s=0.2,
                 e=[0.5, 0.35, 0.2, 0.1, 0.05],  # exploration schedule, as in ONN_THS below
                 use_cuda=False):
        super().__init__(vocab_size, max_num_hidden_layers, embedding_dim, n_classes,
                         n_filters, filter_size, dropout, batch_size,
                         b=b, n=n, s=s, use_cuda=use_cuda)
        self.e = Parameter(torch.tensor(e), requires_grad=False)
        self.arms_values = Parameter(torch.arange(n_classes), requires_grad=False)
        self.explorations_mab = []

        for i in range(n_classes):
            self.explorations_mab.append(algs.ThompsomSampling(len(e)))

    def partial_fit(self, X_data, Y_data, exp_factor, show_loss=True):
        self.partial_fit_(X_data, Y_data, show_loss)
        self.explorations_mab[Y_data[0]].reward(exp_factor)

    def predict(self, X_data):
        pred = self.predict_(X_data)[0]
        exp_factor = self.explorations_mab[pred].select()[0]
        if np.random.uniform() < self.e[exp_factor]:
            removed_arms = self.arms_values.clone().numpy().tolist()
            removed_arms.remove(pred)
            return random.choice(removed_arms), exp_factor

        return pred, exp_factor
Example #2
class ONN_THS(ONN):
    def __init__(self,
                 features_size,
                 max_num_hidden_layers,
                 qtd_neuron_per_hidden_layer,
                 n_classes,
                 b=0.99,
                 n=0.01,
                 s=0.2,
                 e=[0.5, 0.35, 0.2, 0.1, 0.05],
                 use_cuda=False):
        super().__init__(features_size,
                         max_num_hidden_layers,
                         qtd_neuron_per_hidden_layer,
                         n_classes,
                         b=b,
                         n=n,
                         s=s,
                         use_cuda=use_cuda)
        self.e = Parameter(torch.tensor(e), requires_grad=False)
        self.arms_values = Parameter(torch.arange(n_classes),
                                     requires_grad=False)
        self.explorations_mab = []

        for i in range(n_classes):
            self.explorations_mab.append(algs.ThompsomSampling(len(e)))

    def partial_fit(self, X_data, Y_data, exp_factor, show_loss=True):
        self.partial_fit_(X_data, Y_data, show_loss)
        self.explorations_mab[Y_data[0]].reward(exp_factor)

    def predict(self, X_data):
        pred = self.predict_(X_data)[0]
        exp_factor = self.explorations_mab[pred].select()[0]
        if np.random.uniform() < self.e[exp_factor]:
            removed_arms = self.arms_values.clone().numpy().tolist()
            removed_arms.remove(pred)
            return random.choice(removed_arms), exp_factor

        return pred, exp_factor
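
Both THS variants share the same exploration step in predict: the Thompson-sampling bandit for the predicted class picks an arm exp_factor, and with probability e[exp_factor] the prediction is swapped for a different class. A minimal standalone sketch of just that step (illustrative values only; the ThompsomSampling helper from algs is not reproduced here):

import random
import numpy as np

def explore(pred, n_classes, e_values, exp_factor, rng=np.random):
    # With probability e_values[exp_factor], swap the predicted class for a
    # uniformly chosen different class; otherwise keep the prediction.
    if rng.uniform() < e_values[exp_factor]:
        other_arms = [c for c in range(n_classes) if c != pred]
        return random.choice(other_arms), exp_factor
    return pred, exp_factor

# Toy call with the constructor's default exploration schedule.
print(explore(pred=2, n_classes=4, e_values=[0.5, 0.35, 0.2, 0.1, 0.05], exp_factor=1))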
Example #3
class BinaryGatedLinear(Module):
    """
    Linear layer with stochastic binary gates
    """
    def __init__(self,
                 in_features,
                 out_features,
                 l0_strength=1.,
                 l2_strength=1.,
                 learn_weight=True,
                 bias=True,
                 droprate_init=0.5,
                 random_weight=True,
                 deterministic=False,
                 use_baseline_bias=False,
                 optimize_inference=False,
                 one_sample_per_item=False,
                 **kwargs):
        """
        :param in_features: Input dimensionality
        :param out_features: Output dimensionality
        :param bias: Whether we use a bias
        :param l2_strength: Strength of the L2 penalty
        :param droprate_init: Dropout rate that the gates will be initialized to
        :param l0_strength: Strength of the L0 penalty
        """
        super(BinaryGatedLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.l0_strength = l0_strength
        self.l2_strength = l2_strength
        self.deterministic = deterministic
        self.use_baseline_bias = use_baseline_bias
        self.optimize_inference = optimize_inference
        self.one_sample_per_item = one_sample_per_item

        self.random_weight = random_weight
        if random_weight:
            exc_weight = torch.Tensor(out_features, in_features)
            inh_weight = torch.Tensor(out_features, in_features)
        else:
            exc_weight = torch.ones(out_features, in_features)
            inh_weight = torch.ones(out_features, in_features)

        if learn_weight:
            self.exc_weight = Parameter(exc_weight)
            self.inh_weight = Parameter(inh_weight)
        else:
            self.register_buffer("exc_weight", exc_weight)
            self.register_buffer("inh_weight", inh_weight)

        self.exc_p1 = Parameter(torch.Tensor(out_features, in_features))
        self.inh_p1 = Parameter(torch.Tensor(out_features, in_features))

        self.droprate_init = droprate_init if droprate_init != 0. else 0.5
        self.use_bias = bias
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        if self.random_weight:
            init.kaiming_normal_(self.exc_weight, mode="fan_out")
            init.kaiming_normal_(self.inh_weight, mode="fan_out")
            self.exc_weight.data.abs_()
            self.inh_weight.data.abs_()
        self.exc_p1.data.normal_(1 - self.droprate_init, 1e-2)
        self.inh_p1.data.normal_(1 - self.droprate_init, 1e-2)
        if self.use_bias:
            self.bias.data.fill_(0)

    def constrain_parameters(self, **kwargs):
        self.exc_weight.data.clamp_(min=0.)
        self.inh_weight.data.clamp_(min=0.)

    def get_gate_probabilities(self):
        exc_p1 = torch.clamp(self.exc_p1.data, min=0., max=1.)
        inh_p1 = torch.clamp(self.inh_p1.data, min=0., max=1.)
        return exc_p1, inh_p1

    def weight_size(self):
        return self.exc_weight.size()

    def regularization(self):
        """
        Expected L0 norm under the stochastic gates; also accounts for and
        re-weights a potential L2 penalty.
        """
        if self.l0_strength > 0 or self.l2_strength > 0:
            # Clamp these, but do it in a way that still always propagates the
            # gradient.
            exc_p1 = self.exc_p1.clone()
            torch.clamp(exc_p1.data, min=0, max=1, out=exc_p1.data)
            inh_p1 = self.inh_p1.clone()
            torch.clamp(inh_p1.data, min=0, max=1, out=inh_p1.data)

            if self.l2_strength == 0:
                return self.l0_strength * (exc_p1 + inh_p1).sum()
            else:
                exc_weight_decay_ungated = (.5 * self.l2_strength *
                                            self.exc_weight.pow(2))
                inh_weight_decay_ungated = (.5 * self.l2_strength *
                                            self.inh_weight.pow(2))
                exc_weight_l2_l0 = torch.sum(
                    (exc_weight_decay_ungated + self.l0_strength) * exc_p1)
                inh_weight_l2_l0 = torch.sum(
                    (inh_weight_decay_ungated + self.l0_strength) * inh_p1)
                bias_l2 = (0 if not self.use_bias else torch.sum(
                    .5 * self.l2_strength * self.bias.pow(2)))
                return exc_weight_l2_l0 + inh_weight_l2_l0 + bias_l2
        else:
            return 0

    def get_inference_mask(self):
        exc_p1, inh_p1 = self.get_gate_probabilities()

        if self.deterministic:
            exc_mask = (exc_p1 >= 0.5).float()
            inh_mask = (inh_p1 >= 0.5).float()
            return exc_mask, inh_mask
        else:
            exc_count1 = exc_p1.sum(dim=1).round().int()
            inh_count1 = inh_p1.sum(dim=1).round().int()

            # pytorch doesn't offer topk with varying k values.
            exc_mask = torch.zeros_like(exc_p1)
            inh_mask = torch.zeros_like(inh_p1)
            for i in range(exc_count1.size()[0]):
                _, exc_indices = torch.topk(exc_p1[i], exc_count1[i].item())
                _, inh_indices = torch.topk(inh_p1[i], inh_count1[i].item())
                exc_mask[i].scatter_(-1, exc_indices, 1)
                inh_mask[i].scatter_(-1, inh_indices, 1)

            return exc_mask, inh_mask

    def sample_weight_and_bias(self):
        if self.training or not self.optimize_inference:
            w = (sample_weight(self.exc_p1, self.exc_weight,
                               self.deterministic) -
                 sample_weight(self.inh_p1, self.inh_weight,
                               self.deterministic))
        else:
            exc_mask, inh_mask = self.get_inference_mask()
            w = exc_mask * self.exc_weight - inh_mask * self.inh_weight

        b = None
        if self.use_baseline_bias:
            b = -w.sum(dim=-1) / 2

        if self.use_bias:
            b = (b + self.bias if b is not None else self.bias)

        return w, b

    def forward(self, x):
        if self.one_sample_per_item and self.training and len(x.size()) > 1:
            results = []
            for i in range(x.size(0)):
                w, b = self.sample_weight_and_bias()
                results.append(F.linear(x[i:i + 1], w, b))
            return torch.cat(results)
        else:
            w, b = self.sample_weight_and_bias()
            return F.linear(x, w, b)

    def get_expected_nonzeros(self):
        exc_p1, inh_p1 = self.get_gate_probabilities()

        # Flip two coins with probabilities pi_1 and pi_2. What is the
        # probability one of them is 1?
        #
        # 1 - (1 - pi_1)*(1 - pi_2)
        # = 1 - 1 + pi_1 + pi_2 - pi_1*pi_2
        # = pi_1 + pi_2 - pi_1*pi_2
        p1 = exc_p1 + inh_p1 - (exc_p1 * inh_p1)

        return p1.sum(dim=1).detach()

    def get_inference_nonzeros(self):
        exc_mask, inh_mask = self.get_inference_mask()

        return torch.sum(exc_mask.int() | inh_mask.int(), dim=1)

    def count_inference_flops(self):
        # For each unit, multiply with its n inputs then do n - 1 additions.
        # To capture the -1, subtract it, but only in cases where there is at
        # least one weight.
        nz_by_unit = self.get_inference_nonzeros()
        multiplies = torch.sum(nz_by_unit)
        adds = multiplies - torch.sum(nz_by_unit > 0)
        return multiplies.item(), adds.item()
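
get_expected_nonzeros above relies on the identity P(at least one of two independent gates fires) = p1 + p2 - p1*p2. A quick self-contained Monte-Carlo check of that identity with toy probabilities (torch only, not part of the source):

import torch

torch.manual_seed(0)
exc_p1 = torch.tensor([0.7, 0.2, 0.5])
inh_p1 = torch.tensor([0.1, 0.9, 0.5])

analytic = exc_p1 + inh_p1 - exc_p1 * inh_p1   # p1 + p2 - p1*p2

n = 200000
exc = torch.rand(n, 3) < exc_p1                # Bernoulli gate samples
inh = torch.rand(n, 3) < inh_p1
empirical = (exc | inh).float().mean(dim=0)    # fraction with at least one gate on

print(analytic)    # tensor([0.7300, 0.9200, 0.7500])
print(empirical)   # close to the analytic values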
Example #4
class BinaryGatedConv2d(Module):
    """
    Convolutional layer with binary stochastic gates
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 learn_weight=True,
                 bias=True,
                 droprate_init=0.5,
                 l2_strength=1.,
                 l0_strength=1.,
                 random_weight=True,
                 deterministic=False,
                 use_baseline_bias=False,
                 optimize_inference=True,
                 one_sample_per_item=False,
                 **kwargs):
        """
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param kernel_size: Size of the kernel
        :param stride: Stride for the convolution
        :param padding: Padding for the convolution
        :param dilation: Dilation factor for the convolution
        :param groups: How many groups we will assume in the convolution
        :param bias: Whether we will use a bias
        :param droprate_init: Dropout rate that the gates will be initialized to
        :param l2_strength: Strength of the L2 penalty
        :param l0_strength: Strength of the L0 penalty
        """
        super(BinaryGatedConv2d, self).__init__()
        if in_channels % groups != 0:
            raise ValueError("in_channels must be divisible by groups")
        if out_channels % groups != 0:
            raise ValueError("out_channels must be divisible by groups")
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = pair(kernel_size)
        self.stride = pair(stride)
        self.padding = pair(padding)
        self.dilation = pair(dilation)
        self.output_padding = pair(0)
        self.groups = groups
        self.l2_strength = l2_strength
        self.l0_strength = l0_strength
        self.droprate_init = droprate_init if droprate_init != 0. else 0.5
        self.deterministic = deterministic
        self.use_baseline_bias = use_baseline_bias
        self.optimize_inference = optimize_inference
        self.one_sample_per_item = one_sample_per_item

        self.random_weight = random_weight
        if random_weight:
            exc_weight = torch.Tensor(out_channels, in_channels // groups,
                                      *self.kernel_size)
            inh_weight = torch.Tensor(out_channels, in_channels // groups,
                                      *self.kernel_size)
        else:
            exc_weight = torch.ones(out_channels, in_channels // groups,
                                    *self.kernel_size)
            inh_weight = torch.ones(out_channels, in_channels // groups,
                                    *self.kernel_size)

        if learn_weight:
            self.exc_weight = Parameter(exc_weight)
            self.inh_weight = Parameter(inh_weight)
        else:
            self.register_buffer("exc_weight", exc_weight)
            self.register_buffer("inh_weight", inh_weight)
        self.exc_p1 = Parameter(
            torch.Tensor(out_channels, in_channels // groups,
                         *self.kernel_size))
        self.inh_p1 = Parameter(
            torch.Tensor(out_channels, in_channels // groups,
                         *self.kernel_size))
        self.dim_z = out_channels
        self.input_shape = None

        self.use_bias = bias
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))

        self.reset_parameters()

    def reset_parameters(self):
        if self.random_weight:
            init.kaiming_normal_(self.exc_weight, mode="fan_out")
            init.kaiming_normal_(self.inh_weight, mode="fan_out")
            self.exc_weight.data.abs_()
            self.inh_weight.data.abs_()
        self.exc_p1.data.normal_(1 - self.droprate_init, 1e-2)
        self.inh_p1.data.normal_(1 - self.droprate_init, 1e-2)

        if self.use_bias:
            self.bias.data.fill_(0)

    def constrain_parameters(self, **kwargs):
        self.exc_weight.data.clamp_(min=0.)
        self.inh_weight.data.clamp_(min=0.)

    def weight_size(self):
        return self.exc_weight.size()

    def regularization(self):
        """
        Expected L0 norm under the stochastic gates; also accounts for and
        re-weights a potential L2 penalty.
        """

        if self.l0_strength > 0 or self.l2_strength > 0:
            # Clamp these, but do it in a way that still always propagates the
            # gradient.
            exc_p1 = self.exc_p1.clone()
            torch.clamp(exc_p1.data, min=0, max=1, out=exc_p1.data)
            inh_p1 = self.inh_p1.clone()
            torch.clamp(inh_p1.data, min=0, max=1, out=inh_p1.data)

            if self.l2_strength == 0:
                return self.l0_strength * (exc_p1 + inh_p1).sum()
            else:
                exc_weight_decay_ungated = (.5 * self.l2_strength *
                                            self.exc_weight.pow(2))
                inh_weight_decay_ungated = (.5 * self.l2_strength *
                                            self.inh_weight.pow(2))
                exc_weight_l2_l0 = torch.sum(
                    (exc_weight_decay_ungated + self.l0_strength) * exc_p1)
                inh_weight_l2_l0 = torch.sum(
                    (inh_weight_decay_ungated + self.l0_strength) * inh_p1)
                bias_l2 = (0 if not self.use_bias else torch.sum(
                    .5 * self.l2_strength * self.bias.pow(2)))
                return exc_weight_l2_l0 + inh_weight_l2_l0 + bias_l2
        else:
            return 0

    def get_gate_probabilities(self):
        exc_p1 = torch.clamp(self.exc_p1.data, min=0., max=1.)
        inh_p1 = torch.clamp(self.inh_p1.data, min=0., max=1.)
        return exc_p1, inh_p1

    def get_inference_mask(self):
        exc_p1, inh_p1 = self.get_gate_probabilities()

        if self.deterministic:
            exc_mask = (exc_p1 >= 0.5).float()
            inh_mask = (inh_p1 >= 0.5).float()
            return exc_mask, inh_mask
        else:
            exc_count1 = exc_p1.sum(
                dim=tuple(range(1, len(exc_p1.shape)))).round().int()
            inh_count1 = inh_p1.sum(
                dim=tuple(range(1, len(inh_p1.shape)))).round().int()

            # pytorch doesn't offer topk with varying k values.
            exc_mask = torch.zeros_like(exc_p1)
            inh_mask = torch.zeros_like(inh_p1)
            for i in range(exc_count1.size()[0]):
                _, exc_indices = torch.topk(exc_p1[i].flatten(),
                                            exc_count1[i].item())
                _, inh_indices = torch.topk(inh_p1[i].flatten(),
                                            inh_count1[i].item())
                exc_mask[i].flatten().scatter_(-1, exc_indices, 1)
                inh_mask[i].flatten().scatter_(-1, inh_indices, 1)

            return exc_mask, inh_mask

    def sample_weight_and_bias(self, samples=1):
        if self.training or not self.optimize_inference:
            w = (sample_weight(self.exc_p1, self.exc_weight,
                               self.deterministic, samples) -
                 sample_weight(self.inh_p1, self.inh_weight,
                               self.deterministic, samples))
        else:
            exc_mask, inh_mask = self.get_inference_mask()
            w = exc_mask * self.exc_weight - inh_mask * self.inh_weight

        b = None
        if self.use_baseline_bias:
            b = -w.sum(dim=(-3, -2, -1)) / 2

        if self.use_bias:
            b = (b + self.bias if b is not None else self.bias)

        return w, b

    def forward(self, x):
        if self.input_shape is None:
            self.input_shape = x.size()

        if self.one_sample_per_item and self.training and len(x.size()) > 3:
            w, b = self.sample_weight_and_bias(x.size(0))

            if b is not None:  # b stays None when neither bias option is enabled
                if self.use_baseline_bias:
                    b = b.view(x.size(0) * self.out_channels)
                else:
                    b = b.repeat(x.size(0))

            x_ = x.view(1, x.size(0) * x.size(1), *x.size()[2:])
            w_ = w.view(w.size(0) * w.size(1), *w.size()[2:])
            result = F.conv2d(x_, w_, b, self.stride, self.padding,
                              self.dilation,
                              x.size(0) * self.groups)

            return result.view(x.size(0), self.out_channels,
                               *result.size()[2:])
        else:
            w, b = self.sample_weight_and_bias()
            return F.conv2d(x, w, b, self.stride, self.padding, self.dilation,
                            self.groups)

    def get_expected_nonzeros(self):
        exc_p1, inh_p1 = self.get_gate_probabilities()

        # Flip two coins with probabilities pi_1 and pi_2. What is the
        # probability one of them is 1?
        #
        # 1 - (1 - pi_1)*(1 - pi_2)
        # = 1 - 1 + pi_1 + pi_2 - pi_1*pi_2
        # = pi_1 + pi_2 - pi_1*pi_2
        p1 = exc_p1 + inh_p1 - (exc_p1 * inh_p1)

        return p1.sum(dim=tuple(range(1, len(p1.shape)))).detach()

    def get_inference_nonzeros(self):
        exc_mask, inh_mask = self.get_inference_mask()
        return torch.sum(exc_mask.int() | inh_mask.int(),
                         dim=tuple(range(1, len(exc_mask.shape))))

    def count_inference_flops(self):
        # For each unit, multiply with its n inputs then do n - 1 additions.
        # Only subtract 1 in cases where there is at least one weight.
        nz_by_unit = self.get_inference_nonzeros()
        multiplies_per_instance = torch.sum(nz_by_unit)
        adds_per_instance = multiplies_per_instance - torch.sum(nz_by_unit > 0)

        # for rows
        instances = ((self.input_shape[-2] - self.kernel_size[0] +
                      2 * self.padding[0]) / self.stride[0]) + 1
        # multiplying with cols
        instances *= ((self.input_shape[-1] - self.kernel_size[1] +
                       2 * self.padding[1]) / self.stride[1]) + 1

        multiplies = multiplies_per_instance * instances
        adds = adds_per_instance * instances

        return multiplies.item(), adds.item()
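
count_inference_flops above scales the per-position cost by the number of output positions, computed per spatial dimension as ((size - kernel + 2*padding) / stride) + 1; note the layer uses float division, which matches the true output size only when the stride divides evenly. A small self-contained check against F.conv2d for toy shapes:

import torch
import torch.nn.functional as F

H, W, k, p, s = 32, 32, 3, 1, 2
x = torch.randn(1, 3, H, W)
w = torch.randn(8, 3, k, k)
out = F.conv2d(x, w, stride=s, padding=p)

positions = (((H - k + 2 * p) // s) + 1) * (((W - k + 2 * p) // s) + 1)
print(out.shape[-2] * out.shape[-1], positions)   # both 256 for these values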
Example #5
class group_relaxed_L1L2Conv2d(Module):
    """Implementation of TF1 regularization for the feature maps of a convolutional layer"""
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 lamba=1.,
                 alpha=1.,
                 beta=4.,
                 weight_decay=1.,
                 **kwargs):
        """
		:param in_channels: Number of input channels
		:param out_channels: Number of output channels
		:param kernel_size: size of the kernel
		:param stride: stride for the convolution
		:param padding: padding for the convolution
		:param dilation: dilation factor for the convolution
		:param groups: how many groups we will assume in the convolution
		:param bias: whether we will use a bias
		:param lamba: strength of the TFL regularization
		"""
        super(group_relaxed_L1L2Conv2d, self).__init__()
        self.floatTensor = torch.FloatTensor if not torch.cuda.is_available(
        ) else torch.cuda.FloatTensor
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = pair(kernel_size)
        self.stride = pair(stride)
        self.padding = pair(padding)
        self.dilation = pair(dilation)
        self.output_padding = pair(0)
        self.groups = groups
        self.lamba = lamba
        self.alpha = alpha
        self.beta = beta
        self.lamba1 = self.lamba / self.beta
        self.weight_decay = weight_decay
        self.weight = Parameter(
            torch.Tensor(out_channels, in_channels // groups,
                         *self.kernel_size))
        self.u = torch.rand(out_channels, in_channels // groups,
                            *self.kernel_size)
        self.u = self.u.to('cuda')
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        self.input_shape = None
        print(self)

    def reset_parameters(self):
        init.kaiming_normal_(self.weight, mode='fan_in')

        if self.bias is not None:
            self.bias.data.normal_(0, 1e-2)

    def constrain_parameters(self, **kwargs):
        norm_w = self.weight.data.norm(p=float('inf'))
        if norm_w > self.lamba1:
            m = Softshrink(self.lamba1)
            z = m(self.weight.data)
            self.u.data = z * (z.data.norm(p=2) +
                               self.alpha * self.lamba1) / (z.data.norm(p=2))
        elif norm_w == self.lamba1:
            self.u = self.weight.clone()
            self.u[self.u.abs() < self.lamba1] = 0
            n = torch.sum(self.u != 0)
            self.u[self.u != 0] = (self.weight[self.u != 0].sign() *
                                   self.alpha * self.lamba1 / (n**0.5))

        elif (1 - self.alpha) * self.lamba1 < norm_w and norm_w < self.lamba1:
            self.u = self.weight.clone()
            max_idx = np.unravel_index(torch.argmax(self.u.cpu(), None),
                                       self.u.shape)
            max_value_sign = self.u[max_idx].sign()
            self.u[:] = 0
            self.u[max_idx] = (norm_w +
                               (self.alpha - 1) * self.lamba1) * max_value_sign
        else:
            self.u = self.weight.clone()
            self.u[:] = 0

    def grow_beta(self, growth_factor):
        self.beta = self.beta * growth_factor
        self.lamba1 = self.lamba / self.beta

    def _reg_w(self, **kwargs):
        logpw = -self.beta * torch.sum(
            0.5 * self.weight.add(-self.u).pow(2)) - self.lamba * np.sqrt(
                self.in_channels * self.kernel_size[0] * self.kernel_size[1]
            ) * torch.sum(
                torch.pow(torch.sum(self.weight.pow(2), 3).sum(2).sum(1), 0.5))
        logpb = 0
        if self.bias is not None:
            logpb = -torch.sum(self.weight_decay * .5 * (self.bias.pow(2)))
        return logpw + logpb

    def regularization(self):
        return self._reg_w()

    def count_zero_u(self):
        total = np.prod(self.u.size())
        zero = total - self.u.nonzero().size(0)
        return zero

    def count_zero_w(self):
        return torch.sum((self.weight.abs() < 1e-5).int()).item()

    def count_active_neuron(self):
        return torch.sum((torch.sum(self.weight.abs(), 3).sum(2).sum(1) /
                          (self.in_channels * self.kernel_size[0] *
                           self.kernel_size[1])) > 1e-5).item()

    def count_total_neuron(self):
        return self.out_channels

    def count_weight(self):
        return np.prod(self.u.size())

    def count_expected_flops_and_l0(self):
        #ppos = self.out_channels
        ppos = torch.sum(
            torch.sum(self.weight.abs(), 3).sum(2).sum(1) > 0.001).item()
        n = self.kernel_size[0] * self.kernel_size[1] * self.in_channels
        flops_per_instance = n + (n - 1)

        num_instances_per_filter = (
            (self.input_shape[1] - self.kernel_size[0] + 2 * self.padding[0]) /
            self.stride[0]) + 1
        num_instances_per_filter *= (
            (self.input_shape[2] - self.kernel_size[1] + 2 * self.padding[1]) /
            self.stride[1]) + 1

        flops_per_filter = num_instances_per_filter * flops_per_instance
        expected_flops = flops_per_filter * ppos
        expected_l0 = n * ppos

        if self.bias is not None:
            expected_flops += num_instances_per_filter * ppos
            expected_l0 += ppos
        return expected_flops, expected_l0

    def forward(self, input_):
        if self.input_shape is None:
            self.input_shape = input_.size()
        output = F.conv2d(input_, self.weight, self.bias, self.stride,
                          self.padding, self.dilation, self.groups)
        return output

    def __repr__(self):
        s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size} '
             ', stride={stride}')
        if self.padding != (0, ) * len(self.padding):
            s += ', padding={padding}'
        if self.dilation != (1, ) * len(self.dilation):
            s += ', dilation={dilation}'
        if self.output_padding != (0, ) * len(self.output_padding):
            s += ', output_padding={output_padding}'
        if self.groups != 1:
            s += ', groups={groups}'
        if self.bias is None:
            s += ', bias=False'
        s += ')'
        return s.format(name=self.__class__.__name__, **self.__dict__)
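
The usual branch of constrain_parameters above soft-thresholds the weights (Softshrink) and then rescales the result by (||z||_2 + alpha*lamba1) / ||z||_2. A standalone illustration of that update on a toy tensor (illustrative values, torch only):

import torch
from torch.nn import Softshrink

lamba1, alpha = 0.25, 1.0
w = torch.tensor([-1.0, -0.1, 0.0, 0.2, 0.9])

z = Softshrink(lamba1)(w)        # zero out |w| <= lamba1, shrink the rest towards zero
u = z * (z.norm(p=2) + alpha * lamba1) / z.norm(p=2)

print(z)   # tensor([-0.7500,  0.0000,  0.0000,  0.0000,  0.6500])
print(u)   # the rescaled update assigned to self.u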
Example #6
class group_relaxed_L1L2Dense(Module):
    """Implementation of TFL regularization for the input units of a fully connected layer"""
    def __init__(self,
                 in_features,
                 out_features,
                 bias=True,
                 lamba=1.,
                 alpha=1.,
                 beta=4.,
                 weight_decay=1.,
                 **kwargs):
        """
		:param in_features: input dimensionality
		:param out_features: output dimensionality
		:param bias: whether we use bias
		:param lamba: strength of the TF1 regularization
		"""
        super(group_relaxed_L1L2Dense, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(in_features, out_features))
        self.u = torch.rand(in_features, out_features)
        self.u = self.u.to('cuda')
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.lamba = lamba
        self.alpha = alpha
        self.beta = beta
        self.lamba1 = self.lamba / self.beta
        self.weight_decay = weight_decay
        self.floatTensor = torch.FloatTensor if not torch.cuda.is_available(
        ) else torch.cuda.FloatTensor
        self.reset_parameters()
        print(self)

    def reset_parameters(self):
        init.kaiming_normal_(self.weight, mode='fan_out')

        if self.bias is not None:
            self.bias.data.normal_(0, 1e-2)

    def constrain_parameters(self, **kwargs):
        norm_w = self.weight.data.norm(p=float('inf'))
        if norm_w > self.lamba1:
            m = Softshrink(self.lamba1)
            z = m(self.weight.data)
            self.u.data = z * (z.data.norm(p=2) +
                               self.alpha * self.lamba1) / (z.data.norm(p=2))
        elif norm_w == self.lamba1:
            self.u = self.weight.clone()
            self.u[self.u.abs() < self.lamba1] = 0
            n = torch.sum(self.u != 0)
            self.u[self.u != 0] = (self.weight[self.u != 0].sign() *
                                   self.alpha * self.lamba1 / (n**0.5))

        elif (1 - self.alpha) * self.lamba1 < norm_w and norm_w < self.lamba1:
            self.u = self.weight.clone()
            max_idx = np.unravel_index(torch.argmax(self.u.cpu(), None),
                                       self.u.shape)
            max_value_sign = self.u[max_idx].sign()
            self.u[:] = 0
            self.u[max_idx] = (norm_w +
                               (self.alpha - 1) * self.lamba1) * max_value_sign
        else:
            self.u = self.weight.clone()
            self.u[:] = 0

    def grow_beta(self, growth_factor):
        self.beta = self.beta * growth_factor
        self.lamba1 = self.lamba / self.beta

    def _reg_w(self, **kwargs):
        logpw = -self.beta * torch.sum(
            0.5 * self.weight.add(-self.u).pow(2)) - self.lamba * np.sqrt(
                self.out_features) * torch.sum(
                    torch.pow(torch.sum(self.weight.pow(2), 1), 0.5))
        logpb = 0
        if self.bias is not None:
            logpb = -torch.sum(self.weight_decay * .5 * (self.bias.pow(2)))
        return logpw + logpb

    def regularization(self):
        return self._reg_w()

    def count_zero_u(self):
        total = np.prod(self.u.size())
        zero = total - self.u.nonzero().size(0)
        return zero

    def count_zero_w(self):
        return torch.sum((self.weight.abs() < 1e-5).int()).item()

    def count_weight(self):
        return np.prod(self.u.size())

    def count_active_neuron(self):
        return torch.sum(
            torch.sum(self.weight.abs() / self.out_features, 1) > 1e-5).item()

    def count_total_neuron(self):
        return self.in_features

    def count_expected_flops_and_l0(self):
        ppos = torch.sum(self.weight.abs() > 0.000001).item()
        expected_flops = (2 * ppos - 1) * self.out_features
        expected_l0 = ppos * self.out_features
        if self.bias is not None:
            expected_flops += self.out_features
            expected_l0 += self.out_features
        return expected_flops, expected_l0

    def forward(self, input):
        output = input.mm(self.weight)
        if self.bias is not None:
            output.add_(self.bias.view(1, self.out_features).expand_as(output))
        return output

    def __repr__(self):
        return self.__class__.__name__+' (' \
         + str(self.in_features) + ' -> ' \
         + str(self.out_features) + ', lambda: ' \
         + str(self.lamba) + ')'
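
The group penalty inside _reg_w above is a row-wise L2 (L2,1) norm of the weight matrix scaled by sqrt(out_features), so an input unit whose entire row is zero contributes nothing. A self-contained computation of just that term for a toy weight matrix (illustrative values):

import numpy as np
import torch

lamba, out_features = 1.0, 4
W = torch.tensor([[0.0, 0.0, 0.0, 0.0],    # inactive input unit: contributes nothing
                  [0.3, -0.4, 0.0, 0.0],   # row norm 0.5
                  [1.0, 0.0, 0.0, 0.0]])   # row norm 1.0

row_norms = torch.pow(torch.sum(W.pow(2), 1), 0.5)          # per-input-unit L2 norm
group_penalty = lamba * np.sqrt(out_features) * torch.sum(row_norms)

print(row_norms)       # tensor([0.0000, 0.5000, 1.0000])
print(group_penalty)   # sqrt(4) * 1.5 = 3.0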
Example #7
class MFLinearLayer(nn.Module):
    def __init__(self, dim_in, dim_out, prior_var=1, init_var=-7):
        super().__init__()
        self.init_var = init_var
        self.dim_in = dim_in
        self.dim_out = dim_out
        self.W_mean = Parameter(torch.Tensor(dim_out, dim_in))
        self.b_mean = Parameter(torch.Tensor(dim_out))

        self.W_var = Parameter(torch.Tensor(dim_out, dim_in))
        self.b_var = Parameter(torch.Tensor(dim_out))

        self.W_prior_mean = torch.zeros([dim_out, dim_in], device=device)
        self.b_prior_mean = torch.zeros([dim_out], device=device)

        self.prior_var = prior_var

        self.W_prior_var = torch.ones([dim_out, dim_in], device=device).mul(
            np.log(self.prior_var))
        self.b_prior_var = torch.ones([dim_out], device=device).mul(
            np.log(self.prior_var))

        self.reset_parameters()

    def reset_parameters(self):
        init.kaiming_uniform_(self.W_mean, a=math.sqrt(5))

        fan_in, _ = init._calculate_fan_in_and_fan_out(self.W_mean)
        bound = 1 / math.sqrt(fan_in)
        init.uniform_(self.b_mean, -bound, bound)

        init.constant_(self.W_var, self.init_var)
        init.constant_(self.b_var, self.init_var)

    def add_new_task(self, reset_variance=True):
        self.W_prior_mean = self.W_mean.clone().detach().requires_grad_(False)
        self.b_prior_mean = self.b_mean.clone().detach().requires_grad_(False)

        self.W_prior_var = self.W_var.clone().detach().requires_grad_(False)
        self.b_prior_var = self.b_var.clone().detach().requires_grad_(False)

        if reset_variance:
            self.W_var.data = torch.min(
                self.W_var,
                self.init_var * torch.ones_like(self.W_var).data)
            self.b_var.data = torch.min(
                self.b_var,
                self.init_var * torch.ones_like(self.b_var).data)

            fan_in, _ = init._calculate_fan_in_and_fan_out(self.W_mean)
            bound = 1 / math.sqrt(fan_in)

            initialization_noise = torch.empty_like(self.W_mean)
            init.kaiming_uniform_(initialization_noise, a=math.sqrt(5))
            # self.W_mean.data = self.W_mean.data + (self.W_var > -2).float() * initialization_noise
            # self.b_mean.data = self.b_mean.data + (self.b_var > -2).float() * torch.empty_like(self.b_mean).uniform_(-bound, bound)

            self.W_mean.data = initialization_noise.data
            self.b_mean.data = torch.empty_like(self.b_mean).uniform_(
                -bound, bound).data

    def get_kl(self, lamb):
        W_kl = compute_kl(self.W_mean,
                          self.W_var,
                          self.W_prior_mean,
                          self.W_prior_var,
                          lamb=lamb,
                          initial_prior_var=self.prior_var)
        b_kl = compute_kl(self.b_mean,
                          self.b_var,
                          self.b_prior_mean,
                          self.b_prior_var,
                          lamb=lamb,
                          initial_prior_var=self.prior_var)
        return W_kl + b_kl

    def forward(self, x):
        output_mean = x.matmul(
            self.W_mean.t()) + self.b_mean.unsqueeze(0).unsqueeze(0)
        output_std = torch.sqrt(
            (x**2).matmul(torch.exp(self.W_var.t())) +
            torch.exp(self.b_var).unsqueeze(0).unsqueeze(0))
        eps = torch.empty(output_mean.shape, device=device).normal_(mean=0,
                                                                    std=1)

        output = output_mean + (eps * output_std)
        return output
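
The forward pass above samples activations with the local reparameterization trick: the pre-activation mean is x @ W_mean^T + b_mean, the variance is x^2 @ exp(W_var)^T + exp(b_var), and unit-Gaussian noise is scaled by the standard deviation. A minimal 2-D sketch of that sampling (toy shapes; the source adds extra unsqueeze calls for its 3-D inputs):

import torch

torch.manual_seed(0)
x = torch.randn(2, 5)                                             # batch of 2, 5 features
W_mean, b_mean = torch.randn(3, 5), torch.zeros(3)
W_var, b_var = torch.full((3, 5), -7.0), torch.full((3,), -7.0)   # log-variances

mean = x.matmul(W_mean.t()) + b_mean
std = torch.sqrt((x ** 2).matmul(torch.exp(W_var.t())) + torch.exp(b_var))
out = mean + torch.randn_like(mean) * std                         # one sample per forward pass
print(out.shape)                                                  # torch.Size([2, 3])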
Example #8
class MFConvLayer(torch.nn.modules.conv._ConvNd):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 padding_mode='zeros',
                 prior_var=1,
                 init_var=-7):
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        super().__init__(in_channels, out_channels, kernel_size, stride,
                         padding, dilation, False, _pair(0), groups, bias,
                         padding_mode)

        self.init_var = init_var

        self.W_prior_mean = torch.zeros(self.weight.shape, device=device)
        self.b_prior_mean = torch.zeros(self.bias.shape, device=device)

        self.prior_var = prior_var
        self.W_prior_var = torch.ones(self.weight.shape, device=device).mul(
            np.log(self.prior_var))
        self.b_prior_var = torch.ones(self.bias.shape, device=device).mul(
            np.log(self.prior_var))

        self.weight_var = Parameter(torch.Tensor(self.weight.shape))
        self.bias_var = Parameter(torch.Tensor(self.bias.shape))

        self.reset_parameters()

    def conv2d_forward(self, input, weight, bias):
        if self.padding_mode == 'circular':
            expanded_padding = ((self.padding[1] + 1) // 2,
                                self.padding[1] // 2,
                                (self.padding[0] + 1) // 2,
                                self.padding[0] // 2)
            return F.conv2d(F.pad(input, expanded_padding,
                                  mode='circular'), weight, bias, self.stride,
                            _pair(0), self.dilation, self.groups)
        return F.conv2d(input, weight, bias, self.stride, self.padding,
                        self.dilation, self.groups)

    def reset_parameters(self):
        super().reset_parameters()
        if hasattr(self, 'weight_var'):
            init.constant_(self.weight_var, self.init_var)
            init.constant_(self.bias_var, self.init_var)

    def add_new_task(self):
        self.W_prior_mean = self.weight.clone().detach().requires_grad_(False)
        self.b_prior_mean = self.bias.clone().detach().requires_grad_(False)

        self.W_prior_var = self.weight_var.clone().detach().requires_grad_(
            False)
        self.b_prior_var = self.bias_var.clone().detach().requires_grad_(False)

        self.weight_var.data = torch.min(
            self.weight_var,
            self.init_var * torch.ones_like(self.weight_var).data)
        self.bias_var.data = torch.min(
            self.bias_var,
            self.init_var * torch.ones_like(self.bias_var).data)

        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in)

        initialization_noise = torch.empty_like(self.weight)
        init.kaiming_uniform_(initialization_noise, a=math.sqrt(5))
        # self.weight.data = self.weight.data + (self.weight_var > -2).float() * initialization_noise
        # self.bias.data = self.bias.data + (self.bias_var > -2).float() * torch.empty_like(self.bias).uniform_(-bound, bound)

        self.weight.data = initialization_noise.data
        self.bias.data = torch.empty_like(self.bias).uniform_(-bound,
                                                              bound).data

    def get_kl(self, lamb):
        W_kl = compute_kl(self.weight,
                          self.weight_var,
                          self.W_prior_mean,
                          self.W_prior_var,
                          lamb=lamb,
                          initial_prior_var=self.prior_var)
        b_kl = compute_kl(self.bias,
                          self.bias_var,
                          self.b_prior_mean,
                          self.b_prior_var,
                          lamb=lamb,
                          initial_prior_var=self.prior_var)

        return W_kl + b_kl

    def forward(self, input):
        output_mean = self.conv2d_forward(input, self.weight, self.bias)
        output_var = self.conv2d_forward(input**2, torch.exp(self.weight_var),
                                         torch.exp(self.bias_var))

        eps = torch.empty(output_mean.shape, device=device).normal_(mean=0,
                                                                    std=1)
        output = output_mean + torch.sqrt(output_var + 1e-9) * eps

        return output
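
get_kl above delegates to a compute_kl helper that is not shown (its lamb and initial_prior_var arguments suggest a tempered or rescaled variant). For reference, the standard closed-form KL between factorized Gaussians in the same log-variance parameterization; this is a generic formula and an assumption about what the helper builds on, not the source implementation:

import torch

def diag_gaussian_kl(mean_q, logvar_q, mean_p, logvar_p):
    # KL(q || p) for elementwise-independent Gaussians, summed over all entries.
    var_q, var_p = torch.exp(logvar_q), torch.exp(logvar_p)
    kl = 0.5 * (logvar_p - logvar_q + (var_q + (mean_q - mean_p) ** 2) / var_p - 1.0)
    return kl.sum()

# Identical posterior and prior give zero divergence.
m = torch.zeros(4, 3)
print(diag_gaussian_kl(m, m, m, m))   # tensor(0.)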
Example #9
class my_Linear(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(my_Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        self._mode = 0
        self._verbose = False
        self._bverbose = False
        self._value = None     ## save max value
        self._index = None     ## save max position

    def setMode(self, m):
        self._mode = m

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        tweight = self.weight.clone()
        if(self._mode == 2):  ## find path
            if(input.shape[0] > 1):  ## max & min input
                max_input = input[0].clone().unsqueeze(0)  ## max
                min_input = input[1].clone().unsqueeze(0)  ## min
            else:                    ## max only
                max_input = input.clone()
                min_input = input.clone() * 0

            maxpos = None
            if self._verbose:
                print('== input ==')
                print(input.shape)
                print(input)
                print('== weight ==')
                print(self.weight.shape)
                print(self.weight)
                print('== bias ==')
                print(self.bias)
            tx = []
            tx_min = []
            ws = self.weight.shape
            bias = self.bias.clone()
            bias *= 0
            print('linear max node : ', ws[1], file=sys.stderr)
            for py in range(ws[1]):  ## out-feature
                tweight *= 0
                tweight[:,py] = self.weight[:,py].data 
                tx.append(F.linear(max_input, tweight, bias))
                tx_min.append(F.linear(min_input, tweight, bias))
                if(py % 100 == 0):
                    print('processed node : %d \r' % py, file=sys.stderr, end='')
                if self._verbose:
                    print('===iter ', py, ' ===')
                    print(tweight)
                    print(tx[py])
                    print(tx_min[py])
            ## make maximum result
            maxv = torch.max(torch.stack(tx + tx_min), axis=0)
            minv = torch.min(torch.stack(tx + tx_min), axis=0)

            self._value = maxv[0].data
            self._value_min = minv[0].data

            maxi = maxv[1].data
            maxi[ maxi >= ws[1] ] *= -1
            maxi[ maxi < 0 ] += (ws[1]-1)  ## so that indices start from -1
            self._index = maxi.data

            mini = minv[1].data
            mini[ mini >= ws[1] ] *= -1
            mini[ mini < 0 ] += (ws[1]-1)
            self._index_min = mini.data

            if self._verbose:
                #print(torch.stack(tx + tx_min))
                print(self._value) 
                print(self._index) 
                print(self._value_min) 
                print(self._index_min) 

            return torch.cat([self._value, self._value_min])
            
        elif(self._mode == 1): ## normal mode
            return F.linear(input, self.weight, self.bias)

        else:
            return F.linear(input, self.weight, self.bias)
        
    def getValue(self, pos):
        if(pos >= 0):
            v = self._value.flatten()[pos] ## position  
        else:
            npos = -1 * (pos+1) ## begins from -1
            v = self._value_min.flatten()[npos]
        return v   

    def getIndex(self, pos):
        if(pos >= 0):
            tpos = self._index.flatten()[pos].item() ## 
        else: 
            npos = -1 * (pos+1) ## begins from -1
            tpos = self._index_min.flatten()[npos].item()
        return tpos

    def getOutShape(self):
        if(self._value is None): return None
        return self._value.shape

    def getWeight(self, cpos, upos):   ## cpos : current, upos : under pos
        if(cpos < 0):
            cpos = -1 * (cpos+1)
        if(upos < 0):
            upos = -1 * (upos+1)
        return self.weight[cpos, upos]

    def backward(self, input):
        ##1. use last tensor ( upper layer result)
        current_pos = int(input[-1, 0].item())               ## current position
        current_val = self.getValue(current_pos) 
        input[-1,1] = current_val  ## set current val
        ##2. make under layer information
        under_pos = self.getIndex(current_pos)
        under_out = torch.tensor([[under_pos, current_val, 0.0, 0.0]])
        #for saving weight
        weight = self.getWeight(current_pos, under_pos)
        input[-1,2] = weight.data
        out = torch.cat([input, under_out], dim=0)
        if self._bverbose:
            print('=== linear backward ===')
            print('selected class = ', current_pos)
            print('max value = ', current_val)
            print('position in under layer = ',  under_pos)
            print('used weight = ', weight)
            print('-- input')
            print(input)
            print('-- output')
            print(out)
            print('======')
        
        return out

    def back_candidate(self, path, underpath, not_input):
        p = []
        cp = int(path[0].item())
        up = int(underpath[0].item())
        for px in range(self.weight.shape[1]):
            if(px == up): continue  ## check identity
            tweight = self.weight[cp, px]
            p.append(torch.tensor([px, tweight, 0.0]))      
            if(not_input):
                p.append(torch.tensor([-1*(px+1), tweight, 0.0]))      
        return p

    def path_forward(self, input_val, path):
        cpos = int(path[0].item())    # [cpos, value, weight]
        if(input_val is None):
            return self.getValue(cpos)
        
        cweight = path[2]
        return input_val * cweight 

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )
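
In mode 2, the loop above keeps one weight column at a time, so each stacked output is just the contribution of a single input feature, and the max/min over the stack picks the strongest (or weakest) single-feature path per output. A vectorized standalone sketch of the same quantity (toy tensors, illustrative only):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(1, 5)                       # one input row
W = torch.randn(3, 5)                       # (out_features, in_features)

contrib = x.unsqueeze(1) * W.unsqueeze(0)   # contrib[0, i, j] = x[0, j] * W[i, j]
max_val, max_idx = contrib.max(dim=-1)      # strongest single-feature path per output
print(max_val, max_idx)

# Matches one pass of the loop: zero every column except j, then apply F.linear.
j = 2
tweight = torch.zeros_like(W)
tweight[:, j] = W[:, j]
print(torch.allclose(F.linear(x, tweight), contrib[:, :, j]))   # True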
Example #10
class DenseFCLayer(torch.nn.Module):
    def __init__(self,
                 n_inputs=None,
                 n_outputs=None,
                 weights: torch.Tensor = None,
                 use_biases=True,
                 activation=None):
        super(DenseFCLayer, self).__init__()
        if n_inputs is not None and n_outputs is not None:
            self.n_inputs = n_inputs
            self.n_outputs = n_outputs
            self._activation = activation
            self._initial_weights = None

            self._weights = Parameter(torch.Tensor(n_inputs, n_outputs))
            self._init_weights()
            self._mask = torch.ones_like(self._weights)
            self._initial_weights = self._weights.clone()
            self.use_biases = use_biases

            if self.use_biases:
                self._biases = Parameter(torch.Tensor(n_outputs))
                self._init_biases()
        elif weights is not None:
            self.n_inputs = weights.size(0)
            self.n_outputs = weights.size(1)
            self._activation = activation
            self._initial_weights = weights

            self._weights = Parameter(weights)
            self._mask = torch.ones_like(self._weights)

            self._biases = Parameter(torch.Tensor(self.n_outputs))
            self._init_biases()
        else:
            raise ValueError(
                "DenseFClayer class accepts either n_inputs/n_outputs or weights"
            )

    def _init_weights(self):
        # Note the difference between init functions
        # torch.nn.init.xavier_normal_(self._weights)
        # torch.nn.init.xavier_uniform_(self._weights)
        # torch.nn.init.kaiming_normal_(self._weights)
        torch.nn.init.kaiming_uniform_(self._weights)

    def _init_biases(self):
        torch.nn.init.zeros_(self._biases)

    def prune_by_threshold(self, thr):
        self._mask *= (torch.abs(self._weights) >= thr).float()

    def prune_by_rank(self, rank):
        weights_val = self._weights[self._mask == 1]
        sorted_abs_weights = torch.sort(torch.abs(weights_val))[0]
        thr = sorted_abs_weights[rank]
        self.prune_by_threshold(thr)

    def prune_by_pct(self, pct):
        prune_idx = int(self.n_weights * pct)
        self.prune_by_rank(prune_idx)

    def prune_by_pct_taylor(self, pct):
        prune_idx = int(self.n_weights * pct)

        # by abs val
        wg = torch.abs(self._weights[self._mask == 1] *
                       self._weights.grad[self._mask == 1])
        sorted_wg = torch.sort(wg)[0]
        thr = sorted_wg[prune_idx]
        print(thr)
        self._mask *= (torch.abs(self._weights * self._weights.grad) >
                       thr).float()

        # by val
        # wg = self._weights[self._mask == 1] * self._weights.grad[self._mask == 1]
        # sorted_wg = torch.sort(wg)[0]
        # thr = sorted_wg[prune_idx]
        # self._mask *= (self._weights * self._weights.grad >= thr).float()

    def random_prune_by_pct(self, pct):
        prune_idx = int(self.n_weights * pct)
        rand = torch.rand(size=self._mask.size(), device=self._mask.device)
        rand_val = rand[self._mask == 1]
        sorted_abs_rand = torch.sort(rand_val)[0]
        thr = sorted_abs_rand[prune_idx]
        self._mask *= (rand >= thr).float()

    def reinitialize(self):
        self._weights = Parameter(self._initial_weights)
        self._init_biases()  # biases are reinitialized

    def to_sparse(self) -> SparseFCLayer:
        return SparseFCLayer((self._weights * self._mask).t().to_sparse(),
                             self._biases.reshape((-1, 1)), self._activation)

    @classmethod
    def from_sparse(cls, s_layer: SparseFCLayer):
        return cls(weights=s_layer.weights.t().to_dense(),
                   activation=s_layer.activation)

    def to_device(self, device: torch.device):
        self._initial_weights = self._initial_weights.to(device)
        self._mask = self._mask.to(device)

    def forward(self, inputs: torch.Tensor, use_mask=True):
        masked_weights = self._weights
        if use_mask:
            masked_weights = self._weights * self._mask
        if self.use_biases:
            ret = torch.addmm(self._biases, inputs, masked_weights)
        else:
            ret = torch.mm(inputs, masked_weights)
        return ret if self._activation is None else self._activation(ret)

    @property
    def mask(self):
        return self._mask

    @property
    def weights(self):
        return self._weights

    @property
    def activation(self):
        return self._activation

    @property
    def n_weights(self):
        return torch.nonzero(self._mask).size(0)

    @property
    def biases(self):
        if self.use_biases:
            return self._biases
        else:
            return None

    def __str__(self):
        return "DenseFClayer with size {} and activation {}".format(
            (self.n_inputs, self.n_outputs), self._activation)
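
prune_by_pct above turns a percentage into a rank among the currently unmasked weights and then thresholds on absolute value. A self-contained sketch of that magnitude-pruning step on a toy weight matrix (illustrative only; the class keeps the mask as a separate tensor in the same spirit):

import torch

torch.manual_seed(0)
weights = torch.randn(4, 4)
mask = torch.ones_like(weights)

pct = 0.5
n_alive = torch.nonzero(mask).size(0)
rank = int(n_alive * pct)
alive = weights[mask == 1]
thr = torch.sort(alive.abs())[0][rank]        # magnitude at the target rank
mask *= (weights.abs() >= thr).float()        # zero out the smallest-magnitude weights

print(int(mask.sum().item()), "of", n_alive, "weights kept")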
Example #11
class my_Linear(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(my_Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        self._mode = 0
        self._verbose = False
        self._bverbose = True
        self._value = None  ## save max value
        self._index = None  ## save max position

    def setMode(self, m):
        self._mode = m

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        tweight = self.weight.clone()
        if (self._mode == 2):  ## find path
            maxpos = None
            if self._verbose:
                print('== input ==')
                print(input.shape)
                print(input)
                print('== weight ==')
                print(self.weight.shape)
                print(self.weight)
                print('== bias ==')
                print(self.bias)
            tx = []
            ws = self.weight.shape
            bias = self.bias.clone()
            bias *= 0
            print('linear max node : ', ws[1], file=sys.stderr)
            for py in range(ws[1]):  ## out-feature
                tweight *= 0
                tweight[:, py] = self.weight[:, py].data
                tx.append(F.linear(input, tweight, bias))
                if (py % 1000 == 0):
                    print('processed node : %d \r' % py,
                          file=sys.stderr,
                          end='')
                if self._verbose:
                    print('===iter ', py, ' ===')
                    print(tweight)
                    print(tx[py])
            ## make maximum result
            ts = torch.stack(tx)
            maxv = torch.max(ts, axis=0)

            self._value = maxv[0].data
            self._index = maxv[1].data

            if self._verbose:
                print(self._value)
                print(self._index)

            return self._value

        elif (self._mode == 1):  ## normal mode
            return F.linear(input, self.weight, self.bias)

        else:
            return F.linear(input, self.weight, self.bias)

    def getValue(self, pos):
        return self._value.flatten()[pos]  ## position

    def getIndex(self, pos):
        tpos = self._index.flatten()[pos].item()  ##
        return tpos

    def getOutShape(self):
        return self._value.shape

    def backward(self, input):
        ## use last tensor ( upper layer result)
        current_pos = int(input[-1, 0].item())  ## current position
        current_val = self.getValue(current_pos)
        under_pos = self.getIndex(current_pos)
        under_out = torch.tensor([[under_pos, current_val, 0]])
        out = torch.cat([input, under_out], dim=0)
        if self._bverbose:
            print('=== linear backward ===')
            print('selected class = ', current_pos)
            print('max value = ', current_val)
            print('position in under layer = ', under_pos)
            print('-- input')
            print(input)
            print('-- output')
            print(out)
            print('======')

        return out

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None)
Example #12
class group_relaxed_SCAD_Dense(Module):
	"""Implementation of TFL regularization for the input units of a fully connected layer"""
	def __init__(self, in_features, out_features, bias=True, lamba=1., alpha = 3.7, beta = 4.0, weight_decay=1., **kwargs):
		"""
		:param in_features: input dimensionality
		:param out_features: output dimensionality
		:param bias: whether we use bias
		:param lamba: strength of the TF1 regularization
		"""
		super(group_relaxed_SCAD_Dense,self).__init__()
		self.in_features = in_features
		self.out_features = out_features
		self.weight = Parameter(torch.Tensor(in_features, out_features))
		self.u = torch.rand(in_features, out_features)
		self.u = self.u.to('cuda')
		if bias:
			self.bias = Parameter(torch.Tensor(out_features))
		else:
			self.register_parameter('bias', None)
		self.lamba = lamba
		self.alpha = alpha
		self.beta = beta
		self.lamba1 = self.lamba/self.beta
		self.weight_decay = weight_decay
		self.floatTensor = torch.FloatTensor if not torch.cuda.is_available() else torch.cuda.FloatTensor
		self.reset_parameters()
		print(self)

	def reset_parameters(self):
		init.kaiming_normal_(self.weight, mode='fan_out')

		if self.bias is not None:
			self.bias.data.normal_(0,1e-2)


	def constrain_parameters(self, **kwargs):
		self.u = self.weight.clone()
		s = Softshrink(self.lamba1)
		#shrinkage on values with absolute value less than 2*lamba1
		shrink_value = s(self.weight.data)
		self.u[self.weight.abs()<=2*self.lamba1] = shrink_value[self.weight.abs()<=2*self.lamba1]

		#modify values whose absolute value lies between 2*lamba1 and alpha*lamba1
		modify_weight = self.weight.data
		# use self.alpha rather than the hard-coded default value 3.7
		modify_weight = ((self.alpha - 1)*modify_weight - modify_weight.sign()*(self.alpha*self.lamba1))/(self.alpha - 2)
		mid_range = (self.weight.abs() > 2*self.lamba1) & (self.weight.abs() <= self.alpha*self.lamba1)
		self.u[mid_range] = modify_weight[mid_range]


	def grow_beta(self, growth_factor):
		self.beta = self.beta*growth_factor
		self.lamba1 = self.lamba/self.beta

	def _reg_w(self, **kwargs):
		logpw = (-self.beta*torch.sum(0.5*self.weight.add(-self.u).pow(2))
			- self.lamba*np.sqrt(self.out_features)
			* torch.sum(torch.pow(torch.sum(self.weight.pow(2), 1), 0.5)))
		logpb = 0
		if self.bias is not None:
			logpb = - torch.sum(self.weight_decay * .5 * (self.bias.pow(2)))
		return logpw + logpb

	def regularization(self):
		return self._reg_w()

	def count_zero_u(self):
		total = np.prod(self.u.size())
		zero = total - self.u.nonzero().size(0)
		return zero

	def count_zero_w(self):
		return torch.sum((self.weight.abs()<1e-5).int()).item()

	def count_weight(self):
		return np.prod(self.u.size())

	def count_active_neuron(self):
		return torch.sum(torch.sum(self.weight.abs()/self.out_features,1)>1e-5).item()

	def count_total_neuron(self):
		return self.in_features

	def count_expected_flops_and_l0(self):
		ppos = torch.sum(self.weight.abs()>0.000001).item()
		expected_flops = (2*ppos-1)*self.out_features
		expected_l0 = ppos*self.out_features
		if self.bias is not None:
			expected_flops += self.out_features
			expected_l0 += self.out_features
		return expected_flops, expected_l0

	def forward(self, input):
		output = input.mm(self.weight)
		if self.bias is not None:
			output.add_(self.bias.view(1, self.out_features).expand_as(output))
		return output

	def __repr__(self):
		return self.__class__.__name__+' (' \
			+ str(self.in_features) + ' -> ' \
			+ str(self.out_features) + ', lambda: ' \
			+ str(self.lamba) + ')'
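
A minimal usage sketch for the group_relaxed_SCAD_Dense layer above (not part of the original example; the sizes, variable names, and training step are illustrative assumptions, and the torch import of the surrounding file is assumed):

# hypothetical usage, for illustration only
device = 'cuda' if torch.cuda.is_available() else 'cpu'
layer = group_relaxed_SCAD_Dense(in_features=64, out_features=32, lamba=1.0).to(device)
x = torch.randn(8, 64, device=device)                # a batch of 8 inputs
out = layer(x)                                       # plain affine forward pass
loss = out.pow(2).mean() - layer.regularization()    # subtracting the log-prior adds the SCAD penalty
loss.backward()
layer.constrain_parameters()                         # SCAD-threshold u towards the current weights
layer.grow_beta(1.1)                                 # gradually tighten the w-u coupling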
Beispiel #13
0
class ElementWiseConv2d(nn.Module):
    """Modified conv with masks for weights."""
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=False,
                 mask_init='uniform',
                 mask_scale=1e-2,
                 threshold_fn='binarizer',
                 threshold=0.0):
        super(ElementWiseConv2d, self).__init__()
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        self.mask_scale = mask_scale
        self.mask_init = mask_init

        if in_channels % groups != 0:
            raise ValueError('in_channels must be divisible by groups')
        if out_channels % groups != 0:
            raise ValueError('out_channels must be divisible by groups')

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.transposed = False
        self.output_padding = _pair(0)
        self.groups = groups
        # bias is not used by this masked conv (forward passes bias=None);
        # register it as None so __repr__ and state_dict stay consistent
        self.register_parameter('bias', None)

        # imagenet pretrained weight
        self.imagenet_weight = Parameter(torch.Tensor(out_channels,
                                                      in_channels // groups,
                                                      *kernel_size),
                                         requires_grad=True)

        # place365 weight  no bias now
        self.place365_weight = Parameter(torch.Tensor(out_channels,
                                                      in_channels // groups,
                                                      *kernel_size),
                                         requires_grad=True)

        # Initialize real-valued mask weights.
        self.mask_real = self.imagenet_weight.data.new(
            self.imagenet_weight.size())

        if mask_init == '1s':
            self.mask_real.fill_(mask_scale)

        elif mask_init == 'uniform':
            self.mask_real.uniform_(-1 * mask_scale, mask_scale)

        # mask_real is now a trainable parameter.
        self.mask_real = Parameter(self.mask_real)
        '''
        # changed for auto threshold
        self.threshold = nn.Parameter(torch.Tensor([threshold]), requires_grad = False)
        '''

        # Initialize the thresholder.
        if threshold_fn == 'binarizer':
            print('Calling binarizer with threshold:', threshold)
            self.threshold_fn = Binarizer(threshold=threshold)
        elif threshold_fn == 'ternarizer':
            print('Calling ternarizer with threshold:', threshold)
            self.threshold_fn = Ternarizer(threshold=threshold)

    def forward(self, input):
        # Get binarized/ternarized mask from real-valued mask.
        #mask_thresholded = self.threshold_fn(self.mask_real)

        #mask_thresholded = torch.sigmoid(self.mask_real)
        prob_data = self.mask_real.clone()
        prob_data[self.mask_real.le(0.5)] = 0
        prob_data[self.mask_real.gt(0.5)] = 1
        mask_thresholded = (prob_data -
                            self.mask_real).detach() + self.mask_real
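        # (prob_data - mask_real).detach() + mask_real is a straight-through
        # estimator: the forward pass sees the hard 0/1 mask while gradients
        # flow to mask_real as if the thresholding were the identity.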

        # changed for auto threshold
        #mask_thresholded = Binarizer_auto()(self.mask_real+self.threshold)

        # Mask weights with above mask.
        weight_combined = mask_thresholded * self.place365_weight + (
            1 - mask_thresholded) * self.imagenet_weight
        #weight_combined = self.place365_weight

        # Perform conv using modified weight.
        return F.conv2d(input, weight_combined, None, self.stride,
                        self.padding, self.dilation, self.groups)

    def __repr__(self):
        s = ('{name} ({in_channels}, {out_channels}, kernel_size={kernel_size}'
             ', stride={stride}')
        if self.padding != (0, ) * len(self.padding):
            s += ', padding={padding}'
        if self.dilation != (1, ) * len(self.dilation):
            s += ', dilation={dilation}'
        if self.output_padding != (0, ) * len(self.output_padding):
            s += ', output_padding={output_padding}'
        if self.groups != 1:
            s += ', groups={groups}'
        if self.bias is None:
            s += ', bias=False'
        s += ')'
        return s.format(name=self.__class__.__name__, **self.__dict__)

    def _apply(self, fn):
        for module in self.children():
            module._apply(fn)

        for param in self._parameters.values():
            if param is not None:
                # Variables stored in modules are graph leaves, and we don't
                # want to create copy nodes, so we have to unpack the data.
                param.data = fn(param.data)
                if param._grad is not None:
                    param._grad.data = fn(param._grad.data)

        for key, buf in self._buffers.items():
            if buf is not None:
                self._buffers[key] = fn(buf)

        self.imagenet_weight.data = fn(self.imagenet_weight.data)