class cnn_THS(clstm):
    def __init__(self, vocab_size, max_num_hidden_layers, embedding_dim,
                 n_classes, n_filters, filter_size, dropout, batch_size,
                 b=0.99, n=0.01, s=0.2, e=[0.5, 0.35, 0.2, 0.1, 0.05],
                 use_cuda=False):
        super().__init__(vocab_size, max_num_hidden_layers, embedding_dim,
                         n_classes, n_filters, filter_size, dropout,
                         batch_size, b=b, n=n, s=s, use_cuda=use_cuda)
        # e: exploration probabilities (same defaults as ONN_THS below)
        self.e = Parameter(torch.tensor(e), requires_grad=False)
        self.arms_values = Parameter(torch.arange(n_classes),
                                     requires_grad=False)
        self.explorations_mab = []
        for i in range(n_classes):
            self.explorations_mab.append(algs.ThompsomSampling(len(e)))

    def partial_fit(self, X_data, Y_data, exp_factor, show_loss=True):
        self.partial_fit_(X_data, Y_data, show_loss)
        self.explorations_mab[Y_data[0]].reward(exp_factor)

    def predict(self, X_data):
        pred = self.predict_(X_data)[0]
        exp_factor = self.explorations_mab[pred].select()[0]
        if np.random.uniform() < self.e[exp_factor]:
            removed_arms = self.arms_values.clone().numpy().tolist()
            removed_arms.remove(pred)
            return random.choice(removed_arms), exp_factor
        return pred, exp_factor

class ONN_THS(ONN):
    def __init__(self, features_size, max_num_hidden_layers,
                 qtd_neuron_per_hidden_layer, n_classes, b=0.99, n=0.01,
                 s=0.2, e=[0.5, 0.35, 0.2, 0.1, 0.05], use_cuda=False):
        super().__init__(features_size, max_num_hidden_layers,
                         qtd_neuron_per_hidden_layer, n_classes, b=b, n=n,
                         s=s, use_cuda=use_cuda)
        self.e = Parameter(torch.tensor(e), requires_grad=False)
        self.arms_values = Parameter(torch.arange(n_classes),
                                     requires_grad=False)
        self.explorations_mab = []
        for i in range(n_classes):
            self.explorations_mab.append(algs.ThompsomSampling(len(e)))

    def partial_fit(self, X_data, Y_data, exp_factor, show_loss=True):
        self.partial_fit_(X_data, Y_data, show_loss)
        self.explorations_mab[Y_data[0]].reward(exp_factor)

    def predict(self, X_data):
        pred = self.predict_(X_data)[0]
        exp_factor = self.explorations_mab[pred].select()[0]
        if np.random.uniform() < self.e[exp_factor]:
            removed_arms = self.arms_values.clone().numpy().tolist()
            removed_arms.remove(pred)
            return random.choice(removed_arms), exp_factor
        return pred, exp_factor

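# Usage sketch (illustrative, not part of the classes above): ONN_THS.predict
# couples a Thompson-sampled exploration level with an epsilon-style arm
# switch. The helper name explore_or_exploit is hypothetical and shows only
# that switching rule in isolation.
import random
import numpy as np

def explore_or_exploit(pred, n_classes, eps):
    # With probability eps, return a random class other than pred;
    # otherwise keep the model's prediction.
    if np.random.uniform() < eps:
        other_arms = [c for c in range(n_classes) if c != pred]
        return random.choice(other_arms)
    return pred

# e.g. predicted class 3 of 5, exploring 20% of the time
print(explore_or_exploit(pred=3, n_classes=5, eps=0.2))
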
class BinaryGatedLinear(Module):
    """Linear layer with stochastic binary gates."""

    def __init__(self, in_features, out_features, l0_strength=1.,
                 l2_strength=1., learn_weight=True, bias=True,
                 droprate_init=0.5, random_weight=True, deterministic=False,
                 use_baseline_bias=False, optimize_inference=False,
                 one_sample_per_item=False, **kwargs):
        """
        :param in_features: Input dimensionality
        :param out_features: Output dimensionality
        :param bias: Whether we use a bias
        :param l2_strength: Strength of the L2 penalty
        :param droprate_init: Dropout rate that the gates will be initialized to
        :param l0_strength: Strength of the L0 penalty
        """
        super(BinaryGatedLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.l0_strength = l0_strength
        self.l2_strength = l2_strength
        self.deterministic = deterministic
        self.use_baseline_bias = use_baseline_bias
        self.optimize_inference = optimize_inference
        self.one_sample_per_item = one_sample_per_item
        self.random_weight = random_weight

        if random_weight:
            exc_weight = torch.Tensor(out_features, in_features)
            inh_weight = torch.Tensor(out_features, in_features)
        else:
            exc_weight = torch.ones(out_features, in_features)
            inh_weight = torch.ones(out_features, in_features)

        if learn_weight:
            self.exc_weight = Parameter(exc_weight)
            self.inh_weight = Parameter(inh_weight)
        else:
            self.register_buffer("exc_weight", exc_weight)
            self.register_buffer("inh_weight", inh_weight)

        self.exc_p1 = Parameter(torch.Tensor(out_features, in_features))
        self.inh_p1 = Parameter(torch.Tensor(out_features, in_features))
        self.droprate_init = droprate_init if droprate_init != 0. else 0.5
        self.use_bias = bias
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        if self.random_weight:
            init.kaiming_normal_(self.exc_weight, mode="fan_out")
            init.kaiming_normal_(self.inh_weight, mode="fan_out")
            self.exc_weight.data.abs_()
            self.inh_weight.data.abs_()
        self.exc_p1.data.normal_(1 - self.droprate_init, 1e-2)
        self.inh_p1.data.normal_(1 - self.droprate_init, 1e-2)
        if self.use_bias:
            self.bias.data.fill_(0)

    def constrain_parameters(self, **kwargs):
        self.exc_weight.data.clamp_(min=0.)
        self.inh_weight.data.clamp_(min=0.)

    def get_gate_probabilities(self):
        exc_p1 = torch.clamp(self.exc_p1.data, min=0., max=1.)
        inh_p1 = torch.clamp(self.inh_p1.data, min=0., max=1.)
        return exc_p1, inh_p1

    def weight_size(self):
        return self.exc_weight.size()

    def regularization(self):
        """Expected L0 norm under the stochastic gates; also takes into
        account and re-weights a potential L2 penalty."""
        if self.l0_strength > 0 or self.l2_strength > 0:
            # Clamp these, but do it in a way that still always propagates
            # the gradient.
            exc_p1 = self.exc_p1.clone()
            torch.clamp(exc_p1.data, min=0, max=1, out=exc_p1.data)
            inh_p1 = self.inh_p1.clone()
            torch.clamp(inh_p1.data, min=0, max=1, out=inh_p1.data)

            if self.l2_strength == 0:
                return self.l0_strength * (exc_p1 + inh_p1).sum()
            else:
                exc_weight_decay_ungated = (.5 * self.l2_strength
                                            * self.exc_weight.pow(2))
                inh_weight_decay_ungated = (.5 * self.l2_strength
                                            * self.inh_weight.pow(2))
                exc_weight_l2_l0 = torch.sum(
                    (exc_weight_decay_ungated + self.l0_strength) * exc_p1)
                inh_weight_l2_l0 = torch.sum(
                    (inh_weight_decay_ungated + self.l0_strength) * inh_p1)
                bias_l2 = (0 if not self.use_bias
                           else torch.sum(.5 * self.l2_strength
                                          * self.bias.pow(2)))
                return exc_weight_l2_l0 + inh_weight_l2_l0 + bias_l2
        else:
            return 0

    def get_inference_mask(self):
        exc_p1, inh_p1 = self.get_gate_probabilities()
        if self.deterministic:
            exc_mask = (exc_p1 >= 0.5).float()
            inh_mask = (inh_p1 >= 0.5).float()
            return exc_mask, inh_mask
        else:
            exc_count1 = exc_p1.sum(dim=1).round().int()
            inh_count1 = inh_p1.sum(dim=1).round().int()

            # pytorch doesn't offer topk with varying k values.
            exc_mask = torch.zeros_like(exc_p1)
            inh_mask = torch.zeros_like(inh_p1)
            for i in range(exc_count1.size()[0]):
                _, exc_indices = torch.topk(exc_p1[i], exc_count1[i].item())
                _, inh_indices = torch.topk(inh_p1[i], inh_count1[i].item())
                exc_mask[i].scatter_(-1, exc_indices, 1)
                inh_mask[i].scatter_(-1, inh_indices, 1)
            return exc_mask, inh_mask

    def sample_weight_and_bias(self):
        if self.training or not self.optimize_inference:
            w = (sample_weight(self.exc_p1, self.exc_weight,
                               self.deterministic)
                 - sample_weight(self.inh_p1, self.inh_weight,
                                 self.deterministic))
        else:
            exc_mask, inh_mask = self.get_inference_mask()
            w = exc_mask * self.exc_weight - inh_mask * self.inh_weight

        b = None
        if self.use_baseline_bias:
            b = -w.sum(dim=-1) / 2
        if self.use_bias:
            b = (b + self.bias if b is not None else self.bias)
        return w, b

    def forward(self, x):
        if self.one_sample_per_item and self.training and len(x.size()) > 1:
            results = []
            for i in range(x.size(0)):
                w, b = self.sample_weight_and_bias()
                results.append(F.linear(x[i:i + 1], w, b))
            return torch.cat(results)
        else:
            w, b = self.sample_weight_and_bias()
            return F.linear(x, w, b)

    def get_expected_nonzeros(self):
        exc_p1, inh_p1 = self.get_gate_probabilities()
        # Flip two coins with probabilities pi_1 and pi_2. What is the
        # probability at least one of them is 1?
        #
        #   1 - (1 - pi_1)*(1 - pi_2)
        #   = 1 - 1 + pi_1 + pi_2 - pi_1*pi_2
        #   = pi_1 + pi_2 - pi_1*pi_2
        p1 = exc_p1 + inh_p1 - (exc_p1 * inh_p1)
        return p1.sum(dim=1).detach()

    def get_inference_nonzeros(self):
        exc_mask, inh_mask = self.get_inference_mask()
        return torch.sum(exc_mask.int() | inh_mask.int(), dim=1)

    def count_inference_flops(self):
        # For each unit, multiply with its n inputs then do n - 1 additions.
        # To capture the -1, subtract it, but only in cases where there is at
        # least one weight.
        nz_by_unit = self.get_inference_nonzeros()
        multiplies = torch.sum(nz_by_unit)
        adds = multiplies - torch.sum(nz_by_unit > 0)
        return multiplies.item(), adds.item()

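# Illustrative check (not part of the layer above): get_expected_nonzeros
# relies on the identity P(at least one gate on) = pi_1 + pi_2 - pi_1*pi_2.
# A quick Monte Carlo estimate agrees with the closed form.
import torch

torch.manual_seed(0)
exc_p1, inh_p1 = 0.3, 0.6
expected = exc_p1 + inh_p1 - exc_p1 * inh_p1          # 0.72

n = 100000
exc = torch.bernoulli(torch.full((n,), exc_p1))
inh = torch.bernoulli(torch.full((n,), inh_p1))
print(expected, ((exc + inh) > 0).float().mean().item())  # both close to 0.72
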
class BinaryGatedConv2d(Module):
    """Convolutional layer with binary stochastic gates."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, learn_weight=True,
                 bias=True, droprate_init=0.5, l2_strength=1.,
                 l0_strength=1., random_weight=True, deterministic=False,
                 use_baseline_bias=False, optimize_inference=True,
                 one_sample_per_item=False, **kwargs):
        """
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param kernel_size: Size of the kernel
        :param stride: Stride for the convolution
        :param padding: Padding for the convolution
        :param dilation: Dilation factor for the convolution
        :param groups: How many groups we will assume in the convolution
        :param bias: Whether we will use a bias
        :param droprate_init: Dropout rate that the gates will be initialized to
        :param l2_strength: Strength of the L2 penalty
        :param l0_strength: Strength of the L0 penalty
        """
        super(BinaryGatedConv2d, self).__init__()
        if in_channels % groups != 0:
            raise ValueError("in_channels must be divisible by groups")
        if out_channels % groups != 0:
            raise ValueError("out_channels must be divisible by groups")
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = pair(kernel_size)
        self.stride = pair(stride)
        self.padding = pair(padding)
        self.dilation = pair(dilation)
        self.output_padding = pair(0)
        self.groups = groups
        self.l2_strength = l2_strength
        self.l0_strength = l0_strength
        self.droprate_init = droprate_init if droprate_init != 0. else 0.5
        self.deterministic = deterministic
        self.use_baseline_bias = use_baseline_bias
        self.optimize_inference = optimize_inference
        self.one_sample_per_item = one_sample_per_item
        self.random_weight = random_weight

        if random_weight:
            exc_weight = torch.Tensor(out_channels, in_channels // groups,
                                      *self.kernel_size)
            inh_weight = torch.Tensor(out_channels, in_channels // groups,
                                      *self.kernel_size)
        else:
            exc_weight = torch.ones(out_channels, in_channels // groups,
                                    *self.kernel_size)
            inh_weight = torch.ones(out_channels, in_channels // groups,
                                    *self.kernel_size)

        if learn_weight:
            self.exc_weight = Parameter(exc_weight)
            self.inh_weight = Parameter(inh_weight)
        else:
            self.register_buffer("exc_weight", exc_weight)
            self.register_buffer("inh_weight", inh_weight)

        self.exc_p1 = Parameter(
            torch.Tensor(out_channels, in_channels // groups,
                         *self.kernel_size))
        self.inh_p1 = Parameter(
            torch.Tensor(out_channels, in_channels // groups,
                         *self.kernel_size))
        self.dim_z = out_channels
        self.input_shape = None
        self.use_bias = bias
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        self.reset_parameters()

    def reset_parameters(self):
        if self.random_weight:
            init.kaiming_normal_(self.exc_weight, mode="fan_out")
            init.kaiming_normal_(self.inh_weight, mode="fan_out")
            self.exc_weight.data.abs_()
            self.inh_weight.data.abs_()
        self.exc_p1.data.normal_(1 - self.droprate_init, 1e-2)
        self.inh_p1.data.normal_(1 - self.droprate_init, 1e-2)
        if self.use_bias:
            self.bias.data.fill_(0)

    def constrain_parameters(self, **kwargs):
        self.exc_weight.data.clamp_(min=0.)
        self.inh_weight.data.clamp_(min=0.)

    def weight_size(self):
        return self.exc_weight.size()

    def regularization(self):
        """Expected L0 norm under the stochastic gates; also takes into
        account and re-weights a potential L2 penalty."""
        if self.l0_strength > 0 or self.l2_strength > 0:
            # Clamp these, but do it in a way that still always propagates
            # the gradient.
            exc_p1 = self.exc_p1.clone()
            torch.clamp(exc_p1.data, min=0, max=1, out=exc_p1.data)
            inh_p1 = self.inh_p1.clone()
            torch.clamp(inh_p1.data, min=0, max=1, out=inh_p1.data)

            if self.l2_strength == 0:
                return self.l0_strength * (exc_p1 + inh_p1).sum()
            else:
                exc_weight_decay_ungated = (.5 * self.l2_strength
                                            * self.exc_weight.pow(2))
                inh_weight_decay_ungated = (.5 * self.l2_strength
                                            * self.inh_weight.pow(2))
                exc_weight_l2_l0 = torch.sum(
                    (exc_weight_decay_ungated + self.l0_strength) * exc_p1)
                inh_weight_l2_l0 = torch.sum(
                    (inh_weight_decay_ungated + self.l0_strength) * inh_p1)
                bias_l2 = (0 if not self.use_bias
                           else torch.sum(.5 * self.l2_strength
                                          * self.bias.pow(2)))
                return exc_weight_l2_l0 + inh_weight_l2_l0 + bias_l2
        else:
            return 0

    def get_gate_probabilities(self):
        exc_p1 = torch.clamp(self.exc_p1.data, min=0., max=1.)
        inh_p1 = torch.clamp(self.inh_p1.data, min=0., max=1.)
        return exc_p1, inh_p1

    def get_inference_mask(self):
        exc_p1, inh_p1 = self.get_gate_probabilities()
        if self.deterministic:
            exc_mask = (exc_p1 >= 0.5).float()
            inh_mask = (inh_p1 >= 0.5).float()
            return exc_mask, inh_mask
        else:
            exc_count1 = exc_p1.sum(
                dim=tuple(range(1, len(exc_p1.shape)))).round().int()
            inh_count1 = inh_p1.sum(
                dim=tuple(range(1, len(inh_p1.shape)))).round().int()

            # pytorch doesn't offer topk with varying k values.
            exc_mask = torch.zeros_like(exc_p1)
            inh_mask = torch.zeros_like(inh_p1)
            for i in range(exc_count1.size()[0]):
                _, exc_indices = torch.topk(exc_p1[i].flatten(),
                                            exc_count1[i].item())
                _, inh_indices = torch.topk(inh_p1[i].flatten(),
                                            inh_count1[i].item())
                exc_mask[i].flatten().scatter_(-1, exc_indices, 1)
                inh_mask[i].flatten().scatter_(-1, inh_indices, 1)
            return exc_mask, inh_mask

    def sample_weight_and_bias(self, samples=1):
        if self.training or not self.optimize_inference:
            w = (sample_weight(self.exc_p1, self.exc_weight,
                               self.deterministic, samples)
                 - sample_weight(self.inh_p1, self.inh_weight,
                                 self.deterministic, samples))
        else:
            exc_mask, inh_mask = self.get_inference_mask()
            w = exc_mask * self.exc_weight - inh_mask * self.inh_weight

        b = None
        if self.use_baseline_bias:
            b = -w.sum(dim=(-3, -2, -1)) / 2
        if self.use_bias:
            b = (b + self.bias if b is not None else self.bias)
        return w, b

    def forward(self, x):
        if self.input_shape is None:
            self.input_shape = x.size()

        if self.one_sample_per_item and self.training and len(x.size()) > 3:
            w, b = self.sample_weight_and_bias(x.size(0))
            if self.use_baseline_bias:
                b = b.view(x.size(0) * self.out_channels)
            else:
                b = b.repeat(x.size(0))
            x_ = x.view(1, x.size(0) * x.size(1), *x.size()[2:])
            w_ = w.view(w.size(0) * w.size(1), *w.size()[2:])
            result = F.conv2d(x_, w_, b, self.stride, self.padding,
                              self.dilation, x.size(0) * self.groups)
            return result.view(x.size(0), self.out_channels,
                               *result.size()[2:])
        else:
            w, b = self.sample_weight_and_bias()
            return F.conv2d(x, w, b, self.stride, self.padding,
                            self.dilation, self.groups)

    def get_expected_nonzeros(self):
        exc_p1, inh_p1 = self.get_gate_probabilities()
        # Flip two coins with probabilities pi_1 and pi_2. What is the
        # probability at least one of them is 1?
        #
        #   1 - (1 - pi_1)*(1 - pi_2)
        #   = 1 - 1 + pi_1 + pi_2 - pi_1*pi_2
        #   = pi_1 + pi_2 - pi_1*pi_2
        p1 = exc_p1 + inh_p1 - (exc_p1 * inh_p1)
        return p1.sum(dim=tuple(range(1, len(p1.shape)))).detach()

    def get_inference_nonzeros(self):
        exc_mask, inh_mask = self.get_inference_mask()
        return torch.sum(exc_mask.int() | inh_mask.int(),
                         dim=tuple(range(1, len(exc_mask.shape))))

    def count_inference_flops(self):
        # For each unit, multiply with its n inputs then do n - 1 additions.
        # Only subtract the 1 in cases where there is at least one weight.
        nz_by_unit = self.get_inference_nonzeros()
        multiplies_per_instance = torch.sum(nz_by_unit)
        adds_per_instance = multiplies_per_instance - torch.sum(nz_by_unit > 0)

        # number of output rows
        instances = ((self.input_shape[-2] - self.kernel_size[0]
                      + 2 * self.padding[0]) / self.stride[0]) + 1
        # multiplied by the number of output columns
        instances *= ((self.input_shape[-1] - self.kernel_size[1]
                       + 2 * self.padding[1]) / self.stride[1]) + 1

        multiplies = multiplies_per_instance * instances
        adds = adds_per_instance * instances
        return multiplies.item(), adds.item()

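# Worked example (assumed numbers, not taken from the classes above): the
# `instances` term in count_inference_flops is the number of spatial positions
# one filter visits, rows = (H - kH + 2*padH)/strideH + 1, likewise columns.
H = W = 32
kernel, pad, stride = 3, 1, 1

rows = (H - kernel + 2 * pad) / stride + 1    # 32.0
cols = (W - kernel + 2 * pad) / stride + 1    # 32.0
instances = rows * cols                       # 1024 positions per filter

# With, say, 20 surviving weights in a filter, each position costs
# 20 multiplies and 19 adds:
print(20 * instances, 19 * instances)         # 20480.0 19456.0
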
class group_relaxed_L1L2Conv2d(Module):
    """Implementation of the group relaxed L1-L2 regularization for the
    feature maps of a convolutional layer."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, lamba=1.,
                 alpha=1., beta=4., weight_decay=1., **kwargs):
        """
        :param in_channels: Number of input channels
        :param out_channels: Number of output channels
        :param kernel_size: size of the kernel
        :param stride: stride for the convolution
        :param padding: padding for the convolution
        :param dilation: dilation factor for the convolution
        :param groups: how many groups we will assume in the convolution
        :param bias: whether we will use a bias
        :param lamba: strength of the regularization
        """
        super(group_relaxed_L1L2Conv2d, self).__init__()
        self.floatTensor = (torch.FloatTensor if not torch.cuda.is_available()
                            else torch.cuda.FloatTensor)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = pair(kernel_size)
        self.stride = pair(stride)
        self.padding = pair(padding)
        self.dilation = pair(dilation)
        self.output_padding = pair(0)
        self.groups = groups
        self.lamba = lamba
        self.alpha = alpha
        self.beta = beta
        self.lamba1 = self.lamba / self.beta
        self.weight_decay = weight_decay
        self.weight = Parameter(
            torch.Tensor(out_channels, in_channels // groups,
                         *self.kernel_size))
        self.u = torch.rand(out_channels, in_channels // groups,
                            *self.kernel_size)
        if torch.cuda.is_available():
            self.u = self.u.to('cuda')
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        self.input_shape = None
        print(self)

    def reset_parameters(self):
        init.kaiming_normal_(self.weight, mode='fan_in')
        if self.bias is not None:
            self.bias.data.normal_(0, 1e-2)

    def constrain_parameters(self, **kwargs):
        norm_w = self.weight.data.norm(p=float('inf'))
        if norm_w > self.lamba1:
            m = Softshrink(self.lamba1)
            z = m(self.weight.data)
            self.u.data = z * (z.data.norm(p=2)
                               + self.alpha * self.lamba1) / z.data.norm(p=2)
        elif norm_w == self.lamba1:
            self.u = self.weight.clone()
            self.u[self.u.abs() < self.lamba1] = 0
            n = torch.sum(self.u != 0)
            self.u[self.u != 0] = (self.weight[self.u != 0].sign()
                                   * self.alpha * self.lamba1 / (n ** (1 / 2)))
        elif (1 - self.alpha) * self.lamba1 < norm_w and norm_w < self.lamba1:
            self.u = self.weight.clone()
            max_idx = np.unravel_index(torch.argmax(self.u.cpu(), None),
                                       self.u.shape)
            max_value_sign = self.u[max_idx].sign()
            self.u[:] = 0
            self.u[max_idx] = (norm_w + (self.alpha - 1)
                               * self.lamba1) * max_value_sign
        else:
            self.u = self.weight.clone()
            self.u[:] = 0

    def grow_beta(self, growth_factor):
        self.beta = self.beta * growth_factor
        self.lamba1 = self.lamba / self.beta

    def _reg_w(self, **kwargs):
        logpw = (-self.beta * torch.sum(0.5 * self.weight.add(-self.u).pow(2))
                 - self.lamba * np.sqrt(self.in_channels * self.kernel_size[0]
                                        * self.kernel_size[1])
                 * torch.sum(torch.pow(
                     torch.sum(self.weight.pow(2), 3).sum(2).sum(1), 0.5)))
        logpb = 0
        if self.bias is not None:
            logpb = -torch.sum(self.weight_decay * .5 * (self.bias.pow(2)))
        return logpw + logpb

    def regularization(self):
        return self._reg_w()

    def count_zero_u(self):
        total = np.prod(self.u.size())
        zero = total - self.u.nonzero().size(0)
        return zero

    def count_zero_w(self):
        return torch.sum((self.weight.abs() < 1e-5).int()).item()

    def count_active_neuron(self):
        return torch.sum((torch.sum(self.weight.abs(), 3).sum(2).sum(1)
                          / (self.in_channels * self.kernel_size[0]
                             * self.kernel_size[1])) > 1e-5).item()

    def count_total_neuron(self):
        return self.out_channels

    def count_weight(self):
        return np.prod(self.u.size())

    def count_expected_flops_and_l0(self):
        # ppos = self.out_channels
        ppos = torch.sum(
            torch.sum(self.weight.abs(), 3).sum(2).sum(1) > 0.001).item()
        n = self.kernel_size[0] * self.kernel_size[1] * self.in_channels
        flops_per_instance = n + (n - 1)

        num_instances_per_filter = (
            (self.input_shape[1] - self.kernel_size[0]
             + 2 * self.padding[0]) / self.stride[0]) + 1
        num_instances_per_filter *= (
            (self.input_shape[2] - self.kernel_size[1]
             + 2 * self.padding[1]) / self.stride[1]) + 1

        flops_per_filter = num_instances_per_filter * flops_per_instance
        expected_flops = flops_per_filter * ppos
        expected_l0 = n * ppos
        if self.bias is not None:
            expected_flops += num_instances_per_filter * ppos
            expected_l0 += ppos
        return expected_flops, expected_l0

    def forward(self, input_):
        if self.input_shape is None:
            self.input_shape = input_.size()
        output = F.conv2d(input_, self.weight, self.bias, self.stride,
                          self.padding, self.dilation, self.groups)
        return output

    def __repr__(self):
        s = ('{name}({in_channels}, {out_channels}, '
             'kernel_size={kernel_size}, stride={stride}')
        if self.padding != (0,) * len(self.padding):
            s += ', padding={padding}'
        if self.dilation != (1,) * len(self.dilation):
            s += ', dilation={dilation}'
        if self.output_padding != (0,) * len(self.output_padding):
            s += ', output_padding={output_padding}'
        if self.groups != 1:
            s += ', groups={groups}'
        if self.bias is None:
            s += ', bias=False'
        s += ')'
        return s.format(name=self.__class__.__name__, **self.__dict__)

class group_relaxed_L1L2Dense(Module):
    """Implementation of the group relaxed L1-L2 regularization for the
    input units of a fully connected layer."""

    def __init__(self, in_features, out_features, bias=True, lamba=1.,
                 alpha=1., beta=4., weight_decay=1., **kwargs):
        """
        :param in_features: input dimensionality
        :param out_features: output dimensionality
        :param bias: whether we use bias
        :param lamba: strength of the regularization
        """
        super(group_relaxed_L1L2Dense, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(in_features, out_features))
        self.u = torch.rand(in_features, out_features)
        if torch.cuda.is_available():
            self.u = self.u.to('cuda')
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.lamba = lamba
        self.alpha = alpha
        self.beta = beta
        self.lamba1 = self.lamba / self.beta
        self.weight_decay = weight_decay
        self.floatTensor = (torch.FloatTensor if not torch.cuda.is_available()
                            else torch.cuda.FloatTensor)
        self.reset_parameters()
        print(self)

    def reset_parameters(self):
        init.kaiming_normal_(self.weight, mode='fan_out')
        if self.bias is not None:
            self.bias.data.normal_(0, 1e-2)

    def constrain_parameters(self, **kwargs):
        norm_w = self.weight.data.norm(p=float('inf'))
        if norm_w > self.lamba1:
            m = Softshrink(self.lamba1)
            z = m(self.weight.data)
            self.u.data = z * (z.data.norm(p=2)
                               + self.alpha * self.lamba1) / z.data.norm(p=2)
        elif norm_w == self.lamba1:
            self.u = self.weight.clone()
            self.u[self.u.abs() < self.lamba1] = 0
            n = torch.sum(self.u != 0)
            self.u[self.u != 0] = (self.weight[self.u != 0].sign()
                                   * self.alpha * self.lamba1 / (n ** (1 / 2)))
        elif (1 - self.alpha) * self.lamba1 < norm_w and norm_w < self.lamba1:
            self.u = self.weight.clone()
            max_idx = np.unravel_index(torch.argmax(self.u.cpu(), None),
                                       self.u.shape)
            max_value_sign = self.u[max_idx].sign()
            self.u[:] = 0
            self.u[max_idx] = (norm_w + (self.alpha - 1)
                               * self.lamba1) * max_value_sign
        else:
            self.u = self.weight.clone()
            self.u[:] = 0

    def grow_beta(self, growth_factor):
        self.beta = self.beta * growth_factor
        self.lamba1 = self.lamba / self.beta

    def _reg_w(self, **kwargs):
        logpw = (-self.beta * torch.sum(0.5 * self.weight.add(-self.u).pow(2))
                 - self.lamba * np.sqrt(self.out_features)
                 * torch.sum(torch.pow(torch.sum(self.weight.pow(2), 1), 0.5)))
        logpb = 0
        if self.bias is not None:
            logpb = -torch.sum(self.weight_decay * .5 * (self.bias.pow(2)))
        return logpw + logpb

    def regularization(self):
        return self._reg_w()

    def count_zero_u(self):
        total = np.prod(self.u.size())
        zero = total - self.u.nonzero().size(0)
        return zero

    def count_zero_w(self):
        return torch.sum((self.weight.abs() < 1e-5).int()).item()

    def count_weight(self):
        return np.prod(self.u.size())

    def count_active_neuron(self):
        return torch.sum(
            torch.sum(self.weight.abs() / self.out_features, 1) > 1e-5).item()

    def count_total_neuron(self):
        return self.in_features

    def count_expected_flops_and_l0(self):
        ppos = torch.sum(self.weight.abs() > 0.000001).item()
        expected_flops = (2 * ppos - 1) * self.out_features
        expected_l0 = ppos * self.out_features
        if self.bias is not None:
            expected_flops += self.out_features
            expected_l0 += self.out_features
        return expected_flops, expected_l0

    def forward(self, input):
        output = input.mm(self.weight)
        if self.bias is not None:
            output.add_(self.bias.view(1, self.out_features).expand_as(output))
        return output

    def __repr__(self):
        return (self.__class__.__name__ + ' ('
                + str(self.in_features) + ' -> '
                + str(self.out_features) + ', lambda: '
                + str(self.lamba) + ')')

class MFLinearLayer(nn.Module):
    def __init__(self, dim_in, dim_out, prior_var=1, init_var=-7):
        super().__init__()
        self.init_var = init_var
        self.dim_in = dim_in
        self.dim_out = dim_out
        self.W_mean = Parameter(torch.Tensor(dim_out, dim_in))
        self.b_mean = Parameter(torch.Tensor(dim_out))
        self.W_var = Parameter(torch.Tensor(dim_out, dim_in))
        self.b_var = Parameter(torch.Tensor(dim_out))
        self.W_prior_mean = torch.zeros([dim_out, dim_in], device=device)
        self.b_prior_mean = torch.zeros([dim_out], device=device)
        self.prior_var = prior_var
        self.W_prior_var = torch.ones([dim_out, dim_in], device=device).mul(
            np.log(self.prior_var))
        self.b_prior_var = torch.ones([dim_out], device=device).mul(
            np.log(self.prior_var))
        self.reset_parameters()

    def reset_parameters(self):
        init.kaiming_uniform_(self.W_mean, a=math.sqrt(5))
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.W_mean)
        bound = 1 / math.sqrt(fan_in)
        init.uniform_(self.b_mean, -bound, bound)
        init.constant_(self.W_var, self.init_var)
        init.constant_(self.b_var, self.init_var)

    def add_new_task(self, reset_variance=True):
        self.W_prior_mean = self.W_mean.clone().detach().requires_grad_(False)
        self.b_prior_mean = self.b_mean.clone().detach().requires_grad_(False)
        self.W_prior_var = self.W_var.clone().detach().requires_grad_(False)
        self.b_prior_var = self.b_var.clone().detach().requires_grad_(False)
        if reset_variance:
            self.W_var.data = torch.min(
                self.W_var, self.init_var * torch.ones_like(self.W_var).data)
            self.b_var.data = torch.min(
                self.b_var, self.init_var * torch.ones_like(self.b_var).data)

        fan_in, _ = init._calculate_fan_in_and_fan_out(self.W_mean)
        bound = 1 / math.sqrt(fan_in)
        initialization_noise = torch.empty_like(self.W_mean)
        init.kaiming_uniform_(initialization_noise, a=math.sqrt(5))
        # self.W_mean.data = self.W_mean.data + (self.W_var > -2).float() * initialization_noise
        # self.b_mean.data = self.b_mean.data + (self.b_var > -2).float() * torch.empty_like(self.b_mean).uniform_(-bound, bound)
        self.W_mean.data = initialization_noise.data
        self.b_mean.data = torch.empty_like(self.b_mean).uniform_(
            -bound, bound).data

    def get_kl(self, lamb):
        W_kl = compute_kl(self.W_mean, self.W_var, self.W_prior_mean,
                          self.W_prior_var, lamb=lamb,
                          initial_prior_var=self.prior_var)
        b_kl = compute_kl(self.b_mean, self.b_var, self.b_prior_mean,
                          self.b_prior_var, lamb=lamb,
                          initial_prior_var=self.prior_var)
        return W_kl + b_kl

    def forward(self, x):
        output_mean = x.matmul(
            self.W_mean.t()) + self.b_mean.unsqueeze(0).unsqueeze(0)
        output_std = torch.sqrt(
            (x ** 2).matmul(torch.exp(self.W_var.t()))
            + torch.exp(self.b_var).unsqueeze(0).unsqueeze(0))
        eps = torch.empty(output_mean.shape,
                          device=device).normal_(mean=0, std=1)
        output = output_mean + (eps * output_std)
        return output

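# Standalone sketch (hypothetical shapes, separate from the class above):
# MFLinearLayer.forward uses the local reparameterization trick, sampling the
# pre-activations from their induced Gaussian rather than sampling weights.
import torch

torch.manual_seed(0)
dim_in, dim_out, batch = 4, 3, 2
W_mean = torch.randn(dim_out, dim_in)
W_logvar = torch.full((dim_out, dim_in), -7.0)   # log-variance, as init_var
b_mean = torch.zeros(dim_out)
b_logvar = torch.full((dim_out,), -7.0)

x = torch.randn(batch, dim_in)
out_mean = x @ W_mean.t() + b_mean
out_std = torch.sqrt((x ** 2) @ torch.exp(W_logvar).t() + torch.exp(b_logvar))
out = out_mean + torch.randn_like(out_mean) * out_std
print(out.shape)  # torch.Size([2, 3])
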
class MFConvLayer(torch.nn.modules.conv._ConvNd):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True,
                 padding_mode='zeros', prior_var=1, init_var=-7):
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        super().__init__(in_channels, out_channels, kernel_size, stride,
                         padding, dilation, False, _pair(0), groups, bias,
                         padding_mode)
        self.init_var = init_var
        self.W_prior_mean = torch.zeros(self.weight.shape, device=device)
        self.b_prior_mean = torch.zeros(self.bias.shape, device=device)
        self.prior_var = prior_var
        self.W_prior_var = torch.ones(self.weight.shape, device=device).mul(
            np.log(self.prior_var))
        self.b_prior_var = torch.ones(self.bias.shape, device=device).mul(
            np.log(self.prior_var))
        self.weight_var = Parameter(torch.Tensor(self.weight.shape))
        self.bias_var = Parameter(torch.Tensor(self.bias.shape))
        self.reset_parameters()

    def conv2d_forward(self, input, weight, bias):
        if self.padding_mode == 'circular':
            expanded_padding = ((self.padding[1] + 1) // 2,
                                self.padding[1] // 2,
                                (self.padding[0] + 1) // 2,
                                self.padding[0] // 2)
            return F.conv2d(F.pad(input, expanded_padding, mode='circular'),
                            weight, bias, self.stride, _pair(0),
                            self.dilation, self.groups)
        return F.conv2d(input, weight, bias, self.stride, self.padding,
                        self.dilation, self.groups)

    def reset_parameters(self):
        super().reset_parameters()
        if hasattr(self, 'weight_var'):
            init.constant_(self.weight_var, self.init_var)
            init.constant_(self.bias_var, self.init_var)

    def add_new_task(self):
        self.W_prior_mean = self.weight.clone().detach().requires_grad_(False)
        self.b_prior_mean = self.bias.clone().detach().requires_grad_(False)
        self.W_prior_var = self.weight_var.clone().detach().requires_grad_(
            False)
        self.b_prior_var = self.bias_var.clone().detach().requires_grad_(False)
        self.weight_var.data = torch.min(
            self.weight_var,
            self.init_var * torch.ones_like(self.weight_var).data)
        self.bias_var.data = torch.min(
            self.bias_var,
            self.init_var * torch.ones_like(self.bias_var).data)

        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in)
        initialization_noise = torch.empty_like(self.weight)
        init.kaiming_uniform_(initialization_noise, a=math.sqrt(5))
        # self.weight.data = self.weight.data + (self.weight_var > -2).float() * initialization_noise
        # self.bias.data = self.bias.data + (self.bias_var > -2).float() * torch.empty_like(self.bias).uniform_(-bound, bound)
        self.weight.data = initialization_noise.data
        self.bias.data = torch.empty_like(self.bias).uniform_(-bound,
                                                              bound).data

    def get_kl(self, lamb):
        W_kl = compute_kl(self.weight, self.weight_var, self.W_prior_mean,
                          self.W_prior_var, lamb=lamb,
                          initial_prior_var=self.prior_var)
        b_kl = compute_kl(self.bias, self.bias_var, self.b_prior_mean,
                          self.b_prior_var, lamb=lamb,
                          initial_prior_var=self.prior_var)
        return W_kl + b_kl

    def forward(self, input):
        output_mean = self.conv2d_forward(input, self.weight, self.bias)
        output_var = self.conv2d_forward(input ** 2,
                                         torch.exp(self.weight_var),
                                         torch.exp(self.bias_var))
        eps = torch.empty(output_mean.shape,
                          device=device).normal_(mean=0, std=1)
        output = output_mean + torch.sqrt(output_var + 1e-9) * eps
        return output

class my_Linear(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(my_Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        self._mode = 0
        self._verbose = False
        self._bverbose = False
        self._value = None   ## save max value
        self._index = None   ## save max position

    def setMode(self, m):
        self._mode = m

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        tweight = self.weight.clone()
        if self._mode == 2:  ## find path
            if input.shape[0] > 1:  ## max & min input
                max_input = input[0].clone().unsqueeze(0)  ## max
                min_input = input[1].clone().unsqueeze(0)  ## min
            else:  ## max only
                max_input = input.clone()
                min_input = input.clone() * 0
            maxpos = None
            if self._verbose:
                print('== input ==')
                print(input.shape)
                print(input)
                print('== weight ==')
                print(self.weight.shape)
                print(self.weight)
                print('== bias ==')
                print(self.bias)
            tx = []
            tx_min = []
            ws = self.weight.shape
            bias = self.bias.clone()
            bias *= 0
            print('linear max node : ', ws[1], file=sys.stderr)
            for py in range(ws[1]):  ## iterate over input-feature columns
                tweight *= 0
                tweight[:, py] = self.weight[:, py].data
                tx.append(F.linear(max_input, tweight, bias))
                tx_min.append(F.linear(min_input, tweight, bias))
                if py % 100 == 0:
                    print('processed node : %d \r' % py, file=sys.stderr,
                          end='')
                if self._verbose:
                    print('===iter ', py, ' ===')
                    print(tweight)
                    print(tx[py])
                    print(tx_min[py])
            ## make maximum result
            maxv = torch.max(torch.stack(tx + tx_min), axis=0)
            minv = torch.min(torch.stack(tx + tx_min), axis=0)
            self._value = maxv[0].data
            self._value_min = minv[0].data
            maxi = maxv[1].data
            maxi[maxi >= ws[1]] *= -1
            maxi[maxi < 0] += (ws[1] - 1)  ## so indices start from -1
            self._index = maxi.data
            mini = minv[1].data
            mini[mini >= ws[1]] *= -1
            mini[mini < 0] += (ws[1] - 1)
            self._index_min = mini.data
            if self._verbose:
                # print(torch.stack(tx + tx_min))
                print(self._value)
                print(self._index)
                print(self._value_min)
                print(self._index_min)
            return torch.cat([self._value, self._value_min])
        elif self._mode == 1:  ## normal mode
            return F.linear(input, self.weight, self.bias)
        else:
            return F.linear(input, self.weight, self.bias)

    def getValue(self, pos):
        if pos >= 0:
            v = self._value.flatten()[pos]  ## value at position
        else:
            npos = -1 * (pos + 1)  ## negative indices begin from -1
            v = self._value_min.flatten()[npos]
        return v

    def getIndex(self, pos):
        if pos >= 0:
            tpos = self._index.flatten()[pos].item()
        else:
            npos = -1 * (pos + 1)  ## negative indices begin from -1
            tpos = self._index_min.flatten()[npos].item()
        return tpos

    def getOutShape(self):
        if self._value is None:
            return None
        return self._value.shape

    def getWeight(self, cpos, upos):  ## cpos: current pos, upos: under pos
        if cpos < 0:
            cpos = -1 * (cpos + 1)
        if upos < 0:
            upos = -1 * (upos + 1)
        return self.weight[cpos, upos]

    def backward(self, input):
        ## 1. use the last tensor (upper-layer result)
        current_pos = int(input[-1, 0].item())  ## current position
        current_val = self.getValue(current_pos)
        input[-1, 1] = current_val  ## set current val
        ## 2. make under-layer information
        under_pos = self.getIndex(current_pos)
        under_out = torch.tensor([[under_pos, current_val, 0.0, 0.0]])
        # for saving weight
        weight = self.getWeight(current_pos, under_pos)
        input[-1, 2] = weight.data
        out = torch.cat([input, under_out], dim=0)
        if self._bverbose:
            print('=== linear backward ===')
            print('selected class = ', current_pos)
            print('max value = ', current_val)
            print('position in under layer = ', under_pos)
            print('used weight = ', weight)
            print('-- input')
            print(input)
            print('-- output')
            print(out)
            print('======')
        return out

    def back_candidate(self, path, underpath, not_input):
        p = []
        cp = int(path[0].item())
        up = int(underpath[0].item())
        for px in range(self.weight.shape[1]):
            if px == up:
                continue  ## skip the position already on the path
            tweight = self.weight[cp, px]
            p.append(torch.tensor([px, tweight, 0.0]))
            if not_input:
                p.append(torch.tensor([-1 * (px + 1), tweight, 0.0]))
        return p

    def path_forward(self, input_val, path):
        cpos = int(path[0].item())  # path = [cpos, value, weight]
        if input_val is None:
            return self.getValue(cpos)
        cweight = path[2]
        return input_val * cweight

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None)

class DenseFCLayer(torch.nn.Module):
    def __init__(self, n_inputs=None, n_outputs=None,
                 weights: torch.Tensor = None, use_biases=True,
                 activation=None):
        super(DenseFCLayer, self).__init__()
        if n_inputs is not None and n_outputs is not None:
            self.n_inputs = n_inputs
            self.n_outputs = n_outputs
            self._activation = activation
            self._initial_weights = None
            self._weights = Parameter(torch.Tensor(n_inputs, n_outputs))
            self._init_weights()
            self._mask = torch.ones_like(self._weights)
            self._initial_weights = self._weights.clone()
            self.use_biases = use_biases
            if self.use_biases:
                self._biases = Parameter(torch.Tensor(n_outputs))
                self._init_biases()
        elif weights is not None:
            self.n_inputs = weights.size(0)
            self.n_outputs = weights.size(1)
            self._activation = activation
            self._initial_weights = weights
            self._weights = Parameter(weights)
            self._mask = torch.ones_like(self._weights)
            self.use_biases = use_biases
            self._biases = Parameter(torch.Tensor(self.n_outputs))
            self._init_biases()
        else:
            raise ValueError(
                "DenseFCLayer accepts either n_inputs/n_outputs or weights")

    def _init_weights(self):
        # Note the difference between init functions
        # torch.nn.init.xavier_normal_(self._weights)
        # torch.nn.init.xavier_uniform_(self._weights)
        # torch.nn.init.kaiming_normal_(self._weights)
        torch.nn.init.kaiming_uniform_(self._weights)

    def _init_biases(self):
        torch.nn.init.zeros_(self._biases)

    def prune_by_threshold(self, thr):
        self._mask *= (torch.abs(self._weights) >= thr).float()

    def prune_by_rank(self, rank):
        weights_val = self._weights[self._mask == 1]
        sorted_abs_weights = torch.sort(torch.abs(weights_val))[0]
        thr = sorted_abs_weights[rank]
        self.prune_by_threshold(thr)

    def prune_by_pct(self, pct):
        prune_idx = int(self.n_weights * pct)
        self.prune_by_rank(prune_idx)

    def prune_by_pct_taylor(self, pct):
        prune_idx = int(self.n_weights * pct)
        # by abs val
        wg = torch.abs(self._weights[self._mask == 1]
                       * self._weights.grad[self._mask == 1])
        sorted_wg = torch.sort(wg)[0]
        thr = sorted_wg[prune_idx]
        print(thr)
        self._mask *= (torch.abs(self._weights * self._weights.grad)
                       > thr).float()
        # by val
        # wg = self._weights[self._mask == 1] * self._weights.grad[self._mask == 1]
        # sorted_wg = torch.sort(wg)[0]
        # thr = sorted_wg[prune_idx]
        # self._mask *= (self._weights * self._weights.grad >= thr).float()

    def random_prune_by_pct(self, pct):
        prune_idx = int(self.n_weights * pct)
        rand = torch.rand(size=self._mask.size(), device=self._mask.device)
        rand_val = rand[self._mask == 1]
        sorted_abs_rand = torch.sort(rand_val)[0]
        thr = sorted_abs_rand[prune_idx]
        self._mask *= (rand >= thr).float()

    def reinitialize(self):
        self._weights = Parameter(self._initial_weights)
        self._init_biases()  # biases are reinitialized

    def to_sparse(self) -> SparseFCLayer:
        return SparseFCLayer((self._weights * self._mask).t().to_sparse(),
                             self._biases.reshape((-1, 1)),
                             self._activation)

    @classmethod
    def from_sparse(cls, s_layer: SparseFCLayer):
        return cls(weights=s_layer.weights.t().to_dense(),
                   activation=s_layer.activation)

    def to_device(self, device: torch.device):
        self._initial_weights = self._initial_weights.to(device)
        self._mask = self._mask.to(device)

    def forward(self, inputs: torch.Tensor, use_mask=True):
        masked_weights = self._weights
        if use_mask:
            masked_weights = self._weights * self._mask
        if self.use_biases:
            ret = torch.addmm(self._biases, inputs, masked_weights)
        else:
            ret = torch.mm(inputs, masked_weights)
        return ret if self._activation is None else self._activation(ret)

    @property
    def mask(self):
        return self._mask

    @property
    def weights(self):
        return self._weights

    @property
    def activation(self):
        return self._activation

    @property
    def n_weights(self):
        return torch.nonzero(self._mask).size(0)

    @property
    def biases(self):
        if self.use_biases:
            return self._biases
        else:
            return None

    def __str__(self):
        return "DenseFCLayer with size {} and activation {}".format(
            (self.n_inputs, self.n_outputs), self._activation)

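# Minimal sketch (illustrative tensors only, not the class above):
# prune_by_pct combines prune_by_rank and prune_by_threshold to zero out the
# smallest fraction of the still-active weights, keeping a binary mask that
# the masked forward pass then multiplies in.
import torch

torch.manual_seed(0)
weights = torch.randn(4, 4)
mask = torch.ones_like(weights)

pct = 0.5
remaining = weights[mask == 1].abs()
threshold = torch.sort(remaining)[0][int(remaining.numel() * pct)]
mask *= (weights.abs() >= threshold).float()

print(int(mask.sum().item()), "weights survive out of", weights.numel())
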
class my_Linear(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(my_Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        self._mode = 0
        self._verbose = False
        self._bverbose = True
        self._value = None   ## save max value
        self._index = None   ## save max position

    def setMode(self, m):
        self._mode = m

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        tweight = self.weight.clone()
        if self._mode == 2:  ## find path
            maxpos = None
            if self._verbose:
                print('== input ==')
                print(input.shape)
                print(input)
                print('== weight ==')
                print(self.weight.shape)
                print(self.weight)
                print('== bias ==')
                print(self.bias)
            tx = []
            ws = self.weight.shape
            bias = self.bias.clone()
            bias *= 0
            print('linear max node : ', ws[1], file=sys.stderr)
            for py in range(ws[1]):  ## iterate over input-feature columns
                tweight *= 0
                tweight[:, py] = self.weight[:, py].data
                tx.append(F.linear(input, tweight, bias))
                if py % 1000 == 0:
                    print('processed node : %d \r' % py, file=sys.stderr,
                          end='')
                if self._verbose:
                    print('===iter ', py, ' ===')
                    print(tweight)
                    print(tx[py])
            ## make maximum result
            ts = torch.stack(tx)
            maxv = torch.max(ts, axis=0)
            self._value = maxv[0].data
            self._index = maxv[1].data
            if self._verbose:
                print(self._value)
                print(self._index)
            return self._value
        elif self._mode == 1:  ## normal mode
            return F.linear(input, self.weight, self.bias)
        else:
            return F.linear(input, self.weight, self.bias)

    def getValue(self, pos):
        return self._value.flatten()[pos]  ## value at position

    def getIndex(self, pos):
        tpos = self._index.flatten()[pos].item()
        return tpos

    def getOutShape(self):
        return self._value.shape

    def backward(self, input):
        ## use the last tensor (upper-layer result)
        current_pos = int(input[-1, 0].item())  ## current position
        current_val = self.getValue(current_pos)
        under_pos = self.getIndex(current_pos)
        under_out = torch.tensor([[under_pos, current_val, 0]])
        out = torch.cat([input, under_out], dim=0)
        if self._bverbose:
            print('=== linear backward ===')
            print('selected class = ', current_pos)
            print('max value = ', current_val)
            print('position in under layer = ', under_pos)
            print('-- input')
            print(input)
            print('-- output')
            print(out)
            print('======')
        return out

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None)

class group_relaxed_SCAD_Dense(Module):
    """Implementation of the group relaxed SCAD regularization for the
    input units of a fully connected layer."""

    def __init__(self, in_features, out_features, bias=True, lamba=1.,
                 alpha=3.7, beta=4.0, weight_decay=1., **kwargs):
        """
        :param in_features: input dimensionality
        :param out_features: output dimensionality
        :param bias: whether we use bias
        :param lamba: strength of the SCAD regularization
        """
        super(group_relaxed_SCAD_Dense, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(in_features, out_features))
        self.u = torch.rand(in_features, out_features)
        if torch.cuda.is_available():
            self.u = self.u.to('cuda')
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.lamba = lamba
        self.alpha = alpha
        self.beta = beta
        self.lamba1 = self.lamba / self.beta
        self.weight_decay = weight_decay
        self.floatTensor = (torch.FloatTensor if not torch.cuda.is_available()
                            else torch.cuda.FloatTensor)
        self.reset_parameters()
        print(self)

    def reset_parameters(self):
        init.kaiming_normal_(self.weight, mode='fan_out')
        if self.bias is not None:
            self.bias.data.normal_(0, 1e-2)

    def constrain_parameters(self, **kwargs):
        self.u = self.weight.clone()
        s = Softshrink(self.lamba1)
        # soft-shrink values whose absolute value is at most 2*lamba1
        shrink_value = s(self.weight.data)
        small = self.weight.abs() <= 2 * self.lamba1
        self.u[small] = shrink_value[small]
        # modify values whose absolute values are between 2*lamba1 and
        # alpha*lamba1
        modify_weight = self.weight.data
        modify_weight = ((self.alpha - 1) * modify_weight
                         - modify_weight.sign() * (self.alpha * self.lamba1)
                         ) / (self.alpha - 2)
        mid = ((self.weight.abs() > 2 * self.lamba1)
               & (self.weight.abs() <= self.alpha * self.lamba1))
        self.u[mid] = modify_weight[mid]

    def grow_beta(self, growth_factor):
        self.beta = self.beta * growth_factor
        self.lamba1 = self.lamba / self.beta

    def _reg_w(self, **kwargs):
        logpw = (-self.beta * torch.sum(0.5 * self.weight.add(-self.u).pow(2))
                 - self.lamba * np.sqrt(self.out_features)
                 * torch.sum(torch.pow(torch.sum(self.weight.pow(2), 1), 0.5)))
        logpb = 0
        if self.bias is not None:
            logpb = -torch.sum(self.weight_decay * .5 * (self.bias.pow(2)))
        return logpw + logpb

    def regularization(self):
        return self._reg_w()

    def count_zero_u(self):
        total = np.prod(self.u.size())
        zero = total - self.u.nonzero().size(0)
        return zero

    def count_zero_w(self):
        return torch.sum((self.weight.abs() < 1e-5).int()).item()

    def count_weight(self):
        return np.prod(self.u.size())

    def count_active_neuron(self):
        return torch.sum(
            torch.sum(self.weight.abs() / self.out_features, 1) > 1e-5).item()

    def count_total_neuron(self):
        return self.in_features

    def count_expected_flops_and_l0(self):
        ppos = torch.sum(self.weight.abs() > 0.000001).item()
        expected_flops = (2 * ppos - 1) * self.out_features
        expected_l0 = ppos * self.out_features
        if self.bias is not None:
            expected_flops += self.out_features
            expected_l0 += self.out_features
        return expected_flops, expected_l0

    def forward(self, input):
        output = input.mm(self.weight)
        if self.bias is not None:
            output.add_(self.bias.view(1, self.out_features).expand_as(output))
        return output

    def __repr__(self):
        return (self.__class__.__name__ + ' ('
                + str(self.in_features) + ' -> '
                + str(self.out_features) + ', lambda: '
                + str(self.lamba) + ')')

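# Standalone sketch (function name is illustrative): the element-wise SCAD
# thresholding used in constrain_parameters, written as a free function:
# soft-threshold below 2*lam, interpolate between 2*lam and alpha*lam, and
# leave larger entries untouched.
import torch

def scad_threshold(w, lam, alpha=3.7):
    u = w.clone()
    soft = torch.sign(w) * torch.clamp(w.abs() - lam, min=0.0)
    mid = ((alpha - 1) * w - torch.sign(w) * alpha * lam) / (alpha - 2)
    u = torch.where(w.abs() <= 2 * lam, soft, u)
    u = torch.where((w.abs() > 2 * lam) & (w.abs() <= alpha * lam), mid, u)
    return u

w = torch.tensor([-3.0, -1.5, -0.2, 0.1, 0.8, 2.5, 5.0])
print(scad_threshold(w, lam=0.5))
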
class ElementWiseConv2d(nn.Module):
    """Modified conv with masks for weights."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=False,
                 mask_init='uniform', mask_scale=1e-2,
                 threshold_fn='binarizer', threshold=0.0):
        super(ElementWiseConv2d, self).__init__()
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        self.mask_scale = mask_scale
        self.mask_init = mask_init

        if in_channels % groups != 0:
            raise ValueError('in_channels must be divisible by groups')
        if out_channels % groups != 0:
            raise ValueError('out_channels must be divisible by groups')
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.transposed = False
        self.output_padding = _pair(0)
        self.groups = groups
        # no bias is used for now (see the place365 weight below)
        self.register_parameter('bias', None)

        # imagenet pretrained weight
        self.imagenet_weight = Parameter(
            torch.Tensor(out_channels, in_channels // groups, *kernel_size),
            requires_grad=True)
        # place365 weight, no bias for now
        self.place365_weight = Parameter(
            torch.Tensor(out_channels, in_channels // groups, *kernel_size),
            requires_grad=True)

        # Initialize real-valued mask weights.
        self.mask_real = self.imagenet_weight.data.new(
            self.imagenet_weight.size())
        if mask_init == '1s':
            self.mask_real.fill_(mask_scale)
        elif mask_init == 'uniform':
            self.mask_real.uniform_(-1 * mask_scale, mask_scale)
        # mask_real is now a trainable parameter.
        self.mask_real = Parameter(self.mask_real)

        '''
        # changed for auto threshold
        self.threshold = nn.Parameter(torch.Tensor([threshold]),
                                      requires_grad=False)
        '''

        # Initialize the thresholder.
        if threshold_fn == 'binarizer':
            print('Calling binarizer with threshold:', threshold)
            self.threshold_fn = Binarizer(threshold=threshold)
        elif threshold_fn == 'ternarizer':
            print('Calling ternarizer with threshold:', threshold)
            self.threshold_fn = Ternarizer(threshold=threshold)

    def forward(self, input):
        # Get binarized/ternarized mask from real-valued mask.
        # mask_thresholded = self.threshold_fn(self.mask_real)
        # mask_thresholded = torch.sigmoid(self.mask_real)
        prob_data = self.mask_real.clone()
        prob_data[self.mask_real.le(0.5)] = 0
        prob_data[self.mask_real.gt(0.5)] = 1
        # straight-through estimator: the hard mask is used in the forward
        # pass while gradients flow to mask_real
        mask_thresholded = ((prob_data - self.mask_real).detach()
                            + self.mask_real)

        # changed for auto threshold
        # mask_thresholded = Binarizer_auto()(self.mask_real + self.threshold)

        # Mask weights with above mask.
        weight_combined = (mask_thresholded * self.place365_weight
                           + (1 - mask_thresholded) * self.imagenet_weight)
        # weight_combined = self.place365_weight

        # Perform conv using modified weight.
        return F.conv2d(input, weight_combined, None, self.stride,
                        self.padding, self.dilation, self.groups)

    def __repr__(self):
        s = ('{name} ({in_channels}, {out_channels}, '
             'kernel_size={kernel_size}, stride={stride}')
        if self.padding != (0,) * len(self.padding):
            s += ', padding={padding}'
        if self.dilation != (1,) * len(self.dilation):
            s += ', dilation={dilation}'
        if self.output_padding != (0,) * len(self.output_padding):
            s += ', output_padding={output_padding}'
        if self.groups != 1:
            s += ', groups={groups}'
        if self.bias is None:
            s += ', bias=False'
        s += ')'
        return s.format(name=self.__class__.__name__, **self.__dict__)

    def _apply(self, fn):
        for module in self.children():
            module._apply(fn)
        for param in self._parameters.values():
            if param is not None:
                # Variables stored in modules are graph leaves, and we don't
                # want to create copy nodes, so we have to unpack the data.
                param.data = fn(param.data)
                if param._grad is not None:
                    param._grad.data = fn(param._grad.data)
        for key, buf in self._buffers.items():
            if buf is not None:
                self._buffers[key] = fn(buf)
        self.imagenet_weight.data = fn(self.imagenet_weight.data)
        return self