from collections import OrderedDict

import six
import torch
from torch import nn
import torch.nn.functional as F

# repo-local helpers (make_divisible, get_op, _get_channel_mask,
# FlexiblePointLinear, FlexibleLayer, utils) are assumed to be importable
# from the surrounding codebase


def set_mask(self, mask):
    # method of the flexible SE module: it calls `set_mask` on the
    # reduction/expand layers, which only the flexible layers provide
    if mask is None:
        return
    channel = mask.sum().item()
    mid_channel = make_divisible(channel // self.reduction, 8)
    # derive the mask for the squeezed (hidden) channels from the expand weights
    exp_mask = _get_channel_mask(self.se.expand.weight.data, mid_channel)
    self.se.reduction.set_mask(mask, exp_mask)
    self.se.expand.set_mask(exp_mask, mask)
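# `_get_channel_mask` (used in `set_mask` above) is not shown in this snippet.
# Below is a minimal sketch of a plausible implementation, assuming the hidden
# channels are ranked by the L1 norm of the expand layer's weights; the actual
# helper in the codebase may differ:
def _get_channel_mask_sketch(weight, num_channels):
    # weight has shape (out_channels, hidden_channels, 1, 1); score each
    # hidden channel (dim 1) by the total absolute weight it feeds forward
    importance = weight.abs().sum(dim=(0, 2, 3))
    mask = torch.zeros(importance.size(0), dtype=torch.bool, device=weight.device)
    mask[importance.topk(num_channels).indices] = True  # keep the top-k channels
    return mask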
def __init__(self, channel, reduction=4, reduction_layer=None, expand_layer=None):
    super(SEModule, self).__init__()
    self.channel = channel
    self.reduction = reduction
    mid_channel = make_divisible(channel // reduction, 8)
    # squeeze-and-excite branch: 1x1 reduction -> ReLU -> 1x1 expand -> h_sigmoid
    self.se = nn.Sequential(OrderedDict([
        ("reduction", reduction_layer or nn.Conv2d(self.channel, mid_channel, 1, 1, 0)),
        ("relu", nn.ReLU(inplace=True)),
        ("expand", expand_layer or nn.Conv2d(mid_channel, self.channel, 1, 1, 0)),
        ("activation", get_op("h_sigmoid")()),
    ]))
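# Usage sketch for SEModule. Its forward pass is not shown above; a standard
# squeeze-and-excite forward would globally pool, run `self.se`, and rescale
# the input channel-wise. That behavior is assumed here for illustration:
se = SEModule(channel=64)                          # mid_channel = make_divisible(16, 8) = 16
x = torch.randn(2, 64, 14, 14)
scale = se.se(x.mean(dim=(2, 3), keepdim=True))    # (2, 64, 1, 1) gating values in [0, 1]
out = x * scale                                    # channel-wise recalibration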
def __init__(self, channel, reduction=4):
    mid_channel = make_divisible(channel // reduction, 8)
    reduction_layer = FlexiblePointLinear(channel, mid_channel, 1, 1, 0, bias=True)
    expand_layer = FlexiblePointLinear(mid_channel, channel, 1, 1, 0, bias=True)
    super(FlexibleSEModule, self).__init__(channel, reduction, reduction_layer, expand_layer)
    FlexibleLayer.__init__(self)
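# Usage sketch: shrink a FlexibleSEModule to a narrower sub-network by
# masking its channels. `set_mask` (defined above) expects a boolean mask
# over the full channel dimension:
flex_se = FlexibleSEModule(channel=64)
mask = torch.zeros(64, dtype=torch.bool)
mask[:48] = True          # keep the first 48 of the 64 channels
flex_se.set_mask(mask)    # hidden width recomputed as make_divisible(48 // 4, 8)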
def gradient(self, data, criterion=lambda i, l, t: nn.CrossEntropyLoss()(l, t),
             parameters=None, eval_criterions=None, mode="train",
             zero_grads=True, return_grads=True, **kwargs):
    """Get the gradient with respect to the candidate net parameters.

    Args:
        parameters (optional): if specified, can be a dict of
            param_name: param, or a list of (param_name, param) pairs.
    Returns:
        grads: a list of (param_name, gradient tensor) pairs.
    """
    self._set_mode(mode)

    if return_grads:
        active_parameters = dict(self.named_parameters())
        if parameters is not None:
            _parameters = dict(parameters)
            _addi = set(_parameters.keys()).difference(active_parameters)
            assert not _addi, \
                ("Cannot get gradient of parameters that are not active "
                 "in this candidate net: {}").format(", ".join(_addi))
        else:
            _parameters = active_parameters

    inputs, targets = data
    batch_size = inputs.size(0)
    # scale the mini-batch size inversely with the image area, so larger
    # resolutions are processed in smaller chunks
    min_image_size = min(self.super_net.search_space.image_size_choice)
    cur_image_size = self.rollout.image_size
    ratio = (min_image_size / cur_image_size) ** 2
    mini_batch_size = make_divisible(batch_size * ratio, 8)
    inputs = F.interpolate(inputs, (cur_image_size, cur_image_size),
                           mode="bilinear", align_corners=False)

    if zero_grads:
        self.zero_grad()
    # step through the batch in chunks; gradients from each mini-batch
    # accumulate in .grad, so the result corresponds to the summed loss
    for i in range(0, batch_size, mini_batch_size):
        mini_inputs = inputs[i:i + mini_batch_size]
        mini_targets = targets[i:i + mini_batch_size]
        outputs = self.forward_data(mini_inputs, mini_targets, **kwargs)
        loss = criterion(mini_inputs, outputs, mini_targets)
        loss.backward()

    if not return_grads:
        grads = None
    else:
        grads = [(k, v.grad.clone()) for k, v in six.iteritems(_parameters)
                 if v.grad is not None]
    if eval_criterions:
        # note: the eval criterions are computed on the last mini-batch only
        eval_res = utils.flatten_list([
            c(mini_inputs, outputs, mini_targets) for c in eval_criterions
        ])
        return grads, eval_res
    return grads
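# Usage sketch for `gradient`. Assumes `cand_net` is a candidate net whose
# rollout carries an `image_size`, and that the batch lives on the right
# device; all names here are illustrative:
inputs = torch.randn(32, 3, 224, 224)
targets = torch.randint(0, 1000, (32,))
grads = cand_net.gradient((inputs, targets), mode="train")
# `grads` is a list of (param_name, gradient tensor) pairs; pass
# `eval_criterions` to also get metrics computed on the last mini-batch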