def retrain(self, finetuned_state, max_ep, op_cfg, sc_cfg, retrain_only_cutout):
    init_params(model=self.model, output=self.lg.info)
    if not retrain_only_cutout:
        self.agent.load_state(finetuned_state['agent'])
        # Log the final augmentation probabilities of the loaded agent.
        for t in [-10, 10]:
            self.g_tb_lg.add_scalars('final_probs', self.agent.get_prob_dict(), t)
        self.g_tb_lg.add_histogram('final_probs_dist', self.agent.get_prob_tensor(), 0)
    self._train_with_aug(
        max_iters=self.full_train_iters if retrain_only_cutout else self.auged_full_train_iters,
        loader=self.full_train_ld if retrain_only_cutout else self.auged_full_train_ld,
        sync_mid=False, lsmooth=True,
        max_ep=max_ep, op_cfg=op_cfg, sc_cfg=sc_cfg,
        save_mode='best', prefix='re')

def pretrain(self, max_ep, op_cfg, sc_cfg, sync_mid, lsmooth):
    init_params(model=self.model, output=self.lg.info)
    self.agent.random_initialize()
    pretrained_state = self._train_with_aug(
        max_iters=self.auged_sub_train_iters,
        loader=self.auged_sub_train_ld,
        sync_mid=sync_mid, lsmooth=lsmooth,
        max_ep=max_ep, op_cfg=op_cfg, sc_cfg=sc_cfg,
        save_mode='last', prefix='pre')
    return pretrained_state

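# The `lsmooth` flag passed to `_train_with_aug` above presumably toggles a
# label-smoothed cross-entropy loss; the loss actually used inside
# `_train_with_aug` is not shown here. Below is a minimal, self-contained
# sketch of the standard label-smoothing formulation, for reference only.
import torch
import torch.nn.functional as F

def label_smooth_ce(logits, target, eps=0.1):
    """Cross-entropy where the one-hot target is mixed with a uniform
    distribution over all classes (weight `eps` on the uniform part)."""
    logp = F.log_softmax(logits, dim=1)
    nll = -logp.gather(1, target.unsqueeze(1)).squeeze(1)   # per-sample NLL
    smooth = -logp.mean(dim=1)                               # uniform part
    return ((1. - eps) * nll + eps * smooth).mean()

# Hypothetical usage: with eps=0 the loss reduces to plain cross-entropy.
logits, target = torch.randn(4, 10), torch.randint(0, 10, (4,))
assert torch.allclose(label_smooth_ce(logits, target, eps=0.),
                      F.cross_entropy(logits, target))
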
def __init__(self, in_shape=(32, 32, 3), num_classes=10, verbose=True,
             arch='cifar', no_weights=False, init_weights=None,
             dropout_rate=0.25):
    super(ZenkeNet, self).__init__(num_classes, verbose)

    assert in_shape[0] == 32 and in_shape[1] == 32
    self._in_shape = in_shape

    assert arch in ZenkeNet._architectures.keys()
    self._param_shapes = ZenkeNet._architectures[arch]
    # The last weight matrix and bias vector belong to the output layer, so
    # their first dimension must match the number of classes.
    self._param_shapes[-2][0] = num_classes
    self._param_shapes[-1][0] = num_classes

    assert init_weights is None or no_weights is False
    self._no_weights = no_weights

    self._use_dropout = dropout_rate != -1

    self._has_bias = True
    self._has_fc_out = True
    # We need to make sure that the last 2 entries of `weights` correspond
    # to the weight matrix and bias vector of the last layer.
    self._mask_fc_out = True
    # We don't use any output non-linearity.
    self._has_linear_out = True

    self._num_weights = MainNetInterface.shapes_to_num_weights(
        self._param_shapes)
    if verbose:
        print('Creating a ZenkeNet with %d weights' % self._num_weights
              + (', that uses dropout.' if self._use_dropout else '.'))

    if self._use_dropout:
        if dropout_rate > 0.5:
            # FIXME not a pretty solution, but we aim to follow the
            # original paper.
            raise ValueError('Dropout rate must be smaller than or equal '
                             'to 0.5.')
        self._drop_conv = nn.Dropout2d(p=dropout_rate)
        self._drop_fc1 = nn.Dropout(p=dropout_rate * 2.)

    self._layer_weight_tensors = nn.ParameterList()
    self._layer_bias_vectors = nn.ParameterList()

    if no_weights:
        self._weights = None
        self._hyper_shapes_learned = self._param_shapes
        self._hyper_shapes_learned_ref = \
            list(range(len(self._param_shapes)))
        self._is_properly_setup()
        return

    ### Define and initialize network weights.
    # Each even-indexed entry of this list contains a weight tensor and
    # each odd-indexed entry the corresponding bias vector.
    self._weights = nn.ParameterList()

    for i, dims in enumerate(self._param_shapes):
        self._weights.append(nn.Parameter(torch.Tensor(*dims),
                                          requires_grad=True))
        if i % 2 == 0:
            self._layer_weight_tensors.append(self._weights[i])
        else:
            assert len(dims) == 1
            self._layer_bias_vectors.append(self._weights[i])

    if init_weights is not None:
        assert len(init_weights) == len(self._param_shapes)
        for i in range(len(init_weights)):
            assert np.all(np.equal(list(init_weights[i].shape),
                                   list(self._weights[i].shape)))
            self._weights[i].data = init_weights[i]
    else:
        for i in range(len(self._layer_weight_tensors)):
            init_params(self._layer_weight_tensors[i],
                        self._layer_bias_vectors[i])

    self._is_properly_setup()

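# `MainNetInterface.shapes_to_num_weights` is not reproduced here, but the
# parameter count it returns is presumably just the sum of products over all
# entries of `_param_shapes`. A self-contained sketch with hypothetical shapes
# (the real ones come from `ZenkeNet._architectures[arch]`), laid out in the
# same alternating (weight tensor, bias vector) order:
import numpy as np

param_shapes = [[32, 3, 3, 3], [32],    # first conv layer: weights, biases
                [10, 128], [10]]        # output fc layer: weights, biases
num_weights = int(np.sum([np.prod(s) for s in param_shapes]))
assert num_weights == 32 * 3 * 3 * 3 + 32 + 10 * 128 + 10   # = 2186
# Setting `_param_shapes[-2][0]` and `_param_shapes[-1][0]` to `num_classes`
# (as done above) resizes exactly this last weight/bias pair.
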
def __init__(self, out_size, num_layers, num_filters, kernel_size, sa_units,
             input_dim, use_batch_norm, use_spectral_norm, no_theta,
             init_theta):
    # FIXME find a way using super to handle multiple inheritance.
    #super(SAHnetPart, self).__init__()
    nn.Module.__init__(self)
    CLHyperNetInterface.__init__(self)

    assert init_theta is None or not no_theta

    if use_spectral_norm:
        raise NotImplementedError('Spectral normalization not yet ' +
                                  'implemented for this hypernetwork type.')
    if use_batch_norm:
        raise NotImplementedError('Batch normalization not yet ' +
                                  'implemented for this hypernetwork type.')

    # FIXME task embeddings are currently maintained outside of this class.
    self._target_shapes = out_size
    self._task_embs = None

    self._size_ext_input = input_dim
    self._num_outputs = np.prod(out_size)

    if sa_units is None:
        sa_units = []
    self._sa_units_inds = sa_units
    self._use_batch_norm = use_batch_norm

    assert num_layers > 0  # Initial fully-connected layer must exist.
    assert num_filters is None or len(num_filters) == num_layers - 1
    assert len(out_size) == 2 or len(out_size) == 3
    #assert(num_layers-1 not in sa_units)
    assert len(sa_units) == 0 or np.max(sa_units) < num_layers - 1

    out_channels = 1 if len(out_size) == 2 else out_size[2]

    if num_filters is None:
        num_filters = [128] * (num_layers - 1)
        multipliers = np.power(2, range(num_layers - 2, -1, -1)).tolist()
        num_filters = [e1 * e2 for e1, e2 in zip(num_filters, multipliers)]
    num_filters.append(out_channels)

    if kernel_size is None:
        kernel_size = 5
    if not isinstance(kernel_size, list):
        kernel_size = [kernel_size, kernel_size]
    if len(kernel_size) == 2:
        kernel_size = [kernel_size] * (num_layers - 1)
    else:
        for i, tup in enumerate(kernel_size):
            if not isinstance(tup, list):
                kernel_size[i] = [tup, tup]

    print('Building a self-attention generator with %d layers and an '
          % num_layers + 'output shape of %s.' % str(out_size))

    ### Compute strides and pads of all transpose conv layers.
    # Keep in mind the formula:
    #     W_o = S * (W_i - 1) - 2 * P + K + P_o
    # S   - Stride
    # P   - Padding
    # P_o - Output padding
    # K   - Kernel size
    strides = [[2, 2] for _ in range(num_layers - 1)]
    pads = [[0, 0] for _ in range(num_layers - 1)]
    out_pads = [[0, 0] for _ in range(num_layers - 1)]
    # Layer sizes.
    sizes = [[out_size[0], out_size[1]]] * (num_layers - 1)

    w = out_size[0]
    h = out_size[1]

    def compute_pads(w, k, s):
        """Compute paddings.

        Given the equation
            W_o = S * (W_i - 1) - 2 * P + K + P_o
        paddings and output paddings are chosen such that it holds:
            W_o = S * W_i

        Args:
            w: Size of the output dimension.
            k: Kernel size.
            s: Stride.

        Returns:
            Padding, output padding.
        """
        offset = s
        if s == 2 and (w % 2) == 1:
            offset = 3

        if ((k - offset) % 2) == 0:
            p = (k - offset) // 2
            p_out = 0
        else:
            p = int(np.ceil((k - offset) / 2))
            p_out = -(k - offset - 2 * p)

        return p, p_out

    for i in range(num_layers - 2, -1, -1):
        sizes[i] = [w, h]
        # This is a condition we set: if one of the sizes is too small,
        # we just keep the layer size.
        if w <= 4:
            strides[i][0] = 1
        if h <= 4:
            strides[i][1] = 1
        pads[i][0], out_pads[i][0] = compute_pads(w, kernel_size[i][0],
                                                  strides[i][0])
        pads[i][1], out_pads[i][1] = compute_pads(h, kernel_size[i][1],
                                                  strides[i][1])

        w = w if strides[i][0] == 1 else w // 2
        h = h if strides[i][1] == 1 else h // 2

    self._fc_out_shape = [num_filters[0], w, h]
    if num_layers > 1:
        num_filters = num_filters[1:]

    # Just a sanity check.
    for i, s in enumerate(strides):
        w = s[0] * (w - 1) + kernel_size[i][0] - 2 * pads[i][0] + out_pads[i][0]
        h = s[1] * (h - 1) + kernel_size[i][1] - 2 * pads[i][1] + out_pads[i][1]
    assert w == out_size[0] and h == out_size[1]

    # For shapes of self-maintained parameters (underlying modules, like
    # self-attention layers, maintain their own weights).
    theta_shapes_internal = []
    if no_theta:
        self._theta = None
    else:
        self._theta = nn.ParameterList()

    if init_theta is not None and len(sa_units) > 0:
        num_p = 7  # Number of param tensors per self-attention layer.
        num_sa_p = len(sa_units) * num_p
        sind = len(init_theta) - num_sa_p
        sa_init_weights = []
        for i in range(len(sa_units)):
            sa_init_weights.append(
                init_theta[sind + i * num_p:sind + (i + 1) * num_p])
        init_theta = init_theta[:sind]

    ### Initial fully-connected layer.
    num_units = np.prod(self._fc_out_shape)
    theta_shapes_internal.extend([[num_units, input_dim], [num_units]])

    print('The output shape of the fully-connected layer will be %s'
          % (str(self._fc_out_shape)))

    ### Transpose convolutional layers.
    self._sa_units = torch.nn.ModuleList()
    prev_nfilters = self._fc_out_shape[0]

    sa_ind = 0
    if 0 in sa_units:
        print('A self-attention unit is added after the initial fc layer.')
        w_init = None
        if init_theta is not None:
            w_init = sa_init_weights[sa_ind]
        self._sa_units.append(SelfAttnLayerV2(prev_nfilters,
            use_spectral_norm, no_weights=no_theta, init_weights=w_init))
        sa_ind += 1

    # Needed to set up transpose convolutional layers in the forward method.
    self._strides = strides
    self._pads = pads
    self._out_pads = out_pads

    for i in range(num_layers - 1):
        theta_shapes_internal.extend(
            [[prev_nfilters, num_filters[i], *kernel_size[i]],
             [num_filters[i]]])
        prev_nfilters = num_filters[i]

        msg = 'Transpose convolutional layer %d will have output ' + \
              'shape %s. It uses strides=%s, padding=%s and ' + \
              'output_padding=%s. The kernel size is %s.'
        print(msg % (i, str([num_filters[i], *sizes[i]]), str(strides[i]),
                     str(pads[i]), str(out_pads[i]), str(kernel_size[i])))

        if (i + 1) in sa_units:
            print('A self-attention unit is added after transpose conv ' +
                  'layer %d.' % i)
            w_init = None
            if init_theta is not None:
                w_init = sa_init_weights[sa_ind]
            self._sa_units.append(SelfAttnLayerV2(num_filters[i],
                use_spectral_norm, no_weights=no_theta, init_weights=w_init))
            sa_ind += 1

    if not no_theta:
        for i, dims in enumerate(theta_shapes_internal):
            self._theta.append(nn.Parameter(torch.Tensor(*dims),
                                            requires_grad=True))

        if init_theta is not None:
            assert len(init_theta) == len(theta_shapes_internal)
            for i in range(len(init_theta)):
                assert np.all(np.equal(list(init_theta[i].shape),
                                       list(self._theta[i].shape)))
                self._theta[i].data = init_theta[i]
        else:
            for i in range(0, len(self._theta), 2):
                init_params(self._theta[i], self._theta[i + 1])

    self._theta_shapes = theta_shapes_internal
    for unit in self._sa_units:
        self._theta_shapes.extend(unit.weight_shapes)

    self._num_weights = np.sum([np.prod(s) for s in self._theta_shapes])
    print('Total number of parameters in the self-attention generator: %d'
          % self._num_weights)

    self._is_properly_setup()

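# Sanity check of the padding rule against PyTorch itself: for stride 2 the
# chosen padding/output-padding should make a transpose convolution exactly
# double the spatial size (W_o = S * W_i), matching the assertion above.
# This is a standalone sketch using a copy of `compute_pads`; the target
# widths and the kernel size below are illustrative, not from a real config.
import numpy as np
import torch
import torch.nn as nn

def compute_pads(w, k, s):
    # Identical rule to the nested helper in the constructor above.
    offset = s
    if s == 2 and (w % 2) == 1:
        offset = 3
    if ((k - offset) % 2) == 0:
        p = (k - offset) // 2
        p_out = 0
    else:
        p = int(np.ceil((k - offset) / 2))
        p_out = -(k - offset - 2 * p)
    return p, p_out

for w_out in (32, 31):                 # even and odd target widths
    w_in = w_out // 2                  # a stride-2 layer halves the size
    p, p_out = compute_pads(w_out, k=5, s=2)
    layer = nn.ConvTranspose2d(1, 1, kernel_size=5, stride=2,
                               padding=p, output_padding=p_out)
    y = layer(torch.zeros(1, 1, w_in, w_in))
    assert y.shape[-1] == w_out        # e.g. 16 -> 32 and 15 -> 31
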
def __init__(self, in_dim, use_spectral_norm, no_weights=False,
             init_weights=None):
    """Initialize the self-attention layer.

    Args:
        in_dim: Number of input channels (C).
        use_spectral_norm: Enable spectral normalization for all 1x1 conv.
            layers.
        no_weights: If set to True, no trainable parameters will be
            constructed, i.e., weights are assumed to be produced ad-hoc by
            a hypernetwork and passed to the forward function.
        init_weights (optional): This option is for convenience reasons.
            The option expects a list of parameter values that are used to
            initialize the network weights. As such, it provides a
            convenient way of initializing a network with a weight draw
            produced by the hypernetwork.
            See attribute "weight_shapes" for the format in which parameters
            should be passed.
    """
    super(SelfAttnLayerV2, self).__init__()

    assert not no_weights or init_weights is None

    if use_spectral_norm:
        raise NotImplementedError('Spectral norm not yet implemented ' +
                                  'for this layer type.')

    self.channel_in = in_dim
    self.softmax = nn.Softmax(dim=-1)

    # 1x1 convolution to generate f(x).
    query_dim = [in_dim // 8, in_dim, 1, 1]
    # 1x1 convolution to generate g(x).
    key_dim = [in_dim // 8, in_dim, 1, 1]
    # 1x1 convolution to generate h(x).
    value_dim = [in_dim, in_dim, 1, 1]
    gamma_dim = [1]

    self._weight_shapes = [query_dim, [query_dim[0]], key_dim, [key_dim[0]],
                           value_dim, [value_dim[0]], gamma_dim]

    if no_weights:
        self._weights = None
        return

    ### Define and initialize network weights.
    self._weights = nn.ParameterList()

    for i, dims in enumerate(self._weight_shapes):
        self._weights.append(nn.Parameter(torch.Tensor(*dims),
                                          requires_grad=True))

    if init_weights is not None:
        assert len(init_weights) == len(self._weight_shapes)
        for i in range(len(init_weights)):
            assert np.all(np.equal(list(init_weights[i].shape),
                                   list(self._weights[i].shape)))
            self._weights[i].data = init_weights[i]
    else:
        for i in range(0, len(self._weights) - 1, 2):
            init_params(self._weights[i], self._weights[i + 1])
        # The gamma parameter is on purpose initialized to zero, as
        # described in the paper.
        nn.init.constant_(self._weights[-1], 0)

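# The forward pass of `SelfAttnLayerV2` is not shown above. The sketch below
# only illustrates how the seven tensors in `weight_shapes` are typically
# consumed in SAGAN-style self-attention (1x1 convs for f/g/h, softmax
# attention over spatial positions, residual scaled by gamma). The sizes and
# the random stand-in weights are illustrative, not the layer's actual code.
import torch
import torch.nn.functional as F

B, C, H, W = 2, 64, 8, 8
N = H * W
x = torch.randn(B, C, H, W)

# Stand-ins matching `weight_shapes` for in_dim=64.
w_q, b_q = torch.randn(C // 8, C, 1, 1), torch.randn(C // 8)
w_k, b_k = torch.randn(C // 8, C, 1, 1), torch.randn(C // 8)
w_v, b_v = torch.randn(C, C, 1, 1), torch.randn(C)
gamma = torch.zeros(1)                       # zero init as in the constructor

q = F.conv2d(x, w_q, b_q).view(B, -1, N)     # f(x): B x C/8 x N
k = F.conv2d(x, w_k, b_k).view(B, -1, N)     # g(x): B x C/8 x N
v = F.conv2d(x, w_v, b_v).view(B, -1, N)     # h(x): B x C   x N

attn = torch.softmax(torch.bmm(q.transpose(1, 2), k), dim=-1)   # B x N x N
out = torch.bmm(v, attn.transpose(1, 2)).view(B, C, H, W)
y = gamma * out + x                          # gamma == 0 -> identity at init
assert y.shape == x.shape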