Example #1
    def retrain(self, finetuned_state, max_ep, op_cfg, sc_cfg,
                retrain_only_cutout):
        init_params(model=self.model, output=self.lg.info)
        if not retrain_only_cutout:
            self.agent.load_state(finetuned_state['agent'])
            # Log the agent's final probabilities to TensorBoard.
            for t in [-10, 10]:
                self.g_tb_lg.add_scalars('final_probs',
                                         self.agent.get_prob_dict(), t)
            self.g_tb_lg.add_histogram('final_probs_dist',
                                       self.agent.get_prob_tensor(), 0)

        self._train_with_aug(
            max_iters=self.full_train_iters
            if retrain_only_cutout else self.auged_full_train_iters,
            loader=self.full_train_ld
            if retrain_only_cutout else self.auged_full_train_ld,
            sync_mid=False,
            lsmooth=True,
            max_ep=max_ep,
            op_cfg=op_cfg,
            sc_cfg=sc_cfg,
            save_mode='best',
            prefix='re')
Example #2
    def pretrain(self, max_ep, op_cfg, sc_cfg, sync_mid, lsmooth):
        init_params(model=self.model, output=self.lg.info)
        self.agent.random_initialize()
        pretrained_state = self._train_with_aug(
            max_iters=self.auged_sub_train_iters,
            loader=self.auged_sub_train_ld,
            sync_mid=sync_mid,
            lsmooth=lsmooth,
            max_ep=max_ep,
            op_cfg=op_cfg,
            sc_cfg=sc_cfg,
            save_mode='last',
            prefix='pre')
        return pretrained_state
Example #3
    def __init__(self, in_shape=(32, 32, 3),
                 num_classes=10, verbose=True, arch='cifar', no_weights=False,
                 init_weights=None, dropout_rate=0.25):
        super(ZenkeNet, self).__init__(num_classes, verbose)

        assert(in_shape[0] == 32 and in_shape[1] == 32)
        self._in_shape = in_shape

        assert(arch in ZenkeNet._architectures.keys())
        self._param_shapes = ZenkeNet._architectures[arch]
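        # The last two entries are the output layer's weight matrix and bias
        # vector; resize their leading dimension to match `num_classes`.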
        self._param_shapes[-2][0] = num_classes
        self._param_shapes[-1][0] = num_classes

        assert(init_weights is None or no_weights is False)
        self._no_weights = no_weights

        self._use_dropout = dropout_rate != -1

        self._has_bias = True
        self._has_fc_out = True
        # We need to make sure that the last 2 entries of `weights` correspond
        # to the weight matrix and bias vector of the last layer.
        self._mask_fc_out = True
        # We don't use any output non-linearity.
        self._has_linear_out = True

        self._num_weights = MainNetInterface.shapes_to_num_weights(
            self._param_shapes)
        if verbose:
            print('Creating a ZenkeNet with %d weights' % self._num_weights
                  + (' that uses dropout.' if self._use_dropout else '.'))

        if self._use_dropout:
            if dropout_rate > 0.5:
                # FIXME not a pretty solution, but we aim to follow the original
                # paper.
                raise ValueError('Dropout rate must be smaller than or '
                                 'equal to 0.5.')
            self._drop_conv = nn.Dropout2d(p=dropout_rate)
            self._drop_fc1 = nn.Dropout(p=dropout_rate * 2.)

        self._layer_weight_tensors = nn.ParameterList()
        self._layer_bias_vectors = nn.ParameterList()

        if no_weights:
            self._weights = None
            self._hyper_shapes_learned = self._param_shapes
            self._hyper_shapes_learned_ref = \
                list(range(len(self._param_shapes)))
            self._is_properly_setup()
            return

        ### Define and initialize network weights.
        # Entries at even indices of this list hold a weight tensor; entries
        # at odd indices hold the corresponding bias vector.
        self._weights = nn.ParameterList()

        for i, dims in enumerate(self._param_shapes):
            self._weights.append(nn.Parameter(torch.Tensor(*dims),
                                              requires_grad=True))

            if i % 2 == 0:
                self._layer_weight_tensors.append(self._weights[i])
            else:
                assert(len(dims) == 1)
                self._layer_bias_vectors.append(self._weights[i])

        if init_weights is not None:
            assert(len(init_weights) == len(self._param_shapes))
            for i in range(len(init_weights)):
                assert(np.all(np.equal(list(init_weights[i].shape),
                                       list(self._weights[i].shape))))
                self._weights[i].data = init_weights[i]
        else:
            for i in range(len(self._layer_weight_tensors)):
                init_params(self._layer_weight_tensors[i],
                            self._layer_bias_vectors[i])

        self._is_properly_setup()
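
A minimal usage sketch for the constructor above, assuming the defining module and its dependencies are importable; the module name `zenkenet` is a placeholder:

from zenkenet import ZenkeNet  # hypothetical module name

# CIFAR-sized input with internally maintained weights and the default
# dropout rate of 0.25.
net = ZenkeNet(in_shape=(32, 32, 3), num_classes=10, arch='cifar')

# Hypernetwork-driven variant: no weights are allocated internally; parameter
# values are expected to be supplied externally (e.g., by a hypernetwork).
net_no_weights = ZenkeNet(num_classes=10, arch='cifar', no_weights=True)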
Example #4
    def __init__(self, out_size, num_layers, num_filters, kernel_size,
                 sa_units, input_dim, use_batch_norm, use_spectral_norm,
                 no_theta, init_theta):
        # FIXME: find a way to use super() to handle multiple inheritance.
        #super(SAHnetPart, self).__init__()
        nn.Module.__init__(self)
        CLHyperNetInterface.__init__(self)

        assert (init_theta is None or not no_theta)

        if use_spectral_norm:
            raise NotImplementedError(
                'Spectral normalization not yet ' +
                'implemented for this hypernetwork type.')
        if use_batch_norm:
            raise NotImplementedError(
                'Batch normalization not yet ' +
                'implemented for this hypernetwork type.')

        # FIXME task embeddings are currently maintained outside of this class.
        self._target_shapes = out_size
        self._task_embs = None
        self._size_ext_input = input_dim
        self._num_outputs = np.prod(out_size)

        if sa_units is None:
            sa_units = []

        self._sa_units_inds = sa_units
        self._use_batch_norm = use_batch_norm

        assert (num_layers > 0)  # Initial fully-connected layer must exist.
        assert (num_filters is None or len(num_filters) == num_layers - 1)
        assert (len(out_size) == 2 or len(out_size) == 3)
        #assert(num_layers-1 not in sa_units)
        assert (len(sa_units) == 0 or np.max(sa_units) < num_layers - 1)

        out_channels = 1 if len(out_size) == 2 else out_size[2]

        if num_filters is None:
            num_filters = [128] * (num_layers - 1)
            multipliers = np.power(2, range(num_layers - 2, -1, -1)).tolist()
            num_filters = [e1 * e2 for e1, e2 in zip(num_filters, multipliers)]
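            # E.g., for num_layers=4 this yields [512, 256, 128]: the filter
            # count halves towards the output resolution.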
        num_filters.append(out_channels)

        if kernel_size is None:
            kernel_size = 5
        if not isinstance(kernel_size, list):
            kernel_size = [kernel_size, kernel_size]
        if len(kernel_size) == 2:
            kernel_size = [kernel_size] * (num_layers - 1)
        else:
            for i, tup in enumerate(kernel_size):
                if not isinstance(tup, list):
                    kernel_size[i] = [tup, tup]

        print('Building a self-attention generator with %d layers and an '
              'output shape of %s.' % (num_layers, str(out_size)))

        ### Compute strides and pads of all transpose conv layers.
        # Keep in mind the formula:
        # W_o = S * (W_i - 1) - 2 * P + K + P_o
        # S - Strides
        # P - Padding
        # P_o - Output padding
        # K - Kernel size
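        # For example, with S=2, W_i=8, K=5, P=2 and P_o=1 this gives
        # W_o = 2*(8-1) - 2*2 + 5 + 1 = 16, i.e. the spatial size doubles.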
        strides = [[2, 2] for _ in range(num_layers - 1)]
        pads = [[0, 0] for _ in range(num_layers - 1)]
        out_pads = [[0, 0] for _ in range(num_layers - 1)]
        # Layer sizes.
        sizes = [[out_size[0], out_size[1]]] * (num_layers - 1)

        w = out_size[0]
        h = out_size[1]

        def compute_pads(w, k, s):
            """Compute paddings. Given the equation
                W_o = S * (W_i - 1) - 2 * P + K + P_o
            Paddings and output paddings are chosen such that it holds:
                W_o = S * W_i

            Args:
                w: Size of output dimension.
                k: Kernel size.
                s: Stride.

            Returns:
                Padding, output padding.
            """
            offset = s
            if s == 2 and (w % 2) == 1:
                offset = 3
            if ((k - offset) % 2) == 0:
                p = (k - offset) // 2
                p_out = 0
            else:
                p = int(np.ceil((k - offset) / 2))
                p_out = -(k - offset - 2 * p)

            return p, p_out
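        # Example: compute_pads(w=16, k=5, s=2) returns (2, 1); a transpose
        # conv with input size 8, stride 2, kernel 5, padding 2 and
        # output_padding 1 then produces 2*(8-1) - 2*2 + 5 + 1 = 16 = 2*8.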

        for i in range(num_layers - 2, -1, -1):
            sizes[i] = [w, h]

            # Heuristic: if a spatial dimension is already small (<= 4), keep
            # its size fixed (stride 1) instead of halving it further.
            if w <= 4:
                strides[i][0] = 1
            if h <= 4:
                strides[i][1] = 1

            pads[i][0], out_pads[i][0] = compute_pads(w, kernel_size[i][0],
                                                      strides[i][0])
            pads[i][1], out_pads[i][1] = compute_pads(h, kernel_size[i][1],
                                                      strides[i][1])

            w = w if strides[i][0] == 1 else w // 2
            h = h if strides[i][1] == 1 else h // 2

        self._fc_out_shape = [num_filters[0], w, h]
        if num_layers > 1:
            num_filters = num_filters[1:]

        # Just a sanity check.
        for i, s in enumerate(strides):
            w = s[0] * (
                w - 1) + kernel_size[i][0] - 2 * pads[i][0] + out_pads[i][0]
            h = s[1] * (
                h - 1) + kernel_size[i][1] - 2 * pads[i][1] + out_pads[i][1]
        assert (w == out_size[0] and h == out_size[1])

        # Shapes of the parameters maintained directly by this class
        # (submodules such as the self-attention layers keep their own
        # weights).
        theta_shapes_internal = []
        if no_theta:
            self._theta = None
        else:
            self._theta = nn.ParameterList()

            if init_theta is not None and len(sa_units) > 0:
                num_p = 7  # Number of param tensors per self-attention layer.
                num_sa_p = len(sa_units) * num_p

                sind = len(init_theta) - num_sa_p

                sa_init_weights = []
                for i in range(len(sa_units)):
                    sa_init_weights.append( \
                        init_theta[sind+i*num_p:sind+(i+1)*num_p])

                init_theta = init_theta[:sind]

        ### Initial fully-connected layer.
        num_units = np.prod(self._fc_out_shape)
        theta_shapes_internal.extend([[num_units, input_dim], [num_units]])

        print('The output shape of the fully-connected layer will be %s' %
              (str(self._fc_out_shape)))

        ### Transpose Convolutional Layers.
        self._sa_units = torch.nn.ModuleList()

        prev_nfilters = self._fc_out_shape[0]

        sa_ind = 0
        if 0 in sa_units:
            print('A self-attention unit is added after the initial fc layer.')
            w_init = None
            if init_theta is not None:
                w_init = sa_init_weights[sa_ind]
            self._sa_units.append(
                SelfAttnLayerV2(prev_nfilters,
                                use_spectral_norm,
                                no_weights=no_theta,
                                init_weights=w_init))

            sa_ind += 1

        # Needed to setup transpose convolutional layers in forward method.
        self._strides = strides
        self._pads = pads
        self._out_pads = out_pads

        for i in range(num_layers - 1):
            theta_shapes_internal.extend(
                [[prev_nfilters, num_filters[i], *kernel_size[i]],
                 [num_filters[i]]])
            prev_nfilters = num_filters[i]

            msg = 'Transpose convolutional layer %d will have output ' + \
                'shape %s. It uses strides=%s, padding=%s and ' \
                'output_padding=%s. The kernel size is %s.'
            print(msg % (i, str([num_filters[i], *sizes[i]]), str(strides[i]),
                         str(pads[i]), str(out_pads[i]), str(kernel_size[i])))

            if (i + 1) in sa_units:
                print('A self-attention unit is added after transpose conv ' + \
                    'layer %d.' % i)
                w_init = None
                if init_theta is not None:
                    w_init = sa_init_weights[sa_ind]
                self._sa_units.append(
                    SelfAttnLayerV2(num_filters[i],
                                    use_spectral_norm,
                                    no_weights=no_theta,
                                    init_weights=w_init))

                sa_ind += 1

        if not no_theta:
            for i, dims in enumerate(theta_shapes_internal):
                self._theta.append(
                    nn.Parameter(torch.Tensor(*dims), requires_grad=True))

            if init_theta is not None:
                assert (len(init_theta) == len(theta_shapes_internal))
                for i in range(len(init_theta)):
                    assert (np.all(
                        np.equal(list(init_theta[i].shape),
                                 list(self._theta[i].shape))))
                    self._theta[i].data = init_theta[i]
            else:
                for i in range(0, len(self._theta), 2):
                    init_params(self._theta[i], self._theta[i + 1])

        self._theta_shapes = theta_shapes_internal
        for unit in self._sa_units:
            self._theta_shapes.extend(unit.weight_shapes)

        self._num_weights = np.sum([np.prod(s) for s in self._theta_shapes])
        print(
            'Total number of parameters in the self-attention generator: %d' %
            self._num_weights)

        self._is_properly_setup()
Example #5
    def __init__(self,
                 in_dim,
                 use_spectral_norm,
                 no_weights=False,
                 init_weights=None):
        """Initialize self-attention layer.

        Args:
            in_dim: Number of input channels (C).
            use_spectral_norm: Enable spectral normalization for all 1x1 conv.
                layers.
            no_weights: If set to True, no trainable parameters will be
                constructed, i.e., weights are assumed to be produced ad-hoc
                by a hypernetwork and passed to the forward function.
            init_weights (optional): This option is provided for convenience.
                It expects a list of parameter values that are used to
                initialize the network weights. As such, it provides a
                convenient way of initializing a network with a weight draw
                produced by the hypernetwork.
                See attribute "weight_shapes" for the format in which
                parameters should be passed.
        """
        super(SelfAttnLayerV2, self).__init__()
        assert (not no_weights or init_weights is None)
        if use_spectral_norm:
            raise NotImplementedError('Spectral norm not yet implemented ' +
                                      'for this layer type.')

        self.channel_in = in_dim

        self.softmax = nn.Softmax(dim=-1)

        # 1x1 convolution to generate f(x).
        query_dim = [in_dim // 8, in_dim, 1, 1]
        # 1x1 convolution to generate g(x).
        key_dim = [in_dim // 8, in_dim, 1, 1]
        # 1x1 convolution to generate h(x).
        value_dim = [in_dim, in_dim, 1, 1]
        gamma_dim = [1]
        self._weight_shapes = [
            query_dim, [query_dim[0]], key_dim, [key_dim[0]], value_dim,
            [value_dim[0]], gamma_dim
        ]
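        # For in_dim=64 these shapes are [8, 64, 1, 1], [8], [8, 64, 1, 1],
        # [8], [64, 64, 1, 1], [64] and [1] (the scalar gamma).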

        if no_weights:
            self._weights = None
            return

        ### Define and initialize network weights.
        self._weights = nn.ParameterList()

        for i, dims in enumerate(self._weight_shapes):
            self._weights.append(
                nn.Parameter(torch.Tensor(*dims), requires_grad=True))

        if init_weights is not None:
            assert (len(init_weights) == len(self._weight_shapes))

            for i in range(len(init_weights)):
                assert (np.all(
                    np.equal(list(init_weights[i].shape),
                             list(self._weights[i].shape))))
                self._weights[i].data = init_weights[i]
        else:
            for i in range(0, len(self._weights) - 1, 2):
                init_params(self._weights[i], self._weights[i + 1])
            # The gamma parameter is deliberately initialized to zero, as
            # described in the paper.
            nn.init.constant_(self._weights[-1], 0)
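
A minimal usage sketch for the layer above, assuming the defining module and its init_params dependency are importable; the module name `sa_layer` is a placeholder:

from sa_layer import SelfAttnLayerV2  # hypothetical module name

# Self-attention layer for 64-channel feature maps; the seven parameter
# tensors are created and initialized in __init__ (gamma starts at zero).
attn = SelfAttnLayerV2(in_dim=64, use_spectral_norm=False)

# Inspect the internally maintained parameters (query conv/bias, key
# conv/bias, value conv/bias, gamma).
for p in attn._weights:
    print(list(p.shape))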