Example #1
    def step(self, optimizer: SGD, *args, **kwargs) -> Optional[float]:  # type: ignore
        """
        :meth:`step` carries out the following two operations:

        1.  Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer``
            earlier in the iteration).  As part of the :meth:`unscale_`, gradients are checked for infs/NaNs.
        2.  If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled
            gradients.  Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params.

        ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.

        Returns the return value of ``optimizer.step(*args, **kwargs)``.

        Args:
            optimizer (torch.optim.Optimizer):  Optimizer that applies the gradients.
            args:  Any arguments.
            kwargs:  Any keyword arguments.

        .. warning::
            Closure use is not currently supported.

        Note: This is an exact copy of the step function in grad_scaler.py. If this copy is deleted then the
        unittest test_cpu_offload_and_cpu_grads fails. This is because the parent class step function calls
        the parent class unscale_ function which does not handle torch.distributed.all_reduce on cpu.
        """
        if not self._enabled:
            return optimizer.step(*args, **kwargs)

        if "closure" in kwargs:
            raise RuntimeError("Closure use is not currently supported if GradScaler is enabled.")

        self._check_scale_growth_tracker("step")  # type: ignore

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("step() has already been called since the last update().")

        retval = None

        if hasattr(optimizer, "_step_supports_amp_scaling") and optimizer._step_supports_amp_scaling:
            # This optimizer has customized scale-handling logic, so we can call optimizer.step() directly.
            # The contract with custom optimizers is that their step() should accept an additional,
            # optional grad_scaler kwarg.  We append self to the kwargs so the custom optimizer has full information:
            # it can query its own state, invoke unscale_ on itself, etc.
            retval = optimizer.step(*args, **dict(kwargs, grad_scaler=self))
            optimizer_state["stage"] = OptState.STEPPED
            return retval

        if optimizer_state["stage"] is OptState.READY:
            self.unscale_(optimizer)

        assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer."
        retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)  # type: ignore
        optimizer_state["stage"] = OptState.STEPPED
        return retval
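
For orientation, here is a minimal sketch of how a GradScaler-style ``step`` is driven inside a standard ``torch.cuda.amp`` training loop; the model, optimizer and data are placeholders, and a CUDA device is assumed.

import torch

model = torch.nn.Linear(10, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = torch.cuda.amp.GradScaler()

for _ in range(3):
    x = torch.randn(8, 10, device="cuda")
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = model(x).sum()
    # backward() runs on the scaled loss; step() unscales the gradients,
    # checks them for infs/NaNs, and applies or skips the optimizer step;
    # update() adapts the scale factor for the next iteration.
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()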
Example #2
def learn_second(network,
                 lr,
                 model,
                 examples_files,
                 total_example,
                 alpha=1.0,
                 batch_size=20):
    """
    Helper function used to optimize O2
    :param network: network model to optimize
    :param lr: learning rate
    :param model: model containing the shared data
    :param examples_files: list of files containing the examples
    :param total_example: total number of examples used for training
    :param alpha: trade-off param
    :param batch_size: size of the batch
    :return: loss value
    """

    num_batch = 0

    log.info("compute o2")
    optimizer = SGD(network.parameters(), lr)
    log.debug("read example file: {}".format("\t".join(examples_files)))
    loss_val = 0

    if alpha <= 0:
        return loss_val

    for batch in emb_utils.batch_generator(emb_utils.prepare_sentences(
            model, graph_utils.combine_example_files_iter(examples_files),
            network.transfer_fn(model.vocab)),
                                           batch_size,
                                           long_tensor=LongTensor):
        input, output = batch
        loss = (alpha * network.forward(
            input, output, negative_sampling_fn=model.negative_sample))
        loss_val += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1

        if (num_batch) % 10000 == 0:
            log.info("community embedding batches completed: {}".format(
                num_batch / (total_example / batch_size)))

    log.debug("O2 loss: {}".format(loss_val))
    return loss_val
Example #3
def learn_community(network, lr, model, nodes, beta=1.0, batch_size=20):
    """
    Helper function used to optimize O3
    :param network: model to optimize
    :param lr: learning rate
    :param model: model containing the shared data
    :param nodes: nodes on which execute the learning
    :param beta: trade-off value
    :param batch_size: size of the batch
    :return: loss value
    """

    num_batch = 0

    log.info("compute o3")
    optimizer = SGD(network.parameters(), lr)
    loss_val = 0

    if beta <= 0.:
        return loss_val

    for batch in emb_utils.batch_generator(emb_utils.prepare_sentences(
            model, nodes, network.transfer_fn()),
                                           batch_size,
                                           long_tensor=LongTensor):

        input, output = batch
        loss = network.forward(input, model) * (beta / model.k)
        loss_val += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1

        if (num_batch) % 10000 == 0:
            log.info(
                "community embedding batches completed: {}".format(num_batch))

    log.debug("O3 loss: {}".format(loss_val))
    return loss_val
Example #4
def learn_first(network, lr, model, edges, num_iter=1, batch_size=20):
    """
    Helper function used to optimize O1
    :param network: neural network to train
    :param lr: learning rate
    :param model: model containing the shared data
    :param edges: numpy list of edges used for training
    :param num_iter: iteration number over the edges
    :param batch_size: size of the batch
    :return: loss value
    """
    log.info("computing o1")
    optimizer = SGD(network.parameters(), lr)

    num_batch = 0
    total_batch = (edges.shape[0] * num_iter) / batch_size
    loss_val = 0
    for batch in emb_utils.batch_generator(emb_utils.prepare_sentences(
            model, edges, network.transfer_fn(model.vocab)),
                                           batch_size,
                                           long_tensor=LongTensor):

        input, output = batch
        loss = network.forward(input,
                               output,
                               negative_sampling_fn=model.negative_sample)

        loss_val += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1

        if (num_batch) % 10000 == 0:
            log.info("community embedding batches completed: {}".format(
                num_batch / total_batch))

    log.debug("O1 loss: {}".format(loss_val))
    return loss_val
Example #5
def learn_second(network, lr, model, examples_files, alpha=1.0):
    """
    Helper function used to optimize O1 and O3
    :param loss: loss to optimize
    :param lr: learning rate
    :param model: deprecated_model used to compute the batches and the negative sampling
    :param examples_files: list of files containing the examples
    :param num_iter: iteration number over the edges
    :return: 
    """
    log.info("compute o2")
    optimizer = SGD(network.parameters(), lr)
    log.debug("read example file: {}".format("\t".join(examples_files)))
    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(
                model, graph_utils.combine_example_files_iter(examples_files),
                network.transfer_fn(model.vocab)), 20):
        input, output = batch
        loss = (alpha * network.forward(
            input, output, negative_sampling_fn=model.negative_sample))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Example #6
def learn_first(network, lr, model, edges, num_iter=1):
    """
    Helper function used to optimize O1 and O3
    :param network: neural network to train
    :param lr: learning rate
    :param model: deprecated_model used to compute the batches and the negative sampling
    :param edges: numpy list of edges used for training
    :param num_iter: iteration number over the edges
    :return: 
    """
    log.info("computing o1")
    optimizer = SGD(network.parameters(), lr)
    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(
                model, emb_utils.RepeatCorpusNTimes(edges, n=num_iter),
                network.transfer_fn(model.vocab)), 20):
        input, output = batch
        loss = network.forward(input,
                               output,
                               negative_sampling_fn=model.negative_sample)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Example #7

if __name__ == '__main__':
    model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
    optim = SGD(model, 0.1)

    epochs = 20
    # scheduler_warmup is chained with lr_scheduler
    lr_scheduler = CosineAnnealingLR(optim, T_max=epochs - 5, eta_min=0.02)
    scheduler_warmup = GradualWarmupScheduler(optim,
                                              multiplier=1,
                                              total_epoch=5,
                                              after_scheduler=lr_scheduler)

    # this zero gradient update is needed to avoid a warning message, issue #8.
    optim.zero_grad()
    optim.step()
    scheduler_warmup.step()

    lr_list = list()
    for epoch in range(epochs):
        current_lr = optim.param_groups[0]['lr']

        optim.step()
        scheduler_warmup.step()

        print(epoch + 1, current_lr)
        lr_list.append(current_lr)

    plot(lr_list)
class RestrictedBoltzmannMachines(nn.Module, ObservableData):
    '''
    Restricted Boltzmann Machines (RBM).

    According to graph theory, the structure of an RBM corresponds to
    a complete bipartite graph, a special kind of bipartite graph in
    which every node in the visible layer is connected to every node
    in the hidden layer. Based on statistical mechanics and
    thermodynamics (Ackley, D. H., Hinton, G. E., & Sejnowski, T. J., 1985),
    the state of this structure can be described by an energy function.

    In relation to RBM, Contrastive Divergence (CD) is a method for
    approximating the gradients of the log-likelihood (Hinton, G. E., 2002).
    This algorithm draws a distinction between a positive phase and a
    negative phase. Conceptually, the positive phase is to the negative
    phase what waking is to sleeping.

    The procedure is similar to the Markov Chain Monte Carlo (MCMC) method.
    However, unlike MCMC, the visible variables are not randomly initialized;
    instead, the observed data points in the training dataset are used as the
    initial visible states. Then, as in a Gibbs sampler, drawing samples from
    the hidden and visible variables is repeated k times. Empirically (and
    surprisingly), `k = 1` is often sufficient.

    **Note** that this implementation is PyTorch-based; only `torch.Tensor`
    inputs are supported.

    References:
        - Ackley, D. H., Hinton, G. E., & Sejnowski, T. J. (1985). A learning algorithm for Boltzmann machines. Cognitive science, 9(1), 147-169.
        - Hinton, G. E. (2002). Training products of experts by minimizing contrastive divergence. Neural computation, 14(8), 1771-1800.
        - Le Roux, N., & Bengio, Y. (2008). Representational power of restricted Boltzmann machines and deep belief networks. Neural computation, 20(6), 1631-1649.
    '''

    # The list of losses.
    __loss_arr = []
    # Learning rate.
    __learning_rate = 0.5
    # Batch size in learning.
    __batch_size = 0
    # Batch size in inference(recursive learning or not).
    __r_batch_size = 0

    def __init__(
        self,
        computable_loss,
        initializer_f=None,
        optimizer_f=None,
        visible_activation=torch.nn.Sigmoid(),
        hidden_activation=torch.nn.Sigmoid(),
        visible_dim=1000,
        hidden_dim=100,
        learning_rate=0.005,
        visible_dropout_rate=0.0,
        hidden_dropout_rate=0.0,
        visible_batch_norm=None,
        hidden_batch_norm=None,
        regularizatable_data_list=[],
        ctx="cpu",
    ):
        '''
        Init.
        
        Args:
            computable_loss:            is-a `ComputableLoss`.
            initializer_f:              `callable` that initializes the weight of the visible unit. If `None`, weights are drawn from the Xavier distribution.
            optimizer_f:                `callable` that builds an optimizer from `self.parameters()`. If `None`, plain `SGD` is used.
            visible_activation:         activation module (e.g. `torch.nn.Sigmoid()`) applied in the visible layer.
            hidden_activation:          activation module (e.g. `torch.nn.Sigmoid()`) applied in the hidden layer.
            visible_dim:                `int` of dimension in visible layer.
            hidden_dim:                 `int` of dimension in hidden layer.
            learning_rate:              `float` of learning rate.
            visible_dropout_rate:       `float` of dropout rate in visible layer.
            hidden_dropout_rate:        `float` of dropout rate in hidden layer.
            visible_batch_norm:         `torch.nn.BatchNorm1d` in visible layer, or `None`.
            hidden_batch_norm:          `torch.nn.BatchNorm1d` in hidden layer, or `None`.
            regularizatable_data_list:  `list` of `RegularizatableData`s.
            ctx:                        device specifier such as `"cpu"` or `"cuda"`.
        '''
        super(RestrictedBoltzmannMachines, self).__init__()

        for v in regularizatable_data_list:
            if isinstance(v, RegularizatableData) is False:
                raise TypeError(
                    "The type of values of `regularizatable_data_list` must be `RegularizatableData`."
                )
        self.__regularizatable_data_list = regularizatable_data_list

        self.__computable_loss = computable_loss
        self.visible_activation = visible_activation
        self.hidden_activation = hidden_activation

        self.__visible_unit = nn.Linear(
            visible_dim,
            hidden_dim,
            bias=True,
        )

        self.visible_dropout_forward = None
        if visible_dropout_rate > 0:
            self.visible_dropout_forward = nn.Dropout(p=visible_dropout_rate)

        self.hidden_dropout_forward = None
        if hidden_dropout_rate > 0:
            self.hidden_dropout_forward = nn.Dropout(p=hidden_dropout_rate)

        self.visible_batch_norm = visible_batch_norm
        self.hidden_batch_norm = hidden_batch_norm

        self.__ctx = ctx
        self.to(self.__ctx)

        if initializer_f is not None:
            self.__visible_unit.weight = initializer_f(
                self.__visible_unit.weight)
        else:
            self.__visible_unit.weight = torch.nn.init.xavier_normal_(
                self.__visible_unit.weight, gain=1.0)

        self.__learning_rate = learning_rate

        if optimizer_f is not None:
            self.optimizer = optimizer_f(self.parameters())
        else:
            self.optimizer = SGD(
                self.parameters(),
                lr=self.__learning_rate,
            )

        self.__loss_arr = np.array([])

        logger = getLogger("accelbrainbase")
        self.__logger = logger

        self.__loss_list = []
        self.__test_loss_list = []

        self.epoch = 0

    def learn(self, iteratable_data):
        '''
        Learn samples drawn by `IteratableData.generate_learned_samples()`.

        Args:
            iteratable_data:     is-a `IteratableData`.

        '''
        if isinstance(iteratable_data, IteratableData) is False:
            raise TypeError(
                "The type of `iteratable_data` must be `IteratableData`.")

        self.__loss_list = []
        self.__test_loss_list = []

        try:
            epoch = self.epoch
            iter_n = 0
            for observed_arr, label_arr, test_observed_arr, test_label_arr in iteratable_data.generate_learned_samples(
            ):
                self.batch_size = observed_arr.shape[0]
                observed_arr = observed_arr.reshape((self.batch_size, -1))
                test_observed_arr = test_observed_arr.reshape(
                    (self.batch_size, -1))

                self.optimizer.zero_grad()
                visible_activity_arr = self.inference(observed_arr)
                loss = self.compute_loss(observed_arr, visible_activity_arr)
                loss.backward()
                self.optimizer.step()
                self.regularize()

                if (iter_n + 1) % int(
                        iteratable_data.iter_n / iteratable_data.epochs) == 0:
                    with torch.inference_mode():
                        test_visible_activity_arr = self.inference(
                            test_observed_arr)
                        test_loss = self.compute_loss(
                            test_observed_arr, test_visible_activity_arr)
                    _loss = loss.to('cpu').detach().numpy().copy()
                    _test_loss = test_loss.to('cpu').detach().numpy().copy()

                    self.__loss_list.append(_loss)
                    self.__test_loss_list.append(_test_loss)
                    self.__logger.debug("Epoch: " + str(epoch + 1) +
                                        " Train loss: " +
                                        str(self.__loss_list[-1]) +
                                        " Test loss: " +
                                        str(self.__test_loss_list[-1]))
                    epoch += 1
                iter_n += 1

        except KeyboardInterrupt:
            self.__logger.debug("Interrupt.")

        self.__logger.debug("end. ")

        self.__loss_arr = np.c_[
            np.array(self.__loss_list[:len(self.__test_loss_list)]),
            np.array(self.__test_loss_list)]
        self.epoch = epoch

    def inference(self, observed_arr):
        '''
        Inference samples drawn by `IteratableData.generate_inferenced_samples()`.

        Args:
            observed_arr:   rank-2 Array like or sparse matrix as the observed data points.
                            The shape is: (batch size, feature points)

        Returns:
            `torch.Tensor` of inferenced feature points.
        '''
        return self(observed_arr)

    def compute_loss(self, pred_arr, labeled_arr):
        '''
        Compute loss.

        Args:
            pred_arr:       `torch.Tensor` of predictions.
            labeled_arr:    `torch.Tensor` of labels.

        Returns:
            loss.
        '''
        return self.__computable_loss(pred_arr, labeled_arr)

    def extract_learned_dict(self):
        '''
        Extract (pre-) learned parameters.

        Returns:
            `dict` of the parameters.
        '''
        params_dict = {}
        for k in self.state_dict().keys():
            params_dict.setdefault(k, self.state_dict()[k])

        return params_dict

    def forward(self, x):
        '''
        Forward pass.

        Args:
            x:      `torch.Tensor` of observed data points.

        Returns:
            `torch.Tensor` of inferenced feature points.
        '''
        self.batch_size = x.shape[0]
        x = x.reshape((self.batch_size, -1))

        self.__visible_activity_arr = x

        x = self.__visible_unit(x)

        if self.visible_activation == "identity_adjusted":
            x = x / torch.sum(torch.ones_like(x))
        elif self.visible_activation != "identity":
            x = self.visible_activation(x)

        if self.visible_dropout_forward is not None:
            x = self.visible_dropout_forward(x)
        if self.visible_batch_norm is not None:
            x = self.visible_batch_norm(x)

        self.__hidden_activity_arr = x

        self.__diff_weights_arr = torch.mm(
            self.__visible_activity_arr.T,
            self.__hidden_activity_arr,
        )

        #self.__visible_diff_bias_arr += nd.nansum(self.__visible_activity_arr, axis=0)
        #self.__hidden_diff_bias_arr += nd.nansum(self.__hidden_activity_arr, axis=0)

        params_dict = self.extract_learned_dict()
        weight_keys_list = [
            key for key in params_dict.keys() if "weight" in key
        ]
        weights_arr = params_dict[weight_keys_list[0]]

        self.__visible_activity_arr = torch.mm(
            self.__hidden_activity_arr,
            weights_arr,
        )
        x = self.__visible_activity_arr
        if self.hidden_activation == "identity_adjusted":
            x = x / torch.sum(torch.ones_like(x))
        elif self.hidden_activation != "identity":
            x = self.hidden_activation(x)
        if self.hidden_dropout_forward is not None:
            x = self.hidden_dropout_forward(x)
        if self.hidden_batch_norm is not None:
            x = self.hidden_batch_norm(x)
        self.__visible_activity_arr = x

        self.__hidden_activity_arr = self.__visible_unit(
            self.__visible_activity_arr)
        x = self.__hidden_activity_arr
        if self.visible_activation == "identity_adjusted":
            x = x / torch.sum(torch.ones_like(x))
        elif self.visible_activation != "identity":
            x = self.visible_activation(x)
        if self.visible_dropout_forward is not None:
            x = self.visible_dropout_forward(x)
        if self.visible_batch_norm is not None:
            x = self.visible_batch_norm(x)
        self.__hidden_activity_arr = x

        self.__diff_weights_arr = self.__diff_weights_arr - torch.mm(
            self.__visible_activity_arr.T,
            self.__hidden_activity_arr,
        )
        #self.__visible_diff_bias_arr -= nd.nansum(self.__visible_activity_arr, axis=0)
        #self.__hidden_diff_bias_arr -= nd.nansum(self.__hidden_activity_arr, axis=0)

        return self.__visible_activity_arr

    def regularize(self):
        '''
        Regularization.
        '''
        if len(self.__regularizatable_data_list) > 0:
            params_dict = self.extract_learned_dict()
            for regularizatable in self.__regularizatable_data_list:
                params_dict = regularizatable.regularize(params_dict)

            for k, params in params_dict.items():
                self.load_state_dict({k: params}, strict=False)

    def save_parameters(self, filename):
        '''
        Save parameters to a file.

        Args:
            filename:       File name.
        '''
        torch.save(
            {
                'epoch': self.epoch,
                'model_state_dict': self.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'loss': self.loss_arr,
            }, filename)

    def load_parameters(self, filename, ctx=None, strict=True):
        '''
        Load parameters from a file.

        Args:
            filename:       File name.
            ctx:            Context-manager that changes the selected device.
            strict:         Whether to strictly enforce that the keys in state_dict match the keys returned by this module’s state_dict() function. Default: `True`.
        '''
        checkpoint = torch.load(filename)
        self.load_state_dict(checkpoint['model_state_dict'], strict=strict)
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.epoch = checkpoint['epoch']
        self.__loss_list = checkpoint['loss'].tolist()
        if ctx is not None:
            self.to(ctx)
            self.__ctx = ctx

    def set_readonly(self, value):
        ''' setter '''
        raise TypeError("This is read-only.")

    def get_loss_list(self):
        ''' getter for `list` of losses in training. '''
        return self.__loss_list

    loss_list = property(get_loss_list, set_readonly)

    def get_test_loss_arr(self):
        ''' getter for `list` of losses in test. '''
        return self.__test_loss_list

    test_loss_list = property(get_test_loss_arr, set_readonly)

    def get_loss_arr(self):
        ''' getter for losses. '''
        return self.__loss_arr

    loss_arr = property(get_loss_arr, set_readonly)

    def get_feature_points_arr(self):
        ''' getter for `torch.Tensor` of feature points in middle hidden layer. '''
        return self.__hidden_activity_arr

    feature_points_arr = property(get_feature_points_arr, set_readonly)

    def get_weights_arr(self):
        ''' getter for `torch.Tensor` of weights matrices. '''
        return self.__weights_arr

    def set_weights_arr(self, value):
        ''' setter for `torch.Tensor` of weights matrices.'''
        self.__weights_arr = value

    weights_arr = property(get_weights_arr, set_weights_arr)

    def get_visible_bias_arr(self):
        ''' getter for `torch.Tensor` of biases in visible layer.'''
        return self.__visible_bias_arr

    def set_visible_bias_arr(self, value):
        ''' setter for `torch.Tensor` of biases in visible layer.'''
        self.__visible_bias_arr = value

    visible_bias_arr = property(get_visible_bias_arr, set_visible_bias_arr)

    def get_hidden_bias_arr(self):
        ''' getter for `torch.Tensor` of biases in hidden layer.'''
        return self.__hidden_bias_arr

    def set_hidden_bias_arr(self, value):
        ''' setter for `torch.Tensor` of biases in hidden layer.'''
        self.__hidden_bias_arr = value

    hidden_bias_arr = property(get_hidden_bias_arr, set_hidden_bias_arr)

    def get_visible_activity_arr(self):
        ''' getter for `torch.Tensor` of activities in visible layer.'''
        return self.__visible_activity_arr

    def set_visible_activity_arr(self, value):
        ''' setter for `torch.Tensor` of activities in visible layer.'''
        self.__visible_activity_arr = value

    visible_activity_arr = property(get_visible_activity_arr,
                                    set_visible_activity_arr)

    def get_hidden_activity_arr(self):
        ''' getter for `torch.Tensor` of activities in hidden layer.'''
        return self.__hidden_activity_arr

    def set_hidden_activity_arr(self, value):
        ''' setter for `torch.Tensor` of activities in hidden layer.'''
        self.__hidden_activity_arr = value

    hidden_activity_arr = property(get_hidden_activity_arr,
                                   set_hidden_activity_arr)
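
The positive and negative phases described in the class docstring above can be illustrated by a small, stand-alone CD-1 update in plain PyTorch; the names (``cd1_step``, ``W``, ``b_v``, ``b_h``) are hypothetical and independent of the class above.

import torch

def cd1_step(v0, W, b_v, b_h, lr=0.01):
    # Positive phase: hidden probabilities given the observed visible units.
    p_h0 = torch.sigmoid(v0 @ W + b_h)
    h0 = torch.bernoulli(p_h0)
    # Negative phase: one Gibbs step back to the visible layer and up again.
    p_v1 = torch.sigmoid(h0 @ W.t() + b_v)
    p_h1 = torch.sigmoid(p_v1 @ W + b_h)
    # CD-1 gradient approximation: <v h>_data - <v h>_model.
    batch_size = v0.shape[0]
    W += lr * (v0.t() @ p_h0 - p_v1.t() @ p_h1) / batch_size
    b_v += lr * (v0 - p_v1).mean(dim=0)
    b_h += lr * (p_h0 - p_h1).mean(dim=0)
    # Reconstruction error as a rough progress signal.
    return torch.mean((v0 - p_v1) ** 2)

v = torch.bernoulli(torch.rand(32, 784))
W = 0.01 * torch.randn(784, 128)
b_v = torch.zeros(784)
b_h = torch.zeros(128)
print(cd1_step(v, W, b_v, b_h))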
Example #9
import torch
from torch.optim.lr_scheduler import StepLR, ExponentialLR
from torch.optim.sgd import SGD

from warmup_scheduler import GradualWarmupScheduler

if __name__ == '__main__':
    model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
    optim = SGD(model, 0.1)

    # scheduler_warmup is chained with scheduler_steplr
    scheduler_steplr = StepLR(optim, step_size=10, gamma=0.1)
    scheduler_warmup = GradualWarmupScheduler(optim,
                                              multiplier=1,
                                              total_epoch=5,
                                              after_scheduler=scheduler_steplr)

    # this zero gradient update is needed to avoid a warning message, issue #8.
    optim.zero_grad()
    optim.step()

    for epoch in range(1, 20):
        scheduler_warmup.step(epoch)
        print(epoch, optim.param_groups[0]['lr'])

        optim.step()  # optimizer update (no actual backward pass in this toy example)
Example #10
import torch
import torch.nn as nn

from model.maml.optim import MamlSGD
from torch.optim.sgd import SGD

net = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 20), nn.ReLU(),
                    nn.Linear(20, 5))

optim = MamlSGD(net.parameters(), momentum=0.9, lr=0.1)
TrueOptim = SGD(net.parameters(), momentum=0.9, lr=0.1)

x = torch.randn(100, 10)

loss = net(x).sum()
loss.backward()
updated = optim.maml_step()
original = optim.maml_replace(net, updated)
loss = net(x).sum()
loss.backward()
TrueOptim.step()

print("hello")
Example #11
def train_mnist(epoch_num=10,
                show_iter=100,
                logdir='test',
                model_weight=None,
                load_d=False,
                load_g=False,
                compare_path=None,
                info_time=100,
                run_select=None,
                device='cpu'):
    lr_d = 0.01
    lr_g = 0.01
    batchsize = 128
    z_dim = 96
    print('MNIST, discriminator lr: %.3f, generator lr: %.3f' % (lr_d, lr_g))
    dataset = get_data(dataname='MNIST', path='../datas/mnist')
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D = dc_D().to(device)
    G = dc_G(z_dim=z_dim).to(device)
    D.apply(weights_init_d)
    G.apply(weights_init_g)
    if model_weight is not None:
        chk = torch.load(model_weight)
        if load_d:
            D.load_state_dict(chk['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            G.load_state_dict(chk['G'])
            print('Load G from %s' % model_weight)
    if compare_path is not None:
        discriminator = dc_D().to(device)
        model_weight = torch.load(compare_path)
        discriminator.load_state_dict(model_weight['D'])
        model_vec = torch.cat(
            [p.contiguous().view(-1) for p in discriminator.parameters()])
        print('Load discriminator from %s' % compare_path)
    if run_select is not None:
        fixed_data = torch.load(run_select)
        real_set = fixed_data['real_set']
        fake_set = fixed_data['fake_set']
        real_d = fixed_data['real_d']
        fake_d = fixed_data['fake_d']
        fixed_vec = fixed_data['pred_vec']
        print('load fixed data set')

    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' %
                           (logdir, current_time, lr_d))
    d_optimizer = SGD(D.parameters(), lr=lr_d)
    g_optimizer = SGD(G.parameters(), lr=lr_g)
    timer = time.time()
    count = 0
    fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            fake_x_c = fake_x.clone().detach()
            # update generator
            d_fake = D(fake_x)

            writer.add_scalars('Discriminator output', {
                'Generated image': d_fake.mean().item(),
                'Real image': d_real.mean().item()
            },
                               global_step=count)
            G_loss = get_loss(name='JSD', g_loss=True, d_fake=d_fake)
            g_optimizer.zero_grad()
            G_loss.backward()
            g_optimizer.step()
            gg = torch.norm(torch.cat(
                [p.grad.contiguous().view(-1) for p in G.parameters()]),
                            p=2)

            d_fake_c = D(fake_x_c)
            D_loss = get_loss(name='JSD',
                              g_loss=False,
                              d_real=d_real,
                              d_fake=d_fake_c)
            if compare_path is not None and count % info_time == 0:
                diff = get_diff(net=D, model_vec=model_vec)
                writer.add_scalar('Distance from checkpoint',
                                  diff.item(),
                                  global_step=count)
                if run_select is not None:
                    with torch.no_grad():
                        d_real_set = D(real_set)
                        d_fake_set = D(fake_set)
                        diff_real = torch.norm(d_real_set - real_d, p=2)
                        diff_fake = torch.norm(d_fake_set - fake_d, p=2)
                        d_vec = torch.cat([d_real_set, d_fake_set])
                        diff = torch.norm(d_vec.sub_(fixed_vec), p=2)
                        writer.add_scalars('L2 norm of pred difference', {
                            'Total': diff.item(),
                            'real set': diff_real.item(),
                            'fake set': diff_fake.item()
                        },
                                           global_step=count)
            d_optimizer.zero_grad()
            D_loss.backward()
            d_optimizer.step()

            gd = torch.norm(torch.cat(
                [p.grad.contiguous().view(-1) for p in D.parameters()]),
                            p=2)

            writer.add_scalars('Loss', {
                'D_loss': D_loss.item(),
                'G_loss': G_loss.item()
            },
                               global_step=count)
            writer.add_scalars('Grad', {
                'D grad': gd.item(),
                'G grad': gg.item()
            },
                               global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs' %
                      (count, D_loss.item(), G_loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s/' % logdir
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % count,
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='SGD-%.3f_%d.pth' % (lr_d, count),
                                D=D,
                                G=G)
            count += 1
    writer.close()
Example #12
def training(optimizer_sign=0):
    training_data = {
        'train_loss': [],
        'val_loss': [],
        'train_acc': [],
        'val_acc': []
    }
    net = Net(input_size, hidden_size, num_classes)
    # net = Net(input_size, hidden_size, num_classes)
    net.cuda()
    net.train()
    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    if optimizer_sign == 0:
        optimizer = torch.optim.RMSprop(net.parameters(), lr=learning_rate)
    elif optimizer_sign == 1:
        optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    elif optimizer_sign == 2:
        optimizer = SGD(net.parameters(),
                        lr=learning_rate,
                        weight_decay=0.0001,
                        momentum=0.9)
    elif optimizer_sign == 3:
        optimizer = PIDOptimizer(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=0.0001,
                                 momentum=0.9,
                                 I=I,
                                 D=D)
    else:
        optimizer = PIDOptimizer(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=0.0001,
                                 momentum=0.9,
                                 I=I,
                                 D=0)
    # Train the Model
    for epoch in range(num_epochs):

        train_loss_log = AverageMeter()
        train_acc_log = AverageMeter()
        val_loss_log = AverageMeter()
        val_acc_log = AverageMeter()
        for i, (images, labels) in enumerate(train_loader):
            # Convert torch tensor to Variable
            images = images.view(-1, 28 * 28).cuda()
            labels = Variable(labels.cuda())

            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = net(images)
            train_loss = criterion(outputs, labels)
            train_loss.backward()
            optimizer.step()
            prec1, prec5 = accuracy(outputs.data, labels.data, topk=(1, 5))
            train_loss_log.update(train_loss.data, images.size(0))
            train_acc_log.update(prec1, images.size(0))

            if (i + 1) % 100 == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Acc: %.8f' %
                      (epoch + 1, num_epochs, i + 1, len(train_dataset) //
                       batch_size, train_loss_log.avg, train_acc_log.avg))
                training_data['train_loss'].append(
                    train_loss_log.avg.detach().cpu().numpy())
                training_data['train_acc'].append(
                    train_acc_log.avg.detach().cpu().numpy())

        # Test the Model
        net.eval()
        correct = 0
        loss = 0
        total = 0
        for images, labels in test_loader:
            images = images.view(-1, 28 * 28).cuda()
            labels = Variable(labels).cuda()
            outputs = net(images)
            test_loss = criterion(outputs, labels)
            val_loss_log.update(test_loss.data, images.size(0))
            prec1, prec5 = accuracy(outputs.data, labels.data, topk=(1, 5))
            val_acc_log.update(prec1, images.size(0))

        #logger.append([learning_rate, train_loss_log.avg, val_loss_log.avg, train_acc_log.avg, val_acc_log.avg])
        print('Accuracy of the network on the 10000 test images: %.8f %%' %
              (val_acc_log.avg))
        print('Loss of the network on the 10000 test images: %.8f' %
              (val_loss_log.avg))
        training_data['val_loss'].append(
            val_loss_log.avg.detach().cpu().numpy())
        training_data['val_acc'].append(val_acc_log.avg.detach().cpu().numpy())
    #logger.close()
    #logger.plot()
    return training_data
Example #13
class Neumann(Optimizer):
    """
    Neumann-series optimizer: runs plain SGD for the first ``sgd_steps`` iterations,
    then switches to a Neumann-iterate update with a moving average of the parameters.
    """
    def __init__(self,
                 params,
                 lr=1e-3,
                 eps=1e-8,
                 alpha=1e-7,
                 beta=1e-5,
                 gamma=0.9,
                 momentum=1,
                 sgd_steps=5,
                 K=10):

        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 1 >= momentum:
            raise ValueError("Invalid momentum value: {}".format(eps))

        self.iter = 0
        self.sgd = SGD(params, lr=lr, momentum=0.9)

        param_count = np.sum([np.prod(p.size())
                              for p in params])  # got from MNIST-GAN

        defaults = dict(lr=lr,
                        eps=eps,
                        alpha=alpha,
                        beta=beta * param_count,
                        gamma=gamma,
                        sgd_steps=sgd_steps,
                        momentum=momentum,
                        K=K)

        super(Neumann, self).__init__(params, defaults)

    def step(self, closure=None):
        """
        Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        self.iter += 1

        loss = None
        if closure is not None:  # as in other PyTorch optimizers: re-evaluate the model and return the loss
            loss = closure()

        for group in self.param_groups:

            sgd_steps = group['sgd_steps']

            alpha = group['alpha']
            beta = group['beta']
            gamma = group['gamma']
            K = group['K']
            momentum = group['momentum']
            mu = momentum * (1 - (1 / (1 + self.iter)))
            eta = group['lr'] / self.iter  ## update with time

            if self.iter <= sgd_steps:
                self.sgd.step()
                return

            if mu >= 0.9:
                mu = 0.9
            elif mu <= 0.5:
                mu = 0.5

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['m'] = torch.zeros_like(p.data).float()
                    state['d'] = torch.zeros_like(p.data).float()
                    state['moving_avg'] = p.data

                state['step'] += 1

                ## Reset neumann iterate
                if self.iter % K == 0:
                    state['m'] = grad.mul(-eta)
                    group['K'] = group['K'] * 2

                ## Compute update d_t
                diff = p.data.sub(state['moving_avg'])
                diff_norm = (p.data.sub(state['moving_avg'])).norm(p=1)
                if np.count_nonzero(diff):
                    state['d'] = grad.add((((diff_norm.pow(2)).mul(alpha)).sub(
                        (diff_norm.pow(-2)).mul(beta))).mul(
                            diff.div(diff_norm)))
                else:
                    state['d'] = grad

                ## Update Neumann iterate
                (state['m'].mul_(mu)).sub_(state['d'].mul(eta))

                ## Update Weights
                p.data.add_((state['m'].mul(mu)).sub(state['d'].mul(eta)))

                ## Update Moving Average
                state['moving_avg'] = p.data.add(
                    (state['moving_avg'].sub(p.data)).mul(gamma))
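
A minimal usage sketch, assuming the ``Neumann`` class above (together with its ``SGD`` and ``numpy`` imports) is available; the model and data below are placeholders.

import torch
import torch.nn as nn

model = nn.Linear(10, 1)
# Pass a list so the parameters can be iterated more than once in __init__.
optimizer = Neumann(list(model.parameters()), lr=1e-3)

x, y = torch.randn(64, 10), torch.randn(64, 1)
for _ in range(10):
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()
    # The first ``sgd_steps`` calls fall back to plain SGD; afterwards the
    # Neumann-iterate update defined above is applied.
    optimizer.step()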
Example #14
    train_loss_log = AverageMeter()
    train_acc_log = AverageMeter()
    val_loss_log = AverageMeter()
    val_acc_log = AverageMeter()
    for i, (images, labels) in enumerate(train_loader):
        # Convert torch tensor to Variable
        images = images.cuda()  #Variable(images.view(-1, 28*28).cuda())
        labels = Variable(labels.cuda())

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = net(images)
        train_loss = criterion(outputs, labels)
        train_loss.backward()
        optimizer.step()
        prec1, prec5 = accuracy(outputs.data, labels.data, topk=(1, 5))
        train_loss_log.update(train_loss.data, images.size(0))
        train_acc_log.update(prec1, images.size(0))

        save_name = os.path.join(model_save_dir, str(iters) + '.pth.tar')
        '''
        torch.save({'iter': iters,
                    'state_dict': net.state_dict(),
                    'optimizer' : optimizer.state_dict()}, 
                    save_name)
        '''
        iters += 1

        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Acc: %.8f' %
                  (epoch + 1, num_epochs, i + 1, len(train_dataset) //
                   batch_size, train_loss_log.avg, train_acc_log.avg))
Example #15
def training(optimizer_sign=0):
    training_data = {'train_loss':[], 'val_loss':[], 'train_acc':[], 'val_acc':[]}
    net = Net(input_size, hidden_size, num_classes)
    # net = Net(input_size, hidden_size, num_classes)
    net.train()
    # Loss and Optimizer
    oldnet_sign = False
    basicgrad_sign = False
    criterion = nn.CrossEntropyLoss()
    print('optimizer_sign:' + str(optimizer_sign))
    if optimizer_sign == 0:
        optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)
    elif optimizer_sign == 1:
        optimizer = torch.optim.RMSprop(net.parameters(), lr=learning_rate)
    elif optimizer_sign == 2:
        optimizer = torch.optim.Adam(net.parameters(), lr=.001)
    elif optimizer_sign == 3:
        optimizer = SGD(net.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9)
    elif optimizer_sign == 4:
        optimizer = pid.PIDOptimizer(net.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9, I=I, D=0)
    elif optimizer_sign == 5:
        optimizer = pid.PIDOptimizer(net.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9, I=I, D=D)
    elif optimizer_sign == 6:
        optimizer = pid.AdapidOptimizer(net.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9, I=I, D=D)
    elif optimizer_sign == 7:
        optimizer = pid.AdapidOptimizer_test(net.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9, I=I, D=D)
    elif optimizer_sign == 8:
        optimizer = pid.specPIDoptimizer(net.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9, I=I, D=D)
        oldnet_sign = True
    elif optimizer_sign == 9:
        optimizer = pid.SVRGoptimizer(net.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9)
        oldnet_sign = True
        basicgrad_sign = True
    else:
        optimizer = pid.SARAHoptimizer(net.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9)
        oldnet_sign = True
        basicgrad_sign = True

    if oldnet_sign == True:
        torch.save(net, 'net.pkl')
        old_net = torch.load('net.pkl')

    # Train the Model
    for epoch in range(num_epochs):

        train_loss_log = AverageMeter()
        train_acc_log = AverageMeter()
        val_loss_log = AverageMeter()
        val_acc_log = AverageMeter()
        for i, (images, labels) in enumerate(train_loader):
            if i % 100 == 0 and basicgrad_sign == True:
                for j, (all_images, all_labels) in enumerate(BGD_loader):
                    all_images = all_images.view(-1, 28 * 28)
                    all_labels = Variable(all_labels)
                    optimizer.zero_grad()  # zero the gradient buffer
                    outputs = net(all_images)
                    train_loss = criterion(outputs, all_labels)
                    train_loss.backward()
                    params = list(net.parameters())
                    grads = []
                    for param in params:
                        grads.append(param.grad.detach())
                    optimizer.get_basicgrad(grads)
                    optimizer.step()
                    prec1, prec5 = accuracy(outputs.data, all_labels.data, topk=(1, 5))
                    train_loss_log.update(train_loss.data, all_images.size(0))
                    train_acc_log.update(prec1, all_images.size(0))
                    torch.save(net, 'net.pkl')
                    old_net = torch.load('net.pkl')
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Acc: %.8f'
                          % (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, train_loss_log.avg,
                             train_acc_log.avg))
            # Convert torch tensor to Variable
            images = images.view(-1, 28*28)
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = net(images)
            train_loss = criterion(outputs, labels)
            train_loss.backward()
            if oldnet_sign == True:
                old_outputs = old_net(images)
                old_loss = criterion(old_outputs, labels)
                old_loss.backward()
                old_params = list(old_net.parameters())
                old_grads = []
                for param in old_params:
                    old_grads.append(param.grad.detach())
                optimizer.get_oldgrad(old_grads)
            optimizer.step()
            if oldnet_sign == True and optimizer_sign != 8:
                torch.save(net, 'net.pkl')
                old_net = torch.load('net.pkl')
            prec1, prec5 = accuracy(outputs.data, labels.data, topk=(1, 5))
            train_loss_log.update(train_loss.data, images.size(0))
            train_acc_log.update(prec1, images.size(0))

            if (i + 1) % 30 == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Acc: %.8f'
                      % (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, train_loss_log.avg,
                         train_acc_log.avg))
                training_data['train_loss'].append(train_loss_log.avg.detach().cpu().numpy())
                training_data['train_acc'].append(train_acc_log.avg.detach().cpu().numpy())

        # Test the Model
        net.eval()
        correct = 0
        loss = 0
        total = 0
        for images, labels in test_loader:
            images = images.view(-1, 28*28)
            labels = Variable(labels)
            outputs = net(images)
            test_loss = criterion(outputs, labels)
            val_loss_log.update(test_loss.data, images.size(0))
            prec1, prec5 = accuracy(outputs.data, labels.data, topk=(1, 5))
            val_acc_log.update(prec1, images.size(0))

        #logger.append([learning_rate, train_loss_log.avg, val_loss_log.avg, train_acc_log.avg, val_acc_log.avg])
        print('Accuracy of the network on the 10000 test images: %.8f %%' % (val_acc_log.avg))
        print('Loss of the network on the 10000 test images: %.8f' % (val_loss_log.avg))
        training_data['val_loss'].append(val_loss_log.avg.detach().cpu().numpy())
        training_data['val_acc'].append(val_acc_log.avg.detach().cpu().numpy())

    #logger.close()
    #logger.plot()
    return training_data
Example #16
class MLPWithMNIST:
    def __init__(self, hparams, ckpt_name, homedir, separate_history,
                 patience):
        self.hparams = hparams
        # batch size
        self.batch_size = 256
        # loader
        self.loader_train, self.loader_valid, self.loader_test = mnist_data_loader(
            self.batch_size, homedir)
        # model
        self.model = Network(hparams)
        # loss function
        self.loss_fn = nn.CrossEntropyLoss()
        # initial learning rate
        self.lr = hparams['lr']
        # momentum coef
        self.momentum = hparams['momentum']
        # optimizer
        self.optimizer = SGD(self.model.parameters(),
                             lr=self.lr,
                             momentum=self.momentum,
                             nesterov=True)
        # epoch
        self.epoch = 0
        # check point
        self.ckpt_dir = homedir + "ckpt"
        self.ckpt_name = ckpt_name
        # history
        self.separate_history = separate_history
        # patience
        self.patience = patience

        try:
            ckpt = self._load_checkpoint(self.ckpt_name)
            self.model.load_state_dict(ckpt['state_dict'])
            self.epoch = ckpt['current_epoch']
        except FileNotFoundError:
            pass

    def evaluate(self, num_iter):

        min_val_loss = np.inf
        diff_epoch = num_iter - self.epoch
        overfitted_cnt = 0
        for epoch in range(diff_epoch):
            self._train_one_epoch()
            self.epoch += 1
            val_loss = self._validate_one_epoch()
            self.separate_history[self.ckpt_name].append(
                (self.hparams, val_loss))
            if val_loss < min_val_loss:
                min_val_loss = val_loss
                overfitted_cnt = 0
            else:
                overfitted_cnt += 1
            if overfitted_cnt >= self.patience:
                print("model overfitted.")
                return min_val_loss, True
        state = {
            'state_dict': self.model.state_dict(),
            'min_val_loss': min_val_loss,
            'current_epoch': self.epoch
        }
        self._save_checkpoint(state, self.ckpt_name)
        return min_val_loss, False

    def _train_one_epoch(self):
        self.model.train()
        for data, targets in self.loader_train:
            self.model.zero_grad()
            outputs = self.model(data)
            loss = self.loss_fn(outputs, targets)
            loss.backward()
            self.optimizer.step()

    def _validate_one_epoch(self):
        self.model.eval()
        correct = 0
        with torch.no_grad():
            for data, targets in self.loader_valid:
                outputs = self.model(data)
                _, predicted = torch.max(outputs.data, 1)
                correct += (predicted == targets).sum().item()

        data_num = len(self.loader_valid.dataset)
        val_loss = (1 - correct / data_num) * 100
        return val_loss

    def _save_checkpoint(self, state, name):
        filename = name + '.pth.tar'
        ckpt_path = os.path.join(self.ckpt_dir, filename)
        torch.save(state, ckpt_path)

    def _load_checkpoint(self, name):
        filename = name + '.pth.tar'
        ckpt_path = os.path.join(self.ckpt_dir, filename)
        ckpt = torch.load(ckpt_path)
        return ckpt