def step(self, optimizer: SGD, *args, **kwargs) -> Optional[float]:  # type: ignore
    """
    Unscale the gradients if needed, then run ``optimizer.step()`` unless
    inf/NaN gradients were found.

    :meth:`step` performs two operations:

    1.  It calls ``unscale_(optimizer)`` internally, unless :meth:`unscale_`
        was already called explicitly for ``optimizer`` in this iteration.
        Gradients are checked for infs/NaNs as part of :meth:`unscale_`.
    2.  When no inf/NaN gradients were found, ``optimizer.step()`` is invoked
        with the unscaled gradients. Otherwise ``optimizer.step()`` is skipped
        so the params are not corrupted.

    ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.

    Returns the return value of ``optimizer.step(*args, **kwargs)``.

    Args:
        optimizer (torch.optim.Optimizer):  Optimizer that applies the gradients.
        args:  Any arguments.
        kwargs:  Any keyword arguments.

    Raises:
        RuntimeError: if a ``closure`` keyword is passed while scaling is
            enabled, or if ``step()`` was already called since the last
            ``update()``.

    .. warning::
        Closure use is not currently supported.

    Note: This is an exact copy of the step function in grad_scaler.py. If this copy is deleted then the
    unittest test_cpu_offload_and_cpu_grads fails. This is because the parent class step function calls
    the parent class unscale_ function which does not handle torch.distributed.all_reduce on cpu.
    """
    # With scaling disabled this is a plain pass-through.
    if not self._enabled:
        return optimizer.step(*args, **kwargs)

    if "closure" in kwargs:
        raise RuntimeError("Closure use is not currently supported if GradScaler is enabled.")

    self._check_scale_growth_tracker("step")  # type: ignore

    state = self._per_optimizer_states[id(optimizer)]
    if state["stage"] is OptState.STEPPED:
        raise RuntimeError("step() has already been called since the last update().")

    if getattr(optimizer, "_step_supports_amp_scaling", False):
        # This optimizer has customized scale-handling logic, so optimizer.step() is called directly.
        # The contract with custom optimizers is that their step() accepts an additional, optional
        # grad_scaler kwarg. Passing self gives the custom optimizer full information: it can query
        # its own state, invoke unscale_ on itself, etc.
        forwarded = {**kwargs, "grad_scaler": self}
        result = optimizer.step(*args, **forwarded)
    else:
        if state["stage"] is OptState.READY:
            self.unscale_(optimizer)

        assert len(state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer."

        result = self._maybe_opt_step(optimizer, state, *args, **kwargs)  # type: ignore

    state["stage"] = OptState.STEPPED
    return result
def learn_second(network, lr, model, examples_files, total_example, alpha=1.0, batch_size=20):
    """
    Helper function used to optimize O2
    :param network: network model to optimize
    :param lr: learning rate
    :param model: model containing the shared data
    :param examples_files: list of files containing the examples
    :param total_example: total example for training (only used for progress logging)
    :param alpha: trade-off param; when <= 0 no training is performed
    :param batch_size: size of the batch
    :return: loss value accumulated over all processed batches (0 when alpha <= 0)
    """
    num_batch = 0

    log.info("compute o2")
    optimizer = SGD(network.parameters(), lr)
    log.debug("read example file: {}".format("\t".join(examples_files)))
    loss_val = 0

    if alpha <= 0:
        return loss_val

    sentences = emb_utils.prepare_sentences(
        model,
        graph_utils.combine_example_files_iter(examples_files),
        network.transfer_fn(model.vocab))

    for inputs, outputs in emb_utils.batch_generator(sentences, batch_size, long_tensor=LongTensor):
        loss = (alpha * network.forward(inputs, outputs,
                                        negative_sampling_fn=model.negative_sample))
        # `loss.data[0]` raises on 0-dim tensors in PyTorch >= 0.5; prefer
        # `.item()` and keep the old indexing as a fallback for old releases.
        loss_val += loss.item() if hasattr(loss, "item") else loss.data[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        num_batch += 1
        if num_batch % 10000 == 0:
            log.info("community embedding batches completed: {}".format(
                num_batch / (total_example / batch_size)))

    log.debug("O2 loss: {}".format(loss_val))
    return loss_val
def learn_community(network, lr, model, nodes, beta=1.0, batch_size=20, total_example=None):
    """
    Helper function used to optimize O3
    :param network: model to optimize
    :param lr: learning rate
    :param model: model containing the shared data
    :param nodes: nodes on which execute the learning
    :param beta: trade-off value; when <= 0 no training is performed
    :param batch_size: size of the batch
    :param total_example: optional total number of examples, used only to
        report progress as a fraction. When omitted the raw batch count is
        logged instead (the name was previously read without being defined,
        which raised NameError at the first progress log).
    :return: loss value accumulated over all processed batches (0 when beta <= 0)
    """
    num_batch = 0

    log.info("compute o3")
    optimizer = SGD(network.parameters(), lr)
    loss_val = 0

    if beta <= 0.:
        return loss_val

    sentences = emb_utils.prepare_sentences(model, nodes, network.transfer_fn())

    for inputs, _outputs in emb_utils.batch_generator(sentences, batch_size, long_tensor=LongTensor):
        loss = network.forward(inputs, model)
        # NOTE(review): this rescales only the stored loss value; the gradient
        # produced by backward() below does not appear to be scaled by beta.
        # Presumably intentional -- confirm before changing.
        loss.data *= (beta / model.k)
        # `loss.data[0]` raises on 0-dim tensors in PyTorch >= 0.5; prefer
        # `.item()` and keep the old indexing as a fallback for old releases.
        loss_val += loss.item() if hasattr(loss, "item") else loss.data[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        num_batch += 1
        if num_batch % 10000 == 0:
            if total_example:
                progress = num_batch / (total_example / batch_size)
            else:
                progress = num_batch
            log.info("community embedding batches completed: {}".format(progress))

    log.debug("O3 loss: {}".format(loss_val))
    return loss_val
def learn_first(network, lr, model, edges, num_iter=1, batch_size=20):
    """
    Helper function used to optimize O1
    :param network: neural network to train
    :param lr: learning rate
    :param model: model containing the shared data
    :param edges: numpy list of edges used for training
    :param num_iter: iteration number over the edges (here it only enters the
        expected batch total used for progress logging)
    :param batch_size: size of the batch
    :return: loss value accumulated over all processed batches
    """
    log.info("computing o1")
    optimizer = SGD(network.parameters(), lr)

    num_batch = 0
    total_batch = (edges.shape[0] * num_iter) / batch_size
    loss_val = 0

    sentences = emb_utils.prepare_sentences(model, edges, network.transfer_fn(model.vocab))

    for inputs, outputs in emb_utils.batch_generator(sentences, batch_size, long_tensor=LongTensor):
        loss = network.forward(inputs, outputs, negative_sampling_fn=model.negative_sample)
        # `loss.data[0]` raises on 0-dim tensors in PyTorch >= 0.5; prefer
        # `.item()` and keep the old indexing as a fallback for old releases.
        loss_val += loss.item() if hasattr(loss, "item") else loss.data[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        num_batch += 1
        if num_batch % 10000 == 0:
            log.info("community embedding batches completed: {}".format(
                num_batch / total_batch))

    log.debug("O1 loss: {}".format(loss_val))
    return loss_val
def learn_second(network, lr, model, examples_files, alpha=1.0):
    """
    Helper function used to optimize O2 (fixed batch size of 20)
    :param network: network whose parameters are optimized with SGD
    :param lr: learning rate
    :param model: deprecated_model used to compute the batches and the negative sampling
    :param examples_files: list of files containing the examples
    :param alpha: factor multiplied into the loss before the backward pass
    :return: None
    """
    log.info("compute o2")
    sgd = SGD(network.parameters(), lr)
    log.debug("read example file: {}".format("\t".join(examples_files)))

    examples = graph_utils.combine_example_files_iter(examples_files)
    transfer = network.transfer_fn(model.vocab)
    sentences = emb_utils.prepare_sentences(model, examples, transfer)

    for context, target in emb_utils.batch_generator(sentences, 20):
        weighted_loss = alpha * network.forward(
            context, target, negative_sampling_fn=model.negative_sample)

        sgd.zero_grad()
        weighted_loss.backward()
        sgd.step()
def learn_first(network, lr, model, edges, num_iter=1):
    """
    Helper function used to optimize O1 (fixed batch size of 20)
    :param network: neural network to train
    :param lr: learning rate
    :param model: deprecated_model used to compute the batches and the negative sampling
    :param edges: numpy list of edges used for training
    :param num_iter: iteration number over the edges
    :return: None
    """
    log.info("computing o1")
    sgd = SGD(network.parameters(), lr)

    repeated_edges = emb_utils.RepeatCorpusNTimes(edges, n=num_iter)
    transfer = network.transfer_fn(model.vocab)
    sentences = emb_utils.prepare_sentences(model, repeated_edges, transfer)

    for context, target in emb_utils.batch_generator(sentences, 20):
        batch_loss = network.forward(
            context, target, negative_sampling_fn=model.negative_sample)

        sgd.zero_grad()
        batch_loss.backward()
        sgd.step()
if __name__ == '__main__':
    # Demo: record and plot the learning rate produced by a warmup scheduler
    # chained with cosine annealing.
    model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
    optim = SGD(model, 0.1)

    epochs = 20
    # scheduler_warmup hands over to the cosine schedule once warmup ends
    cosine_scheduler = CosineAnnealingLR(optim, T_max=epochs - 5, eta_min=0.02)
    scheduler_warmup = GradualWarmupScheduler(
        optim, multiplier=1, total_epoch=5, after_scheduler=cosine_scheduler)

    # this zero gradient update is needed to avoid a warning message, issue #8.
    optim.zero_grad()
    optim.step()
    scheduler_warmup.step()

    lr_list = []
    for epoch_number in range(1, epochs + 1):
        # Read the rate before stepping so it reflects the epoch being run.
        current_lr = optim.param_groups[0]['lr']

        optim.step()
        scheduler_warmup.step()

        print(epoch_number, current_lr)
        lr_list.append(current_lr)

    plot(lr_list)
class RestrictedBoltzmannMachines(nn.Module, ObservableData):
    '''
    Restricted Boltzmann Machines(RBM).

    According to graph theory, the structure of RBM corresponds to
    a complete bipartite graph which is a special kind of bipartite
    graph where every node in the visible layer is connected to every
    node in the hidden layer. Based on statistical mechanics and
    thermodynamics(Ackley, D. H., Hinton, G. E., & Sejnowski, T. J. 1985),
    the state of this structure can be reflected by the energy function.

    In relation to RBM, the Contrastive Divergence(CD) is a method for
    approximation of the gradients of the log-likelihood(Hinton, G. E. 2002).
    This algorithm draws a distinction between a positive phase and a
    negative phase. Conceptually, the positive phase is to the negative
    phase what waking is to sleeping.

    The procedure of this method is similar to Markov Chain Monte Carlo method(MCMC).
    However, unlike MCMC, the visible variables to be set first in the visible layer
    are not randomly initialized; the observed data points in the training dataset
    are used as the first visible variables. And, like a Gibbs sampler, drawing
    samples from hidden variables and visible variables is repeated k times.
    Empirically (and surprisingly), `k` is considered to be `1`.

    **Note** that this implementation is written against PyTorch: the model is
    a `torch.nn.Module` and all arrays handled here are `torch.Tensor`s.

    References:
        - Ackley, D. H., Hinton, G. E., & Sejnowski, T. J. (1985). A learning algorithm for Boltzmann machines. Cognitive science, 9(1), 147-169.
        - Hinton, G. E. (2002). Training products of experts by minimizing contrastive divergence. Neural computation, 14(8), 1771-1800.
        - Le Roux, N., & Bengio, Y. (2008). Representational power of restricted Boltzmann machines and deep belief networks. Neural computation, 20(6), 1631-1649.
    '''

    # The list of losses.
    __loss_arr = []
    # Learning rate (class-level fallback; overwritten per instance in `__init__`).
    __learning_rate = 0.5
    # Batch size in learning.
    __batch_size = 0
    # Batch size in inference(recursive learning or not).
    __r_batch_size = 0

    def __init__(
        self,
        computable_loss,
        initializer_f=None,
        optimizer_f=None,
        visible_activation=torch.nn.Sigmoid(),
        hidden_activation=torch.nn.Sigmoid(),
        visible_dim=1000,
        hidden_dim=100,
        learning_rate=0.005,
        visible_dropout_rate=0.0,
        hidden_dropout_rate=0.0,
        visible_batch_norm=None,
        hidden_batch_norm=None,
        regularizatable_data_list=None,
        ctx="cpu",
    ):
        '''
        Init.

        Args:
            computable_loss:            is-a `ComputableLoss`; called as
                                        `computable_loss(pred_arr, labeled_arr)`.
            initializer_f:              Callable applied to the weight of the linear unit.
                                        If `None`, `torch.nn.init.xavier_normal_` is used.
            optimizer_f:                Callable that receives `self.parameters()` and returns
                                        the optimizer. If `None`, `SGD` with `learning_rate` is used.
            visible_activation:         Callable activation applied after the linear unit, or the
                                        string `"identity"` / `"identity_adjusted"`.
            hidden_activation:          Callable activation applied to the reconstruction, or the
                                        string `"identity"` / `"identity_adjusted"`.
            visible_dim:                `int` of dimension in visible layer.
            hidden_dim:                 `int` of dimension in hidden layer.
            learning_rate:              `float` of learning rate for the default `SGD` optimizer.
            visible_dropout_rate:       `float` of dropout rate in visible layer.
            hidden_dropout_rate:        `float` of dropout rate in hidden layer.
            visible_batch_norm:         Optional batch-norm module applied after the visible dropout.
            hidden_batch_norm:          Optional batch-norm module applied after the hidden dropout.
            regularizatable_data_list:  `list` of `RegularizatableData`s (`None` means no regularizers).
            ctx:                        Device passed to `self.to()`.

        Raises:
            TypeError: if an element of `regularizatable_data_list` is not a `RegularizatableData`.
        '''
        super(RestrictedBoltzmannMachines, self).__init__()

        # `None` replaces the former mutable `[]` default, which was shared
        # between every instance constructed without this argument.
        if regularizatable_data_list is None:
            regularizatable_data_list = []

        for v in regularizatable_data_list:
            if not isinstance(v, RegularizatableData):
                raise TypeError(
                    "The type of values of `regularizatable_data_list` must be `RegularizatableData`."
                )
        self.__regularizatable_data_list = regularizatable_data_list

        self.__computable_loss = computable_loss
        self.visible_activation = visible_activation
        self.hidden_activation = hidden_activation

        self.__visible_unit = nn.Linear(
            visible_dim,
            hidden_dim,
            bias=True,
        )

        self.visible_dropout_forward = None
        if visible_dropout_rate > 0:
            self.visible_dropout_forward = nn.Dropout(p=visible_dropout_rate)

        self.hidden_dropout_forward = None
        if hidden_dropout_rate > 0:
            self.hidden_dropout_forward = nn.Dropout(p=hidden_dropout_rate)

        self.visible_batch_norm = visible_batch_norm
        self.hidden_batch_norm = hidden_batch_norm

        self.__ctx = ctx
        self.to(self.__ctx)

        if initializer_f is not None:
            self.__visible_unit.weight = initializer_f(
                self.__visible_unit.weight)
        else:
            self.__visible_unit.weight = torch.nn.init.xavier_normal_(
                self.__visible_unit.weight, gain=1.0)

        # Store the rate BEFORE building the optimizer. Previously the SGD
        # fallback read the class-level 0.5 and ignored the argument.
        self.__learning_rate = learning_rate

        if optimizer_f is not None:
            self.optimizer = optimizer_f(self.parameters())
        else:
            self.optimizer = SGD(
                self.parameters(),
                lr=self.__learning_rate,
            )

        self.__loss_arr = np.array([])

        logger = getLogger("accelbrainbase")
        self.__logger = logger

        self.__loss_list = []
        self.__test_loss_list = []

        self.epoch = 0

    def learn(self, iteratable_data):
        '''
        Learn samples drawn by `IteratableData.generate_learned_samples()`.

        A `KeyboardInterrupt` raised during the loop stops training early;
        the losses collected so far are still stored in `loss_arr`.

        Args:
            iteratable_data:     is-a `IteratableData`.

        Raises:
            TypeError: if `iteratable_data` is not an `IteratableData`.
        '''
        if not isinstance(iteratable_data, IteratableData):
            raise TypeError(
                "The type of `iteratable_data` must be `IteratableData`.")

        self.__loss_list = []
        self.__test_loss_list = []

        try:
            epoch = self.epoch
            iter_n = 0
            for observed_arr, label_arr, test_observed_arr, test_label_arr in iteratable_data.generate_learned_samples():
                self.batch_size = observed_arr.shape[0]
                observed_arr = observed_arr.reshape((self.batch_size, -1))
                # Flatten the test batch with its own leading dimension so a
                # test batch of a different size is not mis-shaped.
                test_observed_arr = test_observed_arr.reshape(
                    (test_observed_arr.shape[0], -1))

                self.optimizer.zero_grad()
                visible_activity_arr = self.inference(observed_arr)
                loss = self.compute_loss(observed_arr, visible_activity_arr)
                loss.backward()
                self.optimizer.step()
                self.regularize()

                # Evaluate and log once every `iter_n / epochs` iterations.
                if (iter_n + 1) % int(
                        iteratable_data.iter_n / iteratable_data.epochs) == 0:
                    with torch.inference_mode():
                        test_visible_activity_arr = self.inference(
                            test_observed_arr)
                        test_loss = self.compute_loss(
                            test_observed_arr, test_visible_activity_arr)
                    _loss = loss.to('cpu').detach().numpy().copy()
                    _test_loss = test_loss.to('cpu').detach().numpy().copy()

                    self.__loss_list.append(_loss)
                    self.__test_loss_list.append(_test_loss)
                    self.__logger.debug("Epoch: " + str(epoch + 1) +
                                        " Train loss: " +
                                        str(self.__loss_list[-1]) +
                                        " Test loss: " +
                                        str(self.__test_loss_list[-1]))
                    epoch += 1
                iter_n += 1

        except KeyboardInterrupt:
            self.__logger.debug("Interrupt.")

        self.__logger.debug("end. ")

        # Column 0: training losses, column 1: test losses.
        self.__loss_arr = np.c_[
            np.array(self.__loss_list[:len(self.__test_loss_list)]),
            np.array(self.__test_loss_list)]
        self.epoch = epoch

    def inference(self, observed_arr):
        '''
        Run the forward pass on the observed data points.

        Args:
            observed_arr:   Observed data points; `forward` flattens everything
                            after the first (batch) dimension.

        Returns:
            The reconstructed visible activity returned by `forward`.
        '''
        return self(observed_arr)

    def compute_loss(self, pred_arr, labeled_arr):
        '''
        Compute loss by delegating to the `computable_loss` given to `__init__`.

        Args:
            pred_arr:       First argument forwarded to the loss.
            labeled_arr:    Second argument forwarded to the loss.

        Returns:
            loss.
        '''
        return self.__computable_loss(pred_arr, labeled_arr)

    def extract_learned_dict(self):
        '''
        Extract (pre-) learned parameters.

        Returns:
            `dict` of the parameters, keyed like `state_dict()`.
        '''
        # Build the state dict once instead of once per key.
        return dict(self.state_dict())

    def __activate(self, x, activation, dropout, batch_norm):
        ''' Apply activation, then optional dropout, then optional batch norm. '''
        if activation == "identity_adjusted":
            # Divide by the number of elements in `x`.
            x = x / torch.sum(torch.ones_like(x))
        elif activation != "identity":
            x = activation(x)

        if dropout is not None:
            x = dropout(x)
        if batch_norm is not None:
            x = batch_norm(x)
        return x

    def forward(self, x):
        '''
        Forward pass: visible -> hidden -> reconstructed visible -> hidden.

        The difference between the visible/hidden products of the first and
        second pass is kept internally; the activities are exposed through
        `visible_activity_arr`, `hidden_activity_arr` and `feature_points_arr`.

        Args:
            x:      `torch.Tensor` of observed data points.

        Returns:
            `torch.Tensor` of the reconstructed visible activity.
        '''
        self.batch_size = x.shape[0]
        x = x.reshape((self.batch_size, -1))

        # First pass: observed data -> hidden activity.
        self.__visible_activity_arr = x
        self.__hidden_activity_arr = self.__activate(
            self.__visible_unit(x),
            self.visible_activation,
            self.visible_dropout_forward,
            self.visible_batch_norm,
        )

        self.__diff_weights_arr = torch.mm(
            self.__visible_activity_arr.T,
            self.__hidden_activity_arr,
        )

        # Reconstruct the visible layer with the first "weight" entry of the
        # state dict (the tensor is taken from `state_dict()`, not from the
        # live parameter).
        params_dict = self.extract_learned_dict()
        weight_keys_list = [key for key in params_dict if "weight" in key]
        weights_arr = params_dict[weight_keys_list[0]]

        self.__visible_activity_arr = self.__activate(
            torch.mm(self.__hidden_activity_arr, weights_arr),
            self.hidden_activation,
            self.hidden_dropout_forward,
            self.hidden_batch_norm,
        )

        # Second pass: reconstructed data -> hidden activity.
        self.__hidden_activity_arr = self.__activate(
            self.__visible_unit(self.__visible_activity_arr),
            self.visible_activation,
            self.visible_dropout_forward,
            self.visible_batch_norm,
        )

        self.__diff_weights_arr = self.__diff_weights_arr - torch.mm(
            self.__visible_activity_arr.T,
            self.__hidden_activity_arr,
        )

        return self.__visible_activity_arr

    def regularize(self):
        '''
        Regularization: pass the parameters through every regularizer and
        load the result back into the module.
        '''
        if len(self.__regularizatable_data_list) > 0:
            params_dict = self.extract_learned_dict()
            for regularizatable in self.__regularizatable_data_list:
                params_dict = regularizatable.regularize(params_dict)

            self.load_state_dict(params_dict, strict=False)

    def save_parameters(self, filename):
        '''
        Save epoch, model state, optimizer state and losses to a file.

        Args:
            filename:       File name.
        '''
        torch.save(
            {
                'epoch': self.epoch,
                'model_state_dict': self.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'loss': self.loss_arr,
            }, filename)

    def load_parameters(self, filename, ctx=None, strict=True):
        '''
        Load parameters from a file written by `save_parameters`.

        Args:
            filename:       File name.
            ctx:            Device to load onto and move the model to.
                            If `None`, the stored device placement is kept.
            strict:         Whether to strictly enforce that the keys in state_dict match the keys returned by this module's state_dict() function. Default: `True`.
        '''
        # `map_location=None` is the `torch.load` default, so behaviour is
        # unchanged when `ctx` is not given.
        checkpoint = torch.load(filename, map_location=ctx)
        self.load_state_dict(checkpoint['model_state_dict'], strict=strict)
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.epoch = checkpoint['epoch']
        self.__loss_list = checkpoint['loss'].tolist()
        if ctx is not None:
            self.to(ctx)
            self.__ctx = ctx

    def set_readonly(self, value):
        ''' setter that rejects every assignment. '''
        raise TypeError("This is read-only.")

    def get_loss_list(self):
        ''' getter for `list` of losses in training. '''
        return self.__loss_list

    loss_list = property(get_loss_list, set_readonly)

    def get_test_loss_arr(self):
        ''' getter for `list` of losses in test. '''
        return self.__test_loss_list

    test_loss_list = property(get_test_loss_arr, set_readonly)

    def get_loss_arr(self):
        ''' getter for losses. '''
        return self.__loss_arr

    loss_arr = property(get_loss_arr, set_readonly)

    def get_feature_points_arr(self):
        ''' getter for the hidden activity stored by the last `forward` call. '''
        return self.__hidden_activity_arr

    feature_points_arr = property(get_feature_points_arr, set_readonly)

    # NOTE: the three array attributes below (weights, visible bias, hidden
    # bias) are not assigned anywhere else in this class; reading one before
    # it has been set raises `AttributeError`.

    def get_weights_arr(self):
        ''' getter for the weights matrix. '''
        return self.__weights_arr

    def set_weights_arr(self, value):
        ''' setter for the weights matrix. '''
        self.__weights_arr = value

    weights_arr = property(get_weights_arr, set_weights_arr)

    def get_visible_bias_arr(self):
        ''' getter for biases in visible layer. '''
        return self.__visible_bias_arr

    def set_visible_bias_arr(self, value):
        ''' setter for biases in visible layer. '''
        self.__visible_bias_arr = value

    visible_bias_arr = property(get_visible_bias_arr, set_visible_bias_arr)

    def get_hidden_bias_arr(self):
        ''' getter for biases in hidden layer. '''
        return self.__hidden_bias_arr

    def set_hidden_bias_arr(self, value):
        ''' setter for biases in hidden layer. '''
        self.__hidden_bias_arr = value

    hidden_bias_arr = property(get_hidden_bias_arr, set_hidden_bias_arr)

    def get_visible_activity_arr(self):
        ''' getter for activities in visible layer. '''
        return self.__visible_activity_arr

    def set_visible_activity_arr(self, value):
        ''' setter for activities in visible layer. '''
        self.__visible_activity_arr = value

    visible_activity_arr = property(get_visible_activity_arr,
                                    set_visible_activity_arr)

    def get_hidden_activity_arr(self):
        ''' getter for activities in hidden layer. '''
        return self.__hidden_activity_arr

    def set_hidden_activity_arr(self, value):
        ''' setter for activities in hidden layer. '''
        self.__hidden_activity_arr = value

    hidden_activity_arr = property(get_hidden_activity_arr,
                                   set_hidden_activity_arr)
import torch
from torch.optim.lr_scheduler import StepLR, ExponentialLR
from torch.optim.sgd import SGD

from warmup_scheduler import GradualWarmupScheduler


if __name__ == '__main__':
    # Demo: print the learning rate produced by a warmup scheduler that is
    # chained with a step-decay scheduler.
    model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
    optim = SGD(model, 0.1)

    # scheduler_warmup hands over to scheduler_steplr once warmup ends
    scheduler_steplr = StepLR(optim, step_size=10, gamma=0.1)
    scheduler_warmup = GradualWarmupScheduler(
        optim,
        multiplier=1,
        total_epoch=5,
        after_scheduler=scheduler_steplr,
    )

    # this zero gradient update is needed to avoid a warning message, issue #8.
    optim.zero_grad()
    optim.step()

    for epoch in range(1, 20):
        scheduler_warmup.step(epoch)
        current_lr = optim.param_groups[0]['lr']
        print(epoch, current_lr)

        optim.step()    # backward pass (update network)
import torch
import torch.nn as nn
from model.maml.optim import MamlSGD
from torch.optim.sgd import SGD

# Smoke script: run one MAML-style step with MamlSGD, swap the updated
# parameters into the network, then take a regular SGD step on the same net.
net = nn.Sequential(
    nn.Linear(10, 10),
    nn.ReLU(),
    nn.Linear(10, 20),
    nn.ReLU(),
    nn.Linear(20, 5),
)

# Both optimizers are built over the same parameters with the same settings.
optim = MamlSGD(net.parameters(), momentum=0.9, lr=0.1)
TrueOptim = SGD(net.parameters(), momentum=0.9, lr=0.1)

x = torch.randn(100, 10)

# First forward/backward pass feeds the MAML step.
loss = net(x).sum()
loss.backward()
updated = optim.maml_step()
original = optim.maml_replace(net, updated)

# Second forward/backward pass feeds the plain SGD step.
loss = net(x).sum()
loss.backward()
TrueOptim.step()

print("hello")
def train_mnist(epoch_num=10,
                show_iter=100,
                logdir='test',
                model_weight=None,
                load_d=False,
                load_g=False,
                compare_path=None,
                info_time=100,
                run_select=None,
                device='cpu'):
    """
    Train a DCGAN-style generator/discriminator pair on MNIST with plain SGD.

    Each iteration updates the generator first and then the discriminator,
    logging discriminator outputs, losses and gradient norms to a
    `SummaryWriter`. Every `show_iter` iterations a sample image grid and a
    checkpoint are written.

    Args:
        epoch_num:    number of passes over the data loader.
        show_iter:    interval (in iterations) for printing, saving images and
                      saving checkpoints.
        logdir:       sub-directory name used for logs, figures and checkpoints.
        model_weight: optional checkpoint path holding 'D' and/or 'G' state dicts.
        load_d:       load the discriminator from `model_weight`.
        load_g:       load the generator from `model_weight`.
        compare_path: optional checkpoint path; when given, the distance
                      between D and that checkpoint is logged every
                      `info_time` iterations.
        info_time:    interval (in iterations) for the comparison logging.
        run_select:   optional path to a saved dict with the keys 'real_set',
                      'fake_set', 'real_d', 'fake_d' and 'pred_vec'; it is only
                      consulted inside the `compare_path` logging branch.
        device:       device the models and tensors are placed on.
    """
    lr_d = 0.01
    lr_g = 0.01
    batchsize = 128
    z_dim = 96
    print('MNIST, discriminator lr: %.3f, generator lr: %.3f' % (lr_d, lr_g))
    dataset = get_data(dataname='MNIST', path='../datas/mnist')
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D = dc_D().to(device)
    G = dc_G(z_dim=z_dim).to(device)
    D.apply(weights_init_d)
    G.apply(weights_init_g)

    # All loads use map_location=device so a checkpoint saved on another
    # device (e.g. a GPU) can be restored on the requested one.
    if model_weight is not None:
        chk = torch.load(model_weight, map_location=device)
        if load_d:
            D.load_state_dict(chk['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            G.load_state_dict(chk['G'])
            print('Load G from %s' % model_weight)

    if compare_path is not None:
        discriminator = dc_D().to(device)
        model_weight = torch.load(compare_path, map_location=device)
        discriminator.load_state_dict(model_weight['D'])
        model_vec = torch.cat(
            [p.contiguous().view(-1) for p in discriminator.parameters()])
        print('Load discriminator from %s' % compare_path)

    if run_select is not None:
        fixed_data = torch.load(run_select, map_location=device)
        real_set = fixed_data['real_set']
        fake_set = fixed_data['fake_set']
        real_d = fixed_data['real_d']
        fake_d = fixed_data['fake_d']
        fixed_vec = fixed_data['pred_vec']
        print('load fixed data set')

    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' %
                           (logdir, current_time, lr_d))
    d_optimizer = SGD(D.parameters(), lr=lr_d)
    g_optimizer = SGD(G.parameters(), lr=lr_g)
    timer = time.time()
    count = 0
    fixed_noise = torch.randn((64, z_dim), device=device)

    for _ in range(epoch_num):
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            # Detached copy reused for the discriminator update below.
            fake_x_c = fake_x.clone().detach()

            # update generator
            d_fake = D(fake_x)
            writer.add_scalars('Discriminator output', {
                'Generated image': d_fake.mean().item(),
                'Real image': d_real.mean().item()
            }, global_step=count)
            G_loss = get_loss(name='JSD', g_loss=True, d_fake=d_fake)
            g_optimizer.zero_grad()
            G_loss.backward()
            g_optimizer.step()
            gg = torch.norm(torch.cat(
                [p.grad.contiguous().view(-1) for p in G.parameters()]), p=2)

            # update discriminator
            d_fake_c = D(fake_x_c)
            D_loss = get_loss(name='JSD',
                              g_loss=False,
                              d_real=d_real,
                              d_fake=d_fake_c)

            if compare_path is not None and count % info_time == 0:
                diff = get_diff(net=D, model_vec=model_vec)
                writer.add_scalar('Distance from checkpoint',
                                  diff.item(),
                                  global_step=count)
                if run_select is not None:
                    with torch.no_grad():
                        d_real_set = D(real_set)
                        d_fake_set = D(fake_set)
                        diff_real = torch.norm(d_real_set - real_d, p=2)
                        diff_fake = torch.norm(d_fake_set - fake_d, p=2)
                        d_vec = torch.cat([d_real_set, d_fake_set])
                        diff = torch.norm(d_vec.sub_(fixed_vec), p=2)
                        writer.add_scalars('L2 norm of pred difference', {
                            'Total': diff.item(),
                            'real set': diff_real.item(),
                            'fake set': diff_fake.item()
                        }, global_step=count)

            d_optimizer.zero_grad()
            D_loss.backward()
            d_optimizer.step()
            gd = torch.norm(torch.cat(
                [p.grad.contiguous().view(-1) for p in D.parameters()]), p=2)

            writer.add_scalars('Loss', {
                'D_loss': D_loss.item(),
                'G_loss': G_loss.item()
            }, global_step=count)
            writer.add_scalars('Grad', {
                'D grad': gd.item(),
                'G grad': gg.item()
            }, global_step=count)

            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs' %
                      (count, D_loss.item(), G_loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s/' % logdir
                    # exist_ok avoids the check-then-create race.
                    os.makedirs(path, exist_ok=True)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % count,
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='SGD-%.3f_%d.pth' % (lr_d, count),
                                D=D,
                                G=G)
            count += 1
    writer.close()
def training(optimizer_sign=0):
    """
    Train `Net` on the module-level loaders and collect per-epoch metrics.

    Args:
        optimizer_sign: selects the optimizer --
            0: RMSprop, 1: Adam, 2: SGD with momentum and weight decay,
            3: PIDOptimizer with I=I and D=D,
            anything else: PIDOptimizer with I=I and D=0.

    Returns:
        dict with the keys 'train_loss', 'val_loss', 'train_acc' and
        'val_acc'; each maps to a list with one numpy value per epoch.
    """
    training_data = {
        'train_loss': [],
        'val_loss': [],
        'train_acc': [],
        'val_acc': []
    }
    net = Net(input_size, hidden_size, num_classes)
    net.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    if optimizer_sign == 0:
        optimizer = torch.optim.RMSprop(net.parameters(), lr=learning_rate)
    elif optimizer_sign == 1:
        optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    elif optimizer_sign == 2:
        optimizer = SGD(net.parameters(),
                        lr=learning_rate,
                        weight_decay=0.0001,
                        momentum=0.9)
    elif optimizer_sign == 3:
        optimizer = PIDOptimizer(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=0.0001,
                                 momentum=0.9,
                                 I=I,
                                 D=D)
    else:
        optimizer = PIDOptimizer(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=0.0001,
                                 momentum=0.9,
                                 I=I,
                                 D=0)

    # Train the Model
    for epoch in range(num_epochs):
        # Re-enter training mode every epoch: the evaluation at the end of the
        # previous epoch leaves the network in eval mode.
        net.train()

        train_loss_log = AverageMeter()
        train_acc_log = AverageMeter()
        val_loss_log = AverageMeter()
        val_acc_log = AverageMeter()

        for i, (images, labels) in enumerate(train_loader):
            # Flatten each image and move the batch to the GPU.
            images = images.view(-1, 28 * 28).cuda()
            labels = labels.cuda()

            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = net(images)
            train_loss = criterion(outputs, labels)
            train_loss.backward()
            optimizer.step()

            prec1, prec5 = accuracy(outputs.data, labels.data, topk=(1, 5))
            train_loss_log.update(train_loss.data, images.size(0))
            train_acc_log.update(prec1, images.size(0))

            if (i + 1) % 100 == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Acc: %.8f' %
                      (epoch + 1, num_epochs, i + 1,
                       len(train_dataset) // batch_size, train_loss_log.avg,
                       train_acc_log.avg))

        training_data['train_loss'].append(
            train_loss_log.avg.detach().cpu().numpy())
        training_data['train_acc'].append(
            train_acc_log.avg.detach().cpu().numpy())

        # Test the Model
        net.eval()
        # No gradients are needed for evaluation; skip building the graph.
        with torch.no_grad():
            for images, labels in test_loader:
                images = images.view(-1, 28 * 28).cuda()
                labels = labels.cuda()
                outputs = net(images)
                test_loss = criterion(outputs, labels)
                val_loss_log.update(test_loss.data, images.size(0))
                prec1, prec5 = accuracy(outputs.data, labels.data, topk=(1, 5))
                val_acc_log.update(prec1, images.size(0))

        print('Accuracy of the network on the 10000 test images: %.8f %%' %
              (val_acc_log.avg))
        print('Loss of the network on the 10000 test images: %.8f' %
              (val_loss_log.avg))
        training_data['val_loss'].append(
            val_loss_log.avg.detach().cpu().numpy())
        training_data['val_acc'].append(val_acc_log.avg.detach().cpu().numpy())

    return training_data
class Neumann(Optimizer):
    """
    Neumann-style optimizer with an SGD warm-up phase.

    The first `sgd_steps` calls to :meth:`step` are delegated to an internal
    momentum-SGD optimizer over the same parameters. Afterwards each parameter
    keeps a Neumann iterate `m`, an update direction `d` and a moving average
    of its values, and is updated from those.

    Arguments:
        params (iterable): iterable of tensors to optimize (a generator such as
            `module.parameters()` is accepted).
        lr (float): base learning rate; the effective rate is `lr / iteration`.
        eps (float): validated and stored in the defaults; not read by `step`.
        alpha (float): weight of the squared-distance term in the update direction.
        beta (float): weight of the inverse-squared-distance term; stored
            multiplied by the total number of parameter elements.
        gamma (float): moving-average coefficient.
        momentum (float): upper factor for the momentum schedule (must be <= 1).
        sgd_steps (int): number of initial iterations run with plain SGD.
        K (int): the Neumann iterate is reset when the iteration count is a
            multiple of K; K is doubled after each reset.
    """

    def __init__(self, params, lr=1e-3, eps=1e-8, alpha=1e-7,
                 beta=1e-5, gamma=0.9, momentum=1, sgd_steps=5, K=10):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 1 >= momentum:
            raise ValueError("Invalid momentum value: {}".format(momentum))

        # Materialize once: `params` is consumed three times below and may be
        # a one-shot generator.
        params = list(params)

        self.iter = 0
        self.sgd = SGD(params, lr=lr, momentum=0.9)

        param_count = sum(p.numel() for p in params)

        # beta scaling taken from MNIST-GAN
        defaults = dict(lr=lr, eps=eps, alpha=alpha,
                        beta=beta * param_count, gamma=gamma,
                        sgd_steps=sgd_steps, momentum=momentum, K=K)

        super(Neumann, self).__init__(params, defaults)

    def step(self, closure=None):
        """
        Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by `closure`, or None when no closure is given.
        """
        self.iter += 1

        loss = None
        if closure is not None:
            loss = closure()

        # Warm-up phase: plain momentum SGD (decided by the first group's
        # `sgd_steps`).
        if self.iter <= self.param_groups[0]['sgd_steps']:
            self.sgd.step()
            return loss

        for group in self.param_groups:
            alpha = group['alpha']
            beta = group['beta']
            gamma = group['gamma']
            K = group['K']

            # Momentum schedule, clamped to [0.5, 0.9].
            mu = group['momentum'] * (1 - (1 / (1 + self.iter)))
            mu = min(0.9, max(0.5, mu))
            eta = group['lr'] / self.iter  # learning rate decays with time

            reset = self.iter % K == 0

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data

                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['m'] = torch.zeros_like(p.data).float()
                    state['d'] = torch.zeros_like(p.data).float()
                    # Clone so the average does not alias the parameter and
                    # follow its in-place updates.
                    state['moving_avg'] = p.data.clone()

                state['step'] += 1

                # Reset neumann iterate
                if reset:
                    state['m'] = grad.mul(-eta)

                # Compute update d_t
                diff = p.data.sub(state['moving_avg'])
                diff_norm = diff.norm(p=1)
                if diff_norm.item() != 0:
                    scale = diff_norm.pow(2).mul(alpha).sub(
                        diff_norm.pow(-2).mul(beta))
                    state['d'] = grad.add(scale.mul(diff.div(diff_norm)))
                else:
                    # The parameter equals its moving average: fall back to
                    # the raw gradient and avoid dividing by a zero norm.
                    state['d'] = grad

                # Update Neumann iterate
                state['m'].mul_(mu).sub_(state['d'].mul(eta))

                # Update Weights
                p.data.add_(state['m'].mul(mu).sub(state['d'].mul(eta)))

                # Update Moving Average
                state['moving_avg'] = p.data.add(
                    state['moving_avg'].sub(p.data).mul(gamma))

            # Double the reset period once per reset, not once per parameter.
            if reset:
                group['K'] = K * 2

        return loss
train_loss_log = AverageMeter() train_acc_log = AverageMeter() val_loss_log = AverageMeter() val_acc_log = AverageMeter() for i, (images, labels) in enumerate(train_loader): # Convert torch tensor to Variable images = images.cuda() #Variable(images.view(-1, 28*28).cuda()) labels = Variable(labels.cuda()) # Forward + Backward + Optimize optimizer.zero_grad() # zero the gradient buffer outputs = net(images) train_loss = criterion(outputs, labels) train_loss.backward() optimizer.step() prec1, prec5 = accuracy(outputs.data, labels.data, topk=(1, 5)) train_loss_log.update(train_loss.data, images.size(0)) train_acc_log.update(prec1, images.size(0)) save_name = os.path.join(model_save_dir, str(iters) + '.pth.tar') ''' torch.save({'iter': iters, 'state_dict': net.state_dict(), 'optimizer' : optimizer.state_dict()}, save_name) ''' iters += 1 if (i + 1) % 100 == 0: print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Acc: %.8f' %
def _snapshot_net(net, path='net.pkl'):
    """Return an independent copy of ``net`` by round-tripping it through ``path``."""
    torch.save(net, path)
    return torch.load(path)


def training(optimizer_sign=0):
    """Train a fresh ``Net`` with the selected optimizer and collect metrics.

    Args:
        optimizer_sign (int): selects the optimizer:
            0 ``torch.optim.SGD``, 1 ``RMSprop``, 2 ``Adam`` (lr fixed to
            0.001), 3 ``SGD`` with momentum and weight decay,
            4 ``pid.PIDOptimizer`` with ``D=0``, 5 ``pid.PIDOptimizer``,
            6 ``pid.AdapidOptimizer``, 7 ``pid.AdapidOptimizer_test``,
            8 ``pid.specPIDoptimizer``, 9 ``pid.SVRGoptimizer``,
            any other value ``pid.SARAHoptimizer``.

    Returns:
        dict: keys ``'train_loss'``, ``'val_loss'``, ``'train_acc'`` and
        ``'val_acc'``, each holding one numpy value per epoch.
    """
    training_data = {'train_loss': [], 'val_loss': [],
                     'train_acc': [], 'val_acc': []}
    net = Net(input_size, hidden_size, num_classes)
    criterion = nn.CrossEntropyLoss()

    # oldnet_sign: the optimizer is fed gradients of a snapshot ("old") net
    #              through get_oldgrad().
    # basicgrad_sign: the optimizer is additionally fed gradients computed on
    #                 BGD_loader through get_basicgrad().
    oldnet_sign = False
    basicgrad_sign = False

    print('optimizer_sign:' + str(optimizer_sign))
    if optimizer_sign == 0:
        optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)
    elif optimizer_sign == 1:
        optimizer = torch.optim.RMSprop(net.parameters(), lr=learning_rate)
    elif optimizer_sign == 2:
        optimizer = torch.optim.Adam(net.parameters(), lr=.001)
    elif optimizer_sign == 3:
        optimizer = SGD(net.parameters(), lr=learning_rate,
                        weight_decay=0.001, momentum=0.9)
    elif optimizer_sign == 4:
        optimizer = pid.PIDOptimizer(net.parameters(), lr=learning_rate,
                                     weight_decay=0.001, momentum=0.9,
                                     I=I, D=0)
    elif optimizer_sign == 5:
        optimizer = pid.PIDOptimizer(net.parameters(), lr=learning_rate,
                                     weight_decay=0.001, momentum=0.9,
                                     I=I, D=D)
    elif optimizer_sign == 6:
        optimizer = pid.AdapidOptimizer(net.parameters(), lr=learning_rate,
                                        weight_decay=0.001, momentum=0.9,
                                        I=I, D=D)
    elif optimizer_sign == 7:
        optimizer = pid.AdapidOptimizer_test(net.parameters(),
                                             lr=learning_rate,
                                             weight_decay=0.001, momentum=0.9,
                                             I=I, D=D)
    elif optimizer_sign == 8:
        optimizer = pid.specPIDoptimizer(net.parameters(), lr=learning_rate,
                                         weight_decay=0.001, momentum=0.9,
                                         I=I, D=D)
        oldnet_sign = True
    elif optimizer_sign == 9:
        optimizer = pid.SVRGoptimizer(net.parameters(), lr=learning_rate,
                                      weight_decay=0.001, momentum=0.9)
        oldnet_sign = True
        basicgrad_sign = True
    else:
        optimizer = pid.SARAHoptimizer(net.parameters(), lr=learning_rate,
                                       weight_decay=0.001, momentum=0.9)
        oldnet_sign = True
        basicgrad_sign = True

    if oldnet_sign:
        old_net = _snapshot_net(net)

    for epoch in range(num_epochs):
        # Evaluation at the end of the previous epoch switched the net to
        # eval mode; switch back before training again.
        net.train()

        train_loss_log = AverageMeter()
        train_acc_log = AverageMeter()
        val_loss_log = AverageMeter()
        val_acc_log = AverageMeter()

        for i, (images, labels) in enumerate(train_loader):
            if basicgrad_sign and i % 100 == 0:
                # Every 100 batches: a pass over BGD_loader that hands the
                # resulting gradients to the optimizer.
                for all_images, all_labels in BGD_loader:
                    all_images = all_images.view(-1, 28 * 28)

                    optimizer.zero_grad()
                    outputs = net(all_images)
                    train_loss = criterion(outputs, all_labels)
                    train_loss.backward()

                    optimizer.get_basicgrad(
                        [param.grad.detach() for param in net.parameters()])
                    optimizer.step()

                    prec1, _ = accuracy(outputs.data, all_labels.data,
                                        topk=(1, 5))
                    train_loss_log.update(train_loss.data, all_images.size(0))
                    train_acc_log.update(prec1, all_images.size(0))

                # NOTE(review): the source's indentation was lost; snapshot
                # and progress line are placed after the BGD_loader pass
                # (identical to per-batch placement when BGD_loader yields a
                # single batch) -- confirm.
                old_net = _snapshot_net(net)
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Acc: %.8f'
                      % (epoch + 1, num_epochs, i + 1,
                         len(train_dataset) // batch_size,
                         train_loss_log.avg, train_acc_log.avg))

            images = images.view(-1, 28 * 28)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = net(images)
            train_loss = criterion(outputs, labels)
            train_loss.backward()

            if oldnet_sign:
                # Gradient of the same batch evaluated on the snapshot net.
                # NOTE(review): for optimizer_sign == 8 the snapshot is never
                # refreshed and its .grad buffers are never zeroed, so these
                # gradients accumulate across batches -- confirm intent.
                old_loss = criterion(old_net(images), labels)
                old_loss.backward()
                optimizer.get_oldgrad(
                    [param.grad.detach() for param in old_net.parameters()])

            optimizer.step()

            if oldnet_sign and optimizer_sign != 8:
                old_net = _snapshot_net(net)

            prec1, _ = accuracy(outputs.data, labels.data, topk=(1, 5))
            train_loss_log.update(train_loss.data, images.size(0))
            train_acc_log.update(prec1, images.size(0))

            if (i + 1) % 30 == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Acc: %.8f'
                      % (epoch + 1, num_epochs, i + 1,
                         len(train_dataset) // batch_size,
                         train_loss_log.avg, train_acc_log.avg))

        training_data['train_loss'].append(
            train_loss_log.avg.detach().cpu().numpy())
        training_data['train_acc'].append(
            train_acc_log.avg.detach().cpu().numpy())

        # Test the Model (no gradients are needed here).
        net.eval()
        with torch.no_grad():
            for images, labels in test_loader:
                images = images.view(-1, 28 * 28)
                outputs = net(images)
                test_loss = criterion(outputs, labels)
                val_loss_log.update(test_loss.data, images.size(0))
                prec1, _ = accuracy(outputs.data, labels.data, topk=(1, 5))
                val_acc_log.update(prec1, images.size(0))

        print('Accuracy of the network on the 10000 test images: %.8f %%'
              % (val_acc_log.avg))
        print('Loss of the network on the 10000 test images: %.8f'
              % (val_loss_log.avg))
        training_data['val_loss'].append(
            val_loss_log.avg.detach().cpu().numpy())
        training_data['val_acc'].append(
            val_acc_log.avg.detach().cpu().numpy())

    return training_data
class MLPWithMNIST:
    """One trial: a ``Network`` trained on MNIST with Nesterov SGD.

    The trial is resumable.  On construction it tries to load the checkpoint
    named ``ckpt_name`` and, if found, restores the model weights and the
    epoch counter.

    Args:
        hparams (dict): hyper-parameters; ``'lr'`` and ``'momentum'`` are read
            here and the whole dict is passed to ``Network``.
        ckpt_name (str): checkpoint base name; also the key into
            ``separate_history``.
        homedir (str): directory handed to ``mnist_data_loader``; checkpoints
            live in its ``ckpt`` sub-directory.
        separate_history (dict): maps ``ckpt_name`` to a list that receives
            one ``(hparams, val_loss)`` tuple per epoch.
        patience (int): number of consecutive non-improving epochs after
            which :meth:`evaluate` stops early.
    """

    def __init__(self, hparams, ckpt_name, homedir, separate_history,
                 patience):
        self.hparams = hparams
        self.batch_size = 256
        self.loader_train, self.loader_valid, self.loader_test = \
            mnist_data_loader(self.batch_size, homedir)
        self.model = Network(hparams)
        self.loss_fn = nn.CrossEntropyLoss()
        self.lr = hparams['lr']
        self.momentum = hparams['momentum']
        self.optimizer = SGD(self.model.parameters(), lr=self.lr,
                             momentum=self.momentum, nesterov=True)
        self.epoch = 0
        # os.path.join gives the same result as plain concatenation when
        # homedir ends with a separator, and also works when it does not.
        self.ckpt_dir = os.path.join(homedir, "ckpt")
        self.ckpt_name = ckpt_name
        self.separate_history = separate_history
        self.patience = patience

        try:
            ckpt = self._load_checkpoint(self.ckpt_name)
        except FileNotFoundError:
            # No checkpoint yet: start from scratch.
            pass
        else:
            self.model.load_state_dict(ckpt['state_dict'])
            self.epoch = ckpt['current_epoch']

    def evaluate(self, num_iter):
        """Train until epoch ``num_iter`` or until validation stops improving.

        Args:
            num_iter (int): target value of the epoch counter.

        Returns:
            tuple: ``(min_val_loss, overfitted)``.  ``overfitted`` is ``True``
            when training stopped early after ``patience`` consecutive epochs
            without improvement; in that case no checkpoint is written.
        """
        min_val_loss = np.inf
        overfitted_cnt = 0

        for _ in range(num_iter - self.epoch):
            self._train_one_epoch()
            self.epoch += 1
            val_loss = self._validate_one_epoch()
            self.separate_history[self.ckpt_name].append(
                (self.hparams, val_loss))

            if val_loss < min_val_loss:
                min_val_loss = val_loss
                overfitted_cnt = 0
            else:
                overfitted_cnt += 1
                if overfitted_cnt >= self.patience:
                    print("model overfitted.")
                    return min_val_loss, True

        state = {
            'state_dict': self.model.state_dict(),
            'min_val_loss': min_val_loss,
            'current_epoch': self.epoch,
        }
        self._save_checkpoint(state, self.ckpt_name)
        return min_val_loss, False

    def _train_one_epoch(self):
        """Run one optimization pass over the training loader."""
        self.model.train()
        for data, targets in self.loader_train:
            self.model.zero_grad()
            outputs = self.model(data)
            loss = self.loss_fn(outputs, targets)
            loss.backward()
            self.optimizer.step()

    def _validate_one_epoch(self):
        """Return the classification error on the validation set, in percent."""
        self.model.eval()
        correct = 0
        with torch.no_grad():
            for data, targets in self.loader_valid:
                outputs = self.model(data)
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == targets).sum().item()
        data_num = len(self.loader_valid.dataset)
        return (1 - correct / data_num) * 100

    def _save_checkpoint(self, state, name):
        """Write ``state`` to ``<ckpt_dir>/<name>.pth.tar``, creating the directory if needed."""
        os.makedirs(self.ckpt_dir, exist_ok=True)
        ckpt_path = os.path.join(self.ckpt_dir, name + '.pth.tar')
        torch.save(state, ckpt_path)

    def _load_checkpoint(self, name):
        """Load and return the object stored at ``<ckpt_dir>/<name>.pth.tar``."""
        ckpt_path = os.path.join(self.ckpt_dir, name + '.pth.tar')
        return torch.load(ckpt_path)