def test_sparse_adam(self):
    self._test_rosenbrock_sparse(
        lambda params: optim.SparseAdam(params, lr=4e-2),
        True
    )
    with self.assertRaisesRegex(ValueError,
                                "Invalid beta parameter at index 0: 1.0"):
        optim.SparseAdam(None, lr=1e-2, betas=(1.0, 0.0))
def _init_optimizer(self):
    """Initialise the optimizer(s), splitting sparse and dense embedding weights."""
    if not hasattr(self, 'embeddings_'):
        raise ValueError('You should call .fit first!')
    params = {'sparse': [], 'dense': []}
    for lyr in self.embeddings_.children():
        if lyr.sparse:
            params['sparse'].append(lyr.weight)
        else:
            params['dense'].append(lyr.weight)
    # determine if a sparse optimizer is needed
    if len(params['sparse']) == 0:
        self.opt = optim.Adam(params['dense'], lr=self.learn_rate,
                              weight_decay=self.l2)
    elif len(params['dense']) == 0:
        self.opt = optim.SparseAdam(params['sparse'], lr=self.learn_rate)
    else:
        # init multi-optimizer and register it to the instance
        self.opt = MultipleOptimizer(
            optim.SparseAdam(params['sparse'], lr=self.learn_rate),
            optim.Adam(params['dense'], lr=self.learn_rate,
                       weight_decay=self.l2))
def __init__(self, logger, ntm_params={}, tmn_params={}, kl_growing_epoch=0):
    self.ntm_params = ntm_params
    self.ntm = NeuralTopicModel(**ntm_params)
    self.ntm.cuda()
    self.tmn_params = tmn_params
    self.tmn = TopicMemoryNetwork(**tmn_params)
    self.tmn.cuda()
    self.ntm_optimizer = optim.SparseAdam(self.ntm.parameters())
    self.tmn_optimizer = optim.SparseAdam(self.tmn.parameters())
    self.kl_strength = 1.0
    self.ntm_loss_fn = NTMLoss(self.kl_strength)
    self.logger = logger
    self.optimize_ntm = True
    self.first_optimize_ntm = True
    self.min_bound_ntm = np.inf
    self.min_bound_cls = -np.inf
    self.epoch_since_improvement = 0
    self.epoch_since_improvement_global = 0
    self.psudo_indices = np.expand_dims(
        np.arange(self.ntm_params.get("n_topics")), axis=0)
    self.kl_growing_epoch = kl_growing_epoch
    self.max_epochs = 0
    self.current_epoch = 0
def optimizer(params, mode, *args, **kwargs):
    if mode == 'SGD':
        opt = optim.SGD(params, *args, momentum=0., **kwargs)
    elif mode.startswith('nesterov'):
        momentum = float(mode[len('nesterov'):])
        opt = optim.SGD(params, *args, momentum=momentum, nesterov=True, **kwargs)
    elif mode.lower() == 'adam':
        betas = kwargs.pop('betas', (.9, .999))
        opt = optim.Adam(params, *args, betas=betas, amsgrad=True, **kwargs)
    elif mode.lower() == 'adam_hyp2':
        betas = kwargs.pop('betas', (.5, .99))
        opt = optim.Adam(params, *args, betas=betas, amsgrad=True, **kwargs)
    elif mode.lower() == 'adam_hyp3':
        betas = kwargs.pop('betas', (0., .99))
        opt = optim.Adam(params, *args, betas=betas, amsgrad=True, **kwargs)
    elif mode.lower() == 'adam_sparse':
        betas = kwargs.pop('betas', (.9, .999))
        opt = optim.SparseAdam(params, *args, betas=betas)
    elif mode.lower() == 'adam_sparse_hyp2':
        betas = kwargs.pop('betas', (.5, .99))
        opt = optim.SparseAdam(params, *args, betas=betas)
    elif mode.lower() == 'adam_sparse_hyp3':
        betas = kwargs.pop('betas', (.0, .99))
        opt = optim.SparseAdam(params, *args, betas=betas)
    else:
        raise NotImplementedError()
    return opt
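# Usage sketch for the factory above (illustrative only; the embedding layer and
# learning rate below are assumptions, not part of the original snippet). Note that
# the *_sparse branches forward only positional *args to SparseAdam, so the learning
# rate is passed positionally here; a keyword lr would be silently dropped.
import torch.nn as nn
import torch.optim as optim

emb = nn.Embedding(10_000, 128, sparse=True)  # sparse=True makes the embedding emit sparse grads
opt = optimizer(emb.parameters(), 'adam_sparse', 1e-3)  # lr passed via *args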
def train(self, iterations, lr=None, negative_targets=5, discard_probability=0.05):
    for iteration in range(iterations):
        print("Epoch: {}".format(iteration + 1))
        if not lr:
            lr = self.initial_lr
        optimizer = optim.SparseAdam(
            list(self.skip_gram_model.parameters()), lr=lr)
        self.dataset.update_discard_probability(discard_probability)
        self.dataset.update_negative_targets(negative_targets)
        self.dataloader = self.create_dataloader(self.dataset)
        running_loss = 0.0
        for i, sample_batched in enumerate(tqdm(self.dataloader)):
            if len(sample_batched[0]) > 1:
                pos_u = sample_batched[0].to(self.device)
                pos_v = sample_batched[1].to(self.device)
                neg_v = sample_batched[2].to(self.device)
                optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()  # was "= +loss.item()", which discarded the running total
        print("Loss:", running_loss)
    self.skip_gram_model.save_embedding(self.dataset.data.id2gene,
                                        self.output_file_name)
def fit(self, raw_inputs):
    """ Learn a vocabulary """
    flat_items = raw_inputs if self.reduce is None \
        else list(it.chain.from_iterable(raw_inputs))
    if self.vocab_size is None:
        # if vocab size is None, use all items
        cutoff = len(flat_items)
    elif isinstance(self.vocab_size, float):
        # if vocab size is a float, interpret it as the percentage of top items (authors)
        cutoff = int(self.vocab_size * len(flat_items))
    else:
        # else use the fixed (integer) vocab size, which is fine as well
        cutoff = int(self.vocab_size)
    print("Using top {:.2f}% authors ({})".format(cutoff / len(flat_items) * 100, cutoff))
    item_cnt = Counter(flat_items).most_common(cutoff)
    # index 0 is reserved for the unk idx
    self.vocab = {value: idx + 1 for idx, (value, __) in enumerate(item_cnt)}
    num_embeddings = len(self.vocab) + 1
    self.embedding = nn.Embedding(num_embeddings, self.embedding_dim,
                                  padding_idx=self.padding_idx,
                                  **self.embedding_params,
                                  sparse=self.sparse)
    if self.use_cuda and self.embedding_on_gpu:
        # put the embedding on the GPU only when wanted
        self.embedding = self.embedding.cuda()
    print("Embedding before creating optimizer:", self.embedding, sep='\n')
    if self.sparse:
        self.optimizer = optim.SparseAdam(self.embedding.parameters(), lr=self.lr)
    else:
        self.optimizer = optim.Adam(self.embedding.parameters(), lr=self.lr)
    return self
def get_optimizer(optimizer, lr, params):
    if optimizer == 'adagrad':
        optimizer = torch.optim.Adagrad(params, lr=lr*5, lr_decay=0, weight_decay=0,
                                        initial_accumulator_value=0, eps=1e-10)
    elif optimizer == 'adadelta':
        optimizer = optim.Adadelta(params, lr=lr*100*5, rho=0.9, eps=1e-06, weight_decay=0)
    elif optimizer == 'adam':
        optimizer = optim.Adam(params, lr=lr/10*5, betas=(0.9, 0.999), eps=1e-08,
                               weight_decay=0, amsgrad=False)
    elif optimizer == 'adaw':
        optimizer = optim.AdamW(params, lr=lr/10*5, betas=(0.9, 0.999), eps=1e-08,
                                weight_decay=0.01, amsgrad=False)
    elif optimizer == 'sparseadam':
        # the /10*5 scaling belongs on the learning rate, not on the params iterable
        optimizer = optim.SparseAdam(params, lr=lr/10*5, betas=(0.9, 0.999), eps=1e-08)
    elif optimizer == 'ASGD':
        optimizer = optim.ASGD(params, lr=lr*5, lambd=0.0001, alpha=0.75,
                               t0=1000000.0, weight_decay=0)
    elif optimizer == 'LBFGS':
        optimizer = optim.LBFGS(params, lr=lr*100*5)
    elif optimizer == 'RMSprop':
        optimizer = optim.RMSprop(params, lr=lr*5)
    elif optimizer == 'rprop':
        optimizer = optim.Rprop(params, lr=lr*5)
    elif optimizer == 'SGD':
        optimizer = optim.SGD(params, lr=lr*5, momentum=0, dampening=0,
                              weight_decay=0, nesterov=False)
    elif optimizer == 'adamax':
        # standard: adamax; best lr=0.01, default is lr=0.002,
        # multiply every other optimizer's lr by a factor of 5 as well
        optimizer = optim.Adamax(params, lr=lr)
    else:
        raise Exception("Optimizer not supported. Please change it!")
    return optimizer
def get_optim(config, model):
    if config.optimizer == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=float(config.lr))
    elif config.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(model.parameters(), lr=float(config.lr))
    elif config.optimizer == 'Adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=float(config.lr))
    elif config.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=float(config.lr))
    elif config.optimizer == 'Adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr=float(config.lr))
    elif config.optimizer == 'AdamW':
        optimizer = optim.AdamW(model.parameters(), lr=float(config.lr))
    elif config.optimizer == 'SparseAdam':
        optimizer = optim.SparseAdam(model.parameters(), lr=float(config.lr))
    elif config.optimizer == 'Adamax':
        optimizer = optim.Adamax(model.parameters(), lr=float(config.lr))
    elif config.optimizer == 'ASGD':
        optimizer = optim.ASGD(model.parameters(), lr=float(config.lr))
    elif config.optimizer == 'LBFGS':
        optimizer = optim.LBFGS(model.parameters(), lr=float(config.lr))
    elif config.optimizer == 'Rprop':
        optimizer = optim.Rprop(model.parameters(), lr=float(config.lr))
    print('\noptimizer :', optimizer, '\n')
    return optimizer
def __init__(self, epochs, training_file, name="model/skipgram", embedding_dim=100,
             batch_size=256, window_size=2, negative_sample=5):
    self.epochs = epochs
    self.training_file = training_file
    self.embedding_dim = embedding_dim
    self.batch_size = batch_size
    self.win_size = window_size
    self.neg_samples = negative_sample
    corpus = TokenizedCorpus(self.training_file)
    self.sentences = corpus.get_words()
    self.skip_data = VsGram(self.sentences, self.win_size)
    self.vocab = self.skip_data.w2i
    self.model = SkipGram(self.vocab, self.embedding_dim)
    self.optimizer = optim.SparseAdam(self.model.parameters(), lr=0.001)
    self.name = name  # was hard-coded to "model/skipgram", ignoring the argument
    # save w2i and i2w as json
    with open(os.path.join(self.name + "_i2w.txt"), "w") as out:
        json.dump(self.skip_data.i2w, out, indent=4)
    with open(os.path.join(self.name + "_w2i.txt"), "w") as out:
        json.dump(self.skip_data.w2i, out, indent=4)
    self.training()
def get_optimiser(name, net_params, optim_params):
    lr = optim_params['learning_rate']
    momentum = optim_params['momentum']
    weight_decay = optim_params['weight_decay']
    if name == "SGD":
        return optim.SGD(net_params, lr, momentum=momentum, weight_decay=weight_decay)
    elif name == "Adam":
        return optim.Adam(net_params, lr, weight_decay=1e-5)
    elif name == "SparseAdam":
        return optim.SparseAdam(net_params, lr)
    elif name == "Adadelta":
        return optim.Adadelta(net_params, lr, weight_decay=weight_decay)
    elif name == "Adagrad":
        return optim.Adagrad(net_params, lr, weight_decay=weight_decay)
    elif name == "Adamax":
        return optim.Adamax(net_params, lr, weight_decay=weight_decay)
    elif name == "ASGD":
        return optim.ASGD(net_params, lr, weight_decay=weight_decay)
    elif name == "LBFGS":
        return optim.LBFGS(net_params, lr)
    elif name == "RMSprop":
        return optim.RMSprop(net_params, lr, momentum=momentum, weight_decay=weight_decay)
    elif name == "Rprop":
        return optim.Rprop(net_params, lr)
    else:
        raise ValueError("unsupported optimizer {0:}".format(name))
def set_parameters(self, params):
    """Split named parameters into dense and sparse groups and build the optimizer(s)."""
    self.params = []
    self.sparse_params = []
    for k, p in params:
        if p.requires_grad:
            if self.method != 'sparseadam' or "embed" not in k:
                self.params.append(p)
            else:
                self.sparse_params.append(p)
    if self.method == 'sgd':
        self.optimizer = optim.SGD(self.params, lr=self.learning_rate)
    elif self.method == 'adagrad':
        self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)
        for group in self.optimizer.param_groups:
            for p in group['params']:
                self.optimizer.state[p]['sum'] = self.optimizer\
                    .state[p]['sum'].fill_(self.adagrad_accum)
    elif self.method == 'adadelta':
        self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)
    elif self.method == 'adam':
        self.optimizer = optim.Adam(self.params, lr=self.learning_rate,
                                    betas=self.betas, eps=1e-9)
    elif self.method == 'sparseadam':
        self.optimizer = MultipleOptimizer(
            [optim.Adam(self.params, lr=self.learning_rate,
                        betas=self.betas, eps=1e-8),
             optim.SparseAdam(self.sparse_params, lr=self.learning_rate,
                              betas=self.betas, eps=1e-8)])
    else:
        raise RuntimeError("Invalid optim method: " + self.method)
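# `MultipleOptimizer` is used in the snippets above but not defined in this section.
# A minimal sketch of such a wrapper, assuming the list-of-optimizers form used
# directly above (an earlier snippet passes the optimizers as separate positional
# arguments instead); this is an illustration, not the original implementation.
class MultipleOptimizer:
    """Fan zero_grad()/step() out to several wrapped optimizers."""

    def __init__(self, optimizers):
        self.optimizers = list(optimizers)

    def zero_grad(self):
        for op in self.optimizers:
            op.zero_grad()

    def step(self):
        for op in self.optimizers:
            op.step()

    def state_dict(self):
        return [op.state_dict() for op in self.optimizers]

    def load_state_dict(self, state_dicts):
        for op, sd in zip(self.optimizers, state_dicts):
            op.load_state_dict(sd)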
def __init__(self, root_dir='./VolSegData/', checkpoint_dir='./checkpoints/',
             batch_size=10, shuffle=True, num_workers=0, num_epochs=2, load_from=None):
    self.model = sc.Net.create()
    if load_from:
        self.model.load_state_dict(torch.load(load_from), strict=False)
    self.dataloader = dl.LitsDataSet.create(root_dir=root_dir, batch_size=batch_size,
                                            shuffle=True, num_workers=0)
    self.criterion = nn.MSELoss()
    self.optimizer = optim.SparseAdam(self.model.parameters(), lr=0.001,
                                      betas=(0.9, 0.999), eps=1e-08)
    self.checkpoint = checkpoint_dir
    self.num_epochs = num_epochs
def _initialize(self, interactions):
    (self._num_users,
     self._num_items) = (interactions.num_users,
                         interactions.num_items)
    if self._representation is not None:
        self._net = gpu(self._representation, self._use_cuda)
    else:
        self._net = gpu(
            BilinearNet(self._num_users, self._num_items,
                        self._embedding_dim, sparse=self._sparse),
            self._use_cuda)
    if self._optimizer_func is None:
        if self._sparse:
            self._optimizer = optim.SparseAdam(self._net.parameters())
        else:
            self._optimizer = optim.Adam(self._net.parameters(),
                                         weight_decay=self._l2,
                                         lr=self._learning_rate)
    else:
        self._optimizer = self._optimizer_func(self._net.parameters())
    if self._loss == 'pointwise':
        self._loss_func = pointwise_loss
    elif self._loss == 'bpr':
        self._loss_func = bpr_loss
    elif self._loss == 'hinge':
        self._loss_func = hinge_loss
    else:
        self._loss_func = adaptive_hinge_loss
def train(self):
    optimizer = optim.SparseAdam(self.skip_gram_model.parameters(), lr=self.initial_lr)
    path_to_save = os.path.join(self.args["output_folder"], "word_vectors.npy")
    for iteration in range(self.iterations):
        running_loss = 0.0
        for i, sample_batched in enumerate(self.dataloader):
            if len(sample_batched[0]) > 1:
                pos_u = sample_batched[0].to(self.device)
                pos_v = sample_batched[1].to(self.device)
                neg_v = sample_batched[2].to(self.device)
                optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                loss.backward()
                optimizer.step()
                running_loss = running_loss * 0.9 + loss.item() * 0.1
        if (iteration + 1) % self.args["print_every"] == 0:
            print("Iter {}: Loss: {}".format(iteration, running_loss))
    self.skip_gram_model.save_embedding(path_to_save)
    json.dump(
        self.id2token,
        open(os.path.join(self.args["output_folder"], "index2word.json"), 'w'))
def get_optim(lr):
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr, eps=1e-9, betas=[0.9, 0.98])
    elif args.optim == 'sparseadam':
        optimizer = optim.SparseAdam(model.parameters(), lr=lr, eps=1e-9, betas=[0.9, 0.98])
    elif args.optim == 'adamax':
        optimizer = optim.Adamax(model.parameters(), lr=lr, eps=1e-9, betas=[0.9, 0.98])
    elif args.optim == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr=lr, eps=1e-9, momentum=0.9)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=lr)  # 0.01
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=lr)
    elif args.optim == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr=lr)
    return optimizer
def train(self):
    for iteration in range(self.iterations):
        print("\n\n\nIteration: " + str(iteration + 1))
        optimizer = optim.SparseAdam(self.skip_gram_model.parameters(), lr=self.initial_lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, len(self.dataloader))
        running_loss = 0.0
        for i, sample_batched in enumerate(tqdm(self.dataloader)):
            if len(sample_batched[0]) > 1:
                pos_u = sample_batched[0].to(self.device)
                pos_v = sample_batched[1].to(self.device)
                neg_v = sample_batched[2].to(self.device)
                optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                loss.backward()
                optimizer.step()
                scheduler.step()  # step the scheduler after the optimizer (PyTorch >= 1.1)
                running_loss = running_loss * 0.9 + loss.item() * 0.1
                if i > 0 and i % 500 == 0:
                    print(" Loss: " + str(running_loss))
    self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name)
def _build_optimizer(self):
    r"""Init the Optimizer

    Returns:
        torch.optim: the optimizer
    """
    if self.learner.lower() == 'adam':
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
    elif self.learner.lower() == 'sgd':
        optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate)
    elif self.learner.lower() == 'adagrad':
        optimizer = optim.Adagrad(self.model.parameters(), lr=self.learning_rate)
    elif self.learner.lower() == 'rmsprop':
        optimizer = optim.RMSprop(self.model.parameters(), lr=self.learning_rate)
    elif self.learner.lower() == 'sparse_adam':
        optimizer = optim.SparseAdam(self.model.parameters(), lr=self.learning_rate)
    else:
        self.logger.warning(
            'Received unrecognized optimizer, set default Adam optimizer')
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
    return optimizer
def main():
    user = os.path.expanduser("~")
    user = os.path.join(user, 'PycharmProjects/Parametric_GT')
    current_dataset = 'caltech'
    max_epochs = 20
    batch_size = 8
    dataset, stats, number_of_classes = misc(user, current_dataset)
    dataset_train = os.path.join(dataset, 'train_labelled0.1')
    dataset_test = os.path.join(dataset, 'test')
    nets_and_features = create_dict_nets_and_features()
    net_types = ['resnet18']
    out_dir = os.path.join(os.path.join(os.path.join(user, 'Results'), current_dataset), 'nets')
    for net_type in net_types:
        inception = net_type == 'inception'
        train_loader = prepare_loader_train(dataset_train, stats, batch_size)
        test_loader = prepare_loader_val(dataset_test, stats, batch_size)
        net, feature_size = create_net(number_of_classes, nets_and_features, net_type=net_type)
        criterion = nn.CrossEntropyLoss()
        # note: SparseAdam only supports sparse gradients; a standard ResNet produces
        # dense gradients, so step() will raise unless sparse layers are used
        optimizer = optim.SparseAdam(net.parameters(), lr=1e-4)
        best_net = train(net, net_type, train_loader, test_loader, optimizer,
                         criterion, max_epochs, out_dir)
        net.load_state_dict(torch.load(best_net))
        net_accuracy = evaluate(net, test_loader)
        print('Accuracy: ' + str(net_accuracy))
def __init__(self,
             root_dir=["/data/keshav/CT/trainbatch/trainbatch1/",
                       "/data/keshav/CT/trainbatch/trainbatch2/"],
             checkpoint_dir='./checkpoints/',
             batch_size=10,
             shuffle=True,
             num_workers=0,
             num_epochs=2,
             load_from=None):
    self.model = sc.Net.create()
    self.batch_size = batch_size
    if load_from:
        self.model.load_state_dict(torch.load(load_from), strict=False)
    self.dataloader = LITS.LitsDataLoader.create(root_dir=root_dir, batch_size=batch_size,
                                                 shuffle=False, num_workers=0)
    self.criterion = nn.MSELoss()
    self.optimizer = optim.SparseAdam(self.model.parameters(), lr=0.001,
                                      betas=(0.9, 0.999), eps=1e-08)
    self.checkpoint = checkpoint_dir
    self.num_epochs = num_epochs
    self.N = 0
    self.epoch_loss = []
    self.batch_loss = []
    self.scan_loss = []
    self.test_loss = 0.0
    self.train_loss = 0.0
    self.N_test = 0
def optimizer_reset(self, learning_rate):
    self.learning_rate = learning_rate
    if self.optimizer_type == "Adam":
        self.optimizer = optim.Adam(self.dense_parameters, lr=learning_rate,
                                    weight_decay=self.l2_lambda)
        if len(self.sparse_parameters) > 0:
            self.sparse_optimizer = optim.SparseAdam(self.sparse_parameters,
                                                     lr=learning_rate)
        else:
            self.sparse_optimizer = None
    elif self.optimizer_type == "SGD":
        self.optimizer = optim.SGD(self.dense_parameters, lr=learning_rate,
                                   weight_decay=self.l2_lambda)
        if len(self.sparse_parameters) > 0:
            self.sparse_optimizer = optim.SGD(self.sparse_parameters, lr=learning_rate)
        else:
            self.sparse_optimizer = None
    if the_gpu() >= 0:
        recursively_set_device(self.optimizer.state_dict(), the_gpu())
        if self.sparse_optimizer is not None:
            recursively_set_device(self.sparse_optimizer.state_dict(), the_gpu())
def _build_optimizer(self, params):
    r"""Init the Optimizer

    Returns:
        torch.optim: the optimizer
    """
    if self.config['reg_weight'] and self.weight_decay and self.weight_decay * self.config['reg_weight'] > 0:
        self.logger.warning(
            'The parameters [weight_decay] and [reg_weight] are specified simultaneously, '
            'which may lead to double regularization.'
        )
    if self.learner.lower() == 'adam':
        optimizer = optim.Adam(params, lr=self.learning_rate, weight_decay=self.weight_decay)
    elif self.learner.lower() == 'sgd':
        optimizer = optim.SGD(params, lr=self.learning_rate, weight_decay=self.weight_decay)
    elif self.learner.lower() == 'adagrad':
        optimizer = optim.Adagrad(params, lr=self.learning_rate, weight_decay=self.weight_decay)
    elif self.learner.lower() == 'rmsprop':
        optimizer = optim.RMSprop(params, lr=self.learning_rate, weight_decay=self.weight_decay)
    elif self.learner.lower() == 'sparse_adam':
        optimizer = optim.SparseAdam(params, lr=self.learning_rate)
        if self.weight_decay > 0:
            self.logger.warning(
                'SparseAdam does not support the argument [weight_decay]; it will be ignored.')
    else:
        self.logger.warning('Received unrecognized optimizer, set default Adam optimizer')
        optimizer = optim.Adam(params, lr=self.learning_rate)
    return optimizer
def _build_optimizer(self, params):
    r"""Init the Optimizer

    Returns:
        torch.optim: the optimizer
    """
    if self.learner.lower() == 'adam':
        optimizer = optim.Adam(params, lr=self.learning_rate, weight_decay=self.weight_decay)
    elif self.learner.lower() == 'sgd':
        optimizer = optim.SGD(params, lr=self.learning_rate, weight_decay=self.weight_decay)
    elif self.learner.lower() == 'adagrad':
        optimizer = optim.Adagrad(params, lr=self.learning_rate, weight_decay=self.weight_decay)
    elif self.learner.lower() == 'rmsprop':
        optimizer = optim.RMSprop(params, lr=self.learning_rate, weight_decay=self.weight_decay)
    elif self.learner.lower() == 'sparse_adam':
        optimizer = optim.SparseAdam(params, lr=self.learning_rate)
        if self.weight_decay > 0:
            self.logger.warning(
                'SparseAdam does not support the argument [weight_decay]; it will be ignored.')
    else:
        self.logger.warning(
            'Received unrecognized optimizer, set default Adam optimizer')
        optimizer = optim.Adam(params, lr=self.learning_rate)
    return optimizer
def train(self):
    """Train the network with the settings used to initialise the Trainer"""
    for epoch in range(self.epochs):
        print("### Epoch: " + str(epoch))
        optimizer = optim.SparseAdam(self.skipgram.parameters(), lr=self.initial_lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, len(self.dataloader))
        running_loss = 0.0
        for sample_batched in tqdm(self.dataloader):
            if len(sample_batched[0]) > 1:
                pos_target = sample_batched[0].to(self.device)
                pos_context = sample_batched[1].to(self.device)
                neg_context = sample_batched[2].to(self.device)
                optimizer.zero_grad()
                loss = self.skipgram.forward(
                    pos_target, pos_context, neg_context
                )  # the loss is integrated into the forward function
                loss.backward()
                optimizer.step()
                scheduler.step()
                running_loss = running_loss * 0.9 + loss.item() * 0.1
        print(" Loss: " + str(running_loss))
    final_embeddings = self.skipgram.target_embeddings.weight.cpu().data.numpy()
    save_graph_embeddings(self.corpus, final_embeddings, self.output_fh)
def get_optimizer(model, name, hyper_parameters):
    """
    name can be: 'adam', 'sparseadam', 'adamax', 'sgd'
    """
    optimizer = None
    if name == "adam":
        optimizer = optim.Adam(model.parameters(), lr=hyper_parameters["lr"])
    elif name == "sparseadam":
        optimizer = optim.SparseAdam(model.parameters(),
                                     lr=hyper_parameters["lr"],
                                     betas=eval(hyper_parameters["betas"]),
                                     eps=hyper_parameters["eps"])
    elif name == "adamax":
        optimizer = optim.Adamax(model.parameters(),
                                 lr=hyper_parameters["lr"],
                                 betas=hyper_parameters["betas"],
                                 eps=hyper_parameters["eps"],
                                 weight_decay=hyper_parameters["weight_decay"])
    elif name == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=hyper_parameters["lr"],
                              momentum=hyper_parameters["momentum"],
                              weight_decay=hyper_parameters["weight_decay"])
    return optimizer
def get_optimizers(self, args, checkpoint):
    optimizers = list()
    if args.encoder_lr > 0:
        optimizer_encoder = optim.Adam(
            list(self.bert.parameters())
            + list(self.fc.parameters() if args.project else list()),
            lr=args.encoder_lr,
        )
        if args.resume_from_checkpoint is not None:
            optimizer_encoder.load_state_dict(checkpoint["optimizer_dense"])
            optimizer_encoder.param_groups[0]["lr"] = args.encoder_lr
            optimizer_encoder.param_groups[0]["weight_decay"] = args.encoder_weight_decay
        optimizers.append(optimizer_encoder)
    else:
        optimizers.append(DummyOptimizer(self.out.parameters(), defaults={}))
    if args.decoder_lr > 0:
        if args.sparse:
            optimizer_decoder = optim.SparseAdam(self.out.parameters(), lr=args.decoder_lr)
        else:
            optimizer_decoder = optim.Adam(self.out.parameters(), lr=args.decoder_lr)
        if args.resume_from_checkpoint is not None:
            optimizer_decoder.load_state_dict(checkpoint["optimizer_sparse"])
            if "weight_decay" not in optimizer_decoder.param_groups[0]:
                optimizer_decoder.param_groups[0]["weight_decay"] = 0
            optimizer_decoder.param_groups[0]["lr"] = args.decoder_lr
            if not args.sparse:
                optimizer_decoder.param_groups[0]["weight_decay"] = args.decoder_weight_decay
        optimizers.append(optimizer_decoder)
    else:
        optimizers.append(DummyOptimizer(self.out.parameters(), defaults={}))
    lr_schedulers = [
        getattr(LRSchedulers, lr_scheduler)(optimizer=optimizer, **lr_scheduler_config)
        for optimizer, (lr_scheduler, lr_scheduler_config) in zip(
            optimizers,
            [
                (args.encoder_lr_scheduler, args.encoder_lr_scheduler_config),
                (args.decoder_lr_scheduler, args.decoder_lr_scheduler_config),
            ],
        )
        # and not isinstance(optimizer, DummyOptimizer)
        if lr_scheduler is not None
    ]
    return tuple(optimizers), tuple(lr_schedulers)
def train(self):
    if self.optimizer == 'adam':
        optimizer = optim.Adam(self.skip_gram_model.parameters(),
                               lr=self.initial_lr, **self.optimizer_kwargs)
    elif self.optimizer == 'sparse_adam':
        optimizer = optim.SparseAdam(self.skip_gram_model.parameters(),
                                     lr=self.initial_lr, **self.optimizer_kwargs)
    elif self.optimizer == 'sgd':
        optimizer = optim.SGD(self.skip_gram_model.parameters(),
                              lr=self.initial_lr, **self.optimizer_kwargs)
    elif self.optimizer == 'asgd':
        optimizer = optim.ASGD(self.skip_gram_model.parameters(),
                               lr=self.initial_lr, **self.optimizer_kwargs)
    elif self.optimizer == 'adagrad':
        optimizer = optim.Adagrad(self.skip_gram_model.parameters(),
                                  lr=self.initial_lr, **self.optimizer_kwargs)
    else:
        raise Exception('Unknown optimizer!')
    for iteration in range(self.iterations):
        print("\n\n\nIteration: " + str(iteration + 1))
        if self.lr_schedule:
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, len(self.dataloader))
        running_loss = 0.0
        iprint = len(self.dataloader) // 20
        for i, sample_batched in enumerate(tqdm(self.dataloader)):
            if len(sample_batched[0]) > 1:
                pos_u = sample_batched[0].to(self.device)
                pos_v = sample_batched[1].to(self.device)
                neg_v = sample_batched[2].to(self.device)
                optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                loss.backward()
                optimizer.step()
                if self.lr_schedule:
                    scheduler.step()
                running_loss = running_loss * (1 - 5 / iprint) + loss.item() * (5 / iprint)
                if i > 0 and i % iprint == 0:
                    print(" Loss: " + str(running_loss) + ' lr: ' + str(
                        [param_group['lr'] for param_group in optimizer.param_groups]))
        print(" Loss: " + str(running_loss))
    self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name)
def train_skipgram(model, docs):
    '''train skipgram model'''
    torch.manual_seed(42)
    np.random.seed(42)
    # set optimizer HYPERPARAMS?
    optimizer = optim.SparseAdam(model.parameters())
    # batch size
    batch_size = 512
    # get pdf for negative sampling
    pdf = model.get_neg_sample_pdf(model.counter)
    # epoch
    for epoch in range(model.nr_epochs):
        print(f"Epoch: {epoch}")
        # get batch of positive and negative examples
        for step, (pos_batch, neg_batch) in enumerate(
                get_batches(model, docs, batch_size, pdf)):
            optimizer.zero_grad()
            # extract words
            pos_u = [x[0].item() for x in pos_batch]
            pos_v = [x[1].item() for x in pos_batch]
            neg_v = neg_batch
            # forward pass
            loss = model.forward(pos_u, pos_v, neg_v)
            if step % 50 == 0:
                print(f'at step {step}: loss: {loss.item()}')
            # save every 1000 steps ("&" was a typo for the modulo operator)
            if step % 1000 == 0:
                if not os.path.exists('./models'):
                    os.mkdir('./models')
                torch.save(model.state_dict(), './models/trained_w2v_bs_256_thr_120.pt')
            # backprop
            loss.backward()
            optimizer.step()
    # save model
    if not os.path.exists('./models'):
        os.mkdir('./models')
    torch.save(model.state_dict(), './models/trained_w2v_bs_256_thr_120.pt')
    # aggregate all docs to embeddings for retrieval
    print('Done with training. \nConverting all documents to embeddings.',
          'This may take a while.')
    model.aggregate_all_docs()
    print('Done with converting all docs')
def train(self, model: nn.Module, train_interactions: np.ndarray,
          test_interactions: np.ndarray, is_sparse: bool):
    optimizer: optim.Optimizer
    if is_sparse:
        optimizer = optim.SparseAdam(model.parameters(), lr=self.LR)
    else:
        optimizer = optim.Adam(model.parameters(), lr=self.LR,
                               weight_decay=self.WEIGHT_DECAY)
    train_loss_history = []
    test_loss_history = []
    train_dataset = get_dataset(train_interactions)
    test_dataset = get_dataset(test_interactions)
    test_users, test_movies, test_ratings = test_dataset.tensors
    data_loader = DataLoader(train_dataset, batch_size=self.BATCH_SIZE)
    model.to(DEVICE)
    for epoch in tqdm(range(0, self.EPOCHS), desc='Training'):
        train_loss = 0
        for users_batch, movies_batch, ratings_batch in data_loader:
            optimizer.zero_grad()
            prediction = model(users_batch, movies_batch)
            loss = self.loss(prediction, ratings_batch)
            for regularizer in self.regularizers:
                loss += regularizer(prediction)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        test_prediction = model(test_users, test_movies)
        test_loss = self.loss(test_prediction, test_ratings).item()
        for regularizer in self.regularizers:
            test_loss += regularizer(test_prediction).item()
        train_loss /= len(data_loader)
        train_loss_history.append(train_loss)
        test_loss_history.append(test_loss)
        if self.VERBOSE:
            msg = f'Train loss: {train_loss:.3f}, '
            msg += f'Test loss: {test_loss:.3f}'
            tqdm.write(msg)
    return train_loss_history, test_loss_history
def build_optimizer(model, args, reload=False):
    optimizer_sparse = None
    if args.optim.lower() == 'sgd':
        if args.sample_softmax > 0:
            dense_params, sparse_params = [], []
            for param in model.parameters():
                if param.size() == model.word_emb.weight.size():
                    sparse_params.append(param)
                else:
                    dense_params.append(param)
            optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2)
            optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom)
        else:
            optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.mom)
    elif args.optim.lower() == 'adam':
        if args.sample_softmax > 0:
            dense_params, sparse_params = [], []
            for param in model.parameters():
                if param.size() == model.word_emb.weight.size():
                    sparse_params.append(param)
                else:
                    dense_params.append(param)
            optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr)
            optimizer = optim.Adam(dense_params, lr=args.lr)
        else:
            optimizer = optim.Adam(model.parameters(), lr=args.lr)
    elif args.optim.lower() == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
    else:
        raise ValueError(f"optimizer type {args.optim} not recognized")
    if reload:
        if args.restart_from is not None:
            optim_name = f'optimizer_{args.restart_from}.pt'
        else:
            optim_name = 'optimizer.pt'
        optim_file_name = os.path.join(args.restart_dir, optim_name)
        logging(f"reloading {optim_file_name}")
        if os.path.exists(os.path.join(args.restart_dir, optim_name)):
            with open(os.path.join(args.restart_dir, optim_name), 'rb') as optim_file:
                opt_state_dict = torch.load(optim_file)
            try:
                optimizer.load_state_dict(opt_state_dict)
            # in case the optimizer param groups aren't the same shape, merge them
            except:
                logging("merging optimizer param groups")
                opt_state_dict["param_groups"][0]["params"] = [
                    param
                    for param_group in opt_state_dict["param_groups"]
                    for param in param_group["params"]
                ]
                opt_state_dict["param_groups"] = [opt_state_dict["param_groups"][0]]
                optimizer.load_state_dict(opt_state_dict)
        else:
            logging('Optimizer was not saved. Start from scratch.')
    return optimizer, optimizer_sparse
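# build_optimizer above may return both a dense optimizer and a separate
# optimizer_sparse for the embedding weights, so a training loop has to step both.
# A minimal sketch under that assumption; `model`, `args`, `train_loader`, and
# `loss_fn` are hypothetical placeholders, not names from the original code.
optimizer, optimizer_sparse = build_optimizer(model, args)

for inputs, targets in train_loader:
    optimizer.zero_grad()
    if optimizer_sparse is not None:
        optimizer_sparse.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    optimizer.step()                 # update dense parameters
    if optimizer_sparse is not None:
        optimizer_sparse.step()      # update sparse embedding parameters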
def train(self, training_files, output_file, num_epochs=100,
          init=weightInit.fromScratch, model_path=None):
    losses = list()
    dataloader = self.initDataLoader(training_files)
    self.weightInitialisation(init, saved_model_path=model_path)
    self.initDevice()
    for iteration in tqdm(range(num_epochs)):
        optimizer = optim.SparseAdam(self.skip_gram_model.parameters(), lr=self.initial_lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, len(dataloader))
        count = 0.0
        running_loss = 0.0
        cumulative_loss = 0.0
        for i, sample_batched in enumerate(dataloader):
            if len(sample_batched[0]) > 1:
                pos_u = sample_batched[0].to(self.device)
                pos_v = sample_batched[1].to(self.device)
                neg_v = sample_batched[2].to(self.device)
                optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                loss.backward()
                optimizer.step()
                scheduler.step()  # step the scheduler after the optimizer (PyTorch >= 1.1)
                running_loss = running_loss * 0.9 + loss.item() * 0.1
                cumulative_loss += loss.item()
                count += 1.0
        losses.append(cumulative_loss / count)
        self.iter_per_epoch = int(count * self.batch_size)
    # write the vectors to file
    # if torch.cuda.device_count() > 1:
    #     self.skip_gram_model.module.save_embedding(self.data.id2word, output_file,
    #                                                self.data.max_num_words_file)
    # else:
    #     self.skip_gram_model.save_embedding(self.data.id2word, output_file,
    #                                         self.data.max_num_words_file)
    self.skip_gram_model.save_embedding(self.data.id2word, output_file,
                                        self.data.max_num_words_file)
    return losses