import sys
import random

import torch


def main():
    par = Params(sys.argv)

    # seed every RNG for reproducibility
    random.seed(par.seed)
    torch.manual_seed(par.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(par.seed)

    if par.trn and par.val:
        chk = Checkpoint(par.dir)
        if chk.contains_model:
            ### resume training ###
            cfg, mod, opt = chk.load(par)  ### also moves to GPU if cfg.cuda
            # cfg.update_par(par)  ### updates par in cfg
            print_time('Learning [resume It={}]...'.format(cfg.n_iters_sofar))
        else:
            ### training from scratch ###
            cfg = Config(par)  ### reads cfg and par (reads vocabularies)
            mod = Model(cfg)
            if cfg.cuda:
                mod.cuda()  ### moves to GPU
            opt = Optimizer(cfg, mod)  ### builds the optimizer
            print_time('Learning [from scratch]...')
        trn = Dataset(par.trn, cfg.svoc, cfg.tvoc, par.batch_size,
                      par.max_src_len, par.max_tgt_len,
                      do_shuffle=True, do_filter=True, is_test=False)
        val = Dataset(par.val, cfg.svoc, cfg.tvoc, par.batch_size,
                      par.max_src_len, par.max_tgt_len,
                      do_shuffle=True, do_filter=True, is_test=True)
        Training(cfg, mod, opt, trn, val, chk)

    elif par.tst:
        ### inference ###
        chk = Checkpoint()
        cfg, mod, opt = chk.load(par, par.chk)
        # cfg.update_par(par)  ### updates cfg options with pars
        tst = Dataset(par.tst, cfg.svoc, cfg.tvoc, par.batch_size, 0, 0,
                      do_shuffle=False, do_filter=False, is_test=True)
        print_time('Inference [model It={}]...'.format(cfg.n_iters_sofar))
        Inference(cfg, mod, tst)
        raise ValueError

    if arguments.no_ft and arguments.pruning_iterations != 1:
        print("You can't specify a pruning_iterations value if there is no fine-tuning at all")
        raise ValueError

    get_mask = get_mask_function(arguments.pruning_type)
    _dataset = get_dataset(arguments)
    _targets = [int((n + 1) * (arguments.target / arguments.pruning_iterations))
                for n in range(arguments.pruning_iterations)]

    # Train model
    print('Train model !')
    print(f'Regularization with t-{_targets[0]}')
    training_model = Checkpoint(arguments, 'training')
    training_model.regularization = Regularization(None, _targets[0], arguments)
    training_model.load()
    train_model(training_model, arguments, [0, arguments.epochs], _dataset, None,
                soft_pruning=arguments.soft_pruning)
    if arguments.lr_rewinding:
        training_model.rewind_lr()

    if arguments.no_ft:
        print('\nPruning model without fine-tuning:')
        pruned_model = training_model.clone('pruned')
        pruned_model.load()
        mask = get_mask(pruned_model.model, arguments.target)
        apply_mask(pruned_model.model, mask)
        _acc, _top5, _test_loss = test_model(_dataset, pruned_model.model, arguments)
        pruned_model.save_results({'epoch': 'before',
                                   'acc': _acc,
                                   'top5': _top5,
                                   'loss': _test_loss,
                                   'norm': l2_norm(pruned_model.model),
                                   'pruned_param_count': pruned_model.model.compute_params_count(
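
# The pruning helpers used above (get_mask_function, apply_mask) are not part of
# this excerpt. The sketch below shows one way such helpers could look for a
# global magnitude-pruning variant; the function names and signatures mirror the
# calls above, but the 'global_magnitude' pruning_type value, the meaning of
# `target` as a percentage, and all internals are assumptions, not the project's
# actual code.
import torch


def get_mask_function(pruning_type):
    # Return a mask-building function for the requested pruning type.
    # Only a (hypothetical) global magnitude variant is sketched here.
    if pruning_type == 'global_magnitude':
        return global_magnitude_mask
    raise ValueError(f'Unknown pruning type: {pruning_type}')


def global_magnitude_mask(model, target):
    # Build per-parameter binary masks that zero the `target` percent of weights
    # with the smallest absolute values, ranked globally over all weight matrices
    # and convolution kernels (biases and other 1-D parameters are left untouched).
    weights = [p.detach().abs().flatten() for p in model.parameters() if p.dim() > 1]
    all_weights = torch.cat(weights)
    k = max(int(len(all_weights) * target / 100.0), 1)
    threshold = all_weights.kthvalue(k).values
    return {name: (p.detach().abs() > threshold).float()
            for name, p in model.named_parameters() if p.dim() > 1}


def apply_mask(model, mask):
    # Zero out the pruned weights in place.
    with torch.no_grad():
        for name, p in model.named_parameters():
            if name in mask:
                p.mul_(mask[name])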
def train(self, train_loader, val_loader=None, max_epochs=1000, enable_early_stopping=True):
    if val_loader is None:
        enable_early_stopping = False

    print()
    print("-" * 2, "Training Setup", "-" * 2)
    print(f"Maximum Epochs: {max_epochs}")
    print(f"Enable Early Stopping: {enable_early_stopping}")
    print("-" * 20)
    print("*Start Training.")

    # model setup
    self.model.train().to(self.device)
    if self.multi_gpus and torch.cuda.device_count() > 1:
        print(f"*Using {torch.cuda.device_count()} GPUs!")
        self.model = nn.DataParallel(self.model)

    # early stopping instance
    if enable_early_stopping:
        if self.early_stopping is None:
            self.early_stopping = EarlyStopping(patience=5)
        else:
            self.early_stopping.reset_counter()

    # training loop
    val_loss = None
    for epoch in range(1, max_epochs + 1):
        running_loss = 0.0
        for step, data in enumerate(train_loader, start=1):
            inputs, labels = data
            inputs, labels = inputs.to(self.device), labels.to(self.device)

            # zero the parameter gradients
            self.optimizer.zero_grad()

            # forward + backward + optimize
            outputs = self.model(inputs)
            loss = self.loss_func(outputs, labels)
            loss.backward()
            self.optimizer.step()

            # print statistics
            running_loss += loss.item()
            if step % 100 == 0 or step == len(train_loader):
                print(f"[{epoch}/{max_epochs}, {step}/{len(train_loader)}] "
                      f"loss: {running_loss / step:.3f}")

        # train & validation loss
        train_loss = running_loss / len(train_loader)
        if val_loader is None:
            print(f"train loss: {train_loss:.3f}")
        else:
            # FIXME: fixed the problem that the first validation was not correct
            val_loss = self.validation(val_loader)
            print(f"train loss: {train_loss:.3f}, val loss: {val_loss:.3f}")

        if enable_early_stopping:
            self.early_stopping(self.model, val_loss, self.optimizer)
            if self.early_stopping.get_early_stop():
                print("*Early Stopping.")
                break

    print("*Finished Training!")

    if enable_early_stopping:
        checkpoint = self.early_stopping.get_checkpoint()
    else:
        checkpoint = Checkpoint()
        checkpoint.tmp_save(self.model, self.optimizer, epoch, val_loss)
    self.checkpoint = checkpoint
    self.model = checkpoint.load(self.model, self.optimizer)["model"]
    return self.model
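
# The EarlyStopping helper that train() relies on is not included in this
# excerpt. The sketch below is consistent with how it is called above
# (patience counter, reset_counter, a callable update, get_early_stop,
# get_checkpoint); everything beyond those method names, including how the
# best model is checkpointed, is an assumption.
class EarlyStopping:
    def __init__(self, patience=5):
        self.patience = patience
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False
        self.checkpoint = Checkpoint()

    def reset_counter(self):
        # Allow the same instance to be reused across train() calls.
        self.counter = 0
        self.early_stop = False

    def __call__(self, model, val_loss, optimizer):
        # Track the best validation loss; stop once it has not improved
        # for `patience` consecutive epochs.
        if val_loss < self.best_loss:
            self.best_loss = val_loss
            self.counter = 0
            self.checkpoint.tmp_save(model, optimizer, None, val_loss)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def get_early_stop(self):
        return self.early_stop

    def get_checkpoint(self):
        return self.checkpoint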
import torch
from torch import nn, optim
from torch.utils.data import DataLoader


def train():
    args = configs.get_args()
    use_cuda = args.use_cuda and torch.cuda.is_available()

    # prepare dataset
    dataset = libs.dataset.MyDataset(min_length=args.min_length)
    voc_size = dataset.get_voc_size()
    dataloader = DataLoader(dataset, 1, True, drop_last=False)

    # prepare model
    model = models.TopModuleCNN(voc_size, output_channel=args.output_channel)
    if use_cuda:
        model = model.cuda()

    # load pretrained checkpoint if asked
    if args.resume:
        checkpoint_path = Checkpoint.get_certain_checkpoint("./experiment/cnn_net", "best")
        resume_checkpoint = Checkpoint.load(checkpoint_path)
        model = resume_checkpoint.model
        optimizer = resume_checkpoint.optimizer
        # rebuild the wrapped optimizer so it points at the restored parameters
        resume_optim = optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        optimizer.optimizer = resume_optim.__class__(model.parameters(), **defaults)
        start_epoch = resume_checkpoint.epoch
        max_ans_acc = resume_checkpoint.max_ans_acc
    else:
        start_epoch = 1
        max_ans_acc = 0
        optimizer = NoamOpt(512, 1, 2000,
                            optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

    # define loss
    loss = nn.CrossEntropyLoss(weight=torch.tensor([1., 4.]))
    if use_cuda:
        loss = loss.cuda()

    # training
    for i in range(start_epoch, args.epochs):
        # test the model
        if args.resume:
            test_ans_acc = max_ans_acc
        else:
            test_ans_acc = test(DataLoader(dataset, 1, True, drop_last=False), model, i)
        print('For EPOCH {}, total f1: {:.2f}'.format(i, test_ans_acc))

        # accumulate per-example losses and step every batch_size examples
        # (a trailing partial batch is discarded)
        j = 0
        los1 = []
        for _, data in enumerate(dataloader):
            j += 1
            x = data['que'].long()
            y = data['ans'].long()
            res = data['res'].long()
            if use_cuda:
                x, y, res = x.cuda(), y.cuda(), res.cuda()
            res_pred = model(x, y)
            los1.append(loss(res_pred, res).unsqueeze(0))

            # apply gradients
            if j % args.batch_size == 0:
                los1 = torch.cat(los1)
                los = los1.sum()
                model.zero_grad()
                los.backward()
                optimizer.step()
                los1 = []
                print('EPOCH: {}, {} / {} ====> LOSS: {:.2f}'.format(
                    i, j // args.batch_size, len(dataloader) // args.batch_size,
                    los.item() / args.batch_size))

        # save checkpoint if the score improved
        if test_ans_acc > max_ans_acc:
            max_ans_acc = test_ans_acc
            th_checkpoint = Checkpoint(model=model, optimizer=optimizer,
                                       epoch=i, max_ans_acc=max_ans_acc)
            th_checkpoint.save_according_name("./experiment/cnn_net", 'best')
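
# The NoamOpt wrapper constructed above with (512, 1, 2000, Adam(...)) follows
# the learning-rate schedule popularized by the Annotated Transformer: linear
# warmup followed by inverse-square-root decay, scaled by the model size. A
# minimal sketch of such a wrapper is shown below; the project's own class may
# differ in details.
class NoamOpt:
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        # Update the learning rate of every parameter group, then step the
        # wrapped optimizer.
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        # lr = factor * model_size^-0.5 * min(step^-0.5, step * warmup^-1.5)
        if step is None:
            step = self._step
        return self.factor * (self.model_size ** (-0.5) *
                              min(step ** (-0.5), step * self.warmup ** (-1.5)))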