optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.99)) criterion = nn.CrossEntropyLoss() writer = SummaryWriter(outf + '/exp') best_acc = float('-inf') best_since_last = 0 for ep in range(start_epoch, epochs + 1): if best_since_last == 20: break elif best_since_last % 8 == 0 and best_since_last != 0: adjust_learning_rate(optimizer, 0.5) train_metrics = train(model, train_loader, optimizer, criterion, ep) test_metrics = test(model, test_loader, criterion) # print(train_metrics) # print(test_metrics) writer.add_scalar('train-acc', train_metrics['acc'], global_step=ep) writer.add_scalar('train-loss', train_metrics['loss'], global_step=ep) writer.add_scalar('test-acc', test_metrics['acc'], global_step=ep) writer.add_scalar('test-loss', test_metrics['loss'], global_step=ep) if best_acc < test_metrics['acc']: best_since_last = 0 print(f'Found best at epoch {ep}\n') best_acc = test_metrics['acc'] torch.save(model.cpu().state_dict(), outf + '/best.pth') else: best_since_last += 1 torch.save(model.cpu().state_dict(), outf + '/last.pth') model.to(device)
def train(args): device = torch.device(f"cuda:{args.device_id}") model = AlexNet(n_cls=100, useLRN=args.useLRN, useDropOut=args.useDropOut) # model = AlexNet(num_classes= 100) criterion = nn.CrossEntropyLoss() model.to(device) optimizer = Adam(model.parameters(), lr=args.lr) train_loader, valid_loader = getLoaders(split="train", batch_size=args.batch_size, num_workers=args.num_workers, aug=args.useAug) train_loss_arr = [] valid_loss_arr = [] valid_acc_arr = [] valid_top5_arr = [] n_iter = 0 best_loss = float('inf') best_top1_acc = 0 best_top5_acc = 0 for ep in range(args.epoch): model.train() for _, (img, label) in tqdm(enumerate(train_loader), total=len(train_loader)): img, label = img.to(device), label.to(device) optimizer.zero_grad() pred = model(img) loss = criterion(pred, label) # loss = model.criterion(pred, label) loss.backward() optimizer.step() train_loss_arr.append(loss.item()) n_iter += 1 model.eval() ep_valid_loss_arr = [] ep_acc_arr = [] ep_top5_arr = [] with torch.no_grad(): for _, (img, label) in tqdm(enumerate(valid_loader), total=len(valid_loader)): img, label = img.to(device), label.to(device) pred = model(img) loss = criterion(pred, label) # loss = model.criterion(pred, label) acc = utils.top_k_acc(k=1, pred=pred.detach().cpu().numpy(), label=label.detach().cpu().numpy()) acc5 = utils.top_k_acc(k=5, pred=pred.detach().cpu().numpy(), label=label.detach().cpu().numpy()) ep_acc_arr.append(acc) ep_top5_arr.append(acc5) ep_valid_loss_arr.append(loss.item()) valid_loss = np.mean(ep_valid_loss_arr) valid_acc = np.mean(ep_acc_arr) valid_top5 = np.mean(ep_top5_arr) train_loss = np.mean(train_loss_arr[-len(train_loader):]) valid_loss_arr.append(valid_loss) if valid_loss < best_loss: best_loss = valid_loss best_top1_acc = valid_acc best_top5_acc = valid_top5 model.cpu() torch.save(model.state_dict(), "best_model.pth") model.to(device) if (ep + 1) % 10 == 0: model.cpu() torch.save( { "model": model.state_dict(), "optimizer": optimizer.state_dict(), "train_loss": train_loss_arr, "valid_loss": valid_loss_arr, "valid_acc": valid_acc_arr, "valid_top5": valid_top5_arr, "best_loss": best_loss, "ep": ep, "n_iter": n_iter, }, "model_checkpoint.pth") model.to(device) print( f"[{ep}, {n_iter}] train: {train_loss:.4f}, valid: {valid_loss:.4f}, acc: {valid_acc:.4f}, top5: {valid_top5:.4f}" ) with open("exp_result.txt", "a+") as f: f.write( f"{args}, loss: {best_loss:.4f}, top1: {best_top1_acc*100:.1f}, top5: {best_top5_acc*100:.1f}\n" )