def train2(net, train_loader, test_loader):
    loss_fn = nn.CrossEntropyLoss()
    net2 = BYOL_Classification(net, 10)
    net2.eval()
    net2.cuda()
    # Freeze the pretrained backbone; only the classification head trains.
    for pq in net.parameters():
        pq.requires_grad = False
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, net2.parameters()), lr=1e-3)
    from warmup_scheduler import GradualWarmupScheduler
    scheduler = GradualWarmupScheduler(
        optimizer, multiplier=1, total_epoch=20,
        after_scheduler=optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=80))
    train_start = time.time()
    for epoch in range(1, 100 + 1):
        train_loss = 0
        net2.train()
        epoch_start = time.time()
        for idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            data = data.cuda()
            target = target.cuda()
            output = net2(data)[1]  # don't shadow `data` with the model output
            loss = loss_fn(output, target)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        train_loss /= (idx + 1)
        scheduler.step()
        epoch_time = time.time() - epoch_start
        if epoch % 10 == 0:
            net2.eval()  # was `net.eval()`: the model being evaluated is net2
            total = 0.0
            correct = 0.0
            with torch.no_grad():
                for images, labels in test_loader:
                    images = images.cuda()
                    labels = labels.cuda()
                    outputs = net2(images)[1]
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
            print("Epoch\t", epoch, "\tTest accuracy\t", correct / total * 100)
    elapsed_train_time = time.time() - train_start
    print('Finished training. Train time was:', elapsed_train_time)
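# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the warmup schedule shared by the
# training loops in this file, assuming ildoonet's `warmup_scheduler` package
# (the one imported above). With multiplier=1 the LR ramps linearly from ~0 up
# to the base LR over `total_epoch` epochs, then hands off to the cosine
# decay. The stand-in model and epoch counts are illustrative only.
import torch.nn as _nn
import torch.optim as _optim
from warmup_scheduler import GradualWarmupScheduler as _Warmup

_model = _nn.Linear(4, 2)  # stand-in model
_opt = _optim.Adam(_model.parameters(), lr=1e-3)
_cosine = _optim.lr_scheduler.CosineAnnealingLR(_opt, T_max=80)
_sched = _Warmup(_opt, multiplier=1, total_epoch=20, after_scheduler=_cosine)
for _epoch in range(100):
    _opt.step()    # scheduler.step() must come after optimizer.step()
    _sched.step()
    # print(_epoch, _opt.param_groups[0]['lr'])  # inspect the ramp + decay
# ---------------------------------------------------------------------------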
def train(args, config, loader, device):
    logging.info('Start training...')
    model = getattr(net, config.model.name)(**config.model.args, **config.embedder)
    model = model.to(device)
    criterion = getattr(nn, config.loss.name)(**config.loss.args).to(device)
    optimizer = getattr(torch.optim, config.optimizer.name)(model.parameters(),
                                                            **config.optimizer.args)
    if hasattr(config, 'lr_scheduler'):
        if hasattr(config.lr_scheduler, 'name'):
            scheduler = getattr(torch.optim.lr_scheduler, config.lr_scheduler.name)(
                optimizer, **config.lr_scheduler.args)
        else:
            scheduler = None
        if hasattr(config.lr_scheduler, 'warm_up'):
            scheduler_warm_up = GradualWarmupScheduler(
                optimizer,
                multiplier=config.lr_scheduler.warm_up.multiplier,
                total_epoch=config.lr_scheduler.warm_up.epoch,
                after_scheduler=scheduler)
    loss = Box({'train': 0.0, 'val': 0.0})
    metrics = Box({'train': [Accuracy()], 'val': [Accuracy()]})
    for epoch in range(config.train.n_epoch):
        if hasattr(config, 'lr_scheduler'):
            if hasattr(config.lr_scheduler, 'warm_up'):
                scheduler_warm_up.step()
            elif scheduler is not None:  # guard: scheduler is None when no `name` is configured
                scheduler.step()
        loss.train, metrics.train = run_epoch(
            model, optimizer, criterion, loader.train, train=True,
            metrics=metrics.train,
            max_norm=config.max_norm if hasattr(config, 'max_norm') else -1)
        loss.val, metrics.val = run_epoch(model, optimizer, criterion, loader.val,
                                          train=False, metrics=metrics.val)
        saved_path = os.path.join(args.model_folder, 'checkpoints', f'epoch_{epoch}.pt')
        save_model(saved_path, epoch, model, optimizer)
        log_metrics(epoch, args.model_folder, loss, metrics)
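# ---------------------------------------------------------------------------
# Sketch of the reflection-based construction used in train() above: optimizer
# and loss classes are looked up by name on torch.optim / torch.nn and
# instantiated from config args. Plain dicts stand in for the Box config; the
# names here are illustrative.
import torch as _torch
import torch.nn as _nn

_cfg = {'optimizer': {'name': 'Adam', 'args': {'lr': 1e-3}},
        'loss': {'name': 'CrossEntropyLoss', 'args': {}}}
_m = _nn.Linear(8, 2)
_optimizer = getattr(_torch.optim, _cfg['optimizer']['name'])(
    _m.parameters(), **_cfg['optimizer']['args'])
_criterion = getattr(_nn, _cfg['loss']['name'])(**_cfg['loss']['args'])
# ---------------------------------------------------------------------------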
def train(net, loader):
    optimizer = SGD_with_lars(net.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-6)
    from warmup_scheduler import GradualWarmupScheduler
    scheduler = GradualWarmupScheduler(
        optimizer, multiplier=1, total_epoch=20,
        after_scheduler=optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=180))
    train_start = time.time()
    for epoch in range(1, 100 + 1):
        train_loss = 0
        net.train()
        epoch_start = time.time()
        for idx, (data, target) in enumerate(loader):
            optimizer.zero_grad()
            # Each batch yields two augmented views of the same images.
            dat1 = data[0].cuda()
            dat2 = data[1].cuda()
            loss = net(dat1, dat2)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        train_loss /= (idx + 1)
        scheduler.step()
        epoch_time = time.time() - epoch_start
        print("Epoch\t", epoch, "\tLoss\t", train_loss, "\tTime\t", epoch_time)
    elapsed_train_time = time.time() - train_start
    print('Finished training. Train time was:', elapsed_train_time)
def train():
    cfg = opt.cfg
    data = opt.data
    img_size = opt.img_size
    epochs = 1 if opt.prebias else int(hyp['epochs'])  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = int(hyp['batch_size'])
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights  # initial training weights

    if 'pw' not in opt.arc:  # remove BCELoss positive weights
        hyp['cls_pw'] = 1.
        hyp['obj_pw'] = 1.

    # Initialize
    init_seeds()
    if opt.multi_scale:
        img_sz_min = round(img_size / 32 / 1.5) + 1
        img_sz_max = round(img_size / 32 * 1.3) - 1
        img_size = img_sz_max * 32  # initiate with maximum multi_scale size
        print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size))

    # Configure run
    data_dict = parse_data_cfg(data)
    train_path = data_dict['train']
    test_path = data_dict['valid']
    nc = int(data_dict['classes'])  # number of classes

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg, hyp, arc=opt.arc).to(device)

    # Optimizer
    pg0, pg1 = [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if 'Conv2d.weight' in k:
            pg1 += [v]  # parameter group 1 (apply weight_decay)
        else:
            pg0 += [v]  # parameter group 0
    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'])
        # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    del pg0, pg1

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_fitness = float('inf')
    attempt_download(weights)
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are 'last.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        chkpt = torch.load(weights, map_location=device)

        # load model
        # if opt.transfer:
        chkpt['model'] = {k: v for k, v in chkpt['model'].items()
                          if model.state_dict()[k].numel() == v.numel()}
        model.load_state_dict(chkpt['model'], strict=False)
        # else:
        #     model.load_state_dict(chkpt['model'])

        # load optimizer
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_fitness = chkpt['best_fitness']

        # load results
        if chkpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(chkpt['training_results'])  # write results.txt

        if opt.resume:
            start_epoch = chkpt['epoch'] + 1
        del chkpt

    # elif len(weights) > 0:  # darknet format
    #     # possible weights are 'yolov3.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc.
    #     cutoff = load_darknet_weights(model, weights)

    if opt.transfer or opt.prebias:  # transfer learning edge (yolo) layers
        nf = [int(model.module_defs[x - 1]['filters']) for x in model.yolo_layers]  # yolo layer size (i.e. 255)

        if opt.prebias:
            for p in optimizer.param_groups:
                # lower param count allows more aggressive training settings: i.e. SGD ~0.1 lr0, ~0.9 momentum
                p['lr'] = 0.1  # learning rate
                if p.get('momentum') is not None:  # for SGD but not Adam
                    p['momentum'] = 0.9

        for p in model.parameters():
            if opt.prebias and p.numel() == nf:  # train (yolo biases)
                p.requires_grad = True
            elif opt.transfer and p.shape[0] == nf:  # train (yolo biases+weights)
                p.requires_grad = True
            else:  # freeze layer
                p.requires_grad = False

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=range(59, 70, 1), gamma=0.8)  # gradual fall to 0.1*lr0
    scheduler = lr_scheduler.MultiStepLR(
        optimizer, milestones=[round(epochs * x) for x in [0.8, 0.9]], gamma=0.1)
    # cosine annealing with restarts
    # scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=0.1 * epochs, eta_min=0, last_epoch=-1)
    # cosine annealing
    # scheduler = lr_scheduler.CosineAnnealingLR(optimizer, epochs)
    # warmup wrapper, compatible with any of the schedulers above
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=hyp['multiplier'],
                                       total_epoch=hyp['warm_epoch'],
                                       after_scheduler=scheduler)
    scheduler.last_epoch = start_epoch - 1

    # # Plot lr schedule (do not leave this enabled: stepping here exhausts the schedule)
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, label='LR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    if mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # Initialize distributed training
    if torch.cuda.device_count() > 1:
        dist.init_process_group(backend='nccl',  # 'distributed backend'
                                init_method='tcp://127.0.0.1:9999',  # distributed training init method
                                world_size=1,  # number of nodes for distributed training
                                rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(model)
        model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # Dataset
    dataset = LoadImagesAndLabels(
        train_path,
        img_size,
        batch_size,
        augment=False,  # augment=True,
        hyp=hyp,  # augmentation hyperparameters
        rect=opt.rect,  # rectangular training
        image_weights=opt.img_weights,
        cache_labels=epochs > 10,
        cache_images=opt.cache_images and not opt.prebias,
    )

    # Dataloader
    batch_size = min(batch_size, len(dataset))
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=nw,
        shuffle=not opt.rect,  # Shuffle=True unless rectangular training is used
        pin_memory=True,
        collate_fn=dataset.collate_fn)

    # Test Dataloader
    if not opt.prebias:
        testloader = torch.utils.data.DataLoader(
            LoadImagesAndLabels(test_path,
                                opt.img_size,
                                batch_size * 2,
                                hyp=hyp,
                                rect=opt.rect,
                                cache_labels=True,
                                cache_images=opt.cache_images),
            batch_size=batch_size * 2,
            num_workers=nw,
            pin_memory=True,
            collate_fn=dataset.collate_fn)

    # Start training
    model.nc = nc  # attach number of classes to model
    model.arc = opt.arc  # attach yolo architecture
    model.hyp = hyp  # attach hyperparameters to model
    # model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model_info(model, report='summary')  # 'full' or 'summary'
    nb = len(dataloader)
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    t0 = time.time()
    print('Using %g dataloader workers' % nw)
    print('Starting %s for %g epochs...' % ('prebias' if opt.prebias else 'training', epochs))
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------
        model.train()
        model.epoch = epoch
        # print(('\n' + '%10s' * 9) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'reg', 'total', 'targets', 'img_size'))
        print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'obj', 'cls', 'reg', 'total', 'targets', 'img_size'))

        # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional)
        freeze_backbone = False
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
            image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
            dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4).to(device)  # mean losses
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        # NB: these targets have already been resized to 416 and augmented;
        # they cannot be mapped directly back onto the original image.
        for i, (imgs, targets, paths, _) in pbar:  # batch ----------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device)  # uint8 to float32, 0 - 255 to 0.0 - 1.0
            targets = targets.to(device)

            # Multi-Scale training
            if opt.multi_scale:
                if ni / accumulate % 10 == 0:  # adjust (67% - 150%) every 10 batches
                    img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32
                sf = img_size / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:]]  # new shape (stretched to 32-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Plot images with bounding boxes
            if ni == 0:
                fname = 'train_batch%g.jpg' % i
                plot_images(imgs=imgs, targets=targets, paths=paths, fname=fname)
                if tb_writer:
                    tb_writer.add_image(fname, cv2.imread(fname)[:, :, ::-1], dataformats='HWC')

            # Hyperparameter burn-in
            # n_burn = nb - 1  # min(nb // 5 + 1, 1000)  # number of burn-in batches
            # if ni <= n_burn:
            #     for m in model.named_modules():
            #         if m[0].endswith('BatchNorm2d'):
            #             m[1].momentum = 1 - i / n_burn * 0.99  # BatchNorm2d momentum falls from 1 - 0.01
            #     g = (i / n_burn) ** 4  # gain rises from 0 - 1
            #     for x in optimizer.param_groups:
            #         x['lr'] = hyp['lr0'] * g
            #         x['weight_decay'] = hyp['weight_decay'] * g

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model, hyp)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Scale loss by nominal batch_size of 64
            # loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Print batch results
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0  # (GB)
            # s = ('%10s' * 2 + '%10.3g' * 7) % (
            #     '%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, *mloss, len(targets), img_size)
            s = ('%10s' * 2 + '%10.3g' * 6) % (
                '%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, *mloss, len(targets), img_size)
            pbar.set_description(s)
            # end batch ----------------------------------------------------------

        # Update scheduler
        scheduler.step()

        # Process epoch results
        final_epoch = epoch + 1 == epochs
        if opt.prebias:
            print_model_biases(model)
        else:
            # Calculate mAP (always test final epoch, skip first 10 if opt.nosave)
            if not (opt.notest or (opt.nosave and epoch < 10)) or final_epoch:
                if not epoch < hyp['test_from']:  # early epochs yield too many proposals; skip evaluation
                    if epoch % hyp['test_interval'] == 0 and epoch != 0:
                        results, maps = test.test(cfg,
                                                  data,
                                                  batch_size=1,
                                                  img_size=opt.img_size,
                                                  model=model,
                                                  hyp=hyp,
                                                  conf_thres=0.001 if final_epoch else 0.1,  # 0.1 for speed
                                                  save_json=final_epoch and epoch > 0 and 'coco.data' in data,
                                                  dataloader=testloader)

        # Write epoch results
        with open(results_file, 'a') as f:
            f.write(s + '%10.3g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)

        # Write Tensorboard results
        if tb_writer:
            x = list(mloss) + list(results)
            titles = ['GIoU', 'Objectness', 'Classification', 'Train loss',
                      'Precision', 'Recall', 'mAP', 'F1',
                      'val GIoU', 'val Objectness', 'val Classification']
            for xi, title in zip(x, titles):
                tb_writer.add_scalar(title, xi, epoch)

        # Update best mAP
        fitness = sum(results[4:])  # total loss
        if fitness < best_fitness:
            best_fitness = fitness

        # Save training results
        save = (not opt.nosave) or (final_epoch and not opt.evolve) or opt.prebias
        if save:
            with open(results_file, 'r') as f:
                # Create checkpoint
                chkpt = {'epoch': epoch,
                         'best_fitness': best_fitness,
                         'training_results': f.read(),
                         'model': model.module.state_dict()
                         if type(model) is nn.parallel.DistributedDataParallel
                         else model.state_dict(),
                         'optimizer': None if final_epoch else optimizer.state_dict()}

            # Save last checkpoint
            torch.save(chkpt, last)

            # Save best checkpoint
            if best_fitness == fitness:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            if epoch > 0 and epoch % hyp['save_interval'] == 0:
                torch.save(chkpt, wdir + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt
        # end epoch ----------------------------------------------------------------

    # end training
    if len(opt.name):
        os.rename('results.txt', 'results_%s.txt' % opt.name)
        plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1,
                                                    (time.time() - t0) / 3600))
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
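# ---------------------------------------------------------------------------
# Minimal sketch of the gradient-accumulation pattern in the YOLO loop above:
# `loss.backward()` runs every batch, but optimizer.step()/zero_grad() only
# every `accumulate` batches, so the effective batch size is
# batch_size * accumulate. Toy data and model; not the YOLO code itself.
import torch as _torch
import torch.nn as _nn

_model = _nn.Linear(10, 2)
_opt = _torch.optim.SGD(_model.parameters(), lr=0.01)
_accumulate = 4
_batches = [(_torch.randn(16, 10), _torch.randint(0, 2, (16,))) for _ in range(8)]
for _i, (_x, _y) in enumerate(_batches):
    _loss = _nn.functional.cross_entropy(_model(_x), _y)
    _loss.backward()                 # gradients sum across batches
    if (_i + 1) % _accumulate == 0:  # one optimizer step per 4 batches
        _opt.step()
        _opt.zero_grad()
# ---------------------------------------------------------------------------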
model.eval()
# print(model)
model = model.to(device)
criterion = SmoothLabelCritierion(label_smoothing=0.1)
optimizer = optim.Adam(model.parameters(), lr=3e-4, weight_decay=3e-5)
if name == '3e-4 -> 1e-4':
    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs - 5, eta_min=1e-4)
elif name == '3e-4 -> 3e-5':
    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs - 5, eta_min=3e-5)
else:
    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs - 5, eta_min=0)
warmup_scheduler = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=5,
                                          after_scheduler=lr_scheduler)
# Dummy zero-gradient update: silences the "scheduler.step() before
# optimizer.step()" warning, since train_model() steps the scheduler first.
optimizer.zero_grad()
optimizer.step()
warmup_scheduler.step()
util.check_dir('../data/models/')
best_model, loss_dict, top1_acc_dict, top5_acc_dict = train_model(
    data_loaders, data_sizes, name, model, criterion, optimizer,
    warmup_scheduler, num_epochs=num_epochs, device=device)
# save the best model parameters
# util.save_model(best_model.cpu(), '../data/models/best_%s.pth' % name)
res_loss[name] = loss_dict
res_top1_acc[name] = top1_acc_dict
res_top5_acc[name] = top5_acc_dict
print('train %s done' % name)
print()
def training(train_data_list, val_data_list, test_files, fold):
    os.makedirs(os.path.join(config.weights, config.model_name) + os.sep + str(fold),
                exist_ok=True)
    os.makedirs(config.best_models, exist_ok=True)

    ### ---------- get model ------------------------------------------
    model = FF3DNet(drop=0.5)

    ### ---------- set lr, opt, loss ------------------------------------------
    img_params = list(map(id, model.img_encoder.parameters()))
    rest_params = filter(lambda p: id(p) not in img_params, model.parameters())
    params = [
        {'params': rest_params, 'lr': config.lr},
        {'params': model.img_encoder.parameters(), 'lr': config.lr * 3},
    ]
    optimizer = torch.optim.SGD(params, momentum=0.9, weight_decay=1e-4)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.epochs - 5,
                                               eta_min=config.lr / 100)
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=5,
                                              after_scheduler=scheduler)
    criterion = nn.CrossEntropyLoss().to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)

    best_results = [0, np.inf, 0]
    val_metrics = [0, np.inf, 0]

    ### ---------- load dataset ------------------------------------------
    train_gen = MultiModalDataset(train_data_list, config.train_data, config.train_vis,
                                  mode="train")
    train_loader = DataLoader(train_gen, batch_size=config.batch_size, shuffle=True,
                              pin_memory=True, num_workers=4)
    # val_data = getfiles("val")
    # val_data.sort()
    val_csv = "/root/userfolder/linan/C/preliminary/val.csv"
    val_data = pd.read_csv(val_csv)
    val_gen = MultiModalDataset(val_data, config.train_data, config.train_vis,
                                augument=False, mode="val")
    val_loader = DataLoader(val_gen, 512, shuffle=False, pin_memory=True, num_workers=4)
    test_gen = MultiModalDataset(test_files, config.test_data, config.test_vis,
                                 augument=False, mode="test")
    test_loader = DataLoader(test_gen, 512, shuffle=False, pin_memory=True, num_workers=4)

    # --- train, val, test -------------------------
    resume = False
    start = timer()
    print("multi fold val")
    # __________________________________________________________________________
    for index in [1, 2, 3]:
        print(index)
        checkpoint_loss = torch.load('checkpoints/best_models/0626_debug_fold_' +
                                     str(index) + '_model_best_loss.pth.tar')
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(val_loader, model, fold, checkpoint_loss, 'best_loss', False, index)
        checkpoint_acc = torch.load('checkpoints/best_models/0626_debug_fold_' +
                                    str(index) + '_model_best_acc.pth.tar')
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(val_loader, model, fold, checkpoint_acc, 'best_acc', False, index)
        # test_ensemble_loss_acc(test_loader, fold, [checkpoint_loss, checkpoint_acc], 'ensemble', True)
    0 / 0  # deliberate ZeroDivisionError: abort here after multi-fold validation
    # __________________________________________________________________________
    if resume:
        checkpoint_loss = torch.load(
            'checkpoints/best_models/0616_coslr_55_fold_0_model_best_loss.pth.tar')
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(test_loader, model, fold, checkpoint_loss, 'best_loss', False)
        checkpoint_acc = torch.load(
            'checkpoints/best_models/0616_coslr_55_fold_0_model_best_acc.pth.tar')
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(test_loader, model, fold, checkpoint_acc, 'best_acc', False)
        test_ensemble_loss_acc(test_loader, fold, [checkpoint_loss, checkpoint_acc],
                               'ensemble', True)
    else:
        ### ---------- train loop ----------------
        for epoch in range(4, config.epochs):
            scheduler_warmup.step(metrics=val_metrics[0])
            for param_group in optimizer.param_groups:
                log.write(str(param_group['lr']) + '\n')
            train_metrics = train(train_loader, model, criterion, optimizer, epoch,
                                  val_metrics, best_results, start)
            # val_metrics_tta = evaluate(val_loader_tta, model, criterion, epoch, train_metrics, best_results, start)
            val_metrics = evaluate(val_loader, model, criterion, epoch, train_metrics,
                                   best_results, start)
            is_best_acc = val_metrics[0] > best_results[0]
            best_results[0] = max(val_metrics[0], best_results[0])
            is_best_loss = val_metrics[1] < best_results[1]
            best_results[1] = min(val_metrics[1], best_results[1])
            is_best_f1 = val_metrics[2] > best_results[2]
            best_results[2] = max(val_metrics[2], best_results[2])
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "model_name": config.model_name,
                    "state_dict": model.state_dict(),
                    "best_acc": best_results[0],
                    "best_loss": best_results[1],
                    "optimizer": optimizer.state_dict(),
                    "fold": fold,
                    "best_f1": best_results[2],
                }, is_best_acc, is_best_loss, is_best_f1, fold)
            print('\r', end='', flush=True)
            print(val_metrics[0], val_metrics[1], val_metrics[2], "val")
            log.write(
                '%s %5.1f %6.1f | %0.3f %0.3f %0.3f | %0.3f %0.3f %0.3f | %s %s %s | %s'
                % ("best", epoch, epoch,
                   train_metrics[0], train_metrics[1], train_metrics[2],
                   val_metrics[0], val_metrics[1], val_metrics[2],
                   str(best_results[0])[:8], str(best_results[1])[:8],
                   str(best_results[2])[:8],
                   time_to_str((timer() - start), 'min')))
            log.write("\n")
            time.sleep(0.01)
            # log.write("\n----------------------------------------------- [START %s] %s\n\n" % (
            #     datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '-' * 51))
            # log.write(' |------------ Train -------|----------- Valid ---------|----------Best Results---|------------|\n')
            # log.write('mode iter epoch | acc loss f1_macro | acc loss f1_macro | acc loss f1_macro | time |\n')
            # log.write('-------------------------------------------------------------------------------------------------------------------------|\n')

        ### ---------- per fold: ensemble the best-loss and best-acc checkpoints
        checkpoint_loss = torch.load(
            'checkpoints/best_models/%s_fold_%s_model_best_loss.pth.tar'
            % (config.model_name, str(fold)))
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(test_loader, model, fold, checkpoint_loss, 'best_loss', False)
        checkpoint_acc = torch.load(
            'checkpoints/best_models/%s_fold_%s_model_best_acc.pth.tar'
            % (config.model_name, str(fold)))
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(test_loader, model, fold, checkpoint_acc, 'best_acc', False)
        test_ensemble_loss_acc(test_loader, fold, [checkpoint_loss, checkpoint_acc],
                               'ensemble', not config.k_fold)

        ### ---------- on the last k-fold, average all per-fold ensemble predictions
        if config.k_fold and fold == config.num_kf:
            mean_npy = np.zeros([10000, 9])
            for i in range(1, config.num_kf + 1):
                checkpoint = torch.load(
                    'checkpoints/best_models/%s_fold_%s_model_best_loss.pth.tar'
                    % (config.model_name, str(i)))
                loss_pred = np.load('preds_9/%s/%s_val_fold%s_%s.npy'
                                    % (checkpoint["model_name"],
                                       checkpoint["model_name"], str(i), 'ensemble'))
                mean_npy += loss_pred
            mean_npy = mean_npy / config.num_kf
            np.save('preds_9/%s/%s_val_fold%s_%s.npy'
                    % (checkpoint["model_name"], checkpoint["model_name"],
                       'cv', 'ensemble'),
                    mean_npy)
            gen_txt(mean_npy, checkpoint, 'cv', 'ensemble')
class Fitter:
    def __init__(self, model, device, config, folder):
        self.config = config
        self.epoch = 0

        # set the working directory
        self.base_dir = f'./model/seresnext_512/{folder}'
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)

        self.log_path = f'{self.base_dir}/log.txt'
        self.best_score = 0
        self.best_loss = 10 ** 5
        self.best_ap = 0

        self.model = model
        self.device = device
        self.best_true = np.array([])
        self.best_pred = np.array([])

        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.001},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        # NB: the grouped parameters above are built but never passed on;
        # AdamW is constructed over all model parameters instead.
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=config.lr)
        self.scheduler = config.SchedulerClass(self.optimizer, **config.scheduler_params)
        # self.scheduler.step
        # self.scheduler_warmup = GradualWarmupScheduler(self.optimizer, multiplier=1,
        #                                                total_epoch=5, after_scheduler=self.scheduler)
        self.scheduler_warmup = GradualWarmupScheduler(self.optimizer, multiplier=1,
                                                       total_epoch=6)
        # self.criterion = FocalLoss(logits=True).to(self.device)
        self.criterion = LabelSmoothing().to(self.device)
        self.log(f'Fitter prepared. Device is {self.device}')

    def fit(self, train_loader, validation_loader):
        for e in range(self.config.n_epochs):
            if self.config.verbose:
                lr = self.optimizer.param_groups[0]['lr']
                timestamp = datetime.utcnow().isoformat()
                self.log(f'\n{timestamp}\nLR: {lr}')
            if self.epoch <= 6:
                self.scheduler_warmup.step(self.epoch)
                print(self.epoch, self.optimizer.param_groups[0]['lr'])

            t = time.time()
            summary_loss, roc_auc_scores, ap_scores, f1_scores, acc_scores = \
                self.train_one_epoch(train_loader)
            self.log(f'[RESULT]: Train. Epoch: {self.epoch}, '
                     f'summary_loss: {summary_loss.avg:.5f}, '
                     f'roc_auc: {roc_auc_scores.avg:.5f}, '
                     f'acc: {acc_scores.avg:.5f}, ap: {ap_scores.avg:.5f}, '
                     f'f1_scores: {f1_scores.avg:.5f}, time: {(time.time() - t):.5f}')

            t = time.time()
            f_true, f_pred, summary_loss, roc_auc_scores, ap_scores, f1_scores, acc_scores = \
                self.validation(validation_loader)
            self.log(f'[RESULT]: Val. Epoch: {self.epoch}, '
                     f'summary_loss: {summary_loss.avg:.5f}, '
                     f'roc_auc: {roc_auc_scores.avg:.5f}, '
                     f'acc: {acc_scores.avg:.5f}, ap: {ap_scores.avg:.5f}, '
                     f'f1_scores: {f1_scores.avg:.5f}, time: {(time.time() - t):.5f}')

            if summary_loss.avg < self.best_loss:
                self.best_loss = summary_loss.avg
                self.save_model(f'{self.base_dir}/best-loss-checkpoint-{str(self.epoch).zfill(3)}epoch.bin')
                for path in sorted(glob(f'{self.base_dir}/best-loss-checkpoint-*epoch.bin'))[:-2]:
                    os.remove(path)

            if roc_auc_scores.avg > self.best_score:
                self.best_score = roc_auc_scores.avg
                self.save_model(f'{self.base_dir}/best-score-checkpoint-{str(self.epoch).zfill(3)}epoch.bin')
                for path in sorted(glob(f'{self.base_dir}/best-score-checkpoint-*epoch.bin'))[:-2]:
                    os.remove(path)
                self.best_true = f_true
                self.best_pred = f_pred

            if ap_scores.avg > self.best_ap:
                self.best_ap = ap_scores.avg
                self.save_model(f'{self.base_dir}/best-ap-checkpoint-{str(self.epoch).zfill(3)}epoch.bin')
                for path in sorted(glob(f'{self.base_dir}/best-ap-checkpoint-*epoch.bin'))[:-2]:
                    os.remove(path)

            if self.config.validation_scheduler:
                if self.epoch > 6:
                    self.scheduler.step(metrics=summary_loss.avg)

            self.epoch += 1
            # if self.epoch == self.config.n_epochs:
        return self.best_true, self.best_pred

    def validation(self, val_loader):
        self.model.eval()
        summary_loss = AverageMeter()
        roc_auc_scores = RocAucMeter()
        ap_scores = APScoreMeter()
        f1_scores = F1Score()
        acc_scores = AccSocre()
        t = time.time()
        for step, (images, targets) in enumerate(val_loader):
            if self.config.verbose:
                if step % self.config.verbose_step == 0:
                    print(f'Val Step {step}/{len(val_loader)}, ' +
                          f'summary_loss: {summary_loss.avg:.5f}, roc_auc: {roc_auc_scores.avg:.5f}, ap: {ap_scores.avg:.5f} ' +
                          f'f1_scores: {f1_scores.avg:.5f} ' +
                          f'acc_scores: {acc_scores.avg:.5f} ' +
                          f'time: {(time.time() - t):.5f}', end='\r')
            with torch.no_grad():
                targets = targets.to(self.device).float()
                batch_size = images.shape[0]
                images = images.to(self.device).float()
                outputs = self.model(images)
                loss = self.criterion(outputs, targets)
                roc_auc_scores.update(targets, outputs)
                ap_scores.update(targets, outputs)
                f1_scores.update(targets, outputs)
                acc_scores.update(targets, outputs)
                summary_loss.update(loss.detach().item(), batch_size)
        f_true = roc_auc_scores.get_true()
        f_pred = roc_auc_scores.get_pred()
        return f_true, f_pred, summary_loss, roc_auc_scores, ap_scores, f1_scores, acc_scores

    def train_one_epoch(self, train_loader):
        self.model.train()
        summary_loss = AverageMeter()
        roc_auc_scores = RocAucMeter()
        ap_scores = APScoreMeter()
        f1_scores = F1Score()
        acc_scores = AccSocre()
        t = time.time()
        for step, (images, targets) in enumerate(train_loader):
            if self.config.verbose:
                if step % self.config.verbose_step == 0:
                    print(f'Train Step {step}/{len(train_loader)}, ' +
                          f'summary_loss: {summary_loss.avg:.5f}, roc_auc: {roc_auc_scores.avg:.5f}, ap: {ap_scores.avg:.5f} ' +
                          f'f1_scores: {f1_scores.avg:.5f} ' +
                          f'acc_scores: {acc_scores.avg:.5f} ' +
                          f'time: {(time.time() - t):.5f}', end='\r')
            targets = targets.to(self.device).float()
            images = images.to(self.device).float()
            batch_size = images.shape[0]
            self.optimizer.zero_grad()
            outputs = self.model(images)
            loss = self.criterion(outputs, targets)
            loss.backward()
            roc_auc_scores.update(targets, outputs)
            ap_scores.update(targets, outputs)
            summary_loss.update(loss.detach().item(), batch_size)
            f1_scores.update(targets, outputs)
            acc_scores.update(targets, outputs)
            self.optimizer.step()
            if self.config.step_scheduler:
                self.scheduler.step()
        return summary_loss, roc_auc_scores, ap_scores, f1_scores, acc_scores

    def save_model(self, path):
        self.model.eval()
        torch.save(self.model.state_dict(), path)

    def save(self, path):
        self.model.eval()
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'best_score': self.best_score,
            'best_ap': self.best_ap,
            'best_loss': self.best_loss,
            'epoch': self.epoch,
        }, path)

    def load(self, path):
        checkpoint = torch.load(path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.best_score = checkpoint['best_score']
        self.best_ap = checkpoint['best_ap']
        self.best_loss = checkpoint['best_loss']
        self.epoch = checkpoint['epoch']

    def log(self, message):
        if self.config.verbose:
            print(message)
        with open(self.log_path, 'a+') as logger:
            logger.write(f'{message}\n')
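# ---------------------------------------------------------------------------
# Sketch of the no-weight-decay grouping Fitter builds above (the transformer
# trainer below uses the same idea): parameters whose names contain any
# `no_decay` substring get weight_decay=0. The substrings must match the
# model's actual parameter names; 'norm' is used here because
# nn.TransformerEncoderLayer names its LayerNorms norm1/norm2, whereas the
# Fitter's 'LayerNorm.*' strings follow HuggingFace-style naming.
import torch as _torch
import torch.nn as _nn

_model = _nn.TransformerEncoderLayer(d_model=32, nhead=4)
_no_decay = ['bias', 'norm']
_grouped = [
    {'params': [p for n, p in _model.named_parameters()
                if not any(nd in n for nd in _no_decay)], 'weight_decay': 0.001},
    {'params': [p for n, p in _model.named_parameters()
                if any(nd in n for nd in _no_decay)], 'weight_decay': 0.0},
]
_optimizer = _torch.optim.AdamW(_grouped, lr=3e-4)
# ---------------------------------------------------------------------------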
def train(args, train_dataset, model):
    tb_writer = SummaryWriter(args.tb_writer_dir)
    result_writer = ResultWriter(args.eval_results_dir)

    if args.weighted_sampling == 1:
        # The three pitch types are unevenly distributed, so sample them at equal rates.
        # In the end this did not help, so weighted sampling was not used.
        ball_type, counts = np.unique(train_dataset.pitch, return_counts=True)
        count_dict = dict(zip(ball_type, counts))
        weights = [1.0 / count_dict[p] for p in train_dataset.pitch]
        sampler = WeightedRandomSampler(weights, len(train_dataset), replacement=True)
        logger.info("Do Weighted Sampling")
    else:
        sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size,
                                  sampler=sampler)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // len(train_dataloader) + 1
    else:
        t_total = len(train_dataloader) * args.num_train_epochs
    args.warmup_step = int(args.warmup_percent * t_total)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "layernorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = optim.Adam(optimizer_grouped_parameters, lr=args.learning_rate,
                           eps=args.adam_epsilon)
    if args.warmup_step != 0:
        scheduler_cosine = CosineAnnealingLR(optimizer, t_total)
        scheduler = GradualWarmupScheduler(optimizer, 1, args.warmup_step,
                                           after_scheduler=scheduler_cosine)
    else:
        scheduler = CosineAnnealingLR(optimizer, t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    loss_fct = torch.nn.NLLLoss()

    # Train!
    logger.info("***** Running Baseball Transformer *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Warmup Steps = %d", args.warmup_step)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info("  Total train batch size = %d", args.train_batch_size)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    best_step = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss, logging_val_loss = 0.0, 0.0, 0.0
    best_pitch_micro_f1, best_pitch_macro_f1 = 0, 0
    best_loss = 1e10

    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            (pitcher, batter, state, pitch, label,
             pitch_memory, label_memory, memory_mask) = list(
                map(lambda x: x.to(args.device), batch))
            model.train()
            pitching_score, memories = model(pitcher, batter, state, pitch_memory,
                                             label_memory, memory_mask)
            pitching_score = pitching_score.log_softmax(dim=-1)
            loss = loss_fct(pitching_score, pitch)

            if args.n_gpu > 1:
                loss = loss.mean()
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                if args.evaluate_during_training:
                    results, f1_results, f1_log, cm = evaluate(args, args.eval_data_file, model)
                    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
                    print_result(output_eval_file, results, f1_log, cm)
                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    logging_val_loss = results["loss"]
                tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps,
                                     global_step)
                logging_loss = tr_loss

                # Select the best model by macro-F1 rather than loss (there is a trade-off).
                # if best_loss > results["loss"]:
                if best_pitch_macro_f1 < results["pitch_macro_f1"]:
                    best_pitch_micro_f1 = results["pitch_micro_f1"]
                    best_pitch_macro_f1 = results["pitch_macro_f1"]
                    best_loss = results["loss"]
                    results["best_step"] = best_step = global_step
                    output_dir = os.path.join(args.output_dir, "best_model/")
                    os.makedirs(output_dir, exist_ok=True)
                    torch.save(model.state_dict(),
                               os.path.join(output_dir, "pytorch_model.bin"))
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving best model to %s", output_dir)
                    result_path = os.path.join(output_dir, "best_results.txt")
                    print_result(result_path, results, f1_log, cm, off_logger=True)
                    results.update(dict(f1_results))
                    result_writer.update(args, **results)
                logger.info(" best pitch micro f1 : %s", best_pitch_micro_f1)
                logger.info(" best pitch macro f1 : %s", best_pitch_macro_f1)
                logger.info(" best loss : %s", best_loss)
                logger.info(" best step : %s", best_step)

            if args.save_steps > 0 and global_step % args.save_steps == 0:
                checkpoint_prefix = "checkpoint"
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir,
                                          "{}-{}".format(checkpoint_prefix, global_step))
                os.makedirs(output_dir, exist_ok=True)
                torch.save(model.state_dict(),
                           os.path.join(output_dir, "pytorch_model.bin"))
                torch.save(args, os.path.join(output_dir, "training_args.bin"))
                logger.info("Saving model checkpoint to %s", output_dir)
                rotate_checkpoints(args, checkpoint_prefix)
                torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                logger.info("Saving optimizer and scheduler states to %s", output_dir)

    tb_writer.close()
    return global_step, tr_loss / global_step
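# ---------------------------------------------------------------------------
# The loss pairing used in the trainer above (NLLLoss on log-softmax outputs)
# is equivalent to CrossEntropyLoss on raw logits; quick numerical check:
import torch as _torch
import torch.nn as _nn

_logits = _torch.randn(5, 3)
_targets = _torch.tensor([0, 2, 1, 1, 0])
_nll = _nn.NLLLoss()(_logits.log_softmax(dim=-1), _targets)
_ce = _nn.CrossEntropyLoss()(_logits, _targets)
assert _torch.allclose(_nll, _ce)
# ---------------------------------------------------------------------------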
def train(name, df, VAL_FOLD=0, resume=False):
    dt_string = datetime.now().strftime("%d|%m_%H|%M|%S")
    print("Starting -->", dt_string)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs('checkpoint', exist_ok=True)
    run = f"{name}_[{dt_string}]"

    wandb.init(project="imanip", config=config_defaults, name=run)
    config = wandb.config

    # model = SRM_Classifer(num_classes=1, encoder_checkpoint='weights/pretrain_[31|03_12|16|32].h5')
    model = SMP_SRM_UPP(classifier_only=True)
    # for name_, param in model.named_parameters():
    #     if 'classifier' in name_:
    #         continue
    #     else:
    #         param.requires_grad = False
    print("Parameters : ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    wandb.save('segmentation/smp_srm.py')
    wandb.save('dataset.py')

    train_imgaug, train_geo_aug = get_train_transforms()
    transforms_normalize = get_transforms_normalize()

    #region ####################-- CREATE DATASET and DATALOADER --####################
    train_dataset = DATASET(dataframe=df,
                            mode="train",
                            val_fold=VAL_FOLD,
                            test_fold=TEST_FOLD,
                            transforms_normalize=transforms_normalize,
                            imgaug_augment=train_imgaug,
                            geo_augment=train_geo_aug)
    train_loader = DataLoader(train_dataset, batch_size=config.train_batch_size,
                              shuffle=True, num_workers=4, pin_memory=True,
                              drop_last=False)

    valid_dataset = DATASET(dataframe=df,
                            mode="val",
                            val_fold=VAL_FOLD,
                            test_fold=TEST_FOLD,
                            transforms_normalize=transforms_normalize)
    valid_loader = DataLoader(valid_dataset, batch_size=config.valid_batch_size,
                              shuffle=True, num_workers=4, pin_memory=True,
                              drop_last=False)

    test_dataset = DATASET(dataframe=df,
                           mode="test",
                           val_fold=VAL_FOLD,
                           test_fold=TEST_FOLD,
                           transforms_normalize=transforms_normalize)
    test_loader = DataLoader(test_dataset, batch_size=config.valid_batch_size,
                             shuffle=True, num_workers=4, pin_memory=True,
                             drop_last=False)
    #endregion ########################################################################

    optimizer = get_optimizer(model, config.optimizer, config.learning_rate,
                              config.weight_decay)
    # after_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    #     optimizer,
    #     patience=config.schedule_patience,
    #     mode="min",
    #     factor=config.schedule_factor,
    # )
    # T_0/T_mult are warm-restart arguments; plain CosineAnnealingLR does not
    # accept them, so this must be CosineAnnealingWarmRestarts.
    after_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=35, T_mult=2)
    scheduler = GradualWarmupScheduler(optimizer=optimizer,
                                       multiplier=1,
                                       total_epoch=config.warmup + 1,
                                       after_scheduler=after_scheduler)
    # this zero gradient update is needed to avoid a warning message, issue #8.
    # optimizer.zero_grad()
    # optimizer.step()
    criterion = nn.BCEWithLogitsLoss()
    es = EarlyStopping(patience=200, mode="min")

    model = nn.DataParallel(model).to(device)
    # wandb.watch(model, log_freq=50, log='all')

    start_epoch = 0
    if resume:
        checkpoint = torch.load(
            'checkpoint/(using pretrain)COMBO_ALL_FULL_[09|04_12|46|35].pt')
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print("-----------> Resuming <------------")

    for epoch in range(start_epoch, config.epochs):
        print(f"Epoch = {epoch}/{config.epochs-1}")
        print("------------------")

        train_metrics = train_epoch(model, train_loader, optimizer, scheduler,
                                    criterion, epoch)
        valid_metrics = valid_epoch(model, valid_loader, criterion, epoch)
        # pass the loss via the `metrics` kwarg: it only matters when the
        # after_scheduler is ReduceLROnPlateau, and passing it positionally
        # would be read as `epoch`
        scheduler.step(metrics=valid_metrics['valid_loss'])

        print(f"TRAIN_ACC = {train_metrics['train_acc_05']}, TRAIN_LOSS = {train_metrics['train_loss']}")
        print(f"VALID_ACC = {valid_metrics['valid_acc_05']}, VALID_LOSS = {valid_metrics['valid_loss']}")
        print("Optimizer LR", optimizer.param_groups[0]['lr'])
        print("Scheduler LR", scheduler.get_lr()[0])
        wandb.log({
            'optim_lr': optimizer.param_groups[0]['lr'],
            'schedule_lr': scheduler.get_lr()[0]
        })

        es(valid_metrics["valid_loss"],
           model,
           model_path=os.path.join(OUTPUT_DIR, f"{run}.h5"))
        if es.early_stop:
            print("Early stopping")
            break

        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        }
        torch.save(checkpoint, os.path.join('checkpoint', f"{run}.pt"))

    if os.path.exists(os.path.join(OUTPUT_DIR, f"{run}.h5")):
        print(model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR, f"{run}.h5"))))
        print("LOADED FOR TEST")

    test_metrics = test(model, test_loader, criterion)
    wandb.save(os.path.join(OUTPUT_DIR, f"{run}.h5"))
    return test_metrics
# print(model)
model = model.to(device)

criterion = SmoothLabelCritierion(label_smoothing=0.1)
# criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs - 5, eta_min=1e-4)
lr_scheduler = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=5,
                                      after_scheduler=scheduler)
# Dummy zero-gradient update: silences the "scheduler.step() before
# optimizer.step()" warning, since train_model() steps the scheduler first.
optimizer.zero_grad()
optimizer.step()
lr_scheduler.step()

util.check_dir('../data/models/')
best_model, loss_dict, top1_acc_dict, top5_acc_dict = train_model(
    data_loaders, data_sizes, name, model, criterion, optimizer, lr_scheduler,
    num_epochs=num_epochs, device=device)
# save the best model parameters
# util.save_model(best_model.cpu(), '../data/models/best_%s.pth' % name)
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Top 1-err {top1.val:.4f} ({top1.avg:.4f})'.format(
                  epoch, epochs, i, len(val_loader), batch_time=batch_time,
                  loss=losses, top1=top1))

    print('* Epoch: [{0}/{1}]\t Top 1-err {top1.avg:.3f}\t Test Loss {loss.avg:.3f}'.format(
        epoch, epochs, top1=top1, loss=losses))
    return top1.avg, losses.avg


# _, _, val_loss = validate(valid_loader, model, criterion)
val_loss = 5
for epoch in range(0, epochs):
    scheduler_warmup.step(epoch, val_loss)

    # train for one epoch
    train_loss = train(train_loader, model, criterion, optimizer, epoch)

    # evaluate on validation set
    err1, val_loss = validate(valid_loader, model, criterion, epoch)

    # remember best prec@1 and save checkpoint
    is_best = err1 <= best_err1
    best_err1 = min(err1, best_err1)
    print('Current best accuracy (top-1):', best_err1)
    save_checkpoint({
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'best_err1': best_err1,
class scene_transformer(LightningModule):
    def __init__(self, cfg):
        super(scene_transformer, self).__init__()
        self.hparams = cfg
        self.emb_dim = cfg["model"]["emb_dim"]
        self.save_hyperparameters(cfg)
        self.cfg = cfg
        self.cat_emb = nn.Embedding(
            cfg["model"]["cat"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["cat"]["pad_token"],
        )
        self.pos_emb = nn.Embedding(cfg["model"]["max_seq_len"], cfg["model"]["emb_dim"])
        self.coor_type_emb = nn.Embedding(3, cfg["model"]["emb_dim"])
        self.x_coor_emb = nn.Embedding(
            cfg["model"]["coor"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["coor"]["pad_token"],
        )
        self.y_coor_emb = nn.Embedding(
            cfg["model"]["coor"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["coor"]["pad_token"],
        )
        self.z_coor_emb = nn.Embedding(
            cfg["model"]["coor"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["coor"]["pad_token"],
        )
        self.orient_emb = nn.Embedding(
            cfg["model"]["orient"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["orient"]["pad_token"],
        )
        self.dim_emb = nn.Embedding(
            cfg["model"]["dim"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["dim"]["pad_token"],
        )
        self.shape_cond = cfg["model"]["dim"]["shape_cond"]
        if self.shape_cond:
            print("Using shape cond model")
            self.x_emb = nn.Embedding(16, self.emb_dim)
            self.y_emb = nn.Embedding(16, self.emb_dim)
            self.img_encoder = resnet_small(layers=[1, 2, 2],
                                            num_input_channels=1,
                                            dim=self.emb_dim)
            layer = nn.TransformerDecoderLayer
            gen_model = nn.TransformerDecoder
        else:
            layer = nn.TransformerEncoderLayer
            gen_model = nn.TransformerEncoder
        d_layer = layer(
            d_model=self.emb_dim,
            nhead=cfg["model"]["num_heads"],
            dim_feedforward=cfg["model"]["dim_fwd"],
            dropout=cfg["model"]["dropout"],
        )
        self.generator = gen_model(d_layer, cfg["model"]["num_blocks"])
        self.output_dim = nn.Linear(cfg["model"]["emb_dim"],
                                    cfg["model"]["dim"]["start_token"])
        self.decoder_seq_len = cfg["model"]["max_seq_len"]

    def get_shape_memory(self, room_shape):
        """
        Get the transformer encoder memory for the room_shape condition images
        (similar to the PolyGen image-conditional model).
        room_shape: (bsize, input_channel, 512, 512)
        return: (16*16, bsize, embdim)
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        features = self.img_encoder(room_shape.to(device))
        # dimension of condition image
        img_dim = features.shape[-1]
        # 0, 1, 2 .. img_dim
        ndx = torch.LongTensor(range(img_dim)).unsqueeze(0).to(device)
        # positional embedding in X and Y axes
        x_emb, y_emb = (
            self.x_emb(ndx).transpose(1, 2).unsqueeze(3),
            self.y_emb(ndx).transpose(1, 2).unsqueeze(2),
        )
        # add positional embedding
        tmp = features + x_emb + y_emb
        features_flat = tmp.reshape(tmp.shape[0], tmp.shape[1], -1)
        memory = features_flat.permute(2, 0, 1)
        return memory

    def forward(self, cat_seq, x_loc_seq, y_loc_seq, z_loc_seq, orient_seq,
                dim_seq, room_shape=None):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        (cat_emb, pos_emb, x_emb, y_emb, z_emb, ori_emb, dim_emb,
         coor_type_emb) = self.get_embedding(cat_seq, x_loc_seq, y_loc_seq,
                                             z_loc_seq, orient_seq, dim_seq)  # ,obj_emb
        joint_emb = (cat_emb + pos_emb + x_emb + y_emb + z_emb + ori_emb +
                     dim_emb + coor_type_emb)
        tgt_padding_mask = self.get_padding_mask(dim_seq)[:, :-1].to(device)
        tgt_mask = self.generate_square_subsequent_mask(dim_seq.shape[1] - 1).to(device)
        tgt = joint_emb.transpose(1, 0)[:-1, :, :]
        if self.shape_cond:
            memory = self.get_shape_memory(room_shape) if self.shape_cond else None
            out_embs = self.generator(tgt, memory, tgt_mask=tgt_mask,
                                      tgt_key_padding_mask=tgt_padding_mask)
        else:
            out_embs = self.generator(tgt, tgt_mask, tgt_padding_mask)
        out_embs = out_embs.transpose(1, 0)
        out_dim = self.output_dim(out_embs)
        logprobs_dim = F.log_softmax(out_dim, dim=-1)
        return logprobs_dim

    def get_embedding(self, cat_seq, x_loc_seq, y_loc_seq, z_loc_seq,
                      orient_seq, dim_seq):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        cat_emb = self.cat_emb(cat_seq)
        batch_size, seq_len = cat_seq.shape
        x_emb = self.x_coor_emb(x_loc_seq)
        y_emb = self.y_coor_emb(y_loc_seq)
        z_emb = self.z_coor_emb(z_loc_seq)
        ori_emb = self.orient_emb(orient_seq)
        dim_emb = self.dim_emb(dim_seq)
        pos_seq = torch.arange(0, seq_len).to(device)
        pos_emb = self.pos_emb(pos_seq)
        ndx = np.arange(seq_len).reshape((1, -1))
        ndx_ref = np.arange(seq_len).reshape((1, -1))
        ndx[ndx_ref % 3 == 1] = 0
        ndx[ndx_ref % 3 == 2] = 1
        ndx[ndx_ref % 3 == 0] = 2
        ndx = torch.LongTensor(ndx).to(device)
        coor_type_emb = self.coor_type_emb(ndx).repeat(batch_size, 1, 1)
        return cat_emb, pos_emb, x_emb, y_emb, z_emb, ori_emb, dim_emb, coor_type_emb

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = (mask.float().masked_fill(mask == 0, float("-inf"))
                .masked_fill(mask == 1, float(0.0)))
        return mask

    def get_padding_mask(self, seq):
        mask = torch.ByteTensor(np.zeros(seq.shape, dtype=np.uint8))
        mask[seq == self.cfg["model"]["dim"]["pad_token"]] = 1
        return mask.bool()

    def configure_optimizers(self):
        self.optim = Adam(self.parameters(),
                          lr=self.cfg["train"]["lr"],
                          weight_decay=self.cfg["train"]["l2"])
        self.sched = CosineAnnealingLR(self.optim, T_max=self.cfg["train"]["lr_restart"])
        self.warmup = GradualWarmupScheduler(
            self.optim,
            multiplier=1,
            total_epoch=self.cfg["train"]["warmup"],
            after_scheduler=self.sched,
        )
        return [self.optim], [self.sched]

    def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i,
                       second_order_closure=None):
        optimizer.step()
        self.warmup.step()
        optimizer.zero_grad()

    def general_step(self, batch):
        loss = 0
        cat_seq, x_loc_seq, y_loc_seq, z_loc_seq, orient_seq, dim_seq = (
            batch["cat_seq"],
            batch["x_loc_seq"],
            batch["y_loc_seq"],
            batch["z_loc_seq"],
            batch["orient_seq"],
            batch["dim_seq"],
        )
        room_shape = batch["floor"] if self.shape_cond else None
        logprobs_ori = self.forward(cat_seq, x_loc_seq, y_loc_seq, z_loc_seq,
                                    orient_seq, dim_seq, room_shape=room_shape)
        loss_ori = F.nll_loss(
            logprobs_ori.transpose(1, 2),
            batch["dim_seq"][:, 1:],
            ignore_index=self.cfg["model"]["dim"]["pad_token"],
        )
        loss = loss_ori
        return loss

    def training_step(self, batch, batch_idx):
        loss = self.general_step(batch)
        lr = get_lr(self.optim)
        log = {"loss": {"train_loss": loss}, "lr": lr}
        return {"loss": loss, "log": log}

    def validation_step(self, batch, batch_idx):
        loss = self.general_step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        log = {"loss": {"val": avg_loss}}
        return {"val_loss": avg_loss, "log": log}

    def decode_multi_model(self, out_ndx, cat_gen_seq, x_gen_seq, y_gen_seq,
                           z_gen_seq, ori_gen_seq, dim_gen_seq,
                           probabilistic=False, nucleus=False, room_shape=None):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # pad each partial sequence out to the decoder length
        curr_cat_seq = cat_gen_seq + (self.decoder_seq_len - len(cat_gen_seq)) * [0]
        curr_cat_seq = torch.LongTensor(curr_cat_seq).view(1, -1).to(device)
        curr_x_seq = x_gen_seq + (self.decoder_seq_len - len(x_gen_seq)) * [0]
        curr_x_seq = torch.LongTensor(curr_x_seq).view(1, -1).to(device)
        curr_y_seq = y_gen_seq + (self.decoder_seq_len - len(y_gen_seq)) * [0]
        curr_y_seq = torch.LongTensor(curr_y_seq).view(1, -1).to(device)
        curr_z_seq = z_gen_seq + (self.decoder_seq_len - len(z_gen_seq)) * [0]
        curr_z_seq = torch.LongTensor(curr_z_seq).view(1, -1).to(device)
        curr_orient_seq = ori_gen_seq + (self.decoder_seq_len - len(ori_gen_seq)) * [0]
        curr_orient_seq = torch.LongTensor(curr_orient_seq).view(1, -1).to(device)
        curr_dim_seq = dim_gen_seq + (self.decoder_seq_len - len(dim_gen_seq)) * [0]
        curr_dim_seq = torch.LongTensor(curr_dim_seq).view(1, -1).to(device)
        (cat_emb, pos_emb, x_emb, y_emb, z_emb, ori_emb, dim_emb,
         coor_type_emb) = self.get_embedding(curr_cat_seq, curr_x_seq, curr_y_seq,
                                             curr_z_seq, curr_orient_seq, curr_dim_seq)
        joint_emb = (cat_emb + pos_emb + x_emb + y_emb + z_emb + ori_emb +
                     dim_emb + coor_type_emb)
        tgt = joint_emb.transpose(1, 0)
        tgt_mask = self.generate_square_subsequent_mask(tgt.shape[0]).to(device)
        tgt_padding_mask = self.get_padding_mask(curr_cat_seq).to(device)
        if self.shape_cond:
            room_shape = room_shape.unsqueeze(0)
            memory = self.get_shape_memory(room_shape) if self.shape_cond else None
            out_embs = self.generator(tgt, memory, tgt_mask=tgt_mask,
                                      tgt_key_padding_mask=tgt_padding_mask)
        else:
            out_embs = self.generator(tgt, tgt_mask, tgt_padding_mask)
        logits_dim = self.output_dim(out_embs)[out_ndx][0]
        if probabilistic and nucleus:
            logits_dim = sample_top_p(logits_dim)
        probs_dim = F.softmax(logits_dim, dim=-1)
        if probabilistic:
            dim_next_token = Categorical(probs=probs_dim).sample()
        else:
            _, dim_next_token = torch.max(probs_dim, dim=0)
        if dim_next_token == self.cfg["model"]["dim"]["stop_token"]:
            dim_next_token = 999
        return dim_next_token
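# ---------------------------------------------------------------------------
# Quick check of the causal mask built by generate_square_subsequent_mask
# above: position i may attend to positions <= i; future positions are -inf,
# which the transformer adds to the attention scores before the softmax.
import torch as _torch

_sz = 4
_mask = (_torch.triu(_torch.ones(_sz, _sz)) == 1).transpose(0, 1)
_mask = _mask.float().masked_fill(_mask == 0, float('-inf')).masked_fill(_mask == 1, 0.0)
# _mask:
# tensor([[0., -inf, -inf, -inf],
#         [0.,   0., -inf, -inf],
#         [0.,   0.,   0., -inf],
#         [0.,   0.,   0.,   0.]])
# ---------------------------------------------------------------------------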
class TrainingLoop(): def __init__(self, model_kwargs, train_positive_paths, train_negative_paths, train_unlabeled_paths, val_positive_paths, val_negative_paths, val_unlabeled_paths, data_cache_dir: str, notify_callback: Callable[[Dict[str, Any]], None] = lambda x: None): '''The training loop for background splitting models.''' self.data_cache_dir = data_cache_dir self.notify_callback = notify_callback self._setup_model_kwargs(model_kwargs) # Setup dataset self._setup_dataset(train_positive_paths, train_negative_paths, train_unlabeled_paths, val_positive_paths, val_negative_paths, val_unlabeled_paths) # Setup model self._setup_model() # Setup optimizer # Resume if requested resume_from = model_kwargs.get('resume_from', None) if resume_from: resume_training = model_kwargs.get('resume_training', False) self.load_checkpoint(resume_from, resume_training=resume_training) self.writer = SummaryWriter(log_dir=model_kwargs['log_dir']) # Variables for estimating run-time self.train_batch_time = EMA(0) self.val_batch_time = EMA(0) self.train_batches_per_epoch = (len(self.train_dataloader.dataset) / self.train_dataloader.batch_size) self.val_batches_per_epoch = (len(self.val_dataloader.dataset) / self.val_dataloader.batch_size) self.train_batch_idx = 0 self.val_batch_idx = 0 self.train_epoch_loss = 0 self.train_epoch_main_loss = 0 self.train_epoch_aux_loss = 0 def _setup_model_kwargs(self, model_kwargs): self.model_kwargs = copy.deepcopy(model_kwargs) self.num_workers = NUM_WORKERS self.val_frequency = model_kwargs.get('val_frequency', 1) self.checkpoint_frequency = model_kwargs.get('checkpoint_frequency', 1) self.use_cuda = bool(model_kwargs.get('use_cuda', True)) assert 'model_dir' in model_kwargs self.model_dir = model_kwargs['model_dir'] assert 'aux_labels' in model_kwargs self.aux_weight = float(model_kwargs.get('aux_weight', 0.1)) assert 'log_dir' in model_kwargs def _setup_dataset(self, train_positive_paths, train_negative_paths, train_unlabeled_paths, val_positive_paths, val_negative_paths, val_unlabeled_paths): assert self.model_kwargs aux_labels = self.model_kwargs['aux_labels'] image_input_size = self.model_kwargs.get('input_size', 224) batch_size = int(self.model_kwargs.get('batch_size', 64)) num_workers = self.num_workers restrict_aux_labels = bool( self.model_kwargs.get('restrict_aux_labels', True)) cache_images_on_disk = self.model_kwargs.get('cache_images_on_disk', False) train_transform = transforms.Compose([ transforms.RandomResizedCrop(image_input_size), transforms.RandomHorizontalFlip(), transforms.ConvertImageDtype(torch.float32), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]) resize_size = int(image_input_size * 1.15) resize_size += int(resize_size % 2) val_transform = transforms.Compose([ transforms.Resize(resize_size), transforms.CenterCrop(image_input_size), transforms.ConvertImageDtype(torch.float32), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]) self.train_dataloader = DataLoader(AuxiliaryDataset( positive_paths=train_positive_paths, negative_paths=train_negative_paths, unlabeled_paths=train_unlabeled_paths, auxiliary_labels=aux_labels, restrict_aux_labels=restrict_aux_labels, cache_images_on_disk=cache_images_on_disk, data_cache_dir=self.data_cache_dir, transform=train_transform), batch_size=batch_size, shuffle=True, num_workers=num_workers) self.val_dataloader = DataLoader(AuxiliaryDataset( positive_paths=val_positive_paths, negative_paths=val_negative_paths, unlabeled_paths=val_unlabeled_paths, auxiliary_labels=aux_labels, 
restrict_aux_labels=restrict_aux_labels, cache_images_on_disk=cache_images_on_disk, data_cache_dir=self.data_cache_dir, transform=val_transform), batch_size=batch_size, shuffle=False, num_workers=num_workers) def _setup_model(self): num_classes = 2 num_aux_classes = self.train_dataloader.dataset.num_auxiliary_classes freeze_backbone = self.model_kwargs.get('freeze_backbone', False) self.model_kwargs['num_aux_classes'] = num_aux_classes self.model = Model(num_main_classes=num_classes, num_aux_classes=num_aux_classes, freeze_backbone=freeze_backbone) if self.model_kwargs.get('aux_labels_type', None) == "imagenet": # Initialize auxiliary head to imagenet fc self.model.auxiliary_head.weight = self.model.backbone.fc.weight self.model.auxiliary_head.bias = self.model.backbone.fc.bias if self.use_cuda: self.model = self.model.cuda() self.model = nn.DataParallel(self.model) self.main_loss = nn.CrossEntropyLoss() self.auxiliary_loss = nn.CrossEntropyLoss() self.start_epoch = 0 self.end_epoch = self.model_kwargs.get('epochs_to_run', 1) self.current_epoch = 0 self.global_train_batch_idx = 0 self.global_val_batch_idx = 0 lr = float(self.model_kwargs.get('initial_lr', 0.01)) endlr = float(self.model_kwargs.get('endlr', 0.0)) optim_params = dict( lr=lr, momentum=float(self.model_kwargs.get('momentum', 0.9)), weight_decay=float(self.model_kwargs.get('weight_decay', 0.0001)), ) self.optimizer = optim.SGD(self.model.parameters(), **optim_params) max_epochs = int(self.model_kwargs.get('max_epochs', 90)) warmup_epochs = int(self.model_kwargs.get('warmup_epochs', 0)) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, max_epochs - warmup_epochs, eta_min=endlr) self.optimizer_scheduler = GradualWarmupScheduler( optimizer=self.optimizer, multiplier=1.0, warmup_epochs=warmup_epochs, after_scheduler=scheduler) def _notify(self): epochs_left = self.end_epoch - self.current_epoch - 1 num_train_batches_left = ( epochs_left * self.train_batches_per_epoch + max(0, self.train_batches_per_epoch - self.train_batch_idx - 1)) num_val_batches_left = ( (1 + round(epochs_left / self.val_frequency)) * self.val_batches_per_epoch + max(0, self.val_batches_per_epoch - self.val_batch_idx - 1)) time_left = (num_train_batches_left * self.train_batch_time.value + num_val_batches_left * self.val_batch_time.value) self.notify_callback(**{"training_time_left": time_left}) def setup_resume(self, train_positive_paths, train_negative_paths, train_unlabeled_paths, val_positive_paths, val_negative_paths, val_unlabeled_paths): self._setup_dataset(train_positive_paths, train_negative_paths, train_unlabeled_paths, val_positive_paths, val_negative_paths, val_unlabeled_paths) self.start_epoch = self.end_epoch self.current_epoch = self.start_epoch self.end_epoch = self.start_epoch + self.model_kwargs.get( 'epochs_to_run', 1) def load_checkpoint(self, path: str, resume_training: bool = False): checkpoint_state = torch.load(path) self.model.load_state_dict(checkpoint_state['state_dict']) if resume_training: self.global_train_batch_idx = checkpoint_state[ 'global_train_batch_idx'] self.global_val_batch_idx = checkpoint_state[ 'global_val_batch_idx'] self.start_epoch = checkpoint_state['epoch'] + 1 self.current_epoch = self.start_epoch self.end_epoch = (self.start_epoch + self.model_kwargs.get('epochs_to_run', 1)) self.optimizer.load_state_dict(checkpoint_state['optimizer']) self.optimizer_scheduler.load_state_dict( checkpoint_state['optimizer_scheduler']) # Copy tensorboard state prev_log_dir = 
checkpoint_state['model_kwargs']['log_dir']
            curr_log_dir = self.model_kwargs['log_dir']
            shutil.copytree(prev_log_dir, curr_log_dir)

    def save_checkpoint(self, epoch, checkpoint_path: str):
        kwargs = dict(self.model_kwargs)
        del kwargs['aux_labels']
        state = dict(
            global_train_batch_idx=self.global_train_batch_idx,
            global_val_batch_idx=self.global_val_batch_idx,
            model_kwargs=kwargs,
            epoch=epoch,
            state_dict=self.model.state_dict(),
            optimizer=self.optimizer.state_dict(),
            optimizer_scheduler=self.optimizer_scheduler.state_dict(),
        )
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        torch.save(state, checkpoint_path)

    def _validate(self, dataloader):
        self.model.eval()
        loss_value = 0
        main_gts = []
        aux_gts = []
        main_preds = []
        aux_preds = []
        for batch_idx, (images, main_labels,
                        aux_labels) in enumerate(dataloader):
            batch_start = time.perf_counter()
            self.val_batch_idx = batch_idx
            if self.use_cuda:
                images = images.cuda()
                main_labels = main_labels.cuda()
                aux_labels = aux_labels.cuda()
            main_logits, aux_logits = self.model(images)
            valid_main_labels = main_labels != -1
            valid_aux_labels = aux_labels != -1
            main_loss_value = self.main_loss(main_logits[valid_main_labels],
                                             main_labels[valid_main_labels])
            aux_loss_value = self.aux_weight * self.auxiliary_loss(
                aux_logits[valid_aux_labels], aux_labels[valid_aux_labels])
            # Accumulate the loss across batches; previously loss_value was
            # overwritten each batch, so only the last batch was averaged below.
            batch_loss = torch.zeros_like(main_loss_value)
            if valid_main_labels.sum() > 0:
                batch_loss += main_loss_value
            if valid_aux_labels.sum() > 0:
                batch_loss += aux_loss_value
            loss_value += batch_loss.item()
            if valid_main_labels.sum() > 0:
                # main_logits is already masked here, so no second masking of
                # the argmax output is needed.
                main_pred = F.softmax(main_logits[valid_main_labels], dim=1)
                main_preds += list(main_pred.argmax(dim=1).cpu().numpy())
                main_gts += list(main_labels[valid_main_labels].cpu().numpy())
            if valid_aux_labels.sum() > 0:
                # Use the auxiliary-head logits; the original mistakenly reused
                # the main-head logits for the auxiliary predictions.
                aux_pred = F.softmax(aux_logits[valid_aux_labels], dim=1)
                aux_preds += list(aux_pred.argmax(dim=1).cpu().numpy())
                aux_gts += list(aux_labels[valid_aux_labels].cpu().numpy())
            batch_end = time.perf_counter()
            self.val_batch_time += (batch_end - batch_start)
            self.global_val_batch_idx += 1
        # Compute F1 score
        if len(dataloader) > 0:
            loss_value /= (len(dataloader) + 1e-10)
            main_prec, main_recall, main_f1, _ = \
                sklearn.metrics.precision_recall_fscore_support(
                    main_gts, main_preds, average='binary')
            aux_prec, aux_recall, aux_f1, _ = \
                sklearn.metrics.precision_recall_fscore_support(
                    aux_gts, aux_preds, average='micro')
        else:
            loss_value = 0
            main_prec = -1
            main_recall = -1
            main_f1 = -1
            aux_prec = -1
            aux_recall = -1
            aux_f1 = -1
        summary_data = [
            ('loss', loss_value),
            ('f1/main_head', main_f1),
            ('prec/main_head', main_prec),
            ('recall/main_head', main_recall),
            ('f1/aux_head', aux_f1),
            ('prec/aux_head', aux_prec),
            ('recall/aux_head', aux_recall),
        ]
        for k, v in [('val/epoch/' + tag, v) for tag, v in summary_data]:
            self.writer.add_scalar(k, v, self.current_epoch)

    def validate(self):
        self._validate(self.val_dataloader)

    def train(self):
        self.model.train()
        logger.info('Starting train epoch')
        load_start = time.perf_counter()
        self.train_epoch_loss = 0
        self.train_epoch_main_loss = 0
        self.train_epoch_aux_loss = 0
        main_gts = []
        aux_gts = []
        main_logits_all = []
        main_preds = []
        aux_preds = []
        for batch_idx, (images, main_labels,
                        aux_labels) in enumerate(self.train_dataloader):
            load_end = time.perf_counter()
            batch_start = time.perf_counter()
            self.train_batch_idx = batch_idx
            logger.debug('Train batch')
            if self.use_cuda:
                images = images.cuda()
                main_labels = main_labels.cuda()
                aux_labels = aux_labels.cuda()
            main_logits, aux_logits = self.model(images)
            # Compute loss
            valid_main_labels = main_labels != -1
            valid_aux_labels = aux_labels != -1
            main_loss_value = self.main_loss(main_logits[valid_main_labels],
                                             main_labels[valid_main_labels])
            aux_loss_value = self.aux_weight * self.auxiliary_loss(
                aux_logits[valid_aux_labels], aux_labels[valid_aux_labels])
            loss_value = torch.zeros_like(main_loss_value)
            if valid_main_labels.sum() > 0:
                loss_value += main_loss_value
            if valid_aux_labels.sum() > 0:
                loss_value += aux_loss_value
            self.train_epoch_loss += loss_value.item()
            if torch.sum(valid_main_labels) > 0:
                self.train_epoch_main_loss += main_loss_value.item()
            if torch.sum(valid_aux_labels) > 0:
                self.train_epoch_aux_loss += aux_loss_value.item()
            # Update gradients
            self.optimizer.zero_grad()
            loss_value.backward()
            self.optimizer.step()
            if valid_main_labels.sum() > 0:
                main_pred = F.softmax(main_logits[valid_main_labels], dim=1)
                main_logits_all += list(
                    main_logits[valid_main_labels].detach().cpu().numpy())
                # main_pred is already masked; masking it a second time with the
                # full-batch mask was a bug.
                main_preds += list(main_pred.argmax(dim=1).cpu().numpy())
                main_gts += list(main_labels[valid_main_labels].cpu().numpy())
            if valid_aux_labels.sum() > 0:
                aux_pred = F.softmax(aux_logits[valid_aux_labels], dim=1)
                aux_preds += list(aux_pred.argmax(dim=1).cpu().numpy())
                aux_gts += list(aux_labels[valid_aux_labels].cpu().numpy())
            batch_end = time.perf_counter()
            total_batch_time = (batch_end - batch_start)
            total_load_time = (load_end - load_start)
            self.train_batch_time += total_batch_time + total_load_time
            logger.debug(f'Train batch time: {self.train_batch_time.value}, '
                         f'this batch time: {total_batch_time}, '
                         f'this load time: {total_load_time}, '
                         f'batch epoch loss: {loss_value.item()}, '
                         f'main loss: {main_loss_value.item()}, '
                         f'aux loss: {aux_loss_value.item()}')
            summary_data = [
                ('loss', loss_value.item()),
                ('loss/main_head', main_loss_value.item()),
                ('loss/aux_head', aux_loss_value.item()),
            ]
            for k, v in [('train/batch/' + tag, v)
                         for tag, v in summary_data]:
                self.writer.add_scalar(k, v, self.global_train_batch_idx)
            self._notify()
            self.global_train_batch_idx += 1
            load_start = time.perf_counter()
        model_lr = self.optimizer.param_groups[-1]['lr']
        self.optimizer_scheduler.step()
        logger.debug(f'Train epoch loss: {self.train_epoch_loss}, '
                     f'main loss: {self.train_epoch_main_loss}, '
                     f'aux loss: {self.train_epoch_aux_loss}')
        main_prec, main_recall, main_f1, _ = \
            sklearn.metrics.precision_recall_fscore_support(
                main_gts, main_preds, average='binary')
        aux_prec, aux_recall, aux_f1, _ = \
            sklearn.metrics.precision_recall_fscore_support(
                aux_gts, aux_preds, average='micro')
        logger.debug(
            f'Train epoch main: {main_prec}, {main_recall}, {main_f1}, '
            f'aux: {aux_prec}, {aux_recall}, {aux_f1}, '
            f'main loss: {self.train_epoch_main_loss}, '
            f'aux loss: {self.train_epoch_aux_loss}')
        summary_data = [('lr', model_lr),
                        ('loss', self.train_epoch_loss),
                        ('loss/main_head', self.train_epoch_main_loss),
                        ('loss/aux_head', self.train_epoch_aux_loss),
                        ('f1/main_head', main_f1),
                        ('prec/main_head', main_prec),
                        ('recall/main_head', main_recall),
                        ('f1/aux_head', aux_f1),
                        ('prec/aux_head', aux_prec),
                        ('recall/aux_head', aux_recall)]
        for k, v in [('train/epoch/' + tag, v) for tag, v in summary_data]:
            self.writer.add_scalar(k, v, self.current_epoch)
        if len(main_logits_all):
            self.writer.add_histogram(
                'train/epoch/softmax/main_head',
                scipy.special.softmax(main_logits_all, axis=1)[:, 1])

    def run(self):
        self.last_checkpoint_path = None
        for i in range(self.start_epoch, self.end_epoch):
            logger.info(f'Train: Epoch {i}')
            self.current_epoch = i
            self.train()
            if i % self.val_frequency == 0 or i == self.end_epoch - 1:
                logger.info(f'Validate: Epoch {i}')
                self.validate()
            if i % self.checkpoint_frequency == 0 or i == self.end_epoch - 1:
                logger.info(f'Checkpoint: Epoch {i}')
                self.last_checkpoint_path = os.path.join(
                    self.model_dir, f'checkpoint_{i:03}.pth')
                self.save_checkpoint(i, self.last_checkpoint_path)
        return self.last_checkpoint_path
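# The EMA helper used above for the running batch-time estimates is not defined
# in this file. A minimal sketch consistent with how TrainingLoop uses it
# (constructed from an initial value, updated via `+=`, read via `.value`);
# the 0.9 smoothing factor is an assumption, not taken from the original code.
class EMA:
    def __init__(self, value, alpha=0.9):
        self.value = value
        self.alpha = alpha

    def __iadd__(self, observation):
        # exponentially weighted blend toward the newest observation
        self.value = self.alpha * self.value + (1 - self.alpha) * observation
        return self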
def main_worker(gpu, ngpus_per_node, args): global best_acc1 args.gpu = gpu if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) if args.distributed: if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if args.multiprocessing_distributed: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) num_classes = 1000 # create model print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch](stem_type=args.stem_type, num_classes=num_classes, block_type=models.PreBasicBlock, activation=nn.PReLU) bchef = BinaryChef('recepies/imagenet-baseline.yaml') model = bchef.run_step(model, args.step) print(model) print('Num paramters: {}'.format(count_parameters(model))) if args.distributed: # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. if args.gpu is not None: torch.cuda.set_device(args.gpu) model.cuda(args.gpu) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have args.batch_size = int(args.batch_size / ngpus_per_node) args.workers = int( (args.workers + ngpus_per_node - 1) / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) elif args.gpu is not None: torch.cuda.set_device(args.gpu) model = model.cuda(args.gpu) else: # DataParallel will divide and allocate batch_size to all available GPUs if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda(args.gpu) parameters = model.parameters() if args.optimizer == 'adamw': wd = args.weight_decay if args.step == 0 else 0 optimizer = torch.optim.AdamW(parameters, args.lr, weight_decay=wd) elif args.optimizer == 'adam': optimizer = torch.optim.Adam(parameters, args.lr) elif args.optimizer == 'sgd': wd = 0 if args.step > 0 else args.weight_decay optimizer = torch.optim.SGD(parameters, args.lr, momentum=args.momentum, weight_decay=wd) else: raise ValueError('Unknown optimizer selected: {}'.format( args.optimizer)) if args.scheduler == 'multistep': milestone = [40, 70, 80, 100, 110] lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=[x - args.warmup for x in milestone], gamma=0.1) # elif args.scheduler == 'cosine': lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, float(args.epochs - args.warmup), eta_min=0) else: raise ValueError('Unknown schduler selected: {}'.format( args.scheduler)) if args.warmup > 0: print('=> Applying warmup ({} epochs)'.format(args.warmup)) lr_scheduler = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=args.warmup, after_scheduler=lr_scheduler) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) if args.gpu is None: checkpoint = 
torch.load(args.resume) else: # Map model to be loaded to specified single gpu. loc = 'cuda:{}'.format(args.gpu) checkpoint = torch.load(args.resume, map_location=loc) if args.resume_epoch: args.start_epoch = checkpoint['epoch'] best_acc1 = checkpoint['best_acc1'] if args.gpu is not None: pass # best_acc1 may be from a checkpoint from a different GPU #best_acc1 = best_acc1.to(args.gpu) try: model.load_state_dict(checkpoint['state_dict']) if not ('adam' in args.optimizer and 'sgd' in args.resume): print('=> Loading optimizer...') #optimizer.load_state_dict(checkpoint['optimizer']) except: print( '=> Warning: dict model mismatch, loading with strict = False' ) model.load_state_dict(checkpoint['state_dict'], strict=False) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # Reset learning rate for g in optimizer.param_groups: g['lr'] = args.lr if args.start_epoch > 0: print('Advancing the scheduler to epoch {}'.format(args.start_epoch)) for i in range(args.start_epoch): lr_scheduler.step() cudnn.benchmark = True # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'valid') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) transforms_train = transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ]) transforms_val = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ]) train_dataset = datasets.ImageFolder(traindir, transforms_train) val_dataset = datasets.ImageFolder(valdir, transforms_val) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) else: train_sampler = None train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) if args.evaluate: validate(val_loader, model, criterion, args) return show_logs = (not args.multiprocessing_distributed) or ( args.multiprocessing_distributed and args.rank % ngpus_per_node == 0) for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) if args.scheduler == 'cosine': lr_scheduler.step(epoch) else: lr_scheduler.step() if show_logs: print('New lr: {}'.format(lr_scheduler.get_last_lr())) # train for one epoch train(train_loader, model, criterion, optimizer, epoch, args, show_logs) # evaluate on validation set acc1 = validate(val_loader, model, criterion, args, show_logs) # remember best acc@1 and save checkpoint is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) print('Current best: {}'.format(best_acc1)) if show_logs: save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_acc1': best_acc1, 'optimizer': optimizer.state_dict(), }, is_best, args.output_dir)
def main():
    fold = 0
    epoch = 3
    mode = 1
    batch = 2
    num_workers = 1
    SEED = 13
    init_lr = 3e-4
    warmup_factor = 10  # how long
    warmup_epo = 1
    log = True
    seed_everything(SEED)
    model = HUB_MODELS['efficientnet-b0']('efficientnet-b0')
    model.to(DEVICE)
    df = pd.read_csv(os.path.join(path_data, 'train_folds.csv'))
    kernel = type(model).__name__
    tr_idx = np.where(df.fold != fold)[0]
    vl_idx = np.where(df.fold == fold)[0]
    transforms_train = A.Compose([
        # A.OneOf([
        #     A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15),
        #     A.OpticalDistortion(distort_limit=0.11, shift_limit=0.15),
        #     A.NoOp()
        # ]),
        # A.OneOf([
        #     A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2),
        #     A.RandomGamma(gamma_limit=(50, 150)),
        #     A.NoOp()
        # ]),
        # A.OneOf([
        #     A.RGBShift(r_shift_limit=20, b_shift_limit=15, g_shift_limit=15),
        #     A.FancyPCA(3),
        #     A.HueSaturationValue(hue_shift_limit=5, sat_shift_limit=5),
        #     A.NoOp()
        # ]),
        # A.OneOf([
        #     A.CLAHE(),
        #     A.NoOp()
        # ]),
        A.Transpose(p=0.5),
        A.VerticalFlip(p=0.5),
        A.HorizontalFlip(p=0.5),
    ])
    # transforms_val = albumentations.Compose([])
    dataset = {
        'npy': [trainDataset_npy, 16],
        'pkl': [trainDataset_pkl, 25],
        'insta': [trainDataset_insta, None]
    }
    trainDataset, num = dataset['pkl']
    td = trainDataset(df.iloc[tr_idx],
                      df.iloc[tr_idx].isup_grade,
                      num,
                      rand=True,
                      transform=transforms_train)
    vd = trainDataset(df.iloc[vl_idx],
                      df.iloc[vl_idx].isup_grade,
                      num,
                      rand=False,
                      transform=transforms_train)
    train_dl = DataLoader(td,
                          batch_size=batch,
                          sampler=RandomSampler(td),
                          num_workers=num_workers)
    val_dl = DataLoader(vd,
                        batch_size=batch,
                        sampler=SequentialSampler(vd),
                        num_workers=num_workers)
    optimizer = Adam(model.parameters(), lr=init_lr / warmup_factor)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, epoch - warmup_epo)
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=warmup_factor,
                                       total_epoch=warmup_epo,
                                       after_scheduler=scheduler_cosine)
    criterion = nn.BCEWithLogitsLoss()
    scaler = amp.GradScaler()
    qwk_max = 0
    for i in range(1, epoch + 1):
        print(f'Epoch: {i}')
        scheduler.step(i - 1)
        model.train()
        loss = train_epoch(model, train_dl, criterion, scaler, optimizer)
        model.eval()
        with torch.no_grad():
            val_loss, pred, val_lab = train_epoch(model, val_dl, criterion,
                                                  None, None)
        p = torch.cat(pred).cpu().numpy()
        t = torch.cat(val_lab).cpu().numpy()
        acc = (p == t).mean() * 100.
        qwk = cohen_kappa_score(p, t, weights='quadratic')
        # sch.step(val_loss)  # Plateau
        if log:
            print('Log.....')
            lg = time.ctime() + ' ' + (
                f'Epoch {i}, lr: {optimizer.param_groups[0]["lr"]:.7f}, '
                f'train loss: {np.mean(loss):.5f}, '
                f'val loss: {np.mean(val_loss):.5f}, '
                f'acc: {(acc):.5f}, qwk: {(qwk):.5f}, fold: {fold+1}')
            print(lg)
            with open(os.path.join(path_log, f'log_{kernel}_kaggle.txt'),
                      'a') as appender:
                appender.write(lg + '\n')
        if qwk > qwk_max:
            print('Best ({:.6f} --> {:.6f}). Saving model ...'.format(
                qwk_max, qwk))
            torch.save(
                model.state_dict(),
                os.path.join(
                    path_model,
                    f'{kernel}_kaggle_best_fold{fold+1}_epoch_{i}.pth'))
            qwk_max = qwk
        # make checkpoint
        # problem in win
        # name_check = '_'.join(time.ctime().split(':')) + '_model.pt'
        # torch.save({
        #     'epoch': i,
        #     'model_state_dict': model.state_dict(),
        #     'optimizer_state_dict': optimizer.state_dict()
        # }, os.path.join(path_checkpoint, name_check))
    # note the f-string prefix: without it, kernel and fold were never
    # interpolated into the final file name
    torch.save(
        model.state_dict(),
        os.path.join(path_model, f'{kernel}_kaggle_final_fold{fold+1}.pth'))
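# The pattern above (optimizer built with lr=init_lr / warmup_factor, then
# GradualWarmupScheduler with multiplier=warmup_factor) ramps the LR up to
# init_lr over warmup_epo epochs before handing off to cosine annealing.
# A throwaway sanity check of that schedule, assuming ildoonet's
# warmup_scheduler package and mirroring the constants in main():
import torch
from warmup_scheduler import GradualWarmupScheduler

probe_opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=3e-4 / 10)
probe_cos = torch.optim.lr_scheduler.CosineAnnealingLR(probe_opt, 3 - 1)
probe_sched = GradualWarmupScheduler(probe_opt, multiplier=10, total_epoch=1,
                                     after_scheduler=probe_cos)
for i in range(1, 3 + 1):
    probe_sched.step(i - 1)  # same epoch-indexed stepping as the loop above
    print(i, probe_opt.param_groups[0]['lr'])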
def main():
    best_test_loss = np.inf
    model = Yolov1_vgg16bn(pretrained=True)
    print('pre-trained vgg16 model has loaded!')
    previous_model_path = model_name
    exists = os.path.isfile(previous_model_path)
    if exists:
        print("Starting from previous result...")
        model.load_state_dict(torch.load(previous_model_path))
    else:
        print("Starting with new train")
    # print(model)
    print('')
    if use_gpu:
        model.cuda()
    # Data
    print('==> Preparing data..')
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])  # transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5))
    train_dataset = DataGenerator(parent_dir=img_folder,
                                  img_size=img_size,
                                  S=S,
                                  B=B,
                                  C=C,
                                  transform=transform,
                                  num=train_num,
                                  train=True)
    train_loader = DataLoader(train_dataset,
                              batch_size=n_batch,
                              shuffle=True,
                              num_workers=8)
    test_dataset = DataGenerator(parent_dir=validate_folder,
                                 img_size=img_size,
                                 S=S,
                                 B=B,
                                 C=C,
                                 transform=transform,
                                 num=test_num,
                                 train=False)
    test_loader = DataLoader(test_dataset,
                             batch_size=n_batch,
                             shuffle=False,
                             num_workers=8)
    model.train()
    train_val_loss_log = open(
        os.path.join(results_folder, 'train_val_loss_log'), 'w+')
    # loss_fn = YoloLoss(B, S, lambda_coord, lambda_noobj)
    loss_fn = YoloLossNew(B, S, C, lambda_coord, lambda_noobj)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.0001,
                                momentum=0.9,
                                weight_decay=0.0005)
    # optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=8, total_epoch=30)
    for epoch in range(num_epochs):
        scheduler.step(epoch)
        print(epoch, optimizer.param_groups[0]['lr'])
        for i, (img_name, images, target) in enumerate(train_loader):
            # images = images.float()
            # target = target.float()
            images = Variable(images)
            target = Variable(target)
            if use_gpu:
                images, target = images.cuda(), target.cuda()
            optimizer.zero_grad()
            pred = model(images)
            loss = loss_fn(pred, target)
            current_loss = loss.item()
            loss.backward()
            optimizer.step()
            if i % 20 == 0:
                print(
                    "\r%d/%d batches in %d/%d iteration, current error is %f" %
                    (i, len(train_loader), epoch + 1, num_epochs,
                     current_loss))
        save_model_by_epoch(epoch, model)
        # validate on validation set
        validation_loss = 0.0
        model.eval()
        with torch.no_grad():
            for i, (img_name, images, target) in enumerate(test_loader):
                # image = images.float()
                # target = target.float()
                images = Variable(images)
                target = Variable(target)
                if use_gpu:
                    images, target = images.cuda(), target.cuda()
                pred = model(images)
                loss = loss_fn(pred, target)
                validation_loss += loss.item()
        validation_loss /= len(test_loader)
        # log the training loss and validation loss every epoch
        log_str = 'epoch: {}, train_loss: {}, val_loss: {} \n'.format(
            epoch + 1, current_loss, validation_loss)
        print(log_str)
        train_val_loss_log.writelines(log_str)
        train_val_loss_log.flush()
        if best_test_loss > validation_loss:
            best_test_loss = validation_loss
            save_torch_model(model, 'best.pth', epoch)
    train_val_loss_log.close()
def main(args): torch.backends.cudnn.benchmark = True seed_all(args.seed) num_classes = 1 d = Dataset(train_set_size=args.train_set_sz, num_cls=num_classes) train = d.train_set valid = d.test_set net = UNet(in_dim=1, out_dim=4).cuda() snake_approx_net = UNet(in_dim=1, out_dim=1, wf=3, padding=True, first_layer_pad=None, depth=4, last_layer_resize=True).cuda() best_val_dice = -np.inf optimizer = torch.optim.Adam(params=net.parameters(), lr=args.lr, weight_decay=args.weight_decay) snake_approx_optimizer = torch.optim.Adam( params=snake_approx_net.parameters(), lr=args.lr, weight_decay=args.weight_decay) scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=50, after_scheduler=None) # load model if args.ckpt: loaded = _pickle.load(open(args.ckpt, 'rb')) net.load_state_dict(loaded[0]) optimizer.load_state_dict(loaded[1]) snake_approx_net.load_state_dict(loaded[2]) snake_approx_optimizer.load_state_dict(loaded[3]) if not os.path.exists(args.log_dir): os.makedirs(args.log_dir, exist_ok=True) writer = tensorboardX.SummaryWriter(log_dir=args.log_dir) snake = SnakePytorch(args.delta, args.batch_sz * args.num_samples, args.num_lines, args.radius) snake_eval = SnakePytorch(args.delta, args.batch_sz, args.num_lines, args.radius) noises = torch.zeros( (args.batch_sz, args.num_samples, args.num_lines, args.radius)).cuda() step = 1 start = timeit.default_timer() for epoch in range(1, args.n_epochs + 1): for iteration in range( 1, int(np.ceil(train.dataset_sz() / args.batch_sz)) + 1): scheduler_warmup.step() imgs, masks, onehot_masks, centers, dts_modified, dts_original, jitter_radius, bboxes = \ train.next_batch(args.batch_sz) xs = make_batch_input(imgs) xs = torch.cuda.FloatTensor(xs) net.train() unet_logits = net(xs) center_jitters, angle_jitters = [], [] for img, mask, center in zip(imgs, masks, centers): c_j, a_j = get_random_jitter_by_mask(mask, center, [1], args.theta_jitter) if not args.use_center_jitter: c_j = np.zeros_like(c_j) center_jitters.append(c_j) angle_jitters.append(a_j) center_jitters = np.asarray(center_jitters) angle_jitters = np.asarray(angle_jitters) # args.radius + 1 because we need additional outermost points for the gradient gs_logits_whole_img = unet_logits[:, 3, ...] 
gs_logits, coords_r, coords_c = get_star_pattern_values( gs_logits_whole_img, None, centers, args.num_lines, args.radius + 1, center_jitters=center_jitters, angle_jitters=angle_jitters) # currently only class 1 is foreground # if there's multiple foreground classes use a for loop gs = gs_logits[:, :, 1:] - gs_logits[:, :, :-1] # compute the gradient noises.normal_( 0, 1 ) # noises here is only used for random exploration so no need mirrored sampling gs_noisy = torch.unsqueeze(gs, 1) + noises def batch_eval_snake(snake, inputs, batch_sz): n_inputs = len(inputs) assert n_inputs % batch_sz == 0 n_batches = int(np.ceil(n_inputs / batch_sz)) ind_sets = [] for j in range(n_batches): inps = inputs[j * batch_sz:(j + 1) * batch_sz] batch_ind_sets = snake(inps).data.cpu().numpy() ind_sets.append(batch_ind_sets) ind_sets = np.concatenate(ind_sets, 0) return ind_sets gs_noisy = gs_noisy.reshape((args.batch_sz * args.num_samples, args.num_lines, args.radius)) ind_sets = batch_eval_snake(snake, gs_noisy, args.batch_sz * args.num_samples) ind_sets = ind_sets.reshape( (args.batch_sz * args.num_samples, args.num_lines)) ind_sets = np.expand_dims( smooth_ind(ind_sets, args.smoothing_window), -1) # loss layers m = torch.nn.LogSoftmax(dim=1) loss = torch.nn.NLLLoss() # =========================================================================== # Inner loop: Train dice loss prediction network snake_approx_net.train() for _ in range(args.dice_approx_train_steps): snake_approx_logits = snake_approx_net( gs_noisy.reshape(args.batch_sz * args.num_samples, 1, args.num_lines, args.radius).detach()) snake_approx_train_loss = loss( m(snake_approx_logits.squeeze().transpose(2, 1)), torch.cuda.LongTensor(ind_sets.squeeze())) snake_approx_optimizer.zero_grad() snake_approx_train_loss.backward() snake_approx_optimizer.step() # =========================================================================== # =========================================================================== # Now, minimize the approximate dice loss snake_approx_net.eval() gt_indices = [] for mask, center, cj, aj in zip(masks, centers, center_jitters, angle_jitters): gt_ind = mask_to_indices(mask, center, args.radius, args.num_lines, cj, aj) gt_indices.append(gt_ind) gt_indices = np.asarray(gt_indices).astype(int) gt_indices = gt_indices.reshape((args.batch_sz, args.num_lines)) gt_indices = torch.cuda.LongTensor(gt_indices) snake_approx_logits = snake_approx_net( gs.reshape((args.batch_sz, 1, args.num_lines, args.radius))) nll_approx_loss = loss( m(snake_approx_logits.squeeze().transpose(2, 1)), gt_indices) total_loss = nll_approx_loss optimizer.zero_grad() total_loss.backward() optimizer.step() # =========================================================================== snake_approx_train_loss = snake_approx_train_loss.data.cpu().numpy( ) nll_approx_loss = nll_approx_loss.data.cpu().numpy() total_loss = snake_approx_train_loss + nll_approx_loss if step % args.log_freq == 0: stop = timeit.default_timer() print(f"step={step}\tepoch={epoch}\titer={iteration}" f"\tloss={total_loss}" f"\tsnake_approx_train_loss={snake_approx_train_loss}" f"\tnll_approx_loss={nll_approx_loss}" f"\tlr={optimizer.param_groups[0]['lr']}" f"\ttime={stop-start}") start = stop writer.add_scalar("total_loss", total_loss, step) writer.add_scalar("nll_approx_loss", nll_approx_loss, step) writer.add_scalar("lr", optimizer.param_groups[0]["lr"], step) if step % args.train_eval_freq == 0: train_dice = do_eval( net, snake_eval, train.images, train.masks, train.centers, args.batch_sz, 
args.num_lines, args.radius, smoothing_window=args.smoothing_window).data.cpu().numpy() writer.add_scalar("train_dice", train_dice, step) print( f"step={step}\tepoch={epoch}\titer={iteration}\ttrain_eval: train_dice={train_dice}" ) if step % args.val_eval_freq == 0: val_dice = do_eval( net, snake_eval, valid.images, valid.masks, valid.centers, args.batch_sz, args.num_lines, args.radius, smoothing_window=args.smoothing_window).data.cpu().numpy() writer.add_scalar("val_dice", val_dice, step) print( f"step={step}\tepoch={epoch}\titer={iteration}\tvalid_dice={val_dice}" ) if val_dice > best_val_dice: best_val_dice = val_dice _pickle.dump([ net.state_dict(), optimizer.state_dict(), snake_approx_net.state_dict(), snake_approx_optimizer.state_dict() ], open( os.path.join(args.log_dir, 'best_model.pth.tar'), 'wb')) f = open( os.path.join(args.log_dir, f"best_val_dice{step}.txt"), 'w') f.write(str(best_val_dice)) f.close() print(f"better val dice detected.") step += 1 return best_val_dice
def main(): data_dir = '../data/' df_biopsy = pd.read_csv(os.path.join(data_dir, 'train.csv')) image_folder = os.path.join(data_dir, 'train_images') kernel_type = 'efficientnet-b3_36x256x256' enet_type = 'efficientnet-b3' num_folds = 5 fold = 0 tile_size = 256 n_tiles = 32 batch_size = 9 num_workers = 24 out_dim = 5 init_lr = 3e-4 warmup_factor = 10 warmup_epo = 1 n_epochs = 30 use_amp = True writer = SummaryWriter(f'tensorboard_logs/{kernel_type}/fold-{fold}') if use_amp and not APEX_AVAILABLE: print("Error: could not import APEX module") exit() skf = StratifiedKFold(num_folds, shuffle=True, random_state=42) df_biopsy['fold'] = -1 for i, (train_idx, valid_idx) in enumerate( skf.split(df_biopsy, df_biopsy['isup_grade'])): df_biopsy.loc[valid_idx, 'fold'] = i mean = [0.90949707, 0.8188697, 0.87795304] std = [0.36357649, 0.49984502, 0.40477625] transform_train = transforms.Compose([ transforms.RandomChoice([ transforms.RandomHorizontalFlip(p=0.5), transforms.RandomVerticalFlip(p=0.5), RotationTransform([90, -90]) ]), transforms.ToTensor(), transforms.Normalize(mean, std) ]) transform_val = transforms.Compose( [transforms.ToTensor(), transforms.Normalize(mean, std)]) df_train = df_biopsy.loc[df_biopsy['fold'] != fold] df_valid = df_biopsy.loc[df_biopsy['fold'] == fold] dataset_train = PANDADataset(df_train, image_folder, tile_size, n_tiles, \ out_dim, transform=transform_train) dataset_valid = PANDADataset(df_valid, image_folder, tile_size, n_tiles, \ out_dim, transform=transform_val) train_loader = DataLoader( dataset_train, batch_size=batch_size, sampler=RandomSampler(dataset_train), num_workers=num_workers, ) valid_loader = DataLoader(dataset_valid, batch_size=batch_size, sampler=SequentialSampler(dataset_valid), num_workers=num_workers) model = enetv2(enet_type, out_dim=out_dim) model = model.to(device) optimizer = optim.Adam(model.parameters(), lr=init_lr / warmup_factor) scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, n_epochs - warmup_epo) scheduler = GradualWarmupScheduler(optimizer, multiplier=warmup_factor, \ total_epoch=warmup_epo, after_scheduler=scheduler_cosine) criterion = nn.BCEWithLogitsLoss() if use_amp: model, optimizer = amp.initialize(model, optimizer, opt_level="O1", keep_batchnorm_fp32=None, loss_scale="dynamic") model = nn.DataParallel(model) print("Number of train samples : {}".format(len(dataset_train))) print("Number of validation samples : {}".format(len(dataset_valid))) best_model = f'{kernel_type}_fold-{fold}_best.pth' save_path = f'../trained_models/{kernel_type}/fold-{fold}/' os.makedirs(save_path, exist_ok=True) qwk_max = 0. for epoch in range(1, n_epochs + 1): print(time.ctime(), 'Epoch:', epoch) scheduler.step(epoch - 1) train_loss = train_epoch(model, train_loader, optimizer, criterion, use_amp=use_amp) val_loss, acc, (qwk, qwk_k, qwk_r) = val_epoch(model, valid_loader, criterion, df_valid) writer.add_scalars(f'loss', { 'train': np.mean(train_loss), 'val': val_loss }, epoch) writer.add_scalars(f'qwk', { 'total': qwk, 'Karolinska': qwk_k, 'Radboud': qwk_r }, epoch) content = "{}, Epoch {}, lr: {:.7f}, train loss: {:.5f}," \ " val loss: {:.5f}, acc: {:.5f}, qwk: {:.5f}".format( time.ctime(), epoch, optimizer.param_groups[0]["lr"], np.mean(train_loss), np.mean(val_loss), acc, qwk ) print(content) with open('train_logs/log_{}_fold-{}.txt'.format(kernel_type, fold), 'a') as appender: appender.write(content + '\n') if qwk > qwk_max: print('score2 ({:.6f} --> {:.6f}). Saving current best model ...'. 
format(qwk_max, qwk)) torch.save(model.state_dict(), os.path.join(save_path, best_model)) qwk_max = qwk torch.save( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'scheduler_state_dict': scheduler.state_dict(), 'qwk_max': qwk_max }, os.path.join(save_path, f'{kernel_type}_fold-{fold}_{epoch}.pth'))
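# Because the checkpoint above stores scheduler_state_dict alongside the model
# and optimizer, a run can be resumed with the warmup/cosine state intact.
# A minimal sketch of the restore side; the file name, map_location, and the
# assumption that GradualWarmupScheduler's state_dict round-trips like any
# _LRScheduler are all assumptions, not part of the original script:
ckpt = torch.load(os.path.join(save_path, f'{kernel_type}_fold-{fold}_5.pth'),
                  map_location='cpu')
model.load_state_dict(ckpt['model_state_dict'])
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
scheduler.load_state_dict(ckpt['scheduler_state_dict'])
start_epoch = ckpt['epoch'] + 1
qwk_max = ckpt['qwk_max']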
def main_worker(gpu, args): global best_acc1 args.gpu = gpu if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) # create model # if args.gen_map: # args.qw = -1 if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) else: print("=> creating model '{}'".format(args.arch)) try: model = mnist_models.__dict__[args.arch](pretrained=args.pretrained) except KeyError: print('do not support {}'.format(args.arch)) return print('model:\n=========\n{}\n=========='.format(model)) if args.gpu is not None and args.gpus is None: #torch.cuda.set_device(args.gpu) model = model.cuda(args.gpu) else: os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus print('Use {} gpus'.format(args.gpus)) # DataParallel will divide and allocate batch_size to all available GPUs if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda(args.gpu) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: print(args.resume) if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume, map_location='cpu') model.load_state_dict( checkpoint['state_dict']) # GPU memory leak. todo if not args.quant_bias_scale: args.start_epoch = checkpoint['epoch'] best_acc1 = checkpoint['best_acc1'] optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {}) (acc: {})".format( args.resume, checkpoint['epoch'], best_acc1)) print('=> save only weights in {}.pth'.format(args.arch)) model.cpu() torch.save(model.state_dict(), '{}.pth'.format(args.arch)) model.cuda(args.gpu) # save pth here else: print("=> no checkpoint found at '{}'".format(args.resume)) # ConvQ + BN fusion if args.bn_fusion: print('BN fusion begin') model = wrapper.fuse_bn_recursively(model) print('after bn fusion: ') print(model) if args.resume_after: if os.path.isfile(args.resume_after): print('=> loading checkpoint {}'.format(args.resume_after)) checkpoint = torch.load(args.resume_after, map_location='cpu') model.load_state_dict(checkpoint['state_dict']) model.cuda(args.gpu) else: print("=> no checkpoint found at '{}'".format(args.resume)) if args.extract_inner_data: print('extract inner feature map and weight') wrapper.save_inner_hooks(model) for k, v in model.state_dict().items(): np.save('{}'.format(k), v.cpu().numpy()) cudnn.benchmark = True # Data loading code print('==> Preparing data..') # transform_train = transforms.Compose([ # transforms.RandomCrop(32, padding=4), # transforms.RandomHorizontalFlip(), # transforms.ToTensor(), # transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), # ]) # transform_test = transforms.Compose([ # transforms.ToTensor(), # transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), # ]) ''' trainset = torchvision.datasets.CIFAR10(root=args.data, train=True, download=True, transform=transform_train) train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers) args.batch_num = len(train_loader) testset = torchvision.datasets.CIFAR10(root=args.data, train=False, download=True, transform=transform_test) val_loader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, 
num_workers=args.workers) ''' train_loader = torch.utils.data.DataLoader(torchvision.datasets.MNIST( '~/dataset/mnist', train=True, download=True, transform=transforms.Compose([ transforms.Resize((32, 32)), transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ])), batch_size=args.batch_size, shuffle=True) args.batch_num = len(train_loader) val_loader = torch.utils.data.DataLoader(torchvision.datasets.MNIST( '~/dataset/mnist', train=False, transform=transforms.Compose([ transforms.Resize((32, 32)), transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ])), batch_size=args.batch_size, shuffle=False) scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=args.epochs) scheduler_step = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=[80, 160, 300]) scheduler_next = scheduler_step if args.cosine: scheduler_next = scheduler_cosine scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=10, after_scheduler=scheduler_next) if args.evaluate: validate(val_loader, model, criterion, args) return if 'q' in args.arch: args.log_name = 'logger/{}_{}'.format(args.arch, args.log_name) else: args.log_name = 'logger/{}_{}'.format(args.arch, args.log_name) writer = SummaryWriter(args.log_name) with open('{}/{}.txt'.format(args.log_name, args.arch), 'w') as wf: wf.write(str(model)) for epoch in range(args.start_epoch, args.epochs): # adjust_learning_rate(optimizer, epoch, args) scheduler_warmup.step() # train for one epoch train(train_loader, model, criterion, optimizer, epoch, args, writer) # evaluate on validation set acc1 = validate(val_loader, model, criterion, args) writer.add_scalar('val/acc1', acc1, epoch) writer.add_scalar('train/lr', optimizer.param_groups[0]['lr'], epoch) if args.debug: cnt = 0 for k, v in model.state_dict().items(): if 'pos' in k or 'neg' in k or 'shift' in k: writer.add_histogram(k, v, epoch) cnt += 1 if cnt == 10: break # remember best acc@1 and save checkpoint is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_acc1': best_acc1, 'optimizer': optimizer.state_dict(), }, is_best, prefix='{}/{}_'.format(args.log_name, args.arch))
for data in train_loader:
    data = data.to(device)
    optimizer.zero_grad()
    output = model(data)
    loss = F.l1_loss(output, data.y)
    loss_all += loss.item() * data.num_graphs
    loss.backward()
    clip_grad_norm_(model.parameters(), max_norm=1000, norm_type=2)
    optimizer.step()
    curr_epoch = epoch + float(step) / (len(train_dataset) / args.batch_size)
    scheduler_warmup.step(curr_epoch)
    ema(model)
    step += 1
train_loss = loss_all / len(train_loader.dataset)
val_loss = test(val_loader)
if best_val_loss is None or val_loss <= best_val_loss:
    test_loss = test(test_loader)
    best_epoch = epoch
    best_val_loss = val_loss
print('Epoch: {:03d}, Train MAE: {:.7f}, Validation MAE: {:.7f}, '
      'Test MAE: {:.7f}'.format(epoch + 1, train_loss, val_loss, test_loss))
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.optim.sgd import SGD
import matplotlib.pyplot as plt
from warmup_scheduler import GradualWarmupScheduler

if __name__ == '__main__':
    model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
    optim = SGD(model, 0.1)
    epochs = 20
    # scheduler_warmup is chained with lr_scheduler
    lr_scheduler = CosineAnnealingLR(optim, T_max=epochs - 5, eta_min=0.02)
    scheduler_warmup = GradualWarmupScheduler(optim,
                                              multiplier=1,
                                              total_epoch=5,
                                              after_scheduler=lr_scheduler)
    # this zero gradient update is needed to avoid a warning message, issue #8.
    optim.zero_grad()
    optim.step()
    scheduler_warmup.step()
    lr_list = list()
    for epoch in range(epochs):
        current_lr = optim.param_groups[0]['lr']
        optim.step()
        scheduler_warmup.step()
        print(epoch + 1, current_lr)
        lr_list.append(current_lr)
    plt.plot(lr_list)
    plt.show()
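# For reference, during the warmup phase ildoonet's scheduler follows (a sketch
# of the rule as commonly documented, not the library source verbatim):
#   multiplier == 1.0 -> lr = base_lr * epoch / total_epoch          (ramp from 0)
#   multiplier  > 1.0 -> lr = base_lr * ((multiplier - 1) * epoch / total_epoch + 1)
# and defers to after_scheduler once epoch exceeds total_epoch. Handy for
# sanity-checking the printed lr_list above:
def expected_warmup_lr(base_lr, multiplier, total_epoch, epoch):
    if multiplier == 1.0:
        return base_lr * epoch / total_epoch
    return base_lr * ((multiplier - 1.0) * epoch / total_epoch + 1.0)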
def train(net, loader): losses = [] loss_fn = NTXentLoss(batch_size=BATCH_SIZE, temperature=TEMPERATURE, use_cosine_similarity=True) optimizer = SGD_with_lars(net.parameters(), lr=0.1 * BATCH_SIZE / 256, momentum=0.9, weight_decay=1e-6) from warmup_scheduler import GradualWarmupScheduler cosine_scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer, TOTAL_EPOCHS) scheduler = GradualWarmupScheduler( optimizer, multiplier=1, total_epoch=TOTAL_EPOCHS // 10, after_scheduler=cosine_scheduler, ) train_start = time.time() net.change_mode("pretrain") for epoch in range(1, TOTAL_EPOCHS + 1): train_loss = 0 net.train() epoch_start = time.time() for idx, (data, target) in enumerate(loader): optimizer.zero_grad() xi, xj, target = data[0].cuda(), data[1].cuda(), target.cuda() _, zis, _ = net(xi) _, zjs, _ = net(xj) loss = loss_fn(zis, zjs) train_loss += loss.item() loss.backward() optimizer.step() train_loss /= idx + 1 losses.append(train_loss) scheduler.step() epoch_time = time.time() - epoch_start print( "Epoch\t", epoch, "\tLoss\t", train_loss, "\tTime\t", epoch_time, ) elapsed_train_time = time.time() - train_start print("Finished training. Train time was:", elapsed_train_time) return losses
def main(pargs): # this should be global global have_wandb #init distributed training comm.init(pargs.wireup_method) comm_rank = comm.get_rank() comm_local_rank = comm.get_local_rank() comm_size = comm.get_size() # set up logging pargs.logging_frequency = max([pargs.logging_frequency, 1]) log_file = os.path.normpath( os.path.join(pargs.output_dir, "logs", pargs.run_tag + ".log")) logger = mll.mlperf_logger(log_file, "deepcam", "Umbrella Corp.") logger.log_start(key="init_start", sync=True) logger.log_event(key="cache_clear") #set seed seed = 333 logger.log_event(key="seed", value=seed) # Some setup torch.manual_seed(seed) if torch.cuda.is_available(): device = torch.device("cuda", comm_local_rank) torch.cuda.manual_seed(seed) #necessary for AMP to work torch.cuda.set_device(device) # TEST: allowed? Valuable? #torch.backends.cudnn.benchark = True else: device = torch.device("cpu") #visualize? visualize = (pargs.training_visualization_frequency > 0) or (pargs.validation_visualization_frequency > 0) #set up directories root_dir = os.path.join(pargs.data_dir_prefix) output_dir = pargs.output_dir plot_dir = os.path.join(output_dir, "plots") if comm_rank == 0: if not os.path.isdir(output_dir): os.makedirs(output_dir) if visualize and not os.path.isdir(plot_dir): os.makedirs(plot_dir) # Setup WandB if not pargs.enable_wandb: have_wandb = False if have_wandb and (comm_rank == 0): # get wandb api token certfile = os.path.join(pargs.wandb_certdir, ".wandbirc") try: with open(certfile) as f: token = f.readlines()[0].replace("\n", "").split() wblogin = token[0] wbtoken = token[1] except IOError: print("Error, cannot open WandB certificate {}.".format(certfile)) have_wandb = False if have_wandb: # log in: that call can be blocking, it should be quick sp.call(["wandb", "login", wbtoken]) #init db and get config resume_flag = pargs.run_tag if pargs.resume_logging else False wandb.init(entity=wblogin, project='deepcam', name=pargs.run_tag, id=pargs.run_tag, resume=resume_flag) config = wandb.config #set general parameters config.root_dir = root_dir config.output_dir = pargs.output_dir config.max_epochs = pargs.max_epochs config.local_batch_size = pargs.local_batch_size config.num_workers = comm_size config.channels = pargs.channels config.optimizer = pargs.optimizer config.start_lr = pargs.start_lr config.adam_eps = pargs.adam_eps config.weight_decay = pargs.weight_decay config.model_prefix = pargs.model_prefix config.amp_opt_level = pargs.amp_opt_level config.loss_weight_pow = pargs.loss_weight_pow config.lr_warmup_steps = pargs.lr_warmup_steps config.lr_warmup_factor = pargs.lr_warmup_factor # lr schedule if applicable if pargs.lr_schedule: for key in pargs.lr_schedule: config.update( {"lr_schedule_" + key: pargs.lr_schedule[key]}, allow_val_change=True) # Logging hyperparameters logger.log_event(key="global_batch_size", value=(pargs.local_batch_size * comm_size)) logger.log_event(key="opt_name", value=pargs.optimizer) logger.log_event(key="opt_base_learning_rate", value=pargs.start_lr * pargs.lr_warmup_factor) logger.log_event(key="opt_learning_rate_warmup_steps", value=pargs.lr_warmup_steps) logger.log_event(key="opt_learning_rate_warmup_factor", value=pargs.lr_warmup_factor) logger.log_event(key="opt_epsilon", value=pargs.adam_eps) # Define architecture n_input_channels = len(pargs.channels) n_output_channels = 3 net = deeplab_xception.DeepLabv3_plus(n_input=n_input_channels, n_classes=n_output_channels, os=16, pretrained=False, rank=comm_rank) net.to(device) #select loss loss_pow = 
pargs.loss_weight_pow #some magic numbers class_weights = [ 0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow, 0.01327431072255291**loss_pow ] fpw_1 = 2.61461122397522257612 fpw_2 = 1.71641974795896018744 criterion = losses.fp_loss #select optimizer optimizer = None if pargs.optimizer == "Adam": optimizer = optim.Adam(net.parameters(), lr=pargs.start_lr, eps=pargs.adam_eps, weight_decay=pargs.weight_decay) elif pargs.optimizer == "AdamW": optimizer = optim.AdamW(net.parameters(), lr=pargs.start_lr, eps=pargs.adam_eps, weight_decay=pargs.weight_decay) elif have_apex and (pargs.optimizer == "LAMB"): optimizer = aoptim.FusedLAMB(net.parameters(), lr=pargs.start_lr, eps=pargs.adam_eps, weight_decay=pargs.weight_decay) else: raise NotImplementedError("Error, optimizer {} not supported".format( pargs.optimizer)) if have_apex: #wrap model and opt into amp net, optimizer = amp.initialize(net, optimizer, opt_level=pargs.amp_opt_level) #make model distributed net = DDP(net) #restart from checkpoint if desired #if (comm_rank == 0) and (pargs.checkpoint): #load it on all ranks for now if pargs.checkpoint: checkpoint = torch.load(pargs.checkpoint, map_location=device) start_step = checkpoint['step'] start_epoch = checkpoint['epoch'] optimizer.load_state_dict(checkpoint['optimizer']) net.load_state_dict(checkpoint['model']) if have_apex: amp.load_state_dict(checkpoint['amp']) else: start_step = 0 start_epoch = 0 #select scheduler if pargs.lr_schedule: scheduler_after = ph.get_lr_schedule(pargs.start_lr, pargs.lr_schedule, optimizer, last_step=start_step) # LR warmup if pargs.lr_warmup_steps > 0: if have_warmup_scheduler: scheduler = GradualWarmupScheduler( optimizer, multiplier=pargs.lr_warmup_factor, total_epoch=pargs.lr_warmup_steps, after_scheduler=scheduler_after) # Throw an error if the package is not found else: raise Exception( f'Requested {pargs.lr_warmup_steps} LR warmup steps ' 'but warmup scheduler not found. 
Install it from ' 'https://github.com/ildoonet/pytorch-gradual-warmup-lr') else: scheduler = scheduler_after #broadcast model and optimizer state steptens = torch.tensor(np.array([start_step, start_epoch]), requires_grad=False).to(device) dist.broadcast(steptens, src=0) ##broadcast model and optimizer state #hvd.broadcast_parameters(net.state_dict(), root_rank = 0) #hvd.broadcast_optimizer_state(optimizer, root_rank = 0) #unpack the bcasted tensor start_step = steptens.cpu().numpy()[0] start_epoch = steptens.cpu().numpy()[1] # Set up the data feeder # train train_dir = os.path.join(root_dir, "train") train_set = cam.CamDataset(train_dir, statsfile=os.path.join(root_dir, 'stats.h5'), channels=pargs.channels, allow_uneven_distribution=False, shuffle=True, preprocess=True, comm_size=comm_size, comm_rank=comm_rank) train_loader = DataLoader( train_set, pargs.local_batch_size, num_workers=min([pargs.max_inter_threads, pargs.local_batch_size]), pin_memory=True, drop_last=True) # validation: we only want to shuffle the set if we are cutting off validation after a certain number of steps validation_dir = os.path.join(root_dir, "validation") validation_set = cam.CamDataset(validation_dir, statsfile=os.path.join( root_dir, 'stats.h5'), channels=pargs.channels, allow_uneven_distribution=True, shuffle=(pargs.max_validation_steps is not None), preprocess=True, comm_size=comm_size, comm_rank=comm_rank) # use batch size = 1 here to make sure that we do not drop a sample validation_loader = DataLoader( validation_set, 1, num_workers=min([pargs.max_inter_threads, pargs.local_batch_size]), pin_memory=True, drop_last=True) # log size of datasets logger.log_event(key="train_samples", value=train_set.global_size) if pargs.max_validation_steps is not None: val_size = min([ validation_set.global_size, pargs.max_validation_steps * pargs.local_batch_size * comm_size ]) else: val_size = validation_set.global_size logger.log_event(key="eval_samples", value=val_size) # do sanity check if pargs.max_validation_steps is not None: logger.log_event(key="invalid_submission") #for visualization #if visualize: # viz = vizc.CamVisualizer() # Train network if have_wandb and (comm_rank == 0): wandb.watch(net) step = start_step epoch = start_epoch current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr( )[0] stop_training = False net.train() # start trining logger.log_end(key="init_stop", sync=True) logger.log_start(key="run_start", sync=True) # training loop while True: # start epoch logger.log_start(key="epoch_start", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) # epoch loop for inputs, label, filename in train_loader: # send to device inputs = inputs.to(device) label = label.to(device) # forward pass outputs = net.forward(inputs) # Compute loss and average across nodes loss = criterion(outputs, label, weight=class_weights, fpw_1=fpw_1, fpw_2=fpw_2) # Backprop optimizer.zero_grad() if have_apex: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() # step counter step += 1 if pargs.lr_schedule: current_lr = scheduler.get_last_lr()[0] scheduler.step() #visualize if requested #if (step % pargs.training_visualization_frequency == 0) and (comm_rank == 0): # # Compute predictions # predictions = torch.max(outputs, 1)[1] # # # extract sample id and data tensors # sample_idx = np.random.randint(low=0, high=label.shape[0]) # plot_input = inputs.detach()[sample_idx, 0,...].cpu().numpy() # plot_prediction = 
predictions.detach()[sample_idx,...].cpu().numpy() # plot_label = label.detach()[sample_idx,...].cpu().numpy() # # # create filenames # outputfile = os.path.basename(filename[sample_idx]).replace("data-", "training-").replace(".h5", ".png") # outputfile = os.path.join(plot_dir, outputfile) # # # plot # viz.plot(filename[sample_idx], outputfile, plot_input, plot_prediction, plot_label) # # #log if requested # if have_wandb: # img = Image.open(outputfile) # wandb.log({"train_examples": [wandb.Image(img, caption="Prediction vs. Ground Truth")]}, step = step) #log if requested if (step % pargs.logging_frequency == 0): # allreduce for loss loss_avg = loss.detach() dist.reduce(loss_avg, dst=0, op=dist.ReduceOp.SUM) loss_avg_train = loss_avg.item() / float(comm_size) # Compute score predictions = torch.max(outputs, 1)[1] iou = utils.compute_score(predictions, label, device_id=device, num_classes=3) iou_avg = iou.detach() dist.reduce(iou_avg, dst=0, op=dist.ReduceOp.SUM) iou_avg_train = iou_avg.item() / float(comm_size) logger.log_event(key="learning_rate", value=current_lr, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) logger.log_event(key="train_accuracy", value=iou_avg_train, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) logger.log_event(key="train_loss", value=loss_avg_train, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) if have_wandb and (comm_rank == 0): wandb.log( {"train_loss": loss_avg.item() / float(comm_size)}, step=step) wandb.log( {"train_accuracy": iou_avg.item() / float(comm_size)}, step=step) wandb.log({"learning_rate": current_lr}, step=step) wandb.log({"epoch": epoch + 1}, step=step) # validation step if desired if (step % pargs.validation_frequency == 0): logger.log_start(key="eval_start", metadata={'epoch_num': epoch + 1}) #eval net.eval() count_sum_val = torch.Tensor([0.]).to(device) loss_sum_val = torch.Tensor([0.]).to(device) iou_sum_val = torch.Tensor([0.]).to(device) # disable gradients with torch.no_grad(): # iterate over validation sample step_val = 0 # only print once per eval at most visualized = False for inputs_val, label_val, filename_val in validation_loader: #send to device inputs_val = inputs_val.to(device) label_val = label_val.to(device) # forward pass outputs_val = net.forward(inputs_val) # Compute loss and average across nodes loss_val = criterion(outputs_val, label_val, weight=class_weights, fpw_1=fpw_1, fpw_2=fpw_2) loss_sum_val += loss_val #increase counter count_sum_val += 1. # Compute score predictions_val = torch.max(outputs_val, 1)[1] iou_val = utils.compute_score(predictions_val, label_val, device_id=device, num_classes=3) iou_sum_val += iou_val # Visualize #if (step_val % pargs.validation_visualization_frequency == 0) and (not visualized) and (comm_rank == 0): # #extract sample id and data tensors # sample_idx = np.random.randint(low=0, high=label_val.shape[0]) # plot_input = inputs_val.detach()[sample_idx, 0,...].cpu().numpy() # plot_prediction = predictions_val.detach()[sample_idx,...].cpu().numpy() # plot_label = label_val.detach()[sample_idx,...].cpu().numpy() # # #create filenames # outputfile = os.path.basename(filename[sample_idx]).replace("data-", "validation-").replace(".h5", ".png") # outputfile = os.path.join(plot_dir, outputfile) # # #plot # viz.plot(filename[sample_idx], outputfile, plot_input, plot_prediction, plot_label) # visualized = True # # #log if requested # if have_wandb: # img = Image.open(outputfile) # wandb.log({"eval_examples": [wandb.Image(img, caption="Prediction vs. 
Ground Truth")]}, step = step) #increase eval step counter step_val += 1 if (pargs.max_validation_steps is not None ) and step_val > pargs.max_validation_steps: break # average the validation loss dist.all_reduce(count_sum_val, op=dist.ReduceOp.SUM) dist.all_reduce(loss_sum_val, op=dist.ReduceOp.SUM) dist.all_reduce(iou_sum_val, op=dist.ReduceOp.SUM) loss_avg_val = loss_sum_val.item() / count_sum_val.item() iou_avg_val = iou_sum_val.item() / count_sum_val.item() # print results logger.log_event(key="eval_accuracy", value=iou_avg_val, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) logger.log_event(key="eval_loss", value=loss_avg_val, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) # log in wandb if have_wandb and (comm_rank == 0): wandb.log({"eval_loss": loss_avg_val}, step=step) wandb.log({"eval_accuracy": iou_avg_val}, step=step) if (iou_avg_val >= pargs.target_iou): logger.log_event(key="target_accuracy_reached", value=pargs.target_iou, metadata={ 'epoch_num': epoch + 1, 'step_num': step }) stop_training = True # set to train net.train() logger.log_end(key="eval_stop", metadata={'epoch_num': epoch + 1}) #save model if desired if (pargs.save_frequency > 0) and (step % pargs.save_frequency == 0): logger.log_start(key="save_start", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) if comm_rank == 0: checkpoint = { 'step': step, 'epoch': epoch, 'model': net.state_dict(), 'optimizer': optimizer.state_dict() } if have_apex: checkpoint['amp'] = amp.state_dict() torch.save( checkpoint, os.path.join( output_dir, pargs.model_prefix + "_step_" + str(step) + ".cpt")) logger.log_end(key="save_stop", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) # Stop training? if stop_training: break # log the epoch logger.log_end(key="epoch_stop", metadata={ 'epoch_num': epoch + 1, 'step_num': step }, sync=True) epoch += 1 # are we done? if epoch >= pargs.max_epochs or stop_training: break # run done logger.log_end(key="run_stop", sync=True, metadata={'status': 'success'})
def train(args, train_dataloader, val_dataloader, test_dataloader, criterion): model = LSTMCrossCycleGCNDropout(args.voc_len, args.rnn_layers, args.birnn, 'gru', args.word_matrix, args.resnet_input_size, args.c3d_input_size, args.rnn_layers, args.birnn, 'gru', args.hidden_size, dropout_p=args.dropout, gcn_layers=args.gcn_layers, num_heads=8, answer_vocab_size=args.answer_vocab_size, q_max_len=args.q_max_length, v_max_len=args.v_max_length, tf_layers=args.tf_layers, two_loss=args.two_loss, fusion_type=args.fusion_type, ablation=args.ablation) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = nn.DataParallel(model) model.to(device) if args.change_lr == 'none': optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.change_lr == 'acc': optimizer = torch.optim.Adam(model.parameters(), lr=args.lr / 5., weight_decay=args.weight_decay) # val plateau scheduler scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3, verbose=True) # target lr = args.lr * multiplier scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=5, total_epoch=5, after_scheduler=scheduler) elif args.change_lr == 'loss': optimizer = torch.optim.Adam(model.parameters(), lr=args.lr / 5., weight_decay=args.weight_decay) # val plateau scheduler scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True) # target lr = args.lr * multiplier scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=5, total_epoch=5, after_scheduler=scheduler) elif args.change_lr == 'cos': optimizer = torch.optim.Adam(model.parameters(), lr=args.lr / 5., weight_decay=args.weight_decay) # consine annealing scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, args.max_epoch) # target lr = args.lr * multiplier scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=5, total_epoch=5, after_scheduler=scheduler) elif args.change_lr == 'step': optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=args.lr_list, gamma=0.1) # scheduler_warmup = GradualWarmupScheduler( # optimizer, multiplier=5, total_epoch=5, after_scheduler=scheduler) best_val_acc = 0. if args.task != 'Count' else -100. for epoch in range(args.max_epoch): print('Start Training Epoch: {}'.format(epoch)) model.train() loss_list = [] prediction_list = [] correct_answer_list = [] if args.change_lr == 'cos': # consine annealing scheduler_warmup.step(epoch=epoch) for ii, data in enumerate(train_dataloader): if epoch == 0 and ii == 0: print([d.dtype for d in data], [d.size() for d in data]) # print([d.dtype for d in data], [d.size() for d in data]) data = [d.to(device) for d in data] optimizer.zero_grad() out, predictions, answers, _ = model(args.task, *data) loss = criterion(out, answers) loss.backward() optimizer.step() correct_answer_list.append(answers) loss_list.append(loss.item()) prediction_list.append(predictions.detach()) if ii % 100 == 0: print("Batch: ", ii) train_loss = np.mean(loss_list) correct_answer = torch.cat(correct_answer_list, dim=0).long() predict_answer = torch.cat(prediction_list, dim=0).long() assert correct_answer.shape == predict_answer.shape current_num = torch.sum(predict_answer == correct_answer).cpu().numpy() acc = current_num / len(correct_answer) * 100. 
        # print('Learning Rate: {}'.format(optimizer.param_groups[0]['lr']))

        print("Train|Epoch: {}, Acc : {:.3f}={}/{}, Train Loss: {:.3f}".format(
            epoch, acc, current_num, len(correct_answer), train_loss))
        if args.task == 'Count':
            count_loss = F.mse_loss(predict_answer.float(), correct_answer.float())
            print('Train|Count Real Loss:\t {:.3f}'.format(count_loss))

        val_acc, val_loss = val(args, model, val_dataloader, epoch, criterion)

        # step the schedulers after validation, so the metric the plateau
        # schedulers consume exists (stepping before val() would hit an
        # undefined val_acc / val_loss on the first epoch)
        if args.change_lr == 'acc':
            scheduler_warmup.step(epoch, val_acc)
        elif args.change_lr == 'loss':
            scheduler_warmup.step(epoch, val_loss)
        elif args.change_lr == 'step':
            scheduler.step()

        if val_acc > best_val_acc:
            print('Best Val Acc ======')
            best_val_acc = val_acc
        if epoch % args.val_epoch_step == 0 or val_acc >= best_val_acc:
            test(args, model, test_dataloader, epoch, criterion)
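# The 'acc' and 'loss' branches above pass the validation metric through the
# warmup wrapper to its ReduceLROnPlateau after_scheduler. A standalone
# sketch of that hand-off, assuming warmup_scheduler's step(epoch, metrics)
# signature (the same one the code above relies on); the dummy parameter and
# constant metric are placeholders:
import torch
from warmup_scheduler import GradualWarmupScheduler

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.Adam(params, lr=1e-3 / 5.)
plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                     factor=0.1, patience=3)
warmup = GradualWarmupScheduler(optimizer, multiplier=5, total_epoch=5,
                                after_scheduler=plateau)

for epoch in range(20):
    val_acc = 0.5  # placeholder; in the code above this comes from val(...)
    # during warmup the metric is ignored; afterwards it drives the plateau logic
    warmup.step(epoch, val_acc)
    print(epoch, optimizer.param_groups[0]['lr'])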
import torch
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
from warmup_scheduler import GradualWarmupScheduler

lr = 0.001
v = torch.zeros(10, requires_grad=True)  # dummy parameter so the snippet runs standalone
optim = torch.optim.SGD([v], lr=lr)
optim.param_groups[0]['initial_lr'] = lr
last_epoch = -1
scheduler = lr_scheduler.MultiStepLR(optim, milestones=[4], gamma=0.1, last_epoch=-1)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=10, eta_min=0.00001, last_epoch=-1)
# scheduler = lr_scheduler.OneCycleLR(optim, max_lr=0.001, total_steps=6000, pct_start=0.033, anneal_strategy='cos', last_epoch=last_epoch)

warmup = True
if warmup:
    scheduler = GradualWarmupScheduler(optim, multiplier=5, total_epoch=5, after_scheduler=scheduler)
# if last_epoch != -1:
#     scheduler.step()

lrs = []
for epoch in range(last_epoch + 1, 30):
    print(epoch, optim.param_groups[0]['lr'])
    lrs.append(optim.param_groups[0]['lr'])
    scheduler.step()

plt.plot(lrs)
plt.show()
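# For reference: in the ildoonet warmup_scheduler implementation the warmup
# phase appears to follow
#     lr = base_lr * ((multiplier - 1) * epoch / total_epoch + 1)
# so the plot above should ramp 0.001 -> 0.005 over the first 5 epochs before
# MultiStepLR takes over. A quick sanity check of that formula (assumed, not
# taken from this document):
base_lr, multiplier, total_epoch = 0.001, 5, 5
for epoch in range(total_epoch + 1):
    print(epoch, base_lr * ((multiplier - 1) * epoch / total_epoch + 1))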
            x, y_a, y_b, lam = cutmix_data(x, y, config.cutmix_beta)
            pred = model(x)
            # mix the losses rather than the raw labels: this matches the
            # usual CutMix objective and, unlike
            # criterion(pred, y_a * lam + y_b * (1 - lam)),
            # also works when y_a / y_b are integer class indices
            loss = lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)
        else:
            pred = model(speech)
            loss = criterion(pred, speech_label)

        if config.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss += loss.item() / len(train_loader)
        scheduler.step(step)
        step += 1
        progress_bar.set_description(
            'Step: {}. LR : {:.5f}. Epoch: {}/{}. Iteration: {}/{}. current loss: {:.5f}'
            .format(step, optimizer.param_groups[0]['lr'], epoch, config.n_epoch,
                    idx + 1, len(train_loader), loss.item()))

    valid_loss = 0
    valid_acc = 0
    model.eval()
    for idx, data in enumerate(tqdm(valid_loader)):
        x = data['x'].cuda()
        y = data['y'].cuda()
        with torch.no_grad():
            pred = model(x)
            loss = criterion(pred, y)
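# cutmix_data is not defined in this excerpt. A minimal sketch of a common
# CutMix implementation with the same return signature (mixed inputs, both
# target sets, and lam), following Yun et al. 2019 and assuming NCHW batches;
# the original helper may differ in details:
import numpy as np
import torch

def cutmix_data(x, y, beta=1.0):
    # sample the mixing ratio and a random partner for every example
    lam = np.random.beta(beta, beta)
    index = torch.randperm(x.size(0), device=x.device)
    y_a, y_b = y, y[index]
    # cut a box whose area is roughly (1 - lam) of the image
    H, W = x.size(2), x.size(3)
    cut_rat = np.sqrt(1. - lam)
    cut_h, cut_w = int(H * cut_rat), int(W * cut_rat)
    cy, cx = np.random.randint(H), np.random.randint(W)
    y1, y2 = np.clip(cy - cut_h // 2, 0, H), np.clip(cy + cut_h // 2, 0, H)
    x1, x2 = np.clip(cx - cut_w // 2, 0, W), np.clip(cx + cut_w // 2, 0, W)
    x[:, :, y1:y2, x1:x2] = x[index, :, y1:y2, x1:x2]
    # correct lam to the exact pasted area
    lam = 1. - (y2 - y1) * (x2 - x1) / (H * W)
    return x, y_a, y_b, lam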
import torch
from warmup_scheduler import GradualWarmupScheduler

if __name__ == '__main__':
    v = torch.zeros(10)
    optim = torch.optim.SGD([v], lr=0.01)
    scheduler = GradualWarmupScheduler(optim, multiplier=8, total_epoch=10)
    for epoch in range(1, 20):
        scheduler.step(epoch)
        print(epoch, optim.param_groups[0]['lr'])
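# With no after_scheduler, the LR should hold at multiplier * base_lr (0.08
# here) once the 10 warmup epochs are done, since there is nothing to hand
# off to. Expected values, assuming the linear warmup formula noted earlier:
for epoch in range(1, 20):
    print(epoch, 0.01 * min((8 - 1) * epoch / 10 + 1, 8))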
import torch
from torch.optim.lr_scheduler import StepLR, ExponentialLR
from torch.optim.sgd import SGD

from warmup_scheduler import GradualWarmupScheduler

if __name__ == '__main__':
    model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
    optim = SGD(model, 0.1)

    # scheduler_warmup is chained with scheduler_steplr
    scheduler_steplr = StepLR(optim, step_size=10, gamma=0.1)
    scheduler_warmup = GradualWarmupScheduler(optim, multiplier=1, total_epoch=5, after_scheduler=scheduler_steplr)

    # this zero gradient update is needed to avoid a warning message, issue #8.
    optim.zero_grad()
    optim.step()

    for epoch in range(1, 20):
        scheduler_warmup.step(epoch)
        print(epoch, optim.param_groups[0]['lr'])

        optim.step()    # backward pass (update network)
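# With multiplier=1 the wrapper ramps from ~0 up to the base LR (0.1) over
# total_epoch=5 and only then defers to the StepLR after_scheduler; as far
# as I can tell the multiplier == 1 warmup value is
#     lr = base_lr * epoch / total_epoch
# so the loop above should print 0.02, 0.04, ..., 0.1 for epochs 1-5. A
# quick check of that (assumed) formula:
for epoch in range(1, 6):
    print(epoch, 0.1 * epoch / 5)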
scheduler = GradualWarmupScheduler(optimizer,
                                   multiplier=warmup_factor,
                                   total_epoch=warmup_epo,
                                   after_scheduler=scheduler_cosine)
# optimizer = Radam.Over9000(model.parameters(), lr = init_lr)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
model = torch.nn.DataParallel(model, device_ids=list(range(len(gpus.split(",")))))

qwk_max = 0.
for epoch in range(1, n_epochs + 1):
    printOut(time.ctime(), 'Epoch:', epoch)
    scheduler.step(epoch - 1)

    train_loss = train_epoch(train_loader, optimizer)
    val_loss, acc, qwk = val_epoch(valid_loader, epoch == n_epochs)

    content = time.ctime() + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, val loss: {np.mean(val_loss):.5f}, acc: {(acc):.5f}, qwk: {(qwk):.5f}'
    printOut(content)

    if qwk > qwk_max:
        printOut('score2 ({:.6f} --> {:.6f}). Saving model ...'.format(qwk_max, qwk))
        torch.save(model.module.state_dict(), modelpath)
        qwk_max = qwk

torch.save(model.module.state_dict(),
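# The snippet above uses NVIDIA apex amp (opt_level="O1") for mixed
# precision. On recent PyTorch the built-in torch.cuda.amp API is the usual
# replacement; a minimal hedged sketch, reusing model / optimizer /
# train_loader from above (criterion is assumed, not shown in the excerpt):
import torch

scaler = torch.cuda.amp.GradScaler()
for data, target in train_loader:
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        output = model(data.cuda())
        loss = criterion(output, target.cuda())
    scaler.scale(loss).backward()   # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)          # unscales gradients, then steps
    scaler.update()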
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    logger = Logger(opt)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    opt.lr = 5e-3
    optimizer = torch.optim.Adam(model.parameters(), opt.lr, weight_decay=0)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                                  T_max=310,
                                                                  eta_min=0)
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=1,
                                              total_epoch=10,
                                              after_scheduler=scheduler_cosine)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        scheduler_warmup.step(epoch)
        # if epoch in opt.lr_step:
        #     save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
        #                epoch, model, optimizer)
        #     lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
        #     print('Drop LR to', lr)
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
    logger.close()
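# Sanity check of the warmup + cosine hand-off configured above: with
# multiplier=1 the LR should climb toward the base 5e-3 over 10 epochs and
# then follow the T_max=310 cosine decay. Standalone sketch with a dummy
# parameter (values depend on the warmup_scheduler internals, so treat the
# printed numbers as indicative):
import torch
from warmup_scheduler import GradualWarmupScheduler

p = [torch.nn.Parameter(torch.zeros(1))]
opt_ = torch.optim.Adam(p, lr=5e-3)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(opt_, T_max=310, eta_min=0)
warm = GradualWarmupScheduler(opt_, multiplier=1, total_epoch=10,
                              after_scheduler=cosine)
for epoch in range(1, 21):
    warm.step(epoch)
    print(epoch, opt_.param_groups[0]['lr'])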