def main():
    """Train a landmark-recognition model on a small demo subset.

    Restricted to images under folders 0/0 .. 0/3 of the training data.
    Relies on module-level globals: ``args``, ``get_df``, ``get_transforms``,
    ``LandmarkDataset``, ``ModelClass``, ``ArcFaceLossAdaptiveMargin``,
    ``GradualWarmupSchedulerV2``, ``train_epoch`` and ``val_epoch``.
    """
    import pandas as pd  # local import: only needed for the demo-subset concat

    torch.cuda.set_device(1)

    # get dataframe
    df, out_dim = get_df(args.kernel_type, args.data_dir, args.train_step)
    print(f"out_dim = {out_dim}")

    # get adaptive margin: rarer landmark classes get a larger ArcFace margin,
    # scaled into [0.05, 0.5]
    tmp = np.sqrt(1 / np.sqrt(df['landmark_id'].value_counts().sort_index().values))
    margins = (tmp - tmp.min()) / (tmp.max() - tmp.min()) * 0.45 + 0.05

    # get augmentations
    transforms_train, transforms_val = get_transforms(args.image_size)
    print("\ndata augmentation is done!\n")

    # extract images in folders 0/0 .. 0/3 as a demo subset.
    # BUG FIX: DataFrame.append was removed in pandas 2.0 -- use pd.concat.
    demo_root = '/mnt/data/sjx/CS498_DL_Project/data/train/0/'
    demo_parts = [df[df['filepath'].str.startswith(demo_root + str(i))]
                  for i in range(4)]
    df = pd.concat(demo_parts)

    # get train and valid dataset; keep every 15th validation row to speed up eval
    df_train = df[df['fold'] != args.fold]
    df_valid = df[df['fold'] == args.fold].reset_index(drop=True).query("index % 15==0")
    dataset_train = LandmarkDataset(df_train, 'train', 'train', transform=transforms_train)
    dataset_valid = LandmarkDataset(df_valid, 'train', 'val', transform=transforms_val)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers,
                                               drop_last=True)
    print("dataset has been prepared!\n")

    # model
    print(torch.cuda.current_device())
    model = ModelClass(args.enet_type, out_dim=out_dim)
    # BUG FIX: "cuda:1, 3" is not a valid device string; the DataParallel
    # master replica must live on the first device in device_ids.
    model = nn.DataParallel(model, device_ids=[1, 3]).to("cuda:1")

    # loss func -- build the ArcFace loss once instead of on every call
    arc = ArcFaceLossAdaptiveMargin(margins=margins, s=80)

    def criterion(logits_m, target):
        return arc(logits_m, target, out_dim)

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)

    # load pretrained
    if len(args.load_from) > 0:
        checkpoint = torch.load(args.load_from,
                                map_location=lambda storage, loc: storage.cuda(3))
        state_dict = checkpoint['model_state_dict']
        # strip any 'module.' prefix left over from DataParallel checkpoints
        state_dict = {k[7:] if k.startswith('module.') else k: state_dict[k]
                      for k in state_dict.keys()}
        if args.train_step == 1:
            # step-1 training re-initialises the classification head
            del state_dict['metric_classify.weight']
            model.load_state_dict(state_dict, strict=False)
        else:
            model.load_state_dict(state_dict, strict=True)
        # if 'optimizer_state_dict' in checkpoint:
        #     optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        del checkpoint, state_dict
        torch.cuda.empty_cache()
        import gc
        gc.collect()

    # lr scheduler: warmup for one epoch, then cosine annealing
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(optimizer, multiplier=10,
                                                total_epoch=1,
                                                after_scheduler=scheduler_cosine)

    # train & valid loop
    gap_m_max = 0.
    model_file = os.path.join(args.model_dir, f'{args.kernel_type}_fold{args.fold}.pth')
    for epoch in range(args.start_from_epoch, args.n_epochs + 1):
        print(time.ctime(), 'Epoch:', epoch)
        train_loader = torch.utils.data.DataLoader(dataset_train,
                                                   batch_size=args.batch_size,
                                                   num_workers=args.num_workers,
                                                   shuffle=True,
                                                   drop_last=True)
        train_loss = train_epoch(model, train_loader, optimizer, criterion)
        val_loss, acc_m, gap_m = val_epoch(model, valid_loader, criterion)
        scheduler_warmup.step(epoch - 1)
        if args.local_rank == 0:
            content = time.ctime() + ' ' + f'Fold {args.fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, valid loss: {(val_loss):.5f}, acc_m: {(acc_m):.6f}, gap_m: {(gap_m):.6f}.'
            print(content)
            with open(os.path.join(args.log_dir, f'{args.kernel_type}.txt'), 'a') as appender:
                appender.write(content + '\n')
            # BUG FIX: only checkpoint (and advance gap_m_max) when the GAP
            # metric actually improves -- previously the checkpoint was
            # overwritten every epoch while the log claimed an improvement.
            if gap_m > gap_m_max:
                print('gap_m_max ({:.6f} --> {:.6f}). Saving model ...'.format(gap_m_max, gap_m))
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                }, model_file)
                gap_m_max = gap_m
        if epoch == args.stop_at_epoch:
            print(time.ctime(), 'Training Finished!')
            break

    # always keep a last-epoch snapshot alongside the best checkpoint
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, os.path.join(args.model_dir, f'{args.kernel_type}_fold{args.fold}_final.pth'))
def run(fold, df, meta_features, n_meta_features, transforms_train, transforms_val, mel_idx):
    """Train/validate one CV fold of the melanoma classifier.

    Saves three checkpoints under args.model_dir: best overall AUC, best AUC
    on the 20%-subset metric, and the final (last-epoch) weights.  Relies on
    module-level globals: args, DP, device, MelanomaDataset, ModelClass,
    RandomSampler, apex/amp, GradualWarmupSchedulerV2, train_epoch, val_epoch.
    """
    if args.DEBUG:
        # tiny random subsets plus a short schedule for a quick smoke run
        args.n_epochs = 5
        train_df = df[df['fold'] != fold].sample(args.batch_size * 5)
        valid_df = df[df['fold'] == fold].sample(args.batch_size * 5)
    else:
        train_df = df[df['fold'] != fold]
        valid_df = df[df['fold'] == fold]

    train_ds = MelanomaDataset(train_df, 'train', meta_features, transform=transforms_train)
    valid_ds = MelanomaDataset(valid_df, 'valid', meta_features, transform=transforms_val)
    # random sampling without replacement for the training loader
    train_loader = torch.utils.data.DataLoader(
        train_ds,
        batch_size=args.batch_size,
        sampler=RandomSampler(train_ds),
        num_workers=args.num_workers)
    valid_loader = torch.utils.data.DataLoader(
        valid_ds,
        batch_size=args.batch_size,
        num_workers=args.num_workers)

    model = ModelClass(
        args.enet_type,
        n_meta_features=n_meta_features,
        n_meta_dim=[int(nd) for nd in args.n_meta_dim.split(',')],
        out_dim=args.out_dim,
        pretrained=True)
    if DP:
        model = apex.parallel.convert_syncbn_model(model)
    model = model.to(device)

    auc_max = 0.
    auc_20_max = 0.
    best_file = os.path.join(args.model_dir, f'{args.kernel_type}_best_fold{fold}.pth')
    best_20_file = os.path.join(args.model_dir, f'{args.kernel_type}_best_20_fold{fold}.pth')
    final_file = os.path.join(args.model_dir, f'{args.kernel_type}_final_fold{fold}.pth')

    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    if args.use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if DP:
        model = nn.DataParallel(model)

    # one warmup epoch, then cosine annealing with warm restarts
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer, multiplier=10, total_epoch=1, after_scheduler=scheduler_cosine)

    print(len(train_ds), len(valid_ds))

    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Epoch {epoch}', f'Fold {fold}')

        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, acc, auc, auc_20 = val_epoch(
            model, valid_loader, mel_idx, is_ext=valid_df['is_ext'].values)

        content = time.ctime() + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, acc: {(acc):.4f}, auc: {(auc):.6f}, auc_20: {(auc_20):.6f}.'
        print(content)
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'), 'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2:
            scheduler_warmup.step()  # bug workaround

        if auc > auc_max:
            print('auc_max ({:.6f} --> {:.6f}). Saving model ...'.format(auc_max, auc))
            torch.save(model.state_dict(), best_file)
            auc_max = auc
        if auc_20 > auc_20_max:
            print('auc_20_max ({:.6f} --> {:.6f}). Saving model ...'.format(auc_20_max, auc_20))
            torch.save(model.state_dict(), best_20_file)
            auc_20_max = auc_20

    torch.save(model.state_dict(), final_file)
def run(fold, df, meta_features, n_meta_features, transforms_train, transforms_val, mel_idx):
    """Train and validate one fold of the melanoma classifier (k-fold CV).

    The packet whose id equals `fold` is held out for validation; all other
    packets are used for training.  Best-AUC, best-AUC-20% and final weights
    are written under args.model_dir.  Relies on module-level globals: args,
    DP, device, MelanomaDataset, ModelClass, RandomSampler, apex/amp,
    GradualWarmupSchedulerV2, train_epoch and val_epoch.
    """
    # following the k-fold method:
    if args.DEBUG:
        args.n_epochs = 5
        # validation uses the data packet whose id equals `fold`;
        # the remaining packets are used for training
        df_train = df[df['fold'] != fold].sample(args.batch_size * 5)
        df_valid = df[df['fold'] == fold].sample(args.batch_size * 5)
    else:
        df_train = df[df['fold'] != fold]
        df_valid = df[df['fold'] == fold]

    # instantiate our dataset objects (training + validation)
    dataset_train = MelanomaDataset(df_train, 'train', meta_features, transform=transforms_train)
    dataset_valid = MelanomaDataset(df_valid, 'valid', meta_features, transform=transforms_val)

    # instantiate our data loaders (training + validation)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        sampler=RandomSampler(dataset_train),
        num_workers=args.num_workers)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    # instantiate our model
    model = ModelClass(
        args.enet_type,  # e.g. a ResNet variant
        n_meta_features=n_meta_features,  # e.g. ['sex', 'age_approx', 'n_images', 'image_size']
        n_meta_dim=[int(nd) for nd in args.n_meta_dim.split(',')],
        out_dim=args.out_dim,
        pretrained=True)
    if DP:
        model = apex.parallel.convert_syncbn_model(model)
    model = model.to(device)

    # instantiate our best-metric trackers
    auc_max = 0.
    auc_20_max = 0.

    # define the files in which the model parameters are stored
    model_file = os.path.join(args.model_dir, f'{args.kernel_type}_best_fold{fold}.pth')
    model_file2 = os.path.join(args.model_dir, f'{args.kernel_type}_best_20_fold{fold}.pth')
    model_file3 = os.path.join(args.model_dir, f'{args.kernel_type}_final_fold{fold}.pth')

    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    if args.use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if DP:
        model = nn.DataParallel(model)

    # scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs - 1)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer, multiplier=10, total_epoch=1, after_scheduler=scheduler_cosine)

    print(len(dataset_train), len(dataset_valid))

    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Fold {fold}, Epoch {epoch}')
        # scheduler_warmup.step(epoch - 1)

        # train loss
        train_loss = train_epoch(model, train_loader, optimizer)
        # validation loss
        val_loss, acc, auc, auc_20 = val_epoch(
            model, valid_loader, mel_idx, is_ext=df_valid['is_ext'].values)

        content = time.ctime(
        ) + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, acc: {(acc):.4f}, auc: {(auc):.6f}, auc_20: {(auc_20):.6f}.'
        print(content)
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'), 'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2:
            scheduler_warmup.step()  # bug workaround

        # store the model parameters in the corresponding checkpoint files
        if auc > auc_max:
            print('auc_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_max, auc))
            torch.save(model.state_dict(), model_file)
            auc_max = auc
        if auc_20 > auc_20_max:
            print('auc_20_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_20_max, auc_20))
            torch.save(model.state_dict(), model_file2)
            auc_20_max = auc_20

    # store the last-epoch model parameters in model_file3
    torch.save(model.state_dict(), model_file3)
def main():
    """Distributed (apex DDP) training entry point for the landmark model.

    Relies on module-level globals: ``args``, ``get_df``, ``get_transforms``,
    ``LandmarkDataset``, ``ModelClass``, ``ArcFaceLossAdaptiveMargin``,
    ``apex``/``amp``, ``DistributedDataParallel``, ``GradualWarmupSchedulerV2``,
    ``train_epoch`` and ``val_epoch``.  Only rank 0 logs and checkpoints.
    """
    # get dataframe
    df, out_dim = get_df(args.kernel_type, args.data_dir, args.train_step)
    print(f"out_dim = {out_dim}")

    # adaptive ArcFace margins scaled into [0.05, 0.5]; rarer classes get
    # larger margins
    tmp = np.sqrt(
        1 / np.sqrt(df['landmark_id'].value_counts().sort_index().values))
    margins = (tmp - tmp.min()) / (tmp.max() - tmp.min()) * 0.45 + 0.05

    # get augmentations
    transforms_train, transforms_val = get_transforms(args.image_size)

    # get train and valid dataset; keep every 15th validation row to speed up eval
    df_train = df[df['fold'] != args.fold]
    df_valid = df[df['fold'] == args.fold].reset_index(
        drop=True).query("index % 15==0")
    dataset_train = LandmarkDataset(df_train, 'train', 'train', transform=transforms_train)
    dataset_valid = LandmarkDataset(df_valid, 'train', 'val', transform=transforms_val)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    # model
    model = ModelClass(args.enet_type, out_dim=out_dim)
    model = model.cuda()
    model = apex.parallel.convert_syncbn_model(model)

    # loss func -- build the ArcFace loss once instead of on every call
    arc = ArcFaceLossAdaptiveMargin(margins=margins, s=80)

    def criterion(logits_m, target):
        return arc(logits_m, target, out_dim)

    # optimizer (amp must wrap it before DDP)
    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    if args.use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # load pretrained
    if len(args.load_from) > 0:
        checkpoint = torch.load(args.load_from,
                                map_location='cuda:{}'.format(args.local_rank))
        state_dict = checkpoint['model_state_dict']
        # strip any 'module.' prefix left over from (D)DP checkpoints
        state_dict = {
            k[7:] if k.startswith('module.') else k: state_dict[k]
            for k in state_dict.keys()
        }
        if args.train_step == 1:
            # step-1 training re-initialises the classification head
            del state_dict['metric_classify.weight']
            model.load_state_dict(state_dict, strict=False)
        else:
            model.load_state_dict(state_dict, strict=True)
        # if 'optimizer_state_dict' in checkpoint:
        #     optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        del checkpoint, state_dict
        torch.cuda.empty_cache()
        import gc
        gc.collect()

    model = DistributedDataParallel(model, delay_allreduce=True)

    # lr scheduler: warmup for one epoch, then cosine annealing
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer, multiplier=10, total_epoch=1, after_scheduler=scheduler_cosine)

    # train & valid loop.
    # PERF: build the sampler/loader once; only the shuffle seed changes per
    # epoch (set_epoch) -- previously both were recreated every epoch.
    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset_train)
    train_loader = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers,
                                               shuffle=False,
                                               sampler=train_sampler,
                                               drop_last=True)
    gap_m_max = 0.
    model_file = os.path.join(args.model_dir, f'{args.kernel_type}_fold{args.fold}.pth')
    for epoch in range(args.start_from_epoch, args.n_epochs + 1):
        print(time.ctime(), 'Epoch:', epoch)
        scheduler_warmup.step(epoch - 1)
        train_sampler.set_epoch(epoch)

        train_loss = train_epoch(model, train_loader, optimizer, criterion)
        val_loss, acc_m, gap_m = val_epoch(model, valid_loader, criterion)

        if args.local_rank == 0:
            content = time.ctime() + ' ' + f'Fold {args.fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, valid loss: {(val_loss):.5f}, acc_m: {(acc_m):.6f}, gap_m: {(gap_m):.6f}.'
            print(content)
            with open(os.path.join(args.log_dir, f'{args.kernel_type}.txt'), 'a') as appender:
                appender.write(content + '\n')
            # BUG FIX: only checkpoint (and advance gap_m_max) when the GAP
            # metric actually improves -- previously the checkpoint was
            # overwritten every epoch while the log claimed an improvement.
            if gap_m > gap_m_max:
                print('gap_m_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                    gap_m_max, gap_m))
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                    }, model_file)
                gap_m_max = gap_m
        if epoch == args.stop_at_epoch:
            print(time.ctime(), 'Training Finished!')
            break

    # always keep a last-epoch snapshot alongside the best checkpoint
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        },
        os.path.join(args.model_dir, f'{args.kernel_type}_fold{args.fold}_final.pth'))
def run(fold, df, meta_features, n_meta_features, transforms_train, transforms_val, mel_idx):
    """Resume-and-train one CV fold of the melanoma classifier with AdamW.

    Warm-starts from the best checkpoint of a previous run when one exists,
    then trains for args.n_epochs, checkpointing best-AUC, best-AUC-20% and
    a final snapshot (with optimizer state).  Relies on module-level globals:
    args, device, MelanomaDataset, ModelClass, RandomSampler,
    GradualWarmupSchedulerV2, train_epoch and val_epoch.
    """
    if args.DEBUG:
        # tiny random subsets plus a short schedule for a quick smoke run
        args.n_epochs = 5
        df_train = df[df['fold'] != fold].sample(args.batch_size * 5)
        df_valid = df[df['fold'] == fold].sample(args.batch_size * 5)
    else:
        df_train = df[df['fold'] != fold]
        df_valid = df[df['fold'] == fold]

    dataset_train = MelanomaDataset(df_train, 'train', meta_features, transform=transforms_train)
    dataset_valid = MelanomaDataset(df_valid, 'valid', meta_features, transform=transforms_val)
    train_loader = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=args.batch_size,
                                               sampler=RandomSampler(dataset_train),
                                               num_workers=args.num_workers)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    model = ModelClass()
    model = model.to(device)

    auc_max = 0.
    auc_20_max = 0.
    model_file = os.path.join(args.model_dir, f'{args.kernel_type}_best_fold{fold}.pth')
    model_file2 = os.path.join(args.model_dir, f'{args.kernel_type}_best_20_fold{fold}.pth')
    model_file3 = os.path.join(args.model_dir, f'{args.kernel_type}_final_fold{fold}.pth')

    optimizer = optim.AdamW(model.parameters(), lr=args.init_lr,
                            weight_decay=args.weight_decay)
    # one warmup epoch, then cosine annealing with warm restarts
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer, multiplier=10, total_epoch=1, after_scheduler=scheduler_cosine)

    print(len(dataset_train), len(dataset_valid))

    # warm-start from the best checkpoint of a previous run, if present.
    # BUG FIX: the message previously named model_file3 although model_file
    # is what actually gets loaded below.
    print('Continuing with model from ' + model_file)
    try:
        checkpoint = torch.load(model_file)
        model.load_state_dict(checkpoint, strict=False)
    # BUG FIX: was a bare `except:` which also swallowed SystemExit and
    # KeyboardInterrupt; best-effort resume -- start fresh on failure.
    except Exception:
        print('error')

    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Fold {fold}, Epoch {epoch}')
        # scheduler_warmup.step(epoch - 1)

        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, acc, auc, auc_20 = val_epoch(model, valid_loader, mel_idx,
                                               is_ext=df_valid['is_ext'].values)

        content = time.ctime() + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, acc: {(acc):.4f}, auc: {(auc):.6f}, auc_20: {(auc_20):.6f}.'
        print(content)
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'), 'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2:
            scheduler_warmup.step()  # bug workaround

        if auc > auc_max:
            print('auc_max ({:.6f} --> {:.6f}). Saving model ...'.format(auc_max, auc))
            torch.save(model.state_dict(), model_file)
            auc_max = auc
        if auc_20 > auc_20_max:
            print('auc_20_max ({:.6f} --> {:.6f}). Saving model ...'.format(auc_20_max, auc_20))
            torch.save(model.state_dict(), model_file2)
            auc_20_max = auc_20

    # last-epoch snapshot; includes the optimizer state for resuming
    torch.save({
        'net': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, model_file3)
def run(folds, df, transforms_train, transforms_val):
    """Train the retinal classifier on the folds in `folds`, validating on
    the remaining folds.

    Logs parameter/class counts and per-epoch metrics, and checkpoints the
    best-mean-score and final weights.  In DEBUG mode all artifacts are
    written into a '/debug' sub-directory.  Relies on module-level globals:
    args, DP, device, RetinalDataset, ModelClass, RandomSampler,
    GradualWarmupSchedulerV2, train_epoch and val_epoch.
    """
    if args.DEBUG:
        # tiny random subsets plus a short schedule for a quick smoke run
        args.n_epochs = 3
        df_train = df[df['fold'].isin(folds)].sample(args.batch_size * 4)
        df_valid = df[~df['fold'].isin(folds)].sample(args.batch_size * 4)
    else:
        df_train = df[df['fold'].isin(folds)]
        df_valid = df[~df['fold'].isin(folds)]

    dataset_train = RetinalDataset(df_train, 'train', transform=transforms_train)
    dataset_valid = RetinalDataset(df_valid, 'valid', transform=transforms_val)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        sampler=RandomSampler(dataset_train),
        num_workers=args.num_workers)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    model = ModelClass(args.enet_type,
                       out_dim=args.out_dim,
                       pretrained=True,
                       freeze_cnn=args.freeze_cnn,
                       load_model=args.load_model,
                       pretrain_cnn=args.pretrain_cnn,
                       pretrain_file=args.pretrain_file)
    para_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    content = f'Number of trainable parameters:{para_num}\n'
    # NOTE: dead `if DP: pass` branches removed -- DP handling was never
    # implemented in this variant.
    model = model.to(device)

    score_max = 0.

    # DEBUG runs write all artifacts into a '/debug' sub-directory; compute
    # the paths once instead of branching at every use site.
    model_dir = (args.model_dir + '/debug') if args.DEBUG else args.model_dir
    log_dir = (args.log_dir + '/debug') if args.DEBUG else args.log_dir
    log_path = os.path.join(log_dir, f'log_{args.kernel_type}.txt')
    model_file_best = os.path.join(model_dir, f'{args.kernel_type}_best.pth')
    model_file_final = os.path.join(model_dir, f'{args.kernel_type}_final.pth')

    if args.freeze_cnn:
        # only optimize the parameters left unfrozen (non-CNN head)
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                               lr=args.init_lr)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.init_lr)

    # one warmup epoch, then cosine annealing with warm restarts
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer, multiplier=10, total_epoch=1, after_scheduler=scheduler_cosine)

    nums = dataset_train.get_num()
    content += f'total num of train:{len(dataset_train)},class nums:{nums}' + '\n'
    nums = dataset_valid.get_num()
    content += f'total num of val:{len(dataset_valid)},class nums:{nums}' + '\n'
    with open(log_path, 'a') as appender:
        appender.write(content)

    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Epoch {epoch}')
        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, mean_score, scores = val_epoch(model, valid_loader)

        content = time.ctime(
        ) + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {val_loss:.5f}, mean_score: {mean_score:.4f}, scores: {scores[0]:.4f} {scores[1]:.4f} {scores[2]:.4f} {scores[3]:.4f} {scores[4]:.4f} {scores[5]:.4f} {scores[6]:.4f}.'
        print(content)
        with open(log_path, 'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2:
            scheduler_warmup.step()  # bug workaround

        if mean_score > score_max:
            print('score_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                score_max, mean_score))
            torch.save(model.state_dict(), model_file_best)
            score_max = mean_score

    torch.save(model.state_dict(), model_file_final)
def main(args):
    """Train a Shopee product-matching model for one fold (or the full set).

    Checkpoints the best-f1 weights during training and a separate final
    snapshot afterwards.  Relies on module-level globals: get_df,
    get_transforms, ShoppeDataset, decode_config, Resnest50, Model,
    get_criterion, weight_file, LabelEncoder, GradualWarmupSchedulerV2,
    train_epoch and val_epoch.
    """
    # get dataframe
    df = get_df(args.groups)

    # adaptive ArcFace margins scaled into [0.05, 0.5]; rarer label groups
    # get larger margins
    tmp = np.sqrt(
        1 / np.sqrt(df['label_group'].value_counts().sort_index().values))
    margins = (tmp - tmp.min()) / (tmp.max() - tmp.min()) * 0.45 + 0.05

    # get augmentations
    transforms_train, transforms_val = get_transforms(args.image_size, args.stage)

    # get train and valid dataset (train on everything when --full is set)
    df_train = df[df['fold'] != args.fold] if not args.full else df
    # BUG FIX: copy before assigning -- writing into a pandas slice view
    # triggers SettingWithCopy and may silently fail to update.
    df_train = df_train.copy()
    df_train['label_group'] = LabelEncoder().fit_transform(df_train.label_group)
    df_valid = df[df['fold'] == args.fold]
    out_dim = df_train.label_group.nunique()
    print(f"out_dim = {out_dim}")
    dataset_train = ShoppeDataset(df_train, 'train', transform=transforms_train)
    dataset_valid = ShoppeDataset(df_valid, 'val', transform=transforms_val)
    print(
        f'Train on {len(df_train)} images, validate on {len(df_valid)} images')
    train_loader = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               shuffle=True,
                                               drop_last=True)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    loss_config = decode_config(args.loss_config)

    # model
    if args.enet_type == 'resnest50':
        model = Resnest50(out_dim=out_dim, loss_config=loss_config, args=args)
    else:
        model = Model(args.enet_type, out_dim=out_dim, loss_config=loss_config, args=args)
    model = model.cuda()

    # loss func
    criterion = get_criterion(args, out_dim, margins)

    # optimizer
    optimizer = optim.AdamW(model.parameters(), lr=args.init_lr)

    # load pretrained
    if args.load_from and args.load_from != 'none':
        checkpoint = torch.load(args.load_from, map_location='cuda:0')
        state_dict = checkpoint['model_state_dict']
        # strip any 'module.' prefix left over from DataParallel checkpoints
        state_dict = {
            k[7:] if k.startswith('module.') else k: state_dict[k]
            for k in state_dict.keys()
        }
        model.load_state_dict(state_dict, strict=True)
        del checkpoint, state_dict
        torch.cuda.empty_cache()
        gc.collect()
        print(f"Loaded weight from {args.load_from}")

    # lr scheduler: warmup (longer in stage 1), then cosine annealing
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    warmup_epochs = args.warmup_epochs if args.stage == 1 else 1
    print(warmup_epochs)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer, multiplier=10, total_epoch=warmup_epochs,
        after_scheduler=scheduler_cosine)

    # train & valid loop
    best_score = -1
    model_file = os.path.join(
        args.model_dir,
        weight_file(args.kernel_type, args.fold, args.stage,
                    loss_config.loss_type, out_dim))
    # BUG FIX: the post-loop snapshot used to be written to model_file as
    # well, clobbering the best-f1 checkpoint saved during training.  Write
    # the final snapshot to a distinct '_final' path instead.
    _root, _ext = os.path.splitext(model_file)
    final_model_file = _root + '_final' + _ext
    for epoch in range(args.start_from_epoch, args.n_epochs + 1):
        print(time.ctime(), f'Epoch: {epoch}/{args.n_epochs}')
        scheduler_warmup.step(epoch - 1)

        train_loss, acc_list = train_epoch(model, train_loader, optimizer, criterion)
        f1score = val_epoch(model, valid_loader, criterion, df_valid, args)

        content = time.ctime() + ' ' + (
            f'Fold {args.fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f},'
            f' train acc {np.mean(acc_list):.5f}, f1score: {(f1score):.6f}.')
        print(content)
        with open(os.path.join(args.log_dir, f'{args.kernel_type}.txt'), 'a') as appender:
            appender.write(content + '\n')

        if f1score > best_score:
            print('best f1 score ({:.6f} --> {:.6f}). Saving model ...'.format(
                best_score, f1score))
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                }, model_file)
            best_score = f1score
        if epoch == args.stop_at_epoch:
            print(time.ctime(), 'Training Finished!')
            break

    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, final_model_file)