def train(model_name, image_size):
    """Train a 6-label classifier with 5-fold CV, logging to snapshot_path/log.csv.

    model_name must name a model class importable in this module (it is
    instantiated via eval); image_size is forwarded to generate_transforms.
    Relies on module-level globals: snapshot_path, csv_path, train_batch_size,
    val_batch_size, workers.  Saves a checkpoint per fold every 5 epochs.
    """
    if not os.path.exists(snapshot_path):
        os.makedirs(snapshot_path)
    header = ['Epoch', 'Learning rate', 'Time', 'Train Loss', 'Val Loss']
    if not os.path.isfile(snapshot_path + '/log.csv'):
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)
    df_all = pd.read_csv(csv_path)
    kfold_path_train = '../data/fold_5_by_study/'
    kfold_path_val = '../data/fold_5_by_study_image/'

    for num_fold in range(5):
        print('fold_num:', num_fold)
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([num_fold])
        # Context managers guarantee the fold-list files are closed on error.
        with open(kfold_path_train + 'fold' + str(num_fold) + '/train.txt',
                  'r') as f_train:
            c_train = f_train.readlines()
        with open(kfold_path_val + 'fold' + str(num_fold) + '/val.txt',
                  'r') as f_val:
            c_val = f_val.readlines()
        c_train = [s.replace('\n', '') for s in c_train]
        c_val = [s.replace('\n', '') for s in c_val]
        # for debug
        # c_train = c_train[0:1000]
        # c_val = c_val[0:4000]
        print('train dataset study num:', len(c_train),
              ' val dataset image num:', len(c_val))
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['train dataset:', len(c_train), ' val dataset:', len(c_val)])
            writer.writerow([
                'train_batch_size:', train_batch_size, 'val_batch_size:',
                val_batch_size
            ])
        train_transform, val_transform = generate_transforms(image_size)
        train_loader, val_loader = generate_dataset_loader(
            df_all, c_train, train_transform, train_batch_size, c_val,
            val_transform, val_batch_size, workers)
        # NOTE: eval() is only safe because callers pass trusted, hard-coded
        # class names — never feed it external input.
        model = eval(model_name + '()')
        model = model.cuda()
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=0.0005,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=0.00002)
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-5)
        model = torch.nn.DataParallel(model)
        loss_cls = torch.nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor(
            [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]).cuda())
        trMaxEpoch = 80
        for epochID in range(0, trMaxEpoch):
            start_time = time.time()
            model.train()
            trainLoss = 0
            # Bug fix: the loss normaliser previously started at 10, which
            # deflated the reported train loss; the sibling training
            # functions in this file start it at 0.
            lossTrainNorm = 0
            if epochID < 10:
                # Warm-up: keep the initial LR untouched for 10 epochs.
                pass
            elif epochID < 80:
                if epochID != 10:
                    scheduler.step()
                    scheduler = warm_restart(scheduler, T_mult=2)
            else:
                optimizer.param_groups[0]['lr'] = 1e-5
            for batchID, (input, target) in enumerate(train_loader):
                if batchID == 0:
                    ss_time = time.time()
                print(str(batchID) + '/' +
                      str(int(len(c_train) / train_batch_size)) + ' ' +
                      str((time.time() - ss_time) / (batchID + 1)),
                      end='\r')
                varInput = torch.autograd.Variable(input)
                target = target.view(-1, 6).contiguous().cuda()
                varTarget = torch.autograd.Variable(target.contiguous().cuda())
                varOutput = model(varInput)
                lossvalue = loss_cls(varOutput, varTarget)
                trainLoss = trainLoss + lossvalue.item()
                lossTrainNorm = lossTrainNorm + 1
                lossvalue.backward()
                optimizer.step()
                optimizer.zero_grad()
                del lossvalue
            trainLoss = trainLoss / lossTrainNorm
            # Validate on epoch 0 and every 5th epoch; valLoss/auc persist
            # from the last evaluation for the epochs in between.
            if (epochID + 1) % 5 == 0 or epochID > 79 or epochID == 0:
                valLoss, auc, loss_list, loss_sum = epochVal(
                    model, val_loader, loss_cls, c_val, val_batch_size)
            epoch_time = time.time() - start_time
            if (epochID + 1) % 5 == 0 or epochID > 79:
                torch.save(
                    {
                        'epoch': epochID + 1,
                        'state_dict': model.state_dict(),
                        'valLoss': valLoss
                    }, snapshot_path + '/model_epoch_' + str(epochID) + '_' +
                    str(num_fold) + '.pth')
            result = [
                epochID,
                round(optimizer.state_dict()['param_groups'][0]['lr'], 6),
                round(epoch_time, 0),
                round(trainLoss, 5),
                round(valLoss, 5), 'auc:', auc, 'loss:', loss_list, loss_sum
            ]
            print(result)
            with open(snapshot_path + '/log.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(result)
        del model
def train_one_model(model_name, image_size):
    """Train a joint classification + segmentation model with 5-fold CV.

    model_name must name a model class importable in this module (it is
    instantiated via eval) whose forward returns (cls_logits, seg_logits).
    Relies on module-level globals: snapshot_path, csv_path, path_data,
    train_batch_size, val_batch_size, workers.  Checkpoints every epoch.
    """
    if not os.path.exists(snapshot_path):
        os.makedirs(snapshot_path)
    header = ['Epoch', 'Learning rate', 'Time', 'Train Loss', 'Val Loss',
              'best_thr_with_no_mask', 'best_dice_with_no_mask']
    # Human-readable legend describing the columns of each `result` row.
    hheader = "[epochID, round(optimizer.state_dict()['param_groups'][0]['lr'], 6), round(epoch_time, 0), round(trainLoss, 4), round(trainLoss_seg, 3), round(trainLoss_cls, 3), round(valLoss, 3), round(valLoss_seg, 3), round(valLoss_cls, 3), round(np.mean(auc), 3), round(np.mean(max_threshold_list), 3), round(np.mean(max_result_f1_list), 3), round(best_thr_with_no_mask, 3), round(float(best_dice_with_no_mask), 3), round(best_f1_thr_seg_to_cls, 3), round(best_f1_value_seg_to_cls, 3), round(best_thrr_without_no_mask, 3), round(best_dicer_without_no_mask, 3), round(best_thrr_with_no_mask_2, 3), round(best_dicer_with_no_mask_2, 3)]"
    print(hheader)
    if not os.path.isfile(snapshot_path + '/log.csv'):
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            # Bug fix: writerow() on a bare string emits one CSV cell per
            # character; wrap it in a list so the legend lands in one cell.
            writer.writerow([hheader])
    df_all = pd.read_csv(csv_path)
    kfold_path = path_data['k_fold_path_cls']

    for num_fold in range(5):
        print(num_fold)
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([num_fold])
        with open(kfold_path + 'fold' + str(num_fold) + '/train.txt',
                  'r') as f_train:
            c_train = f_train.readlines()
        with open(kfold_path + 'fold' + str(num_fold) + '/val.txt',
                  'r') as f_val:
            c_val = f_val.readlines()
        c_train = [s.replace('\n', '') for s in c_train]
        c_val = [s.replace('\n', '') for s in c_val]
        print('train dataset:', len(c_train),
              ' val dataset c_val_without_no_mask:', 476,
              ' val dataset c_val_with_no_mask:', len(c_val))
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['train dataset:', len(c_train),
                             ' val dataset c_val_without_no_mask:', 476,
                             ' val dataset c_val_with_no_mask:', len(c_val)])
            writer.writerow(['train_batch_size:', train_batch_size,
                             'val_batch_size:', val_batch_size])
        train_transform, val_transform = generate_transforms(image_size)
        train_loader, val_loader = generate_dataset_loader_cls_seg(
            df_all, c_train, train_transform, train_batch_size, c_val,
            val_transform, val_batch_size, workers)
        # NOTE: eval() is only safe for trusted, hard-coded class names.
        model = eval(model_name + '()')
        model = torch.nn.DataParallel(model).cuda()
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=0.0005,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=0)
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-5)

        def loss_cls_com(input, target):
            # Combined classification loss: focal loss + BCE-with-logits.
            loss_1 = FocalLoss()
            loss_2 = torch.nn.BCEWithLogitsLoss()
            loss = loss_1(input, target) + loss_2(input, target)
            return loss

        loss_cls = FocalLoss()
        loss_seg = torch.nn.BCEWithLogitsLoss()
        trMaxEpoch = 34
        lossMIN = 100000
        val_dice_max = 0
        for epochID in range(0, trMaxEpoch):
            start_time = time.time()
            model.train()
            trainLoss = 0
            lossTrainNorm = 0
            trainLoss_cls = 0
            trainLoss_seg = 0
            # LR schedule: warm restarts for 30 epochs, then two fixed steps.
            if epochID < 30:
                if epochID != 0:
                    scheduler.step()
                    scheduler = warm_restart(scheduler, T_mult=2)
            elif epochID > 29 and epochID < 32:
                optimizer.param_groups[0]['lr'] = 1e-5
            else:
                optimizer.param_groups[0]['lr'] = 5e-6
            for batchID, (input, target_seg, target_cls) in enumerate(
                    train_loader):
                if batchID == 0:
                    ss_time = time.time()
                print(str(batchID) + '/' +
                      str(int(len(c_train) / train_batch_size)) + ' ' +
                      str((time.time() - ss_time) / (batchID + 1)),
                      end='\r')
                varInput = torch.autograd.Variable(input)
                # Bug fix: .cuda(async=True) is a SyntaxError on Python 3.7+
                # ("async" became a keyword); torch renamed the argument to
                # non_blocking with identical semantics.
                varTarget_seg = torch.autograd.Variable(
                    target_seg.contiguous().cuda(non_blocking=True))
                varTarget_cls = torch.autograd.Variable(
                    target_cls.contiguous().cuda(non_blocking=True))
                varOutput_cls, varOutput_seg = model(varInput)
                varTarget_seg = varTarget_seg.float()
                lossvalue_seg = loss_seg(varOutput_seg, varTarget_seg)
                trainLoss_seg = trainLoss_seg + lossvalue_seg.item()
                lossvalue_cls = loss_cls_com(varOutput_cls, varTarget_cls)
                trainLoss_cls = trainLoss_cls + lossvalue_cls.item()
                lossvalue = lossvalue_cls + lossvalue_seg
                lossTrainNorm = lossTrainNorm + 1
                optimizer.zero_grad()
                lossvalue.backward()
                optimizer.step()
                del lossvalue_seg, lossvalue_cls, lossvalue
            trainLoss_seg = trainLoss_seg / lossTrainNorm
            trainLoss_cls = trainLoss_cls / lossTrainNorm
            trainLoss = trainLoss_seg + trainLoss_cls
            if epochID % 1 == 0:
                (valLoss, valLoss_seg, valLoss_cls, auc, max_threshold_list,
                 max_result_f1_list, precision_list, recall_list) = epochVal(
                     num_fold, model, val_loader, loss_seg, loss_cls, c_val,
                     val_batch_size)
            epoch_time = time.time() - start_time
            if epochID % 1 == 0:
                torch.save(
                    {
                        'epoch': epochID + 1,
                        'state_dict': model.state_dict(),
                        'valLoss': valLoss
                    }, snapshot_path + '/model_epoch_' + str(epochID) + '_' +
                    str(num_fold) + '.pth')
            result = [
                epochID,
                round(optimizer.state_dict()['param_groups'][0]['lr'], 6),
                round(epoch_time, 0),
                round(trainLoss, 4),
                round(trainLoss_seg, 3),
                round(trainLoss_cls, 3),
                round(valLoss, 3),
                round(valLoss_seg, 3),
                round(valLoss_cls, 3),
                round(np.mean(auc), 3),
                round(np.mean(max_threshold_list), 3),
                round(np.mean(max_result_f1_list), 3)
            ]
            print(result)
            with open(snapshot_path + '/log.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(result)
        del model
def train_one_model(model_name, img_size, use_chexpert, path_data):
    """Train a binary segmentation model (apex mixed precision) with 5-fold CV.

    model_name selects one of {deep_se50, unet_ef3, unet_ef5}; img_size is the
    square resize used for train/val transforms; use_chexpert additionally
    mixes in pseudo-labelled CheXpert images from path_data['extra_img_csv'].
    Relies on module-level globals: snapshot_path, csv_path, train_batch_size,
    val_batch_size, workers.  Checkpoints every epoch as .pth.tar.

    Raises:
        ValueError: if model_name is not one of the supported backbones.
    """
    RESIZE_SIZE = img_size
    # Heavy train-time augmentation: photometric jitter, blur, flip and an
    # affine shift/scale/rotate, followed by ImageNet normalisation.
    train_transform = albumentations.Compose([
        albumentations.Resize(RESIZE_SIZE, RESIZE_SIZE),
        albumentations.OneOf([
            albumentations.RandomGamma(gamma_limit=(60, 120), p=0.9),
            albumentations.RandomBrightnessContrast(brightness_limit=0.2,
                                                    contrast_limit=0.2,
                                                    p=0.9),
            albumentations.CLAHE(clip_limit=4.0,
                                 tile_grid_size=(4, 4),
                                 p=0.9),
        ]),
        albumentations.OneOf([
            albumentations.Blur(blur_limit=4, p=1),
            albumentations.MotionBlur(blur_limit=4, p=1),
            albumentations.MedianBlur(blur_limit=4, p=1)
        ],
                             p=0.5),
        albumentations.HorizontalFlip(p=0.5),
        albumentations.ShiftScaleRotate(shift_limit=0.2,
                                        scale_limit=0.2,
                                        rotate_limit=20,
                                        interpolation=cv2.INTER_LINEAR,
                                        border_mode=cv2.BORDER_CONSTANT,
                                        p=1),
        albumentations.Normalize(mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225),
                                 max_pixel_value=255.0,
                                 p=1.0)
    ])
    # Validation: resize + normalise only.
    val_transform = albumentations.Compose([
        albumentations.Resize(RESIZE_SIZE, RESIZE_SIZE, p=1),
        albumentations.Normalize(mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225),
                                 max_pixel_value=255.0,
                                 p=1.0)
    ])
    if not os.path.exists(snapshot_path):
        os.makedirs(snapshot_path)
    header = [
        'Epoch', 'Learning rate', 'Time', 'Train Loss', 'Val Loss',
        'best_thr_with_no_mask', 'best_dice_with_no_mask',
        'best_thr_without_no_mask', 'best_dice_without_no_mask'
    ]
    if not os.path.isfile(snapshot_path + '/log.csv'):
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)
    kfold_path = path_data['k_fold_path']
    extra_data = path_data['extra_img_csv']

    for num_fold in range(5):
        print(num_fold)
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([num_fold])
        if use_chexpert:  # also use the per-fold CheXpert pseudo-label csv
            df1 = pd.read_csv(csv_path)
            df2 = pd.read_csv(extra_data +
                              'chexpert_mask_{}.csv'.format(num_fold + 1))
            # Fix: DataFrame.append was removed in pandas 2.0; pd.concat
            # with ignore_index is the drop-in equivalent.
            df_all = pd.concat([df1, df2], ignore_index=True)
            with open(kfold_path + 'fold' + str(num_fold) + '/train.txt',
                      'r') as f_train:
                c_train = f_train.readlines()
            with open(kfold_path + 'fold' + str(num_fold) + '/val.txt',
                      'r') as f_val:
                c_val = f_val.readlines()
            with open(extra_data +
                      '/chexpert_list_{}.txt'.format(num_fold + 1),
                      'r') as f_fake:
                c_fake = f_fake.readlines()
            c_train = c_fake + c_train
        else:
            df_all = pd.read_csv(csv_path)
            with open(kfold_path + 'fold' + str(num_fold) + '/train.txt',
                      'r') as f_train:
                c_train = f_train.readlines()
            with open(kfold_path + 'fold' + str(num_fold) + '/val.txt',
                      'r') as f_val:
                c_val = f_val.readlines()
        c_train = [s.replace('\n', '') for s in c_train]
        c_val = [s.replace('\n', '') for s in c_val]
        print('train dataset:', len(c_train),
              ' val dataset c_val_without_no_mask:', 476,
              ' val dataset c_val_with_no_mask:', len(c_val))
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([
                'train dataset:', len(c_train),
                ' val dataset c_val_without_no_mask:', 476,
                ' val dataset c_val_with_no_mask:', len(c_val)
            ])
            writer.writerow([
                'train_batch_size:', train_batch_size, 'val_batch_size:',
                val_batch_size
            ])
        train_loader, val_loader = generate_dataset_loader_cls_seg(
            df_all, c_train, train_transform, train_batch_size, c_val,
            val_transform, val_batch_size, workers)
        if model_name == 'deep_se50':
            from semantic_segmentation.network.deepv3 import DeepSRNX50V3PlusD_m1  # r
            model = DeepSRNX50V3PlusD_m1(1, SoftDiceLoss_binary())
        elif model_name == 'unet_ef3':
            from ef_unet import EfficientNet_3_unet
            model = EfficientNet_3_unet()
        elif model_name == 'unet_ef5':
            from ef_unet import EfficientNet_5_unet
            model = EfficientNet_5_unet()
        else:
            # Fail fast instead of continuing with model = None, which would
            # only crash later inside apex with a confusing error.
            raise ValueError('No model name in it: ' + str(model_name))
        model = apex.parallel.convert_syncbn_model(model).cuda()
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=5e-4,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=0)
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-6)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        model = torch.nn.DataParallel(model)
        loss_seg = SoftDiceLoss_binary()
        trMaxEpoch = 44
        lossMIN = 100000
        val_dice_max = 0
        for epochID in range(0, trMaxEpoch):
            start_time = time.time()
            model.train()
            trainLoss = 0  # was initialised to a dead value (30); it is
            # always overwritten by trainLoss_seg below.
            lossTrainNorm = 0
            trainLoss_cls = 0
            trainLoss_seg = 0
            # LR schedule: warm restarts for 40 epochs, then two fixed steps.
            if epochID < 40:
                if epochID != 0:
                    scheduler.step()
                    scheduler = warm_restart(scheduler, T_mult=2)
            elif epochID > 39 and epochID < 42:
                optimizer.param_groups[0]['lr'] = 1e-5
            else:
                optimizer.param_groups[0]['lr'] = 5e-6
            for batchID, (input, target_seg, target_cls) in enumerate(
                    train_loader):
                if batchID == 0:
                    ss_time = time.time()
                print(str(batchID) + '/' +
                      str(int(len(c_train) / train_batch_size)) + ' ' +
                      str((time.time() - ss_time) / (batchID + 1)),
                      end='\r')
                varInput = torch.autograd.Variable(input)
                # Bug fix: .cuda(async=True) is a SyntaxError on Python 3.7+;
                # torch renamed the argument to non_blocking.
                varTarget_seg = torch.autograd.Variable(
                    target_seg.contiguous().cuda(non_blocking=True))
                varOutput_seg = model(varInput)
                varTarget_seg = varTarget_seg.float()
                lossvalue_seg = loss_seg(varOutput_seg, varTarget_seg)
                trainLoss_seg = trainLoss_seg + lossvalue_seg.item()
                lossvalue = lossvalue_seg
                lossTrainNorm = lossTrainNorm + 1
                optimizer.zero_grad()
                # amp scales the loss so fp16 gradients do not underflow.
                with amp.scale_loss(lossvalue, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
            trainLoss_seg = trainLoss_seg / lossTrainNorm
            trainLoss = trainLoss_seg
            best_thr_with_no_mask = -1
            best_dice_with_no_mask = -1
            best_thr_without_no_mask = -1
            best_dice_without_no_mask = -1
            valLoss_seg = -1
            if epochID % 1 == 0:
                (valLoss_seg, best_thr_with_no_mask, best_dice_with_no_mask,
                 best_thr_without_no_mask,
                 best_dice_without_no_mask) = epochVal(
                     model, val_loader, loss_seg, c_val, val_batch_size
                 )  # (model, dataLoader, loss_seg, c_val, val_batch_size):
            epoch_time = time.time() - start_time
            if epochID % 1 == 0:
                torch.save(
                    {
                        'epoch': epochID + 1,
                        'state_dict': model.state_dict(),
                        'valLoss': 0,
                        'best_thr_with_no_mask': best_thr_with_no_mask,
                        'best_dice_with_no_mask':
                        float(best_dice_with_no_mask),
                        'best_thr_without_no_mask': best_thr_without_no_mask,
                        'best_dice_without_no_mask':
                        float(best_dice_without_no_mask)
                    }, snapshot_path + '/model_epoch_' + str(epochID) + '_' +
                    str(num_fold) + '.pth.tar')
            result = [
                epochID,
                round(optimizer.state_dict()['param_groups'][0]['lr'], 6),
                round(epoch_time, 0),
                round(trainLoss, 4),
                round(valLoss_seg, 4),
                round(best_thr_with_no_mask, 3),
                round(float(best_dice_with_no_mask), 3),
                round(best_thr_without_no_mask, 3),
                round(float(best_dice_without_no_mask), 3)
            ]
            print(result)
            with open(snapshot_path + '/log.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(result)
        del model
def main():
    """Train an smp.FPN coastline-segmentation model and save the best one.

    Reads pre-split train/val images from ./input/another_cv_3/, logs dice
    loss and IoU to TensorBoard, and checkpoints the model whenever the
    validation IoU improves.
    """
    data_dir = './input/another_cv_3/'
    epoch = 100
    batch_size = 1
    # NOTE(review): `lr` is only used in the TensorBoard run name below; the
    # optimizer is actually created with lr=0.0005 — confirm which is intended.
    lr = 0.0001
    # Directories of the split image data.
    x_train_dir = os.path.join(data_dir, 'train_images_png')
    y_train_dir = os.path.join(data_dir, 'train_images_inpainted_labels')
    x_valid_dir = os.path.join(data_dir, 'val_images_png')
    y_valid_dir = os.path.join(data_dir, 'val_images_inpainted_labels')
    # File lists of the split image data.
    # NOTE(review): these two lists are never used afterwards.
    x_train_files = glob.glob(x_train_dir + '/*')
    y_train_files = glob.glob(y_train_dir + '/*')
    # ENCODER = 'resnet18'
    ENCODER = 'inceptionv4'
    ENCODER_WEIGHTS = 'imagenet'
    CLASSES = ['coastline']
    ACTIVATION = 'sigmoid'  # could be None for logits or 'softmax2d' for multicalss segmentation
    DEVICE = 'cuda'

    # create segmentation model with pretrained encoder
    # model = smp.Unet(
    #     encoder_name=ENCODER,
    #     encoder_weights=ENCODER_WEIGHTS,
    #     classes=len(CLASSES),
    #     activation=ACTIVATION,
    # )
    model = smp.FPN(
        encoder_name=ENCODER,
        encoder_weights=ENCODER_WEIGHTS,
        classes=len(CLASSES),
        activation=ACTIVATION,
    )

    # TensorBoard setup; the run directory doubles as the checkpoint folder.
    writer = SummaryWriter(comment=f'_ENCODER_{ENCODER}_LR_{lr}')
    log_dir = writer.log_dir

    preprocessing_fn = smp.encoders.get_preprocessing_fn(
        ENCODER, ENCODER_WEIGHTS)

    train_dataset = Dataset(
        x_train_dir,
        y_train_dir,
        augmentation=get_training_augmentation(),
        preprocessing=get_preprocessing(preprocessing_fn),
        classes=CLASSES,
    )
    valid_dataset = Dataset(
        x_valid_dir,
        y_valid_dir,
        augmentation=get_validation_augmentation(),
        preprocessing=get_preprocessing(preprocessing_fn),
        classes=CLASSES,
    )
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=1,
                              shuffle=False,
                              num_workers=0)

    # Loss setup.
    loss = smp.utils.losses.DiceLoss()
    # loss = smp.utils.losses.BCELoss()
    metrics = [
        smp.utils.metrics.IoU(threshold=0.5),
    ]

    # Optimizer setup.
    # optimizer = torch.optim.Adam([
    #     dict(params=model.parameters(), lr=lr),
    # ])
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.0005,
                                 betas=(0.9, 0.999),
                                 eps=1e-08,
                                 weight_decay=0)
    scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-5)

    # Training-loop setup (smp runner objects drive the per-epoch loops).
    train_epoch = smp.utils.train.TrainEpoch(
        model,
        loss=loss,
        metrics=metrics,
        optimizer=optimizer,
        device=DEVICE,
        verbose=True,
    )
    valid_epoch = smp.utils.train.ValidEpoch(
        model,
        loss=loss,
        metrics=metrics,
        device=DEVICE,
        verbose=True,
    )

    max_score = 0
    # Curves collected so train/val loss and IoU can be graphed afterwards.
    x_epoch_data = []
    train_dice_loss = []
    train_iou_score = []
    valid_dice_loss = []
    valid_iou_score = []
    for i in range(epoch):
        # LR schedule: warm restarts for 30 epochs, then two fixed steps.
        if i < 30:
            if i != 0:
                scheduler.step()
                scheduler = warm_restart(scheduler, T_mult=2)
        elif i > 29 and i < 32:
            optimizer.param_groups[0]['lr'] = 1e-5
        else:
            optimizer.param_groups[0]['lr'] = 5e-6
        print('\nEpoch: {}'.format(i))
        train_logs = train_epoch.run(train_loader)
        valid_logs = valid_epoch.run(valid_loader)
        x_epoch_data.append(i)
        train_dice_loss.append(train_logs['dice_loss'])
        train_iou_score.append(train_logs['iou_score'])
        valid_dice_loss.append(valid_logs['dice_loss'])
        valid_iou_score.append(valid_logs['iou_score'])
        writer.add_scalar('Loss/train', train_logs['dice_loss'], i)
        writer.add_scalar('iou/train', train_logs['iou_score'], i)
        writer.add_scalar('Loss/valid', valid_logs['dice_loss'], i)
        writer.add_scalar('iou/valid', valid_logs['iou_score'], i)

        # do something (save model, change lr, etc.)
        if max_score < valid_logs['iou_score']:
            max_score = valid_logs['iou_score']
            torch.save(model, log_dir + '/best_model.pth')
            print('Model saved!')
def train_one_model(model_name, image_size):
    """Train a 6-label classifier (apex mixed precision) with 5-fold CV.

    model_name must name a model class importable in this module (it is
    instantiated via eval); image_size is forwarded to generate_transforms.
    Relies on module-level globals: snapshot_path, csv_path, train_batch_size,
    val_batch_size, workers.  Checkpoints every 5 epochs per fold.
    """
    if not os.path.exists(snapshot_path):
        os.makedirs(snapshot_path)
    header = ['Epoch', 'Learning rate', 'Time', 'Train Loss', 'Val Loss']
    if not os.path.isfile(snapshot_path + '/log.csv'):
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)
    df_all = pd.read_csv(csv_path)
    kfold_path_train = '../data/fold_5_by_study/'
    kfold_path_val = '../data/fold_5_by_study_image/'

    for num_fold in range(5):
        print(num_fold)
        # if num_fold in [0,1,2]:
        #     continue
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([num_fold])
        with open(kfold_path_train + 'fold' + str(num_fold) + '/train.txt',
                  'r') as f_train:
            c_train = f_train.readlines()
        with open(kfold_path_val + 'fold' + str(num_fold) + '/val.txt',
                  'r') as f_val:
            c_val = f_val.readlines()
        c_train = [s.replace('\n', '') for s in c_train]
        c_val = [s.replace('\n', '') for s in c_val]
        # NOTE(review): the dataset is truncated here — this looks like a
        # leftover debug limit; confirm before a full training run.
        c_train = c_train[0:100]
        c_val = c_val[0:4000]
        print('train dataset:', len(c_train), ' val dataset:', len(c_val))
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['train dataset:', len(c_train), ' val dataset:', len(c_val)])
            writer.writerow([
                'train_batch_size:', train_batch_size, 'val_batch_size:',
                val_batch_size
            ])
        train_transform, val_transform = generate_transforms(image_size)
        train_loader, val_loader = generate_dataset_loader_cls_seg(
            df_all, c_train, train_transform, train_batch_size, c_val,
            val_transform, val_batch_size, workers)
        # NOTE: eval() is only safe for trusted, hard-coded class names.
        model = eval(model_name + '()')
        # state = torch.load('/data/lanjun/kaggle_rsna2019/models_snapshot/DenseNet169_change_avg_test_context_256/model_epoch_59_'+str(num_fold)+'.pth')['state_dict']
        # new_state_dict = OrderedDict()
        # for k, v in state.items():
        #     name = k[7:]
        #     new_state_dict[name] = v
        # model.load_state_dict(new_state_dict)
        model = apex.parallel.convert_syncbn_model(model).cuda()
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=0.0005,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=0.00002)
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-5)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        model = torch.nn.DataParallel(model)

        def loss_cls_com(input, target):
            # Combined classification loss: focal loss + BCE-with-logits.
            loss_1 = FocalLoss()
            loss_2 = torch.nn.BCEWithLogitsLoss()
            loss = loss_1(input, target) + loss_2(input, target)
            return loss

        loss_cls = torch.nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor(
            [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]).cuda())
        trMaxEpoch = 1
        for epochID in range(0, trMaxEpoch):
            start_time = time.time()
            model.train()
            trainLoss = 0
            # Bug fix: the loss normaliser previously started at 10, which
            # deflated the reported train loss; siblings start it at 0.
            lossTrainNorm = 0
            if epochID < 10:
                # Warm-up: keep the initial LR untouched for 10 epochs.
                pass
            elif epochID < 80:
                if epochID != 10:
                    scheduler.step()
                    scheduler = warm_restart(scheduler, T_mult=2)
            else:
                optimizer.param_groups[0]['lr'] = 1e-5
            for batchID, (input, target) in enumerate(train_loader):
                if batchID == 0:
                    ss_time = time.time()
                print(str(batchID) + '/' +
                      str(int(len(c_train) / train_batch_size)) + ' ' +
                      str((time.time() - ss_time) / (batchID + 1)),
                      end='\r')
                varInput = torch.autograd.Variable(input)
                # Bug fix: .cuda(async=True) is a SyntaxError on Python 3.7+;
                # torch renamed the argument to non_blocking.
                target = target.view(-1, 6).contiguous().cuda(
                    non_blocking=True)
                varTarget = torch.autograd.Variable(
                    target.contiguous().cuda(non_blocking=True))
                varOutput = model(varInput)
                lossvalue = loss_cls(varOutput, varTarget)
                trainLoss = trainLoss + lossvalue.item()
                lossTrainNorm = lossTrainNorm + 1
                optimizer.zero_grad()
                # amp scales the loss so fp16 gradients do not underflow.
                with amp.scale_loss(lossvalue, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                del lossvalue
            trainLoss = trainLoss / lossTrainNorm
            if (epochID + 1) % 5 == 0 or epochID > 79 or epochID == 0:
                valLoss, auc, loss_list, loss_sum = epochVal(
                    num_fold, model, val_loader, loss_cls, c_val,
                    val_batch_size)
            epoch_time = time.time() - start_time
            if (epochID + 1) % 5 == 0 or epochID > 79:
                torch.save(
                    {
                        'epoch': epochID + 1,
                        'state_dict': model.state_dict(),
                        'valLoss': valLoss
                    }, snapshot_path + '/model_epoch_' + str(epochID) + '_' +
                    str(num_fold) + '.pth')
            result = [
                epochID,
                round(optimizer.state_dict()['param_groups'][0]['lr'], 6),
                round(epoch_time, 0),
                round(trainLoss, 5),
                round(valLoss, 5), 'auc:', auc, loss_list, loss_sum
            ]
            print(result)
            with open(snapshot_path + '/log.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(result)
        del model
def deeplab_training(model_name, optimizer_name, lr_scheduler_name, lr,
                     batch_size, valid_batch_size, num_epoch, start_epoch,
                     accumulation_steps, train_data_folder, checkpoint_folder,
                     load_pretrain):
    """Train a multi-class segmentation model with apex O1 mixed precision.

    Combines weighted cross-entropy with the model's own (dice) criterion,
    supports gradient accumulation, several optimizers/schedulers selected by
    name, and saves a checkpoint whenever the mean validation loss improves.
    Logs to TensorBoard and to <model_name>_log.txt.
    """
    train_dataset, valid_dataset, train_dataloader, valid_dataloader = generate_dataset_loader(
        train_data_folder, batch_size, valid_batch_size, SEED)

    ############################################################################## define unet model with backbone
    # NOTE(review): the mutable-ish default criterion=SoftDiceLoss_binary()
    # is evaluated once at definition time and shared across calls — confirm
    # SoftDiceLoss_binary is stateless.
    def get_model(model_name="deep_se50",
                  in_channel=6,
                  num_classes=1,
                  criterion=SoftDiceLoss_binary()):
        # Factory mapping a backbone name to a constructed segmentation model.
        # Returns None (after printing) for an unknown name.
        if model_name == 'deep_se50':
            from semantic_segmentation.network.deepv3 import DeepSRNX50V3PlusD_m1  # r
            model = DeepSRNX50V3PlusD_m1(in_channel=6,
                                         num_classes=num_classes,
                                         criterion=SoftDiceLoss_binary())
        elif model_name == 'unet_ef3':
            from ef_unet import EfficientNet_3_unet
            model = EfficientNet_3_unet()
        elif model_name == 'unet_ef5':
            from ef_unet import EfficientNet_5_unet
            model = EfficientNet_5_unet()
        else:
            print('No model name in it')
            model = None
        return model

    # We weigh the loss for the 0 class lower to account for (some of) the big class imbalance.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    classes = [
        "car", "motorcycle", "bus", "bicycle", "truck", "pedestrian",
        "other_vehicle", "animal", "emergency_vehicle"
    ]
    class_weights = torch.from_numpy(
        np.array([0.2] + [1.0] * len(classes), dtype=np.float32))
    class_weights = class_weights.to(device)

    ############################################################################### training parameters
    train_batch_size = batch_size
    checkpoint_filename = model_name + "_deeplab_checkpoint.pth"
    checkpoint_filepath = os.path.join(checkpoint_folder, checkpoint_filename)

    ############################################################################### model and optimizer
    model = get_model(model_name=model_name,
                      in_channel=6,
                      num_classes=len(classes) + 1,
                      criterion=SoftDiceLoss_binary())
    if (load_pretrain):
        model.load_pretrain(checkpoint_filepath)
    model = model.to(device)

    if optimizer_name == "Adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    elif optimizer_name == "adamonecycle":
        # Flatten the module tree into one layer group for fastai-style
        # OptimWrapper with weight decay applied the "true_wd" way.
        flatten_model = lambda m: sum(map(flatten_model, m.children()), []
                                      ) if num_children(m) else [m]
        get_layer_groups = lambda m: [nn.Sequential(*flatten_model(m))]
        optimizer_func = partial(optim.Adam, betas=(0.9, 0.99))
        optimizer = OptimWrapper.create(optimizer_func,
                                        3e-3,
                                        get_layer_groups(model),
                                        wd=1e-4,
                                        true_wd=True,
                                        bn_wd=True)
    elif optimizer_name == "Ranger":
        optimizer = Ranger(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr,
                           weight_decay=1e-5)
    else:
        raise NotImplementedError

    if lr_scheduler_name == "adamonecycle":
        scheduler = lsf.OneCycle(optimizer,
                                 len(train_dataset) * num_epoch, lr,
                                 [0.95, 0.85], 10.0, 0.4)
        # lr_scheduler_each_iter: True steps the scheduler per batch,
        # False steps it once per epoch.
        lr_scheduler_each_iter = True
    elif lr_scheduler_name == "CosineAnealing":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               num_epoch,
                                                               eta_min=0,
                                                               last_epoch=-1)
        lr_scheduler_each_iter = False
    elif lr_scheduler_name == "WarmRestart":
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-6)
        lr_scheduler_each_iter = False
    else:
        raise NotImplementedError

    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    ############################################################################### training
    writer = SummaryWriter()
    log_file = open(model_name + "_log.txt", "a+")
    valid_metric_optimal = np.inf

    for epoch in range(1, num_epoch + 1):
        # Skip already-finished epochs when resuming from start_epoch.
        if (epoch < start_epoch):
            continue
        print("Epoch", epoch)
        print("Epoch", epoch, file=log_file)
        seed_everything(SEED + epoch)
        # Validation runs once per epoch, on the last training batch.
        eval_step = len(train_dataloader)
        train_losses = []
        valid_losses = []
        valid_ce_losses = []
        torch.cuda.empty_cache()

        if (not lr_scheduler_each_iter):
            scheduler.step(epoch)

        optimizer.zero_grad()
        for tr_batch_i, (X, target, sample_ids) in enumerate(train_dataloader):
            if (lr_scheduler_each_iter):
                scheduler.step(tr_batch_i)
            model.train()
            X = X.to(device).float()  # [N, 6, H, W]
            target = target.to(device)  # [N, H, W] with class indices (0, 1)
            prediction = model(X)  # [N, C, H, W]
            loss = F.cross_entropy(prediction, target, weight=class_weights)
            # Build a one-hot target so the model's own (dice) criterion can
            # be added on top of the weighted cross-entropy.
            target = target.clone().unsqueeze_(1)
            one_hot = torch.cuda.FloatTensor(target.size(0),
                                             len(classes) + 1, target.size(2),
                                             target.size(3)).zero_()
            target_one_hot = one_hot.scatter_(1, target.data, 1)
            loss += model.criterion(prediction, target_one_hot)
            # amp scales the loss so fp16 gradients do not underflow.
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            #loss.backward()
            # Gradient accumulation: only step/clip every accumulation_steps.
            if ((tr_batch_i + 1) % accumulation_steps == 0):
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=5.0,
                                               norm_type=2)
                optimizer.step()
                optimizer.zero_grad()
                # NOTE(review): len(train_data_folder) is the length of the
                # *path string*, not the dataset — the TB x-axis step looks
                # unintended; confirm.
                writer.add_scalar(
                    'train_loss',
                    loss.item() * accumulation_steps,
                    epoch * len(train_data_folder) * train_batch_size +
                    tr_batch_i * train_batch_size)
            train_losses.append(loss.detach().cpu().numpy())

            if (tr_batch_i + 1) % eval_step == 0:
                with torch.no_grad():
                    torch.cuda.empty_cache()
                    for val_batch_i, (X, target,
                                      sample_ids) in enumerate(valid_dataloader):
                        model.eval()
                        X = X.to(device).float()  # [N, 3, H, W]
                        target = target.to(
                            device)  # [N, H, W] with class indices (0, 1)
                        prediction = model(X)  # [N, C, H, W]
                        ce_loss = F.cross_entropy(
                            prediction, target,
                            weight=class_weights).detach().cpu().numpy()
                        target = target.unsqueeze_(1)
                        one_hot = torch.cuda.FloatTensor(
                            target.size(0),
                            len(classes) + 1, target.size(2),
                            target.size(3)).zero_()
                        target_one_hot = one_hot.scatter_(1, target.data, 1)
                        loss = model.criterion(prediction, target_one_hot)
                        writer.add_scalar(
                            'val_loss', loss,
                            epoch * len(valid_dataloader) * valid_batch_size +
                            val_batch_i * valid_batch_size)
                        valid_losses.append(loss.detach().cpu().numpy())
                        valid_ce_losses.append(ce_loss)

        print("Train Loss:", np.mean(train_losses), "Valid Loss:",
              np.mean(valid_losses), "Valid CE Loss:",
              np.mean(valid_ce_losses))
        print("Train Loss:", np.mean(train_losses), "Valid Loss:",
              np.mean(valid_losses), "Valid CE Loss:",
              np.mean(valid_ce_losses),
              file=log_file)

        # Checkpoint on improvement of the mean validation (dice) loss.
        val_metric_epoch = np.mean(valid_losses)
        if (val_metric_epoch <= valid_metric_optimal):
            print('Validation metric improved ({:.6f} --> {:.6f}). Saving model ...'.format(\
                valid_metric_optimal, val_metric_epoch))
            print('Validation metric improved ({:.6f} --> {:.6f}). Saving model ...'.format(\
                valid_metric_optimal, val_metric_epoch), file=log_file)
            valid_metric_optimal = val_metric_epoch
            torch.save(model.state_dict(), checkpoint_filepath)
def train_one_model(model_name):
    """Train a 25-label DenseNet121 classifier with 5-fold CV.

    model_name is used only for the snapshot folder name and log metadata;
    the architecture is fixed to DenseNet121_change_avg(25, True).  Relies on
    module-level globals: path_data, csv_path, Image_size, train_transform,
    val_transform, train_batch_size, val_batch_size, workers.  Keeps separate
    best-loss / best-F1 / best-AUC checkpoints per fold.
    """
    snapshot_path = path_data['snapshot_path'] + model_name + '_' + str(
        Image_size) + '_25_local_val'
    if not os.path.exists(snapshot_path):
        os.makedirs(snapshot_path)
    df_all = pd.read_csv(csv_path)
    kfold_path = path_data['k_fold_path']

    for num_fold in range(5):
        print(num_fold)
        # if num_fold in [0,1,2]:
        #     continue
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([num_fold])
            writer.writerow([
                'train_batch_size:',
                str(train_batch_size), 'val_batch_size:',
                str(val_batch_size), 'backbone', model_name, 'Image_size',
                Image_size
            ])
        with open(kfold_path + 'fold' + str(num_fold) + '/train.txt',
                  'r') as f_train:
            c_train = f_train.readlines()
        with open(kfold_path + 'fold' + str(num_fold) + '/val.txt',
                  'r') as f_val:
            c_val = f_val.readlines()
        c_train = [s.replace('\n', '') for s in c_train]
        c_val = [s.replace('\n', '') for s in c_val]
        print('train dataset:', len(c_train), ' val dataset:', len(c_val))
        with open(snapshot_path + '/log.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['train dataset:', len(c_train), ' val dataset:', len(c_val)])
        # c_train = c_train[0:500]
        # c_val = c_val[0:2000]
        train_loader, val_loader = generate_dataset_loader_25(
            df_all, c_train, train_transform, train_batch_size, c_val,
            val_transform, val_batch_size, workers)
        model = DenseNet121_change_avg(25, True)
        model = torch.nn.DataParallel(model).cuda()
        optimizer = torch.optim.Adamax(model.parameters(),
                                       lr=0.001,
                                       betas=(0.9, 0.999),
                                       eps=1e-08,
                                       weight_decay=0)
        scheduler = WarmRestart(optimizer, T_max=10, T_mult=1, eta_min=1e-5)
        # size_average=True was the deprecated spelling of the default mean
        # reduction, so the plain constructor is behaviourally identical.
        loss = torch.nn.BCELoss()
        trMaxEpoch = 42
        lossMIN = 100000
        val_f1_mean = 0
        val_auc_mean = 0
        for epochID in range(0, trMaxEpoch):
            start_time = time.time()
            model.train()
            trainLoss = 0
            lossTrainNorm = 0
            for batchID, (input, target) in enumerate(train_loader):
                # Bug fix: .cuda(async=True) is a SyntaxError on Python 3.7+;
                # torch renamed the argument to non_blocking.
                target = target.view(-1, 25).contiguous().cuda(
                    non_blocking=True)
                varInput = torch.autograd.Variable(input)
                varTarget = torch.autograd.Variable(target)
                varOutput = model(varInput)
                # print(varOutput.shape, varTarget.shape)
                lossvalue = loss(varOutput, varTarget)
                trainLoss = trainLoss + lossvalue.item()
                lossTrainNorm = lossTrainNorm + 1
                optimizer.zero_grad()
                lossvalue.backward()
                optimizer.step()
            # NOTE(review): per-epoch scheduler stepping, consistent with
            # WarmRestart(T_max=10); after epoch 39 the schedule is replaced
            # by a fixed-LR Adam — confirm the intended switch-over point.
            if epochID < 39:
                scheduler.step()
                scheduler = warm_restart(scheduler, T_mult=2)
            else:
                optimizer = torch.optim.Adam(model.parameters(),
                                             lr=1e-5,
                                             betas=(0.9, 0.999),
                                             weight_decay=1e-5,
                                             eps=1e-08,
                                             amsgrad=True)
            trainLoss = trainLoss / lossTrainNorm
            if (epochID + 1) % 10 == 0 or epochID > 39 or epochID == 0:
                valLoss, val_auc, val_threshold, val_f1, precision_list, \
                    recall_list = epochVal(model, val_loader, optimizer,
                                           scheduler, loss)
                epoch_time = time.time() - start_time

                def make_checkpoint():
                    # One place to build the (previously copy-pasted x4)
                    # checkpoint payload; evaluated at each save point so it
                    # always reflects the current lossMIN/metrics.
                    return {
                        'epoch': epochID + 1,
                        'state_dict': model.state_dict(),
                        'best_loss': lossMIN,
                        'optimizer': optimizer.state_dict(),
                        'val_threshold': val_threshold,
                        'val_f1': val_f1,
                        'val_f1_mean': np.mean(val_f1),
                        'val_auc': val_auc,
                        'val_auc_mean': np.mean(val_auc)
                    }

                if valLoss < lossMIN:
                    lossMIN = valLoss
                    torch.save(
                        make_checkpoint(),
                        snapshot_path + '/model_min_loss_' + str(num_fold) +
                        '.pth.tar')
                if val_f1_mean < np.mean(val_f1):
                    val_f1_mean = np.mean(val_f1)
                    torch.save(
                        make_checkpoint(),
                        snapshot_path + '/model_max_f1_' + str(num_fold) +
                        '.pth.tar')
                if val_auc_mean < np.mean(val_auc):
                    val_auc_mean = np.mean(val_auc)
                    torch.save(
                        make_checkpoint(),
                        snapshot_path + '/model_max_auc_' + str(num_fold) +
                        '.pth.tar')
                if (epochID + 1) % 10 == 0:
                    torch.save(
                        make_checkpoint(),
                        snapshot_path + '/model_epoch_' + str(epochID) + '_' +
                        str(num_fold) + '.pth.tar')
                result = [
                    epochID,
                    round(
                        optimizer.state_dict()['param_groups'][0]['lr'], 5),
                    round(trainLoss, 4),
                    round(valLoss, 4),
                    round(epoch_time, 0),
                    round(np.mean(val_f1), 3),
                    round(np.mean(val_auc), 4)
                ]
                print(result)
                # print(val_f1)
                with open(snapshot_path + '/log.csv', 'a', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow(result + val_threshold + val_f1 +
                                    val_auc + precision_list + recall_list)
        del model