def main():
    """Train a model from the YAML config named by the module-level ``args``.

    The config must provide the sections ``Net``, ``Data``, ``Train``,
    ``Loss`` and ``Optimizer``.  Checkpoints are written to
    ``../model/<config stem>`` and logs, plots and ``history.pkl`` to
    ``../logs/<config stem>``.  Per-epoch train/validation numbers are also
    appended to ``file_train_log.txt`` and ``file_val_log.txt`` in the
    current working directory.

    Raises:
        KeyError: if a required config section or key is missing.
        FileNotFoundError: if resuming and the history or checkpoint files
            are absent.
    """
    config_path = Path(args.config_path)
    # yaml.load() without a Loader raises TypeError on PyYAML >= 6; safe_load
    # handles plain mappings/scalars and refuses arbitrary Python object tags.
    with open(config_path) as f:
        config = yaml.safe_load(f)
    net_config = config['Net']
    data_config = config['Data']
    train_config = config['Train']
    loss_config = config['Loss']
    opt_config = config['Optimizer']
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    n_class = net_config['n_class']
    max_epoch = train_config['max_epoch']
    batch_size = train_config['batch_size']
    num_workers = train_config['num_workers']
    test_every = train_config['test_every']
    resume = train_config['resume']
    pretrained_path = train_config['pretrained_path']
    # These keys are removed from the config dicts once read; data_config is
    # later forwarded as **kwargs, so its keys must not be passed twice.
    use_rank = train_config.pop('use_rank')
    use_bined = train_config.pop('use_bined')

    train_dir = data_config.pop('train_dir')
    val_dir = data_config.pop('val_dir')
    train_name = data_config.pop('train_name')
    val_name = data_config.pop('val_name')
    train_type = data_config.pop('train_type')
    val_type = data_config.pop('val_type')

    model = load_model(**net_config)

    # To device
    model = model.to(device)

    modelname = config_path.stem
    # parents=True so a fresh checkout without ../model or ../logs still works.
    output_dir = Path('../model') / modelname
    output_dir.mkdir(parents=True, exist_ok=True)
    log_dir = Path('../logs') / modelname
    log_dir.mkdir(parents=True, exist_ok=True)

    logger = debug_logger(log_dir)
    logger.debug(config)
    logger.info(f'Device: {device}')
    logger.info(f'Max Epoch: {max_epoch}')

    loss_fn = Criterion(**loss_config).to(device)
    params = model.parameters()
    optimizer, scheduler = create_optimizer(params, **opt_config)

    # history
    if resume:
        # NOTE: pickle.load executes arbitrary code; only resume from a
        # history file written by this script.
        with open(log_dir.joinpath('history.pkl'), 'rb') as f:
            history_dict = pickle.load(f)
        best_metrics = history_dict['best_metrics']
        loss_history = history_dict['loss']
        diff_history = history_dict['diff']
        # NOTE(review): the resume epoch is hard-coded; the original
        # alternative was len(diff_history).  Confirm before resuming.
        start_epoch = 47
        # Fast-forward the scheduler to the epoch being resumed.
        for _ in range(start_epoch):
            scheduler.step()
    else:
        start_epoch = 0
        best_metrics = float('inf')
        loss_history = []
        diff_history = []

    # Dataset
    affine_augmenter = albu.Compose([
        albu.GaussNoise(var_limit=(0, 25), p=.2),
        albu.GaussianBlur(3, p=0.2),
        albu.JpegCompression(50, 100, p=0.2),
    ])
    image_augmenter = albu.Compose([
        albu.OneOf([
            albu.RandomBrightnessContrast(0.25, 0.25),
            albu.CLAHE(clip_limit=2),
            albu.RandomGamma(),
        ], p=0.5),
        albu.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30,
                                val_shift_limit=20, p=0.2),
        albu.RGBShift(p=0.2),
    ])

    train_dataset = laod_dataset(data_type=train_type,
                                 affine_augmenter=affine_augmenter,
                                 image_augmenter=image_augmenter,
                                 base_dir=train_dir, filename=train_name,
                                 use_bined=use_bined, n_class=n_class,
                                 **data_config)
    valid_dataset = laod_dataset(data_type=val_type, split='valid',
                                 base_dir=val_dir, filename=val_name,
                                 use_bined=use_bined, n_class=n_class,
                                 **data_config)

    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              num_workers=num_workers, pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False,
                              num_workers=num_workers, pin_memory=True)

    if torch.cuda.is_available():
        model = nn.DataParallel(model)

    # Pretrained model
    if pretrained_path:
        logger.info(f'Load pretrained from {pretrained_path}')
        param = torch.load(pretrained_path, map_location='cpu')
        if "state_dict" in param:
            model.load_state_dict(param['state_dict'], strict=False)
        else:
            model.load_state_dict(param)
        del param

    # Restore model
    if resume:
        print("[INFO] resume training.")
        # NOTE(review): the loop below saves checkpoints with a
        # "_{valid_diff}" suffix, so this un-suffixed file must be provided
        # manually.
        model_path = output_dir.joinpath(f'model_epoch_{start_epoch-1}.pth')
        logger.info(f'Resume from {model_path}')
        param = torch.load(model_path, map_location='cpu')
        model.load_state_dict(param)
        del param
        opt_path = output_dir.joinpath(f'opt_epoch_{start_epoch-1}.pth')
        param = torch.load(opt_path)
        optimizer.load_state_dict(param)
        del param

    # The context manager closes both log files even if training raises.
    with open("file_train_log.txt", "a") as file_train_log, \
            open("file_val_log.txt", "a") as file_val_log:
        # Train
        for i_epoch in range(start_epoch, max_epoch):
            logger.info(f'Epoch: {i_epoch}')
            logger.info(f'Learning rate: {optimizer.param_groups[0]["lr"]}')

            train_losses = []
            train_diffs = []

            model.train()
            with tqdm(train_loader) as _tqdm:
                for batched in _tqdm:
                    optimizer.zero_grad()
                    if use_rank:
                        # Pairwise batches: two images plus a pair label.
                        if use_bined:
                            (img1, img2, lbl1, lbl2, labels,
                             yaw_lbl1, pitch_lbl1, roll_lbl1,
                             yaw_lbl2, pitch_lbl2, roll_lbl2) = batched
                            img1, img2 = img1.to(device), img2.to(device)
                            lbl1, lbl2 = lbl1.to(device), lbl2.to(device)
                            labels = labels.to(device)
                            yaw_lbl1 = yaw_lbl1.to(device)
                            pitch_lbl1 = pitch_lbl1.to(device)
                            roll_lbl1 = roll_lbl1.to(device)
                            yaw_lbl2 = yaw_lbl2.to(device)
                            pitch_lbl2 = pitch_lbl2.to(device)
                            roll_lbl2 = roll_lbl2.to(device)

                            preds1, y_pres1, p_pres1, r_pres1 = model(img1, True)
                            preds2, y_pres2, p_pres2, r_pres2 = model(img2, True)

                            pre_list = [preds1, preds2,
                                        y_pres1, p_pres1, r_pres1,
                                        y_pres2, p_pres2, r_pres2]
                            lbl_list = [lbl1, lbl2,
                                        yaw_lbl1, pitch_lbl1, roll_lbl1,
                                        yaw_lbl2, pitch_lbl2, roll_lbl2,
                                        labels]
                            loss = loss_fn(pre_list, lbl_list, use_bined=True)
                        else:
                            img1, img2, lbl1, lbl2, labels = batched
                            img1, img2 = img1.to(device), img2.to(device)
                            lbl1, lbl2 = lbl1.to(device), lbl2.to(device)
                            labels = labels.to(device)

                            preds1 = model(img1, False)
                            preds2 = model(img2, False)

                            loss = loss_fn([preds1, preds2],
                                           [lbl1, lbl2, labels],
                                           use_bined=False)

                        # Report the average error of the two images.
                        diff = calculate_diff(preds1, lbl1)
                        diff += calculate_diff(preds2, lbl2)
                        diff /= 2
                    elif use_bined:
                        (images, labels,
                         yaw_labels, pitch_labels, roll_labels) = batched
                        images, labels = images.to(device), labels.to(device)
                        yaw_labels = yaw_labels.to(device)
                        pitch_labels = pitch_labels.to(device)
                        roll_labels = roll_labels.to(device)

                        preds, y_pres, p_pres, r_pres = model(images, use_bined)

                        loss = loss_fn([preds, y_pres, p_pres, r_pres],
                                       [labels, yaw_labels, pitch_labels,
                                        roll_labels],
                                       use_bined)
                        diff = calculate_diff(preds, labels)
                    else:
                        images, labels = batched
                        images, labels = images.to(device), labels.to(device)

                        preds = model(images, use_bined)

                        loss = loss_fn([preds], [labels])
                        diff = calculate_diff(preds, labels, mean=True)

                    _tqdm.set_postfix(OrderedDict(loss=f'{loss.item():.3f}',
                                                  mae=f'{diff:.1f}'))
                    train_losses.append(loss.item())
                    train_diffs.append(diff)

                    loss.backward()
                    optimizer.step()

            # The scheduler advances once per epoch, matching the per-epoch
            # fast-forward performed on resume.
            scheduler.step()

            train_loss = np.mean(train_losses)
            train_diff = np.nanmean(train_diffs)
            logger.info(f'train loss: {train_loss}')
            logger.info(f'train diff: {train_diff}')
            # One record per line; without the newline all epochs ran together.
            file_train_log.write(f"{train_loss},{train_diff}\n")

            if (i_epoch + 1) % test_every == 0:
                valid_losses = []
                valid_diffs = []
                model.eval()
                with torch.no_grad():
                    with tqdm(valid_loader) as _tqdm:
                        for batched in _tqdm:
                            if use_bined:
                                # The binned angle labels are unpacked but
                                # unused: no validation loss is computed.
                                (images, labels,
                                 yaw_labels, pitch_labels,
                                 roll_labels) = batched
                                images = images.to(device)
                                labels = labels.to(device)

                                preds, y_pres, p_pres, r_pres = model(
                                    images, use_bined)
                                diff = calculate_diff(preds, labels)
                            else:
                                images, labels = batched
                                images = images.to(device)
                                labels = labels.to(device)

                                preds = model(images, use_bined)
                                diff = calculate_diff(preds, labels)

                            _tqdm.set_postfix(OrderedDict(mae=f'{diff:.2f}'))
                            # Placeholder: validation loss is not evaluated.
                            valid_losses.append(0)
                            valid_diffs.append(diff)

                valid_loss = np.mean(valid_losses)
                valid_diff = np.mean(valid_diffs)
                logger.info(f'valid seg loss: {valid_loss}')
                logger.info(f'valid diff: {valid_diff}')
                file_val_log.write(f"{valid_loss},{valid_diff}\n")

                if best_metrics >= valid_diff:
                    best_metrics = valid_diff
                    logger.info('Best Model!\n')
                # NOTE(review): the original issued this save pair twice with
                # identical filenames and its nesting was ambiguous; a
                # checkpoint is written once per validation pass here.
                torch.save(
                    model.state_dict(),
                    output_dir.joinpath(f'model_epoch_{i_epoch}_{valid_diff}.pth'))
                torch.save(
                    optimizer.state_dict(),
                    output_dir.joinpath(f'opt_epoch_{i_epoch}_{valid_diff}.pth'))
            else:
                valid_loss = None
                valid_diff = None

            loss_history.append([train_loss, valid_loss])
            diff_history.append([train_diff, valid_diff])
            history_ploter(loss_history, log_dir.joinpath('loss.png'))
            history_ploter(diff_history, log_dir.joinpath('diff.png'))

            # Persist history every epoch so a later run can resume.
            history_dict = {'loss': loss_history,
                            'diff': diff_history,
                            'best_metrics': best_metrics}
            with open(log_dir.joinpath('history.pkl'), 'wb') as f:
                pickle.dump(history_dict, f)
# Prediction input/output locations.  Raw strings keep the Windows
# backslashes literal; sequences such as "\C", "\A" and "\d" in a normal
# string are invalid escapes and raise a SyntaxWarning on recent Python.
pred_data_path = r"E:\CSI2019\AASCE_stage4_Process\data/0_DataOriginal\Archive/"
pred_output_path = r"E:\CSI2019\AASCE_stage4_Process\data_output/Archive/labels_mat/"
pred_dataset = PredORIDataset(data_path=pred_data_path)
pred_loader = DataLoader(pred_dataset, batch_size=1, shuffle=False)

# Alternative input sets:
# pred_data_path = r"E:\CSI2019\AASCE_stage4_Process\data\selected2/"
# pred_data_path = r"E:\CSI2019\AASCE_stage4_Process\data\ppt_selected2/"
# pred_output_path = r"E:\CSI2019\AASCE_stage4_Process\data\selected2/labels_mat/"
# pred_dataset = PredORIDataset(data_path=pred_data_path)
# pred_loader = DataLoader(pred_dataset, batch_size=1, shuffle=False)

dir_checkpoint = r"E:\CSI2019\AASCE_stage4_Process/saved_files"
# Join with a path separator: plain string concatenation produced
# ".../saved_fileslogs" instead of a "logs" directory under saved_files.
log_dir = Path(dir_checkpoint) / 'logs'
log_dir.mkdir(exist_ok=True, parents=True)
logger = debug_logger(log_dir)
# `device` is expected to be defined earlier in the file.
logger.info(f'Device: {device}')

# Candidate architectures tried previously:
# model = UNetWithResnet50Encoder(n_channels=1, n_classes=18).cuda()
# model = NestedUNet(n_channels=1, n_classes=20)
# model = SCSE_UNet(n_channels=1, n_classes=20)
# model = SCSENestedUNet(n_channels=1, n_classes=18)
# model = DilatedUNet(in_channels=1, classes=20)
# model = ResUNet(in_channel=1, n_classes=5)

# stage1_model_whole = ResUNet(in_channel=1, n_classes=2)
#
# root_dir = "E:\CSI2019\AASCE_stage4_Process/"
# stage1_model_whole_load_dir = "saved_model/stage1_resunet_whole_line/CP200.pth"
# stage1_model_whole.load_state_dict(torch.load(os.path.join(root_dir, stage1_model_whole_load_dir)))
# logger.info('Stage1_Model loaded from {}'.format(stage1_model_whole_load_dir))
def process(config_path):
    """Train a segmentation network from the YAML config at *config_path*.

    The config must provide the sections ``Net``, ``Data``, ``Train``,
    ``Loss`` and ``Optimizer``.  Only the ``'deepglobe-dynamic'`` dataset is
    supported.  Checkpoints and logs go to timestamped ``model/`` and
    ``logs/`` sub-directories of ``ROOT_DIR``; per-iteration and per-epoch
    curves are sent to the module-level ``plotter``.

    Args:
        config_path: path object for the YAML file; its ``stem`` names the
            run directories.

    Raises:
        NotImplementedError: if ``Data.dataset`` is not
            ``'deepglobe-dynamic'``.
        KeyError: if a required config section or key is missing.
    """
    gc.collect()
    torch.cuda.empty_cache()

    # yaml.load() without a Loader raises TypeError on PyYAML >= 6; safe_load
    # handles plain mappings/scalars and refuses arbitrary Python object tags.
    with open(config_path) as f:
        config = yaml.safe_load(f)
    net_config = config['Net']
    data_config = config['Data']
    train_config = config['Train']
    loss_config = config['Loss']
    opt_config = config['Optimizer']
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Collect training parameters
    max_epoch = train_config['max_epoch']
    batch_size = train_config['batch_size']
    fp16 = train_config['fp16']
    resume = train_config['resume']
    pretrained_path = train_config['pretrained_path']
    freeze_enabled = train_config['freeze']
    seed_enabled = train_config['seed']

    # Deterministic training
    if seed_enabled:
        seed = 100
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed=seed)
        import random
        random.seed(a=seed)

    # Network
    if 'unet' in net_config['dec_type']:
        net_type = 'unet'
        model = EncoderDecoderNet(**net_config)
    else:
        net_type = 'deeplab'
        # Built with 19 outputs; the logits layer is replaced further down
        # when pretrained weights are loaded.
        net_config['output_channels'] = 19
        model = SPPNet(**net_config)

    dataset_name = data_config['dataset']
    if dataset_name == 'deepglobe-dynamic':
        from dataset.deepglobe_dynamic import DeepGlobeDatasetDynamic as Dataset
        net_config['output_channels'] = 7
        classes = np.arange(0, 7)
    else:
        raise NotImplementedError
    # 'dataset' must not be forwarded to Dataset(**data_config).
    del data_config['dataset']

    modelname = config_path.stem
    timestamp = datetime.timestamp(datetime.now())
    run_time = datetime.fromtimestamp(timestamp)
    print("timestamp =", run_time)
    # NOTE(review): both directories are unique per run, so resume=True looks
    # for history.pkl / model_tmp.pth in freshly created directories.
    output_dir = Path(os.path.join(ROOT_DIR, f'model/{modelname}_{run_time}'))
    output_dir.mkdir(parents=True, exist_ok=True)
    log_dir = Path(os.path.join(ROOT_DIR, f'logs/{modelname}_{run_time}'))
    log_dir.mkdir(parents=True, exist_ok=True)
    dataset_dir = '/home/sfoucher/DEV/pytorch-segmentation/data/deepglobe_as_pascalvoc/VOCdevkit/VOC2012'

    logger = debug_logger(log_dir)
    logger.debug(config)
    logger.info(f'Device: {device}')
    logger.info(f'Max Epoch: {max_epoch}')

    # Loss
    loss_fn = MultiClassCriterion(**loss_config).to(device)
    # Provisional optimizer: only its first param group is used below to pick
    # which parameters to freeze.  The training optimizer is built afterwards.
    params = model.parameters()
    optimizer, scheduler = create_optimizer(params, **opt_config)

    # history
    if resume:
        # NOTE: pickle.load executes arbitrary code; only resume from a
        # history file written by this script.
        with open(log_dir.joinpath('history.pkl'), 'rb') as f:
            history_dict = pickle.load(f)
        best_metrics = history_dict['best_metrics']
        loss_history = history_dict['loss']
        iou_history = history_dict['iou']
        start_epoch = len(iou_history)
    else:
        start_epoch = 0
        best_metrics = 0
        loss_history = []
        iou_history = []

    affine_augmenter = albu.Compose([albu.HorizontalFlip(p=.5),
                                     albu.VerticalFlip(p=.5)])
    image_augmenter = None

    # Datasets and loaders are built inside the epoch loop (dynamic training).

    # Pretrained model
    if pretrained_path:
        logger.info(f'Resume from {pretrained_path}')
        # Load on CPU first so GPU-saved checkpoints also work on CPU hosts;
        # the model is moved to `device` right after.
        param = torch.load(pretrained_path, map_location='cpu')
        model.load_state_dict(param)
        # Swap the classification head for the dataset's class count.
        model.logits = torch.nn.Conv2d(256, net_config['output_channels'], 1)
        del param

    # To device
    model = model.to(device)

    if freeze_enabled:
        # Rémi's code: freeze the first half of the parameters in the
        # provisional optimizer's first param group.
        group_params = optimizer.param_groups[0]['params']
        for frozen in group_params[:int(len(group_params) * 0.5)]:
            frozen.requires_grad = False

    params_to_update = model.parameters()
    print("Params to learn:")
    if freeze_enabled:
        params_to_update = []
        for name, param in model.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)

    optimizer, scheduler = create_optimizer(params_to_update, **opt_config)

    # Fast-forward the scheduler that is actually used for training.  The
    # original stepped the provisional scheduler, which was then discarded,
    # so a resumed run restarted its learning-rate schedule from epoch 0.
    for _ in range(start_epoch):
        scheduler.step()

    # fp16
    if fp16:
        # Only the needed apex files are vendored; the C backend of apex is
        # not required for these two helpers.
        from utils.apex.apex.fp16_utils.fp16util import BN_convert_float
        from utils.apex.apex.fp16_utils.fp16_optimizer import FP16_Optimizer
        model = BN_convert_float(model.half())
        optimizer = FP16_Optimizer(optimizer, verbose=False,
                                   dynamic_loss_scale=True)
        logger.info('Apply fp16')

    # Restore model
    if resume:
        model_path = output_dir.joinpath(f'model_tmp.pth')
        logger.info(f'Resume from {model_path}')
        param = torch.load(model_path, map_location='cpu')
        model.load_state_dict(param)
        del param
        opt_path = output_dir.joinpath(f'opt_tmp.pth')
        param = torch.load(opt_path)
        optimizer.load_state_dict(param)
        del param

    i_iter = 0
    # Exponential moving averages of the per-iteration loss and IoU.
    ma_loss = 0
    ma_iou = 0

    # Train
    for i_epoch in range(start_epoch, max_epoch):
        logger.info(f'Epoch: {i_epoch}')
        logger.info(f'Learning rate: {optimizer.param_groups[0]["lr"]}')

        train_losses = []
        train_ious = []
        model.train()

        # Initialize randomized but balanced datasets (rebuilt every epoch).
        train_dataset = Dataset(base_dir=dataset_dir,
                                affine_augmenter=affine_augmenter,
                                image_augmenter=image_augmenter,
                                net_type=net_type, **data_config)
        valid_dataset = Dataset(base_dir=dataset_dir, split='valid',
                                net_type=net_type, **data_config)
        train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True, num_workers=4,
                                  pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False,
                                  num_workers=4, pin_memory=True)

        with tqdm(train_loader) as _tqdm:
            for batched in _tqdm:
                images, labels = batched
                if fp16:
                    images = images.half()
                images, labels = images.to(device), labels.to(device)
                optimizer.zero_grad()
                preds = model(images)
                if net_type == 'deeplab':
                    # Resize the prediction to the label's spatial size.
                    preds = F.interpolate(preds, size=labels.shape[1:],
                                          mode='bilinear', align_corners=True)
                if fp16:
                    loss = loss_fn(preds.float(), labels)
                else:
                    loss = loss_fn(preds, labels)

                preds_np = preds.detach().cpu().numpy()
                labels_np = labels.detach().cpu().numpy()
                iou = compute_iou_batch(np.argmax(preds_np, axis=1),
                                        labels_np, classes)

                _tqdm.set_postfix(OrderedDict(seg_loss=f'{loss.item():.5f}',
                                              iou=f'{iou:.3f}'))
                train_losses.append(loss.item())
                train_ious.append(iou)
                ma_loss = 0.01 * loss.item() + 0.99 * ma_loss
                ma_iou = 0.01 * iou + 0.99 * ma_iou
                plotter.plot('loss', 'train', 'iteration Loss', i_iter, loss.item())
                plotter.plot('iou', 'train', 'iteration iou', i_iter, iou)
                plotter.plot('loss', 'ma_loss', 'iteration Loss', i_iter, ma_loss)
                plotter.plot('iou', 'ma_iou', 'iteration iou', i_iter, ma_iou)

                if fp16:
                    # FP16_Optimizer applies loss scaling in backward().
                    optimizer.backward(loss)
                else:
                    loss.backward()
                optimizer.step()
                i_iter += 1

        # The scheduler advances once per epoch.
        scheduler.step()

        train_loss = np.mean(train_losses)
        train_iou = np.nanmean(train_ious)
        logger.info(f'train loss: {train_loss}')
        logger.info(f'train iou: {train_iou}')
        plotter.plot('loss-epoch', 'train', 'iteration Loss', i_epoch, train_loss)
        plotter.plot('iou-epoch', 'train', 'iteration iou', i_epoch, train_iou)

        # Rolling checkpoint, overwritten every epoch (read back on resume).
        torch.save(model.state_dict(), output_dir.joinpath('model_tmp.pth'))
        torch.save(optimizer.state_dict(), output_dir.joinpath('opt_tmp.pth'))

        valid_losses = []
        valid_ious = []
        model.eval()
        with torch.no_grad():
            with tqdm(valid_loader) as _tqdm:
                for batched in _tqdm:
                    images, labels = batched
                    if fp16:
                        images = images.half()
                    images, labels = images.to(device), labels.to(device)
                    preds = model.tta(images, net_type=net_type)
                    if fp16:
                        loss = loss_fn(preds.float(), labels)
                    else:
                        loss = loss_fn(preds, labels)

                    preds_np = preds.detach().cpu().numpy()
                    labels_np = labels.detach().cpu().numpy()

                    # Per the original author, compute_iou_batch was changed
                    # so it does not yield NaNs here.
                    iou = compute_iou_batch(np.argmax(preds_np, axis=1),
                                            labels_np, classes)

                    _tqdm.set_postfix(OrderedDict(seg_loss=f'{loss.item():.5f}',
                                                  iou=f'{iou:.3f}'))
                    valid_losses.append(loss.item())
                    valid_ious.append(iou)

        valid_loss = np.mean(valid_losses)
        valid_iou = np.mean(valid_ious)
        logger.info(f'valid seg loss: {valid_loss}')
        logger.info(f'valid iou: {valid_iou}')
        plotter.plot('loss-epoch', 'valid', 'iteration Loss', i_epoch, valid_loss)
        plotter.plot('iou-epoch', 'valid', 'iteration iou', i_epoch, valid_iou)

        # Higher IoU is better; keep the best-scoring weights separately.
        if best_metrics < valid_iou:
            best_metrics = valid_iou
            logger.info('Best Model!')
            torch.save(model.state_dict(), output_dir.joinpath('model.pth'))
            torch.save(optimizer.state_dict(), output_dir.joinpath('opt.pth'))

        loss_history.append([train_loss, valid_loss])
        iou_history.append([train_iou, valid_iou])
        history_ploter(loss_history, log_dir.joinpath('loss.png'))
        history_ploter(iou_history, log_dir.joinpath('iou.png'))

        # Persist history every epoch so a later run can resume.
        history_dict = {'loss': loss_history,
                        'iou': iou_history,
                        'best_metrics': best_metrics}
        with open(log_dir.joinpath('history.pkl'), 'wb') as f:
            pickle.dump(history_dict, f)