def train(opt):
    params = Params(f'projects/{opt.project}.yml')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
    else:
        torch.manual_seed(42)

    opt.saved_path = opt.saved_path + f'/{params.project_name}/'
    opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
    training_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name),
                               set=params.train_set,
                               phase='train',
                               transforms=get_train_transforms())
    val_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name),
                          set=params.val_set,
                          phase='val',
                          transforms=get_valid_transforms())

    training_generator = torch.utils.data.DataLoader(
        training_set,
        batch_size=opt.batch_size,
        sampler=RandomSampler(training_set),
        pin_memory=False,
        drop_last=True,
        num_workers=opt.num_workers,
        collate_fn=collate_fn,
    )
    val_generator = torch.utils.data.DataLoader(
        val_set,
        batch_size=opt.batch_size,
        num_workers=opt.num_workers,
        shuffle=False,
        sampler=SequentialSampler(val_set),
        pin_memory=False,
        collate_fn=collate_fn,
    )

    model = EfficientDetBackbone(num_classes=len(params.obj_list),
                                 compound_coef=opt.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except Exception:
            last_step = 0

        try:
            ret = model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print('[Warning] Don\'t panic if you see this: it usually means you loaded pretrained '
                  'weights with a different number of classes. The rest of the weights should have '
                  'been loaded already.')

        print(f'[Info] loaded weights: {os.path.basename(weights_path)}, '
              f'resuming checkpoint from step: {last_step}')
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if training head only
    if opt.head_only:
        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] froze backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when using multiple gpus and the batch size per gpu is lower than 4,
    # which is useful when gpu memory is limited: with such small per-gpu batches, batch norm
    # makes training very unstable or slow to converge. sync_bn solves this by normalizing
    # the mini-batches across all gpus as one batch and sending the result back to each gpu,
    # at the cost of slightly slower training.
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    writer = SummaryWriter(opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    # wrap the model with the loss function, to reduce the memory usage on gpu0 and speed up training
    model = ModelWithLoss(model, debug=opt.debug)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = CustomDataParallel(model, params.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    if opt.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    accumulation_steps = 32
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(opt.num_epochs):
            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                continue

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for iter, (imgs, annots) in enumerate(progress_bar):
                if iter < step - last_epoch * num_iter_per_epoch:
                    progress_bar.update()
                    continue
                try:
                    imgs = torch.stack(imgs)
                    annot = pad_annots(annots)
                    if params.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0;
                        # with multiple gpus, CustomDataParallel does the scattering, not this code
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list)
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    if (iter + 1) % (accumulation_steps // opt.batch_size) == 0:
                        optimizer.step()
                        optimizer.zero_grad()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. '
                        'Reg loss: {:.5f}. Total loss: {:.5f}'.format(
                            step, epoch, opt.num_epochs, iter + 1, num_iter_per_epoch,
                            cls_loss.item(), reg_loss.item(), loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss}, step)
                    writer.add_scalars('Classification_loss', {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                    if step % opt.save_interval == 0 and step > 0:
                        save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
                        print('checkpoint...')

                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue
            scheduler.step(np.mean(epoch_loss))

            if epoch % opt.val_interval == 0:
                model.eval()
                loss_regression_ls = []
                loss_classification_ls = []
                for iter, (imgs, annots) in enumerate(val_generator):
                    with torch.no_grad():
                        imgs = torch.stack(imgs)
                        annot = pad_annots(annots)
                        if params.num_gpus == 1:
                            imgs = imgs.cuda()
                            annot = annot.cuda()

                        cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list)
                        cls_loss = cls_loss.mean()
                        reg_loss = reg_loss.mean()

                        loss = cls_loss + reg_loss
                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        loss_classification_ls.append(cls_loss.item())
                        loss_regression_ls.append(reg_loss.item())

                cls_loss = np.mean(loss_classification_ls)
                reg_loss = np.mean(loss_regression_ls)
                loss = cls_loss + reg_loss

                print('Val. Epoch: {}/{}. Classification loss: {:1.5f}. '
                      'Regression loss: {:1.5f}. Total loss: {:1.5f}'.format(
                          epoch, opt.num_epochs, cls_loss, reg_loss, loss))
                writer.add_scalars('Loss', {'val': loss}, step)
                writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
                writer.add_scalars('Classification_loss', {'val': cls_loss}, step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')

                model.train()

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print('[Info] Stop training at epoch {}. The lowest loss achieved is {}'.format(
                        epoch, best_loss))
                    break
    except KeyboardInterrupt:
        save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
        writer.close()
    writer.close()
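
# ---------------------------------------------------------------------------
# The variant above relies on `collate_fn` and `pad_annots`, which are not
# defined in this file. A minimal sketch of what they are assumed to do: the
# collate function keeps images and per-image annotation tensors as lists, and
# `pad_annots` pads the [num_boxes, 5] annotation tensors to a common length
# with -1 rows (the padding convention the detection loss expects). Names and
# shapes here are assumptions, not the repository's actual implementation.
# ---------------------------------------------------------------------------
import torch


def collate_fn(batch):
    # batch: list of (image_tensor, annot_tensor) pairs from the dataset
    imgs = [item[0] for item in batch]
    annots = [item[1] for item in batch]
    return imgs, annots


def pad_annots(annots):
    # Pad a list of [num_boxes_i, 5] tensors to a single [batch, max_boxes, 5]
    # tensor, filling the padding rows with -1.
    max_boxes = max((a.shape[0] for a in annots), default=0)
    if max_boxes == 0:
        return torch.ones((len(annots), 1, 5)) * -1
    padded = torch.ones((len(annots), max_boxes, 5)) * -1
    for i, a in enumerate(annots):
        if a.shape[0] > 0:
            padded[i, :a.shape[0], :] = a
    return padded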
def train(opt):
    params = Params(f'projects/{opt.project}.yml')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
    else:
        torch.manual_seed(42)

    opt.saved_path = opt.saved_path + f'/{params.project_name}/'
    opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    training_params = {'batch_size': opt.batch_size,
                       'shuffle': True,
                       'drop_last': True,
                       'collate_fn': collater,
                       'num_workers': opt.num_workers}

    val_params = {'batch_size': opt.batch_size,
                  'shuffle': False,
                  'drop_last': True,
                  'collate_fn': collater,
                  'num_workers': opt.num_workers}

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
    training_set = CocoDataset(root_dir=opt.data_path + params.project_name,
                               set=params.train_set,
                               transform=transforms.Compose([Normalizer(mean=params.mean, std=params.std),
                                                             Augmenter(),
                                                             Resizer(input_sizes[opt.compound_coef])]))
    training_generator = DataLoader(training_set, **training_params)

    val_set = CocoDataset(root_dir=opt.data_path + params.project_name,
                          set=params.val_set,
                          transform=transforms.Compose([Normalizer(mean=params.mean, std=params.std),
                                                        Resizer(input_sizes[opt.compound_coef])]))
    val_generator = DataLoader(val_set, **val_params)

    model = EfficientDetBackbone(num_anchors=9, num_classes=len(params.obj_list),
                                 compound_coef=opt.compound_coef)

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except Exception:
            last_step = 0

        model.load_state_dict(torch.load(weights_path))
        print(f'loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}')
    else:
        last_step = 0
        print('initializing weights...')
        init_weights(model)

    # freeze backbone if training head only
    if opt.head_only:
        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('froze backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when using multiple gpus and the batch size per gpu is lower than 4,
    # which is useful when gpu memory is limited: with such small per-gpu batches, batch norm
    # makes training very unstable or slow to converge. sync_bn solves this by normalizing
    # the mini-batches across all gpus as one batch and sending the result back to each gpu,
    # at the cost of slightly slower training.
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4:
        model.apply(replace_w_sync_bn)

    writer = SummaryWriter(opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    if params.num_gpus > 0:
        model = model.cuda()
        model = CustomDataParallel(model, params.num_gpus)

    optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

    criterion = FocalLoss()

    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    for epoch in range(opt.num_epochs):
        try:
            model.train()
            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for iter, data in enumerate(progress_bar):
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if params.num_gpus > 0:
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    _, regression, classification, anchors = model(imgs)

                    cls_loss, reg_loss = criterion(
                        classification, regression, anchors, annot,
                        # imgs=imgs, obj_list=params.obj_list  # uncomment this to debug
                    )

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. '
                        'Reg loss: {:.5f}. Total loss: {:.5f}'.format(
                            step, epoch + 1, opt.num_epochs, iter + 1, num_iter_per_epoch,
                            cls_loss.item(), reg_loss.item(), loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss}, step)
                    writer.add_scalars('Classification_loss', {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1
                except Exception as e:
                    print(traceback.format_exc())
                    print(e)
                    continue

            scheduler.step(np.mean(epoch_loss))

            if step % opt.save_interval == 0 and step > 0:
                save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')

            if epoch % opt.val_interval == 0:
                model.eval()
                loss_regression_ls = []
                loss_classification_ls = []
                for iter, data in enumerate(val_generator):
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        if params.num_gpus > 0:
                            annot = annot.cuda()

                        _, regression, classification, anchors = model(imgs)
                        cls_loss, reg_loss = criterion(classification, regression, anchors, annot)

                        loss = cls_loss + reg_loss
                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        loss_classification_ls.append(cls_loss.item())
                        loss_regression_ls.append(reg_loss.item())

                cls_loss = np.mean(loss_classification_ls)
                reg_loss = np.mean(loss_regression_ls)
                loss = cls_loss + reg_loss

                print('Val. Epoch: {}/{}. Classification loss: {:1.5f}. '
                      'Regression loss: {:1.5f}. Total loss: {:1.5f}'.format(
                          epoch + 1, opt.num_epochs, cls_loss, reg_loss, loss))
                writer.add_scalars('Total_loss', {'val': loss}, step)
                writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
                writer.add_scalars('Classification_loss', {'val': cls_loss}, step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')

                # onnx export is not tested.
                # dummy_input = torch.rand(opt.batch_size, 3, 512, 512)
                # if torch.cuda.is_available():
                #     dummy_input = dummy_input.cuda()
                # if isinstance(model, nn.DataParallel):
                #     model.module.backbone_net.model.set_swish(memory_efficient=False)
                #
                #     torch.onnx.export(model.module, dummy_input,
                #                       os.path.join(opt.saved_path, 'signatrix_efficientdet_coco.onnx'),
                #                       verbose=False)
                #     model.module.backbone_net.model.set_swish(memory_efficient=True)
                # else:
                #     model.backbone_net.model.set_swish(memory_efficient=False)
                #
                #     torch.onnx.export(model, dummy_input,
                #                       os.path.join(opt.saved_path, 'signatrix_efficientdet_coco.onnx'),
                #                       verbose=False)
                #     model.backbone_net.model.set_swish(memory_efficient=True)

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print('Stop training at epoch {}. The lowest loss achieved is {}'.format(
                        epoch, best_loss))
                    break
        except KeyboardInterrupt:
            save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')

    writer.close()
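
# ---------------------------------------------------------------------------
# Every variant in this file calls save_checkpoint(model, name), but the helper
# itself is not shown. A minimal sketch, assuming the checkpoint should hold
# the bare backbone state_dict: CustomDataParallel subclasses nn.DataParallel,
# and the ModelWithLoss-style wrappers keep the backbone under `.model`, so
# both wrappers are unwrapped before saving. The `saved_path` default is an
# assumption; the real helper presumably writes into opt.saved_path.
# ---------------------------------------------------------------------------
import os

import torch
from torch import nn


def save_checkpoint(model, name, saved_path='.'):
    # unwrap (Custom)DataParallel first, then the loss wrapper, if present
    module = model.module if isinstance(model, nn.DataParallel) else model
    state_dict = module.model.state_dict() if hasattr(module, 'model') else module.state_dict()
    torch.save(state_dict, os.path.join(saved_path, name))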
def train(opt):
    params = Params(f'projects/{opt.project}_crop.yml')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
    else:
        torch.manual_seed(42)

    save_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    opt.saved_path = opt.saved_path + f'/{params.project_name}/crop/weights/{save_time}'
    opt.log_path = opt.log_path + f'/{params.project_name}/crop/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)
    print('save_path:', opt.saved_path)
    print('log_path:', opt.log_path)

    training_params = {'batch_size': opt.batch_size,
                       'shuffle': True,
                       'drop_last': True,
                       'collate_fn': collater,
                       'num_workers': opt.num_workers}

    val_params = {'batch_size': opt.batch_size,
                  'shuffle': False,
                  'drop_last': True,
                  'collate_fn': collater,
                  'num_workers': opt.num_workers}

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
    training_set = Project42Dataset(root_dir=os.path.join(opt.data_path, params.project_name, 'crop'),
                                    set=params.train_set,
                                    params=params,
                                    transform=transforms.Compose([Normalizer(mean=params.mean, std=params.std),
                                                                  Augmenter(),
                                                                  Resizer(input_sizes[opt.compound_coef])]))
    training_generator = DataLoader(training_set, **training_params)

    val_set = Project42Dataset(root_dir=os.path.join(opt.data_path, params.project_name, 'crop'),
                               set=params.val_set,
                               params=params,
                               transform=transforms.Compose([Normalizer(mean=params.mean, std=params.std),
                                                             Resizer(input_sizes[opt.compound_coef])]))
    val_generator = DataLoader(val_set, **val_params)

    # labels
    labels = training_set.labels
    print('label:', labels)

    model = EfficientDetBackbone(num_classes=len(params.obj_list),
                                 compound_coef=opt.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except Exception:
            last_step = 0

        try:
            ret = model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print('[Warning] Don\'t panic if you see this: it usually means you loaded pretrained '
                  'weights with a different number of classes. The rest of the weights should have '
                  'been loaded already.')

        print(f'[Info] loaded weights: {os.path.basename(weights_path)}, '
              f'resuming checkpoint from step: {last_step}')
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if training head only
    if opt.head_only:
        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] froze backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when using multiple gpus and the batch size per gpu is lower than 4,
    # which is useful when gpu memory is limited: with such small per-gpu batches, batch norm
    # makes training very unstable or slow to converge. sync_bn solves this by normalizing
    # the mini-batches across all gpus as one batch and sending the result back to each gpu,
    # at the cost of slightly slower training.
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    writer = SummaryWriter(opt.log_path + f'/{save_time}/')

    # wrap the model with the loss function, to reduce the memory usage on gpu0 and speed up training
    model = ModelWithLoss(model, debug=opt.debug)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = CustomDataParallel(model, params.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    if opt.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(opt.num_epochs):
            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                continue

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for iter, data in enumerate(progress_bar):
                if iter < step - last_epoch * num_iter_per_epoch:
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    ## train image show
                    # for idx in range(len(imgs)):
                    #     showshow = imgs[idx].numpy()
                    #     print(showshow.shape)
                    #     showshow = showshow.transpose(1, 2, 0)
                    #     a = annot[idx].numpy().reshape(5, )
                    #     img_show = cv2.rectangle(showshow, (a[0], a[1]), (a[2], a[3]), (0, 0, 0), 3)
                    #     cv2.imshow(f'{idx}_{params.obj_list[int(a[4])]}', img_show)
                    #     cv2.waitKey(1000)
                    #     cv2.destroyAllWindows()

                    if params.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0;
                        # with multiple gpus, CustomDataParallel does the scattering, not this code
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss, regression, classification, anchors = model(
                        imgs, annot, obj_list=params.obj_list)
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    # loss
                    epoch_loss.append(float(loss))

                    # mAP
                    threshold = 0.2
                    iou_threshold = 0.2
                    regressBoxes = BBoxTransform()
                    clipBoxes = ClipBoxes()
                    out = postprocess(imgs, anchors, regression, classification,
                                      regressBoxes, clipBoxes, threshold, iou_threshold)
                    mAP = mAP_score(annot, out, labels)
                    mAP = mAP.results['mAP']

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. '
                        'Reg loss: {:.5f}. Total loss: {:.5f}. mAP: {:.2f}'.format(
                            step, epoch + 1, opt.num_epochs, iter + 1, num_iter_per_epoch,
                            cls_loss.item(), reg_loss.item(), loss.item(), mAP))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss}, step)
                    writer.add_scalars('Classification_loss', {'train': cls_loss}, step)
                    writer.add_scalars('mAP', {'train': mAP}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                    if step % opt.save_interval == 0 and step > 0:
                        save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}.pth')
                        print('checkpoint...')

                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue
            scheduler.step(np.mean(epoch_loss))

            if epoch % opt.val_interval == 0:
                model.eval()
                loss_regression_ls = []
                loss_classification_ls = []
                for iter, data in enumerate(val_generator):
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        if params.num_gpus == 1:
                            imgs = imgs.cuda()
                            annot = annot.cuda()

                        cls_loss, reg_loss, regression, classification, anchors = model(
                            imgs, annot, obj_list=params.obj_list)
                        cls_loss = cls_loss.mean()
                        reg_loss = reg_loss.mean()

                        loss = cls_loss + reg_loss
                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        loss_classification_ls.append(cls_loss.item())
                        loss_regression_ls.append(reg_loss.item())

                cls_loss = np.mean(loss_classification_ls)
                reg_loss = np.mean(loss_regression_ls)
                loss = cls_loss + reg_loss

                # mAP (computed on the last validation batch)
                threshold = 0.2
                iou_threshold = 0.2
                regressBoxes = BBoxTransform()
                clipBoxes = ClipBoxes()
                out = postprocess(imgs, anchors, regression, classification,
                                  regressBoxes, clipBoxes, threshold, iou_threshold)
                mAP = mAP_score(annot, out, labels)
                mAP = mAP.results['mAP']

                print('Val. Epoch: {}/{}. Classification loss: {:1.5f}. '
                      'Regression loss: {:1.5f}. Total loss: {:1.5f}. mAP: {:.2f}'.format(
                          epoch + 1, opt.num_epochs, cls_loss, reg_loss, loss, mAP))

                writer.add_scalars('Loss', {'val': loss}, step)
                writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
                writer.add_scalars('Classification_loss', {'val': cls_loss}, step)
                writer.add_scalars('mAP', {'val': mAP}, step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')

                model.train()

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print('[Info] Stop training at epoch {}. The lowest loss achieved is {}'.format(
                        epoch, best_loss))
                    break
    except KeyboardInterrupt:
        save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
        writer.close()
    writer.close()
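
# ---------------------------------------------------------------------------
# Several variants resume from the newest checkpoint via
# `get_last_weights(opt.saved_path)`, which is not shown in this file. A
# minimal sketch, assuming checkpoints follow the
# `efficientdet-d{coef}_{epoch}_{step}.pth` naming used above and that the
# most recently modified file is the right one to resume from.
# ---------------------------------------------------------------------------
import glob
import os


def get_last_weights(weights_path):
    weights = glob.glob(os.path.join(weights_path, '*.pth'))
    if not weights:
        raise FileNotFoundError(f'no .pth checkpoints found in {weights_path}')
    return max(weights, key=os.path.getmtime)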
def train_cls(opt, cfg):
    training_params = {'batch_size': cfg.batch_size,
                       'shuffle': True,
                       'drop_last': True,
                       'collate_fn': collater,
                       'num_workers': opt.num_workers}

    val_params = {'batch_size': cfg.batch_size,
                  'shuffle': False,
                  'drop_last': True,
                  'collate_fn': collater,
                  'num_workers': opt.num_workers}

    input_sizes = [224, 240, 260, 300, 380, 456, 528, 600]
    # training_set = CocoDataset(
    #     # root_dir=os.path.join(opt.data_path, params.project_name),
    #     root_dir=opt.data_path,
    #     set=params.train_set,
    #     transform=transforms.Compose([Normalizer(mean=params.mean, std=params.std),
    #                                   # AdvProp(),
    #                                   Augmenter(),
    #                                   Resizer(input_sizes[cfg.compound_coef])]))
    training_set = DataGenerator(data_path=os.path.join(opt.data_path, 'Train', 'OriginImage'),
                                 class_ids=cfg.dictionary_class_name.keys(),
                                 transform=transforms.Compose([Augmenter(),
                                                               Normalizer(mean=cfg.mean, std=cfg.std),
                                                               Resizer(input_sizes[cfg.compound_coef])]))
    training_generator = DataLoader(training_set, **training_params)

    # val_set = CocoDataset(
    #     # root_dir=os.path.join(opt.data_path, params.project_name),
    #     root_dir=opt.data_path,
    #     set=params.val_set,
    #     transform=transforms.Compose([Normalizer(mean=params.mean, std=params.std),
    #                                   Resizer(input_sizes[cfg.compound_coef])]))
    val_set = DataGenerator(
        # root_dir=os.path.join(opt.data_path, params.project_name),
        data_path=os.path.join(opt.data_path, 'Validation'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([Normalizer(mean=cfg.mean, std=cfg.std),
                                      Resizer(input_sizes[cfg.compound_coef])]))
    val_generator = DataLoader(val_set, **val_params)

    model = EffNet.from_name(f'efficientnet-b{cfg.compound_coef}',
                             override_params={'num_classes': len(cfg.dictionary_class_name.keys())})

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except Exception:
            last_step = 0

        try:
            ret = model.load_state_dict(torch.load(weights_path), strict=False)
            print(ret)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print('[Warning] Don\'t panic if you see this: '
                  'it usually means you loaded pretrained weights with a different number of classes. '
                  'The rest of the weights should have been loaded already.')

        print(f'[Info] loaded weights: {os.path.basename(weights_path)}, '
              f'resuming checkpoint from step: {last_step}')
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if training heads only
    if cfg.training_layer.lower() == 'heads':
        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] froze backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when using multiple gpus and the batch size per gpu is lower than 4,
    # which is useful when gpu memory is limited: with such small per-gpu batches, batch norm
    # makes training very unstable or slow to converge. sync_bn solves this by normalizing
    # the mini-batches across all gpus as one batch and sending the result back to each gpu,
    # at the cost of slightly slower training.
    if cfg.num_gpus > 1 and cfg.batch_size // cfg.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    # wrap the model with the loss function, to reduce the memory usage on gpu0 and speed up training
    model = EfficientNetWrapper(model)

    if cfg.num_gpus > 0:
        model = model.cuda()
        if cfg.num_gpus > 1:
            model = CustomDataParallel(model, cfg.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    if cfg.optimizer.lower() == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), cfg.learning_rate)
    else:
        optimizer = torch.optim.SGD(model.parameters(), cfg.learning_rate, momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

    # Setup complete, now start training
    now = datetime.datetime.now()
    opt.saved_path = opt.saved_path + f'/trainlogs_{now.strftime("%Y%m%d_%H%M%S")}'
    if opt.log_path is None:
        opt.log_path = opt.saved_path
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    # Write history
    if 'backlog' not in opt.config:
        with open(os.path.join(opt.saved_path, f'{now.strftime("%Y%m%d%H%M%S")}.backlog.json'), 'w') as f:
            backlog = dict(cfg.to_pascal_case())
            backlog['__metadata__'] = 'Backlog at ' + now.strftime("%Y/%m/%d %H:%M:%S")
            json.dump(backlog, f)
    else:
        with open(os.path.join(opt.saved_path, f'{now.strftime("%Y%m%d%H%M%S")}.history.json'), 'w') as f:
            history = dict(cfg.to_pascal_case())
            history['__metadata__'] = now.strftime("%Y/%m/%d %H:%M:%S")
            json.dump(history, f)

    writer = SummaryWriter(opt.log_path + '/tensorboard')

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(cfg.no_epochs):
            # metrics
            correct_preds = 0.

            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                continue

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for iter, data in enumerate(progress_bar):
                if iter < step - last_epoch * num_iter_per_epoch:
                    progress_bar.set_description(
                        f'Skip {iter} < {step} - {last_epoch} * {num_iter_per_epoch}')
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    # with one gpu, send to cuda:0; CustomDataParallel handles the multi-gpu case
                    imgs = imgs.cuda()
                    annot = annot.cuda()

                    optimizer.zero_grad()

                    logits, loss = model(imgs, annot)
                    loss = loss.mean()
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    _, preds = torch.max(logits, dim=1)
                    correct_preds += torch.sum(preds == annot)
                    acc = correct_preds / ((step % num_iter_per_epoch + 1) * cfg.batch_size)

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. '
                        'Loss: {:.5f}. Accuracy: {:.5f}.'.format(
                            step, epoch, cfg.no_epochs, iter + 1, num_iter_per_epoch,
                            float(loss), float(acc)))
                    writer.add_scalars('Loss', {'train': float(loss)}, step)
                    writer.add_scalars('Accuracy', {'train': float(acc)}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1
                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            scheduler.step(np.mean(epoch_loss))

            if epoch % opt.val_interval == 0:
                correct_preds = 0.
                confusion_matrix = torch.zeros(len(cfg.dictionary_class_name),
                                               len(cfg.dictionary_class_name)).cuda()
                model.eval()
                val_losses = []
                for iter, data in enumerate(val_generator):
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        imgs = imgs.cuda()
                        annot = annot.cuda()

                        logits, loss = model(imgs, annot)
                        loss = loss.mean()

                        _, preds = torch.max(logits, dim=1)
                        correct_preds += torch.sum(preds == annot)
                        # Update the confusion matrix (rows: predictions, columns: ground truth)
                        for i, j in zip(preds, annot):
                            confusion_matrix[i, j] += 1

                        if loss == 0 or not torch.isfinite(loss):
                            continue
                        val_losses.append(loss.item())

                val_loss = np.mean(val_losses)
                val_acc = float(correct_preds) / (len(val_generator) * cfg.batch_size)

                progress_bar.set_description(
                    'Val. Epoch: {}/{}. Loss: {:1.5f}. Accuracy: {:1.5f}. '.format(
                        epoch, cfg.no_epochs, val_loss, val_acc))

                # Calculate per-class precision and recall from the confusion matrix
                preds_total = torch.sum(confusion_matrix, dim=1)
                recall_total = torch.sum(confusion_matrix, dim=0)
                precisions = {l: float(confusion_matrix[i, i]) / max(1, preds_total[i].item())
                              for l, i in val_set.classes.items()}
                recalls = {l: float(confusion_matrix[i, i]) / max(1, recall_total[i].item())
                           for l, i in val_set.classes.items()}

                writer.add_scalars('Loss', {'val': val_loss}, step)
                writer.add_scalars('Accuracy', {'val': val_acc}, step)
                writer.add_scalars('Precision', precisions, step)
                writer.add_scalars('Recall', recalls, step)
                print(confusion_matrix)

                if val_loss + opt.es_min_delta < best_loss:
                    best_loss = val_loss
                    best_epoch = epoch
                    save_checkpoint(model, f"{opt.saved_path}/cls_b{cfg.compound_coef}_{epoch}_{step}.pth")

                model.train()

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print('[Info] Stop training at epoch {}. The lowest loss achieved is {}'.format(
                        epoch, best_loss))
                    break

        print(f'[Info] Finished training. Best loss achieved {best_loss} at epoch {best_epoch}.')
    except KeyboardInterrupt:
        save_checkpoint(model, f"{opt.saved_path}/cls_b{cfg.compound_coef}_{epoch}_{step}.pth")
        writer.close()
    writer.close()
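
# ---------------------------------------------------------------------------
# train_cls wraps the classifier in EfficientNetWrapper and expects
# `logits, loss = model(imgs, annot)`. The wrapper is not defined in this file;
# a minimal sketch of the assumed interface, pairing the backbone with
# cross-entropy so the loss is computed on each replica (the same memory trick
# the detection variants use with ModelWithLoss). The `.model` attribute name
# is an assumption consistent with the checkpoint-saving sketch above.
# ---------------------------------------------------------------------------
import torch.nn as nn


class EfficientNetWrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, imgs, labels):
        # returns both the raw logits (for accuracy metrics) and the scalar loss
        logits = self.model(imgs)
        loss = self.criterion(logits, labels.long())
        return logits, loss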
def train():
    if config.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
    else:
        torch.manual_seed(42)

    config.saved_path = config.saved_path + '/{0}/'.format(config.dataset_name)
    config.log_path = config.log_path + '/{0}/'.format(config.dataset_name)
    os.makedirs(config.log_path, exist_ok=True)
    os.makedirs(config.saved_path, exist_ok=True)

    training_params = {'batch_size': config.batch_size,
                       'shuffle': True,
                       'drop_last': True,
                       'collate_fn': collater,
                       'num_workers': config.num_workers}

    val_params = {'batch_size': config.batch_size,
                  'shuffle': False,
                  'drop_last': True,
                  'collate_fn': collater,
                  'num_workers': config.num_workers}

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
    if 'coco' in config.dataset_name:
        DS = CocoDataset
    else:
        DS = PascalVocDataset
    training_set = DS(root_dir=os.path.join(config.data_path, config.dataset_name),
                      set=config.train_set,
                      img_size=input_sizes[config.compound_coef],
                      anchor_free_mode=config.anchor_free_mode,
                      transform=transforms.Compose([Normalizer(mean=config.mean, std=config.std),
                                                    Augmenter(),
                                                    Resizer(input_sizes[config.compound_coef])]))
    training_generator = DataLoader(training_set, **training_params)

    val_set = DS(root_dir=os.path.join(config.data_path, config.dataset_name),
                 set=config.val_set,
                 img_size=input_sizes[config.compound_coef],
                 anchor_free_mode=config.anchor_free_mode,
                 transform=transforms.Compose([Normalizer(mean=config.mean, std=config.std),
                                               Resizer(input_sizes[config.compound_coef])]))
    val_generator = DataLoader(val_set, **val_params)

    model = EfficientDetBackbone(num_classes=len(config.obj_list),
                                 compound_coef=config.compound_coef,
                                 load_weights=False,
                                 anchor_free_mode=config.anchor_free_mode,
                                 ratios=eval(config.anchors_ratios),
                                 scales=eval(config.anchors_scales))
    init_weights(model)

    last_step = 0
    # load last weights
    if config.load_weights:
        # First initialize the network parameters with init_weights and only then restore,
        # so that any parameters missing from the checkpoint are still properly initialized.
        if config.pret_weight_path.endswith('.pth'):
            weights_path = config.pret_weight_path
        try:
            model_dict = torch.load(weights_path)
            new_dict = {}
            for k, v in model_dict.items():
                if 'header' not in k:
                    new_dict[k] = v
            ret = model.load_state_dict(new_dict, strict=False)
        except RuntimeError as e:
            print('[Warning] Ignoring {0}'.format(e))
            print('[Warning] Don\'t panic if you see this: it usually means you loaded pretrained '
                  'weights with a different number of classes. The rest of the weights should have '
                  'been loaded already.')

        print('[Info] loaded pretrained weights: {0}'.format(weights_path))

    if config.head_only:
        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] froze backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when using multiple gpus and the batch size per gpu is lower than 4,
    # which is useful when gpu memory is limited: with such small per-gpu batches, batch norm
    # makes training very unstable or slow to converge. sync_bn solves this by normalizing
    # the mini-batches across all gpus as one batch and sending the result back to each gpu,
    # at the cost of slightly slower training.
    if config.num_gpus > 1 and config.batch_size // config.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    writer = SummaryWriter(config.log_path + '/{0}/'.format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S")))

    # wrap the model with the loss function, to reduce the memory usage on gpu0 and speed up training
    model = ModelWithLoss(model, debug=config.debug)

    if config.num_gpus > 0:
        model = model.cuda()
        if config.num_gpus > 1:
            model = CustomDataParallel(model, config.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    if config.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), config.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), config.lr, momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=config.patience,
                                                           verbose=True,
                                                           factor=config.factor,
                                                           min_lr=config.min_lr)

    epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(config.num_epochs):
            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                continue

            epoch_loss = []
            for iter, data in enumerate(training_generator):
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if config.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0;
                        # with multiple gpus, CustomDataParallel does the scattering, not this code
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss = model(imgs, annot, obj_list=config.obj_list)
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    print('Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. '
                          'Reg loss: {:.5f}. Total loss: {:.5f}'.format(
                              step, epoch, config.num_epochs, iter + 1, num_iter_per_epoch,
                              cls_loss.item(), reg_loss.item(), loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss}, step)
                    writer.add_scalars('Classification_loss', {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1
                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            scheduler.step(np.mean(epoch_loss))

            if epoch % config.val_interval == 0 and epoch > config.start_interval:
                model.eval()
                loss_regression_ls = []
                loss_classification_ls = []
                for iter, data in enumerate(val_generator):
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        if config.num_gpus == 1:
                            imgs = imgs.cuda()
                            annot = annot.cuda()

                        cls_loss, reg_loss = model(imgs, annot, obj_list=config.obj_list)
                        cls_loss = cls_loss.mean()
                        reg_loss = reg_loss.mean()

                        loss = cls_loss + reg_loss
                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        loss_classification_ls.append(cls_loss.item())
                        loss_regression_ls.append(reg_loss.item())

                cls_loss = np.mean(loss_classification_ls)
                reg_loss = np.mean(loss_regression_ls)
                loss = cls_loss + reg_loss

                print('Val. Epoch: {}/{}. Classification loss: {:1.5f}. '
                      'Regression loss: {:1.5f}. Total loss: {:1.5f}'.format(
                          epoch, config.num_epochs, cls_loss, reg_loss, loss))
                writer.add_scalars('Loss', {'val': loss}, step)
                writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
                writer.add_scalars('Classification_loss', {'val': cls_loss}, step)

                save_checkpoint(model, 'efficientdet-d{0}_{1}_{2}.pth'.format(
                    config.compound_coef, epoch, step))

                model.train()
    except KeyboardInterrupt:
        save_checkpoint(model, 'efficientdet-d{0}_{1}_{2}.pth'.format(
            config.compound_coef, epoch, step))
        writer.close()
    writer.close()
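
# ---------------------------------------------------------------------------
# All variants fall back to `init_weights(model)` when no checkpoint is given,
# but the helper is not part of this file. A plausible sketch only, assuming a
# standard Kaiming initialization for conv layers and unit-variance batch norm;
# the real repository may well use a different scheme (e.g. a prior-probability
# bias for the classification head).
# ---------------------------------------------------------------------------
import torch.nn as nn


def init_weights(model):
    for module in model.modules():
        if isinstance(module, nn.Conv2d):
            nn.init.kaiming_uniform_(module.weight, a=1)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.BatchNorm2d):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)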
def train(args):  # noqa: C901
    train_start_time = time.perf_counter()
    assert num_gpus > 0, "Found 0 cuda devices, CPU training is not supported."
    total_batch_size = args.batch_size * num_gpus
    assert total_batch_size % args.num_workers == 0, (
        f"batch_size * num_gpus ({total_batch_size}) must be divisible by num_workers "
        f"({args.num_workers}).")
    with open(os.path.join(args.model_dir, "hyperparameters.yml"), "w") as f:
        yaml.dump(vars(args), f)
    # initialization of tensorboard summary writers
    date_time = datetime.datetime.now().strftime(_STRFTIME_FORMAT)
    writer = SummaryWriter(os.path.join(args.tensorboard_dir, f"logs/{date_time}"))
    train_writer = SummaryWriter(os.path.join(args.tensorboard_dir, f"logs/{date_time}/train"))
    val_writer = SummaryWriter(os.path.join(args.tensorboard_dir, f"logs/{date_time}/val"))
    # get weights path, selecting the best weights if weights == "best"
    weights_path = _get_weights_path(args.weights_dir, args.weights)
    # create the correct data structure, splitting input data into train and val sets
    prepare_annotations(args.data_dir, args.classes, ["train", "val"])
    torch.cuda.manual_seed(args.seed)
    train_loader = _get_train_data_loader(args)
    val_loader = _get_val_data_loader(args)
    model = EfficientDetBackbone(
        num_classes=len(args.classes),
        compound_coef=args.compound_coef,
        ratios=args.anchors_ratios,
        scales=args.anchors_scales,
    )
    _init_weights(model, weights_path)
    if args.freeze_backbone:
        logger.info("Freezing backbone")
        model.apply(_freeze_submodule_if_backbone)
    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # use synchronized batch normalization when the batch size per gpu is too small
    if args.batch_size < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
        logger.info("Using Synchronized Batch Normalization")
    else:
        use_sync_bn = False
    # wrap the model with the loss function, to reduce the memory usage on gpu0 and speed up training
    model = ModelWithLoss(model)
    model = model.cuda()
    if num_gpus > 1:
        # TODO: see if there is a better way to parallelize
        model = CustomDataParallel(model, num_gpus)
        if use_sync_bn:
            patch_replication_callback(model)
    steps_per_epoch = len(train_loader)
    last_step, es_baseline = _get_last_step_and_es_baseline(weights_path, args.resume_training)
    es = EarlyStopping(args, baseline=es_baseline, best_epoch=last_step // steps_per_epoch - 1)
    optimizer = _get_optimizer(model, args)
    scheduler = _get_scheduler(optimizer, steps_per_epoch, args)
    model.train()
    logger.info(f"Starting training from step {last_step}")
    for epoch in range(args.epochs):
        if epoch in args.milestones:
            for group in optimizer.param_groups:
                if args.scheduler == "onecyclelr":
                    group["max_lr"] *= args.multisteplr_gamma
                    group["min_lr"] *= args.multisteplr_gamma
                else:
                    group["lr"] *= args.multisteplr_gamma
        last_epoch = last_step // steps_per_epoch
        if epoch < last_epoch:
            if scheduler is not None:
                for _ in range(steps_per_epoch):
                    scheduler.step()
            continue
        train_loader_iter = iter(train_loader)
        for batch_idx in range(steps_per_epoch):
            iter_start_time = time.perf_counter()
            data_start_time = time.perf_counter()
            data = next(train_loader_iter)
            data_time = time.perf_counter() - data_start_time
            if batch_idx < (last_step - last_epoch * steps_per_epoch):
                if scheduler is not None:
                    scheduler.step()
                continue
            imgs = data["img"]
            annotations = data["annot"]
            # if only one gpu, just send it to cuda:0; with multiple gpus,
            # CustomDataParallel takes care of the scattering
            if num_gpus == 1:
                imgs = imgs.cuda()
                annotations = annotations.cuda()
            optimizer.zero_grad()
            loss_cls, loss_box_reg = model(imgs, annotations)
            loss_cls = loss_cls.mean()
            loss_box_reg = loss_box_reg.mean()
            total_loss = loss_cls + loss_box_reg
            if total_loss == 0 or not torch.isfinite(total_loss):
                continue
            total_loss.backward()
            if args.clip_gradients_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_gradients_norm)
            lr = optimizer.param_groups[0]["lr"]
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            date_time = datetime.datetime.now().strftime("%m/%d %H:%M:%S")
            eta = datetime.timedelta(seconds=round(time.perf_counter() - train_start_time))
            max_mem_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
            iter_time = time.perf_counter() - iter_start_time
            logger.info(f"[{date_time} train]: "
                        f"eta: {eta} "
                        f"epoch: {epoch + 1}/{args.epochs} "
                        f"batch: {batch_idx + 1}/{steps_per_epoch} "
                        f"loss_cls: {loss_cls.item():.4f} "
                        f"loss_box_reg: {loss_box_reg.item():.4f} "
                        f"total_loss: {total_loss.item():.4f} "
                        f"time: {iter_time:.4f} "
                        f"data_time: {data_time:.4f} "
                        f"lr: {lr:.6f} "
                        f"max_mem: {max_mem_mb:.0f}M")
            writer.add_scalar("hp/lr", lr, last_step)
            if args.cycle_momentum:
                momentum = optimizer.param_groups[0]["momentum"]
                writer.add_scalar("hp/momentum", momentum, last_step)
            writer.add_scalar("usage/max_mem", max_mem_mb, last_step)
            writer.flush()
            train_writer.add_scalar("loss/total_loss", total_loss.item(), last_step)
            train_writer.add_scalar("loss/loss_cls", loss_cls.item(), last_step)
            train_writer.add_scalar("loss/loss_box_reg", loss_box_reg.item(), last_step)
            train_writer.add_scalar("time/time", iter_time, last_step)
            train_writer.add_scalar("time/data_time", data_time, last_step)
            train_writer.flush()
            last_step += 1
        # See https://github.com/pytorch/pytorch/issues/1355#issuecomment-658660582.
        del train_loader_iter
        if epoch % args.val_interval == 0 or epoch + 1 == args.epochs:
            total_val_loss = validate(model, val_loader, last_step - 1, epoch, args.epochs, val_writer)
            _save_model(
                model,
                args.checkpoints_dir,
                args.compound_coef,
                epoch,
                last_step,
                total_val_loss,
            )
            if es.step(epoch, total_val_loss):
                break
            model.train()
    model_params = {
        "classes": args.classes,
        "compound_coef": args.compound_coef,
        "anchors_scales": args.anchors_scales,
        "anchors_ratios": args.anchors_ratios,
    }
    with open(os.path.join(args.model_dir, "model_params.yml"), "w") as f:
        yaml.dump(model_params, f)
    writer.close()
    train_writer.close()
    val_writer.close()
    best_weights_path = _get_best_weights_path(args.checkpoints_dir)
    shutil.copyfile(best_weights_path, os.path.join(args.model_dir, "model.pth"))
    evaluate(
        args.model_dir,
        args.data_dir,
        eval_set="val",
        threshold=args.eval_threshold,
        nms_threshold=args.eval_nms_threshold,
        max_imgs=args.eval_max_imgs,
        use_float16=args.use_float16,
        device=args.eval_device,
    )
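
# ---------------------------------------------------------------------------
# The variant above delegates stopping to an EarlyStopping object whose
# `step(epoch, total_val_loss)` returns True when training should end. The
# class is not shown in this file; below is a minimal sketch matching that
# call site. `args.es_patience` and `args.es_min_delta` are assumptions,
# mirroring the opt.es_* flags the other variants read directly.
# ---------------------------------------------------------------------------
class EarlyStopping:
    def __init__(self, args, baseline=float('inf'), best_epoch=-1):
        self.patience = args.es_patience
        self.min_delta = args.es_min_delta
        self.best_loss = baseline
        self.best_epoch = best_epoch

    def step(self, epoch, val_loss):
        # track the best validation loss, then stop once no improvement has
        # been seen for more than `patience` epochs (patience <= 0 disables it)
        if val_loss + self.min_delta < self.best_loss:
            self.best_loss = val_loss
            self.best_epoch = epoch
        return 0 < self.patience < epoch - self.best_epoch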
def train(opt):
    params = Params(f'projects/{opt.project}.yml')

    if opt.project == "vcoco":
        num_obj_class = 90
        num_union_action = 25
        num_inst_action = 51
    else:
        assert opt.project == "hico-det"
        num_obj_class = 90
        num_union_action = 117
        num_inst_action = 234

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    opt.saved_path = opt.saved_path + f'/{params.project_name}/'
    opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    training_params = {'batch_size': opt.batch_size,
                       'shuffle': True,
                       'drop_last': True,
                       'collate_fn': collater,
                       'num_workers': opt.num_workers,
                       'pin_memory': False}

    val_params = {'batch_size': opt.batch_size * 2,
                  'shuffle': False,
                  'drop_last': True,
                  'collate_fn': collater,
                  'num_workers': opt.num_workers,
                  'pin_memory': False}

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
    train_transform = transforms.Compose([Normalizer(mean=params.mean, std=params.std),
                                          Augmenter(),
                                          Resizer(input_sizes[opt.compound_coef])])
    val_transform = transforms.Compose([Normalizer(mean=params.mean, std=params.std),
                                        Resizer(input_sizes[opt.compound_coef])])

    if opt.project == "vcoco":
        training_set = VCOCO_Dataset(root_dir="./datasets/vcoco", set=params.train_set,
                                     color_prob=1, transform=train_transform)
        val_set = VCOCO_Dataset(root_dir="./datasets/vcoco", set=params.val_set,
                                transform=val_transform)
    else:
        training_set = HICO_DET_Dataset(root_dir="datasets/hico_20160224_det", set="train",
                                        color_prob=1, transform=train_transform)
        val_set = HICO_DET_Dataset(root_dir="datasets/hico_20160224_det", set="test",
                                   transform=val_transform)

    training_generator = DataLoader(training_set, **training_params)
    val_generator = DataLoader(val_set, **val_params)

    model = EfficientDetBackbone(num_classes=num_obj_class,
                                 num_union_classes=num_union_action,
                                 num_inst_classes=num_inst_action,
                                 compound_coef=opt.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))
    model.train()

    print("num_classes:", num_obj_class)
    print("num_union_classes:", num_union_action)
    print("instance_action_list", num_inst_action)

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
            # last_epoch = int(os.path.basename(weights_path).split('_')[-2].split('.')[0]) + 1
            # last_step = last_epoch * len(training_generator)
        except Exception:
            last_step = 0

        try:
            init_weights(model)
            print(weights_path)
            model_dict = model.state_dict()
            pretrained_dict = torch.load(weights_path, map_location=torch.device('cpu'))
            new_pretrained_dict = {}
            for k, v in pretrained_dict.items():
                if k in model_dict:
                    new_pretrained_dict[k] = v
                elif ("instance_branch.object_" + k) in model_dict:
                    new_pretrained_dict["instance_branch.object_" + k] = v
            ret = model.load_state_dict(new_pretrained_dict, strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print('[Warning] Don\'t panic if you see this: it usually means you loaded pretrained '
                  'weights with a different number of classes. The rest of the weights should have '
                  'been loaded already.')

        print(f'[Info] loaded weights: {os.path.basename(weights_path)}, '
              f'resuming checkpoint from step: {last_step}')
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if training head only
    if opt.head_only:
        model.apply(freeze_backbone)
        freeze_bn_backbone(model)
        print('[Info] froze backbone')

    if opt.freeze_object_detection:
        freeze_object_detection(model)
        freeze_bn_object_detection(model)
        # model.apply(freeze_object_detection)
        print('[Info] froze object detection branch')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when using multiple gpus and the batch size per gpu is lower than 8,
    # which is useful when gpu memory is limited: with such small per-gpu batches, batch norm
    # makes training very unstable or slow to converge. sync_bn solves this by normalizing
    # the mini-batches across all gpus as one batch and sending the result back to each gpu,
    # at the cost of slightly slower training.
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 8:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    writer = SummaryWriter(opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    # wrap the model with the loss function, to reduce the memory usage on gpu0 and speed up training
    model = ModelWithLoss(model, dataset=opt.project, debug=opt.debug)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = CustomDataParallel(model, params.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)
            if opt.head_only:
                print('[Info] froze SyncBN in backbone')
                freeze_bn_backbone(model.module.model)
            if opt.freeze_object_detection:
                print('[Info] froze SyncBN in object detection branch')
                freeze_bn_object_detection(model.module.model)

    # only optimize parameters that still require gradients
    if opt.optim == 'adamw':
        optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), opt.lr)
    elif opt.optim == "adam":
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), opt.lr)
    else:
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), opt.lr,
                                    momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2,
                                                           verbose=True, min_lr=1e-7)

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    num_iter_per_epoch = (len(training_generator) + opt.accumulate_batch - 1) // opt.accumulate_batch

    try:
        for epoch in range(opt.num_epochs):
            last_epoch = step // num_iter_per_epoch + 1
            if epoch < last_epoch:
                continue

            if epoch in [120, 130]:
                optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / 10

            epoch_loss = []
            for iter, data in enumerate(training_generator):
                try:
                    imgs = data['img']
                    annot = data['annot']
                    # torch.cuda.empty_cache()

                    if params.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0;
                        # with multiple gpus, CustomDataParallel does the scattering, not this code
                        imgs = imgs.cuda()
                        for key in annot:
                            annot[key] = annot[key].cuda()

                    union_act_cls_loss, union_sub_reg_loss, union_obj_reg_loss, union_diff_reg_loss, \
                        inst_act_cls_loss, inst_obj_cls_loss, inst_obj_reg_loss = model(
                            imgs, annot["instance"], annot["interaction"])

                    union_act_cls_loss = union_act_cls_loss.mean()
                    union_sub_reg_loss = union_sub_reg_loss.mean()
                    union_obj_reg_loss = union_obj_reg_loss.mean()
                    union_diff_reg_loss = union_diff_reg_loss.mean()
                    inst_act_cls_loss = inst_act_cls_loss.mean()
                    inst_obj_cls_loss = inst_obj_cls_loss.mean()
                    inst_obj_reg_loss = inst_obj_reg_loss.mean()

                    union_loss = union_act_cls_loss + union_sub_reg_loss + union_obj_reg_loss + union_diff_reg_loss
                    instance_loss = inst_act_cls_loss + inst_obj_cls_loss + inst_obj_reg_loss
                    loss = union_loss + inst_act_cls_loss

                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    # gradient accumulation: scale the loss, step every `accumulate_batch` iterations
                    batch_loss = loss / opt.accumulate_batch
                    batch_loss.backward()
                    if (iter + 1) % opt.accumulate_batch == 0 or iter == len(training_generator) - 1:
                        optimizer.step()
                        optimizer.zero_grad()
                        step += 1

                    loss = loss.item()
                    union_loss = union_loss.item()
                    instance_loss = instance_loss.item()
                    epoch_loss.append(float(loss))
                    current_lr = optimizer.param_groups[0]['lr']

                    if step % opt.log_interval == 0:
                        writer.add_scalars('Union Action Classification Loss',
                                           {'train': union_act_cls_loss}, step)
                        writer.add_scalars('Union Subject Regression Loss',
                                           {'train': union_sub_reg_loss}, step)
                        writer.add_scalars('Union Object Regression Loss',
                                           {'train': union_obj_reg_loss}, step)
                        writer.add_scalars('Union Diff Regression Loss',
                                           {'train': union_diff_reg_loss}, step)
                        writer.add_scalars('Instance Action Classification Loss',
                                           {'train': inst_act_cls_loss}, step)
                        writer.add_scalars('Instance Object Classification Loss',
                                           {'train': inst_obj_cls_loss}, step)
                        writer.add_scalars('Instance Regression Loss',
                                           {'train': inst_obj_reg_loss}, step)
                        writer.add_scalars('Total Loss', {'train': loss}, step)
                        writer.add_scalars('Union Loss', {'train': union_loss}, step)
                        writer.add_scalars('Instance Loss', {'train': instance_loss}, step)

                        # log learning_rate
                        writer.add_scalar('learning_rate', current_lr, step)

                    if iter % 20 == 0:
                        print('Step: {}. Epoch: {}/{}. Iteration: {}/{}. Union loss: {:.5f}. '
                              'Instance loss: {:.5f}. Total loss: {:.5f}. Learning rate: {:.5f}'.format(
                                  step, epoch, opt.num_epochs, (iter + 1) // opt.accumulate_batch,
                                  num_iter_per_epoch, union_loss, instance_loss, loss, current_lr))

                    if step % opt.save_interval == 0 and step > 0:
                        save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
                        print('checkpoint...')

                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            # scheduler.step(np.mean(epoch_loss))

            if epoch % opt.val_interval == 0:
                # model.eval()
                union_loss_ls = []
                instance_loss_ls = []
                union_act_cls_loss_ls = []
                union_obj_cls_loss_ls = []
                union_act_reg_loss_ls = []
                union_sub_reg_loss_ls = []
                union_obj_reg_loss_ls = []
                union_diff_reg_loss_ls = []
                inst_act_cls_loss_ls = []
                inst_obj_cls_loss_ls = []
                inst_obj_reg_loss_ls = []
                val_loss = []
                for iter, data in enumerate(val_generator):
                    if (iter + 1) % 50 == 0:
                        print("%d/%d" % (iter + 1, len(val_generator)))
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        if params.num_gpus == 1:
                            imgs = imgs.cuda()
                            for key in annot:
                                annot[key] = annot[key].cuda()

                        union_act_cls_loss, union_sub_reg_loss, union_obj_reg_loss, union_diff_reg_loss, \
                            inst_act_cls_loss, inst_obj_cls_loss, inst_obj_reg_loss = model(
                                imgs, annot["instance"], annot["interaction"])

                        union_act_cls_loss = union_act_cls_loss.mean()
                        union_sub_reg_loss = union_sub_reg_loss.mean()
                        union_obj_reg_loss = union_obj_reg_loss.mean()
                        union_diff_reg_loss = union_diff_reg_loss.mean()
                        inst_act_cls_loss = inst_act_cls_loss.mean()
                        inst_obj_cls_loss = inst_obj_cls_loss.mean()
                        inst_obj_reg_loss = inst_obj_reg_loss.mean()

                        union_loss = union_act_cls_loss + union_sub_reg_loss + union_obj_reg_loss + union_diff_reg_loss
                        instance_loss = inst_act_cls_loss + inst_obj_cls_loss + inst_obj_reg_loss
                        loss = union_loss + inst_act_cls_loss

                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        val_loss.append(loss.item())
                        union_act_cls_loss_ls.append(union_act_cls_loss.item())
                        union_sub_reg_loss_ls.append(union_sub_reg_loss.item())
                        union_obj_reg_loss_ls.append(union_obj_reg_loss.item())
                        union_diff_reg_loss_ls.append(union_diff_reg_loss.item())
                        # union_obj_cls_loss_ls.append(union_obj_cls_loss.item())
                        # union_act_reg_loss_ls.append(union_act_reg_loss.item())
                        inst_act_cls_loss_ls.append(inst_act_cls_loss.item())
                        inst_obj_cls_loss_ls.append(inst_obj_cls_loss.item())
                        inst_obj_reg_loss_ls.append(inst_obj_reg_loss.item())
                        union_loss_ls.append(union_loss.item())
                        instance_loss_ls.append(instance_loss.item())

                union_loss = np.mean(union_loss_ls)
                instance_loss = np.mean(instance_loss_ls)
                union_act_cls_loss = np.mean(union_act_cls_loss_ls)
                union_sub_reg_loss = np.mean(union_sub_reg_loss_ls)
                union_obj_reg_loss = np.mean(union_obj_reg_loss_ls)
                union_diff_reg_loss = np.mean(union_diff_reg_loss_ls)
                inst_act_cls_loss = np.mean(inst_act_cls_loss_ls)
                inst_obj_cls_loss = np.mean(inst_obj_cls_loss_ls)
                inst_obj_reg_loss = np.mean(inst_obj_reg_loss_ls)
                loss = union_loss + inst_act_cls_loss

                print('Val. Epoch: {}/{}. Union loss: {:1.5f}. Instance loss: {:1.5f}. '
                      'Total loss: {:1.5f}'.format(epoch, opt.num_epochs,
                                                   union_loss, instance_loss, loss))

                writer.add_scalars('Union Action Classification Loss',
                                   {'val': union_act_cls_loss}, step)
                writer.add_scalars('Union Subject Regression Loss',
                                   {'val': union_sub_reg_loss}, step)
                writer.add_scalars('Union Object Regression Loss',
                                   {'val': union_obj_reg_loss}, step)
                writer.add_scalars('Union Diff Regression Loss',
                                   {'val': union_diff_reg_loss}, step)
                writer.add_scalars('Instance Action Classification Loss',
                                   {'val': inst_act_cls_loss}, step)
                writer.add_scalars('Instance Object Classification Loss',
                                   {'val': inst_obj_cls_loss}, step)
                writer.add_scalars('Instance Regression Loss',
                                   {'val': inst_obj_reg_loss}, step)
                writer.add_scalars('Total Loss', {'val': loss}, step)
                writer.add_scalars('Union Loss', {'val': union_loss}, step)
                writer.add_scalars('Instance Loss', {'val': instance_loss}, step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')

                # model.train()
                # scheduler.step()
                scheduler.step(np.mean(val_loss))
                if optimizer.param_groups[0]['lr'] < opt.lr / 100:
                    break

                # Early stopping
                # if epoch - best_epoch > opt.es_patience > 0:
                #     print('[Info] Stop training at epoch {}. The lowest loss achieved is {}'.format(epoch, loss))
                #     break
    except KeyboardInterrupt:
        save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
        writer.close()
    writer.close()
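
# ---------------------------------------------------------------------------
# The HOI variant above calls freeze helpers (freeze_backbone, freeze_bn_backbone,
# freeze_object_detection, freeze_bn_object_detection) that are not defined in
# this file. A sketch of the assumed freeze_bn_backbone only: it puts every
# BatchNorm layer in the backbone submodules into eval mode so running
# statistics stay frozen while the heads train. The `backbone_net`/`bifpn`
# attribute names are assumptions about the model layout.
# ---------------------------------------------------------------------------
import torch.nn as nn


def freeze_bn_backbone(model):
    for name, module in model.named_modules():
        if isinstance(module, nn.BatchNorm2d) and ('backbone_net' in name or 'bifpn' in name):
            module.eval()  # stop updating running mean/var
            module.weight.requires_grad = False
            module.bias.requires_grad = False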
def train(opt): params = Params(f'projects/{opt.project}.yml') # Neptune stuff all_params = opt.__dict__ all_params.update(params.params) data_path = os.path.join(opt.data_path, params.project_name) tags = [ 'EfficientDet', f'D{opt.compound_coef}', f'bs{opt.batch_size}', opt.optim ] if opt.head_only: tags.append('head_only') if len(params.obj_list) == 1: tags.append('one_class') if opt.no_aug: tags.append('no_aug') neptune.create_experiment(name='EfficientDet', tags=tags, params=all_params, upload_source_files=['train.py', 'coco_eval.py']) log_data_version(data_path) if params.num_gpus == 0: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if torch.cuda.is_available(): torch.cuda.manual_seed(42) else: torch.manual_seed(42) opt.saved_path = os.path.join(opt.saved_path, params.project_name) opt.log_path = os.path.join(opt.log_path, params.project_name, 'tensorboard/') os.makedirs(opt.log_path, exist_ok=True) os.makedirs(opt.saved_path, exist_ok=True) training_params = { 'batch_size': opt.batch_size, 'shuffle': True, 'drop_last': True, 'collate_fn': collater, 'num_workers': opt.num_workers } val_params = { 'batch_size': opt.batch_size, 'shuffle': False, 'drop_last': True, 'collate_fn': collater, 'num_workers': opt.num_workers } input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536] if opt.no_aug: transform_list = [ Normalizer(mean=params.mean, std=params.std), Resizer(input_sizes[opt.compound_coef]) ] else: transform_list = [ Normalizer(mean=params.mean, std=params.std), Augmenter(), Resizer(input_sizes[opt.compound_coef]) ] training_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name), set=params.train_set, transform=transforms.Compose(transform_list)) training_generator = DataLoader(training_set, **training_params) val_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name), set=params.val_set, transform=transforms.Compose([ Normalizer(mean=params.mean, std=params.std), Resizer(input_sizes[opt.compound_coef]) ])) val_generator = DataLoader(val_set, **val_params) model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=opt.compound_coef, ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales)) # load last weights if opt.load_weights is not None: if opt.load_weights.endswith('.pth'): weights_path = opt.load_weights else: weights_path = get_last_weights(opt.saved_path) try: last_step = int( os.path.basename(weights_path).split('_')[-1].split('.')[0]) except: last_step = 0 try: ret = model.load_state_dict(torch.load(weights_path), strict=False) except RuntimeError as e: print(f'[Warning] Ignoring {e}') print( '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already.' ) print( f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}' ) else: last_step = 0 print('[Info] initializing weights...') init_weights(model) # freeze backbone if train head_only if opt.head_only: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ['EfficientNet', 'BiFPN']: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch # apply sync_bn when using multiple gpu and batch_size per gpu is lower than 4 # useful when gpu memory is limited. 
# because when bn is disable, the training will be very unstable or slow to converge, # apply sync_bn can solve it, # by packing all mini-batch across all gpus as one batch and normalize, then send it back to all gpus. # but it would also slow down the training by a little bit. if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4: model.apply(replace_w_sync_bn) use_sync_bn = True else: use_sync_bn = False writer = SummaryWriter( opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/') # warp the model with loss function, to reduce the memory usage on gpu0 and speedup model = ModelWithLoss(model, debug=opt.debug) if params.num_gpus > 0: model = model.cuda() if params.num_gpus > 1: model = CustomDataParallel(model, params.num_gpus) if use_sync_bn: patch_replication_callback(model) if opt.optim == 'adamw': optimizer = torch.optim.AdamW(model.parameters(), opt.lr) else: optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=opt.momentum, nesterov=True) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) epoch = 0 best_loss = 1e5 best_epoch = 0 best_step = 0 best_checkpoint = None step = max(0, last_step) model.train() num_iter_per_epoch = len(training_generator) try: for epoch in range(opt.num_epochs): last_epoch = step // num_iter_per_epoch if epoch < last_epoch: continue epoch_loss = [] epoch_cls_loss = [] epoch_reg_loss = [] if epoch % opt.val_interval == 0: model.eval() loss_regression_ls = [] loss_classification_ls = [] for iter, data in enumerate(val_generator): with torch.no_grad(): imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: imgs = imgs.cuda() annot = annot.cuda() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) loss = cls_loss + reg_loss print( 'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. 
Total loss: {:1.5f}' .format(epoch, opt.num_epochs, cls_loss, reg_loss, loss)) writer.add_scalars('Loss', {'val': loss}, step) writer.add_scalars('Regression Loss', {'val': reg_loss}, step) writer.add_scalars('Classfication Loss', {'val': cls_loss}, step) neptune.log_metric('Val Loss', step, loss) neptune.log_metric('Val Regression Loss', step, reg_loss) neptune.log_metric('Val Classification Loss', step, cls_loss) with torch.no_grad(): stats = evaluate(model.model, params.params, threshold=opt.val_threshold, step=step) neptune.log_metric('AP at IoU=.50:.05:.95', step, stats[0]) neptune.log_metric('AP at IoU=.50', step, stats[1]) neptune.log_metric('AP at IoU=.75', step, stats[2]) neptune.log_metric('AR given 1 detection per image', step, stats[6]) neptune.log_metric('AR given 10 detection per image', step, stats[7]) neptune.log_metric('AR given 100 detection per image', step, stats[8]) if loss + opt.es_min_delta < best_loss: best_loss = loss best_epoch = epoch best_step = step checkpoint_name = f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' checkpoint_path = save_checkpoint(model, opt.saved_path, checkpoint_name) best_checkpoint = checkpoint_path model.train() progress_bar = tqdm(training_generator) for iter, data in enumerate(progress_bar): if iter < step - last_epoch * num_iter_per_epoch: progress_bar.update() continue try: imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() optimizer.zero_grad() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list, step=step) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() epoch_loss.append(float(loss)) epoch_cls_loss.append(float(cls_loss)) epoch_reg_loss.append(float(reg_loss)) progress_bar.set_description( 'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. Total loss: {:.5f}' .format(step, epoch, opt.num_epochs, iter + 1, num_iter_per_epoch, cls_loss.item(), reg_loss.item(), loss.item())) writer.add_scalars('Loss', {'train': loss}, step) writer.add_scalars('Regression_loss', {'train': reg_loss}, step) writer.add_scalars('Classfication_loss', {'train': cls_loss}, step) neptune.log_metric('Train Loss', step, loss) neptune.log_metric('Train Regression Loss', step, reg_loss) neptune.log_metric('Train Classification Loss', step, cls_loss) # log learning_rate current_lr = optimizer.param_groups[0]['lr'] writer.add_scalar('learning_rate', current_lr, step) neptune.log_metric('Learning Rate', step, current_lr) step += 1 if step % opt.save_interval == 0 and step > 0: save_checkpoint( model, opt.saved_path, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' ) print('checkpoint...') except Exception as e: print('[Error]', traceback.format_exc()) print(e) continue scheduler.step(np.mean(epoch_loss)) neptune.log_metric('Epoch Loss', step, np.mean(epoch_loss)) neptune.log_metric('Epoch Classification Loss', step, np.mean(epoch_cls_loss)) neptune.log_metric('Epoch Regression Loss', step, np.mean(epoch_reg_loss)) # Early stopping if epoch - best_epoch > opt.es_patience > 0: print( '[Info] Stop training at epoch {}. 
The lowest loss achieved is {}' .format(epoch, best_loss)) break except KeyboardInterrupt: save_checkpoint( model, opt.saved_path, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth') send_best_checkpoint(best_checkpoint, best_step) writer.close() writer.close() send_best_checkpoint(best_checkpoint, best_step) neptune.stop()
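# The Neptune variant above keeps the best checkpoint behind an improvement
# threshold (opt.es_min_delta) and stops once the learning rate has decayed
# below lr / 100; most other variants instead break out of training via the
# `epoch - best_epoch > es_patience > 0` test. A framework-free restatement
# of that early-stopping bookkeeping (class name and API are illustrative):
class EarlyStopper:
    def __init__(self, min_delta=0.0, patience=5):
        self.min_delta = min_delta
        self.patience = patience
        self.best_loss = float('inf')
        self.best_epoch = 0

    def update(self, epoch, val_loss):
        """Record this epoch's val loss; return True when training should stop."""
        if val_loss + self.min_delta < self.best_loss:
            self.best_loss = val_loss
            self.best_epoch = epoch  # the scripts save a checkpoint here
        # patience <= 0 disables early stopping, mirroring the scripts' test
        return self.patience > 0 and (epoch - self.best_epoch) > self.patience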
def start_training(self): if self.system_dict["params"]["num_gpus"] == 0: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if torch.cuda.is_available(): torch.cuda.manual_seed(42) else: torch.manual_seed(42) self.system_dict["params"]["saved_path"] = self.system_dict["params"][ "saved_path"] + "/" + self.system_dict["params"][ "project_name"] + "/" self.system_dict["params"]["log_path"] = self.system_dict["params"][ "log_path"] + "/" + self.system_dict["params"][ "project_name"] + "/tensorboard/" os.makedirs(self.system_dict["params"]["saved_path"], exist_ok=True) os.makedirs(self.system_dict["params"]["log_path"], exist_ok=True) training_params = { 'batch_size': self.system_dict["params"]["batch_size"], 'shuffle': True, 'drop_last': True, 'collate_fn': collater, 'num_workers': self.system_dict["params"]["num_workers"] } val_params = { 'batch_size': self.system_dict["params"]["batch_size"], 'shuffle': False, 'drop_last': True, 'collate_fn': collater, 'num_workers': self.system_dict["params"]["num_workers"] } input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] training_set = CocoDataset( self.system_dict["dataset"]["train"]["root_dir"], self.system_dict["dataset"]["train"]["coco_dir"], self.system_dict["dataset"]["train"]["img_dir"], set_dir=self.system_dict["dataset"]["train"]["set_dir"], transform=transforms.Compose([ Normalizer(mean=self.system_dict["params"]["mean"], std=self.system_dict["params"]["std"]), Augmenter(), Resizer( input_sizes[self.system_dict["params"]["compound_coef"]]) ])) training_generator = DataLoader(training_set, **training_params) if (self.system_dict["dataset"]["val"]["status"]): val_set = CocoDataset( self.system_dict["dataset"]["val"]["root_dir"], self.system_dict["dataset"]["val"]["coco_dir"], self.system_dict["dataset"]["val"]["img_dir"], set_dir=self.system_dict["dataset"]["val"]["set_dir"], transform=transforms.Compose([ Normalizer(self.system_dict["params"]["mean"], self.system_dict["params"]["std"]), Resizer(input_sizes[self.system_dict["params"] ["compound_coef"]]) ])) val_generator = DataLoader(val_set, **val_params) print("") print("") model = EfficientDetBackbone( num_classes=len(self.system_dict["params"]["obj_list"]), compound_coef=self.system_dict["params"]["compound_coef"], ratios=eval(self.system_dict["params"]["anchors_ratios"]), scales=eval(self.system_dict["params"]["anchors_scales"])) os.makedirs("pretrained_weights", exist_ok=True) if (self.system_dict["params"]["compound_coef"] == 0): if (not os.path.isfile( self.system_dict["params"]["load_weights"])): print("Downloading weights") cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d0.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd) elif (self.system_dict["params"]["compound_coef"] == 1): if (not os.path.isfile( self.system_dict["params"]["load_weights"])): print("Downloading weights") cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d1.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd) elif (self.system_dict["params"]["compound_coef"] == 2): if (not os.path.isfile( self.system_dict["params"]["load_weights"])): print("Downloading weights") cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d2.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd) elif (self.system_dict["params"]["compound_coef"] == 3): if (not os.path.isfile( self.system_dict["params"]["load_weights"])): 
print("Downloading weights") cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d3.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd) elif (self.system_dict["params"]["compound_coef"] == 4): if (not os.path.isfile( self.system_dict["params"]["load_weights"])): print("Downloading weights") cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d4.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd) elif (self.system_dict["params"]["compound_coef"] == 5): if (not os.path.isfile( self.system_dict["params"]["load_weights"])): print("Downloading weights") cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d5.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd) elif (self.system_dict["params"]["compound_coef"] == 6): if (not os.path.isfile( self.system_dict["params"]["load_weights"])): print("Downloading weights") cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d6.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd) elif (self.system_dict["params"]["compound_coef"] == 7): if (not os.path.isfile( self.system_dict["params"]["load_weights"])): print("Downloading weights") cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d7.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd) # load last weights if self.system_dict["params"]["load_weights"] is not None: if self.system_dict["params"]["load_weights"].endswith('.pth'): weights_path = self.system_dict["params"]["load_weights"] else: weights_path = get_last_weights( self.system_dict["params"]["saved_path"]) try: last_step = int( os.path.basename(weights_path).split('_')[-1].split('.') [0]) except: last_step = 0 try: ret = model.load_state_dict(torch.load(weights_path), strict=False) except RuntimeError as e: print(f'[Warning] Ignoring {e}') print( '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already.' 
) print( f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}' ) else: last_step = 0 print('[Info] initializing weights...') init_weights(model) print("") print("") # freeze backbone if train head_only if self.system_dict["params"]["head_only"]: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ['EfficientNet', 'BiFPN']: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') print("") print("") if self.system_dict["params"]["num_gpus"] > 1 and self.system_dict[ "params"]["batch_size"] // self.system_dict["params"][ "num_gpus"] < 4: model.apply(replace_w_sync_bn) use_sync_bn = True else: use_sync_bn = False writer = SummaryWriter( self.system_dict["params"]["log_path"] + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/') model = ModelWithLoss(model, debug=self.system_dict["params"]["debug"]) if self.system_dict["params"]["num_gpus"] > 0: model = model.cuda() if self.system_dict["params"]["num_gpus"] > 1: model = CustomDataParallel( model, self.system_dict["params"]["num_gpus"]) if use_sync_bn: patch_replication_callback(model) if self.system_dict["params"]["optim"] == 'adamw': optimizer = torch.optim.AdamW(model.parameters(), self.system_dict["params"]["lr"]) else: optimizer = torch.optim.SGD(model.parameters(), self.system_dict["params"]["lr"], momentum=0.9, nesterov=True) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) epoch = 0 best_loss = 1e5 best_epoch = 0 step = max(0, last_step) model.train() num_iter_per_epoch = len(training_generator) try: for epoch in range(self.system_dict["params"]["num_epochs"]): last_epoch = step // num_iter_per_epoch if epoch < last_epoch: continue epoch_loss = [] progress_bar = tqdm(training_generator) for iter, data in enumerate(progress_bar): if iter < step - last_epoch * num_iter_per_epoch: progress_bar.update() continue try: imgs = data['img'] annot = data['annot'] if self.system_dict["params"]["num_gpus"] == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() optimizer.zero_grad() cls_loss, reg_loss = model( imgs, annot, obj_list=self.system_dict["params"]["obj_list"]) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() epoch_loss.append(float(loss)) progress_bar.set_description( 'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. 
Total loss: {:.5f}' .format(step, epoch, self.system_dict["params"]["num_epochs"], iter + 1, num_iter_per_epoch, cls_loss.item(), reg_loss.item(), loss.item())) writer.add_scalars('Loss', {'train': loss}, step) writer.add_scalars('Regression_loss', {'train': reg_loss}, step) writer.add_scalars('Classfication_loss', {'train': cls_loss}, step) # log learning_rate current_lr = optimizer.param_groups[0]['lr'] writer.add_scalar('learning_rate', current_lr, step) step += 1 if step % self.system_dict["params"][ "save_interval"] == 0 and step > 0: self.save_checkpoint( model, f'efficientdet-d{self.system_dict["params"]["compound_coef"]}_trained.pth' ) #print('checkpoint...') except Exception as e: print('[Error]', traceback.format_exc()) print(e) continue scheduler.step(np.mean(epoch_loss)) if epoch % self.system_dict["params"][ "val_interval"] == 0 and self.system_dict["dataset"][ "val"]["status"]: print("Running validation") model.eval() loss_regression_ls = [] loss_classification_ls = [] for iter, data in enumerate(val_generator): with torch.no_grad(): imgs = data['img'] annot = data['annot'] if self.system_dict["params"]["num_gpus"] == 1: imgs = imgs.cuda() annot = annot.cuda() cls_loss, reg_loss = model( imgs, annot, obj_list=self.system_dict["params"] ["obj_list"]) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) loss = cls_loss + reg_loss print( 'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}' .format(epoch, self.system_dict["params"]["num_epochs"], cls_loss, reg_loss, loss)) writer.add_scalars('Loss', {'val': loss}, step) writer.add_scalars('Regression_loss', {'val': reg_loss}, step) writer.add_scalars('Classfication_loss', {'val': cls_loss}, step) if loss + self.system_dict["params"][ "es_min_delta"] < best_loss: best_loss = loss best_epoch = epoch self.save_checkpoint( model, f'efficientdet-d{self.system_dict["params"]["compound_coef"]}_trained.pth' ) model.train() # Early stopping if epoch - best_epoch > self.system_dict["params"][ "es_patience"] > 0: print( '[Info] Stop training at epoch {}. The lowest loss achieved is {}' .format(epoch, best_loss)) break except KeyboardInterrupt: self.save_checkpoint( model, f'efficientdet-d{self.system_dict["params"]["compound_coef"]}_trained.pth' ) writer.close() writer.close() print("") print("") print("Training complete")
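# The variants resume by parsing the global step out of the checkpoint
# filename (efficientdet-d{coef}_{epoch}_{step}.pth) and fast-forwarding:
# whole epochs below step // iters_per_epoch are skipped, then the already
# seen iterations inside the resumed epoch are skipped one by one. Note the
# class above saves to ..._trained.pth instead, so its int() parse fails and
# last_step falls back to 0. The arithmetic as a standalone helper (the
# helper name is illustrative, not from the scripts):
import os

def resume_point(weights_path, iters_per_epoch):
    """Derive (last_epoch, iters_to_skip) from a step-suffixed checkpoint name."""
    try:
        step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
    except (ValueError, IndexError):
        step = 0
    last_epoch = step // iters_per_epoch
    return last_epoch, step - last_epoch * iters_per_epoch

# e.g. resume_point('efficientdet-d0_3_700.pth', 200) -> (3, 100)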
def train(opt): ''' Input: get_args() Function: Train the model. ''' params = Params(f'projects/{opt.project}.yml') if params.num_gpus == 0: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if torch.cuda.is_available(): torch.cuda.manual_seed(42) else: torch.manual_seed(42) opt.saved_path = opt.saved_path + f'/{params.project_name}/' opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/' os.makedirs(opt.log_path, exist_ok=True) os.makedirs(opt.saved_path, exist_ok=True) # evaluation json file pred_folder = f'{OPT.data_path}/{OPT.project}/predictions' os.makedirs(pred_folder, exist_ok=True) evaluation_pred_file = f'{pred_folder}/instances_bbox_results.json' training_params = { 'batch_size': opt.batch_size, 'shuffle': True, 'drop_last': True, 'collate_fn': collater, 'num_workers': opt.num_workers } val_params = { 'batch_size': opt.batch_size, 'shuffle': True, 'drop_last': True, 'collate_fn': collater, 'num_workers': opt.num_workers } input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] training_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name), set=params.train_set, transform=torchvision.transforms.Compose([ Normalizer(mean=params.mean, std=params.std), Augmenter(), Resizer(input_sizes[opt.compound_coef]) ])) training_generator = DataLoader(training_set, **training_params) val_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name), set=params.val_set, transform=torchvision.transforms.Compose([ Normalizer(mean=params.mean, std=params.std), Resizer(input_sizes[opt.compound_coef]) ])) val_generator = DataLoader(val_set, **val_params) model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=opt.compound_coef, ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales)) # load last weights if opt.load_weights is not None: if opt.load_weights.endswith('.pth'): weights_path = opt.load_weights else: weights_path = get_last_weights(opt.saved_path) try: last_step = int( os.path.basename(weights_path).split('_')[-1].split('.')[0]) except Exception as exception: last_step = 0 try: _ = model.load_state_dict(torch.load(weights_path), strict=False) except RuntimeError as rerror: print(f'[Warning] Ignoring {rerror}') print('[Warning] Don\'t panic if you see this, '\ 'this might be because you load a pretrained weights with different number of classes.'\ ' The rest of the weights should be loaded already.') print( f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}' ) else: last_step = 0 print('[Info] initializing weights...') init_weights(model) # freeze backbone if train head_only if opt.head_only: def freeze_backbone(mdl): classname = mdl.__class__.__name__ for ntl in ['EfficientNet', 'BiFPN']: if ntl in classname: for param in mdl.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch # apply sync_bn when using multiple gpu and batch_size per gpu is lower than 4 # useful when gpu memory is limited. # because when bn is disable, the training will be very unstable or slow to converge, # apply sync_bn can solve it, # by packing all mini-batch across all gpus as one batch and normalize, then send it back to all gpus. # but it would also slow down the training by a little bit. 
if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4: model.apply(replace_w_sync_bn) use_sync_bn = True else: use_sync_bn = False writer = SummaryWriter( opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/') # warp the model with loss function, to reduce the memory usage on gpu0 and speedup model = ModelWithLoss(model, debug=opt.debug) if params.num_gpus > 0: model = model.cuda() if params.num_gpus > 1: model = CustomDataParallel(model, params.num_gpus) if use_sync_bn: patch_replication_callback(model) if opt.optim == 'adamw': optimizer = torch.optim.AdamW(model.parameters(), opt.lr) else: optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) epoch = 0 best_loss = 1e5 best_epoch = 0 step = max(0, last_step) model.train() num_iter_per_epoch = len(training_generator) num_val_iter_per_epoch = len(val_generator) # Limit the no.of preds to #images in val. # Here, I averaged the #obj to 5 for computational efficacy if opt.max_preds_toeval > 0: opt.max_preds_toeval = len(val_generator) * opt.batch_size * 5 try: for epoch in range(opt.num_epochs): last_epoch = step // num_iter_per_epoch if epoch < last_epoch: continue epoch_loss = [] progress_bar = tqdm(training_generator) for iternum, data in enumerate(progress_bar): if iternum < step - last_epoch * num_iter_per_epoch: progress_bar.update() continue try: imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() optimizer.zero_grad() if iternum % int(num_iter_per_epoch * (opt.eval_percent_epoch / 100)) != 0: model.debug = False cls_loss, reg_loss, _ = model(imgs, annot, obj_list=params.obj_list) else: model.debug = True cls_loss, reg_loss, imgs_labelled = model( imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() epoch_loss.append(float(loss)) progress_bar.set_description( 'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. Total loss: {:.5f}' .format(step, epoch, opt.num_epochs, iternum + 1, num_iter_per_epoch, cls_loss.item(), reg_loss.item(), loss.item())) writer.add_scalars('Loss', {'train': loss}, step) writer.add_scalars('Regression_loss', {'train': reg_loss}, step) writer.add_scalars('Classfication_loss', {'train': cls_loss}, step) if iternum % int( num_iter_per_epoch * (opt.eval_percent_epoch / 100)) == 0 and step > 0: # create grid of images imgs_labelled = np.asarray(imgs_labelled) imgs_labelled = torch.from_numpy( imgs_labelled) # (N, H, W, C) imgs_labelled.transpose_(1, 3) # (N, C, H, W) imgs_labelled.transpose_(2, 3) img_grid = torchvision.utils.make_grid(imgs_labelled) # write to tensorboard writer.add_image('Training_images', img_grid, global_step=step) #########################################################start EVAL##################################################### model.eval() model.debug = False # Don't print images in tensorboard now. # remove json if os.path.exists(evaluation_pred_file): os.remove(evaluation_pred_file) loss_regression_ls = [] loss_classification_ls = [] model.evalresults = [ ] # Empty the results for next evaluation. 
imgs_to_viz = [] num_validation_steps = int( num_val_iter_per_epoch * (opt.eval_sampling_percent / 100)) for valiternum, valdata in enumerate(val_generator): with torch.no_grad(): imgs = valdata['img'] annot = valdata['annot'] resizing_imgs_scales = valdata['scale'] new_ws = valdata['new_w'] new_hs = valdata['new_h'] imgs_ids = valdata['img_id'] if params.num_gpus >= 1: imgs = imgs.cuda() annot = annot.cuda() if valiternum % (num_validation_steps // (opt.num_visualize_images // opt.batch_size)) != 0: model.debug = False cls_loss, reg_loss, _ = model( imgs, annot, obj_list=params.obj_list, resizing_imgs_scales= resizing_imgs_scales, new_ws=new_ws, new_hs=new_hs, imgs_ids=imgs_ids) else: model.debug = True cls_loss, reg_loss, val_imgs_labelled = model( imgs, annot, obj_list=params.obj_list, resizing_imgs_scales= resizing_imgs_scales, new_ws=new_ws, new_hs=new_hs, imgs_ids=imgs_ids) imgs_to_viz += list(val_imgs_labelled) loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) if valiternum > (num_validation_steps): break cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) loss = cls_loss + reg_loss print( 'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}' .format(epoch, opt.num_epochs, cls_loss, reg_loss, loss)) writer.add_scalars('Loss', {'val': loss}, step) writer.add_scalars('Regression_loss', {'val': reg_loss}, step) writer.add_scalars('Classfication_loss', {'val': cls_loss}, step) # create grid of images val_imgs_labelled = np.asarray(imgs_to_viz) val_imgs_labelled = torch.from_numpy( val_imgs_labelled) # (N, H, W, C) val_imgs_labelled.transpose_(1, 3) # (N, C, H, W) val_imgs_labelled.transpose_(2, 3) val_img_grid = torchvision.utils.make_grid( val_imgs_labelled, nrow=2) # write to tensorboard writer.add_image('Eval_Images', val_img_grid, \ global_step=(step)) if opt.max_preds_toeval > 0: json.dump(model.evalresults, open(evaluation_pred_file, 'w'), indent=4) try: val_results = calc_mAP_fin(params.project_name,\ params.val_set, evaluation_pred_file, \ val_gt=f'{OPT.data_path}/{OPT.project}/annotations/instances_{params.val_set}.json') for catgname in val_results: metricname = 'Average Precision (AP) @[ IoU = 0.50 | area = all | maxDets = 100 ]' evalscore = val_results[catgname][ metricname] writer.add_scalars( f'mAP@IoU=0.5 and area=all', {f'{catgname}': evalscore}, step) except Exception as exption: print("Unable to perform evaluation", exption) if loss + opt.es_min_delta < best_loss: best_loss = loss best_epoch = epoch save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' ) model.train() # Early stopping if epoch - best_epoch > opt.es_patience > 0: print( '[Info] Stop training at epoch {}. The lowest loss achieved is {}' .format(epoch, best_loss)) break #########################################################EVAL##################################################### # log learning_rate current_lr = optimizer.param_groups[0]['lr'] writer.add_scalar('learning_rate', current_lr, step) step += 1 if step % opt.save_interval == 0 and step > 0: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' ) print('checkpoint...') except Exception as exception: print('[Error]', traceback.format_exc()) print(exception) continue scheduler.step(np.mean(epoch_loss)) except KeyboardInterrupt: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth') writer.close() writer.close()
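# The evaluation branch above accumulates detections in model.evalresults,
# dumps them to instances_bbox_results.json, and scores that file against the
# val-set annotations with calc_mAP_fin. COCO's detection-results format is a
# flat list of dicts with image_id, category_id, bbox (xywh, in pixels) and
# score; a minimal writer under that assumption (the helper name and the
# dummy detection are illustrative):
import json

def write_coco_results(detections, out_path):
    """detections: iterable of (image_id, category_id, [x, y, w, h], score)."""
    results = [
        {'image_id': img_id,
         'category_id': cat_id,
         'bbox': [round(float(v), 2) for v in bbox],
         'score': round(float(score), 4)}
        for img_id, cat_id, bbox, score in detections
    ]
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=4)

# write_coco_results([(42, 1, [10.0, 20.0, 30.0, 40.0], 0.91)],
#                    'instances_bbox_results.json')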
def train(opt): params = Params(f'projects/{opt.project}.yml') global_validation_it = 0 if params.num_gpus == 0: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if torch.cuda.is_available(): torch.cuda.manual_seed(42) else: torch.manual_seed(42) opt.saved_path = opt.saved_path + f'/{params.project_name}/' opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/' os.makedirs(opt.log_path, exist_ok=True) os.makedirs(opt.saved_path, exist_ok=True) training_params = { 'batch_size': opt.batch_size, 'shuffle': True, 'drop_last': True, 'collate_fn': TUMuchTrafficDataset.collater, 'num_workers': opt.num_workers } val_params = { 'batch_size': opt.batch_size, 'shuffle': False, 'drop_last': True, 'collate_fn': TUMuchTrafficDataset.collater, 'num_workers': opt.num_workers } advprop = opt.advprop if advprop: # for models using advprop pretrained weights normalize = transforms.Lambda( lambda mem: { "img": (mem["img"] * 2.0 - 1.0).astype(np.float32), "annot": mem["annot"] }) else: # for other models normalize = Normalizer(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) tfs = transforms.Compose([ TopCutter(886), transforms.RandomApply([Negate()], p=0.1), transforms.RandomApply([ContrastEnhancementWithNoiseReduction()], p=0.1), Resize(384), RandomCrop(384, 768), normalize, HorizontalFlip(prob=0.5), transforms.RandomApply([AddGaussianNoise(0, 2.55)], p=0.5), transforms.RandomApply([AddSaltAndPepperNoise(prob=0.0017)], p=0.5), ToTensor() ]) tfrecord_paths = [opt.data_path ] if opt.data_path.endswith(".tfrecord") else [ str(x.absolute()) for x in Path(opt.data_path).rglob('*.tfrecord') ] training_set = TUMuchTrafficDataset(tfrecord_paths=tfrecord_paths, transform=tfs) training_generator = DataLoader(training_set, **training_params) tfrecord_paths = [opt.val_path ] if opt.val_path.endswith(".tfrecord") else [ str(x.absolute()) for x in Path(opt.val_path).rglob('*.tfrecord') ] val_set = TUMuchTrafficDataset(tfrecord_paths=tfrecord_paths, transform=tfs) val_generator = DataLoader(val_set, **val_params) if not opt.load_backbone: load_weights = False else: load_weights = True model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=opt.compound_coef, ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales), load_weights=load_weights) pytorch_total_params = sum(p.numel() for p in model.parameters()) print("# Params: {:08d}".format(pytorch_total_params)) # load last weights if opt.load_weights is not None: if opt.load_weights.endswith('.pth'): weights_path = opt.load_weights else: weights_path = get_last_weights(opt.saved_path) try: last_step = int( os.path.basename(weights_path).split('_')[-1].split('.')[0]) except: last_step = 0 try: ret = model.load_state_dict(torch.load(weights_path), strict=False) except RuntimeError as e: print(f'[Warning] Ignoring {e}') print( '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already.' 
) print( f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}' ) else: last_step = 0 print('[Info] initializing weights...') init_weights(model) # freeze backbone if train head_only if opt.head_only: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ['EfficientNet', 'BiFPN']: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') # freeze backbone (only efficientnet) if train no_effnet if opt.no_effnet: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ['EfficientNet']: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print("# Training Parameters: {:06}".format(pytorch_total_params)) # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch # apply sync_bn when using multiple gpu and batch_size per gpu is lower than 4 # useful when gpu memory is limited. # because when bn is disable, the training will be very unstable or slow to converge, # apply sync_bn can solve it, # by packing all mini-batch across all gpus as one batch and normalize, then send it back to all gpus. # but it would also slow down the training by a little bit. if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4: model.apply(replace_w_sync_bn) use_sync_bn = True else: use_sync_bn = False writer = SummaryWriter( opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M")}/') # warp the model with loss function, to reduce the memory usage on gpu0 and speedup model = ModelWithLoss(model, debug=opt.debug) if params.num_gpus > 0: model = model.cuda() if params.num_gpus > 1: model = CustomDataParallel(model, params.num_gpus) if use_sync_bn: patch_replication_callback(model) if opt.optim == 'adamw': optimizer = torch.optim.AdamW(model.parameters(), opt.lr) else: optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1e6, verbose=True) # use apex for mixed precision training # model, optimizer = amp.initialize(model, optimizer) epoch = 0 best_loss = 1e5 best_epoch = 0 step = max(0, last_step) model.train() num_iter_per_epoch = len(training_generator) try: for epoch in range(opt.num_epochs): last_epoch = step // num_iter_per_epoch if epoch < last_epoch: continue epoch_loss = [] progress_bar = tqdm(training_generator) for it, data in enumerate(progress_bar): if it < step - last_epoch * num_iter_per_epoch: progress_bar.update() continue try: imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() global_validation_it += 1 optimizer.zero_grad() cls_loss, reg_loss = model(imgs, annot) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() epoch_loss.append(float(loss)) progress_bar.set_description( 'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. 
Total loss: {:.5f}' .format(step, epoch, opt.num_epochs, it + 1, num_iter_per_epoch, cls_loss.item(), reg_loss.item(), loss.item())) writer.add_scalars('Loss', {'train': loss}, step) writer.add_scalars('Regression_loss', {'train': reg_loss}, step) writer.add_scalars('Classfication_loss', {'train': cls_loss}, step) # log learning_rate current_lr = optimizer.param_groups[0]['lr'] writer.add_scalar('learning_rate', current_lr, step) step += 1 if step % opt.save_interval == 0 and step > 0: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' ) print('checkpoint...') except Exception as e: print('[Error]', traceback.format_exc()) print(e) continue scheduler.step(np.mean(epoch_loss)) # sleep for 30 seconds, to reduce overheating import time time.sleep(30) if epoch % opt.val_interval == 0: model.eval() loss_regression_ls = [] loss_classification_ls = [] for it, data in enumerate(val_generator): with torch.no_grad(): imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() if it < 12: plot_tensorboard(imgs, annot, model, writer, global_validation_it, it, "") global_validation_it += 1 if params.num_gpus == 1: imgs = imgs.cuda() annot = annot.cuda() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) loss = cls_loss + reg_loss print( 'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}' .format(epoch, opt.num_epochs, cls_loss, reg_loss, loss)) writer.add_scalars('Loss', {'val': loss}, step) writer.add_scalars('Regression_loss', {'val': reg_loss}, step) writer.add_scalars('Classfication_loss', {'val': cls_loss}, step) if loss + opt.es_min_delta < best_loss: best_loss = loss best_epoch = epoch save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' ) model.train() # Early stopping if epoch - best_epoch > opt.es_patience > 0: print( '[Info] Stop training at epoch {}. The lowest loss achieved is {}' .format(epoch, best_loss)) break except KeyboardInterrupt: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth') writer.close() writer.close()
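# The advprop branch above swaps ImageNet mean/std normalization for a linear
# map of [0, 1] pixels to [-1, 1], matching how AdvProp-pretrained
# EfficientNet backbones were trained; both branches assume the image is
# already scaled to [0, 1]. The two conventions side by side as plain numpy
# functions (illustrative, not the transform classes used above):
import numpy as np

IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)

def normalize_advprop(img):
    """AdvProp-pretrained backbones: linear map [0, 1] -> [-1, 1]."""
    return (img * 2.0 - 1.0).astype(np.float32)

def normalize_imagenet(img):
    """Standard pretrained backbones: per-channel mean/std (HWC layout)."""
    return ((img - IMAGENET_MEAN) / IMAGENET_STD).astype(np.float32)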
def train(opt): params = Params(f'projects/{opt.project}.yml') params.num_gpus = 4 # opt.log_path = 'C:/Users/giang/Desktop/result_temp/' if params.num_gpus == 0: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if torch.cuda.is_available(): torch.cuda.manual_seed(42) else: torch.manual_seed(42) opt.saved_path = opt.saved_path + f'/{params.project_name}/' opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/' os.makedirs(opt.log_path, exist_ok=True) os.makedirs(opt.saved_path, exist_ok=True) training_params = { 'batch_size': opt.batch_size, 'shuffle': True, 'drop_last': True, 'collate_fn': collater, 'num_workers': opt.num_workers } val_params = { 'batch_size': opt.batch_size * 4, 'shuffle': False, 'drop_last': True, 'collate_fn': collater, 'num_workers': opt.num_workers } input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536] # root_train = 'D:/Etri_tracking_data/Etri_full/train_1024/' # side_train = 'D:/Etri_tracking_data/Etri_full/train_Sejin_1024/' # ground_truth_train = 'D:/Etri_tracking_data/Etri_full/train_1024.txt' root_train = '/home/../../data3/giangData/train_1024/' side_train = '/home/../../data3/giangData/train_Sejin_1024/' ground_truth_train = '/home/../../data3/giangData/train_1024.txt' training_set = TobyCustom(root_dir=root_train, side_dir = side_train, \ annot_path = ground_truth_train, \ transform=ComposeAlb([Flip_X(), \ Flip_Y(), \ Equalize(), \ Brightness(), \ Constrast(), \ Resizer(input_sizes[opt.compound_coef], num_channels=3), \ Normalizer(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])])) training_generator = DataLoader(training_set, **training_params) # root_val = 'D:/Etri_tracking_data/Etri_full/val_1024/' # side_val = 'D:/Etri_tracking_data/Etri_full/val_Sejin_1024/' # ground_truth_val = 'D:/Etri_tracking_data/Etri_full/val_1024.txt' root_val = '/home/../../data3/giangData/val_1024/' side_val = '/home/../../data3/giangData/val_Sejin_1024/' ground_truth_val = '/home/../../data3/giangData/val_1024.txt' val_set = TobyCustom(root_dir=root_val, side_dir = side_val, \ annot_path = ground_truth_val, \ transform=ComposeAlb([Resizer(input_sizes[opt.compound_coef], num_channels=3), Normalizer(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])])) val_generator = DataLoader(val_set, **val_params) model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=opt.compound_coef, ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales)) from efficientdet.model import Classifier # model.backbone_net.model._conv_stem.conv = nn.Conv2d(4, 48, kernel_size=(3, 3), stride=(2, 2), bias=False) # model.classifier.header.pointwise_conv.conv = nn.Conv2d(224, 9, kernel_size=(1, 1), stride=(1, 1)) model.classifier = Classifier( in_channels=model.fpn_num_filters[opt.compound_coef], num_anchors=model.num_anchors, num_classes=1, num_layers=model.box_class_repeats[opt.compound_coef], pyramid_levels=model.pyramid_levels[opt.compound_coef]) # opt.load_weights = 'C:/Users/giang/Desktop/efficientdet-d4_107_15228_6.1788892433756875.pth' opt.load_weights = './../result_3channel_21/save/coco/efficientdet-d4_21_3000.pth' # block' # for EfficientNetB5, please test again with B4 # load last weights if opt.load_weights is not None: if opt.load_weights.endswith('.pth'): weights_path = opt.load_weights else: weights_path = get_last_weights(opt.saved_path) try: last_step = int( os.path.basename(weights_path).split('_')[-1].split('.')[0]) except: last_step = 0 try: ret = model.load_state_dict(torch.load(weights_path), strict=False) except 
RuntimeError as e: print(f'[Warning] Ignoring {e}') print( '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already.' ) print( f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}' ) else: last_step = 0 print('[Info] initializing weights...') init_weights(model) ''' ============================================ Modify model ''' # from efficientdet.model import Classifier # model.backbone_net.model._conv_stem.conv = nn.Conv2d(4, 48, kernel_size=(3, 3), stride=(2, 2), bias=False) # model.classifier.header.pointwise_conv.conv = nn.Conv2d(224, 9, kernel_size=(1, 1), stride=(1, 1)) # model.classifier = Classifier(in_channels=model.fpn_num_filters[opt.compound_coef], num_anchors=model.num_anchors, # num_classes=1, # num_layers=model.box_class_repeats[opt.compound_coef], # pyramid_levels=model.pyramid_levels[opt.compound_coef]) ''' ============================================= ''' # freeze backbone if train head_only if opt.head_only: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ['EfficientNet', 'BiFPN']: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch # apply sync_bn when using multiple gpu and batch_size per gpu is lower than 4 # useful when gpu memory is limited. # because when bn is disable, the training will be very unstable or slow to converge, # apply sync_bn can solve it, # by packing all mini-batch across all gpus as one batch and normalize, then send it back to all gpus. # but it would also slow down the training by a little bit. 
if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4: model.apply(replace_w_sync_bn) use_sync_bn = True else: use_sync_bn = False writer = SummaryWriter( opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/') # warp the model with loss function, to reduce the memory usage on gpu0 and speedup model = ModelWithLoss(model, debug=opt.debug) if params.num_gpus > 0: model = model.cuda() if params.num_gpus > 1: model = CustomDataParallel(model, params.num_gpus) if use_sync_bn: patch_replication_callback(model) if opt.optim == 'adamw': optimizer = torch.optim.AdamW(model.parameters(), opt.lr) else: optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) epoch = 0 best_loss = 1e5 best_epoch = 0 step = max(0, last_step) model.train() num_iter_per_epoch = len(training_generator) try: for epoch in range(opt.num_epochs): last_epoch = step // num_iter_per_epoch if epoch < last_epoch: continue epoch_loss = [] progress_bar = tqdm(training_generator) for iter, data in enumerate(progress_bar): if iter < step - last_epoch * num_iter_per_epoch: progress_bar.update() continue try: imgs = data['img'] annot = data['annot'] image_path = data['image_path'] # print(image_path) if params.num_gpus == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() optimizer.zero_grad() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() epoch_loss.append(float(loss)) progress_bar.set_description( 'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. Total loss: {:.5f}' .format(step, epoch, opt.num_epochs, iter + 1, num_iter_per_epoch, cls_loss.item(), reg_loss.item(), loss.item())) writer.add_scalars('Loss', {'train': loss}, step) writer.add_scalars('Regression_loss', {'train': reg_loss}, step) writer.add_scalars('Classfication_loss', {'train': cls_loss}, step) # log learning_rate current_lr = optimizer.param_groups[0]['lr'] writer.add_scalar('learning_rate', current_lr, step) step += 1 if step % opt.save_interval == 0 and step > 0: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' ) print('checkpoint...') except Exception as e: print('[Error]', traceback.format_exc()) print(e) continue scheduler.step(np.mean(epoch_loss)) if epoch % opt.val_interval == 0: model.eval() loss_regression_ls = [] loss_classification_ls = [] for iter, data in enumerate(val_generator): with torch.no_grad(): imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: imgs = imgs.cuda() annot = annot.cuda() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) loss = cls_loss + reg_loss print( 'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. 
Total loss: {:1.5f}' .format(epoch, opt.num_epochs, cls_loss, reg_loss, loss)) writer.add_scalars('Loss', {'val': loss}, step) writer.add_scalars('Regression_loss', {'val': reg_loss}, step) writer.add_scalars('Classfication_loss', {'val': cls_loss}, step) print('\n') if loss + opt.es_min_delta < best_loss: best_loss = loss best_epoch = epoch save_loss = round(loss, 4) save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}_{save_loss}.pth' ) model.train() # Early stopping if epoch - best_epoch > opt.es_patience > 0: print( '[Info] Stop training at epoch {}. The lowest loss achieved is {}' .format(epoch, best_loss)) break except KeyboardInterrupt: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth') writer.close() writer.close()
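# The variant above swaps in a 1-class Classifier head, then loads a
# checkpoint with strict=False inside try/except: strict=False only tolerates
# missing or unexpected keys, not shape mismatches, and a mismatched head is
# exactly what raises the RuntimeError being caught. A sketch that filters by
# shape first so no exception is needed (toy model; load_partial is an
# illustrative helper, not from the script):
import torch
import torch.nn as nn

def load_partial(model, state_dict):
    """Load every tensor whose name and shape match; report what was skipped."""
    model_sd = model.state_dict()
    compatible = {k: v for k, v in state_dict.items()
                  if k in model_sd and v.shape == model_sd[k].shape}
    model.load_state_dict(compatible, strict=False)
    print(f'[Info] loaded {len(compatible)} tensors, '
          f'skipped {len(model_sd) - len(compatible)}')

# e.g. a pretrained 90-class head vs. a fresh 1-class head:
pretrained = nn.Sequential(nn.Linear(8, 16), nn.Linear(16, 90))
new_model = nn.Sequential(nn.Linear(8, 16), nn.Linear(16, 1))
load_partial(new_model, pretrained.state_dict())  # loads 2, skips 2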
def train(opt): params = Params(opt.config) if params.num_gpus == 0: os.environ["CUDA_VISIBLE_DEVICES"] = "-1" if torch.cuda.is_available(): torch.cuda.manual_seed(42) else: torch.manual_seed(42) opt.saved_path = params.logdir opt.log_path = os.path.join(params.logdir, "tensorboard") os.makedirs(opt.saved_path, exist_ok=True) os.makedirs(opt.log_path, exist_ok=True) training_params = { "batch_size": opt.batch_size, "shuffle": True, "drop_last": True, "collate_fn": collater, "num_workers": opt.num_workers, } val_params = { "batch_size": opt.batch_size, "shuffle": False, "drop_last": True, "collate_fn": collater, "num_workers": opt.num_workers, } input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536] training_set = CocoDataset( image_dir=params.image_dir, json_path=params.train_annotations, transform=transforms.Compose( [ Normalizer(mean=params.mean, std=params.std), Augmenter(), Resizer(input_sizes[opt.compound_coef]), ] ), ) training_generator = DataLoader(training_set, **training_params) if params.val_image_dir is None: params.val_image_dir = params.image_dir val_set = CocoDataset( image_dir=params.val_image_dir, json_path=params.val_annotations, transform=transforms.Compose( [Normalizer(mean=params.mean, std=params.std), Resizer(input_sizes[opt.compound_coef])] ), ) val_generator = DataLoader(val_set, **val_params) model = EfficientDetBackbone( num_classes=len(params.obj_list), compound_coef=opt.compound_coef, ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales), ) # load last weights if opt.load_weights is not None: if opt.load_weights.endswith(".pth"): weights_path = opt.load_weights else: weights_path = get_last_weights(opt.saved_path) try: last_step = int(os.path.basename(weights_path).split("_")[-1].split(".")[0]) except: last_step = 0 try: ret = model.load_state_dict(torch.load(weights_path), strict=False) except RuntimeError as e: print(f"[Warning] Ignoring {e}") print( "[Warning] Don't panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already." ) print( f"[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}" ) else: last_step = 0 print("[Info] initializing weights...") init_weights(model) # freeze backbone if train head_only if opt.head_only: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ["EfficientNet", "BiFPN"]: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print("[Info] freezed backbone") # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch # apply sync_bn when using multiple gpu and batch_size per gpu is lower than 4 # useful when gpu memory is limited. # because when bn is disable, the training will be very unstable or slow to converge, # apply sync_bn can solve it, # by packing all mini-batch across all gpus as one batch and normalize, then send it back to all gpus. # but it would also slow down the training by a little bit. 
if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4: model.apply(replace_w_sync_bn) use_sync_bn = True else: use_sync_bn = False writer = SummaryWriter(opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/') # warp the model with loss function, to reduce the memory usage on gpu0 and speedup model = ModelWithLoss(model, debug=opt.debug) if params.num_gpus > 0: model = model.cuda() if params.num_gpus > 1: model = CustomDataParallel(model, params.num_gpus) if use_sync_bn: patch_replication_callback(model) if opt.optim == "adamw": optimizer = torch.optim.AdamW(model.parameters(), opt.lr) else: optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) epoch = 0 best_loss = 1e5 best_epoch = 0 step = max(0, last_step) model.train() num_iter_per_epoch = len(training_generator) try: for epoch in range(opt.num_epochs): last_epoch = step // num_iter_per_epoch if epoch < last_epoch: continue epoch_loss = [] progress_bar = tqdm(training_generator) for iter, data in enumerate(progress_bar): if iter < step - last_epoch * num_iter_per_epoch: progress_bar.update() continue try: imgs = data["img"] annot = data["annot"] if params.num_gpus == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() optimizer.zero_grad() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() epoch_loss.append(float(loss)) progress_bar.set_description( "Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. Total loss: {:.5f}".format( step, epoch, opt.num_epochs, iter + 1, num_iter_per_epoch, cls_loss.item(), reg_loss.item(), loss.item(), ) ) writer.add_scalars("Loss", {"train": loss}, step) writer.add_scalars("Regression_loss", {"train": reg_loss}, step) writer.add_scalars("Classfication_loss", {"train": cls_loss}, step) # log learning_rate current_lr = optimizer.param_groups[0]["lr"] writer.add_scalar("learning_rate", current_lr, step) step += 1 if step % opt.save_interval == 0 and step > 0: save_checkpoint( model, f"efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth" ) print("checkpoint...") except Exception as e: print("[Error]", traceback.format_exc()) print(e) continue scheduler.step(np.mean(epoch_loss)) if epoch % opt.val_interval == 0: model.eval() loss_regression_ls = [] loss_classification_ls = [] for iter, data in enumerate(val_generator): with torch.no_grad(): imgs = data["img"] annot = data["annot"] if params.num_gpus == 1: imgs = imgs.cuda() annot = annot.cuda() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) loss = cls_loss + reg_loss print( "Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. 
Total loss: {:1.5f}".format( epoch, opt.num_epochs, cls_loss, reg_loss, loss ) ) writer.add_scalars("Loss", {"val": loss}, step) writer.add_scalars("Regression_loss", {"val": reg_loss}, step) writer.add_scalars("Classfication_loss", {"val": cls_loss}, step) if loss + opt.es_min_delta < best_loss: best_loss = loss best_epoch = epoch save_checkpoint(model, f"efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth") model.train() # Early stopping if epoch - best_epoch > opt.es_patience > 0: print( "[Info] Stop training at epoch {}. The lowest loss achieved is {}".format( epoch, best_loss ) ) break except KeyboardInterrupt: save_checkpoint(model, f"efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth") writer.close() writer.close()
def train(opt): params = Params(f'projects/{opt.project}.yml') if params.num_gpus == 0: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if torch.cuda.is_available(): torch.cuda.manual_seed(42) else: torch.manual_seed(42) opt.saved_path = opt.saved_path + f'/{params.project_name}/' opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/' os.makedirs(opt.log_path, exist_ok=True) os.makedirs(opt.saved_path, exist_ok=True) training_params = { 'batch_size': opt.batch_size, 'shuffle': True, 'drop_last': True, 'collate_fn': collater, 'num_workers': opt.num_workers } val_params = { 'batch_size': opt.batch_size, 'shuffle': False, 'drop_last': True, 'collate_fn': collater, 'num_workers': opt.num_workers } input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] train_df = pd.read_csv(os.path.join(params.data_dir, 'train.csv')) train_df, val_df = get_train_val(train_df) training_set = WheatDataset(dataframe=train_df, image_dir=os.path.join(params.data_dir, params.train_set), transforms=transforms.Compose([ Normalizer(mean=params.mean, std=params.std), Augmenter(), Resizer(input_sizes[opt.compound_coef]) ])) training_generator = DataLoader(training_set, **training_params) val_set = WheatDataset(dataframe=val_df, image_dir=os.path.join(params.data_dir, params.train_set), transforms=transforms.Compose([ Normalizer(mean=params.mean, std=params.std), Augmenter(), Resizer(input_sizes[opt.compound_coef]) ])) val_generator = DataLoader(val_set, **val_params) model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=opt.compound_coef, ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales)) # load last weights if opt.load_weights is not None: if opt.load_weights.endswith('.pth'): weights_path = opt.load_weights else: weights_path = get_last_weights(opt.saved_path) try: last_step = int( os.path.basename(weights_path).split('_')[-1].split('.')[0]) except: last_step = 0 try: ret = model.load_state_dict(torch.load(weights_path), strict=False) except RuntimeError as e: print(f'[Warning] Ignoring {e}') print( '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already.' ) print( f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}' ) else: last_step = 0 print('[Info] initializing weights...') init_weights(model) # freeze backbone if train head_only
if opt.head_only: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ['EfficientNet', 'BiFPN']: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4: model.apply(replace_w_sync_bn) use_sync_bn = True else: use_sync_bn = False writer = SummaryWriter( opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/') # warp the model with loss function, to reduce the memory usage on gpu0 and speedup model = ModelWithLoss(model, debug=opt.debug) if params.num_gpus > 0: model = model.cuda() if params.num_gpus > 1: model = CustomDataParallel(model, params.num_gpus) if use_sync_bn: patch_replication_callback(model) if opt.optim == 'adam': optimizer = torch.optim.Adam(model.parameters(), opt.lr) else: optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) epoch = 0 best_loss = 1e5 best_epoch = 0 step = max(0, last_step) model.train() num_iter_per_epoch = len(training_generator) try: for epoch in range(opt.num_epochs): last_epoch = step // num_iter_per_epoch if epoch < last_epoch: continue epoch_loss = [] progress_bar = tqdm(training_generator) for iter, data in enumerate(progress_bar): if iter < step - last_epoch * num_iter_per_epoch: progress_bar.update() continue try: imgs = data['image'] annot = data['bboxes'] if params.num_gpus == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() optimizer.zero_grad() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() epoch_loss.append(float(loss)) progress_bar.set_description( 'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. 
Total loss: {:.5f}' .format(step, epoch, opt.num_epochs, iter + 1, num_iter_per_epoch, cls_loss.item(), reg_loss.item(), loss.item())) writer.add_scalars('Loss', {'train': loss}, step) writer.add_scalars('Regression_loss', {'train': reg_loss}, step) writer.add_scalars('Classfication_loss', {'train': cls_loss}, step) # log learning_rate current_lr = optimizer.param_groups[0]['lr'] writer.add_scalar('learning_rate', current_lr, step) step += 1 if step % opt.save_interval == 0 and step > 0: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' ) print('checkpoint...') except Exception as e: print('[Error]', traceback.format_exc()) print(e) continue scheduler.step(np.mean(epoch_loss)) if epoch % opt.val_interval == 0: model.eval() loss_regression_ls = [] loss_classification_ls = [] for iter, data in enumerate(val_generator): with torch.no_grad(): imgs = data['image'] annot = data['bboxes'] if params.num_gpus == 1: imgs = imgs.cuda() annot = annot.cuda() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) loss = cls_loss + reg_loss print( 'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}' .format(epoch, opt.num_epochs, cls_loss, reg_loss, loss)) writer.add_scalars('Loss', {'val': loss}, step) writer.add_scalars('Regression_loss', {'val': reg_loss}, step) writer.add_scalars('Classfication_loss', {'val': cls_loss}, step) if loss + opt.es_min_delta < best_loss: best_loss = loss best_epoch = epoch save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' ) model.train() # Early stopping if epoch - best_epoch > opt.es_patience > 0: print( '[Info] Stop training at epoch {}. The lowest loss achieved is {}' .format(epoch, best_loss)) break except KeyboardInterrupt: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth') writer.close() writer.close()
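# Every variant passes a custom collate_fn (collater) to DataLoader because
# images in a batch carry different numbers of boxes and cannot be stacked
# directly. The document never shows its body, so the sketch below is an
# assumption about what such a function does: stack the images and pad every
# annotation tensor to a common row count with -1, a sentinel the loss can
# mask out later. The 'img'/'annot' keys mirror the convention used above.

import torch

def collater(data):
    imgs = torch.stack([s['img'] for s in data])   # (B, 3, H, W)
    annots = [s['annot'] for s in data]            # list of (n_i, 5) tensors: x1, y1, x2, y2, class
    max_num = max(a.shape[0] for a in annots)
    # pad with rows of -1 so every sample has the same number of annotations
    padded = torch.ones(len(annots), max(max_num, 1), 5) * -1
    for i, a in enumerate(annots):
        if a.shape[0] > 0:
            padded[i, :a.shape[0], :] = a
    return {'img': imgs, 'annot': padded}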
def train(opt):
    params = Params(f'projects/{opt.project}.yml')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
    else:
        torch.manual_seed(42)

    opt.saved_path = opt.saved_path + f'/{params.project_name}/'
    opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    training_params = {'batch_size': opt.batch_size,
                       'shuffle': True,
                       'drop_last': True,
                       'collate_fn': collater,
                       'num_workers': opt.num_workers}

    val_params = {'batch_size': opt.batch_size,
                  'shuffle': False,
                  'drop_last': True,
                  'collate_fn': collater,
                  'num_workers': opt.num_workers}

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]

    training_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name),
                               set=params.train_set,
                               transform=transforms.Compose([
                                   Normalizer(mean=params.mean, std=params.std),
                                   Augmenter(),
                                   Resizer(input_sizes[opt.compound_coef])]))
    training_generator = DataLoader(training_set, **training_params)

    val_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name),
                          set=params.val_set,
                          transform=transforms.Compose([
                              Normalizer(mean=params.mean, std=params.std),
                              Resizer(input_sizes[opt.compound_coef])]))
    val_generator = DataLoader(val_set, **val_params)

    model = EfficientDetBackbone(num_classes=len(params.obj_list),
                                 compound_coef=opt.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except Exception:
            last_step = 0

        try:
            ret = model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print('[Warning] Don\'t panic if you see this: it is probably because you loaded '
                  'pretrained weights with a different number of classes. The rest of the weights '
                  'should be loaded already.')

        print(f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}')
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if training the head only
    if opt.head_only:
        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] froze backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # apply sync_bn when using multiple gpus and the batch_size per gpu is lower than 4;
    # useful when gpu memory is limited, because with bn effectively disabled the training
    # becomes very unstable or slow to converge. sync_bn solves this by normalizing the
    # mini-batches across all gpus as one batch, at the cost of slightly slower training.
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    writer = SummaryWriter(opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    # wrap the model with the loss function, to reduce the memory usage on gpu0 and speed up training
    model = ModelWithLoss(model, debug=opt.debug)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = CustomDataParallel(model, params.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    if opt.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True)

    if opt.coslr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-8)
    else:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(opt.num_epochs):
            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                continue

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for iter, data in enumerate(progress_bar):
                if iter < step - last_epoch * num_iter_per_epoch:
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if params.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0
                        # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list)
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. Total loss: {:.5f}'.format(
                            step, epoch, opt.num_epochs, iter + 1, num_iter_per_epoch,
                            cls_loss.item(), reg_loss.item(), loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss}, step)
                    writer.add_scalars('Classification_loss', {'train': cls_loss}, step)

                    # log learning_rate; reading it off the optimizer works for both
                    # schedulers (ReduceLROnPlateau has no get_lr() method)
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                    if step % opt.save_interval == 0 and step > 0:
                        save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
                        print('checkpoint...')

                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            # ReduceLROnPlateau needs the monitored metric; CosineAnnealingLR just advances
            if opt.coslr:
                scheduler.step()
            else:
                scheduler.step(np.mean(epoch_loss))

            if epoch % opt.val_interval == 0:
                model.eval()
                loss_regression_ls = []
                loss_classification_ls = []
                for iter, data in enumerate(val_generator):
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        if params.num_gpus == 1:
                            imgs = imgs.cuda()
                            annot = annot.cuda()

                        cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list)
                        cls_loss = cls_loss.mean()
                        reg_loss = reg_loss.mean()

                        loss = cls_loss + reg_loss
                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        loss_classification_ls.append(cls_loss.item())
                        loss_regression_ls.append(reg_loss.item())

                cls_loss = np.mean(loss_classification_ls)
                reg_loss = np.mean(loss_regression_ls)
                loss = cls_loss + reg_loss

                print('Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}'.format(
                    epoch, opt.num_epochs, cls_loss, reg_loss, loss))
                writer.add_scalars('Loss', {'val': loss}, step)
                writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
                writer.add_scalars('Classification_loss', {'val': cls_loss}, step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')

                model.train()

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print('[Info] Stop training at epoch {}. The lowest loss achieved is {}'.format(epoch, best_loss))
                    break
    except KeyboardInterrupt:
        save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
        writer.close()
    writer.close()
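# The coslr branch above mixes two schedulers with different stepping
# conventions, which is easy to get wrong (the original code called
# scheduler.get_lr(), which ReduceLROnPlateau does not have). A minimal,
# self-contained sketch of the difference; the throwaway model and names
# here are purely illustrative:

import torch

net = torch.nn.Linear(4, 2)
opt_ = torch.optim.SGD(net.parameters(), lr=1e-2)

plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(opt_, patience=3)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(opt_, T_max=5, eta_min=1e-8)

val_loss = 0.5
plateau.step(val_loss)   # ReduceLROnPlateau consumes the monitored metric
cosine.step()            # CosineAnnealingLR just advances its internal epoch counter

# reading the current learning rate works the same way for both:
current_lr = opt_.param_groups[0]['lr']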
def train_det(opt, cfg):
    training_params = {'batch_size': cfg.batch_size,
                       'shuffle': True,
                       'drop_last': True,
                       'collate_fn': collater,
                       'num_workers': opt.num_workers}

    val_params = {'batch_size': cfg.batch_size,
                  'shuffle': False,
                  'drop_last': True,
                  'collate_fn': collater,
                  'num_workers': opt.num_workers}

    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    training_set = DataGenerator(
        data_path=os.path.join(opt.data_path, 'Train'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([
            Augmenter(),
            Normalizer(mean=cfg.mean, std=cfg.std),
            Resizer(input_sizes[cfg.compound_coef])]),
        pre_augments=['', *[f'{aug}_' for aug in cfg.augment_list]] if cfg.augment_list else None)
    training_generator = DataLoader(training_set, **training_params)

    val_set = DataGenerator(
        data_path=os.path.join(opt.data_path, 'Validation'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([
            Normalizer(mean=cfg.mean, std=cfg.std),
            Resizer(input_sizes[cfg.compound_coef])]))
    val_generator = DataLoader(val_set, **val_params)

    model = EfficientDetBackbone(num_classes=len(cfg.dictionary_class_name),
                                 compound_coef=cfg.compound_coef,
                                 ratios=eval(cfg.anchor_ratios),
                                 scales=eval(cfg.anchor_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except Exception:
            last_step = 0

        try:
            ret = model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print('[Warning] Don\'t panic if you see this: it is probably because you loaded '
                  'pretrained weights with a different number of classes. '
                  'The rest of the weights should be loaded already.')

        print(f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}')
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if training the heads only
    if cfg.training_layer.lower() == 'heads':
        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] froze backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # apply sync_bn when using multiple gpus and the batch_size per gpu is lower than 4;
    # useful when gpu memory is limited, because with bn effectively disabled the training
    # becomes very unstable or slow to converge. sync_bn solves this by normalizing the
    # mini-batches across all gpus as one batch, at the cost of slightly slower training.
    if cfg.num_gpus > 1 and cfg.batch_size // cfg.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    # wrap the model with the loss function, to reduce the memory usage on gpu0 and speed up training
    model = ModelWithLoss(model, debug=opt.debug)

    if cfg.num_gpus > 0:
        model = model.cuda()
        if cfg.num_gpus > 1:
            model = CustomDataParallel(model, cfg.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    # elif keeps an AdamW choice from being silently overwritten by the SGD fallback
    if cfg.optimizer.lower() == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), cfg.learning_rate)
    elif cfg.optimizer.lower() == 'srsgd':
        optimizer = SRSGD(model.parameters(), lr=cfg.learning_rate, weight_decay=5e-4, iter_count=100)
    else:
        optimizer = torch.optim.SGD(model.parameters(), cfg.learning_rate, momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

    # Setup complete, then start training
    now = datetime.datetime.now()
    opt.saved_path = opt.saved_path + f'/trainlogs_{now.strftime("%Y%m%d_%H%M%S")}'
    if opt.log_path is None:
        opt.log_path = opt.saved_path
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    # Write history
    if 'backlog' not in opt.config:
        with open(os.path.join(opt.saved_path, f'{now.strftime("%Y%m%d%H%M%S")}.backlog.json'), 'w') as f:
            backlog = dict(cfg.to_pascal_case())
            backlog['__metadata__'] = 'Backlog at ' + now.strftime("%Y/%m/%d %H:%M:%S")
            json.dump(backlog, f)
    else:
        with open(os.path.join(opt.saved_path, f'{now.strftime("%Y%m%d%H%M%S")}.history.json'), 'w') as f:
            history = dict(cfg.to_pascal_case())
            history['__metadata__'] = now.strftime("%Y/%m/%d %H:%M:%S")
            json.dump(history, f)

    writer = SummaryWriter(opt.log_path + '/tensorboard')

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(cfg.no_epochs):
            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                continue

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for iter, data in enumerate(progress_bar):
                if iter < step - last_epoch * num_iter_per_epoch:
                    progress_bar.set_description(f'Skip {iter} < {step} - {last_epoch} * {num_iter_per_epoch}')
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if cfg.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0
                        # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss = model(imgs, annot, obj_list=cfg.dictionary_class_name.keys())
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. Total loss: {:.5f}'.format(
                            step, epoch, cfg.no_epochs, iter + 1, num_iter_per_epoch,
                            cls_loss.item(), reg_loss.item(), loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss}, step)
                    writer.add_scalars('Classification_loss', {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            scheduler.step(np.mean(epoch_loss))

            model.eval()
            loss_regression_ls = []
            loss_classification_ls = []
            for iter, data in enumerate(val_generator):
                with torch.no_grad():
                    imgs = data['img']
                    annot = data['annot']

                    if cfg.num_gpus == 1:
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    cls_loss, reg_loss = model(imgs, annot, obj_list=cfg.dictionary_class_name.keys())
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss_classification_ls.append(cls_loss.item())
                    loss_regression_ls.append(reg_loss.item())

            cls_loss = np.mean(loss_classification_ls)
            reg_loss = np.mean(loss_regression_ls)
            loss = cls_loss + reg_loss

            progress_bar.set_description(
                'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}'.format(
                    epoch, cfg.no_epochs, cls_loss, reg_loss, loss))
            writer.add_scalars('Loss', {'val': loss}, step)
            writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
            writer.add_scalars('Classification_loss', {'val': cls_loss}, step)

            # note: saving is gated on improvement regardless of cfg.only_best_weights
            if loss + opt.es_min_delta < best_loss:
                best_loss = loss
                best_epoch = epoch
                save_checkpoint(model, f'{opt.saved_path}/det_d{cfg.compound_coef}_{epoch}_{step}.pth')

            model.train()

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print('[Info] Stop training at epoch {}. The lowest loss achieved is {}'.format(epoch, best_loss))
                break

        print(f'[Info] Finished training. Best loss achieved {best_loss} at epoch {best_epoch}.')
    except KeyboardInterrupt:
        save_checkpoint(model, f'{opt.saved_path}/d{cfg.compound_coef}_{epoch}_{step}.pth')
        writer.close()
    writer.close()
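# Every variant wraps the detector in ModelWithLoss before DataParallel, and the
# recurring comment explains why: with the loss computed inside forward(), each
# GPU reduces its own predictions to two scalars instead of shipping full
# feature maps back to gpu0. The class itself is never shown in this document,
# so the sketch below is an assumption about its shape; the criterion is a
# placeholder for the repo's real focal/regression loss, and the four-tuple
# returned by the detector is likewise assumed.

import torch
import torch.nn as nn

class ModelWithLoss(nn.Module):
    def __init__(self, model, criterion, debug=False):
        super().__init__()
        self.model = model
        self.criterion = criterion   # e.g. a focal-loss module (placeholder)
        self.debug = debug

    def forward(self, imgs, annotations, obj_list=None):
        # the detector is assumed to return features, box regression,
        # classification scores, and the anchor grid
        features, regression, classification, anchors = self.model(imgs)
        cls_loss, reg_loss = self.criterion(classification, regression,
                                            anchors, annotations)
        # returning two scalars keeps the DataParallel gather step cheap
        return cls_loss, reg_loss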