def train(opt):
    """Train an EfficientDet detector on a COCO-format project.

    Loads ``projects/{opt.project}.yml`` through ``Params``, builds the
    training/validation loaders, optionally restores weights, and then runs
    the train/validate loop with TensorBoard logging, periodic checkpoints
    and early stopping.

    Args:
        opt: Options object. This function reads ``project``, ``saved_path``,
            ``log_path``, ``data_path``, ``batch_size``, ``num_workers``,
            ``compound_coef``, ``load_weights``, ``head_only``, ``debug``,
            ``optim``, ``lr``, ``num_epochs``, ``save_interval``,
            ``val_interval``, ``es_min_delta`` and ``es_patience``.
            ``opt.saved_path`` and ``opt.log_path`` are rewritten in place to
            point at the per-project directories, which are created.
    """
    params = Params(f'projects/{opt.project}.yml')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
    else:
        torch.manual_seed(42)

    opt.saved_path = opt.saved_path + f'/{params.project_name}/'
    opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    training_params = {
        'batch_size': opt.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers,
    }
    val_params = {
        'batch_size': opt.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers,
    }

    # Input resolution indexed by compound coefficient.
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
    dataset_root = os.path.join(opt.data_path, params.project_name)

    training_set = CocoDataset(
        root_dir=dataset_root,
        set=params.train_set,
        transform=transforms.Compose([
            Normalizer(mean=params.mean, std=params.std),
            Augmenter(),
            Resizer(input_sizes[opt.compound_coef]),
        ]))
    training_generator = DataLoader(training_set, **training_params)

    val_set = CocoDataset(
        root_dir=dataset_root,
        set=params.val_set,
        transform=transforms.Compose([
            Normalizer(mean=params.mean, std=params.std),
            Resizer(input_sizes[opt.compound_coef]),
        ]))
    val_generator = DataLoader(val_set, **val_params)

    # NOTE: eval() executes the anchor expressions stored in the project YAML;
    # only run this with trusted project files.
    model = EfficientDetBackbone(num_classes=len(params.obj_list),
                                 compound_coef=opt.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)

        # Checkpoints are named "..._{epoch}_{step}.pth"; any other name means
        # "start counting from step 0".
        try:
            last_step = int(
                os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except Exception:
            last_step = 0

        try:
            model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print(
                '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already.'
            )

        print(
            f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if train head_only
    if opt.head_only:

        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] freezed backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when using multiple GPUs and the per-GPU batch size is
    # below 4: it normalizes over the mini-batches of all GPUs together,
    # which stabilizes training at a small speed cost.
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    writer = SummaryWriter(
        opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    # Wrap the model with its loss function to reduce the memory usage on
    # gpu0 and speed things up.
    model = ModelWithLoss(model, debug=opt.debug)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = CustomDataParallel(model, params.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    if opt.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), opt.lr,
                                    momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=3,
                                                           verbose=True)

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        # Work out where to resume exactly once. Re-deriving this from the
        # live ``step`` every epoch would skip batches in later epochs
        # whenever an earlier batch was dropped (zero/non-finite loss or an
        # exception), because ``step`` then lags behind the batch count.
        resume_epoch, resume_iter = divmod(step, num_iter_per_epoch)

        for epoch in range(opt.num_epochs):
            if epoch < resume_epoch:
                continue
            skip_batches = resume_iter if epoch == resume_epoch else 0

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for batch_idx, data in enumerate(progress_bar):
                if batch_idx < skip_batches:
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if params.num_gpus == 1:
                        # With a single GPU send the batch to cuda:0 here;
                        # with several GPUs CustomDataParallel scatters it.
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss = model(imgs, annot,
                                               obj_list=params.obj_list)
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. Total loss: {:.5f}'
                        .format(step, epoch, opt.num_epochs, batch_idx + 1,
                                num_iter_per_epoch, cls_loss.item(),
                                reg_loss.item(), loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss',
                                       {'train': reg_loss}, step)
                    writer.add_scalars('Classfication_loss',
                                       {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                    if step % opt.save_interval == 0 and step > 0:
                        save_checkpoint(
                            model,
                            f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth'
                        )
                        print('checkpoint...')

                except Exception as e:
                    # Best effort: report the failing batch and keep training.
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            scheduler.step(np.mean(epoch_loss))

            if epoch % opt.val_interval == 0:
                model.eval()
                loss_regression_ls = []
                loss_classification_ls = []
                for data in val_generator:
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        if params.num_gpus == 1:
                            imgs = imgs.cuda()
                            annot = annot.cuda()

                        cls_loss, reg_loss = model(imgs, annot,
                                                   obj_list=params.obj_list)
                        cls_loss = cls_loss.mean()
                        reg_loss = reg_loss.mean()

                        loss = cls_loss + reg_loss
                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        loss_classification_ls.append(cls_loss.item())
                        loss_regression_ls.append(reg_loss.item())

                cls_loss = np.mean(loss_classification_ls)
                reg_loss = np.mean(loss_regression_ls)
                loss = cls_loss + reg_loss

                print(
                    'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}'
                    .format(epoch, opt.num_epochs, cls_loss, reg_loss, loss))
                writer.add_scalars('Loss', {'val': loss}, step)
                writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
                writer.add_scalars('Classfication_loss', {'val': cls_loss},
                                   step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch

                    save_checkpoint(
                        model,
                        f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth'
                    )

                model.train()

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print(
                        '[Info] Stop training at epoch {}. The lowest loss achieved is {}'
                        .format(epoch, best_loss))
                    break
    except KeyboardInterrupt:
        # Keep the progress made so far when the user aborts.
        save_checkpoint(
            model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
    finally:
        # Flush TensorBoard events on every exit path, including errors.
        writer.close()
def train(opt):
    """Train the human-object-interaction EfficientDet variant on HICO-DET.

    Builds the HICO-DET loaders, restores (and key-remaps) pretrained weights
    when requested, optionally freezes the backbone / object-detection
    branch, and runs the train/validate loop with gradient accumulation and
    TensorBoard logging. A checkpoint is written after every epoch and a
    ``*_best.pth`` checkpoint whenever the validation loss improves.

    Args:
        opt: Options object. This function reads ``project``, ``saved_path``,
            ``log_path``, ``batch_size``, ``num_workers``, ``compound_coef``,
            ``load_weights``, ``head_only``, ``freeze_object_detection``,
            ``debug``, ``optim``, ``lr``, ``num_epochs``,
            ``accumulate_batch``, ``log_interval``, ``val_interval`` and
            ``es_min_delta``. ``opt.saved_path`` and ``opt.log_path`` are
            rewritten in place; the TensorBoard directory is wiped and
            recreated.
    """
    params = Params(f'projects/{opt.project}.yml')

    if opt.project == "vcoco":
        num_obj_class = 90
        num_union_action = 25
        num_inst_action = 51
    else:
        assert opt.project == "hico-det"
        num_obj_class = 90
        num_union_action = 117
        num_inst_action = 234

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    opt.saved_path = opt.saved_path + f'/{params.project_name}/'
    opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    training_params = {
        'batch_size': opt.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers,
        'pin_memory': False,
    }
    val_params = {
        'batch_size': opt.batch_size * 2,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers,
        'pin_memory': False,
    }

    # Input resolution indexed by compound coefficient.
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    train_transform = transforms.Compose([
        Normalizer(mean=params.mean, std=params.std),
        Augmenter(),
        Resizer(input_sizes[opt.compound_coef]),
    ])
    val_transform = transforms.Compose([
        Normalizer(mean=params.mean, std=params.std),
        Resizer(input_sizes[opt.compound_coef]),
    ])

    if opt.project == "vcoco":
        # V-COCO dataset construction is disabled in this script; bail out.
        exit(-999)
    else:
        training_set = HICO_DET_Dataset(root_dir="data/hico_20160224_det",
                                        set="train",
                                        color_prob=1,
                                        transform=train_transform)
        val_set = HICO_DET_Dataset(root_dir="data/hico_20160224_det",
                                   set="test",
                                   transform=val_transform)

    training_generator = DataLoader(training_set, **training_params)
    val_generator = DataLoader(val_set, **val_params)

    # NOTE: eval() executes the anchor expressions stored in the project YAML;
    # only run this with trusted project files.
    model = EfficientDetBackbone(num_classes=num_obj_class,
                                 num_union_classes=num_union_action,
                                 num_inst_classes=num_inst_action,
                                 compound_coef=opt.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))
    model.train()
    print("num_classes:", num_obj_class)
    print("num_union_classes:", num_union_action)
    print("instance_action_list", num_inst_action)

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)

        # Per-epoch checkpoints are named "..._{epoch}.pth"; any other name
        # (e.g. a pretrained detector) means "start from step 0".
        try:
            ckpt_epoch = int(
                os.path.basename(weights_path).split('_')[-1].split('.')[0])
            last_step = ckpt_epoch * len(training_generator)
        except Exception:
            last_step = 0

        try:
            init_weights(model)
            print(weights_path)
            model_dict = model.state_dict()
            pretrained_dict = torch.load(weights_path,
                                         map_location=torch.device('cpu'))
            # Keep keys the model knows; keys that only exist under the
            # "instance_branch.object_" prefix are remapped onto that branch.
            new_pretrained_dict = {}
            for k, v in pretrained_dict.items():
                if k in model_dict:
                    new_pretrained_dict[k] = v
                elif ("instance_branch.object_" + k) in model_dict:
                    new_pretrained_dict["instance_branch.object_" + k] = v
            model.load_state_dict(new_pretrained_dict, strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print(
                '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already.'
            )

        print(
            f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if train head_only
    if opt.head_only:
        model.apply(freeze_backbone)
        freeze_bn_backbone(model)
        print('[Info] freezed backbone')

    if opt.freeze_object_detection:
        freeze_object_detection(model)
        freeze_bn_object_detection(model)
        print('[Info] freezed object detection branch')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when using multiple GPUs and the per-GPU batch size is
    # below 8: it normalizes over the mini-batches of all GPUs together,
    # which stabilizes training at a small speed cost.
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 8:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    # Start every run with an empty TensorBoard directory.
    if os.path.exists(opt.log_path):
        import shutil
        shutil.rmtree(opt.log_path)
        os.makedirs(opt.log_path)

    writer = SummaryWriter(
        opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    # Wrap the model with its loss function to reduce the memory usage on
    # gpu0 and speed things up.
    model = ModelWithLoss(model, dataset=opt.project, debug=opt.debug)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = CustomDataParallel(model, params.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)
                # The BN layers were replaced by SyncBN above, so the frozen
                # parts have to be frozen again on the wrapped model.
                # NOTE(review): nesting reconstructed from a whitespace-
                # collapsed source; confirm these belong under use_sync_bn.
                if opt.head_only:
                    print('[Info] freezed SyncBN backbone')
                    freeze_bn_backbone(model.module.model)
                if opt.freeze_object_detection:
                    print('[Info] freezed SyncBN object detection')
                    freeze_bn_object_detection(model.module.model)

    # Only parameters that still require gradients are handed to the
    # optimizer, so frozen branches are left untouched.
    if opt.optim == 'adamw':
        optimizer = torch.optim.AdamW(
            filter(lambda p: p.requires_grad, model.parameters()), opt.lr)
    elif opt.optim == "adam":
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()), opt.lr)
    else:
        optimizer = torch.optim.SGD(
            filter(lambda p: p.requires_grad, model.parameters()),
            opt.lr, momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=2,
                                                           verbose=True,
                                                           min_lr=1e-7)

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)

    # One optimizer step is taken every ``accumulate_batch`` batches.
    num_batches = len(training_generator)
    num_iter_per_epoch = (num_batches + opt.accumulate_batch - 1) // opt.accumulate_batch
    start_time = time.time()

    try:
        for epoch in range(opt.num_epochs):
            # NOTE(review): with step == 0 this evaluates to 1, so epoch 0 is
            # always skipped on a fresh run; confirm that is intended.
            first_epoch = step // num_iter_per_epoch + 1
            if epoch < first_epoch:
                continue

            # Manual 10x learning-rate decay at fixed epochs.
            if epoch in [12, 16]:
                optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / 10

            epoch_loss = []
            for batch_idx, data in enumerate(training_generator):
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if params.num_gpus == 1:
                        # With a single GPU send the batch to cuda:0 here;
                        # with several GPUs CustomDataParallel scatters it.
                        imgs = imgs.cuda()
                        for key in annot:
                            annot[key] = annot[key].cuda()

                    (union_act_cls_loss, union_sub_reg_loss,
                     union_obj_reg_loss, union_diff_reg_loss,
                     inst_act_cls_loss, inst_obj_cls_loss,
                     inst_obj_reg_loss) = model(imgs, annot["instance"],
                                                annot["interaction"])

                    union_act_cls_loss = union_act_cls_loss.mean()
                    union_sub_reg_loss = union_sub_reg_loss.mean()
                    union_obj_reg_loss = union_obj_reg_loss.mean()
                    union_diff_reg_loss = union_diff_reg_loss.mean()

                    inst_act_cls_loss = inst_act_cls_loss.mean()
                    inst_obj_cls_loss = inst_obj_cls_loss.mean()
                    inst_obj_reg_loss = inst_obj_reg_loss.mean()

                    union_loss = (union_act_cls_loss + union_sub_reg_loss
                                  + union_obj_reg_loss + union_diff_reg_loss)
                    instance_loss = (inst_act_cls_loss + inst_obj_cls_loss
                                     + inst_obj_reg_loss)

                    # Only the instance *action* loss is optimized together
                    # with the union loss; ``instance_loss`` is logged only.
                    loss = union_loss + inst_act_cls_loss

                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    batch_loss = loss / opt.accumulate_batch
                    batch_loss.backward()
                    if ((batch_idx + 1) % opt.accumulate_batch == 0
                            or batch_idx == num_batches - 1):
                        optimizer.step()
                        optimizer.zero_grad()
                        step += 1

                    loss = loss.item()
                    union_loss = union_loss.item()
                    instance_loss = instance_loss.item()

                    epoch_loss.append(float(loss))
                    current_lr = optimizer.param_groups[0]['lr']

                    if step % opt.log_interval == 0:
                        writer.add_scalars('Union Action Classification Loss',
                                           {'train': union_act_cls_loss}, step)
                        writer.add_scalars('Union Subject Regression Loss',
                                           {'train': union_sub_reg_loss}, step)
                        writer.add_scalars('Union Object Regression Loss',
                                           {'train': union_obj_reg_loss}, step)
                        writer.add_scalars('Union Diff Regression Loss',
                                           {'train': union_diff_reg_loss}, step)

                        writer.add_scalars('Instance Action Classification Loss',
                                           {'train': inst_act_cls_loss}, step)
                        writer.add_scalars('Instance Object Classification Loss',
                                           {'train': inst_obj_cls_loss}, step)
                        writer.add_scalars('Instance Regression Loss',
                                           {'train': inst_obj_reg_loss}, step)

                        writer.add_scalars('Total Loss', {'train': loss}, step)
                        writer.add_scalars('Union Loss',
                                           {'train': union_loss}, step)
                        writer.add_scalars('Instance Loss',
                                           {'train': instance_loss}, step)

                        # log learning_rate
                        writer.add_scalar('learning_rate', current_lr, step)

                    if batch_idx % 20 == 0:
                        end_time = time.time()
                        print(
                            'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Union loss: {:.5f}. Instance loss: {:.5f}. '
                            ' Total loss: {:.5f}. Learning rate: {:.5f} Time: {:.2f}s'
                            .format(step, epoch, opt.num_epochs,
                                    (batch_idx + 1) // opt.accumulate_batch,
                                    num_iter_per_epoch, union_loss,
                                    instance_loss, loss, current_lr,
                                    end_time - start_time))
                        start_time = time.time()

                except Exception as e:
                    # Best effort: report the failing batch and keep training.
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            save_checkpoint(model,
                            f'efficientdet-d{opt.compound_coef}_{epoch}.pth')
            print('checkpoint...')

            if epoch % opt.val_interval == 0:
                # NOTE(review): validation runs without model.eval() (the
                # call is commented out in the original); confirm intended.
                union_loss_ls = []
                instance_loss_ls = []

                union_act_cls_loss_ls = []
                union_sub_reg_loss_ls = []
                union_obj_reg_loss_ls = []
                union_diff_reg_loss_ls = []

                inst_act_cls_loss_ls = []
                inst_obj_cls_loss_ls = []
                inst_obj_reg_loss_ls = []

                val_loss = []

                for batch_idx, data in enumerate(val_generator):
                    if (batch_idx + 1) % 50 == 0:
                        print("%d/%d" % (batch_idx + 1, len(val_generator)))
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        if params.num_gpus == 1:
                            imgs = imgs.cuda()
                            for key in annot:
                                annot[key] = annot[key].cuda()

                        (union_act_cls_loss, union_sub_reg_loss,
                         union_obj_reg_loss, union_diff_reg_loss,
                         inst_act_cls_loss, inst_obj_cls_loss,
                         inst_obj_reg_loss) = model(imgs, annot["instance"],
                                                    annot["interaction"])

                        union_act_cls_loss = union_act_cls_loss.mean()
                        union_sub_reg_loss = union_sub_reg_loss.mean()
                        union_obj_reg_loss = union_obj_reg_loss.mean()
                        union_diff_reg_loss = union_diff_reg_loss.mean()

                        inst_act_cls_loss = inst_act_cls_loss.mean()
                        inst_obj_cls_loss = inst_obj_cls_loss.mean()
                        inst_obj_reg_loss = inst_obj_reg_loss.mean()

                        union_loss = (union_act_cls_loss + union_sub_reg_loss
                                      + union_obj_reg_loss
                                      + union_diff_reg_loss)
                        instance_loss = (inst_act_cls_loss + inst_obj_cls_loss
                                         + inst_obj_reg_loss)

                        loss = union_loss + inst_act_cls_loss

                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        val_loss.append(loss.item())

                        union_act_cls_loss_ls.append(union_act_cls_loss.item())
                        union_sub_reg_loss_ls.append(union_sub_reg_loss.item())
                        union_obj_reg_loss_ls.append(union_obj_reg_loss.item())
                        union_diff_reg_loss_ls.append(
                            union_diff_reg_loss.item())

                        inst_act_cls_loss_ls.append(inst_act_cls_loss.item())
                        inst_obj_cls_loss_ls.append(inst_obj_cls_loss.item())
                        inst_obj_reg_loss_ls.append(inst_obj_reg_loss.item())

                        union_loss_ls.append(union_loss.item())
                        instance_loss_ls.append(instance_loss.item())

                union_loss = np.mean(union_loss_ls)
                instance_loss = np.mean(instance_loss_ls)

                union_act_cls_loss = np.mean(union_act_cls_loss_ls)
                union_sub_reg_loss = np.mean(union_sub_reg_loss_ls)
                union_obj_reg_loss = np.mean(union_obj_reg_loss_ls)
                union_diff_reg_loss = np.mean(union_diff_reg_loss_ls)

                inst_act_cls_loss = np.mean(inst_act_cls_loss_ls)
                inst_obj_cls_loss = np.mean(inst_obj_cls_loss_ls)
                inst_obj_reg_loss = np.mean(inst_obj_reg_loss_ls)

                loss = union_loss + inst_act_cls_loss

                print(
                    'Val. Epoch: {}/{}. Union loss: {:1.5f}. Instance loss: {:1.5f}. '
                    'Total loss: {:1.5f}'.format(epoch, opt.num_epochs,
                                                 union_loss, instance_loss,
                                                 loss))

                writer.add_scalars('Union Action Classification Loss',
                                   {'val': union_act_cls_loss}, step)
                writer.add_scalars('Union Subject Regression Loss',
                                   {'val': union_sub_reg_loss}, step)
                writer.add_scalars('Union Object Regression Loss',
                                   {'val': union_obj_reg_loss}, step)
                writer.add_scalars('Union Diff Regression Loss',
                                   {'val': union_diff_reg_loss}, step)

                writer.add_scalars('Instance Action Classification Loss',
                                   {'val': inst_act_cls_loss}, step)
                writer.add_scalars('Instance Object Classification Loss',
                                   {'val': inst_obj_cls_loss}, step)
                writer.add_scalars('Instance Regression Loss',
                                   {'val': inst_obj_reg_loss}, step)

                writer.add_scalars('Total Loss', {'val': loss}, step)
                writer.add_scalars('Union Loss', {'val': union_loss}, step)
                writer.add_scalars('Instance Loss', {'val': instance_loss},
                                   step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch

                    save_checkpoint(
                        model,
                        f'efficientdet-d{opt.compound_coef}_{best_epoch}_best.pth'
                    )

                # The plateau scheduler is driven by the validation loss;
                # training stops once it has cut the LR below lr / 100.
                scheduler.step(np.mean(val_loss))
                if optimizer.param_groups[0]['lr'] < opt.lr / 100:
                    break
    except KeyboardInterrupt:
        # Stop quietly; no extra checkpoint is written on interrupt.
        pass
    finally:
        # Flush TensorBoard events on every exit path, including errors.
        writer.close()
def train_det(opt, cfg):
    """Train an EfficientDet detector from a ``cfg`` object and log the run.

    Builds ``DataGenerator`` loaders from ``{opt.data_path}/Train`` and
    ``{opt.data_path}/Validation``, optionally restores weights, creates a
    timestamped ``trainlogs_*`` directory, dumps the configuration as JSON,
    and runs the train/validate loop (validation after every epoch) with
    checkpointing and early stopping.

    Args:
        opt: Options object. This function reads ``data_path``,
            ``num_workers``, ``load_weights``, ``saved_path``, ``log_path``,
            ``config``, ``debug``, ``es_min_delta`` and ``es_patience``.
            ``opt.saved_path`` (and ``opt.log_path`` when it is ``None``) are
            rewritten in place.
        cfg: Configuration object. This function reads ``batch_size``,
            ``compound_coef``, ``dictionary_class_name``, ``mean``, ``std``,
            ``augment_list``, ``anchor_ratios``, ``anchor_scales``,
            ``training_layer``, ``num_gpus``, ``optimizer``,
            ``learning_rate``, ``no_epochs`` and ``only_best_weights``, and
            calls ``to_pascal_case()``.
    """
    training_params = {
        'batch_size': cfg.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers,
    }
    val_params = {
        'batch_size': cfg.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers,
    }

    # Input resolution indexed by compound coefficient.
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    training_set = DataGenerator(
        data_path=os.path.join(opt.data_path, 'Train'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([
            Augmenter(),
            Normalizer(mean=cfg.mean, std=cfg.std),
            Resizer(input_sizes[cfg.compound_coef]),
        ]),
        pre_augments=['', *[f'{aug}_' for aug in cfg.augment_list]]
        if cfg.augment_list else None)
    training_generator = DataLoader(training_set, **training_params)

    val_set = DataGenerator(
        data_path=os.path.join(opt.data_path, 'Validation'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([
            Normalizer(mean=cfg.mean, std=cfg.std),
            Resizer(input_sizes[cfg.compound_coef]),
        ]))
    val_generator = DataLoader(val_set, **val_params)

    # NOTE: eval() executes the anchor expressions stored in the config;
    # only run this with trusted configuration.
    model = EfficientDetBackbone(num_classes=len(cfg.dictionary_class_name),
                                 compound_coef=cfg.compound_coef,
                                 ratios=eval(cfg.anchor_ratios),
                                 scales=eval(cfg.anchor_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)

        # Checkpoints are named "..._{epoch}_{step}.pth"; any other name means
        # "start counting from step 0".
        try:
            last_step = int(
                os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except Exception:
            last_step = 0

        try:
            model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print(
                '[Warning] Don\'t panic if you see this, '
                'this might be because you load a pretrained weights with different number of classes. '
                'The rest of the weights should be loaded already.')

        print(
            f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if train head_only
    if cfg.training_layer.lower() == 'heads':

        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] freezed backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync_bn when using multiple GPUs and the per-GPU batch size is
    # below 4: it normalizes over the mini-batches of all GPUs together,
    # which stabilizes training at a small speed cost.
    if cfg.num_gpus > 1 and cfg.batch_size // cfg.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    # Wrap the model with its loss function to reduce the memory usage on
    # gpu0 and speed things up.
    model = ModelWithLoss(model, debug=opt.debug)

    if cfg.num_gpus > 0:
        model = model.cuda()
        if cfg.num_gpus > 1:
            model = CustomDataParallel(model, cfg.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    # A single if/elif chain: with two independent ``if`` statements an
    # 'adamw' selection would be overwritten by the SGD fallback.
    optimizer_name = cfg.optimizer.lower()
    if optimizer_name == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), cfg.learning_rate)
    elif optimizer_name == 'srsgd':
        optimizer = SRSGD(model.parameters(), lr=cfg.learning_rate,
                          weight_decay=5e-4, iter_count=100)
    else:
        optimizer = torch.optim.SGD(model.parameters(), cfg.learning_rate,
                                    momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=3,
                                                           verbose=True)

    # Setup complete, then start training
    now = datetime.datetime.now()
    opt.saved_path = opt.saved_path + f'/trainlogs_{now.strftime("%Y%m%d_%H%M%S")}'
    if opt.log_path is None:
        opt.log_path = opt.saved_path
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    # Write history: the configuration used for this run is dumped as JSON.
    if 'backlog' not in opt.config:
        with open(
                os.path.join(opt.saved_path,
                             f'{now.strftime("%Y%m%d%H%M%S")}.backlog.json'),
                'w') as f:
            backlog = dict(cfg.to_pascal_case())
            backlog['__metadata__'] = 'Backlog at ' + now.strftime(
                "%Y/%m/%d %H:%M:%S")
            json.dump(backlog, f)
    else:
        with open(
                os.path.join(opt.saved_path,
                             f'{now.strftime("%Y%m%d%H%M%S")}.history.json'),
                'w') as f:
            history = dict(cfg.to_pascal_case())
            history['__metadata__'] = now.strftime("%Y/%m/%d %H:%M:%S")
            json.dump(history, f)

    writer = SummaryWriter(opt.log_path + '/tensorboard')

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        # Work out where to resume exactly once. Re-deriving this from the
        # live ``step`` every epoch would skip batches in later epochs
        # whenever an earlier batch was dropped (zero/non-finite loss or an
        # exception), because ``step`` then lags behind the batch count.
        resume_epoch, resume_iter = divmod(step, num_iter_per_epoch)

        for epoch in range(cfg.no_epochs):
            if epoch < resume_epoch:
                continue
            skip_batches = resume_iter if epoch == resume_epoch else 0

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for batch_idx, data in enumerate(progress_bar):
                if batch_idx < skip_batches:
                    progress_bar.set_description(
                        f'Skip {batch_idx} < {step} - {resume_epoch} * {num_iter_per_epoch}'
                    )
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if cfg.num_gpus == 1:
                        # With a single GPU send the batch to cuda:0 here;
                        # with several GPUs CustomDataParallel scatters it.
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss = model(
                        imgs, annot,
                        obj_list=cfg.dictionary_class_name.keys())
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. '
                        'Total loss: {:.5f}'.format(step, epoch,
                                                    cfg.no_epochs,
                                                    batch_idx + 1,
                                                    num_iter_per_epoch,
                                                    cls_loss.item(),
                                                    reg_loss.item(),
                                                    loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss',
                                       {'train': reg_loss}, step)
                    writer.add_scalars('Classification_loss',
                                       {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                except Exception as e:
                    # Best effort: report the failing batch and keep training.
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            scheduler.step(np.mean(epoch_loss))

            model.eval()
            loss_regression_ls = []
            loss_classification_ls = []
            for data in val_generator:
                with torch.no_grad():
                    imgs = data['img']
                    annot = data['annot']

                    if cfg.num_gpus == 1:
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    cls_loss, reg_loss = model(
                        imgs, annot,
                        obj_list=cfg.dictionary_class_name.keys())
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss_classification_ls.append(cls_loss.item())
                    loss_regression_ls.append(reg_loss.item())

            cls_loss = np.mean(loss_classification_ls)
            reg_loss = np.mean(loss_regression_ls)
            loss = cls_loss + reg_loss

            progress_bar.set_description(
                'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}.'
                ' Total loss: {:1.5f}'.format(epoch, cfg.no_epochs, cls_loss,
                                              reg_loss, loss))
            writer.add_scalars('Loss', {'val': loss}, step)
            writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
            writer.add_scalars('Classification_loss', {'val': cls_loss}, step)

            # NOTE(review): the two ``only_best_weights`` branches are
            # textually identical in the whitespace-collapsed original; the
            # layout below (save only on improvement vs. save every epoch)
            # is the reading under which the flag has an effect — confirm.
            improved = loss + opt.es_min_delta < best_loss
            if improved:
                best_loss = loss
                best_epoch = epoch
            if improved or not cfg.only_best_weights:
                save_checkpoint(
                    model,
                    f"{opt.saved_path}/det_d{cfg.compound_coef}_{epoch}_{step}.pth"
                )

            model.train()

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print(
                    '[Info] Stop training at epoch {}. The lowest loss achieved is {}'
                    .format(epoch, best_loss))
                break

        print(
            f'[Info] Finished training. Best loss achieved {best_loss} at epoch {best_epoch}.'
        )
    except KeyboardInterrupt:
        # Keep the progress made so far when the user aborts.
        save_checkpoint(
            model, f"{opt.saved_path}/d{cfg.compound_coef}_{epoch}_{step}.pth")
    finally:
        # Flush TensorBoard events on every exit path, including errors.
        writer.close()
def train(args):
    """Train the eye-dataset EfficientDet model on one train/validation split.

    Removes any previous ``train_log.txt`` / ``pre_trained_weight.pth`` in
    ``args.weight_path``, splits the images under
    ``{args.dataset_path}/train`` 80/20 per class (file names containing
    ``"n_"`` versus the rest), restores model/optimizer/scheduler state from
    ``{args.weight_path}/init_weight.pth``, and trains for ``args.epoch``
    epochs. Per-epoch train and validation loss/accuracy are appended to
    ``train_log.txt`` and the state with the lowest validation loss is saved
    to ``pre_trained_weight.pth``.

    Args:
        args: Options object. This function reads ``weight_path``, ``gpu``,
            ``batch_size``, ``num_workers``, ``compound_coef``, ``optim``,
            ``lr``, ``patience``, ``dataset_path`` and ``epoch``.

    Raises:
        AssertionError: If ``args.weight_path`` is empty.
    """
    assert args.weight_path, 'must indicate the path of initial weight'

    log_path = f'{args.weight_path}/train_log.txt'
    best_weight_path = f'{args.weight_path}/pre_trained_weight.pth'

    # Start from a clean log and no stale "best" checkpoint.
    if os.path.exists(log_path):
        os.remove(log_path)
    if os.path.exists(best_weight_path):
        os.remove(best_weight_path)

    print("Hi")
    params = Params(f'projects/eye.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_params = {
        'batch_size': args.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': args.num_workers,
    }
    val_params = {
        'batch_size': args.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': args.num_workers,
    }

    # Input resolution indexed by compound coefficient.
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    # NOTE: eval() executes the anchor expressions stored in the project YAML;
    # only run this with trusted project files.
    model = EfficientDetBackbone(num_classes=len(params.obj_list),
                                 compound_coef=args.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))
    init_weights(model)

    # Wrap the model with its loss function to reduce the memory usage on
    # gpu0 and speed things up.
    model = ModelWithLoss(model)
    model = model.cuda()

    if args.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), args.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=0.9, nesterov=True)

    # patience is counted in epochs
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=args.patience, verbose=True)

    # Split by file name: paths containing "n_" go to the "normal" list,
    # everything else to the "yellow" list.
    img_list = glob.glob(f"{args.dataset_path}/train/*")
    normal_img_list = []
    yellow_img_list = []
    for img in img_list:
        if img.find("n_") != -1:
            normal_img_list.append(img)
        else:
            yellow_img_list.append(img)
    random.shuffle(normal_img_list)
    random.shuffle(yellow_img_list)

    # Hold out one fifth of each class for validation.
    normal_val_num = int(len(normal_img_list) / 5)
    yellow_val_num = int(len(yellow_img_list) / 5)
    train_img_list = normal_img_list[normal_val_num:] + yellow_img_list[yellow_val_num:]
    val_img_list = normal_img_list[:normal_val_num] + yellow_img_list[:yellow_val_num]

    # Both splits come from the same folder, so they share one annotation file.
    train_anno_txt_path = f"{args.dataset_path}/train.txt"
    val_anno_txt_path = f"{args.dataset_path}/train.txt"

    train_transform = transforms.Compose([
        Augmenter(),
        randomScaleWidth(),
        randomBlur(),
        Normalizer(mean=params.mean, std=params.std),
        Resizer(input_sizes[args.compound_coef]),
    ])
    # NOTE(review): Augmenter() is also applied to the validation split;
    # confirm that this is intended.
    val_transform = transforms.Compose([
        Augmenter(),
        Normalizer(mean=params.mean, std=params.std),
        Resizer(input_sizes[args.compound_coef]),
    ])

    train_set = EyeDataset(train_img_list, train_anno_txt_path, train_transform)
    val_set = EyeDataset(val_img_list, val_anno_txt_path, val_transform)
    train_generator = DataLoader(train_set, **train_params)
    val_generator = DataLoader(val_set, **val_params)

    # Deserialize the initial checkpoint once and reuse it for all three
    # state dicts.
    init_state = torch.load(f'{args.weight_path}/init_weight.pth')
    model.model.load_state_dict(init_state["model_state_dict"])
    optimizer.load_state_dict(init_state["optimizer_state_dict"])
    scheduler.load_state_dict(init_state["scheduler_state_dict"])

    model.train()
    best_val_loss = 1e5

    for epoch in range(args.epoch):
        model.train()
        total_loss_ls = []
        total_correct = 0
        total = 0
        for data in train_generator:
            imgs = data['img'].cuda()
            annot = data['annot'].cuda()
            optimizer.zero_grad()
            reg_loss, cls_head_loss, cls_correct_num, total_num = model(
                imgs, annot, obj_list=params.obj_list)
            total_correct += cls_correct_num
            total += total_num
            reg_loss = reg_loss.mean()
            loss = cls_head_loss + reg_loss

            # Drop non-finite losses before recording them; a single NaN/inf
            # would otherwise poison the epoch mean fed to the scheduler.
            if not torch.isfinite(loss):
                continue
            total_loss_ls.append(loss.item())
            if loss == 0:
                continue

            loss.backward()
            optimizer.step()

        total_loss = np.mean(total_loss_ls)
        scheduler.step(total_loss)
        with open(log_path, 'a') as fp:
            fp.write(f'Epoch: {epoch} loss: {total_loss:.6f} | acc: {total_correct / total * 100:.2f}\n')

        model.eval()
        with torch.no_grad():
            total = 0
            total_correct = 0
            total_loss_ls = []
            for data in val_generator:
                imgs = data['img'].cuda()
                annot = data['annot'].cuda()
                reg_loss, cls_head_loss, cls_correct_num, total_num = model(
                    imgs, annot, obj_list=params.obj_list)
                total += total_num
                total_correct += cls_correct_num
                reg_loss = reg_loss.mean()
                loss = cls_head_loss + reg_loss
                total_loss_ls.append(loss.item())

            total_loss = np.mean(total_loss_ls)
            with open(log_path, 'a') as fp:
                fp.write(f'Epoch: {epoch} loss: {total_loss:.6f} | acc: {total_correct / total * 100:.2f}\n\n')

        # Keep only the state with the lowest validation loss so far.
        if total_loss < best_val_loss:
            best_val_loss = total_loss
            torch.save(
                {
                    "model_state_dict": model.model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict(),
                }, f"{args.weight_path}/pre_trained_weight.pth")
def _accuracy_percent(correct, total):
    """Return ``correct / total`` as a percentage, or NaN if nothing was counted."""
    if not total:
        # No batch was produced (e.g. an empty fold, or a fold smaller than
        # one batch with drop_last=True); avoid a ZeroDivisionError.
        return float('nan')
    return correct / total * 100


def _split_fold(items, part_num, fold):
    """Split ``items`` into (kept, held_out) for the given fold index.

    The held-out part is the contiguous slice
    ``items[fold * part_num:(fold + 1) * part_num]``; everything before and
    after it is kept.
    """
    start = fold * part_num
    stop = (fold + 1) * part_num
    return items[:start] + items[stop:], items[start:stop]


def _run_epoch(model, generator, obj_list, optimizer=None):
    """Run one pass over ``generator`` and gather loss/accuracy statistics.

    Args:
        model: callable invoked as ``model(imgs, annot, obj_list=obj_list)``
            that returns ``(reg_loss, cls_head_loss, cls_correct_num,
            total_num)``.
        generator: iterable of batches; each batch is indexed with ``'img'``
            and ``'annot'`` and both entries are moved to the GPU.
        obj_list: forwarded unchanged to ``model``.
        optimizer: when given, this is a training pass: gradients are zeroed
            before each forward pass and a step is taken after backward.
            When ``None`` no backward pass is performed. The caller is
            responsible for ``model.train()`` / ``model.eval()`` and for
            wrapping evaluation in ``torch.no_grad()``.

    Returns:
        Tuple ``(mean_loss, total_correct, total)`` where ``mean_loss`` is the
        mean of the per-batch loss values.
    """
    is_training = optimizer is not None
    batch_losses = []
    total_correct = 0
    total = 0

    for data in generator:
        imgs = data['img'].cuda()
        annot = data['annot'].cuda()

        if is_training:
            optimizer.zero_grad()

        reg_loss, cls_head_loss, cls_correct_num, total_num = model(
            imgs, annot, obj_list=obj_list)
        total_correct += cls_correct_num
        total += total_num

        loss = cls_head_loss + reg_loss.mean()
        # The loss is recorded before the validity check below, so a zero or
        # non-finite batch loss still contributes to the epoch mean.
        batch_losses.append(loss.item())

        if not is_training:
            continue
        # Skip the update for degenerate batches instead of propagating
        # NaN/Inf gradients into the weights.
        if loss == 0 or not torch.isfinite(loss):
            continue
        loss.backward()
        optimizer.step()

    return np.mean(batch_losses), total_correct, total


def train(args):
    """Run 10-fold cross-validation of the EfficientDet model on the eye dataset.

    A freshly initialised model/optimizer/scheduler state is written to
    ``{args.saved_path}/init_weight.pth`` and restored at the start of every
    fold so that all folds start from identical weights. Images found under
    ``{args.dataset_path}/train/`` are split into two groups (paths containing
    ``'n_'`` and all others) and each group is partitioned separately, so every
    fold holds out a slice of both groups. Per-epoch training/testing loss and
    accuracy, followed by the average of the last-epoch test metrics over all
    folds, are appended to ``./logs/<timestamp>/cv_log.txt``.

    Args:
        args: namespace providing ``gpu``, ``batch_size``, ``num_workers``,
            ``compound_coef``, ``optim``, ``lr``, ``patience``, ``saved_path``,
            ``dataset_path`` and ``epoch``.
    """
    present_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    params = Params('projects/eye.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # Seed every RNG in use. torch.manual_seed covers the CPU generator, which
    # drives weight initialisation (the model is built before .cuda()) and
    # DataLoader shuffling; seeding only the CUDA generators left those
    # unseeded and the runs non-reproducible.
    torch.manual_seed(20)
    torch.cuda.manual_seed(20)
    torch.cuda.manual_seed_all(20)
    np.random.seed(20)
    random.seed(20)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NOTE(review): presumably creates the log / checkpoint directories used
    # below - confirm against prepare_dir.
    prepare_dir(args, present_time)
    log_file = f'./logs/{present_time}/cv_log.txt'

    training_params = {
        'batch_size': args.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': args.num_workers,
    }
    # NOTE(review): drop_last=True discards the trailing partial batch of the
    # held-out fold, so those samples are never scored - confirm intended.
    val_params = {
        'batch_size': args.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': args.num_workers,
    }

    # One input resolution per compound coefficient (0-8). The ninth entry was
    # missing, which made compound_coef=8 fail with an IndexError.
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]

    # NOTE(security): eval() executes arbitrary expressions taken from the
    # project YAML; only use trusted config files.
    model = EfficientDetBackbone(num_classes=len(params.obj_list),
                                 compound_coef=args.compound_coef,
                                 ratios=eval(params.anchors_ratios),
                                 scales=eval(params.anchors_scales))

    # Resuming from a checkpoint is not supported here; every run starts from
    # freshly initialised weights.
    init_weights(model)

    # Wrap the model with the loss function, to reduce the memory usage on
    # gpu0 and speed up training.
    model = ModelWithLoss(model)
    model = model.cuda()

    if args.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), args.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=0.9, nesterov=True)

    # patience is counted in epochs.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=args.patience, verbose=True)

    # Snapshot the pristine state so each fold can be reset to it.
    init_weight_path = f"{args.saved_path}/init_weight.pth"
    torch.save(
        {
            "model_state_dict": model.model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
        }, init_weight_path)

    k = 10
    train_img_list = glob.glob(f"{args.dataset_path}/train/*")

    # Group images by the 'n_' marker anywhere in the path so that each group
    # is partitioned on its own.
    normal_img_list = []
    yellow_img_list = []
    for img in train_img_list:
        if 'n_' in img:
            normal_img_list.append(img)
        else:
            yellow_img_list.append(img)

    random.shuffle(normal_img_list)
    random.shuffle(yellow_img_list)

    normal_part_num = math.ceil(len(normal_img_list) / k)
    yellow_part_num = math.ceil(len(yellow_img_list) / k)

    # Both splits read their annotations from the same file.
    train_anno_txt_path = f"{args.dataset_path}/train.txt"
    test_anno_txt_path = f"{args.dataset_path}/train.txt"

    last_acc = []
    last_loss = []

    for i in range(k):
        # Reset model, optimizer and scheduler to the saved initial state.
        # The checkpoint is read once per fold rather than once per component.
        init_state = torch.load(init_weight_path)
        model.model.load_state_dict(init_state["model_state_dict"])
        optimizer.load_state_dict(init_state["optimizer_state_dict"])
        scheduler.load_state_dict(init_state["scheduler_state_dict"])
        model.train()

        normal_train, normal_test = _split_fold(normal_img_list,
                                                normal_part_num, i)
        yellow_train, yellow_test = _split_fold(yellow_img_list,
                                                yellow_part_num, i)
        sub_train_img_list = normal_train + yellow_train
        sub_test_img_list = normal_test + yellow_test

        random.shuffle(sub_train_img_list)
        random.shuffle(sub_test_img_list)

        print("---")
        for img in sub_test_img_list:
            print(img)
        print("---")

        train_transform = transforms.Compose([
            Augmenter(),
            randomScaleWidth(),
            randomBlur(),
            randomBrightness(),
            randomHue(),
            randomSaturation(),
            Normalizer(mean=params.mean, std=params.std),
            Resizer(input_sizes[args.compound_coef]),
        ])
        # NOTE(review): Augmenter() is also applied to the held-out fold, so
        # test metrics may be computed on augmented images - confirm intended.
        test_transform = transforms.Compose([
            Augmenter(),
            Normalizer(mean=params.mean, std=params.std),
            Resizer(input_sizes[args.compound_coef]),
        ])

        train_set = EyeDataset(sub_train_img_list, train_anno_txt_path,
                               train_transform)
        test_set = EyeDataset(sub_test_img_list, test_anno_txt_path,
                              test_transform)

        training_generator = DataLoader(train_set, **training_params)
        val_generator = DataLoader(test_set, **val_params)

        for epoch in range(args.epoch):
            model.train()
            train_loss, train_correct, train_total = _run_epoch(
                model, training_generator, params.obj_list, optimizer)
            # The LR schedule is driven by the mean training loss.
            scheduler.step(train_loss)
            train_acc = _accuracy_percent(train_correct, train_total)

            with open(log_file, 'a') as fp:
                fp.write(f"Epoch: {i}/{epoch}/{args.epoch}\n")
                fp.write(
                    f"Training loss: {train_loss:.6f} | acc: {train_acc:.2f}\n"
                )

            model.eval()
            with torch.no_grad():
                test_loss, test_correct, test_total = _run_epoch(
                    model, val_generator, params.obj_list)
            test_acc = _accuracy_percent(test_correct, test_total)

            with open(log_file, 'a') as fp:
                fp.write(
                    f"Testing loss: {test_loss:.6f} | acc: {test_acc:.2f}\n\n"
                )

            # Only the final epoch of each fold counts towards the summary.
            if epoch == args.epoch - 1:
                last_loss.append(test_loss)
                last_acc.append(test_acc)

    with open(log_file, 'a') as fp:
        fp.write("\n===========\n\n")
        fp.write(f"Avg. loss: {np.mean(np.array(last_loss)):.2f}\n")
        fp.write(f"Avg. accuracy: {np.mean(np.array(last_acc)):.2f}\n")