def train(opt):
    """Train an EfficientDet model according to the command-line options.

    Loads the project YAML config, builds COCO-style train/val dataloaders,
    optionally resumes from the last checkpoint, then runs the train/eval loop
    with TensorBoard logging, periodic checkpointing and early stopping.

    Args:
        opt: parsed options (project, data/log/save paths, batch_size,
             num_workers, compound_coef, lr, optim, num_epochs,
             save_interval, val_interval, es_min_delta, es_patience, ...).
    """
    params = Params(f'projects/{opt.project}.yml')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    # Fixed seed for reproducibility.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
    else:
        torch.manual_seed(42)

    opt.saved_path = opt.saved_path + f'/{params.project_name}/'
    opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/'
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    training_params = {'batch_size': opt.batch_size,
                       'shuffle': True,
                       'drop_last': True,
                       'collate_fn': collater,
                       'num_workers': opt.num_workers}

    val_params = {'batch_size': opt.batch_size,
                  'shuffle': False,
                  'drop_last': True,
                  'collate_fn': collater,
                  'num_workers': opt.num_workers}

    # Input resolution per compound coefficient (d0..d7).
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    training_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name),
                               set=params.train_set,
                               transform=transforms.Compose([Normalizer(mean=params.mean, std=params.std),
                                                             Augmenter(),
                                                             Resizer(input_sizes[opt.compound_coef])]))
    training_generator = DataLoader(training_set, **training_params)

    val_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name),
                          set=params.val_set,
                          transform=transforms.Compose([Normalizer(mean=params.mean, std=params.std),
                                                        Resizer(input_sizes[opt.compound_coef])]))
    val_generator = DataLoader(val_set, **val_params)

    # NOTE(review): eval() on config strings executes arbitrary code if the YAML
    # is untrusted; consider ast.literal_eval instead.
    model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=opt.compound_coef,
                                 ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            # Checkpoint files are named '..._<epoch>_<step>.pth'.
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except (ValueError, IndexError):
            # BUG FIX: was a bare `except:`; only catch filename-parse failures.
            last_step = 0

        try:
            ret = model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print(
                '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already.'
            )

        print(
            f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if train head_only
    if opt.head_only:
        def freeze_backbone(m):
            # Disable gradients for every EfficientNet/BiFPN submodule.
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] freezed backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # Apply sync-bn when using multiple GPUs and the per-GPU batch is < 4:
    # plain BN is unstable/slow to converge there, so normalize over the whole
    # multi-GPU batch instead (at a small speed cost).
    if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True  # BUG FIX: was the typo `Trueslack` (NameError at runtime)
    else:
        use_sync_bn = False

    writer = SummaryWriter(opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    # warp the model with loss function, to reduce the memory usage on gpu0 and speedup
    model = ModelWithLoss(model, debug=opt.debug)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = CustomDataParallel(model, params.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    if opt.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(opt.num_epochs):
            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                # Already completed when resuming from a checkpoint.
                continue

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for it, data in enumerate(progress_bar):  # renamed from `iter` (shadowed builtin)
                if it < step - last_epoch * num_iter_per_epoch:
                    # Skip batches seen before the resume point within this epoch.
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if params.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0
                        # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list)
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. Total loss: {:.5f}'.format(
                            step, epoch, opt.num_epochs, it + 1, num_iter_per_epoch,
                            cls_loss.item(), reg_loss.item(), loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss}, step)
                    writer.add_scalars('Classfication_loss', {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                    if step % opt.save_interval == 0 and step > 0:
                        save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
                        print('checkpoint...')

                except Exception as e:
                    # Keep training across per-batch failures (e.g. a corrupt sample).
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            scheduler.step(np.mean(epoch_loss))

            if epoch % opt.val_interval == 0:
                model.eval()
                loss_regression_ls = []
                loss_classification_ls = []
                for it, data in enumerate(val_generator):
                    with torch.no_grad():
                        imgs = data['img']
                        annot = data['annot']

                        if params.num_gpus == 1:
                            imgs = imgs.cuda()
                            annot = annot.cuda()

                        cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list)
                        cls_loss = cls_loss.mean()
                        reg_loss = reg_loss.mean()

                        loss = cls_loss + reg_loss
                        if loss == 0 or not torch.isfinite(loss):
                            continue

                        loss_classification_ls.append(cls_loss.item())
                        loss_regression_ls.append(reg_loss.item())

                cls_loss = np.mean(loss_classification_ls)
                reg_loss = np.mean(loss_regression_ls)
                loss = cls_loss + reg_loss

                print(
                    'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}'
                    .format(epoch, opt.num_epochs, cls_loss, reg_loss, loss))
                writer.add_scalars('Loss', {'val': loss}, step)
                writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
                writer.add_scalars('Classfication_loss', {'val': cls_loss}, step)

                # Checkpoint only on a meaningful improvement (es_min_delta margin).
                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')

                model.train()

                # Early stopping (es_patience <= 0 disables it).
                if epoch - best_epoch > opt.es_patience > 0:
                    print('[Info] Stop training at epoch {}. The lowest loss achieved is {}'.format(epoch, best_loss))
                    break
    except KeyboardInterrupt:
        save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth')
    finally:
        # BUG FIX: the writer was closed twice on KeyboardInterrupt; close exactly once.
        writer.close()
# NOTE(review): incomplete fragment — the opening of this dict literal
# (presumably `val_params = {`) is missing from this chunk, so the code below
# cannot be safely restructured from here; confirm against the full file.
# What is visible: the tail of a DataLoader-kwargs dict (batch_size 8, no
# shuffling, workers in the main process), an input-size table, hard-coded
# Windows paths to an Etri validation set, construction of a TobyCustom
# dataset (3-channel resize to input_sizes[4], ImageNet mean/std normalize),
# and its DataLoader — followed by commented-out COCO/Linux-path variants.
# TODO(review): hard-coded local paths ('D:/...') should come from config/args.
'batch_size': 8, 'shuffle': False, 'drop_last': True, 'collate_fn': collater, 'num_workers': 0 } input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536] root_val = 'D:/Etri_tracking_data/Etri_full/val_1024/' side_val = 'D:/Etri_tracking_data/Etri_full/val_Sejin_1024/' ground_truth_val = 'D:/Etri_tracking_data/Etri_full/val_1024.txt' val_set = TobyCustom(root_dir=root_val, side_dir = side_val, \ annot_path = ground_truth_val, \ transform=ComposeAlb([Resizer(input_sizes[4], num_channels=3), Normalizer(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])])) val_generator = DataLoader(val_set, **val_params) # root_val = 'D:/COCO/val/val2017/' # side_val = None # ground_truth_val = 'C:/Users/giang/Desktop/coco_val_2017.json' # root = '/home/../../data3/giangData/image_crop_1175x7680/' # side = '/home/../../data3/giangData/image_vol1_Sejin/' # ground_truth = '/home/../../data3/giangData/specific_train.txt' # val_set = TobyCustom4COCO(root_dir=root_val, side_dir = side_val, \ # annot_path = ground_truth_val, \ # transform=ComposeAlb([Resizer(input_sizes[4], 3), # Normalizer(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])) # val_generator = DataLoader(val_set, **val_params)
def train(args):
    """Fine-tune EfficientDet on the eye dataset from a saved initial state.

    Splits images under ``<dataset_path>/train`` into an 80/20 train/val split,
    stratified over 'normal' vs 'yellow' images (an ``n_`` substring in the
    filename marks normal ones), restores model/optimizer/scheduler state from
    ``<weight_path>/init_weight.pth``, trains for ``args.epoch`` epochs, and
    keeps the weights with the best validation loss in
    ``<weight_path>/pre_trained_weight.pth``.

    Args:
        args: parsed options (weight_path, dataset_path, gpu, batch_size,
              num_workers, compound_coef, optim, lr, patience, epoch).
    """
    assert args.weight_path, 'must indicate the path of initial weight'

    # Start from a clean log file and output checkpoint.
    if os.path.exists(f'{args.weight_path}/train_log.txt'):
        os.remove(f'{args.weight_path}/train_log.txt')
    if os.path.exists(f'{args.weight_path}/pre_trained_weight.pth'):
        os.remove(f'{args.weight_path}/pre_trained_weight.pth')

    print("Hi")
    params = Params('projects/eye.yml')  # was an f-string with no placeholders
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_params = {'batch_size': args.batch_size,
                    'shuffle': True,
                    'drop_last': True,
                    'collate_fn': collater,
                    'num_workers': args.num_workers}
    val_params = {'batch_size': args.batch_size,
                  'shuffle': False,
                  'drop_last': True,
                  'collate_fn': collater,
                  'num_workers': args.num_workers}

    # Input resolution per compound coefficient (d0..d7).
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    # NOTE(review): eval() on config strings executes arbitrary code if the YAML
    # is untrusted; consider ast.literal_eval instead.
    model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=args.compound_coef,
                                 ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales))
    init_weights(model)

    # warp the model with loss function, to reduce the memory usage on gpu0 and speedup
    model = ModelWithLoss(model)
    model = model.cuda()

    if args.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), args.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=args.patience,
                                                           verbose=True)  # unit is epoch

    # Stratified split: 'n_' in the filename marks a normal image, the rest
    # are treated as 'yellow'.
    img_list = glob.glob(f"{args.dataset_path}/train/*")
    normal_img_list = []
    yellow_img_list = []
    for img in img_list:
        if img.find("n_") != -1:
            normal_img_list.append(img)
        else:
            yellow_img_list.append(img)
    random.shuffle(normal_img_list)
    random.shuffle(yellow_img_list)

    # 1/5 of each class goes to validation.
    normal_val_num = int(len(normal_img_list) / 5)
    yellow_val_num = int(len(yellow_img_list) / 5)
    train_img_list = normal_img_list[normal_val_num:] + yellow_img_list[yellow_val_num:]
    val_img_list = normal_img_list[:normal_val_num] + yellow_img_list[:yellow_val_num]

    # NOTE(review): both splits read annotations from train.txt — presumably a
    # single annotation file covers all training images; confirm this is intended.
    train_anno_txt_path = f"{args.dataset_path}/train.txt"
    val_anno_txt_path = f"{args.dataset_path}/train.txt"

    train_transform = transforms.Compose([  # Normalizer(mean=params.mean, std=params.std),
        Augmenter(),
        randomScaleWidth(),
        randomBlur(),
        # randomBrightness(),
        # randomHue(),
        # randomSaturation(),
        Normalizer(mean=params.mean, std=params.std),
        Resizer(input_sizes[args.compound_coef])])
    # NOTE(review): Augmenter() in the validation transform makes the val loss
    # stochastic — confirm augmentation is wanted during evaluation.
    val_transform = transforms.Compose([  # Normalizer(mean=params.mean, std=params.std),
        Augmenter(),
        Normalizer(mean=params.mean, std=params.std),
        Resizer(input_sizes[args.compound_coef])])

    train_set = EyeDataset(train_img_list, train_anno_txt_path, train_transform)
    val_set = EyeDataset(val_img_list, val_anno_txt_path, val_transform)
    train_generator = DataLoader(train_set, **train_params)
    val_generator = DataLoader(val_set, **val_params)

    # PERF FIX: the checkpoint was torch.load()-ed three times; load it once.
    init_ckpt = torch.load(f'{args.weight_path}/init_weight.pth')
    model.model.load_state_dict(init_ckpt["model_state_dict"])
    optimizer.load_state_dict(init_ckpt["optimizer_state_dict"])
    scheduler.load_state_dict(init_ckpt["scheduler_state_dict"])

    model.train()
    best_val_loss = 1e5
    for epoch in range(args.epoch):
        model.train()
        total_loss_ls = []
        total_correct = 0
        total = 0
        for data in train_generator:
            imgs = data['img'].cuda()
            annot = data['annot'].cuda()

            optimizer.zero_grad()
            reg_loss, cls_head_loss, cls_correct_num, total_num = model(imgs, annot, obj_list=params.obj_list)
            total_correct += cls_correct_num
            total += total_num

            reg_loss = reg_loss.mean()
            loss = cls_head_loss + reg_loss
            # BUG FIX: skip degenerate losses BEFORE recording them — a single
            # inf/nan previously poisoned np.mean() and fed NaN to the scheduler.
            if loss == 0 or not torch.isfinite(loss):
                continue
            total_loss_ls.append(loss.item())

            loss.backward()
            optimizer.step()

        total_loss = np.mean(total_loss_ls)
        scheduler.step(total_loss)
        with open(f'{args.weight_path}/train_log.txt', 'a') as fp:
            fp.write(f'Epoch: {epoch} loss: {total_loss:.6f} | acc: {total_correct / total * 100:.2f}\n')

        model.eval()
        with torch.no_grad():
            total = 0
            total_correct = 0
            total_loss_ls = []
            for data in val_generator:
                imgs = data['img'].cuda()
                annot = data['annot'].cuda()
                reg_loss, cls_head_loss, cls_correct_num, total_num = model(imgs, annot, obj_list=params.obj_list)
                total += total_num
                total_correct += cls_correct_num
                reg_loss = reg_loss.mean()
                loss = cls_head_loss + reg_loss
                total_loss_ls.append(loss.item())

            total_loss = np.mean(total_loss_ls)
            with open(f'{args.weight_path}/train_log.txt', 'a') as fp:
                fp.write(f'Epoch: {epoch} loss: {total_loss:.6f} | acc: {total_correct / total * 100:.2f}\n\n')

            # Keep only the best-validation-loss weights.
            if total_loss < best_val_loss:
                best_val_loss = total_loss
                torch.save({
                    "model_state_dict": model.model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict(),
                }, f"{args.weight_path}/pre_trained_weight.pth")
def train(args):
    """Run k-fold (k=10) cross-validation of EfficientDet on the eye dataset.

    Seeds all RNGs for reproducibility, freshly initializes a model and saves
    its state as ``<saved_path>/init_weight.pth``, then for each of the k folds
    restores that shared initial state, trains on the other k-1 parts and
    evaluates on the held-out part. Folds are stratified over 'normal' vs
    'yellow' images ('n_' in the filename marks normal). Per-fold metrics and
    the final averages are appended to ``./logs/<timestamp>/cv_log.txt``.

    Args:
        args: parsed options (saved_path, dataset_path, gpu, batch_size,
              num_workers, compound_coef, optim, lr, patience, epoch).
    """
    print("Hi")
    present_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    params = Params('projects/eye.yml')  # was an f-string with no placeholders
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # Full determinism setup so the folds are reproducible.
    torch.cuda.manual_seed(20)
    torch.cuda.manual_seed_all(20)
    np.random.seed(20)
    random.seed(20)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    prepare_dir(args, present_time)

    training_params = {'batch_size': args.batch_size,
                       'shuffle': True,
                       'drop_last': True,
                       'collate_fn': collater,
                       'num_workers': args.num_workers}
    val_params = {'batch_size': args.batch_size,
                  'shuffle': False,
                  'drop_last': True,
                  'collate_fn': collater,
                  'num_workers': args.num_workers}

    # Input resolution per compound coefficient (d0..d7).
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    # NOTE(review): eval() on config strings executes arbitrary code if the YAML
    # is untrusted; consider ast.literal_eval instead.
    model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=args.compound_coef,
                                 ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales))
    # (Removed a large commented-out checkpoint-resume block that was dead code.)
    init_weights(model)

    # warp the model with loss function, to reduce the memory usage on gpu0 and speedup
    model = ModelWithLoss(model)
    model = model.cuda()

    if args.optim == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), args.lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=args.patience,
                                                           verbose=True)  # unit is epoch

    # Snapshot the shared initial state so every fold starts identically.
    torch.save({
        "model_state_dict": model.model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict(),
    }, f"{args.saved_path}/init_weight.pth")

    k = 10
    train_img_list = glob.glob(f"{args.dataset_path}/train/*")
    normal_img_list = []
    yellow_img_list = []
    for img in train_img_list:
        if img.find('n_') != -1:
            normal_img_list.append(img)
        else:
            yellow_img_list.append(img)
    random.shuffle(normal_img_list)
    random.shuffle(yellow_img_list)

    # Size of one held-out part per class (ceil so all images are covered).
    normal_part_num = math.ceil(len(normal_img_list) / k)
    yellow_part_num = math.ceil(len(yellow_img_list) / k)

    last_acc = []
    last_loss = []

    # PERF FIX: the init checkpoint was torch.load()-ed three times per fold
    # (30 loads); load it once and reuse — load_state_dict() copies the values.
    init_ckpt = torch.load(f"{args.saved_path}/init_weight.pth")

    for i in range(k):
        # Restore the shared initial state for this fold.
        model.model.load_state_dict(init_ckpt["model_state_dict"])
        optimizer.load_state_dict(init_ckpt["optimizer_state_dict"])
        scheduler.load_state_dict(init_ckpt["scheduler_state_dict"])
        model.train()

        # Fold i holds out part i of each class; the rest is training data.
        sub_train_img_list = (normal_img_list[:i * normal_part_num]
                              + normal_img_list[(i + 1) * normal_part_num:]
                              + yellow_img_list[:i * yellow_part_num]
                              + yellow_img_list[(i + 1) * yellow_part_num:])
        sub_test_img_list = (normal_img_list[i * normal_part_num:(i + 1) * normal_part_num]
                             + yellow_img_list[i * yellow_part_num:(i + 1) * yellow_part_num])
        random.shuffle(sub_train_img_list)
        random.shuffle(sub_test_img_list)

        print("---")
        for img in sub_test_img_list:
            print(img)
        print("---")

        # NOTE(review): both splits read annotations from train.txt — presumably
        # one annotation file covers all images; confirm this is intended.
        train_anno_txt_path = f"{args.dataset_path}/train.txt"
        test_anno_txt_path = f"{args.dataset_path}/train.txt"

        train_transform = transforms.Compose([  # Normalizer(mean=params.mean, std=params.std),
            Augmenter(),
            randomScaleWidth(),
            randomBlur(),
            randomBrightness(),
            randomHue(),
            randomSaturation(),
            Normalizer(mean=params.mean, std=params.std),
            Resizer(input_sizes[args.compound_coef])])
        # NOTE(review): Augmenter() in the test transform makes the test loss
        # stochastic — confirm augmentation is wanted during evaluation.
        test_transform = transforms.Compose([  # Normalizer(mean=params.mean, std=params.std),
            Augmenter(),
            Normalizer(mean=params.mean, std=params.std),
            Resizer(input_sizes[args.compound_coef])])

        train_set = EyeDataset(sub_train_img_list, train_anno_txt_path, train_transform)
        test_set = EyeDataset(sub_test_img_list, test_anno_txt_path, test_transform)
        training_generator = DataLoader(train_set, **training_params)
        val_generator = DataLoader(test_set, **val_params)

        for epoch in range(args.epoch):
            model.train()
            total_correct = 0
            total = 0
            total_loss_ls = []
            for data in training_generator:
                imgs = data['img'].cuda()
                annot = data['annot'].cuda()

                optimizer.zero_grad()
                reg_loss, cls_head_loss, cls_correct_num, total_num = model(imgs, annot, obj_list=params.obj_list)
                total_correct += cls_correct_num
                total += total_num

                reg_loss = reg_loss.mean()
                loss = cls_head_loss + reg_loss
                # BUG FIX: skip degenerate losses BEFORE recording them — a single
                # inf/nan previously poisoned np.mean() and fed NaN to the scheduler.
                if loss == 0 or not torch.isfinite(loss):
                    continue
                total_loss_ls.append(loss.item())

                loss.backward()
                optimizer.step()

            total_loss = np.mean(total_loss_ls)
            scheduler.step(total_loss)
            with open(f'./logs/{present_time}/cv_log.txt', 'a') as fp:
                fp.write(f"Epoch: {i}/{epoch}/{args.epoch}\n")
                fp.write(f"Training loss: {total_loss:.6f} | acc: {total_correct / total * 100:.2f}\n")

            model.eval()
            with torch.no_grad():
                total = 0
                total_correct = 0
                total_loss_ls = []
                for data in val_generator:
                    imgs = data['img'].cuda()
                    annot = data['annot'].cuda()
                    reg_loss, cls_head_loss, cls_correct_num, total_num = model(imgs, annot, obj_list=params.obj_list)
                    total_correct += cls_correct_num
                    total += total_num
                    reg_loss = reg_loss.mean()
                    loss = reg_loss + cls_head_loss
                    total_loss_ls.append(loss.item())

                total_loss = np.mean(total_loss_ls)
                with open(f'./logs/{present_time}/cv_log.txt', 'a') as fp:
                    fp.write(f"Testing loss: {total_loss:.6f} | acc: {total_correct / total * 100:.2f}\n\n")

                # Record the final-epoch metrics of this fold for the CV average.
                if epoch == args.epoch - 1:
                    last_loss.append(total_loss)
                    last_acc.append(total_correct / total * 100)

    with open(f'./logs/{present_time}/cv_log.txt', 'a') as fp:
        fp.write("\n===========\n\n")
        fp.write(f"Avg. loss: {np.mean(np.array(last_loss)):.2f}\n")
        fp.write(f"Avg. accuracy: {np.mean(np.array(last_acc)):.2f}\n")