def train(model, device, config, epochs=5, batch_size=1, save_cp=True, log_step=20, img_scale=0.5):
    """Train ``model`` on the samples listed in ``config.train_label``.

    Gradients are accumulated over ``config.subdivisions`` mini-batches of
    ``config.batch // config.subdivisions`` samples before each optimizer and
    scheduler step.  Loss terms and the learning rate are written to
    TensorBoard, the progress bar and the debug log every
    ``log_step * config.subdivisions`` mini-batches.

    Args:
        model: Network being optimised; called as ``model(images)``.
        device: ``torch.device`` every batch is moved to.
        config: Settings object.  Reads ``train_label``, ``val_label``,
            ``batch``, ``subdivisions``, ``learning_rate``, ``burn_in``,
            ``steps``, ``classes``, ``width``, ``TRAIN_OPTIMIZER``,
            ``TRAIN_TENSORBOARD_DIR`` and ``checkpoints``.
        epochs: Number of passes over the training set.
        batch_size: Unused; kept so existing callers keep working.
        save_cp: When true, save ``model.state_dict()`` after every epoch.
        log_step: Logging interval, counted in optimizer steps.
        img_scale: Unused; kept so existing callers keep working.
    """
    mini_batch = config.batch // config.subdivisions

    train_dataset = Yolo_dataset(config.train_label, config)
    # The validation set is only used to report its size in the start-up log.
    val_dataset = Yolo_dataset(config.val_label, config)
    n_train = len(train_dataset)
    n_val = len(val_dataset)

    train_loader = DataLoader(train_dataset, batch_size=mini_batch, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True,
                              collate_fn=collate)

    run_tag = (f'OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}'
               f'_Sub_{config.subdivisions}_Size_{config.width}')
    writer = SummaryWriter(log_dir=config.TRAIN_TENSORBOARD_DIR,
                           filename_suffix=run_tag, comment=run_tag)

    logging.info(
        f'Starting training: Epochs: {epochs} Batch size: {config.batch} '
        f'Subdivisions: {config.subdivisions} Learning rate: {config.learning_rate} '
        f'Training size: {n_train} Validation size: {n_val} Checkpoints: {save_cp} '
        f'Device: {device.type} Images size: {config.width} '
        f'Optimizer: {config.TRAIN_OPTIMIZER} Dataset classes: {config.classes} '
        f'Train label path:{config.train_label} Pretrained: '
    )

    def burnin_schedule(i):
        """Return the LR multiplier for optimizer step ``i``."""
        if i < config.burn_in:
            return pow(i / config.burn_in, 4)  # quartic warm-up
        if i < config.steps[0]:
            return 1.0
        if i < config.steps[1]:
            return 0.1
        return 0.01

    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate / config.batch,
                           betas=(0.9, 0.999), eps=1e-08)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)
    criterion = Yolo_loss(device=device, batch=mini_batch, n_classes=config.classes)

    global_step = 0
    model.train()
    try:
        for epoch in range(epochs):
            with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img',
                      ncols=50) as pbar:
                for batch in train_loader:
                    global_step += 1
                    images = batch[0].to(device=device, dtype=torch.float32)
                    bboxes = batch[1].to(device=device)

                    bboxes_pred = model(images)
                    loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                        bboxes_pred, bboxes)
                    loss.backward()

                    # Gradients accumulate until a full batch has been seen.
                    if global_step % config.subdivisions == 0:
                        optimizer.step()
                        scheduler.step()
                        model.zero_grad()

                    if global_step % (log_step * config.subdivisions) == 0:
                        # Read each scalar once; every .item() forces a device sync.
                        total = loss.item()
                        parts = {
                            'loss_xy': loss_xy.item(),
                            'loss_wh': loss_wh.item(),
                            'loss_obj': loss_obj.item(),
                            'loss_cls': loss_cls.item(),
                            'loss_l2': loss_l2.item(),
                        }
                        # The optimizer holds the LR last set by the scheduler;
                        # scale back by the batch size it was divided by above.
                        lr = optimizer.param_groups[0]['lr'] * config.batch

                        writer.add_scalar('train/Loss', total, global_step)
                        for tag, value in parts.items():
                            writer.add_scalar(f'train/{tag}', value, global_step)
                        writer.add_scalar('lr', lr, global_step)
                        pbar.set_postfix(**{'loss (batch)': total, **parts, 'lr': lr})
                        logging.debug('Train step_{}: loss : {},loss xy : {},loss wh : {},'
                                      'loss obj : {},loss cls : {},loss l2 : {},lr : {}'
                                      .format(global_step, total, *parts.values(), lr))

                    pbar.update(images.shape[0])

            if save_cp:
                if not os.path.isdir(config.checkpoints):
                    # makedirs also creates missing parents and surfaces real
                    # failures instead of hiding them until torch.save.
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info('Created checkpoint directory')
                torch.save(model.state_dict(),
                           os.path.join(config.checkpoints, f'Yolov4_epoch{epoch + 1}.pth'))
                logging.info(f'Checkpoint {epoch + 1} saved !')
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()
def train(model, device, config, epochs=5, batch_size=1, save_cp=True, log_step=20, img_scale=0.5):
    """Train ``model``, evaluate it on the validation set and checkpoint it each epoch.

    Gradients are accumulated over ``config.subdivisions`` mini-batches before
    each optimizer/scheduler step.  After every epoch a fresh inference model
    is loaded with the current weights, passed to ``evaluate`` and the twelve
    ``coco_eval['bbox'].stats`` values are written to TensorBoard.

    Args:
        model: Network being optimised (optionally wrapped so that the bare
            network is reachable as ``model.module``).
        device: ``torch.device`` used for training and evaluation.
        config: Settings object.  All configuration, including the fields
            needed to build the evaluation model (``use_darknet_cfg``,
            ``cfgfile``, ``pretrained``, ``classes``), is read from here.
        epochs: Number of passes over the training set.
        batch_size: Unused; kept so existing callers keep working.
        save_cp: When true, save ``model.state_dict()`` after every epoch and
            keep at most ``config.keep_checkpoint_max`` files (if > 0).
        log_step: Logging interval, counted in optimizer steps.
        img_scale: Unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``config.TRAIN_OPTIMIZER`` is neither ``adam`` nor ``sgd``.
    """
    mini_batch = config.batch // config.subdivisions

    train_dataset = Yolo_dataset(config.train_label, config, train=True)
    val_dataset = Yolo_dataset(config.val_label, config, train=False)
    n_train = len(train_dataset)
    n_val = len(val_dataset)

    train_loader = DataLoader(train_dataset, batch_size=mini_batch, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True,
                              collate_fn=collate)
    val_loader = DataLoader(val_dataset, batch_size=mini_batch, shuffle=True,
                            num_workers=8, pin_memory=True, drop_last=True,
                            collate_fn=val_collate)

    run_tag = (f'OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}'
               f'_Sub_{config.subdivisions}_Size_{config.width}')
    writer = SummaryWriter(log_dir=config.TRAIN_TENSORBOARD_DIR,
                           filename_suffix=run_tag, comment=run_tag)

    logging.info(
        f'Starting training: Epochs: {epochs} Batch size: {config.batch} '
        f'Subdivisions: {config.subdivisions} Learning rate: {config.learning_rate} '
        f'Training size: {n_train} Validation size: {n_val} Checkpoints: {save_cp} '
        f'Device: {device.type} Images size: {config.width} '
        f'Optimizer: {config.TRAIN_OPTIMIZER} Dataset classes: {config.classes} '
        f'Train label path:{config.train_label} Pretrained: '
    )

    def burnin_schedule(i):
        """Return the LR multiplier for optimizer step ``i``."""
        if i < config.burn_in:
            return pow(i / config.burn_in, 4)  # quartic warm-up
        if i < config.steps[0]:
            return 1.0
        if i < config.steps[1]:
            return 0.1
        return 0.01

    optimizer_name = config.TRAIN_OPTIMIZER.lower()
    if optimizer_name == 'adam':
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate / config.batch,
            betas=(0.9, 0.999),
            eps=1e-08,
        )
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=config.learning_rate / config.batch,
            momentum=config.momentum,
            weight_decay=config.decay,
        )
    else:
        # Fail here with a clear message rather than with an unbound-name
        # error when the scheduler is created.
        raise ValueError(f'Unsupported optimizer: {config.TRAIN_OPTIMIZER!r}')
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)
    criterion = Yolo_loss(device=device, batch=mini_batch, n_classes=config.classes)

    metric_tags = ('AP', 'AP50', 'AP75', 'AP_small', 'AP_medium', 'AP_large',
                   'AR1', 'AR10', 'AR100', 'AR_small', 'AR_medium', 'AR_large')
    save_prefix = 'Yolov4_epoch'
    saved_models = deque()
    global_step = 0

    model.train()
    try:
        for epoch in range(epochs):
            with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img',
                      ncols=50) as pbar:
                for batch in train_loader:
                    global_step += 1
                    images = batch[0].to(device=device, dtype=torch.float32)
                    bboxes = batch[1].to(device=device)

                    bboxes_pred = model(images)
                    loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                        bboxes_pred, bboxes)
                    loss.backward()

                    # Gradients accumulate until a full batch has been seen.
                    if global_step % config.subdivisions == 0:
                        optimizer.step()
                        scheduler.step()
                        model.zero_grad()

                    if global_step % (log_step * config.subdivisions) == 0:
                        # Read each scalar once; every .item() forces a device sync.
                        total = loss.item()
                        parts = {
                            'loss_xy': loss_xy.item(),
                            'loss_wh': loss_wh.item(),
                            'loss_obj': loss_obj.item(),
                            'loss_cls': loss_cls.item(),
                            'loss_l2': loss_l2.item(),
                        }
                        # The optimizer holds the LR last set by the scheduler.
                        lr = optimizer.param_groups[0]['lr'] * config.batch

                        writer.add_scalar('train/Loss', total, global_step)
                        for tag, value in parts.items():
                            writer.add_scalar(f'train/{tag}', value, global_step)
                        writer.add_scalar('lr', lr, global_step)
                        pbar.set_postfix(**{'loss (batch)': total, **parts, 'lr': lr})
                        logging.debug('Train step_{}: loss : {},loss xy : {},loss wh : {},'
                                      'loss obj : {},loss cls : {},loss l2 : {},lr : {}'
                                      .format(global_step, total, *parts.values(), lr))

                    pbar.update(images.shape[0])

            # Build the inference model from the ``config`` argument, not from
            # a module-level global that may not exist when train() is imported.
            if config.use_darknet_cfg:
                eval_model = Darknet(config.cfgfile, inference=True)
            else:
                eval_model = Yolov4(config.pretrained, n_classes=config.classes,
                                    inference=True)
            # Unwrap by inspecting the model itself: the GPU count says nothing
            # about whether this particular model was actually wrapped.
            bare_model = model.module if hasattr(model, 'module') else model
            eval_model.load_state_dict(bare_model.state_dict())
            eval_model.to(device)
            evaluator = evaluate(eval_model, val_loader, config, device)
            del eval_model

            stats = evaluator.coco_eval['bbox'].stats
            for idx, tag in enumerate(metric_tags):
                writer.add_scalar(f'train/{tag}', stats[idx], global_step)

            if save_cp:
                if not os.path.isdir(config.checkpoints):
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info('Created checkpoint directory')
                save_path = os.path.join(config.checkpoints, f'{save_prefix}{epoch + 1}.pth')
                torch.save(model.state_dict(), save_path)
                logging.info(f'Checkpoint {epoch + 1} saved !')

                # Keep only the newest ``keep_checkpoint_max`` files (0 = keep all).
                saved_models.append(save_path)
                if len(saved_models) > config.keep_checkpoint_max > 0:
                    model_to_remove = saved_models.popleft()
                    try:
                        os.remove(model_to_remove)
                    except OSError:
                        logging.info(f'failed to remove {model_to_remove}')
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()
def train(model, device, config, epochs=5, log_step=20, img_scale=0.5):
    """Train ``model`` with Adam and a hand-rolled burn-in / step LR schedule.

    The optimizer steps after every mini-batch (no gradient accumulation).
    The learning rate warms up over the first 10 iterations and is divided by
    10 after 50% and again after 80% of all iterations.  The model's
    ``state_dict`` is saved to ``config.checkpoints`` after every epoch.

    Args:
        model: Network being optimised; called as ``model(images)``.
        device: ``torch.device`` every batch is moved to.
        config: Settings object.  Reads ``train_label``, ``batch``,
            ``subdivisions``, ``learning_rate``, ``width``, ``classes``,
            ``TRAIN_OPTIMIZER`` and ``checkpoints``.
        epochs: Number of passes over the training set.  The argument is
            honoured; it is no longer overwritten by a hard-coded value.
        log_step: Log the losses every ``log_step`` iterations.
        img_scale: Unused; kept so existing callers keep working.
    """
    batch_size = config.batch
    train_dataset = Yolo_dataset(config.train_label, config)
    n_train = len(train_dataset)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=0, pin_memory=True, drop_last=True,
                              collate_fn=collate)
    batches_per_epoch = len(train_loader)

    logging.info(
        f'Starting training: Epochs: {epochs} Batch size: {config.batch} '
        f'Subdivisions: {config.subdivisions} Learning rate: {config.learning_rate} '
        f'Training size: {n_train} Device: {device.type} Images size: {config.width} '
        f'Optimizer: {config.TRAIN_OPTIMIZER} Dataset classes: {config.classes} '
        f'Train label path:{config.train_label} Pretrained: '
    )

    # NOTE(review): the base LR is fixed here and does not use
    # config.learning_rate, although the latter is what the log line reports.
    lr_start = 0.001
    burn_in = 10
    start_epoch = 0
    total_iters = batches_per_epoch * epochs
    steps = [int(0.5 * total_iters), int(0.8 * total_iters)]

    def adjust_learning_rate(optimizer, iteration, lr):
        """Set every param group's LR to ``lr`` scaled for ``iteration``.

        The factor rises as ``(iteration / burn_in) ** 4`` during burn-in, is
        1.0 until ``steps[0]``, 0.1 until ``steps[1]`` and 0.01 afterwards.
        """
        if iteration < burn_in:
            factor = pow(iteration / burn_in, 4)
        elif iteration < steps[0]:
            factor = 1.0
        elif iteration < steps[1]:
            factor = 0.1
        else:
            factor = 0.01
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr * factor

    optimizer = optim.Adam(model.parameters(), lr=lr_start, betas=(0.9, 0.999), eps=1e-08)
    # The loss is built for the batch size the loader actually yields; this
    # loop does not split batches into subdivisions.
    # NOTE(review): assumes Yolo_loss's ``batch`` must match the input batch
    # size - confirm against its definition.
    criterion = Yolo_loss(device=device, batch=batch_size, n_classes=config.classes)

    model.train()
    global_iter = start_epoch * batches_per_epoch
    for epoch in range(start_epoch, epochs):
        for i, batch in enumerate(train_loader):
            global_iter += 1
            images = batch[0].to(device=device, dtype=torch.float32)
            bboxes = batch[1].to(device=device)

            bboxes_pred = model(images)
            loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                bboxes_pred, bboxes, images)
            loss.backward()

            # Apply the schedule before stepping so the very first update is
            # already warmed up instead of running at the full base LR.
            adjust_learning_rate(optimizer, global_iter, lr_start)
            lr = optimizer.param_groups[0]['lr']
            optimizer.step()
            model.zero_grad()

            if global_iter % log_step == 0:
                part = i / batches_per_epoch
                logging.info(
                    'Epoch:%.2f/%d Loss:%.4f. loss_xy:%.4f loss_wh:%.4f loss_obj:%.4f '
                    'loss_cls:%.4f loss_l2:%.4f lr:%.6f',
                    epoch + part, epochs, loss.item(), loss_xy.item(), loss_wh.item(),
                    loss_obj.item(), loss_cls.item(), loss_l2.item(), lr)

        # Checkpoints are written only after the epoch has been trained, and
        # only once the target directory is known to exist.
        if not os.path.isdir(config.checkpoints):
            os.makedirs(config.checkpoints, exist_ok=True)
            logging.info('Created checkpoint directory')
        torch.save(model.state_dict(),
                   os.path.join(config.checkpoints, f'Yolov4_epoch{epoch + 1}.pth'))
        logging.info(f'Checkpoint {epoch + 1} saved !')
def train(model, device, config, epochs=5, batch_size=1, save_cp=True, log_step=20, img_scale=0.5):
    """Train ``model``, or only evaluate it when ``config.only_evaluate`` is set.

    In training mode gradients are accumulated over ``config.subdivisions``
    mini-batches before each optimizer/scheduler step.  After every epoch the
    weights are written with ``save_weights`` (when ``save_cp``) and, when
    ``config.evaluate_when_train`` is set, the model is evaluated and the
    twelve values returned by ``cocoEvaluate`` are logged.  Evaluation errors
    are printed and ignored so that they never abort training.

    Args:
        model: Network being optimised (optionally wrapped so that the bare
            network is reachable as ``model.module``).
        device: ``torch.device`` used for training and evaluation.
        config: Settings object read throughout.
        epochs: Number of passes over the training set.
        batch_size: Unused; kept so existing callers keep working.
        save_cp: When true, save weights after every epoch and keep at most
            ``config.keep_checkpoint_max`` files (if > 0).
        log_step: TensorBoard logging interval, counted in optimizer steps.
        img_scale: Unused; kept so existing callers keep working.

    Raises:
        NotImplementedError: In evaluate-only mode without ``use_darknet_cfg``.
        ValueError: If ``config.TRAIN_OPTIMIZER`` is neither ``adam`` nor ``sgd``.
    """
    # TODO: add resume support. Resuming would need the full config, the
    # contents of yolov4-custom.cfg, the weights, the epoch index and the
    # position reached in the learning-rate schedule.
    mini_batch = config.batch // config.subdivisions

    # config.train_label is the path of the label text file (e.g. data/coins.txt).
    train_dataset = Yolo_dataset(config.train_label, config, train=True)
    val_dataset = Yolo_dataset(config.val_label, config, train=False)
    n_train = len(train_dataset)
    n_val = len(val_dataset)

    # Author's note on DataLoader hangs: runs were fine with num_workers=0 and
    # hung with num_workers=8, regardless of pin_memory. The dataset loads
    # images with OpenCV, which starts its own threads by default; nesting
    # those inside loader workers can hang (possibly OS dependent).
    # Recommended fix: call cv2.setNumThreads(0) right after ``import cv2`` in
    # the dataset module. Alternative: load with PIL (slower than OpenCV).
    train_loader = DataLoader(train_dataset, batch_size=mini_batch, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=mini_batch, shuffle=False,
                            num_workers=8, pin_memory=True, drop_last=False,
                            collate_fn=val_collate)

    if config.only_evaluate or config.evaluate_when_train:
        tgtFile = makeTgtJson(val_loader, config.categories)

    # ``device == torch.device("cuda")`` is False for an indexed device such
    # as ``cuda:0``; compare the device type instead.
    use_cuda = device.type == 'cuda'

    def unwrap(net):
        """Return the bare network behind a ``.module`` wrapper, if any."""
        return net.module if hasattr(net, 'module') else net

    run_tag = (f'OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}'
               f'_Sub_{config.subdivisions}_Size_{config.width}')
    writer = SummaryWriter(log_dir=config.TRAIN_TENSORBOARD_DIR,
                           filename_suffix=run_tag, comment=run_tag)
    try:
        logging.info(
            f'Starting training: Epochs: {epochs} Batch size: {config.batch} '
            f'Subdivisions: {config.subdivisions} Learning rate: {config.learning_rate} '
            f'Training size: {n_train} Validation size: {n_val} Checkpoints: {save_cp} '
            f'Device: {device.type} Images size: {config.width} '
            f'Optimizer: {config.TRAIN_OPTIMIZER} Dataset classes: {config.classes} '
            f'Train label path:{config.train_label} Pretrained: '
            f'{config.pretrainedWeight is not None or config.Pretrained is not None} '
        )

        if config.only_evaluate:
            if not config.use_darknet_cfg:
                raise NotImplementedError
            eval_model = Darknet(config.cfgfile)
            eval_model.load_state_dict(unwrap(model).state_dict())
            eval_model.to(device)
            eval_model.eval()
            resFile = evaluate(eval_model, config.val_label, config.dataset_dir, use_cuda)
            if resFile is None:
                debugPrint("detect 0 boxes in the val set")
                return
            cocoEvaluate(tgtFile, resFile)
            return

        def burnin_schedule(i):
            """Return the LR multiplier for optimizer step ``i`` (not epoch)."""
            if i < config.burn_in:
                # Warm-up: the factor grows with the fourth power of progress.
                return pow(i / config.burn_in, 4)
            if i < config.steps[0]:
                return 1.0
            if i < config.steps[1]:
                return 0.1
            return 0.01

        optimizer_name = config.TRAIN_OPTIMIZER.lower()
        if optimizer_name == 'adam':
            optimizer = optim.Adam(
                model.parameters(),
                # The effective LR is the configured value divided by the batch size.
                lr=config.learning_rate / config.batch,
                betas=(0.9, 0.999),
                eps=1e-08,
            )
        elif optimizer_name == 'sgd':
            optimizer = optim.SGD(
                params=model.parameters(),
                lr=config.learning_rate / config.batch,
                momentum=config.momentum,
                weight_decay=config.decay,
            )
        else:
            raise ValueError(f'Unsupported optimizer: {config.TRAIN_OPTIMIZER!r}')

        scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)
        # Loss module applied to the network outputs.
        criterion = Yolo_loss(device=device, batch=mini_batch, n_classes=config.classes)

        metric_tags = ('AP', 'AP50', 'AP75', 'AP_small', 'AP_medium', 'AP_large',
                       'AR1', 'AR10', 'AR100', 'AR_small', 'AR_medium', 'AR_large')
        save_prefix = 'Yolov4_epoch'
        saved_models = deque()
        n_batches = len(train_loader)
        global_step = 0  # counts mini-batches across all epochs

        for epoch in range(epochs):
            model.train()
            logging.info("===Train===")
            for i, batch in enumerate(train_loader):
                global_step += 1
                images = batch[0].to(device=device, dtype=torch.float32)
                bboxes = batch[1].to(device=device)

                bboxes_pred = model(images)
                loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                    bboxes_pred, bboxes)
                loss.backward()

                # Gradients accumulate until a full batch has been seen.
                if global_step % config.subdivisions == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()

                logging.info("Epoch:[{:3}/{}],step:[{:3}/{}],total loss:{:.2f}|lr:{:.5f}".format(
                    epoch + 1, epochs, i + 1, n_batches, loss.item(),
                    scheduler.get_last_lr()[0]))

                if global_step % (log_step * config.subdivisions) == 0:
                    total = loss.item()
                    parts = {
                        'loss_xy': loss_xy.item(),
                        'loss_wh': loss_wh.item(),
                        'loss_obj': loss_obj.item(),
                        'loss_cls': loss_cls.item(),
                        'loss_l2': loss_l2.item(),
                    }
                    lr = scheduler.get_last_lr()[0] * config.batch
                    writer.add_scalar('train/Loss', total, global_step)
                    for tag, value in parts.items():
                        writer.add_scalar(f'train/{tag}', value, global_step)
                    writer.add_scalar('lr', lr, global_step)
                    logging.debug('Train step_{}: loss : {},loss xy : {},loss wh : {},'
                                  'loss obj : {},loss cls : {},loss l2 : {},lr : {}'
                                  .format(global_step, total, *parts.values(), lr))

            if save_cp:
                if not os.path.exists(config.checkpoints):
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info('Created checkpoint directory')
                save_path = os.path.join(config.checkpoints,
                                         f'{save_prefix}{epoch + 1}.weights')
                # save_weights is called on the bare network, not on a wrapper.
                unwrap(model).save_weights(save_path)
                logging.info(f'Checkpoint {epoch + 1} saved !')

                # Keep only the newest ``keep_checkpoint_max`` checkpoints.
                saved_models.append(save_path)
                if len(saved_models) > config.keep_checkpoint_max > 0:
                    model_to_remove = saved_models.popleft()
                    try:
                        os.remove(model_to_remove)
                    except OSError:
                        logging.info(f'failed to remove {model_to_remove}')

            if config.evaluate_when_train:
                try:
                    model.eval()
                    resFile = evaluate(model, config.val_label, config.dataset_dir,
                                       use_cuda, config.width, config.height)
                    if resFile is None:
                        continue
                    stats = cocoEvaluate(tgtFile, resFile)
                    logging.info("===Val===")
                    logging.info("Epoch:[{:3}/{}],AP:{:.3f}|AP50:{:.3f}|AP75:{:.3f}|APs:{:.3f}|APm:{:.3f}|APl:{:.3f}".format(
                        epoch + 1, epochs, stats[0], stats[1], stats[2], stats[3], stats[4], stats[5]))
                    logging.info("Epoch:[{:3}/{}],AR1:{:.3f}|AR10:{:.3f}|AR100:{:.3f}|ARs:{:.3f}|ARm:{:.3f}|ARl:{:.3f}".format(
                        epoch + 1, epochs, stats[6], stats[7], stats[8], stats[9], stats[10], stats[11]))
                    for idx, tag in enumerate(metric_tags):
                        writer.add_scalar(f'train/{tag}', stats[idx], global_step)
                except Exception:
                    # Deliberately best-effort: report the failure, keep training.
                    debugPrint("evaluate meets an exception, here is the exception info:")
                    traceback.print_exc()
                    debugPrint("ignore error in evaluate and continue training")
    finally:
        # Also reached on the evaluate-only early returns above.
        writer.close()
def train(model, device, config, epochs=5, batch_size=1, save_cp=True, log_step=20, img_scale=0.5,
          sample_img_path='/workspace/GitHub/YOLO/sample_data/sample_0.jpg'):
    """Train ``model`` with a fixed learning rate and periodic preview detections.

    Every ``10 * config.subdivisions`` mini-batches the accumulated gradients
    are applied, the loss terms are written to TensorBoard and the progress
    bar, and ``detect.detect_img`` is run on ``sample_img_path`` to produce
    ``prediction_sample.jpg``.  No learning-rate scheduler is active in this
    variant.  The model's ``state_dict`` is saved after each epoch when
    ``save_cp`` is true; old checkpoints are never pruned.

    Args:
        model: Network being optimised; called as ``model(images)``.
        device: ``torch.device`` every batch is moved to.
        config: Settings object read throughout.
        epochs: Number of passes over the training set.
        batch_size: Unused; kept so existing callers keep working.
        save_cp: When true, save ``model.state_dict()`` after every epoch.
        log_step: Unused in this variant; kept so existing callers keep working.
        img_scale: Unused; kept so existing callers keep working.
        sample_img_path: Image used for the periodic preview detection.  The
            preview is skipped with a warning when the file does not exist.

    Raises:
        ValueError: If ``config.TRAIN_OPTIMIZER`` is neither ``adam`` nor ``sgd``.
    """
    mini_batch = config.batch // config.subdivisions

    train_dataset = Yolo_dataset(config.train_label, config, train=True)
    # The validation set is only used to report its size in the start-up log.
    val_dataset = Yolo_dataset(config.val_label, config, train=False)
    n_train = len(train_dataset)
    n_val = len(val_dataset)

    train_loader = DataLoader(train_dataset, batch_size=mini_batch, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True,
                              collate_fn=collate)

    writer = SummaryWriter(log_dir=config.TRAIN_TENSORBOARD_DIR)

    logging.info(
        f'Starting training: Epochs: {epochs} Batch size: {config.batch} '
        f'Subdivisions: {config.subdivisions} Learning rate: {config.learning_rate} '
        f'Training size: {n_train} Validation size: {n_val} Checkpoints: {save_cp} '
        f'Device: {device.type} Images size: {config.width} '
        f'Optimizer: {config.TRAIN_OPTIMIZER} Dataset classes: {config.classes} '
        f'Train label path:{config.train_label} Pretrained: {config.pretrained} '
    )

    optimizer_name = config.TRAIN_OPTIMIZER.lower()
    if optimizer_name == 'adam':
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate,
            betas=(0.9, 0.999),
            eps=1e-08,
        )
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=config.learning_rate / config.batch,
            momentum=config.momentum,
            weight_decay=config.decay,
        )
    else:
        raise ValueError(f'Unsupported optimizer: {config.TRAIN_OPTIMIZER!r}')

    criterion = Yolo_loss(device=device, batch=mini_batch, n_classes=config.classes,
                          image_size=config.width)

    save_prefix = 'Yolov4_tiny_epoch'
    # ``a % 10 * b`` parses as ``(a % 10) * b``, which silently ignored the
    # subdivision count; compute the intended interval explicitly.
    # NOTE(review): the optimizer step shares this interval with logging, so
    # gradients accumulate over the whole interval - confirm that is intended.
    update_interval = 10 * config.subdivisions
    global_step = 0

    model.train()
    try:
        for epoch in range(epochs):
            with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img',
                      ncols=50) as pbar:
                print("\n", "-" * 50)
                print("EVALUATION CHECK")
                print("-" * 50, "\n")
                print()
                print("START ITER")
                for batch in train_loader:
                    global_step += 1
                    images = batch[0].to(device=device, dtype=torch.float32)
                    bboxes = batch[1].to(device=device)

                    bboxes_pred = model(images)
                    loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                        bboxes_pred, bboxes)
                    loss.backward()

                    if global_step % update_interval == 0:
                        optimizer.step()
                        model.zero_grad()

                        # Read each scalar once; every .item() forces a device sync.
                        total = loss.item()
                        parts = {
                            'loss_xy': loss_xy.item(),
                            'loss_wh': loss_wh.item(),
                            'loss_obj': loss_obj.item(),
                            'loss_cls': loss_cls.item(),
                            'loss_l2': loss_l2.item(),
                        }
                        print("ADD TO TENSORBOARD")
                        writer.add_scalar('train/Loss', total, global_step)
                        for tag, value in parts.items():
                            writer.add_scalar(f'train/{tag}', value, global_step)
                        pbar.set_postfix(**{'loss (batch)': total, **parts})
                        # Report the LR the optimizer really uses; for SGD it
                        # differs from config.learning_rate.
                        logging.debug('Train step_{}: loss : {},loss xy : {},loss wh : {},'
                                      'loss obj : {},loss cls : {},loss l2 : {},lr : {}'
                                      .format(global_step, total, *parts.values(),
                                              optimizer.param_groups[0]['lr']))

                        # Preview detection on one sample; a missing sample
                        # image must not abort training.
                        if os.path.isfile(sample_img_path):
                            model.eval()
                            detect.detect_img(model, sample_img_path,
                                              savename='prediction_sample.jpg', img_size=640)
                            model.train()
                        else:
                            logging.warning(f'Preview image not found: {sample_img_path}')

                    pbar.update(images.shape[0])

            if save_cp:
                if not os.path.isdir(config.checkpoints):
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info('Created checkpoint directory')
                save_path = os.path.join(config.checkpoints, f'{save_prefix}{epoch + 1}.pth')
                torch.save(model.state_dict(), save_path)
                logging.info(f'Checkpoint {epoch + 1} saved !')
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()
def train(model, device, config, epochs=5, batch_size=1, save_cp=True, log_step=20, img_scale=0.5,
          freeze_backbone=False):
    """Train the YOLOv4 network with the given configuration.

    Gradients are accumulated over ``config.subdivisions`` mini-batches before
    each optimizer/scheduler step.  Per-epoch evaluation is disabled in this
    variant, so no evaluation model is built.  The model's ``state_dict`` is
    saved after every epoch when ``save_cp`` is true.

    Args:
        model: Network being optimised (optionally wrapped, in which case its
            parameter names start with ``module.``).
        device: ``torch.device`` every batch is moved to.
        config: Settings object read throughout.
        epochs: Number of passes over the training set.
        batch_size: Unused; kept so existing callers keep working.
        save_cp: When true, save a checkpoint after every epoch and keep at
            most ``config.keep_checkpoint_max`` files (if > 0).
        log_step: Logging interval, counted in optimizer steps.
        img_scale: Unused; kept so existing callers keep working.
        freeze_backbone: When true, every parameter whose top-level module
            name does not contain ``head`` is frozen during the first epoch
            and unfrozen from the second epoch on.

    Raises:
        ValueError: If ``config.TRAIN_OPTIMIZER`` is neither ``adam`` nor ``sgd``.
    """
    mini_batch = config.batch // config.subdivisions

    train_dataset = Yolo_dataset(config.train_label, config, train=True)
    # The validation set is only used to report its size in the start-up log.
    val_dataset = Yolo_dataset(config.val_label, config, train=False)
    n_train = len(train_dataset)
    n_val = len(val_dataset)

    train_loader = DataLoader(train_dataset, batch_size=mini_batch, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True,
                              collate_fn=collate)

    logging.info(
        f'Starting training: Epochs: {epochs} Batch size: {config.batch} '
        f'Subdivisions: {config.subdivisions} Learning rate: {config.learning_rate} '
        f'Training size: {n_train} Validation size: {n_val} Checkpoints: {save_cp} '
        f'Device: {device.type} Images size: {config.width} '
        f'Optimizer: {config.TRAIN_OPTIMIZER} Dataset classes: {config.classes} '
        f'Train label path:{config.train_label} Pretrained: {config.pretrained} '
    )

    def burnin_schedule(i):
        """Return the LR multiplier for optimizer step ``i``."""
        if i < config.burn_in:
            return pow(i / config.burn_in, 4)  # quartic warm-up
        if i < config.steps[0]:
            return 1.0
        if i < config.steps[1]:
            return 0.1
        return 0.01

    optimizer_name = config.TRAIN_OPTIMIZER.lower()
    if optimizer_name == 'adam':
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate / config.batch,
            betas=(0.9, 0.999),
            eps=1e-08,
        )
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=config.learning_rate / config.batch,
            momentum=config.momentum,
            weight_decay=config.decay,
        )
    else:
        raise ValueError(f'Unsupported optimizer: {config.TRAIN_OPTIMIZER!r}')

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)
    criterion = Yolo_loss(device=device, batch=mini_batch, n_classes=config.classes)

    save_prefix = 'Yolov4_epoch'
    saved_models = deque()
    wrapper_prefix = 'module.'
    global_step = 0

    model.train()
    for epoch in range(epochs):
        # Epoch 0 freezes everything but the head; epoch 1 unfreezes it again.
        if freeze_backbone and epoch < 2:
            trainable = epoch != 0
            for name, param in model.named_parameters():
                # A wrapped model prefixes every name with "module."; without
                # stripping it the head would be frozen along with the rest.
                if name.startswith(wrapper_prefix):
                    name = name[len(wrapper_prefix):]
                if 'head' not in name.split('.')[0]:
                    param.requires_grad = trainable

        with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img',
                  ncols=200) as progress_bar:
            for batch in train_loader:
                global_step += 1
                images = batch[0].to(device=device, dtype=torch.float32)
                bboxes = batch[1].to(device=device)

                bboxes_pred = model(images)
                loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(
                    bboxes_pred, bboxes)
                loss.backward()

                # Gradients accumulate until a full batch has been seen.
                if global_step % config.subdivisions == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()

                if global_step % (log_step * config.subdivisions) == 0:
                    # Read each scalar once; every .item() forces a device sync.
                    total = loss.item()
                    parts = {
                        'loss_xy': loss_xy.item(),
                        'loss_wh': loss_wh.item(),
                        'loss_obj': loss_obj.item(),
                        'loss_cls': loss_cls.item(),
                        'loss_l2': loss_l2.item(),
                    }
                    # The optimizer holds the LR last set by the scheduler.
                    lr = optimizer.param_groups[0]['lr'] * config.batch
                    progress_bar.set_postfix(**{'loss (batch)': total, **parts, 'lr': lr})
                    logging.debug('Train step_{}: loss : {}, loss xy : {}, loss wh : {},'
                                  ' loss obj : {}, loss cls : {}, loss l2 : {}, lr : {}'
                                  .format(global_step, total, *parts.values(), lr))

                progress_bar.update(images.shape[0])

        if save_cp:
            if not os.path.isdir(config.checkpoints):
                os.makedirs(config.checkpoints, exist_ok=True)
                logging.info('Created checkpoint directory')
            save_path = os.path.join(config.checkpoints, f'{save_prefix}{epoch + 1}.pth')
            torch.save(model.state_dict(), save_path)
            logging.info(f'Checkpoint {epoch + 1} saved !')

            # Keep only the newest ``keep_checkpoint_max`` files (0 = keep all).
            saved_models.append(save_path)
            if len(saved_models) > config.keep_checkpoint_max > 0:
                model_to_remove = saved_models.popleft()
                try:
                    os.remove(model_to_remove)
                except OSError:
                    logging.info(f'failed to remove {model_to_remove}')
def train(model, device, config, epochs=5, batch_size=1, save_cp=True, log_step=20, img_scale=0.5):
    """Train ``model`` on the dataset named by ``config.train_label``.

    Gradients are accumulated over ``config.subdivisions`` mini-batches of
    ``config.batch // config.subdivisions`` samples before each optimizer and
    scheduler step. Every ``log_step * config.subdivisions`` mini-batches the
    loss components and the current learning rate are written as one
    space-separated line to ``loss.txt`` (in the working directory) and logged.
    When ``save_cp`` is truthy, ``model.state_dict()`` is saved to
    ``config.checkpoints/Yolov4_epoch{N}.pth`` after every tenth epoch.

    Args:
        model: Network to optimize; called on the image batch and put into
            training mode before the first epoch.
        device: Device the batches are moved to (``device.type`` is logged).
        config: Settings object; this function reads ``train_label``,
            ``batch``, ``subdivisions``, ``learning_rate``, ``burn_in``,
            ``steps``, ``classes``, ``width``, ``TRAIN_OPTIMIZER`` and
            ``checkpoints``.
        epochs: Number of passes over the training loader.
        batch_size: Unused by this function (the loader batch size comes from
            ``config``); kept so existing callers keep working.
        save_cp: Whether to write checkpoints.
        log_step: Logging interval, counted in optimizer steps.
        img_scale: Unused by this function; kept so existing callers keep
            working.
    """
    train_dataset = Yolo_dataset(config.train_label, config)
    n_train = len(train_dataset)
    # No validation set is loaded here; the size is only reported in the
    # start-up log below.
    n_val = 0

    mini_batch = config.batch // config.subdivisions
    train_loader = DataLoader(train_dataset, batch_size=mini_batch, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True,
                              collate_fn=collate)

    global_step = 0
    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images size:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
        Train label path:{config.train_label}
        Pretrained:
    ''')

    def burnin_schedule(i):
        """Return the LR multiplier for scheduler step ``i``."""
        if i < config.burn_in:
            # Polynomial warm-up from 0 towards 1 during burn-in.
            return pow(i / config.burn_in, 4)
        if i < config.steps[0]:
            return 1.0
        if i < config.steps[1]:
            return 0.1
        return 0.01

    # The base LR is divided by config.batch here and multiplied back when it
    # is reported, so the logged value is on the scale of config.learning_rate.
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate / config.batch,
                           betas=(0.9, 0.999), eps=1e-08)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)
    criterion = Yolo_loss(device=device, batch=mini_batch, n_classes=config.classes)

    log_interval = log_step * config.subdivisions

    model.train()
    # The context manager guarantees the loss log is flushed and closed even
    # if training is interrupted by an exception.
    with open('loss.txt', 'w') as outfile:
        for epoch in range(epochs):
            with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img', ncols=50) as pbar:
                for epoch_step, batch in enumerate(train_loader):
                    global_step += 1

                    images = batch[0].to(device=device, dtype=torch.float32)
                    bboxes = batch[1].to(device=device)

                    bboxes_pred = model(images)
                    loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(bboxes_pred, bboxes)
                    # Gradients accumulate across mini-batches until the
                    # optimizer step below clears them.
                    loss.backward()

                    if (epoch_step + 1) % config.subdivisions == 0:
                        optimizer.step()
                        scheduler.step()
                        model.zero_grad()

                    if global_step % log_interval == 0:
                        # NOTE(review): newer PyTorch releases recommend
                        # scheduler.get_last_lr(); confirm the minimum
                        # supported version before switching.
                        lr = scheduler.get_lr()[0] * config.batch
                        scalars = (loss.item(), loss_xy.item(), loss_wh.item(),
                                   loss_obj.item(), loss_cls.item(), loss_l2.item())
                        outfile.write(
                            " ".join(str(round(value, 3)) for value in scalars + (lr,)) + "\n")
                        logging.info(
                            'Train step_{}: loss : {},loss xy : {},loss wh : {},'
                            'loss obj : {},loss cls : {},loss l2 : {},lr : {}'.format(
                                global_step, *scalars, lr))

                    pbar.update(images.shape[0])

            # Logical `and` (not bitwise `&`) so any truthy save_cp enables
            # checkpointing.
            if save_cp and (epoch + 1) % 10 == 0:
                if not os.path.isdir(config.checkpoints):
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info('Created checkpoint directory')
                torch.save(model.state_dict(),
                           os.path.join(config.checkpoints, f'Yolov4_epoch{epoch + 1}.pth'))
                logging.info(f'Checkpoint {epoch + 1} saved !')