val_dataloader  = PSPnetDataset(val_lines, input_shape, batch_size, num_classes, aux_branch, False, VOCdevkit_path)

#-------------------------------------------------------------------------------#
#   Training callback settings
#   logging         sets the TensorBoard save directory
#   checkpoint      controls how weights are saved; period sets how many epochs
#                   pass between saves
#   lr_scheduler    sets how the learning rate decays
#   early_stopping  sets up early stopping: training ends automatically when
#                   val_loss stops improving, i.e. the model has converged
#-------------------------------------------------------------------------------#
time_str        = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d_%H_%M_%S')
log_dir         = os.path.join(save_dir, "loss_" + str(time_str))
logging         = TensorBoard(log_dir)
loss_history    = LossHistory(log_dir)
checkpoint      = ModelCheckpoint(
    os.path.join(save_dir, "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5"),
    monitor='val_loss', save_weights_only=True, save_best_only=False, period=save_period)
early_stopping  = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)
lr_scheduler    = LearningRateScheduler(lr_scheduler_func, verbose=1)
callbacks       = [logging, loss_history, checkpoint, lr_scheduler]

if start_epoch < end_epoch:
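
#-------------------------------------------------------------------------------#
#   For reference, a minimal sketch of a function that LearningRateScheduler
#   can consume (illustrative only -- the repo builds its own
#   lr_scheduler_func): a cosine anneal from init_lr down to min_lr over
#   total_epochs.
#-------------------------------------------------------------------------------#
import math

def make_cosine_lr_func(init_lr, min_lr, total_epochs):
    def lr_scheduler_func(epoch):
        #   Cosine factor goes from 1 to 0 as epoch goes from 0 to total_epochs.
        cos = 0.5 * (1 + math.cos(math.pi * epoch / max(1, total_epochs)))
        return min_lr + (init_lr - min_lr) * cos
    return lr_scheduler_func

# e.g.: lr_scheduler = LearningRateScheduler(make_cosine_lr_func(1e-4, 1e-6, end_epoch), verbose=1)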
#-------------------------------------------------------------------------------#
#   Training callback settings
#   logging         sets the TensorBoard save directory
#   checkpoint      controls how weights are saved; period sets how many epochs
#                   pass between saves
#   reduce_lr       sets how the learning rate decays
#   early_stopping  sets up early stopping: training ends automatically when
#                   val_loss stops improving, i.e. the model has converged
#-------------------------------------------------------------------------------#
logging         = TensorBoard(log_dir='logs/')
checkpoint      = ModelCheckpoint('logs/ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
                    monitor='val_loss', save_weights_only=True, save_best_only=False, period=1)
if Cosine_scheduler:
    reduce_lr   = WarmUpCosineDecayScheduler(T_max=5, eta_min=1e-5, verbose=1)
else:
    reduce_lr   = ExponentDecayScheduler(decay_rate=0.94, verbose=1)
early_stopping  = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)
loss_history    = LossHistory('logs/')

#---------------------------#
#   Read the dataset txt files
#---------------------------#
with open(train_annotation_path) as f:
    train_lines = f.readlines()
with open(val_annotation_path) as f:
    val_lines = f.readlines()
num_train   = len(train_lines)
num_val     = len(val_lines)

if Freeze_Train:
    freeze_layers = 249
    for i in range(freeze_layers):
        model_body.layers[i].trainable = False
    print('Freeze the first {} layers of total {} layers.'.format(freeze_layers, len(model_body.layers)))
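
#-------------------------------------------------------------------------------#
#   The unfreeze phase is the mirror image of the loop above (sketch, assuming
#   the same freeze_layers value; `loss`, `Adam`, and `Unfreeze_lr` stand in
#   for whatever the surrounding script actually defines): re-enable the
#   layers, then recompile so Keras picks up the new trainable state.
#-------------------------------------------------------------------------------#
for i in range(freeze_layers):
    model_body.layers[i].trainable = True
model.compile(loss=loss, optimizer=Adam(lr=Unfreeze_lr))   # names are illustrative
print('Unfreeze all of the layers.')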
pretrained_dict = {
    k: v for k, v in pretrained_dict.items()
    if k in model_dict and np.shape(model_dict[k]) == np.shape(v)
}
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)

model_train = model.train()
if Cuda:
    model_train = torch.nn.DataParallel(model)
    cudnn.benchmark = True
    model_train = model_train.cuda()

yolo_loss       = YOLOLoss(anchors, num_classes, input_shape, Cuda, anchors_mask, label_smoothing)
loss_history    = LossHistory("logs/")

#---------------------------#
#   Read the dataset txt files
#---------------------------#
with open(train_annotation_path) as f:
    train_lines = f.readlines()
with open(val_annotation_path) as f:
    val_lines = f.readlines()
num_train   = len(train_lines)
num_val     = len(val_lines)

#------------------------------------------------------#
#   Backbone features are generic across tasks, so freezing
#   the backbone speeds up training and also keeps the
#   pretrained weights from being destroyed early on.
#   Init_Epoch is the starting epoch
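
#------------------------------------------------------#
#   A minimal sketch of that freeze step in PyTorch
#   (assuming the model exposes its backbone as
#   model.backbone; the attribute name is illustrative):
#------------------------------------------------------#
if Freeze_Train:
    for param in model.backbone.parameters():
        param.requires_grad = False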
#   checkpoint      controls how weights are saved; period sets how many epochs
#                   pass between saves
#   reduce_lr       sets how the learning rate decays
#   early_stopping  sets up early stopping: training ends automatically when the
#                   monitored loss stops improving, i.e. the model has converged
#-------------------------------------------------------------------------------#
logging         = TensorBoard(log_dir='logs/')
checkpoint      = ModelCheckpoint('logs/ep{epoch:03d}-loss{loss:.3f}.h5',
                    monitor='loss', save_weights_only=True, save_best_only=False, period=1)
reduce_lr       = ExponentDecayScheduler(decay_rate=0.96, verbose=1)
early_stopping  = EarlyStopping(monitor='loss', min_delta=0, patience=10, verbose=1)
loss_history    = LossHistory('logs/', val_loss_flag=False)

if focal_loss:
    if dice_loss:
        loss = dice_loss_with_Focal_Loss(cls_weights)
    else:
        loss = Focal_Loss(cls_weights)
else:
    if dice_loss:
        loss = dice_loss_with_CE(cls_weights)
    else:
        loss = CE(cls_weights)

#------------------------------------#
#   Freeze part of the network for training
#------------------------------------#
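
#------------------------------------#
#   For orientation, Focal_Loss above returns a pixel-wise
#   loss function. A minimal sketch of the idea (illustrative
#   only, not necessarily the repo's implementation, and
#   ignoring cls_weights for brevity):
#------------------------------------#
import tensorflow.keras.backend as K

def focal_loss_sketch(alpha=0.25, gamma=2.0):
    def _focal_loss(y_true, y_pred):
        y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())
        #   Down-weight well-classified pixels by the (1 - p)^gamma factor.
        loss = -alpha * K.pow(1.0 - y_pred, gamma) * y_true * K.log(y_pred)
        return K.mean(K.sum(loss, axis=-1))
    return _focal_loss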
if not pretrained:
    weights_init(model)
if model_path != '':
    #------------------------------------------------------#
    #   See the README for the weight files (download from Baidu Netdisk)
    #------------------------------------------------------#
    print('Load weights {}.'.format(model_path))
    device          = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_dict      = model.state_dict()
    pretrained_dict = torch.load(model_path, map_location=device)
    pretrained_dict = {k: v for k, v in pretrained_dict.items()
                       if k in model_dict and np.shape(model_dict[k]) == np.shape(v)}
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)

yolo_loss       = YOLOLoss(anchors, num_classes, input_shape, Cuda, anchors_mask, label_smoothing)
loss_history    = LossHistory("logs/", model, input_shape=input_shape)

model_train = model.train()
if Cuda:
    model_train = torch.nn.DataParallel(model)
    cudnn.benchmark = True
    model_train = model_train.cuda()

#---------------------------#
#   Read the dataset txt files
#---------------------------#
with open(train_annotation_path, encoding='utf-8') as f:
    train_lines = f.readlines()
with open(val_annotation_path, encoding='utf-8') as f:
    val_lines = f.readlines()
num_train   = len(train_lines)
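
#------------------------------------------------------#
#   weights_init above comes from the repo's utils; a
#   typical implementation looks roughly like this
#   (sketch, assuming normal init for conv weights and
#   the usual constant init for BatchNorm):
#------------------------------------------------------#
import torch.nn as nn

def weights_init_sketch(net, init_gain=0.02):
    def init_func(m):
        classname = m.__class__.__name__
        if hasattr(m, 'weight') and classname.find('Conv') != -1:
            nn.init.normal_(m.weight.data, 0.0, init_gain)
        elif classname.find('BatchNorm2d') != -1:
            nn.init.normal_(m.weight.data, 1.0, init_gain)
            nn.init.constant_(m.bias.data, 0.0)
    net.apply(init_func)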
if local_rank == 0:
    print("\nSuccessful Load Key:", str(load_key)[:500], "……\nSuccessful Load Key Num:", len(load_key))
    print("\nFail To Load Key:", str(no_load_key)[:500], "……\nFail To Load Key num:", len(no_load_key))
    print("\n\033[1;33;44mNote: it is normal for the head weights not to load; it is an error if the backbone weights do not load.\033[0m")

#----------------------#
#   Build the loss function
#----------------------#
criterion       = MultiboxLoss(num_classes, neg_pos_ratio=3.0)

#----------------------#
#   Record the loss
#----------------------#
if local_rank == 0:
    time_str        = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d_%H_%M_%S')
    log_dir         = os.path.join(save_dir, "loss_" + str(time_str))
    loss_history    = LossHistory(log_dir, model, input_shape=input_shape)
else:
    loss_history    = None

#------------------------------------------------------------------#
#   torch 1.2 does not support amp; use torch 1.7.1 or later to run
#   fp16 correctly, which is why torch 1.2 reports "could not be
#   resolved" on this import.
#------------------------------------------------------------------#
if fp16:
    from torch.cuda.amp import GradScaler as GradScaler
    scaler = GradScaler()
else:
    scaler = None

model_train = model.train()
#----------------------------#
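
#------------------------------------------------------------------#
#   The scaler then wraps the forward/backward pass roughly like
#   this (sketch of the standard torch.cuda.amp pattern; the
#   criterion call signature here is illustrative):
#------------------------------------------------------------------#
from torch.cuda.amp import autocast

def fp16_train_step(model_train, criterion, optimizer, images, targets, scaler):
    optimizer.zero_grad()
    with autocast():
        outputs = model_train(images)
        loss    = criterion(outputs, targets)   # illustrative signature
    #   Scale the loss to avoid fp16 gradient underflow, then unscale on step.
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    return loss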
pretrained_dict = torch.load(model_path, map_location=device)
pretrained_dict = {
    k: v for k, v in pretrained_dict.items()
    if k in model_dict and np.shape(model_dict[k]) == np.shape(v)
}
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)

model_train = model.train()
if Cuda:
    model_train = torch.nn.DataParallel(model)
    cudnn.benchmark = True
    model_train = model_train.cuda()

loss_history = LossHistory(os.path.join("logs", backbone), model_train, input_shape)

#----------------------------------------------------#
#   The train/validation split is done here in train.py
#----------------------------------------------------#
with open(annotation_path, "r") as f:
    lines = f.readlines()
np.random.seed(10101)
np.random.shuffle(lines)
np.random.seed(None)
num_val     = int(len(lines) * val_split)
num_train   = len(lines) - num_val

#------------------------------------------------------#
#   Training runs in two phases: a frozen phase and an
#   unfrozen phase.
#   Running out of GPU memory has nothing to do with dataset
#   size; if you hit an out-of-memory error, reduce batch_size.
#   Because of the BatchNorm layers, the minimum batch_size is 1.
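
#------------------------------------------------------#
#   In the frozen phase, a common companion pattern is to
#   build the optimizer only over the parameters that are
#   still trainable, and rebuild it when unfreezing
#   (sketch; the lr value is illustrative):
#------------------------------------------------------#
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)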
def fit_model(model, Lr, Batch_size, Init_Epoch, run_Epoch, warmup_proportion=0.1, min_scale=1e-2, max_objects=100):
    #-------------------------------------------------------------------------------#
    #   Training callback settings
    #   logging         sets the TensorBoard save directory
    #   loss_history    records the loss curves
    #   checkpoint      controls how weights are saved; period sets how many epochs
    #                   pass between saves
    #-------------------------------------------------------------------------------#
    logs            = path + '/' + datetime.now().strftime("%Y%m%d-%H%M%S")
    logging         = TensorBoard(log_dir=logs, profile_batch=(2, 5))
    loss_history    = LossHistory(logs)
    checkpoint      = ModelCheckpoint(
        path + '/ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
        monitor='val_loss', save_weights_only=True, save_best_only=False, period=1)
    Epoch = Init_Epoch + run_Epoch

    train_dataloader = OneNetDatasets(lines[:num_train], input_shape, Batch_size, num_classes,
                                      train=True, max_objects=max_objects)
    val_dataloader   = OneNetDatasets(lines[num_train:], input_shape, Batch_size, num_classes,
                                      train=False, max_objects=max_objects)

    print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, Batch_size))
    # gen = Generator(Batch_size, lines[:num_train], lines[num_train:], input_shape, num_classes, max_objects=max_objects)

    optimizer = tfa.optimizers.RectifiedAdam(
        learning_rate=Lr,
        total_steps=num_train // Batch_size * (Epoch - Init_Epoch),
        warmup_proportion=warmup_proportion,
        weight_decay=1e-4,
        min_lr=Lr * min_scale)

    loss_list = {
        'cls':  lambda y_true, y_pred: y_pred,
        'loc':  lambda y_true, y_pred: y_pred,
        'giou': lambda y_true, y_pred: y_pred
    }
    loss_weights = [2, 5, 2]
    model.compile(loss=loss_list, loss_weights=loss_weights, optimizer=optimizer)

    histogram = model.fit(train_dataloader,
                          steps_per_epoch=num_train // Batch_size,
                          validation_data=val_dataloader,
                          validation_steps=num_val // Batch_size,
                          epochs=Epoch,
                          verbose=1,
                          initial_epoch=Init_Epoch,
                          callbacks=[logging, checkpoint, loss_history])
    return histogram
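
#-------------------------------------------------------------------------------#
#   The lambda losses work because each named model output ('cls', 'loc',
#   'giou') is already a computed loss tensor, so Keras only passes it through
#   and applies loss_weights. Usage sketch (hyperparameter values are
#   illustrative): a frozen phase followed by an unfrozen run.
#-------------------------------------------------------------------------------#
history_freeze   = fit_model(model, Lr=1e-3, Batch_size=16, Init_Epoch=0,  run_Epoch=50)
history_unfreeze = fit_model(model, Lr=1e-4, Batch_size=8,  Init_Epoch=50, run_Epoch=50)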