def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    anchors = [int(x) for x in config["yolo"]["anchors"].split(",")]
    anchors = [[[anchors[i], anchors[i + 1]],
                [anchors[i + 2], anchors[i + 3]],
                [anchors[i + 4], anchors[i + 5]]]
               for i in range(0, len(anchors), 6)]
    anchors.reverse()
    config["yolo"]["anchors"] = []
    for i in range(3):
        config["yolo"]["anchors"].append(anchors[i])

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    # lr_scheduler = optim.lr_scheduler.StepLR(
    #     optimizer,
    #     step_size=config["lr"]["decay_step"],
    #     gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLayer(config["batch_size"], i, config["yolo"]["anchors"][i],
                      config["yolo"]["classes"], (config["img_w"], config["img_h"])))

    # DataLoader
    dataloader = torch.utils.data.DataLoader(
        COCODataset(config["train_path"], (config["img_w"], config["img_h"]),
                    is_training=True, is_scene=True),
        batch_size=config["batch_size"],
        shuffle=True, drop_last=True, num_workers=0, pin_memory=True)

    # Start the training loop
    logging.info("Start training.")
    dataload_len = len(dataloader)
    best_acc = 0.5
    for epoch in range(config["epochs"]):
        recall = 0
        mini_step = 0
        for step, samples in enumerate(dataloader):
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config["global_step"] += 1

            # Forward and backward
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls", "recall"]
            losses = [0] * len(losses_name)
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j] += l
            # losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()
            optimizer.step()

            _loss = loss.item()
            # example_per_second = config["batch_size"] / duration
            lr = optimizer.param_groups[0]['lr']
            strftime = datetime.datetime.now().strftime("%H:%M:%S")
            # if (losses[7] / 3 >= recall / (step + 1)):  # taken when mini_batch is 0
            recall += losses[7] / 3
            print('%s [Epoch %d/%d,batch %03d/%d loss:x %.5f,y %.5f,w %.5f,h %.5f,'
                  'conf %.5f,cls %.5f,total %.5f,rec %.3f,avrec %.3f %.3f]' %
                  (strftime, epoch, config["epochs"], step, dataload_len,
                   losses[1], losses[2], losses[3], losses[4], losses[5], losses[6],
                   _loss, losses[7] / 3, recall / (step + 1), lr))

        if recall / len(dataloader) > best_acc:
            best_acc = recall / len(dataloader)
            if epoch > 0:
                torch.save(net.state_dict(),
                           '%s/%.4f_%04d.weights' % (checkpoint_dir, recall / len(dataloader), epoch))
        lr_scheduler.step()
        net.train(is_training)
        torch.cuda.empty_cache()

    # net.train(True)
    logging.info("Bye bye")
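
# A hedged sketch of `_get_optimizer`, which every variant in this file calls but
# none defines. It relies on the same module-level imports as the surrounding code
# (torch.optim as optim). The config keys read below (config["optimizer"]["type"],
# config["optimizer"]["weight_decay"], config["lr"]["base_lr"]) are assumptions
# chosen to match the config style used above, not confirmed by the source.
def _get_optimizer(config, net):
    params = [p for p in net.parameters() if p.requires_grad]
    opt_cfg = config.get("optimizer", {})
    base_lr = config.get("lr", {}).get("base_lr", 1e-3)  # assumed key and default
    if opt_cfg.get("type", "sgd") == "adam":
        return optim.Adam(params, lr=base_lr,
                          weight_decay=opt_cfg.get("weight_decay", 0.0))
    # Default: plain SGD with momentum.
    return optim.SGD(params, lr=base_lr, momentum=0.9,
                     weight_decay=opt_cfg.get("weight_decay", 4e-5))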
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(
            config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # Only export onnx
    # if config.get("export_onnx"):
    #     real_model = net.module
    #     real_model.eval()
    #     dummy_input = torch.randn(8, 3, config["img_h"], config["img_w"]).cuda()
    #     save_path = os.path.join(config["sub_working_dir"], "pytorch.onnx")
    #     logging.info("Exporting onnx to {}".format(save_path))
    #     torch.onnx.export(real_model, dummy_input, save_path, verbose=False)
    #     logging.info("Done. Exiting now.")
    #     sys.exit()

    # Evaluate interface
    # if config["evaluate_type"]:
    #     logging.info("Using {} to evaluate model.".format(config["evaluate_type"]))
    #     evaluate_func = importlib.import_module(config["evaluate_type"]).run_eval
    #     config["online_net"] = net

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLoss(config["yolo"]["anchors"][i], config["yolo"]["classes"],
                     (config["img_w"], config["img_h"])))

    # DataLoader
    dataloader = torch.utils.data.DataLoader(
        COCODataset(config["train_path"], (config["img_w"], config["img_h"]),
                    is_training=True),
        batch_size=config["batch_size"],
        shuffle=True, num_workers=32, pin_memory=True)

    # Start the training loop
    logging.info("Start training.")
    for epoch in range(config["epochs"]):
        for step, samples in enumerate(dataloader):
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config["global_step"] += 1

            # Forward and backward
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
            losses = []
            for _ in range(len(losses_name)):
                losses.append([])
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j].append(l)
            losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()
            optimizer.step()

            if step > 0 and step % 10 == 0:
                _loss = loss.item()
                duration = float(time.time() - start_time)
                example_per_second = config["batch_size"] / duration
                lr = optimizer.param_groups[0]['lr']
                logging.info(
                    "epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f " %
                    (epoch, step, _loss, example_per_second, lr))
                config["tensorboard_writer"].add_scalar("lr", lr, config["global_step"])
                config["tensorboard_writer"].add_scalar("example/sec", example_per_second,
                                                        config["global_step"])
                for i, name in enumerate(losses_name):
                    value = _loss if i == 0 else losses[i]
                    config["tensorboard_writer"].add_scalar(name, value,
                                                            config["global_step"])

            # if step > 0 and step % 1000 == 0:
            #     net.train(False)
            #     _save_checkpoint(net.state_dict(), config)
            #     net.train(True)

        _save_checkpoint(net.state_dict(), config)
        lr_scheduler.step()

    # net.train(False)
    _save_checkpoint(net.state_dict(), config)
    # net.train(True)
    logging.info("Bye~")
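
# A hedged sketch of `_save_checkpoint`, which this variant (and several below) call
# but never define. config["sub_working_dir"] is borrowed from the commented ONNX
# block above; the "model.pth" filename is illustrative. Note that one variant below
# calls _save_checkpoint(state_dict, epoch, step) with a different signature; the
# *args here only absorbs such extras, it does not reconcile the two call styles.
def _save_checkpoint(state_dict, config, *args):
    checkpoint_path = os.path.join(config["sub_working_dir"], "model.pth")
    torch.save(state_dict, checkpoint_path)
    logging.info("Model checkpoint saved to %s" % checkpoint_path)
    return checkpoint_path  # the SatDataset variant below uses this return value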
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    anchors = [int(x) for x in config["yolo"]["anchors"].split(",")]
    anchors = [[[anchors[i], anchors[i + 1]],
                [anchors[i + 2], anchors[i + 3]],
                [anchors[i + 4], anchors[i + 5]]]
               for i in range(0, len(anchors), 6)]
    anchors.reverse()
    config["yolo"]["anchors"] = []
    for i in range(3):
        config["yolo"]["anchors"].append(anchors[i])

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(
            config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # Only export onnx
    # if config.get("export_onnx"):
    #     real_model = net.module
    #     real_model.eval()
    #     dummy_input = torch.randn(8, 3, config["img_h"], config["img_w"]).cuda()
    #     save_path = os.path.join(config["sub_working_dir"], "pytorch.onnx")
    #     logging.info("Exporting onnx to {}".format(save_path))
    #     torch.onnx.export(real_model, dummy_input, save_path, verbose=False)
    #     logging.info("Done. Exiting now.")
    #     sys.exit()

    # Evaluate interface
    # if config["evaluate_type"]:
    #     logging.info("Using {} to evaluate model.".format(config["evaluate_type"]))
    #     evaluate_func = importlib.import_module(config["evaluate_type"]).run_eval
    #     config["online_net"] = net

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLayer(config["batch_size"], i, config["yolo"]["anchors"][i],
                      config["yolo"]["classes"], (config["img_w"], config["img_h"])))

    # DataLoader
    dataloader = torch.utils.data.DataLoader(
        COCODataset(config["train_path"], (config["img_w"], config["img_h"]),
                    is_training=True, is_scene=True),
        batch_size=config["batch_size"],
        shuffle=True, drop_last=True, num_workers=0, pin_memory=True)

    # Start the training loop
    logging.info("Start training.")
    dataload_len = len(dataloader)
    for epoch in range(config["epochs"]):
        recall = 0
        mini_step = 0
        for step, samples in enumerate(dataloader):
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config["global_step"] += 1

            # Re-run the same batch up to 3 times until its recall catches up with
            # the running average.
            for mini_batch in range(3):
                mini_step += 1
                # Forward and backward
                optimizer.zero_grad()
                outputs = net(images)
                losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls", "recall"]
                losses = [0] * len(losses_name)
                for i in range(3):
                    _loss_item = yolo_losses[i](outputs[i], labels)
                    for j, l in enumerate(_loss_item):
                        losses[j] += l
                # losses = [sum(l) for l in losses]
                loss = losses[0]
                loss.backward()
                optimizer.step()

                _loss = loss.item()
                # example_per_second = config["batch_size"] / duration
                # lr = optimizer.param_groups[0]['lr']
                strftime = datetime.datetime.now().strftime("%H:%M:%S")
                if (losses[7] / 3 >= recall / (step + 1)) or mini_batch == (3 - 1):
                    # taken when mini_batch is 0
                    recall += losses[7] / 3
                    print('%s [Epoch %d/%d,batch %03d/%d loss:x %.5f,y %.5f,w %.5f,h %.5f,'
                          'conf %.5f,cls %.5f,total %.5f,rec %.3f,avrec %.3f %d]' %
                          (strftime, epoch, config["epochs"], step, dataload_len,
                           losses[1], losses[2], losses[3], losses[4], losses[5], losses[6],
                           _loss, losses[7] / 3, recall / (step + 1), mini_batch))
                    break
                else:
                    print('%s [Epoch %d/%d,batch %03d/%d loss:x %.5f,y %.5f,w %.5f,h %.5f,'
                          'conf %.5f,cls %.5f,total %.5f,rec %.3f,prerc %.3f %d]' %
                          (strftime, epoch, config["epochs"], step, dataload_len,
                           losses[1], losses[2], losses[3], losses[4], losses[5], losses[6],
                           _loss, losses[7] / 3, recall / step, mini_batch))

            # logging.info("epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f " %
            #              (epoch, step, _loss, example_per_second, lr))
            # config["tensorboard_writer"].add_scalar("lr", lr, config["global_step"])
            # config["tensorboard_writer"].add_scalar("example/sec", example_per_second,
            #                                         config["global_step"])
            # for i, name in enumerate(losses_name):
            #     value = _loss if i == 0 else losses[i]
            #     config["tensorboard_writer"].add_scalar(name, value, config["global_step"])

        if (epoch % 2 == 0 and recall / len(dataloader) > 0.7) or recall / len(dataloader) > 0.96:
            torch.save(net.state_dict(),
                       '%s/%.4f_%04d.weights' % (checkpoint_dir, recall / len(dataloader), epoch))
        lr_scheduler.step()

    # net.train(True)
    logging.info("Bye bye")
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=15)
    # lr_scheduler = optim.lr_scheduler.StepLR(
    #     optimizer,
    #     step_size=config["lr"]["decay_step"],
    #     gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(
            config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # Start the training loop
    # NOTE: this variant uses `dataloader` and `yolo_losses` without constructing
    # them here; they are presumably built at module level as in the variants above.
    logging.info("Start training.")
    dataload_len = len(dataloader)
    epoch_size = 4
    start = time.time()
    pruned_pct = 0
    global index, pruned_book, num_pruned
    global num_weights
    global weight_masks, bias_masks
    for epoch in range(config["epochs"]):
        # Every 4 epochs: re-prune the network and report the pruned fraction.
        if epoch % 4 == 0:
            index = 0
            num_pruned = 0
            num_weights = 0
            net.apply(prune)
            torch.save(net.state_dict(),
                       '%s/%.4f_%04d.weights' % (checkpoint_dir, 0.01, 1))
            print('previously pruned: %.3f %%' % (100 * pruned_pct))
            print('number pruned: %.3f %%' % (100 * (num_pruned / num_weights)))
            new_pruned = num_pruned / num_weights - pruned_pct
            pruned_pct = num_pruned / num_weights
            # if new_pruned <= 0.01:
            #     time_elapse = time.time() - start
            #     print('training time:', str(timedelta(seconds=time_elapse)))
            #     break

        recall = 0
        mini_step = 0
        for step, samples in enumerate(dataloader):
            index = 0
            images, labels = samples["image"], samples["label"]
            start_time = time.time()

            optimizer.zero_grad()
            outputs = net(images)
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls", "recall"]
            losses = [0] * len(losses_name)
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j] += l
            # losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()
            net.apply(set_grad)  # mask gradients so pruned weights stay zero
            optimizer.step()

            _loss = loss.item()
            # example_per_second = config["batch_size"] / duration
            lr = optimizer.param_groups[0]['lr']
            strftime = datetime.datetime.now().strftime("%H:%M:%S")
            recall += losses[7] / 3
            print('%s [Epoch %d/%d,batch %03d/%d loss:x %.5f,y %.5f,w %.5f,h %.5f,'
                  'conf %.5f,cls %.5f,total %.5f,rec %.3f,avrec %.3f %.3f]' %
                  (strftime, epoch, config["epochs"], step, dataload_len,
                   losses[1], losses[2], losses[3], losses[4], losses[5], losses[6],
                   _loss, losses[7] / 3, recall / (step + 1), lr))

        if (epoch % 2 == 0 and recall / len(dataloader) > 0.5) or recall / len(dataloader) > 0:
            # torch.save(net.state_dict(), '%s/%.4f_%04d.weights' % (checkpoint_dir, recall / len(dataloader), epoch))
            torch.save(net.state_dict(),
                       '%s/%.4f_%04d.weights' % (checkpoint_dir, recall / len(dataloader), epoch))
        lr_scheduler.step()

    # net.train(True)
    logging.info("Bye bye")
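
# A hedged sketch of the `prune` and `set_grad` helpers the pruning variant above
# applies via net.apply(...); neither is defined in this file. This is a minimal
# magnitude-pruning interpretation built on the globals that variant declares
# (index, num_pruned, num_weights, weight_masks), assuming weight_masks starts as an
# empty module-level list. The 0.25 * std threshold is illustrative, and pruned_book
# and bias_masks are omitted because their roles are not visible in the source.
def prune(module):
    """Tighten the keep-mask for each Conv2d, zero pruned weights, count them."""
    global index, num_pruned, num_weights
    if isinstance(module, nn.Conv2d):
        weight = module.weight.data
        mask = (weight.abs() > 0.25 * weight.std()).float()  # assumed criterion
        if index < len(weight_masks):
            weight_masks[index] = weight_masks[index] * mask  # progressive: masks only shrink
        else:
            weight_masks.append(mask)                         # first pruning round
        module.weight.data.mul_(weight_masks[index])          # zero out pruned weights
        num_pruned += int((weight_masks[index] == 0).sum())
        num_weights += weight_masks[index].numel()
        index += 1

def set_grad(module):
    """Mask gradients after backward() so pruned weights never receive updates."""
    global index
    if isinstance(module, nn.Conv2d):
        module.weight.grad.data.mul_(weight_masks[index])
        index += 1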
def train():
    global_step = 0
    is_training = True

    # Load and initialize network
    net = ModelMain(is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=lr_decay_step,  # 20
        gamma=lr_decay_gamma)     # 0.1

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()
    logging.info("Net of Cuda is Done!")

    # Restore the pretrained model
    if pretrain_snapshot:
        logging.info(
            "Load pretrained weights from {}".format(pretrain_snapshot))
        state_dic = torch.load(pretrain_snapshot)
        net.load_state_dict(state_dic)

    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLoss(anchors[i], classes, (img_w, img_h)).cuda())
    print('YOLO_Losses: \n', yolo_losses)

    # DataLoader
    train_data_loader = DATA.DataLoader(dataset=COCODataset(train_path, (img_w, img_h),
                                                            is_training=True),
                                        batch_size=batch_size,
                                        shuffle=True,
                                        pin_memory=False)

    # Start the training loop
    logging.info("Start training......")
    for epoch in range(epochs):
        for step, samples in enumerate(train_data_loader):
            images, labels = samples['image'].cuda(), samples["label"].cuda()
            start_time = time.time()
            global_step += 1

            # Forward & backward
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
            # BUG FIX: `[[]] * len(losses_name)` would create seven references to
            # the same list; build independent lists instead.
            losses = [[] for _ in losses_name]  # ---> [[], [], [], [], [], [], []]
            for i in range(3):  # YOLO's 3 scales
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    # j: index (0-6); l: total loss, x, y, w, h, conf, cls
                    losses[j].append(l)
            losses = [sum(l) for l in losses]
            loss = losses[0]   # losses[0] is the total loss
            conf = losses[5]
            loss.backward()
            optimizer.step()

            if step > 0 and step % 10 == 0:
                _loss = loss.item()
                _conf = conf.item()
                duration = float(time.time() - start_time)  # time for this step
                example_per_second = batch_size / duration  # samples per second
                lr = optimizer.param_groups[0]['lr']
                logging.info(
                    "epoch [%.3d] iter = %d loss = %.2f conf = %.2f example/sec = %.3f lr = %.5f " %
                    (epoch, step, _loss, _conf, example_per_second, lr))

            if step >= 0 and step % 1000 == 0:
                # net.train(False)
                _save_checkpoint(net.state_dict(), epoch, step)
                # net.train(True)

        lr_scheduler.step()

    _save_checkpoint(net.state_dict(), 100, 9999)
    logging.info("Bye~")
def train(imgs, labels, checkpoint_path, config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if checkpoint_path:
        logging.info("Load pretrained weights from {}".format(checkpoint_path))
        state_dict = torch.load(checkpoint_path)
        net.load_state_dict(state_dict)

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLoss(config["yolo"]["anchors"][i], config["yolo"]["classes"],
                     (config["img_w"], config["img_h"])))

    # DataLoader
    dataloader = torch.utils.data.DataLoader(
        SatDataset(imgs, labels, (config["img_w"], config["img_h"]), is_training=True),
        batch_size=config["batch_size"],
        shuffle=True, num_workers=1, pin_memory=True)

    # Start the training loop
    logging.info("Start training.")
    for epoch in range(config["epochs"]):
        for step, samples in enumerate(dataloader):
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config["global_step"] += 1

            # Forward and backward
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
            # BUG FIX: `[[]] * len(losses_name)` aliases one list seven times;
            # build independent lists instead.
            losses = [[] for _ in losses_name]
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j].append(l)
            losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()
            optimizer.step()

            if step > 0 and step % 10 == 0:
                _loss = loss.item()
                duration = float(time.time() - start_time)
                example_per_second = config["batch_size"] / duration
                lr = optimizer.param_groups[0]['lr']
                logging.info(
                    "epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f " %
                    (epoch, step, _loss, example_per_second, lr))
                config["tensorboard_writer"].add_scalar("lr", lr, config["global_step"])
                config["tensorboard_writer"].add_scalar("example/sec", example_per_second,
                                                        config["global_step"])
                for i, name in enumerate(losses_name):
                    value = _loss if i == 0 else losses[i]
                    config["tensorboard_writer"].add_scalar(name, value,
                                                            config["global_step"])

        lr_scheduler.step()

    # net.train(False)
    checkpoint_path = _save_checkpoint(net.state_dict(), config)
    # net.train(True)
    logging.info("Bye~")
    return checkpoint_path
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    anchors = [int(x) for x in config["yolo"]["anchors"].split(",")]
    anchors = [[[anchors[i], anchors[i + 1]],
                [anchors[i + 2], anchors[i + 3]],
                [anchors[i + 4], anchors[i + 5]]]
               for i in range(0, len(anchors), 6)]
    anchors.reverse()
    config["yolo"]["anchors"] = []
    for i in range(3):
        config["yolo"]["anchors"].append(anchors[i])

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    t_max = 50
    # lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max, eta_min=1e-05)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(
            config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # Only export onnx
    # if config.get("export_onnx"):
    #     real_model = net.module
    #     real_model.eval()
    #     dummy_input = torch.randn(8, 3, config["img_h"], config["img_w"]).cuda()
    #     save_path = os.path.join(config["sub_working_dir"], "pytorch.onnx")
    #     logging.info("Exporting onnx to {}".format(save_path))
    #     torch.onnx.export(real_model, dummy_input, save_path, verbose=False)
    #     logging.info("Done. Exiting now.")
    #     sys.exit()

    # Evaluate interface
    # if config["evaluate_type"]:
    #     logging.info("Using {} to evaluate model.".format(config["evaluate_type"]))
    #     evaluate_func = importlib.import_module(config["evaluate_type"]).run_eval
    #     config["online_net"] = net

    # YOLO loss with 3 scales (computed inside the network in this variant)

    # DataLoader
    dataloader = torch.utils.data.DataLoader(
        COCODataset(config["train_path"], (config["img_w"], config["img_h"]),
                    is_training=True, is_scene=True),
        batch_size=config["batch_size"] * config["parallels"],
        shuffle=True, drop_last=True, num_workers=0, pin_memory=True)

    # Start the training loop
    logging.info("Start training.")
    dataload_len = len(dataloader)
    best_acc = 0.2
    last_recall = 0.6
    for epoch in range(config["epochs"]):
        recall = 0
        mini_step = 0
        for step, samples in enumerate(dataloader):
            start = time.time()
            images, labels = samples["image"], samples["label"]
            config["global_step"] += 1

            # Forward and backward
            optimizer.zero_grad()
            losses = net(images.cuda(), labels.cuda())
            # current_recall = mAP(detections, labels, config["img_w"])
            # current_recall = np.mean(current_recall)
            if config["parallels"] > 1:
                # NOTE: this reduction hard-codes exactly two devices.
                losses = losses.view(config["parallels"], 8)[0] + \
                         losses.view(config["parallels"], 8)[1]
            loss = losses[0]
            if epoch > 0:
                loss = loss * 20
            current_recall = float(losses[7] / 3 / config["parallels"])
            if last_recall < 0.65:
                loss = loss + 20 * (1 - current_recall)  # * 0.8
            else:
                loss = loss + 20 * (1 - current_recall)
            loss.backward()
            optimizer.step()

            _loss = loss.item()
            # example_per_second = config["batch_size"] / duration
            lr = optimizer.param_groups[0]['lr']
            strftime = datetime.datetime.now().strftime("%H:%M:%S")  # used by the print below
            # if (losses[7] / 3 >= recall / (step + 1)):  # taken when mini_batch is 0
            recall += current_recall
            print('%s [Epoch %d/%d,batch %03d/%d loss:x %.5f,y %.5f,w %.5f,h %.5f,'
                  'conf %.5f,cls %.5f,total %.5f,rec %.3f,avrec %.3f %.3f]' %
                  (strftime, epoch, config["epochs"], step, dataload_len,
                   losses[1], losses[2], losses[3], losses[4], losses[5], losses[6],
                   _loss, current_recall, recall / (step + 1), lr))

        last_recall = recall / len(dataloader)
        if recall / len(dataloader) > best_acc:
            best_acc = recall / len(dataloader)
            torch.save(net.state_dict(),
                       '%s/%.4f_%04d.weights' % (checkpoint_dir, recall / len(dataloader), epoch))
        lr_scheduler.step()
        # if epoch % (lr_scheduler.T_max + next_need) == (lr_scheduler.T_max + next_need - 1):
        #     next_need += float(lr_scheduler.T_max)
        #     lr_scheduler.T_max += 2
        #     lr_scheduler.last_epoch = 0
        #     # lr_scheduler.base_lrs *= 0.98
        #     lr_scheduler.base_lrs[0] *= 0.95
        #     lr_scheduler.base_lrs[1] *= 0.95
        # net.train(is_training)
        # torch.cuda.empty_cache()

    # net.train(True)
    logging.info("Bye bye")
def train(config):
    config["global_step"] = config.get("start_step", 0)
    is_training = False if config.get("export_onnx") else True

    anchors = [int(x) for x in config["yolo"]["anchors"].split(",")]
    anchors = [[[anchors[i], anchors[i + 1]],
                [anchors[i + 2], anchors[i + 3]],
                [anchors[i + 4], anchors[i + 5]]]
               for i in range(0, len(anchors), 6)]
    anchors.reverse()
    config["yolo"]["anchors"] = []
    for i in range(3):
        config["yolo"]["anchors"].append(anchors[i])

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning rate
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(
            config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(
            YOLOLayer(config["batch_size"], i, config["yolo"]["anchors"][i],
                      config["yolo"]["classes"], (config["img_w"], config["img_h"])))

    total_loss = 0
    last_total_loss = 0

    # The parent process creates the Queue and passes it to the worker processes.
    manager = Manager()
    q = manager.Queue(1)
    lock = manager.Lock()  # initialize a lock
    p = Pool()
    pw = p.apply_async(get_data, args=(q, lock))
    batch_len = q.get()
    if batch_len[0] == "len":
        batch_len = batch_len[1]

    logging.info("Start training.")
    for epoch in range(config["epochs"]):
        recall = 0
        for step in range(batch_len):
            samples = q.get()
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config["global_step"] += 1

            # Forward and backward
            optimizer.zero_grad()
            outputs = net(images)
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls", "recall"]
            losses = [0] * len(losses_name)
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j] += l
            # losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()
            optimizer.step()

            if step > 0 and step % 2 == 0:
                _loss = loss.item()
                duration = float(time.time() - start_time)
                example_per_second = config["batch_size"] / duration
                lr = optimizer.param_groups[0]['lr']
                strftime = datetime.datetime.now().strftime("%H:%M:%S")
                recall += losses[7] / 3
                print('%s [Epoch %d/%d, Batch %03d/%d losses: x %.5f, y %.5f, w %.5f, h %.5f, '
                      'conf %.5f, cls %.5f, total %.5f, recall: %.3f]' %
                      (strftime, epoch, config["epochs"], step, batch_len,
                       losses[1], losses[2], losses[3], losses[4], losses[5], losses[6],
                       _loss, losses[7] / 3))
                # logging.info("epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f " %
                #              (epoch, step, _loss, example_per_second, lr))
                # config["tensorboard_writer"].add_scalar("lr", lr, config["global_step"])
                # config["tensorboard_writer"].add_scalar("example/sec", example_per_second,
                #                                         config["global_step"])
                # for i, name in enumerate(losses_name):
                #     value = _loss if i == 0 else losses[i]
                #     config["tensorboard_writer"].add_scalar(name, value, config["global_step"])

        if (epoch % 2 == 0 and recall / batch_len > 0.7) or recall / batch_len > 0.96:
            torch.save(net.state_dict(), '%s/%04d.weights' % (checkpoint_dir, epoch))
        lr_scheduler.step()
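
# A hedged sketch of the `get_data` producer the multiprocessing variant above
# launches via p.apply_async(get_data, args=(q, lock)); it is not defined in this
# file. The body below only satisfies the protocol the consumer expects: first a
# ("len", n) item on the queue, then config["epochs"] * n batches shaped like
# {"image": ..., "label": ...}. Reading `config` as a module-level global and the
# dataset arguments mirror the other variants and are assumptions; `lock` is
# accepted only to match the call site.
def get_data(q, lock):
    dataloader = torch.utils.data.DataLoader(
        COCODataset(config["train_path"], (config["img_w"], config["img_h"]),
                    is_training=True),
        batch_size=config["batch_size"], shuffle=True, drop_last=True)
    q.put(("len", len(dataloader)))   # the consumer reads the batch count first
    for _ in range(config["epochs"]):
        for samples in dataloader:    # each `samples` is a dict with "image"/"label"
            q.put(samples)            # blocks while the size-1 queue is full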
def train(config):
    # Hyper-parameters
    config["global_step"] = config.get("start_step", 0)
    is_training = True

    # Net & Loss & Optimizer
    ## Net main
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    ## YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_loss = YOLOLoss(config["yolo"]["anchors"][i],
                             config["yolo"]["classes"],
                             (config["img_w"], config["img_h"]))
        yolo_losses.append(yolo_loss)

    ## Optimizer and LR scheduler
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                             step_size=config["lr"]["decay_step"],
                                             gamma=config["lr"]["decay_gamma"])

    net = nn.DataParallel(net)
    net = net.cuda()

    # Load checkpoint
    if config["pretrain_snapshot"]:
        logging.info("Load pretrained weights from {}".format(config["pretrain_snapshot"]))
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)

    # DataLoader
    dataloader = torch.utils.data.DataLoader(AIPrimeDataset(config["train_path"]),
                                             batch_size=config["batch_size"],
                                             shuffle=True,
                                             num_workers=16,
                                             pin_memory=False)

    # Start the training
    logging.info("Start training.")
    for epoch in range(config["start_epoch"], config["epochs"]):
        for step, (images, labels) in enumerate(dataloader):
            start_time = time.time()
            config["global_step"] += 1

            # Forward
            outputs = net(images)

            # Loss
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
            # BUG FIX: `[[]] * len(losses_name)` would alias one list seven times;
            # build independent lists instead.
            losses = [[] for _ in losses_name]
            for i in range(3):
                _loss_item = yolo_losses[i](outputs[i], labels)
                for j, l in enumerate(_loss_item):
                    losses[j].append(l)
            losses = [sum(l) for l in losses]
            loss = losses[0]

            # Zero & backward & step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Logging
            if step > 0 and step % 10 == 0:
                _loss = loss.item()
                duration = float(time.time() - start_time)
                example_per_second = config["batch_size"] / duration
                lr = optimizer.param_groups[0]['lr']
                logging.info(
                    "epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f " %
                    (epoch, step, _loss, example_per_second, lr))

        # Things to be done for every epoch
        ## LR schedule
        lr_scheduler.step()
        ## Save checkpoint
        _save_checkpoint(net.state_dict(), config, epoch)

    # Finish training
    logging.info("QiaJiaBa~ BeiBei")