def main():
    """Train the detector built by ``create_model`` on the VOC data set.

    The function takes no arguments; every setting is hard-coded below.

    Steps:
        1. Select the last CUDA device when CUDA is available, else the CPU.
        2. Build the training / validation data loaders from ``voc_root``.
        3. Freeze the backbone children named "0".."3" and train everything
           else for 20 epochs with SGD and a ``StepLR`` schedule.
        4. Evaluate after every epoch, checkpoint epochs greater than 10 under
           ``./save_weights`` and finally plot the loss / lr / mAP curves.

    Raises:
        FileNotFoundError: if ``VOCdevkit`` is missing below ``voc_root``.
    """
    gpu_count = torch.cuda.device_count()
    # Use the last GPU by default, as it is the one most likely to be idle.
    device = torch.device(
        "cuda:{}".format(gpu_count - 1) if torch.cuda.is_available() else "cpu")
    if device.type == "cuda":
        print("Using cuda:{0}/{1} device training.".format(
            gpu_count - 1, list(range(gpu_count))))
    else:
        # Without CUDA the original banner would have reported "cuda:-1/[]".
        print("Using {} device training.".format(device.type))

    # Make sure the checkpoint directory exists (safe if it already does).
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    voc_root = "/home/shaolun/PYTHON/object-detection/faster-rcnn.pytorch/data/"
    # check voc root
    if os.path.exists(os.path.join(voc_root, "VOCdevkit")) is False:
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(voc_root))

    # Training split, selected through "train.txt".
    train_data_set = VOC2007DataSet(voc_root, data_transform["train"],
                                    "train.txt")

    batch_size = 8
    # os.cpu_count() may return None when the count cannot be determined.
    nw = min(os.cpu_count() or 1,
             batch_size if batch_size > 1 else 0,
             8)  # number of workers
    print('Using %g dataloader workers' % nw)

    # A custom collate_fn is required: every sample holds an image and its
    # targets, which the default collate cannot stack into a batch.
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    # Validation split, selected through "val.txt".
    val_data_set = VOC2007DataSet(voc_root, data_transform["val"], "val.txt")
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=nw,
        collate_fn=val_data_set.collate_fn)

    # create model: num_classes equals background + 20 classes
    model = create_model(num_classes=21)
    model.to(device)

    train_loss = []
    learning_rate = []
    val_mAP = []

    # NOTE: the optional first stage (freeze the whole backbone and train only
    # the remaining parts for 5 epochs) was disabled - its training loop was
    # commented out - so its freeze / optimizer set-up had no effect and has
    # been dropped. Only the fine-tuning stage below is executed.

    # Freeze the lowest backbone children ("0".."3") and train all the others.
    frozen_children = ["0", "1", "2", "3"]
    for name, parameter in model.backbone.named_parameters():
        parameter.requires_grad = name.split(".")[0] not in frozen_children

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.33)
    num_epochs = 20
    for epoch in range(num_epochs):
        # train for one epoch, printing every 50 iterations
        utils.train_one_epoch(model, optimizer, train_data_loader,
                              device, epoch, print_freq=50,
                              train_loss=train_loss, train_lr=learning_rate)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the validation data set
        utils.evaluate(model, val_data_set_loader,
                       device=device, mAP_list=val_mAP)

        # save weights (only for the later epochs)
        if epoch > 10:
            save_files = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch
            }
            torch.save(save_files,
                       "./save_weights/mobile-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_mAP:
        from plot_curve import plot_map
        plot_map(val_mAP)
def main(parser_data):
    """Train the SSD model on the night data set.

    Args:
        parser_data: parsed command-line options. The attributes read here are
            ``device``, ``data_path``, ``resume``, ``start_epoch`` and
            ``epochs``. ``start_epoch`` is overwritten when resuming.

    Side effects:
        Creates ``save_weights/`` and writes one ``ssd512-<epoch>.pth``
        checkpoint per epoch; plots the loss / lr / mAP curves at the end.
    """
    device = torch.device(
        parser_data.device if torch.cuda.is_available() else "cpu")
    print(device)

    # Make sure the checkpoint directory exists (safe if it already does).
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transform.Compose([transform.SSDCropping(),
                                    transform.Resize(),
                                    transform.ColorJitter(),
                                    transform.ToTensor(),
                                    transform.RandomHorizontalFlip(),
                                    transform.Normalization(),
                                    transform.AssignGTtoDefaultBox()]),
        "val": transform.Compose([transform.Resize(),
                                  transform.ToTensor(),
                                  transform.Normalization()])
    }

    night_root = parser_data.data_path
    train_dataset = NightDataSet(night_root, data_transform['train'],
                                 train_set='train.txt')
    # Note: the batch size must be greater than 1 when training.
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=8,
        shuffle=True,
        num_workers=4,
        collate_fn=utils.collate_fn)

    val_dataset = NightDataSet(night_root, data_transform['val'],
                               train_set='val.txt')
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=4,
        shuffle=False,
        num_workers=0,
        collate_fn=utils.collate_fn)

    model = create_model(num_classes=3, device=device)
    print(model)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.5)

    # If a checkpoint from a previous run is given, continue from it.
    if parser_data.resume != "":
        # map_location lets a checkpoint saved on another device be restored
        # onto the device selected above.
        checkpoint = torch.load(parser_data.resume, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(
            parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []

    val_data = None
    # With plenty of RAM the validation data can be loaded once up front, so
    # it is not rebuilt on every evaluation:
    # val_data = get_coco_api_from_dataset(val_data_loader.dataset)
    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        utils.train_one_epoch(model=model, optimizer=optimizer,
                              data_loader=train_data_loader,
                              device=device, epoch=epoch,
                              print_freq=50, train_loss=train_loss,
                              train_lr=learning_rate)

        lr_scheduler.step()

        # Evaluation only runs at epoch 10 and from epoch 20 onwards.
        if epoch >= 20 or epoch == 10:
            utils.evaluate(model=model, data_loader=val_data_loader,
                           device=device, data_set=val_data,
                           mAP_list=val_map)

        # save weights
        # NOTE(review): assumed to run every epoch (outside the branch above)
        # so that any epoch can be resumed - confirm against the intended
        # layout.
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch
        }
        torch.save(save_files, "./save_weights/ssd512-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(parser_data):
    """Train the SSD model on the Pascal VOC data set.

    Args:
        parser_data: parsed command-line options. The attributes read here are
            ``device``, ``data_path``, ``batch_size``, ``resume``,
            ``start_epoch`` and ``epochs``. ``start_epoch`` is overwritten
            when resuming.

    Raises:
        FileNotFoundError: if ``VOCdevkit`` is missing below ``data_path``.
        AssertionError: if ``batch_size`` is not greater than 1.

    Side effects:
        Creates ``save_weights/`` and writes one ``ssd300-<epoch>.pth``
        checkpoint per epoch; plots the loss / lr / mAP curves at the end.
    """
    device = torch.device(
        parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # Make sure the checkpoint directory exists (safe if it already does).
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transform.Compose([transform.SSDCropping(),
                                    transform.Resize(),
                                    transform.ColorJitter(),
                                    transform.ToTensor(),
                                    transform.RandomHorizontalFlip(),
                                    transform.Normalization(),
                                    transform.AssignGTtoDefaultBox()]),
        "val": transform.Compose([transform.Resize(),
                                  transform.ToTensor(),
                                  transform.Normalization()])
    }

    voc_root = parser_data.data_path
    # check voc root
    if os.path.exists(os.path.join(voc_root, "VOCdevkit")) is False:
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(voc_root))

    train_dataset = VOC2012DataSet(voc_root, data_transform['train'],
                                   train_set='train.txt')
    # Note: the batch size must be greater than 1 when training.
    batch_size = parser_data.batch_size
    assert batch_size > 1, "batch size must be greater than 1"
    # os.cpu_count() may return None when the count cannot be determined.
    nw = min(os.cpu_count() or 1,
             batch_size if batch_size > 1 else 0,
             8)  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=nw,
        collate_fn=train_dataset.collate_fn)

    val_dataset = VOC2012DataSet(voc_root, data_transform['val'],
                                 train_set='val.txt')
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=nw,
        collate_fn=val_dataset.collate_fn)

    model = create_model(num_classes=21, device=device)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.0005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.3)

    # If a checkpoint from a previous run is given, continue from it.
    if parser_data.resume != "":
        # map_location lets a checkpoint saved on another device be restored
        # onto the device selected above.
        checkpoint = torch.load(parser_data.resume, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(
            parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []

    val_data = None
    # With plenty of RAM the validation data can be loaded once up front, so
    # it is not rebuilt on every evaluation:
    # val_data = get_coco_api_from_dataset(val_data_loader.dataset)
    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        utils.train_one_epoch(model=model, optimizer=optimizer,
                              data_loader=train_data_loader,
                              device=device, epoch=epoch,
                              print_freq=50, train_loss=train_loss,
                              train_lr=learning_rate)

        lr_scheduler.step()

        utils.evaluate(model=model, data_loader=val_data_loader,
                       device=device, data_set=val_data, mAP_list=val_map)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch
        }
        torch.save(save_files, "./save_weights/ssd300-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(parser_data):
    """Train the detector built by ``create_model`` on Pascal VOC.

    Args:
        parser_data: parsed command-line options. The attributes read here are
            ``device``, ``data_path``, ``batch_size``, ``resume``,
            ``start_epoch`` and ``epochs``. ``start_epoch`` is overwritten
            when resuming.

    Raises:
        FileNotFoundError: if ``VOCdevkit`` is missing below ``data_path``.

    Side effects:
        Appends one line per epoch to a time-stamped ``results*.txt`` file,
        writes ``save_weights/resNetFpn-model-<epoch>.pth`` after every epoch
        and plots the loss / lr / mAP curves at the end.
    """
    device = torch.device(
        parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # File that collects the per-epoch evaluation numbers.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # The checkpoints below are written here; without the directory the first
    # torch.save call would fail.
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    voc_root = parser_data.data_path
    # check voc root
    if os.path.exists(os.path.join(voc_root, "VOCdevkit")) is False:
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(voc_root))

    # Training split, selected through "train.txt".
    train_data_set = VOC2012DataSet(voc_root, data_transform["train"],
                                    "train.txt")

    batch_size = parser_data.batch_size
    # os.cpu_count() may return None when the count cannot be determined.
    nw = min(os.cpu_count() or 1,
             batch_size if batch_size > 1 else 0,
             8)  # number of workers
    print('Using %g dataloader workers' % nw)

    # A custom collate_fn is required: every sample holds an image and its
    # targets, which the default collate cannot stack into a batch.
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    # Validation split, selected through "val.txt".
    val_data_set = VOC2012DataSet(voc_root, data_transform["val"], "val.txt")
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=nw,
        collate_fn=val_data_set.collate_fn)

    # create model: num_classes equals background + 20 classes
    model = create_model(num_classes=21, device=device)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.33)

    # If a checkpoint from a previous run is given, continue from it.
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(
            parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model, optimizer,
                                              train_data_loader,
                                              device=device, epoch=epoch,
                                              print_freq=50, warmup=True)
        mean_loss_value = mean_loss.item()
        train_loss.append(mean_loss_value)
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the validation data set
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        # write into txt
        with open(results_file, "a") as f:
            # evaluation numbers followed by the mean loss and learning rate
            result_info = [str(round(i, 4))
                           for i in coco_info + [mean_loss_value, lr]]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch
        }
        torch.save(save_files,
                   "./save_weights/resNetFpn-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(opt, hyp):
    """Train YOLOv3-SPP with DistributedDataParallel (one process per GPU).

    Args:
        opt: parsed command-line options. Attributes read here include
            ``rank``, ``world_size``, ``gpu``, ``device``, ``name``, ``cfg``,
            ``data``, ``epochs``, ``batch_size``, ``weights``, ``img_size``,
            ``multi_scale``, ``single_cls``, ``freeze_layers``, ``rect``,
            ``cache_images``, ``notest`` and ``savebest``.
        hyp: dict of hyper-parameters. It is modified in place: ``lr0``,
            ``cls`` and ``obj`` are rescaled for this run.

    Raises:
        EnvironmentError: if ``opt.device`` is not a CUDA device.
        AssertionError: if ``opt.img_size`` is not a multiple of 32.
        KeyError: if the given weights do not match the model built from
            ``opt.cfg``.

    Side effects:
        The main process (rank -1 or 0) writes ``results.txt``, TensorBoard
        scalars and checkpoints below ``weights/``. A pickled copy of the
        validation annotations is cached in ``tmp.pk``.
    """
    # Initialise the process group for this process.
    init_distributed_mode(opt)

    # Only this process prints, logs and saves.
    is_main_process = opt.rank in [-1, 0]

    tb_writer = None
    if is_main_process:
        print(opt)
        print(
            'Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/'
        )
        tb_writer = SummaryWriter(comment=opt.name)

    device = torch.device(opt.device)
    if "cuda" not in device.type:
        raise EnvironmentError("not find GPU device for training.")

    # DDP averages the gradients over all devices, so the learning rate is
    # scaled up with the effective batch size.
    hyp["lr0"] *= max(1., opt.world_size * opt.batch_size / 64)

    wdir = "weights" + os.sep  # weights dir
    best = wdir + "best.pt"
    results_file = "results.txt"

    cfg = opt.cfg
    data = opt.data
    epochs = opt.epochs
    batch_size = opt.batch_size
    # accumulate n times before optimizer update (bs 64)
    accumulate = max(round(64 / (opt.world_size * opt.batch_size)), 1)
    weights = opt.weights  # initial training weights
    imgsz_train = opt.img_size
    imgsz_test = opt.img_size  # test image sizes
    multi_scale = opt.multi_scale

    # Image sizes: must be a multiple of 32.
    gs = 32  # (pixels) grid size
    assert math.fmod(
        imgsz_test,
        gs) == 0, "--img-size %g must be a %g-multiple" % (imgsz_test, gs)
    grid_min, grid_max = imgsz_test // gs, imgsz_test // gs
    if multi_scale:
        imgsz_min = opt.img_size // 1.5
        imgsz_max = opt.img_size // 0.667

        # Round the smallest / largest input size down to a multiple of 32.
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs
        imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs)
        imgsz_train = imgsz_max  # initialize with max size
        if is_main_process:
            print("Using multi_scale training, image range[{}, {}]".format(
                imgsz_min, imgsz_max))

    # configure run
    random.seed(0)  # fix the random seed
    data_dict = parse_data_cfg(data)
    train_path = data_dict["train"]
    test_path = data_dict["valid"]
    nc = 1 if opt.single_cls else int(
        data_dict["classes"])  # number of classes
    hyp["cls"] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset
    hyp["obj"] *= imgsz_test / 320

    if is_main_process:
        # Checkpoints are written below wdir; make sure it exists.
        os.makedirs(wdir, exist_ok=True)
        # Remove previous results
        for f in glob.glob(results_file) + glob.glob("tmp.pk"):
            os.remove(f)

    # Initialize model
    model = Darknet(cfg).to(device)

    start_epoch = 0
    best_map = 0.0
    # Load the pre-trained weights when a ".pt" file is given.
    if weights.endswith(".pt"):
        ckpt = torch.load(weights, map_location=device)

        # load model: keep only the tensors whose size matches this model
        try:
            model_state = model.state_dict()  # built once, not once per key
            ckpt["model"] = {
                k: v
                for k, v in ckpt["model"].items()
                if model_state[k].numel() == v.numel()
            }
            model.load_state_dict(ckpt["model"], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        if is_main_process:
            # load results
            if ckpt.get("training_results") is not None:
                with open(results_file, "w") as file:
                    file.write(ckpt["training_results"])  # write results.txt

        # epochs
        start_epoch = ckpt["epoch"] + 1
        if epochs < start_epoch:
            print(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (opt.weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    if opt.freeze_layers:
        # Train the predictors only.
        # The module right before each YOLOLayer is its predictor; the
        # YOLOLayer itself is not the predictor.
        output_layer_indices = [
            idx - 1 for idx, module in enumerate(model.module_list)
            if isinstance(module, YOLOLayer)
        ]
        # Freeze every layer except the predictors and the YOLOLayers
        # (per the original note this leaves 3x2=6 trainable parameters).
        freeze_layer_indices = [
            x for x in range(len(model.module_list))
            if (x not in output_layer_indices) and (
                x - 1 not in output_layer_indices)
        ]
        for idx in freeze_layer_indices:
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)
    else:
        # Default when freeze_layers is off: train only what comes after the
        # darknet53 backbone. Remove this branch to train all weights.
        darknet_end_layer = 74  # only yolov3spp cfg
        # Freeze darknet53 layers
        # (per the original note this leaves 21x3+3x2=69 trainable parameters)
        for idx in range(darknet_end_layer + 1):  # [0, 74]
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)

    # SyncBatchNorm: skipped when only the predictors are trained (the
    # original note states they contain no bn layers).
    if opt.freeze_layers is False:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[opt.gpu])
    model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # optimizer
    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg,
                          lr=hyp["lr0"],
                          momentum=hyp["momentum"],
                          weight_decay=hyp["weight_decay"],
                          nesterov=True)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    def cosine_lr_factor(x):
        """Cosine decay of the lr multiplier from 1 down to hyp["lrf"]."""
        return ((1 + math.cos(x * math.pi / epochs)) / 2) * (
            1 - hyp["lrf"]) + hyp["lrf"]

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=cosine_lr_factor)
    scheduler.last_epoch = start_epoch  # epoch to start from

    # dataset
    # Make sure only the first process in DDP process the dataset first, and
    # the following others can use the cache.
    with torch_distributed_zero_first(opt.rank):
        # The training set is built with imgsz_train (the largest size when
        # multi-scale training is enabled).
        train_dataset = LoadImagesAndLabels(
            train_path,
            imgsz_train,
            batch_size,
            augment=True,
            hyp=hyp,  # augmentation hyperparameters
            rect=opt.rect,  # rectangular training
            cache_images=opt.cache_images,
            single_cls=opt.single_cls,
            rank=opt.rank)
        # The validation set always uses the plain img_size.
        val_dataset = LoadImagesAndLabels(test_path,
                                          imgsz_test,
                                          batch_size,
                                          hyp=hyp,
                                          cache_images=opt.cache_images,
                                          single_cls=opt.single_cls,
                                          rank=opt.rank)

    # Give every rank its own share of the sample indices.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset)
    val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
    # Group the sampled indices into lists of batch_size elements.
    train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                        batch_size,
                                                        drop_last=True)

    # dataloader
    # os.cpu_count() may return None when the count cannot be determined.
    nw = min(os.cpu_count() or 1,
             batch_size if batch_size > 1 else 0,
             8)  # number of workers
    if is_main_process:
        print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_sampler=train_batch_sampler,
        num_workers=nw,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn)
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        sampler=val_sampler,
        num_workers=nw,
        pin_memory=True,
        collate_fn=val_dataset.collate_fn)

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)

    # start training
    # caching val_data when you have plenty of memory(RAM); the first process
    # builds and pickles it, the others read the pickle it wrote.
    with torch_distributed_zero_first(opt.rank):
        if os.path.exists("tmp.pk") is False:
            coco = get_coco_api_from_dataset(val_dataset)
            with open("tmp.pk", "wb") as f:
                pickle.dump(coco, f)
        else:
            with open("tmp.pk", "rb") as f:
                coco = pickle.load(f)

    if is_main_process:
        print("starting traning for %g epochs..." % epochs)

    start_time = time.time()
    for epoch in range(start_epoch, epochs):
        train_sampler.set_epoch(epoch)
        mloss, lr = train_util.train_one_epoch(
            model,
            optimizer,
            train_data_loader,
            device,
            epoch,
            accumulate=accumulate,  # batches to accumulate per optimizer step
            img_size=imgsz_train,  # input image size
            multi_scale=multi_scale,
            grid_min=grid_min,  # smallest grid size
            grid_max=grid_max,  # largest grid size
            gs=gs,  # grid step: 32
            print_freq=50,  # print every this many steps
            warmup=True)
        # update scheduler
        scheduler.step()

        if opt.notest is False or epoch == epochs - 1:
            # evaluate on the test dataset
            result_info = train_util.evaluate(model,
                                              val_data_loader,
                                              coco=coco,
                                              device=device)

            # Only the first process records info and saves weights. This is
            # nested under the evaluation branch because it needs result_info.
            if is_main_process:
                coco_mAP = result_info[0]
                voc_mAP = result_info[1]
                coco_mAR = result_info[8]

                # write into tensorboard
                if tb_writer:
                    tags = [
                        'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                        'train/loss', "learning_rate", "mAP@[IoU=0.50:0.95]",
                        "mAP@[IoU=0.5]", "mAR@[IoU=0.50:0.95]"
                    ]
                    for x, tag in zip(
                            mloss.tolist() + [lr, coco_mAP, voc_mAP, coco_mAR],
                            tags):
                        tb_writer.add_scalar(tag, x, epoch)

                # write into txt
                with open(results_file, "a") as f:
                    result_strings = [str(round(i, 4)) for i in result_info]
                    txt = "epoch:{} {}".format(epoch, ' '.join(result_strings))
                    f.write(txt + "\n")

                # update best mAP(IoU=0.50:0.95)
                if coco_mAP > best_map:
                    best_map = coco_mAP

                if opt.savebest is False:
                    # save weights every epoch
                    save_path = "./weights/yolov3spp-{}.pt".format(epoch)
                elif best_map == coco_mAP:
                    # only save best weights
                    save_path = best
                else:
                    save_path = None

                if save_path is not None:
                    with open(results_file, 'r') as f:
                        training_results = f.read()
                    save_files = {
                        'model': model.module.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'training_results': training_results,
                        'epoch': epoch,
                        'best_map': best_map
                    }
                    torch.save(save_files, save_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    if is_main_process:
        print('Training time {}'.format(total_time_str))
def train(hyp):
    """Train YOLOv3-SPP in a single process.

    Besides ``hyp`` the function reads the module-level ``opt`` (parsed
    command-line options), ``results_file`` and ``tb_writer``.
    NOTE(review): those names are not defined in this function and are
    assumed to exist at module level - confirm.

    Args:
        hyp: dict of hyper-parameters. It is modified in place: ``cls`` is
            rescaled to the number of classes of the current data set.

    Raises:
        AssertionError: if ``opt.img_size`` is not a multiple of 32.
        KeyError: if the given weights do not match the model built from
            ``opt.cfg``.

    Side effects:
        Deletes old ``*_batch*.jpg`` / result files and writes
        ``weights/yolov3spp-<epoch>.pth`` after every epoch.
    """
    device = torch.device(opt.device if torch.cuda.is_available() else "cpu")

    cfg = opt.cfg
    data = opt.data
    epochs = opt.epochs
    batch_size = opt.batch_size
    accumulate = max(round(64 / batch_size), 1)  # accumulate n times before optimizer update (bs 64)
    weights = opt.weights  # initial training weights
    imgsz_train = opt.img_size
    imgsz_test = opt.img_size  # test image sizes
    multi_scale = opt.multi_scale

    # Image sizes: must be a multiple of 32.
    gs = 32  # (pixels) grid size
    assert math.fmod(
        imgsz_test,
        gs) == 0, "--img-size %g must be a %g-multiple" % (imgsz_test, gs)
    grid_min, grid_max = imgsz_test // gs, imgsz_test // gs
    if multi_scale:
        imgsz_min = opt.img_size // 1.5
        imgsz_max = opt.img_size // 0.667

        # Round the smallest / largest input size down to a multiple of 32.
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs
        imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs)
        imgsz_train = imgsz_max  # initialize with max size
        print("Using multi_scale training, image range[{}, {}]".format(
            imgsz_min, imgsz_max))

    # configure run
    data_dict = parse_data_cfg(data)
    train_path = data_dict["train"]
    test_path = data_dict["valid"]
    nc = 1 if opt.single_cls else int(
        data_dict["classes"])  # number of classes
    hyp["cls"] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset

    # remove previous results
    for f in glob.glob("*_batch*.jpg") + glob.glob(results_file):
        os.remove(f)

    # Checkpoints are written here; make sure the directory exists.
    os.makedirs("weights", exist_ok=True)

    # Initialize model
    model = Darknet(cfg).to(device)

    # optimizer
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        if ".bias" in k:
            pg2.append(v)  # biases (bn biases and conv2d biases)
        elif "Conv2d.weight" in k:
            pg1.append(v)  # apply weight_decay
        else:
            pg0.append(v)  # all else, (bn weight)

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp["lr0"])
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp["lr0"],
                              momentum=hyp["momentum"],
                              nesterov=True)
    optimizer.add_param_group({
        "params": pg1,
        "weight_decay": hyp["weight_decay"]
    })  # add pg1 with weight_decay
    optimizer.add_param_group({"params": pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g Conv2d.weight, %g other' %
          (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    start_epoch = 0
    best_fitness = 0.0
    # ".pth" is accepted as well because that is the extension this function
    # uses for the checkpoints it saves.
    if weights.endswith((".pt", ".pth")):
        ckpt = torch.load(weights, map_location=device)

        # load model: keep only the tensors whose size matches this model
        try:
            model_state = model.state_dict()  # built once, not once per key
            ckpt["model"] = {
                k: v
                for k, v in ckpt["model"].items()
                if model_state[k].numel() == v.numel()
            }
            model.load_state_dict(ckpt["model"], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        if ckpt.get("optimizer") is not None:
            optimizer.load_state_dict(ckpt["optimizer"])
            # Checkpoints written by this function carry no "best_fitness".
            best_fitness = ckpt.get("best_fitness", best_fitness)

        # load results
        if ckpt.get("training_results") is not None:
            with open(results_file, "w") as file:
                file.write(ckpt["training_results"])  # write results.txt

        # epochs
        start_epoch = ckpt["epoch"] + 1
        if epochs < start_epoch:
            print(
                '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                % (opt.weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    if opt.freeze_layers:
        # The module right before each YOLOLayer is its predictor; the
        # YOLOLayer itself is not the predictor.
        output_layer_indices = [
            idx - 1 for idx, module in enumerate(model.module_list)
            if isinstance(module, YOLOLayer)
        ]
        # Freeze every layer except the predictors and the YOLOLayers.
        freeze_layer_indices = [
            x for x in range(len(model.module_list))
            if (x not in output_layer_indices) and (
                x - 1 not in output_layer_indices)
        ]
        for idx in freeze_layer_indices:
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    def cosine_lr_factor(x):
        """Cosine decay of the lr multiplier from 1 down to 0.01."""
        return ((1 + math.cos(x * math.pi / epochs)) / 2) * 0.99 + 0.01

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=cosine_lr_factor)
    scheduler.last_epoch = start_epoch  # epoch to start from

    # dataset
    # The training set is built with imgsz_train (the largest size when
    # multi-scale training is enabled).
    train_dataset = LoadImageAndLabels(
        train_path,
        imgsz_train,
        batch_size,
        augment=True,
        hyp=hyp,  # augmentation hyperparameters
        rect=opt.rect,  # rectangular training
        cache_images=opt.cache_images,
        single_cls=opt.single_cls)

    # The validation set always uses the plain img_size.
    val_dataset = LoadImageAndLabels(
        test_path,
        imgsz_test,
        batch_size,
        hyp=hyp,
        # Per the original note: fit each batch to a suitable (not
        # necessarily square) size to reduce computation.
        rect=True,
        cache_images=opt.cache_images,
        single_cls=opt.single_cls)

    # dataloader
    # os.cpu_count() may return None when the count cannot be determined.
    nw = min(os.cpu_count() or 1,
             batch_size if batch_size > 1 else 0,
             8)  # number of workers
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=nw,
        # Shuffle=True unless rectangular training is used
        shuffle=not opt.rect,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn)

    val_datasetloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        num_workers=nw,
        pin_memory=True,
        collate_fn=val_dataset.collate_fn)

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    # Per-class weights derived from the training labels.
    model.class_weights = labels_to_class_weights(train_dataset.labels,
                                                  nc).to(device)

    # start training
    # caching val_data when you have plenty of memory(RAM)
    print("caching val_data for evaluation.")
    coco = get_coco_api_from_dataset(val_dataset)

    print("starting traning for %g epochs..." % epochs)
    print('Using %g dataloader workers' % nw)
    for epoch in range(start_epoch, epochs):
        mloss, lr = train_util.train_one_epoch(
            model,
            optimizer,
            train_dataloader,
            device,
            epoch,
            accumulate=accumulate,  # batches to accumulate per optimizer step
            img_size=imgsz_train,  # input image size
            batch_size=batch_size,
            multi_scale=multi_scale,
            grid_min=grid_min,  # smallest grid size
            grid_max=grid_max,  # largest grid size
            gs=gs,  # grid step: 32
            print_freq=50,  # print every this many steps
            warmup=True)
        # update scheduler
        scheduler.step()

        # evaluate on the test dataset
        result_info = train_util.evaluate(model,
                                          val_datasetloader,
                                          coco=coco,
                                          device=device)

        # write into tensorboard
        if tb_writer:
            tags = [
                'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                'train/loss', "learning_rate", "mAP@[IoU=0.50:0.95]",
                "mAP@[IoU=0.5]", "mAR@[IoU=0.50:0.95]"
            ]
            coco_mAP = result_info[0]
            voc_mAP = result_info[1]
            coco_mAR = result_info[8]
            for x, tag in zip(
                    mloss.tolist() + [lr, coco_mAP, voc_mAP, coco_mAR], tags):
                tb_writer.add_scalar(tag, x, epoch)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch
        }
        torch.save(save_files, "./weights/yolov3spp-{}.pth".format(epoch))
def train(hyp):
    """Train a YOLOv3-SPP Darknet model, evaluating and checkpointing per epoch.

    Run configuration is read from the module-level ``opt`` namespace; when the
    module-level ``tb_writer`` is truthy, training/eval scalars are logged to it.

    Args:
        hyp: Mapping of hyper-parameters. The keys read here are ``"cls"``,
            ``"obj"``, ``"lr0"``, ``"momentum"``, ``"weight_decay"`` and
            ``"lrf"``. ``hyp["cls"]`` and ``hyp["obj"]`` are rescaled in place,
            so the caller's mapping is modified.

    Raises:
        KeyError: If the model weights in the ``opt.weights`` checkpoint cannot
            be matched against the model built from ``opt.cfg``.
    """
    device = torch.device(opt.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    wdir = "weights" + os.sep  # weights dir
    best = wdir + "best.pt"
    results_file = "results.txt"
    # Every checkpoint below is written under this directory; create it up
    # front so the first torch.save cannot fail on a missing folder.
    os.makedirs(wdir, exist_ok=True)

    cfg = opt.cfg
    data = opt.data
    epochs = opt.epochs
    batch_size = opt.batch_size
    # accumulate n batches before each optimizer update (nominal batch of 64)
    accumulate = max(round(64 / batch_size), 1)
    weights = opt.weights  # initial training weights
    imgsz_train = opt.img_size
    imgsz_test = opt.img_size  # test image size
    multi_scale = opt.multi_scale

    # Image sizes must be a multiple of the grid size.
    gs = 32  # (pixels) grid size
    assert math.fmod(imgsz_test, gs) == 0, "--img-size %g must be a %g-multiple" % (imgsz_test, gs)
    grid_min, grid_max = imgsz_test // gs, imgsz_test // gs
    if multi_scale:
        imgsz_min = opt.img_size // 1.5
        imgsz_max = opt.img_size // 0.667

        # Round the min/max input sizes down to a multiple of the grid size.
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs
        imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs)
        imgsz_train = imgsz_max  # initialize with max size
        print("Using multi_scale training, image range[{}, {}]".format(imgsz_min, imgsz_max))

    # configure run
    data_dict = parse_data_cfg(data)
    train_path = data_dict["train"]
    test_path = data_dict["valid"]
    nc = 1 if opt.single_cls else int(data_dict["classes"])  # number of classes
    hyp["cls"] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset
    hyp["obj"] *= imgsz_test / 320

    # Remove previous results
    for f in glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg).to(device)

    if opt.freeze_layers:
        # A YOLOLayer is not itself the predictor; the module right before it is.
        predictor_indices = {idx - 1 for idx, module in enumerate(model.module_list)
                             if isinstance(module, YOLOLayer)}
        # Freeze every module except the predictors and the YOLOLayers
        # (original note: 3x2=6 parameters stay trainable).
        freeze_layer_indeces = [x for x in range(len(model.module_list))
                                if (x not in predictor_indices) and (x - 1 not in predictor_indices)]
        for idx in freeze_layer_indeces:
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)
    else:
        # Default when freeze_layers is off: train only what follows darknet53.
        # Remove this branch to train all weights.
        darknet_end_layer = 74  # only valid for the yolov3spp cfg
        # Freeze darknet53 layers [0, 74]
        # (original note: 21x3+3x2=69 parameters stay trainable).
        for idx in range(darknet_end_layer + 1):
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)

    # optimizer
    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=hyp["lr0"], momentum=hyp["momentum"],
                          weight_decay=hyp["weight_decay"], nesterov=True)

    start_epoch = 0
    best_map = 0.0
    if weights.endswith((".pt", ".pth")):
        ckpt = torch.load(weights, map_location=device)

        # load model: keep only tensors whose element count matches the model
        try:
            # Build the model state dict once instead of once per checkpoint key.
            model_state = model.state_dict()
            ckpt["model"] = {k: v for k, v in ckpt["model"].items()
                             if model_state[k].numel() == v.numel()}
            model.load_state_dict(ckpt["model"], strict=False)
        except KeyError as e:
            s = ("%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. "
                 "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights))
            raise KeyError(s) from e

        # load optimizer; .get() tolerates checkpoints saved without one
        if ckpt.get("optimizer") is not None:
            optimizer.load_state_dict(ckpt["optimizer"])
            if "best_map" in ckpt:
                best_map = ckpt["best_map"]

        # load results
        if ckpt.get("training_results") is not None:
            with open(results_file, "w") as file:
                file.write(ckpt["training_results"])  # write results.txt

        # epochs
        start_epoch = ckpt["epoch"] + 1
        if epochs < start_epoch:
            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                  % (opt.weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs
        del ckpt

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp["lrf"]) + hyp["lrf"]  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler.last_epoch = start_epoch  # resume the schedule from this epoch

    # dataset
    # The training image size is the largest size of the multi-scale range.
    train_dataset = LoadImageAndLabels(train_path, imgsz_train, batch_size,
                                       augment=True,
                                       hyp=hyp,  # augmentation hyperparameters
                                       rect=opt.rect,  # rectangular training
                                       cache_images=opt.cache_images,
                                       single_cls=opt.single_cls)

    # The validation image size is opt.img_size; rect=True resizes each batch
    # to a fitting (not necessarily square) shape to reduce computation.
    val_dataset = LoadImageAndLabels(test_path, imgsz_test, batch_size,
                                     hyp=hyp,
                                     rect=True,
                                     cache_images=opt.cache_images,
                                     single_cls=opt.single_cls)

    # dataloader
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   num_workers=nw,
                                                   # Shuffle=True unless rectangular training is used
                                                   shuffle=not opt.rect,
                                                   pin_memory=True,
                                                   collate_fn=train_dataset.collate_fn)

    val_datasetloader = torch.utils.data.DataLoader(val_dataset,
                                                    batch_size=batch_size,
                                                    num_workers=nw,
                                                    pin_memory=True,
                                                    collate_fn=val_dataset.collate_fn)

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)

    # start training
    # Build the COCO-style ground truth of the validation set once, up front.
    coco = get_coco_api_from_dataset(val_dataset)

    print("starting traning for %g epochs..." % epochs)
    print('Using %g dataloader workers' % nw)
    for epoch in range(start_epoch, epochs):
        mloss, lr = train_util.train_one_epoch(model, optimizer, train_dataloader,
                                               device, epoch,
                                               accumulate=accumulate,  # batches per optimizer update
                                               img_size=imgsz_train,  # input image size
                                               multi_scale=multi_scale,
                                               grid_min=grid_min,  # smallest grid size
                                               grid_max=grid_max,  # largest grid size
                                               gs=gs,  # grid step: 32
                                               print_freq=50,  # print every N steps
                                               warmup=True)
        # update scheduler
        scheduler.step()

        # Logging and checkpointing need the evaluation metrics, so they only
        # happen on epochs that are actually evaluated.
        if opt.notest is False or epoch == epochs - 1:
            # evaluate on the test dataset
            result_info = train_util.evaluate(model, val_datasetloader,
                                              coco=coco, device=device)

            coco_mAP = result_info[0]
            voc_mAP = result_info[1]
            coco_mAR = result_info[8]

            # write into tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'train/loss', "learning_rate",
                        "mAP@[IoU=0.50:0.95]", "mAP@[IoU=0.5]", "mAR@[IoU=0.50:0.95]"]

                for x, tag in zip(mloss.tolist() + [lr, coco_mAP, voc_mAP, coco_mAR], tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # write into txt
            with open(results_file, "a") as f:
                result_info = [str(round(i, 4)) for i in result_info]
                txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
                f.write(txt + "\n")

            # update best mAP(IoU=0.50:0.95)
            if coco_mAP > best_map:
                best_map = coco_mAP

            # Pick the checkpoint target: every epoch, or only a new best.
            if opt.savebest is False:
                save_path = "./weights/yolov3spp-{}.pt".format(epoch)
            elif best_map == coco_mAP:
                save_path = best
            else:
                save_path = None

            if save_path is not None:
                with open(results_file, 'r') as f:
                    save_files = {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'training_results': f.read(),
                        'epoch': epoch,
                        'best_map': best_map}
                    torch.save(save_files, save_path)
def main(parser_data):
    """Train SSD300 on Pascal VOC 2012, logging metrics and saving every epoch.

    Args:
        parser_data: Parsed command-line namespace. Attributes read here are
            ``device``, ``data_path``, ``batch_size``, ``num_classes``,
            ``resume``, ``start_epoch`` and ``epochs``. ``start_epoch`` is
            overwritten when a checkpoint is resumed.

    Raises:
        FileNotFoundError: If ``<data_path>/VOCdevkit`` does not exist.
        AssertionError: If ``batch_size`` is not greater than 1.
    """
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("save_weights", exist_ok=True)

    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_transform = {
        "train": transforms.Compose([transforms.SSDCropping(),
                                     transforms.Resize(),
                                     transforms.ColorJitter(),
                                     transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(),
                                     transforms.Normalization(),
                                     transforms.AssignGTtoDefaultBox()]),
        "val": transforms.Compose([transforms.Resize(),
                                   transforms.ToTensor(),
                                   transforms.Normalization()])
    }

    VOC_root = parser_data.data_path
    # check voc root
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError("VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_dataset = VOCDataSet(VOC_root, "2012", data_transform['train'], train_set='train.txt')

    # Training requires a batch size greater than 1.
    batch_size = parser_data.batch_size
    assert batch_size > 1, "batch size must be greater than 1"
    # Drop the last batch only when it would contain a single sample.
    drop_last = len(train_dataset) % batch_size == 1
    # os.cpu_count() may return None; fall back to 1 so min() cannot raise.
    nw = min([os.cpu_count() or 1, batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    num_workers=nw,
                                                    collate_fn=train_dataset.collate_fn,
                                                    drop_last=drop_last)

    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_dataset = VOCDataSet(VOC_root, "2012", data_transform['val'], train_set='val.txt')
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=nw,
                                                  collate_fn=val_dataset.collate_fn)

    # Use the namespace passed in rather than a module-level ``args`` global,
    # so the function also works when called with its own argument object.
    model = create_model(num_classes=parser_data.num_classes + 1)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.0005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.3)

    # If a checkpoint path is given, continue training from it.
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []

    # Build the validation ground truth once instead of on every evaluation.
    val_data = get_coco_api_from_dataset(val_data_loader.dataset)
    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        mean_loss, lr = utils.train_one_epoch(model=model, optimizer=optimizer,
                                              data_loader=train_data_loader,
                                              device=device, epoch=epoch,
                                              print_freq=50)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update learning rate
        lr_scheduler.step()

        coco_info = utils.evaluate(model=model, data_loader=val_data_loader,
                                   device=device, data_set=val_data)

        # write into txt: evaluation metrics followed by loss and learning rate
        with open(results_file, "a") as f:
            result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/ssd300-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(parser_data):
    """Train SSD300 on the X-ray dataset and save a checkpoint every epoch.

    Args:
        parser_data: Parsed command-line namespace. Attributes read here are
            ``device``, ``data_path``, ``resume``, ``start_epoch`` and
            ``epochs``. ``start_epoch`` is overwritten when a checkpoint is
            resumed.
    """
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print(device)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transform.Compose([transform.SSDCropping(),
                                    transform.Resize(),
                                    transform.ColorJitter(),
                                    transform.ToTensor(),
                                    transform.RandomHorizontalFlip(),
                                    transform.Normalization(),
                                    transform.AssignGTtoDefaultBox()]),
        "val": transform.Compose([transform.Resize(),
                                  transform.ToTensor(),
                                  transform.Normalization()])
    }

    XRay_root = parser_data.data_path
    train_dataset = XRayDataset(XRay_root, data_transform['train'], train_set='train.txt')
    # Note that the batch_size must be greater than 1
    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=8,
                                                    shuffle=True,
                                                    num_workers=4,
                                                    collate_fn=utils.collate_fn)

    val_dataset = XRayDataset(XRay_root, data_transform['val'], train_set='val.txt')
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=1,
                                                  shuffle=False,
                                                  num_workers=0,
                                                  collate_fn=utils.collate_fn)

    model = create_model(num_classes=6, device=device)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.0005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.3)

    # If a checkpoint path is given, continue training from it.
    if parser_data.resume != "":
        # Deserialize onto the CPU so a checkpoint written on a GPU machine
        # can still be resumed on a host without that device;
        # load_state_dict then copies the values into the live objects.
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []

    # With enough memory, the validation ground truth could be built once
    # here and passed as data_set; None leaves that to utils.evaluate.
    val_data = None
    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        utils.train_one_epoch(model=model, optimizer=optimizer,
                              data_loader=train_data_loader,
                              device=device, epoch=epoch,
                              print_freq=50, train_loss=train_loss,
                              train_lr=learning_rate)

        lr_scheduler.step()

        utils.evaluate(model=model, data_loader=val_data_loader,
                       device=device, data_set=val_data, mAP_list=val_map)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/ssd300-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(parser_data):
    """Train the detection model on the X-ray dataset, saving every epoch.

    Args:
        parser_data: Parsed command-line namespace. Attributes read here are
            ``device``, ``data_path``, ``resume``, ``start_epoch`` and
            ``epochs``. ``start_epoch`` is overwritten when a checkpoint is
            resumed.
    """
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print(device)

    # Checkpoints are written into ./save_weights at the end of every epoch;
    # create the folder up front so the first save cannot fail after a full
    # epoch of training.
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    root = parser_data.data_path
    # load train data set
    train_data_set = XRayDataset(root=root, transform=data_transform["train"], train_set=True)
    # A custom collate_fn is required: each sample is an (image, targets)
    # pair, which the default collation cannot batch.
    train_data_loader = torch.utils.data.DataLoader(train_data_set,
                                                    batch_size=4,
                                                    shuffle=True,
                                                    num_workers=0,
                                                    collate_fn=utils.collate_fn)

    # load validation data set
    val_data_set = XRayDataset(root=root, transform=data_transform["val"], train_set=False)
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set,
                                                      batch_size=2,
                                                      shuffle=False,
                                                      num_workers=0,
                                                      collate_fn=utils.collate_fn)

    # create model: num_classes equals background + 5 classes
    model = create_model(num_classes=6)
    print(model)

    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.33)

    # If a checkpoint path is given, continue training from it.
    if parser_data.resume != "":
        # Deserialize onto the CPU so a checkpoint written on a GPU machine
        # can still be resumed on a host without that device.
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 50 iterations
        utils.train_one_epoch(model, optimizer, train_data_loader,
                              device, epoch, print_freq=50, warmup=True)
        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch))
def main(args):
    """Run the COCO training/evaluation loop and record per-epoch metrics.

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``device``, ``data_path``, ``batch_size``, ``num_classes``, ``lr``,
            ``momentum``, ``weight_decay``, ``lr_steps``, ``lr_gamma``,
            ``resume``, ``test_only``, ``start_epoch``, ``epochs``,
            ``print_freq`` and ``output_dir``. ``start_epoch`` is overwritten
            when a checkpoint is resumed.

    Raises:
        FileNotFoundError: If ``args.data_path`` does not exist.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    print(device)

    # File that collects the evaluation metrics of every epoch.
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # Data loading code
    print("Loading data")

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    coco_root = args.data_path
    # check root
    if os.path.exists(coco_root) is False:
        raise FileNotFoundError("coco dose not in path:'{}'.".format(coco_root))

    batch_size = args.batch_size
    # os.cpu_count() may return None; fall back to 1 so min() cannot raise.
    # The trailing 0 caps the result at zero worker processes.
    nw = min([os.cpu_count() or 1, batch_size if batch_size > 1 else 0, 0])  # number of workers
    print('Using %g dataloader workers' % nw)

    val_dataset = get_coco(coco_root, "val", data_transform["val"])

    print("Creating data loaders")
    val_data_set_loader = torch.utils.data.DataLoader(val_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      pin_memory=True,
                                                      num_workers=nw,
                                                      collate_fn=val_dataset.collate_fn)

    print("Creating model")
    model = get_model(num_classes=args.num_classes + 1)
    model.to(device)
    print(model)

    # Alias kept for checkpointing; no DistributedDataParallel wrapper is
    # applied in this function, so both names refer to the same module.
    model_without_ddp = model

    train_loss = []
    learning_rate = []
    val_map = []

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        milestones=args.lr_steps,
                                                        gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location=device)
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        train_eval_utils.evaluate(model, val_data_set_loader, device=device)
        return

    # Create the checkpoint folder before the first save is attempted.
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    for epoch in range(args.start_epoch, args.epochs):
        # NOTE(review): the loader passed to train_one_epoch is the validation
        # loader; no training split is built in this function. Confirm this
        # is intended before relying on the resulting weights.
        metric_logger = train_eval_utils.train_one_epoch(model, optimizer, val_data_set_loader,
                                                         device, epoch, args.print_freq)
        mean_loss, lr = metric_logger["mloss"], metric_logger["lr"]
        train_loss.append(mean_loss)
        learning_rate.append(lr)

        lr_scheduler.step()

        # evaluate after every epoch
        coco_info = train_eval_utils.evaluate(model, val_data_set_loader, device=device)

        # write into txt: evaluation metrics followed by loss and learning rate
        with open(results_file, "a") as f:
            result_info = [str(round(i, 4)) for i in coco_info + [mean_loss, lr]]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        if args.output_dir:
            utils.save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch},
                os.path.join(args.output_dir, 'resnet-fpn-model-{}.pth'.format(epoch)))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(args):
    """Train SSD on VOC2012, optionally under torch.distributed.

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``device``, ``data_path``, ``distributed``,
            ``aspect_ratio_group_factor``, ``batch_size``, ``workers``,
            ``gpu``, ``lr``, ``momentum``, ``weight_decay``, ``lr_step_size``,
            ``lr_gamma``, ``resume``, ``test_only``, ``start_epoch``,
            ``epochs``, ``print_freq`` and ``output_dir``. ``start_epoch`` is
            overwritten when a checkpoint is resumed.
    """
    print(args)
    utils.init_distributed_mode(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")

    train_pipeline = transform.Compose([
        transform.SSDCropping(),
        transform.Resize(),
        transform.ColorJitter(),
        transform.ToTensor(),
        transform.RandomHorizontalFlip(),
        transform.Normalization(),
        transform.AssignGTtoDefaultBox(),
    ])
    val_pipeline = transform.Compose([
        transform.Resize(),
        transform.ToTensor(),
        transform.Normalization(),
    ])
    data_transform = {"train": train_pipeline, "val": val_pipeline}

    VOC_root = args.data_path
    # training split, then validation split
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], train_set='train.txt')
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], train_set='val.txt')

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_data_set)
        test_sampler = torch.utils.data.distributed.DistributedSampler(val_data_set)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_data_set)
        test_sampler = torch.utils.data.SequentialSampler(val_data_set)

    if args.aspect_ratio_group_factor < 0:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)
    else:
        # Bin the training images by aspect ratio, then batch within bins.
        group_ids = create_aspect_ratio_groups(train_data_set, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)

    data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=4,
        sampler=test_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)

    print("Creating model")
    model = create_model(num_classes=21)
    model.to(device)

    # Keep a handle on the bare module so checkpoints hold unwrapped weights.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    trainable = [weight for weight in model.parameters() if weight.requires_grad]
    optimizer = torch.optim.SGD(
        trainable, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)

    # Continue from a previous run when a checkpoint path is supplied.
    if args.resume:
        # Loading onto the CPU first keeps every process on the machine from
        # restoring the tensors onto the same device they were saved from.
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        utils.evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    started_at = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        utils.train_one_epoch(model, optimizer, data_loader,
                              device, epoch, args.print_freq)
        lr_scheduler.step()

        if args.output_dir:
            # Only the primary process actually writes the checkpoint.
            snapshot = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch,
            }
            target = os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))
            utils.save_on_master(snapshot, target)

        # evaluate after every epoch
        utils.evaluate(model, data_loader_test, device=device)

    elapsed_seconds = int(time.time() - started_at)
    total_time_str = str(datetime.timedelta(seconds=elapsed_seconds))
    print('Training time {}'.format(total_time_str))
def main(parser_data):
    """Train the detection model on the TGK dataset, saving every epoch.

    Args:
        parser_data: Parsed command-line namespace. Attributes read here are
            ``device``, ``data_path``, ``batch_size``, ``resume``,
            ``start_epoch`` and ``epochs``. ``start_epoch`` is overwritten
            when a checkpoint is resumed.

    Raises:
        FileNotFoundError: If ``<data_path>/TGK_DATASET`` does not exist.
    """
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # Checkpoints are written into ./save_weights at the end of every epoch;
    # create the folder up front so the first save cannot fail.
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    Data_root = parser_data.data_path
    # check dataset root (change the folder name here for another dataset)
    if os.path.exists(os.path.join(Data_root, "TGK_DATASET")) is False:
        raise FileNotFoundError("TGK_DATASET dose not in path:'{}'.".format(Data_root))

    # load train data set
    train_data_set = TGKDataSet(Data_root, data_transform["train"], True)
    # A custom collate_fn is required: each sample is an (image, targets)
    # pair, which the default collation cannot batch.
    batch_size = parser_data.batch_size
    # os.cpu_count() may return None; fall back to 1 so min() cannot raise.
    nw = min([os.cpu_count() or 1, batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_data_set,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    num_workers=nw,
                                                    collate_fn=train_data_set.collate_fn)

    # load validation data set
    val_data_set = TGKDataSet(Data_root, data_transform["val"], False)
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=nw,
                                                      collate_fn=val_data_set.collate_fn)

    # create model: num_classes equals background + number of labelled
    # classes (adjust when the label set changes)
    model = create_model(num_classes=2)

    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.0025,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.33)

    # If a checkpoint path is given, continue training from it.
    if parser_data.resume != "":
        # Deserialize onto the CPU so a checkpoint written on a GPU machine
        # can still be resumed on a host without that device.
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_mAP = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 10 iterations
        utils.train_one_epoch(model, optimizer, train_data_loader,
                              device, epoch, train_loss=train_loss, train_lr=learning_rate,
                              print_freq=10, warmup=True)
        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_mAP:
        from plot_curve import plot_map
        plot_map(val_mAP)
def main(args):
    """Train the detection model on Pascal VOC 2012 and log metrics per epoch.

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``device``, ``data_path``, ``batch_size``, ``num_classes``,
            ``resume``, ``start_epoch``, ``epochs`` and ``output_dir``.
            ``start_epoch`` is overwritten when a checkpoint is resumed.

    Raises:
        FileNotFoundError: If ``<data_path>/VOCdevkit`` does not exist.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    print(device)

    # File that collects the evaluation metrics of every epoch.
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    results_file = "results{}.txt".format(timestamp)

    train_pipeline = transforms.Compose([
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(0.5),
    ])
    val_pipeline = transforms.Compose([transforms.ToTensor()])
    data_transform = {"train": train_pipeline, "val": val_pipeline}

    VOC_root = args.data_path
    # The dataset root must contain a VOCdevkit folder.
    devkit_dir = os.path.join(VOC_root, "VOCdevkit")
    if os.path.exists(devkit_dir) is False:
        raise FileNotFoundError("VOCdevkit dose not in path:'{}'.".format(VOC_root))

    batch_size = args.batch_size
    worker_candidates = [os.cpu_count(), batch_size if batch_size > 1 else 0, 0]
    nw = min(worker_candidates)  # number of workers
    print('Using %g dataloader workers' % nw)

    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], "train.txt")
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], "val.txt")
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    model = get_model(num_classes=args.num_classes + 1)
    print(model)
    model.to(device)

    trainable = [weight for weight in model.parameters() if weight.requires_grad]
    optimizer = torch.optim.SGD(trainable, lr=0.005, momentum=0.9, weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.33)

    # Continue from a previous run when a checkpoint path is supplied.
    if args.resume != "":
        checkpoint = torch.load(args.resume, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(args.start_epoch))

    train_loss, learning_rate, val_map = [], [], []

    for epoch in range(args.start_epoch, args.epochs):
        # one pass over the training data, printing every 50 iterations
        metric_logger = train_eval_utils.train_one_epoch(
            model, optimizer, train_data_loader,
            device=device, epoch=epoch, print_freq=50)
        mean_loss = metric_logger["mloss"]
        lr = metric_logger["lr"]
        train_loss.append(mean_loss)
        learning_rate.append(lr)

        # advance the learning-rate schedule
        lr_scheduler.step()

        # evaluate on the validation split
        coco_info = train_eval_utils.evaluate(model, val_data_set_loader, device=device)

        # Append one line per epoch: evaluation metrics, then loss and lr.
        with open(results_file, "a") as f:
            rounded = [str(round(value, 4)) for value in coco_info + [mean_loss, lr]]
            f.write("epoch:{} {}".format(epoch, ' '.join(rounded)) + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        if args.output_dir:
            snapshot = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch,
            }
            target = os.path.join(args.output_dir, 'resnet-fpn-model-{}.pth'.format(epoch))
            utils.save_on_master(snapshot, target)

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main():
    """Train the detector on Pascal VOC 2012 for 20 epochs.

    Uses ``cuda:0`` when CUDA is available, otherwise the CPU. After every
    epoch the model is evaluated and a checkpoint is written to
    ``./save_weights/ssd300-<epoch>.pth``. Loss, learning-rate and mAP curves
    are plotted once training has finished.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

    weights_dir = "save_weights"
    if not os.path.exists(weights_dir):
        os.mkdir(weights_dir)

    # Augmentation pipeline for training, plain preprocessing for validation.
    train_pipeline = transform.Compose([
        transform.SSDCropping(),
        transform.Resize(),
        transform.ColorJitter(),
        transform.ToTensor(),
        transform.RandomHorizontalFlip(),
        transform.Normalization(),
        transform.AssignGTtoDefaultBox(),
    ])
    val_pipeline = transform.Compose([
        transform.Resize(),
        transform.ToTensor(),
        transform.Normalization(),
    ])
    data_transform = {"train": train_pipeline, "val": val_pipeline}

    voc_path = "../"

    # NOTE (from the original author): batch_size must be greater than 1
    # when training.
    train_set = VOC2012DataSet(voc_path, data_transform['train'], True)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=8,
                                               shuffle=True,
                                               num_workers=0,
                                               collate_fn=utils.collate_fn)

    val_set = VOC2012DataSet(voc_path, data_transform['val'], False)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=0,
                                             collate_fn=utils.collate_fn)

    model = create_model(num_classes=21, device=device)
    model.to(device)

    # Optimize only the parameters that are not frozen.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable,
                                lr=0.002,
                                momentum=0.9,
                                weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.3)

    train_loss = []
    learning_rate = []
    val_map = []

    # Passed to evaluate() as data_set. The original author notes that, with
    # enough memory, the validation data can be pre-loaded here (e.g. via
    # get_coco_api_from_dataset(val_loader.dataset)) so it is not rebuilt on
    # every evaluation.
    val_data = None

    total_epochs = 20
    for epoch in range(total_epochs):
        utils.train_one_epoch(model=model,
                              optimizer=optimizer,
                              data_loader=train_loader,
                              device=device,
                              epoch=epoch,
                              print_freq=50,
                              train_loss=train_loss,
                              train_lr=learning_rate,
                              warmup=True)

        lr_scheduler.step()

        utils.evaluate(model=model,
                       data_loader=val_loader,
                       device=device,
                       data_set=val_data,
                       mAP_list=val_map)

        # Checkpoint everything needed to resume from this epoch.
        checkpoint = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch,
        }
        checkpoint_path = "./save_weights/ssd300-{}.pth".format(epoch)
        torch.save(checkpoint, checkpoint_path)

    # Plot loss and learning-rate curves when anything was recorded.
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # Plot the mAP curve when anything was recorded.
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(parser_data):
    """Train a detection model on Pascal VOC 2012 with optional AMP.

    All configuration is read from ``parser_data``; the function does not
    depend on any module-level ``args`` global.

    Args:
        parser_data: Parsed command-line namespace. The attributes read here
            are ``device``, ``data_path``, ``aspect_ratio_group_factor``,
            ``batch_size``, ``num_classes``, ``amp``, ``resume``,
            ``start_epoch`` and ``epochs``. ``start_epoch`` is overwritten
            when resuming from a checkpoint.

    Raises:
        FileNotFoundError: If ``<data_path>/VOCdevkit`` does not exist.
    """
    device = torch.device(
        parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # File that collects the COCO metrics, loss and learning rate per epoch.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_transform = {
        "train": transforms.Compose(
            [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = parser_data.data_path
    # check voc root
    if not os.path.exists(os.path.join(VOC_root, "VOCdevkit")):
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(VOC_root))

    batch_size = parser_data.batch_size

    # load train data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_dataset = VOCDataSet(VOC_root, "2012", data_transform["train"],
                               "train.txt")

    # Optionally build batches from images with similar aspect ratios. The
    # original author notes this reduces the GPU memory needed for training.
    train_batch_sampler = None
    if parser_data.aspect_ratio_group_factor >= 0:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        # Bin index of every image according to its aspect ratio.
        group_ids = create_aspect_ratio_groups(
            train_dataset, k=parser_data.aspect_ratio_group_factor)
        # Every batch is drawn from a single aspect-ratio bin.
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  batch_size)

    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 so min() never compares None with an int.
    cpu_count = os.cpu_count() or 1
    nw = min([cpu_count, batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using %g dataloader workers' % nw)

    # The collate_fn is custom because every sample is an (image, target)
    # pair that the default collation cannot stack into a batch.
    if train_batch_sampler is not None:
        # Grouped sampling requires passing the sampler as batch_sampler.
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_sampler=train_batch_sampler,
            pin_memory=True,
            num_workers=nw,
            collate_fn=train_dataset.collate_fn)
    else:
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            pin_memory=True,
            num_workers=nw,
            collate_fn=train_dataset.collate_fn)

    # load validation data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_dataset = VOCDataSet(VOC_root, "2012", data_transform["val"],
                             "val.txt")
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        pin_memory=True,
        num_workers=nw,
        collate_fn=val_dataset.collate_fn)

    # create model
    # NOTE (from the original author): num_classes excludes the background.
    model = create_model(num_classes=parser_data.num_classes)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    # Gradient scaler is only needed for mixed-precision training.
    use_amp = parser_data.amp
    scaler = torch.cuda.amp.GradScaler() if use_amp else None

    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.33)

    # Resume from a previously saved checkpoint when one is given. A plain
    # truthiness test treats both "" and None as "no checkpoint".
    if parser_data.resume:
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        if use_amp and "scaler" in checkpoint:
            scaler.load_state_dict(checkpoint["scaler"])
        print("the training process from epoch{}...".format(
            parser_data.start_epoch))

    # Checkpoints are written here; create it up front so the first save
    # cannot fail after a full epoch of training.
    os.makedirs("save_weights", exist_ok=True)

    train_loss = []
    learning_rate = []
    val_map = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model,
                                              optimizer,
                                              train_data_loader,
                                              device,
                                              epoch,
                                              print_freq=50,
                                              warmup=True,
                                              scaler=scaler)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the validation loader
        coco_info = utils.evaluate(model, val_data_loader, device=device)

        # Append COCO metrics, loss and learning rate to the results file.
        with open(results_file, "a") as f:
            result_info = [
                str(round(i, 4)) for i in coco_info + [mean_loss.item()]
            ] + [str(round(lr, 6))]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal map

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch
        }
        if use_amp:
            save_files["scaler"] = scaler.state_dict()
        torch.save(save_files,
                   "./save_weights/resNetFpn-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main():
    """Train a detector on Pascal VOC 2012 in two stages.

    Stage 1 freezes the whole backbone and trains the remaining parameters
    for 5 epochs, then saves ``./save_weights/pretrain.pth``. Stage 2
    unfreezes every backbone child except those named "0"-"3" and trains for
    20 more epochs with a StepLR schedule, saving a checkpoint for each of
    the last 5 epochs. Metrics are appended to a timestamped ``results*.txt``
    file and loss/lr/mAP curves are plotted at the end.

    Raises:
        FileNotFoundError: If ``./VOCdevkit`` does not exist.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # File that collects the COCO metrics, loss and learning rate per epoch.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    def log_epoch(epoch, coco_info, mean_loss, lr):
        """Append one line of metrics for ``epoch`` to the results file."""
        # The learning rate keeps 6 decimals: with gamma=0.33 it drops below
        # 5e-5 after a few decays and would be logged as 0.0 at 4 decimals.
        result_info = [str(round(i, 4)) for i in coco_info + [mean_loss]]
        result_info.append(str(round(lr, 6)))
        with open(results_file, "a") as f:
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

    # Make sure the checkpoint directory exists.
    if not os.path.exists("save_weights"):
        os.makedirs("save_weights")

    data_transform = {
        "train": transforms.Compose(
            [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = "./"
    # check voc root
    if not os.path.exists(os.path.join(VOC_root, "VOCdevkit")):
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # load train data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"],
                                    "train.txt")

    # The collate_fn is custom because every sample is an (image, target)
    # pair that the default collation cannot stack into a batch.
    batch_size = 8
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 so min() never compares None with an int.
    cpu_count = os.cpu_count() or 1
    nw = min([cpu_count, batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    # load validation data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], "val.txt")
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=nw,
        collate_fn=val_data_set.collate_fn)

    # create model: num_classes equals background + 20 classes
    model = create_model(num_classes=21)
    model.to(device)

    train_loss = []
    learning_rate = []
    val_map = []

    # ---------------------------------------------------------------- #
    # Stage 1: freeze the whole backbone and train the remaining
    # parameters (RPN and prediction heads, per the original author).
    # ---------------------------------------------------------------- #
    for param in model.backbone.parameters():
        param.requires_grad = False

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    init_epochs = 5
    for epoch in range(init_epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model,
                                              optimizer,
                                              train_data_loader,
                                              device,
                                              epoch,
                                              print_freq=50,
                                              warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # evaluate on the validation loader
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        log_epoch(epoch, coco_info, mean_loss.item(), lr)
        val_map.append(coco_info[1])  # pascal mAP

    torch.save(model.state_dict(), "./save_weights/pretrain.pth")

    # ---------------------------------------------------------------- #
    # Stage 2: unfreeze the backbone, except for its children named
    # "0"-"3", and continue training.
    # ---------------------------------------------------------------- #
    frozen_children = ("0", "1", "2", "3")
    for name, parameter in model.backbone.named_parameters():
        parameter.requires_grad = name.split(".")[0] not in frozen_children

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.33)

    num_epochs = 20
    total_epochs = num_epochs + init_epochs
    for epoch in range(init_epochs, total_epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model,
                                              optimizer,
                                              train_data_loader,
                                              device,
                                              epoch,
                                              print_freq=50)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the validation loader
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        log_epoch(epoch, coco_info, mean_loss.item(), lr)
        val_map.append(coco_info[1])  # pascal mAP

        # Only the checkpoints of the last 5 epochs are kept.
        if epoch >= total_epochs - 5:
            save_files = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch
            }
            torch.save(save_files,
                       "./save_weights/mobile-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main():
    """Train a detector on Pascal VOC 2012 in two stages.

    Stage 1 freezes the whole backbone and trains the remaining parameters
    for 5 epochs, then saves ``./save_weights/pretrain.pth``. Stage 2
    unfreezes every backbone child except those named "0"-"3" and trains for
    20 more epochs with a StepLR schedule, saving a checkpoint for each of
    the last 5 epochs. Metrics are appended to a timestamped ``results*.txt``
    file and loss/lr/mAP curves are plotted at the end.

    Raises:
        FileNotFoundError: If ``./VOCdevkit`` does not exist.
    """
    # Use GPU 0 when CUDA is available, otherwise fall back to the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # File that collects the COCO metrics, loss and learning rate per epoch.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    def log_epoch(epoch, coco_info, mean_loss, lr):
        """Append one line of metrics for ``epoch`` to the results file.

        The original author notes that this logging is optional.
        """
        # The learning rate keeps 6 decimals: with gamma=0.33 it drops below
        # 5e-5 after a few decays and would be logged as 0.0 at 4 decimals.
        result_info = [str(round(i, 4)) for i in coco_info + [mean_loss]]
        result_info.append(str(round(lr, 6)))
        with open(results_file, "a") as f:
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

    # Make sure the checkpoint directory exists.
    if not os.path.exists("save_weights"):
        os.makedirs("save_weights")

    data_transform = {
        # Convert to tensor, then flip horizontally at random. The original
        # author notes the ground truth has to be flipped accordingly.
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    # The VOC data set is expected in the current working directory.
    VOC_root = "./"
    # check voc root
    if not os.path.exists(os.path.join(VOC_root, "VOCdevkit")):
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # load train data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"],
                                    "train.txt")

    # The collate_fn is custom because every sample is an (image, target)
    # pair that the default collation cannot stack into a batch.
    batch_size = 8  # choose according to the available GPU memory
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 so min() never compares None with an int.
    cpu_count = os.cpu_count() or 1
    # Number of loader workers used to read and preprocess images.
    nw = min([cpu_count, batch_size if batch_size > 1 else 0, 8])
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    # load validation data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], "val.txt")
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=nw,
        collate_fn=val_data_set.collate_fn)

    # create model: 20 classes plus the background class
    model = create_model(num_classes=21)
    model.to(device)  # move the model onto the selected device

    train_loss = []
    learning_rate = []
    val_map = []

    # ---------------------------------------------------------------- #
    # Stage 1: freeze the whole backbone and train the remaining
    # parameters. Per the original author, only the backbone has
    # pretrained weights at this point, so the RPN and the prediction
    # heads are fine-tuned first.
    # ---------------------------------------------------------------- #
    for param in model.backbone.parameters():
        param.requires_grad = False

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    init_epochs = 5  # fine-tune for 5 epochs
    for epoch in range(init_epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model,
                                              optimizer,
                                              train_data_loader,
                                              device,
                                              epoch,
                                              print_freq=50,
                                              warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # evaluate on the validation loader
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        log_epoch(epoch, coco_info, mean_loss.item(), lr)
        val_map.append(coco_info[1])  # pascal mAP

    torch.save(model.state_dict(), "./save_weights/pretrain.pth")

    # ---------------------------------------------------------------- #
    # Stage 2: unfreeze the backbone, except for its children named
    # "0"-"3", and continue training. Per the original author, the early
    # layers hold generic features, so keeping them frozen speeds up
    # training and gives better results.
    # ---------------------------------------------------------------- #
    frozen_children = ("0", "1", "2", "3")
    for name, parameter in model.backbone.named_parameters():
        parameter.requires_grad = name.split(".")[0] not in frozen_children

    # Collect every parameter that still requires gradients.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    # StepLR multiplies the learning rate by gamma every step_size steps.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.33)

    num_epochs = 20
    total_epochs = num_epochs + init_epochs
    for epoch in range(init_epochs, total_epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model,
                                              optimizer,
                                              train_data_loader,
                                              device,
                                              epoch,
                                              print_freq=50)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # Advance the learning-rate schedule by one step.
        lr_scheduler.step()

        # evaluate on the validation loader
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        log_epoch(epoch, coco_info, mean_loss.item(), lr)
        val_map.append(coco_info[1])  # pascal mAP

        # Only the checkpoints of the last 5 epochs are kept.
        if epoch >= total_epochs - 5:
            save_files = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch}
            torch.save(save_files,
                       "./save_weights/mobile-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main():
    """Train a detector on Pascal VOC 2012 in two stages.

    The backbone is frozen for the first 5 epochs, after which
    ``./save_weights/pretrain.pth`` is written. The backbone is then
    unfrozen, except for its children named "0"-"3", and training runs for
    another 20 epochs; checkpoints are saved for epochs above 10 of that
    second run. Loss, learning-rate and mAP curves are plotted at the end.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    print(device)

    # Make sure the checkpoint directory exists.
    weights_dir = "save_weights"
    if not os.path.exists(weights_dir):
        os.makedirs(weights_dir)

    train_tf = transforms.Compose([transforms.ToTensor(),
                                   transforms.RandomHorizontalFlip(0.5)])
    val_tf = transforms.Compose([transforms.ToTensor()])
    data_transform = {"train": train_tf, "val": val_tf}

    VOC_root = "./"

    # Training data. The collate_fn is custom because every sample is an
    # (image, target) pair that the default collation cannot batch.
    train_set = VOC2012DataSet(VOC_root, data_transform["train"], True)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=8,
                                               shuffle=True,
                                               num_workers=0,
                                               collate_fn=utils.collate_fn)

    # Validation data.
    val_set = VOC2012DataSet(VOC_root, data_transform["val"], False)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=0,
                                             collate_fn=utils.collate_fn)

    # num_classes equals background + 20 classes
    model = create_model(num_classes=21)
    print(model)
    model.to(device)

    # These lists are handed to the utils helpers via the train_loss,
    # train_lr and mAP_list keyword arguments.
    train_loss = []
    learning_rate = []
    val_mAP = []

    # ---- Stage 1: backbone frozen ------------------------------------ #
    for frozen in model.backbone.parameters():
        frozen.requires_grad = False

    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    frozen_epochs = 5
    for epoch in range(frozen_epochs):
        utils.train_one_epoch(model, optimizer, train_loader,
                              device, epoch, print_freq=50,
                              train_loss=train_loss, train_lr=learning_rate)

        utils.evaluate(model, val_loader, device=device, mAP_list=val_mAP)

    torch.save(model.state_dict(), "./save_weights/pretrain.pth")

    # ---- Stage 2: backbone unfrozen except children "0"-"3" ---------- #
    still_frozen = ["0", "1", "2", "3"]
    for name, parameter in model.backbone.named_parameters():
        top_level = name.split(".")[0]
        parameter.requires_grad = top_level not in still_frozen

    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.33)

    finetune_epochs = 20
    for epoch in range(finetune_epochs):
        utils.train_one_epoch(model, optimizer, train_loader,
                              device, epoch, print_freq=50,
                              train_loss=train_loss, train_lr=learning_rate)

        lr_scheduler.step()

        utils.evaluate(model, val_loader, device=device, mAP_list=val_mAP)

        # Checkpoints are only written once the epoch index exceeds 10.
        if epoch <= 10:
            continue
        checkpoint = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch,
        }
        torch.save(checkpoint,
                   "./save_weights/mobile-model-{}.pth".format(epoch))

    # Plot loss and learning-rate curves when anything was recorded.
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # Plot the mAP curve when anything was recorded.
    if val_mAP:
        from plot_curve import plot_map
        plot_map(val_mAP)
def main(args):
    """Train a detection model on COCO 2017 and log per-epoch metrics.

    Builds the train/val data loaders, the model, an SGD optimizer and a
    MultiStepLR schedule, optionally resumes from a checkpoint, then trains
    and evaluates once per epoch. A checkpoint is written to
    ``./save_weights`` after every epoch, metrics are appended to a
    timestamped ``results*.txt`` file, and loss/lr/mAP curves are plotted at
    the end.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``device``, ``data_path``, ``batch_size``, ``num_classes``,
            ``lr``, ``momentum``, ``weight_decay``, ``lr_steps``,
            ``lr_gamma``, ``resume``, ``start_epoch`` and ``epochs``.
            ``start_epoch`` is overwritten when resuming from a checkpoint.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # File that collects the COCO metrics, loss and learning rate per epoch.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_transform = {
        "train": transforms.Compose(
            [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    COCO_root = args.data_path

    # load train data set
    # coco2017 -> annotations -> instances_train2017.json
    train_data_set = CocoDetection(COCO_root, "train", data_transform["train"])

    # The collate_fn is custom because every sample is an (image, target)
    # pair that the default collation cannot stack into a batch.
    batch_size = args.batch_size
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 so min() never compares None with an int.
    cpu_count = os.cpu_count() or 1
    nw = min([cpu_count, batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    # load validation data set
    # coco2017 -> annotations -> instances_val2017.json
    val_data_set = CocoDetection(COCO_root, "val", data_transform["val"])
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=nw,
        collate_fn=val_data_set.collate_fn)

    # create model: one extra class (background) on top of args.num_classes
    model = create_model(num_classes=args.num_classes + 1)
    model.to(device)

    train_loss = []
    learning_rate = []
    val_map = []

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    # Resume from a previously saved checkpoint (model, optimizer and
    # scheduler state) when args.resume is set. map_location places the
    # loaded tensors on the training device.
    if args.resume:
        checkpoint = torch.load(args.resume, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    # Checkpoints are written here; create it up front so the first save
    # cannot fail after a full epoch of training.
    os.makedirs("save_weights", exist_ok=True)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model,
                                              optimizer,
                                              train_data_loader,
                                              device,
                                              epoch,
                                              print_freq=50,
                                              warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the validation loader
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        # Append COCO metrics, loss and learning rate to the results file.
        # The learning rate keeps 6 decimals so that small decayed values
        # are not logged as 0.0.
        result_info = [
            str(round(i, 4)) for i in coco_info + [mean_loss.item()]
        ]
        result_info.append(str(round(lr, 6)))
        with open(results_file, "a") as f:
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch
        }
        torch.save(save_files,
                   "./save_weights/mobile-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(args):
    """Train (or only evaluate) a detector on Pascal VOC 2012.

    Distributed mode is initialised through ``utils.init_distributed_mode``.
    When ``args.test_only`` is set the model is evaluated once and the
    function returns. Otherwise it trains from ``args.start_epoch`` to
    ``args.epochs``, saving a checkpoint (when ``args.output_dir`` is set)
    and evaluating after every epoch, and finally prints the elapsed
    training time.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``device``, ``data_path``, ``distributed``,
            ``aspect_ratio_group_factor``, ``batch_size``, ``workers``,
            ``gpu``, ``lr``, ``momentum``, ``weight_decay``, ``lr_steps``,
            ``lr_gamma``, ``resume``, ``start_epoch``, ``test_only``,
            ``epochs``, ``print_freq`` and ``output_dir``.
    """
    print(args)
    utils.init_distributed_mode(args)

    device = torch.device(args.device)

    # ------------------------------------------------------------------ #
    # Data
    # ------------------------------------------------------------------ #
    print("Loading data")

    train_tf = transforms.Compose(
        [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)])
    val_tf = transforms.Compose([transforms.ToTensor()])
    data_transform = {"train": train_tf, "val": val_tf}

    VOC_root = args.data_path
    devkit_dir = os.path.join(VOC_root, "VOCdevkit")
    assert os.path.exists(devkit_dir), \
        "not found VOCdevkit in path:'{}'".format(VOC_root)

    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], True)
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], False)

    print("Creating data loaders")
    if not args.distributed:
        train_sampler = torch.utils.data.RandomSampler(train_data_set)
        test_sampler = torch.utils.data.SequentialSampler(val_data_set)
    else:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data_set)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            val_data_set)

    if args.aspect_ratio_group_factor < 0:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)
    else:
        # Bin index of every image according to its aspect ratio.
        group_ids = create_aspect_ratio_groups(
            train_data_set, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(
            train_sampler, group_ids, args.batch_size)

    data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=1,
        sampler=test_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)

    # ------------------------------------------------------------------ #
    # Model, optimizer, schedule
    # ------------------------------------------------------------------ #
    print("Creating model")
    model = create_model(num_classes=21)
    model.to(device)

    # Keep a handle on the bare module so state dicts are saved and loaded
    # without the DistributedDataParallel wrapper.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    # Resume from a previously saved checkpoint (model, optimizer and
    # scheduler state) when args.resume is set. Loading onto the CPU keeps
    # every process from mapping the tensors back to the device they were
    # saved from.
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        utils.evaluate(model, data_loader_test, device=device)
        return

    # ------------------------------------------------------------------ #
    # Training loop
    # ------------------------------------------------------------------ #
    print("Start training")
    started_at = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        utils.train_one_epoch(model, optimizer, data_loader, device, epoch,
                              args.print_freq)
        lr_scheduler.step()

        if args.output_dir:
            # save_on_master only writes from the main process.
            checkpoint_path = os.path.join(args.output_dir,
                                           'model_{}.pth'.format(epoch))
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch
                },
                checkpoint_path)

        # evaluate after every epoch
        utils.evaluate(model, data_loader_test, device=device)

    elapsed_seconds = int(time.time() - started_at)
    total_time_str = str(datetime.timedelta(seconds=elapsed_seconds))
    print('Training time {}'.format(total_time_str))
def main():
    """Train a detection model on COCO 2017 for 26 epochs.

    Uses ``cuda:0`` when CUDA is available, otherwise the CPU. The learning
    rate starts at 0.001 and is multiplied by 0.1 at the MultiStepLR
    milestones 16 and 22. Metrics are appended to a timestamped
    ``results*.txt`` file, a checkpoint is saved for each of the last 5
    epochs, and loss/lr/mAP curves are plotted at the end.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # File that collects the COCO metrics, loss and learning rate per epoch.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # Make sure the checkpoint directory exists.
    if not os.path.exists("save_weights"):
        os.makedirs("save_weights")

    data_transform = {
        "train": transforms.Compose(
            [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    COCO_root = "/data/coco2017"

    # load train data set
    # coco2017 -> annotations -> instances_train2017.json
    train_data_set = CocoDetection(COCO_root, "train", data_transform["train"])

    # The collate_fn is custom because every sample is an (image, target)
    # pair that the default collation cannot stack into a batch.
    batch_size = 8
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 so min() never compares None with an int.
    cpu_count = os.cpu_count() or 1
    nw = min([cpu_count, batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    # load validation data set
    # coco2017 -> annotations -> instances_val2017.json
    val_data_set = CocoDetection(COCO_root, "val", data_transform["val"])
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=nw,
        collate_fn=val_data_set.collate_fn)

    # create model: num_classes equals background + 80 classes
    model = create_model(num_classes=81)
    model.to(device)

    train_loss = []
    learning_rate = []
    val_map = []

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.001,
                                momentum=0.9,
                                weight_decay=1e-4)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        milestones=[16, 22],
                                                        gamma=0.1)

    num_epochs = 26
    for epoch in range(num_epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model,
                                              optimizer,
                                              train_data_loader,
                                              device,
                                              epoch,
                                              print_freq=50,
                                              warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the validation loader
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        # Append COCO metrics, loss and learning rate to the results file.
        # The learning rate keeps 6 decimals: after the second milestone it
        # is 1e-5, which would be logged as 0.0 at 4 decimals.
        result_info = [
            str(round(i, 4)) for i in coco_info + [mean_loss.item()]
        ]
        result_info.append(str(round(lr, 6)))
        with open(results_file, "a") as f:
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # Only the checkpoints of the last 5 epochs are kept.
        if epoch >= num_epochs - 5:
            save_files = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch
            }
            torch.save(save_files,
                       "./save_weights/mobile-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(args):
    """Train a detection model on COCO, optionally in distributed mode.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``device``, ``data_path``, ``distributed``,
            ``aspect_ratio_group_factor``, ``batch_size``, ``workers``,
            ``num_classes``, ``gpu``, ``lr``, ``momentum``, ``weight_decay``,
            ``lr_steps``, ``lr_gamma``, ``resume``, ``start_epoch``,
            ``epochs``, ``print_freq``, ``rank`` and ``output_dir``.
            ``start_epoch`` is overwritten when resuming from a checkpoint.
    """
    init_distributed_mode(args)
    print(args)

    # Create the checkpoint directory up front so that the first save after a
    # full training epoch cannot fail because the directory is missing.
    # exist_ok makes this safe when several processes run it concurrently.
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    device = torch.device(args.device)

    # File that accumulates the per-epoch evaluation results.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # Data loading code
    print("Loading data")

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    COCO_root = args.data_path

    # load train data set
    # coco2017 -> annotations -> instances_train2017.json
    train_data_set = CocoDetection(COCO_root, "train", data_transform["train"])

    # load validation data set
    # coco2017 -> annotations -> instances_val2017.json
    val_data_set = CocoDetection(COCO_root, "val", data_transform["val"])

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_data_set)
        test_sampler = torch.utils.data.distributed.DistributedSampler(val_data_set)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_data_set)
        test_sampler = torch.utils.data.SequentialSampler(val_data_set)

    if args.aspect_ratio_group_factor >= 0:
        # group the training images by aspect-ratio bin before batching
        group_ids = create_aspect_ratio_groups(train_data_set,
                                               k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        train_data_set, batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=train_data_set.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        val_data_set, batch_size=1,
        sampler=test_sampler, num_workers=args.workers,
        collate_fn=train_data_set.collate_fn)

    print("Creating model")
    # create model; num_classes equals background + object classes
    model = create_model(num_classes=args.num_classes + 1, device=device)
    model.to(device)

    # Keep a handle on the unwrapped model so checkpoints do not depend on
    # the DistributedDataParallel wrapper.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params, lr=args.lr, momentum=args.momentum,
        weight_decay=args.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    # If a resume path (checkpoint of a previous run) is given, continue
    # training from that state.
    if args.resume:
        # If map_location is missing, torch.load will first load the module
        # to CPU and then copy each parameter to where it was saved, which
        # would result in all processes on the same machine using the same
        # set of devices.
        checkpoint = torch.load(args.resume, map_location='cpu')
        # restore model, optimizer and learning-rate schedule
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    train_loss = []
    learning_rate = []
    val_map = []

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        mean_loss, lr = utils.train_one_epoch(model, optimizer, data_loader,
                                              device, epoch, args.print_freq,
                                              warmup=True)

        # update learning rate
        lr_scheduler.step()

        # evaluate after every epoch
        coco_info = utils.evaluate(model, data_loader_test, device=device)

        # only the main process records and writes results
        if args.rank in [-1, 0]:
            train_loss.append(mean_loss.item())
            learning_rate.append(lr)
            val_map.append(coco_info[1])  # pascal mAP

            # append this epoch's results: evaluation values, then loss and lr
            with open(results_file, "a") as f:
                result_info = [str(round(i, 4))
                               for i in coco_info + [mean_loss.item(), lr]]
                txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
                f.write(txt + "\n")

        if args.output_dir:
            # checkpoint saving is delegated to save_on_master
            save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch},
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

    if args.rank in [-1, 0]:
        # plot loss and lr curve
        if train_loss and learning_rate:
            from plot_curve import plot_loss_and_lr
            plot_loss_and_lr(train_loss, learning_rate)

        # plot mAP curve
        if val_map:
            from plot_curve import plot_map
            plot_map(val_map)
def main(args):
    """Train (or only evaluate) a detection model on Pascal VOC.

    Supports distributed training. When ``args.test_only`` is set, the model
    is evaluated once on the validation loader and the function returns
    without training.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``device``, ``data_path``, ``distributed``,
            ``aspect_ratio_group_factor``, ``batch_size``, ``workers``,
            ``num_classes``, ``gpu``, ``lr``, ``momentum``, ``weight_decay``,
            ``lr_step_size``, ``lr_gamma``, ``resume``, ``test_only``,
            ``start_epoch``, ``epochs``, ``print_freq``, ``rank`` and
            ``output_dir``. ``start_epoch`` is overwritten when resuming.

    Raises:
        FileNotFoundError: if ``VOCdevkit`` is not found under
            ``args.data_path``.
    """
    print(args)
    init_distributed_mode(args)

    # Create the checkpoint directory up front so that the first save after a
    # full training epoch cannot fail because the directory is missing.
    # exist_ok makes this safe when several processes run it concurrently.
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    device = torch.device(args.device)

    # File that accumulates the per-epoch evaluation results.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # Data loading code
    print("Loading data")

    data_transform = {
        "train": transform.Compose([transform.SSDCropping(),
                                    transform.Resize(),
                                    transform.ColorJitter(),
                                    transform.ToTensor(),
                                    transform.RandomHorizontalFlip(),
                                    transform.Normalization(),
                                    transform.AssignGTtoDefaultBox()]),
        "val": transform.Compose([transform.Resize(),
                                  transform.ToTensor(),
                                  transform.Normalization()])
    }

    VOC_root = args.data_path
    # check voc root
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # load train data set
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"],
                                    train_set='train.txt')

    # load validation data set
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"],
                                  train_set='val.txt')

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_data_set)
        test_sampler = torch.utils.data.distributed.DistributedSampler(val_data_set)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_data_set)
        test_sampler = torch.utils.data.SequentialSampler(val_data_set)

    if args.aspect_ratio_group_factor >= 0:
        # group the training images by aspect-ratio bin before batching
        group_ids = create_aspect_ratio_groups(train_data_set,
                                               k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        train_data_set, batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=train_data_set.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        val_data_set, batch_size=1,
        sampler=test_sampler, num_workers=args.workers,
        collate_fn=train_data_set.collate_fn)

    print("Creating model")
    model = create_model(num_classes=args.num_classes + 1, device=device)
    # Idempotent: guarantees the model lives on `device` before the DDP wrap,
    # whether or not create_model already moved it there.
    model.to(device)

    # Keep a handle on the unwrapped model so checkpoints do not depend on
    # the DistributedDataParallel wrapper.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params, lr=args.lr, momentum=args.momentum,
        weight_decay=args.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=args.lr_step_size,
                                                   gamma=args.lr_gamma)

    # If a resume path (checkpoint of a previous run) is given, continue
    # training from that state.
    if args.resume:
        # If map_location is missing, torch.load will first load the module
        # to CPU and then copy each parameter to where it was saved, which
        # would result in all processes on the same machine using the same
        # set of devices.
        checkpoint = torch.load(args.resume, map_location='cpu')
        # restore model, optimizer and learning-rate schedule
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        utils.evaluate(model, data_loader_test, device=device)
        return

    train_loss = []
    learning_rate = []
    val_map = []

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        mean_loss, lr = utils.train_one_epoch(model, optimizer, data_loader,
                                              device, epoch, args.print_freq)

        # only first process to save training info
        if args.rank in [-1, 0]:
            train_loss.append(mean_loss.item())
            learning_rate.append(lr)

        # update learning rate
        lr_scheduler.step()

        # evaluate after every epoch
        coco_info = utils.evaluate(model, data_loader_test, device=device)

        if args.rank in [-1, 0]:
            # append this epoch's results: evaluation values, then loss and lr
            with open(results_file, "a") as f:
                result_info = [str(round(i, 4))
                               for i in coco_info + [mean_loss.item(), lr]]
                txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
                f.write(txt + "\n")

            val_map.append(coco_info[1])  # pascal mAP

        if args.output_dir:
            # checkpoint saving is delegated to save_on_master
            save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch},
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

    if args.rank in [-1, 0]:
        # plot loss and lr curve
        if train_loss and learning_rate:
            from plot_curve import plot_loss_and_lr
            plot_loss_and_lr(train_loss, learning_rate)

        # plot mAP curve
        if val_map:
            from plot_curve import plot_map
            plot_map(val_map)
def main(parser_data):
    """Train a detection model on Pascal VOC, optionally resuming.

    Args:
        parser_data: parsed command-line namespace. The attributes read here
            are ``device``, ``data_path``, ``resume`` (empty string means
            "start from scratch"), ``start_epoch`` and ``epochs``.
            ``start_epoch`` is overwritten when resuming from a checkpoint.

    Raises:
        AssertionError: if ``VOCdevkit`` is not found under
            ``parser_data.data_path``.
    """
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print(device)

    # Checkpoints are written to ./save_weights after every epoch; make sure
    # the directory exists so the first save cannot fail.
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = parser_data.data_path
    assert os.path.exists(os.path.join(VOC_root, "VOCdevkit")), \
        "not found VOCdevkit in path:'{}'".format(VOC_root)

    # load train data set
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], True)
    # NOTE: a custom collate_fn is used because every sample holds an image
    # and its targets, which the default collation cannot merge into a batch.
    train_data_loader = torch.utils.data.DataLoader(train_data_set,
                                                    batch_size=1,
                                                    shuffle=False,
                                                    num_workers=0,
                                                    collate_fn=utils.collate_fn)

    # load validation data set
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], False)
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set,
                                                      batch_size=2,
                                                      shuffle=False,
                                                      num_workers=0,
                                                      collate_fn=utils.collate_fn)

    # create model; num_classes equals background + 20 classes
    model = create_model(num_classes=21)
    model.to(device)

    # define optimizer over the trainable parameters only
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.33)

    # If a checkpoint from a previous run is given, continue from it.
    if parser_data.resume != "":
        # Load onto the CPU first: without map_location, a checkpoint saved
        # from a GPU cannot be deserialized when CUDA is unavailable.
        # load_state_dict then copies the values onto the model's device.
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_mAP = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 50 iterations; the loss and
        # learning-rate lists are passed in to be filled by the helper
        utils.train_one_epoch(model, optimizer, train_data_loader,
                              device, epoch, train_loss=train_loss,
                              train_lr=learning_rate, print_freq=50,
                              warmup=True)
        # update the learning rate
        lr_scheduler.step()

        # evaluate on the validation loader
        utils.evaluate(model, val_data_set_loader, device=device,
                       mAP_list=val_mAP)

        # save weights after every epoch
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files,
                   "./save_weights/resNetFpn-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_mAP:
        from plot_curve import plot_map
        plot_map(val_mAP)
def main(parser_data):
    """Evaluate a detection model on the night test split.

    Builds the test loader, creates the model, optionally restores a
    checkpoint, and then calls ``utils.evaluate`` once per epoch index in
    ``range(parser_data.start_epoch, parser_data.epochs)``.

    Args:
        parser_data: parsed command-line namespace. The attributes read here
            are ``device``, ``data_path``, ``resume`` (empty string means
            "no checkpoint"), ``start_epoch`` and ``epochs``.
            ``start_epoch`` is overwritten when a checkpoint is loaded.
    """
    device = torch.device(
        parser_data.device if torch.cuda.is_available() else "cpu")
    print(device)

    # exist_ok avoids the race between a separate existence check and mkdir.
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "test": transform.Compose([transform.Resize(),
                                   transform.ToTensor(),
                                   transform.Normalization()])
    }

    night_root = parser_data.data_path
    test_dataset = NightDataSet(night_root, data_transform['test'],
                                train_set='test.txt')
    test_data_loader = torch.utils.data.DataLoader(test_dataset,
                                                   batch_size=4,
                                                   shuffle=False,
                                                   num_workers=0,
                                                   collate_fn=utils.collate_fn)

    model = create_model(num_classes=3, device=device)
    print(model)
    model.to(device)

    # The optimizer and scheduler exist only so that their state can be
    # restored from the checkpoint below; no training step is run here.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.5)

    # If a checkpoint from a previous run is given, restore it.
    if parser_data.resume != "":
        # Load onto the CPU first: without map_location, a checkpoint saved
        # from a GPU cannot be deserialized when CUDA is unavailable.
        # load_state_dict then copies the values onto the model's device.
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(
            parser_data.start_epoch))

    test_val_map = []
    val_data = None

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # NOTE(review): the model is not updated between iterations, so each
        # pass evaluates the same weights - confirm this loop is intended.
        utils.evaluate(model=model, data_loader=test_data_loader,
                       device=device, data_set=val_data,
                       mAP_list=test_val_map)