def main(args): init_distributed_mode(args) print(args) device = torch.device(args.device) # 用来保存coco_info的文件 results_file = "results{}.txt".format( datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) # Data loading code print("Loading data") data_transform = { "train": transforms.Compose( [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } COCO_root = args.data_path # load train data set # coco2017 -> annotations -> instances_train2017.json train_data_set = CocoDetection(COCO_root, "train", data_transform["train"]) # load validation data set # coco2017 -> annotations -> instances_val2017.json val_data_set = CocoDetection(COCO_root, "val", data_transform["val"]) print("Creating data loaders") if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_data_set) test_sampler = torch.utils.data.distributed.DistributedSampler( val_data_set) else: train_sampler = torch.utils.data.RandomSampler(train_data_set) test_sampler = torch.utils.data.SequentialSampler(val_data_set) if args.aspect_ratio_group_factor >= 0: # 统计所有图像比例在bins区间中的位置索引 group_ids = create_aspect_ratio_groups( train_data_set, k=args.aspect_ratio_group_factor) train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) else: train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True) data_loader = torch.utils.data.DataLoader( train_data_set, batch_sampler=train_batch_sampler, num_workers=args.workers, collate_fn=train_data_set.collate_fn) data_loader_test = torch.utils.data.DataLoader( val_data_set, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=train_data_set.collate_fn) print("Creating model") # create model num_classes equal background + 80 classes model = create_model(num_classes=args.num_classes + 1, device=device) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=args.lr_steps, gamma=args.lr_gamma) # 如果传入resume参数,即上次训练的权重地址,则接着上次的参数训练 if args.resume: # If map_location is missing, torch.load will first load the module to CPU # and then copy each parameter to where it was saved, # which would result in all processes on the same machine using the same set of devices. checkpoint = torch.load( args.resume, map_location='cpu') # 读取之前保存的权重文件(包括优化器以及学习率策略) model_without_ddp.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 train_loss = [] learning_rate = [] val_map = [] print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) mean_loss, lr = utils.train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq, warmup=True) # update learning rate lr_scheduler.step() # evaluate after every epoch coco_info = utils.evaluate(model, data_loader_test, device=device) # 只在主进程上进行写操作 if args.rank in [-1, 0]: train_loss.append(mean_loss.item()) learning_rate.append(lr) val_map.append(coco_info[1]) # pascal mAP # write into txt with open(results_file, "a") as f: # 写入的数据包括coco指标还有loss和learning rate result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr] ] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") if args.output_dir: # 只在主节点上执行保存权重操作 save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'args': args, 'epoch': epoch }, os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str)) if args.rank in [-1, 0]: # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_map) != 0: from plot_curve import plot_map plot_map(val_map)
def main(parser_data): device = torch.device( parser_data.device if torch.cuda.is_available() else "cpu") print("Using {} device training.".format(device.type)) # 用来保存coco_info的文件 results_file = "results{}.txt".format( datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) data_transform = { "train": transforms.Compose( [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } VOC_root = parser_data.data_path # check voc root if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False: raise FileNotFoundError( "VOCdevkit dose not in path:'{}'.".format(VOC_root)) # load train data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt train_dataset = VOCDataSet(VOC_root, "2012", data_transform["train"], "train.txt") train_sampler = None # 是否按图片相似高宽比采样图片组成batch # 使用的话能够减小训练时所需GPU显存,默认使用 if args.aspect_ratio_group_factor >= 0: train_sampler = torch.utils.data.RandomSampler(train_dataset) # 统计所有图像高宽比例在bins区间中的位置索引 group_ids = create_aspect_ratio_groups( train_dataset, k=args.aspect_ratio_group_factor) # 每个batch图片从同一高宽比例区间中取 train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch batch_size = parser_data.batch_size nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers print('Using %g dataloader workers' % nw) if train_sampler: # 如果按照图片高宽比采样图片,dataloader中需要使用batch_sampler train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_sampler=train_batch_sampler, pin_memory=True, num_workers=nw, collate_fn=train_dataset.collate_fn) else: train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=nw, collate_fn=train_dataset.collate_fn) # load validation data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt val_dataset = VOCDataSet(VOC_root, "2012", data_transform["val"], "val.txt") val_data_set_loader = torch.utils.data.DataLoader( val_dataset, batch_size=1, shuffle=False, pin_memory=True, num_workers=nw, collate_fn=val_dataset.collate_fn) # create model num_classes equal background + 20 classes model = create_model(num_classes=parser_data.num_classes + 1) # print(model) model.to(device) # define optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) # learning rate scheduler lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.33) # 如果指定了上次训练保存的权重文件地址,则接着上次结果接着训练 if parser_data.resume != "": checkpoint = torch.load(parser_data.resume, map_location='cpu') model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) parser_data.start_epoch = checkpoint['epoch'] + 1 print("the training process from epoch{}...".format( parser_data.start_epoch)) train_loss = [] learning_rate = [] val_map = [] for epoch in range(parser_data.start_epoch, parser_data.epochs): # train for one epoch, printing every 10 iterations mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader, device=device, epoch=epoch, print_freq=50, warmup=True) train_loss.append(mean_loss.item()) learning_rate.append(lr) # update the learning rate lr_scheduler.step() # evaluate on the test dataset coco_info = utils.evaluate(model, val_data_set_loader, device=device) # write into txt with open(results_file, "a") as f: # 写入的数据包括coco指标还有loss和learning rate result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss.item()] ] + [str(round(lr, 6))] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") val_map.append(coco_info[1]) # pascal mAP # save weights save_files = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch } torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch)) # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_map) != 0: from plot_curve import plot_map plot_map(val_map)
def main(): device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("Using {} device training.".format(device.type)) # 用来保存coco_info的文件 results_file = "results{}.txt".format( datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) # 检查保存权重文件夹是否存在,不存在则创建 if not os.path.exists("save_weights"): os.makedirs("save_weights") data_transform = { "train": transforms.Compose( [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } VOC_root = "./" # VOCdevkit aspect_ratio_group_factor = 3 batch_size = 8 # check voc root if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False: raise FileNotFoundError( "VOCdevkit dose not in path:'{}'.".format(VOC_root)) # load train data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt train_dataset = VOCDataSet(VOC_root, "2012", data_transform["train"], "train.txt") train_sampler = None # 是否按图片相似高宽比采样图片组成batch # 使用的话能够减小训练时所需GPU显存,默认使用 if aspect_ratio_group_factor >= 0: train_sampler = torch.utils.data.RandomSampler(train_dataset) # 统计所有图像高宽比例在bins区间中的位置索引 group_ids = create_aspect_ratio_groups(train_dataset, k=aspect_ratio_group_factor) # 每个batch图片从同一高宽比例区间中取 train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, batch_size) nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers print('Using %g dataloader workers' % nw) # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch if train_sampler: # 如果按照图片高宽比采样图片,dataloader中需要使用batch_sampler train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_sampler=train_batch_sampler, pin_memory=True, num_workers=nw, collate_fn=train_dataset.collate_fn) else: train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=nw, collate_fn=train_dataset.collate_fn) # load validation data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt val_dataset = VOCDataSet(VOC_root, "2012", data_transform["val"], "val.txt") val_data_loader = torch.utils.data.DataLoader( val_dataset, batch_size=1, shuffle=False, pin_memory=True, num_workers=nw, collate_fn=val_dataset.collate_fn) # create model num_classes equal background + 20 classes model = create_model(num_classes=21) # print(model) model.to(device) train_loss = [] learning_rate = [] val_map = [] # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # first frozen backbone and train 5 epochs # # 首先冻结前置特征提取网络权重(backbone),训练rpn以及最终预测网络部分 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # for param in model.backbone.parameters(): param.requires_grad = False # define optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) init_epochs = 5 for epoch in range(init_epochs): # train for one epoch, printing every 10 iterations mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=50, warmup=True) train_loss.append(mean_loss.item()) learning_rate.append(lr) # evaluate on the test dataset coco_info = utils.evaluate(model, val_data_loader, device=device) # write into txt with open(results_file, "a") as f: # 写入的数据包括coco指标还有loss和learning rate result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss.item()] ] + [str(round(lr, 6))] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") val_map.append(coco_info[1]) # pascal mAP torch.save(model.state_dict(), "./save_weights/pretrain.pth") # # # # # # # # # # # # # # # # # # # # # # # # # # # # # second unfrozen backbone and train all network # # 解冻前置特征提取网络权重(backbone),接着训练整个网络权重 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 冻结backbone部分底层权重 for name, parameter in model.backbone.named_parameters(): split_name = name.split(".")[0] if split_name in ["0", "1", "2", "3"]: parameter.requires_grad = False else: parameter.requires_grad = True # define optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) # learning rate scheduler lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.33) num_epochs = 20 for epoch in range(init_epochs, num_epochs + init_epochs, 1): # train for one epoch, printing every 50 iterations mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=50, warmup=True) train_loss.append(mean_loss.item()) learning_rate.append(lr) # update the learning rate lr_scheduler.step() # evaluate on the test dataset coco_info = utils.evaluate(model, val_data_loader, device=device) # write into txt with open(results_file, "a") as f: # 写入的数据包括coco指标还有loss和learning rate result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss.item()] ] + [str(round(lr, 6))] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") val_map.append(coco_info[1]) # pascal mAP # save weights # 仅保存最后5个epoch的权重 if epoch in range(num_epochs + init_epochs)[-5:]: save_files = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch } torch.save(save_files, "./save_weights/mobile-model-{}.pth".format(epoch)) # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_map) != 0: from plot_curve import plot_map plot_map(val_map)
def main(args): print(args) # mp.spawn(main_worker, args=(args,), nprocs=args.world_size, join=True) init_distributed_mode(args) device = torch.device(args.device) results_file = "results{}.txt".format( datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) # Data loading code print("Loading data") data_transform = { "train": transform.Compose([ transform.SSDCropping(), transform.Resize(), transform.ColorJitter(), transform.ToTensor(), transform.RandomHorizontalFlip(), transform.Normalization(), transform.AssignGTtoDefaultBox() ]), "val": transform.Compose([ transform.Resize(), transform.ToTensor(), transform.Normalization() ]) } VOC_root = args.data_path # check voc root if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False: raise FileNotFoundError( "VOCdevkit dose not in path:'{}'.".format(VOC_root)) # load train data set train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], train_set='train.txt') # load validation data set val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], train_set='val.txt') print("Creating data loaders") if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_data_set) test_sampler = torch.utils.data.distributed.DistributedSampler( val_data_set) else: train_sampler = torch.utils.data.RandomSampler(train_data_set) test_sampler = torch.utils.data.SequentialSampler(val_data_set) if args.aspect_ratio_group_factor >= 0: # 统计所有图像比例在bins区间中的位置索引 group_ids = create_aspect_ratio_groups( train_data_set, k=args.aspect_ratio_group_factor) train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) else: train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True) data_loader = torch.utils.data.DataLoader( train_data_set, batch_sampler=train_batch_sampler, num_workers=args.workers, collate_fn=train_data_set.collate_fn) data_loader_test = torch.utils.data.DataLoader( val_data_set, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=train_data_set.collate_fn) print("Creating model") model = create_model(num_classes=args.num_classes + 1, device=device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma) # 如果传入resume参数,即上次训练的权重地址,则接着上次的参数训练 if args.resume: # If map_location is missing, torch.load will first load the module to CPU # and then copy each parameter to where it was saved, # which would result in all processes on the same machine using the same set of devices. checkpoint = torch.load( args.resume, map_location='cpu') # 读取之前保存的权重文件(包括优化器以及学习率策略) model_without_ddp.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.test_only: utils.evaluate(model, data_loader_test, device=device) return train_loss = [] learning_rate = [] val_map = [] print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) mean_loss, lr = utils.train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq) # only first process to save training info if args.rank in [-1, 0]: train_loss.append(mean_loss.item()) learning_rate.append(lr) # update learning rate lr_scheduler.step() # evaluate after every epoch coco_info = utils.evaluate(model, data_loader_test, device=device) if args.rank in [-1, 0]: # write into txt with open(results_file, "a") as f: result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr] ] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") val_map.append(coco_info[1]) # pascal mAP if args.output_dir: # 只在主节点上执行保存权重操作 save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'args': args, 'epoch': epoch }, os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str)) if args.rank in [-1, 0]: # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_map) != 0: from plot_curve import plot_map plot_map(val_map)
def main(args): print(args) # mp.spawn(main_worker, args=(args,), nprocs=args.world_size, join=True) init_distributed_mode(args) device = torch.device(args.device) # Data loading code print("Loading data") data_transform = { "train": transforms.Compose( [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } VOC_root = args.data_path # check voc root if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False: raise FileNotFoundError( "VOCdevkit dose not in path:'{}'.".format(VOC_root)) # load train data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], "train.txt") # load validation data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], "val.txt") print("Creating data loaders") if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_data_set) test_sampler = torch.utils.data.distributed.DistributedSampler( val_data_set) else: train_sampler = torch.utils.data.RandomSampler(train_data_set) test_sampler = torch.utils.data.SequentialSampler(val_data_set) if args.aspect_ratio_group_factor >= 0: # 统计所有图像比例在bins区间中的位置索引 group_ids = create_aspect_ratio_groups( train_data_set, k=args.aspect_ratio_group_factor) train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) else: train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True) data_loader = torch.utils.data.DataLoader( train_data_set, batch_sampler=train_batch_sampler, num_workers=args.workers, collate_fn=train_data_set.collate_fn) data_loader_test = torch.utils.data.DataLoader( val_data_set, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=train_data_set.collate_fn) print("Creating model") model = create_model(num_classes=21, device=device) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=args.lr_steps, gamma=args.lr_gamma) # 如果传入resume参数,即上次训练的权重地址,则接着上次的参数训练 if args.resume: # If map_location is missing, torch.load will first load the module to CPU # and then copy each parameter to where it was saved, # which would result in all processes on the same machine using the same set of devices. checkpoint = torch.load( args.resume, map_location='cpu') # 读取之前保存的权重文件(包括优化器以及学习率策略) model_without_ddp.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.test_only: utils.evaluate(model, data_loader_test, device=device) return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) utils.train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq) lr_scheduler.step() if args.output_dir: # 只在主节点上执行保存权重操作 save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'args': args, 'epoch': epoch }, os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) # evaluate after every epoch utils.evaluate(model, data_loader_test, device=device) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))