def main(parser_data):
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    if not os.path.exists("save_weights"):
        os.mkdir("save_weights")

    data_transform = {
        "train": transform.Compose([transform.SSDCropping(),
                                    transform.Resize(),
                                    transform.ColorJitter(),
                                    transform.ToTensor(),
                                    transform.RandomHorizontalFlip(),
                                    transform.Normalization(),
                                    transform.AssignGTtoDefaultBox()]),
        "val": transform.Compose([transform.Resize(),
                                  transform.ToTensor(),
                                  transform.Normalization()])
    }

    VOC_root = parser_data.data_path
    # check voc root
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError("VOCdevkit does not exist in path: '{}'.".format(VOC_root))

    train_dataset = VOC2012DataSet(VOC_root, data_transform['train'], train_set='train.txt')
    # Note: batch_size must be greater than 1 during training.
    batch_size = parser_data.batch_size
    assert batch_size > 1, "batch size must be greater than 1"
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    num_workers=nw,
                                                    collate_fn=train_dataset.collate_fn)

    val_dataset = VOC2012DataSet(VOC_root, data_transform['val'], train_set='val.txt')
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=nw,
                                                  collate_fn=train_dataset.collate_fn)

    model = create_model(num_classes=21, device=device)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.0005, momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.3)

    # If a checkpoint from a previous run is specified, resume training from it.
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch {}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []

    val_data = None
    # If memory allows, load the validation annotations once up front so they are not
    # re-parsed on every evaluation pass.
    # val_data = get_coco_api_from_dataset(val_data_loader.dataset)

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        utils.train_one_epoch(model=model, optimizer=optimizer,
                              data_loader=train_data_loader,
                              device=device, epoch=epoch, print_freq=50,
                              train_loss=train_loss, train_lr=learning_rate)

        lr_scheduler.step()

        utils.evaluate(model=model, data_loader=val_data_loader,
                       device=device, data_set=val_data, mAP_list=val_map)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/ssd300-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)
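# A minimal sketch of the argparse setup assumed to produce `parser_data` for the script
# above. The option names (device, data_path, batch_size, epochs, start_epoch, resume)
# are inferred from how parser_data is used; the defaults are illustrative and the
# original launcher may differ.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="SSD training (illustrative sketch)")
    parser.add_argument('--device', default='cuda:0', help='device id, e.g. cuda:0 or cpu')
    parser.add_argument('--data-path', dest='data_path', default='./', help='directory containing VOCdevkit')
    parser.add_argument('--batch-size', dest='batch_size', default=4, type=int)
    parser.add_argument('--epochs', default=15, type=int, help='total number of training epochs')
    parser.add_argument('--start-epoch', dest='start_epoch', default=0, type=int, help='epoch to start from')
    parser.add_argument('--resume', default='', help='checkpoint to resume from; empty trains from scratch')

    main(parser.parse_args())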
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # file used to record the coco_info of every epoch
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # create the weight-saving directory if it does not exist yet
    if not os.path.exists("save_weights"):
        os.makedirs("save_weights")

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    COCO_root = "/data/coco2017"

    # load train data set
    # coco2017 -> annotations -> instances_train2017.json
    train_data_set = CocoDetection(COCO_root, "train", data_transform["train"])
    # Note: collate_fn is custom because each sample is an (image, targets) pair
    # and cannot be batched with the default collate.
    batch_size = 8
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_data_set,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    pin_memory=True,
                                                    num_workers=nw,
                                                    collate_fn=train_data_set.collate_fn)

    # load validation data set
    # coco2017 -> annotations -> instances_val2017.json
    val_data_set = CocoDetection(COCO_root, "val", data_transform["val"])
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      pin_memory=True,
                                                      num_workers=nw,
                                                      collate_fn=train_data_set.collate_fn)

    # create model: num_classes equals background + 80 classes
    model = create_model(num_classes=81)
    # print(model)
    model.to(device)

    train_loss = []
    learning_rate = []
    val_map = []

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=1e-4)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[16, 22], gamma=0.1)

    num_epochs = 26
    for epoch in range(num_epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
                                              device, epoch, print_freq=50, warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        # write into txt
        with open(results_file, "a") as f:
            # the recorded data are the coco metrics plus loss and learning rate
            result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr]]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights (only for the last 5 epochs)
        if epoch in range(num_epochs)[-5:]:
            save_files = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch}
            torch.save(save_files, "./save_weights/mobile-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)
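# The comment above notes that a custom collate_fn is required because every sample is
# an (image, targets) pair whose targets vary in size. A minimal sketch of such a
# collate_fn (the dataset's actual method may differ):
def collate_fn(batch):
    # list of (image, target) pairs -> (tuple of images, tuple of targets)
    return tuple(zip(*batch))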
def main(args): print(args) # mp.spawn(main_worker, args=(args,), nprocs=args.world_size, join=True) init_distributed_mode(args) device = torch.device(args.device) results_file = "results{}.txt".format( datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) # Data loading code print("Loading data") data_transform = { "train": transform.Compose([ transform.SSDCropping(), transform.Resize(), transform.ColorJitter(), transform.ToTensor(), transform.RandomHorizontalFlip(), transform.Normalization(), transform.AssignGTtoDefaultBox() ]), "val": transform.Compose([ transform.Resize(), transform.ToTensor(), transform.Normalization() ]) } VOC_root = args.data_path # check voc root if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False: raise FileNotFoundError( "VOCdevkit dose not in path:'{}'.".format(VOC_root)) # load train data set train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], train_set='train.txt') # load validation data set val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], train_set='val.txt') print("Creating data loaders") if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_data_set) test_sampler = torch.utils.data.distributed.DistributedSampler( val_data_set) else: train_sampler = torch.utils.data.RandomSampler(train_data_set) test_sampler = torch.utils.data.SequentialSampler(val_data_set) if args.aspect_ratio_group_factor >= 0: # 统计所有图像比例在bins区间中的位置索引 group_ids = create_aspect_ratio_groups( train_data_set, k=args.aspect_ratio_group_factor) train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) else: train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True) data_loader = torch.utils.data.DataLoader( train_data_set, batch_sampler=train_batch_sampler, num_workers=args.workers, collate_fn=train_data_set.collate_fn) data_loader_test = torch.utils.data.DataLoader( val_data_set, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=train_data_set.collate_fn) print("Creating model") model = create_model(num_classes=args.num_classes + 1, device=device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma) # 如果传入resume参数,即上次训练的权重地址,则接着上次的参数训练 if args.resume: # If map_location is missing, torch.load will first load the module to CPU # and then copy each parameter to where it was saved, # which would result in all processes on the same machine using the same set of devices. 
checkpoint = torch.load( args.resume, map_location='cpu') # 读取之前保存的权重文件(包括优化器以及学习率策略) model_without_ddp.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.test_only: utils.evaluate(model, data_loader_test, device=device) return train_loss = [] learning_rate = [] val_map = [] print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) mean_loss, lr = utils.train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq) # only first process to save training info if args.rank in [-1, 0]: train_loss.append(mean_loss.item()) learning_rate.append(lr) # update learning rate lr_scheduler.step() # evaluate after every epoch coco_info = utils.evaluate(model, data_loader_test, device=device) if args.rank in [-1, 0]: # write into txt with open(results_file, "a") as f: result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr] ] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") val_map.append(coco_info[1]) # pascal mAP if args.output_dir: # 只在主节点上执行保存权重操作 save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'args': args, 'epoch': epoch }, os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str)) if args.rank in [-1, 0]: # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_map) != 0: from plot_curve import plot_map plot_map(val_map)
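# init_distributed_mode(args) is called above but not shown. A common implementation,
# assuming the script is launched with torchrun / torch.distributed.launch (which set
# RANK, WORLD_SIZE and LOCAL_RANK), is sketched below; the original helper may differ.
import os
import torch
import torch.distributed as dist


def init_distributed_mode(args):
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ['RANK'])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    else:
        print('Not using distributed mode')
        args.distributed = False
        args.rank = -1
        return

    args.distributed = True
    torch.cuda.set_device(args.gpu)
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=args.world_size, rank=args.rank)
    dist.barrier()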
def main(args): init_distributed_mode(args) print(args) device = torch.device(args.device) # 用来保存coco_info的文件 results_file = "results{}.txt".format( datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) # Data loading code print("Loading data") data_transform = { "train": transforms.Compose( [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } COCO_root = args.data_path # load train data set # coco2017 -> annotations -> instances_train2017.json train_data_set = CocoDetection(COCO_root, "train", data_transform["train"]) # load validation data set # coco2017 -> annotations -> instances_val2017.json val_data_set = CocoDetection(COCO_root, "val", data_transform["val"]) print("Creating data loaders") if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_data_set) test_sampler = torch.utils.data.distributed.DistributedSampler( val_data_set) else: train_sampler = torch.utils.data.RandomSampler(train_data_set) test_sampler = torch.utils.data.SequentialSampler(val_data_set) if args.aspect_ratio_group_factor >= 0: # 统计所有图像比例在bins区间中的位置索引 group_ids = create_aspect_ratio_groups( train_data_set, k=args.aspect_ratio_group_factor) train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) else: train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True) data_loader = torch.utils.data.DataLoader( train_data_set, batch_sampler=train_batch_sampler, num_workers=args.workers, collate_fn=train_data_set.collate_fn) data_loader_test = torch.utils.data.DataLoader( val_data_set, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=train_data_set.collate_fn) print("Creating model") # create model num_classes equal background + 80 classes model = create_model(num_classes=args.num_classes + 1, device=device) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=args.lr_steps, gamma=args.lr_gamma) # 如果传入resume参数,即上次训练的权重地址,则接着上次的参数训练 if args.resume: # If map_location is missing, torch.load will first load the module to CPU # and then copy each parameter to where it was saved, # which would result in all processes on the same machine using the same set of devices. 
checkpoint = torch.load( args.resume, map_location='cpu') # 读取之前保存的权重文件(包括优化器以及学习率策略) model_without_ddp.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 train_loss = [] learning_rate = [] val_map = [] print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) mean_loss, lr = utils.train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq, warmup=True) # update learning rate lr_scheduler.step() # evaluate after every epoch coco_info = utils.evaluate(model, data_loader_test, device=device) # 只在主进程上进行写操作 if args.rank in [-1, 0]: train_loss.append(mean_loss.item()) learning_rate.append(lr) val_map.append(coco_info[1]) # pascal mAP # write into txt with open(results_file, "a") as f: # 写入的数据包括coco指标还有loss和learning rate result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr] ] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") if args.output_dir: # 只在主节点上执行保存权重操作 save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'args': args, 'epoch': epoch }, os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str)) if args.rank in [-1, 0]: # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_map) != 0: from plot_curve import plot_map plot_map(val_map)
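# save_on_master is used above so that only the main process writes checkpoints. A
# minimal sketch of such a helper, assuming torch.distributed is initialized for
# multi-GPU runs and skipped otherwise:
import torch
import torch.distributed as dist


def is_main_process():
    if not (dist.is_available() and dist.is_initialized()):
        return True  # single-process run
    return dist.get_rank() == 0


def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)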
def main(parser_data):
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print(device)

    if not os.path.exists("save_weights"):
        os.mkdir("save_weights")

    data_transform = {
        "train": transform.Compose([transform.SSDCropping(),
                                    transform.Resize(),
                                    transform.ColorJitter(),
                                    transform.ToTensor(),
                                    transform.RandomHorizontalFlip(),
                                    transform.Normalization(),
                                    transform.AssignGTtoDefaultBox()]),
        "val": transform.Compose([transform.Resize(),
                                  transform.ToTensor(),
                                  transform.Normalization()])
    }

    night_root = parser_data.data_path
    train_dataset = NightDataSet(night_root, data_transform['train'], train_set='train.txt')
    # aa = train_dataset[1]

    # Note: batch_size must be greater than 1 during training.
    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=8,
                                                    shuffle=True,
                                                    num_workers=4,
                                                    collate_fn=utils.collate_fn)

    val_dataset = NightDataSet(night_root, data_transform['val'], train_set='val.txt')
    # bb = val_dataset[2]
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=4,
                                                  shuffle=False,
                                                  num_workers=0,
                                                  collate_fn=utils.collate_fn)

    model = create_model(num_classes=3, device=device)
    print(model)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    # If a checkpoint from a previous run is specified, resume training from it.
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch {}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []
    train_val_map = []

    val_data = None
    # If memory allows, load the validation annotations once up front so they are not
    # re-parsed on every evaluation pass.
    # val_data = get_coco_api_from_dataset(val_data_loader.dataset)

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        utils.train_one_epoch(model=model, optimizer=optimizer,
                              data_loader=train_data_loader,
                              device=device, epoch=epoch, print_freq=50,
                              train_loss=train_loss, train_lr=learning_rate)

        lr_scheduler.step()

        if epoch >= 20 or epoch == 10:
            utils.evaluate(model=model, data_loader=val_data_loader,
                           device=device, data_set=val_data, mAP_list=val_map)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/ssd512-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)
def main(): device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("Using {} device training.".format(device.type)) # 用来保存coco_info的文件 results_file = "results{}.txt".format( datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) # 检查保存权重文件夹是否存在,不存在则创建 if not os.path.exists("save_weights"): os.makedirs("save_weights") data_transform = { "train": transforms.Compose( [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } VOC_root = "./" # check voc root if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False: raise FileNotFoundError( "VOCdevkit dose not in path:'{}'.".format(VOC_root)) # load train data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], "train.txt") # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch batch_size = 8 nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers print('Using %g dataloader workers' % nw) train_data_loader = torch.utils.data.DataLoader( train_data_set, batch_size=batch_size, shuffle=True, num_workers=nw, collate_fn=train_data_set.collate_fn) # load validation data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], "val.txt") val_data_set_loader = torch.utils.data.DataLoader( val_data_set, batch_size=batch_size, shuffle=False, num_workers=nw, collate_fn=train_data_set.collate_fn) # create model num_classes equal background + 20 classes model = create_model(num_classes=21) # print(model) model.to(device) train_loss = [] learning_rate = [] val_map = [] # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # first frozen backbone and train 5 epochs # # 首先冻结前置特征提取网络权重(backbone),训练rpn以及最终预测网络部分 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # for param in model.backbone.parameters(): param.requires_grad = False # define optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) init_epochs = 5 for epoch in range(init_epochs): # train for one epoch, printing every 10 iterations mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=50, warmup=True) train_loss.append(mean_loss.item()) learning_rate.append(lr) # evaluate on the test dataset coco_info = utils.evaluate(model, val_data_set_loader, device=device) # write into txt with open(results_file, "a") as f: # 写入的数据包括coco指标还有loss和learning rate result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr] ] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") val_map.append(coco_info[1]) # pascal mAP torch.save(model.state_dict(), "./save_weights/pretrain.pth") # # # # # # # # # # # # # # # # # # # # # # # # # # # # # second unfrozen backbone and train all network # # 解冻前置特征提取网络权重(backbone),接着训练整个网络权重 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 冻结backbone部分底层权重 for name, parameter in model.backbone.named_parameters(): split_name = name.split(".")[0] if split_name in ["0", "1", "2", "3"]: parameter.requires_grad = False else: parameter.requires_grad = True # define optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) # learning rate scheduler lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.33) 
num_epochs = 20 for epoch in range(init_epochs, num_epochs + init_epochs, 1): # train for one epoch, printing every 50 iterations mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=50) train_loss.append(mean_loss.item()) learning_rate.append(lr) # update the learning rate lr_scheduler.step() # evaluate on the test dataset coco_info = utils.evaluate(model, val_data_set_loader, device=device) # write into txt with open(results_file, "a") as f: # 写入的数据包括coco指标还有loss和learning rate result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr] ] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") val_map.append(coco_info[1]) # pascal mAP # save weights # 仅保存最后5个epoch的权重 if epoch in range(num_epochs + init_epochs)[-5:]: save_files = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch } torch.save(save_files, "./save_weights/mobile-model-{}.pth".format(epoch)) # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_map) != 0: from plot_curve import plot_map plot_map(val_map)
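# create_model() is not shown in these scripts. For the MobileNet-backbone Faster R-CNN
# trained above, a typical construction with torchvision looks like the sketch below;
# this mirrors the standard torchvision recipe and is an assumption, not necessarily the
# author's exact code.
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign


def create_model(num_classes):
    backbone = torchvision.models.mobilenet_v2(pretrained=True).features
    backbone.out_channels = 1280  # channel count of the last feature map

    anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                       aspect_ratios=((0.5, 1.0, 2.0),))
    roi_pooler = MultiScaleRoIAlign(featmap_names=['0'], output_size=7, sampling_ratio=2)

    return FasterRCNN(backbone,
                      num_classes=num_classes,
                      rpn_anchor_generator=anchor_generator,
                      box_roi_pool=roi_pooler)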
def main(): device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print(device) # 检查保存权重文件夹是否存在,不存在则创建 if not os.path.exists("save_weights"): os.makedirs("save_weights") data_transform = { "train": transforms.Compose([transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } VOC_root = "./" # load train data set train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], True) # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch train_data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=8, shuffle=True, num_workers=0, collate_fn=utils.collate_fn) # load validation data set val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], False) val_data_set_loader = torch.utils.data.DataLoader(val_data_set, batch_size=1, shuffle=False, num_workers=0, collate_fn=utils.collate_fn) # create model num_classes equal background + 20 classes model = create_model(num_classes=21) print(model) model.to(device) train_loss = [] learning_rate = [] val_mAP = [] # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # first frozen backbone and train 5 epochs # # 首先冻结前置特征提取网络权重(backbone),训练rpn以及最终预测网络部分 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # for param in model.backbone.parameters(): param.requires_grad = False # define optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) num_epochs = 5 for epoch in range(num_epochs): # train for one epoch, printing every 10 iterations utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=50, train_loss=train_loss, train_lr=learning_rate) # evaluate on the test dataset utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP) torch.save(model.state_dict(), "./save_weights/pretrain.pth") # # # # # # # # # # # # # # # # # # # # # # # # # # # # # second unfrozen backbone and train all network # # 解冻前置特征提取网络权重(backbone),接着训练整个网络权重 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 冻结backbone部分底层权重 for name, parameter in model.backbone.named_parameters(): split_name = name.split(".")[0] if split_name in ["0", "1", "2", "3"]: parameter.requires_grad = False else: parameter.requires_grad = True # define optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) # learning rate scheduler lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.33) num_epochs = 20 for epoch in range(num_epochs): # train for one epoch, printing every 50 iterations utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=50, train_loss=train_loss, train_lr=learning_rate) # update the learning rate lr_scheduler.step() # evaluate on the test dataset utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP) # save weights if epoch > 10: save_files = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch} torch.save(save_files, "./save_weights/mobile-model-{}.pth".format(epoch)) # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_mAP) != 0: from plot_curve import plot_map plot_map(val_mAP)
def main(parser_data):
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    if not os.path.exists("save_weights"):
        os.mkdir("save_weights")

    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_transform = {
        "train": transforms.Compose([transforms.SSDCropping(),
                                     transforms.Resize(),
                                     transforms.ColorJitter(),
                                     transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(),
                                     transforms.Normalization(),
                                     transforms.AssignGTtoDefaultBox()]),
        "val": transforms.Compose([transforms.Resize(),
                                   transforms.ToTensor(),
                                   transforms.Normalization()])
    }

    VOC_root = parser_data.data_path
    # check voc root
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError("VOCdevkit does not exist in path: '{}'.".format(VOC_root))

    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_dataset = VOCDataSet(VOC_root, "2012", data_transform['train'], train_set='train.txt')
    # Note: batch_size must be greater than 1 during training.
    batch_size = parser_data.batch_size
    assert batch_size > 1, "batch size must be greater than 1"
    # drop the last batch if it would contain only a single image
    drop_last = True if len(train_dataset) % batch_size == 1 else False
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    num_workers=nw,
                                                    collate_fn=train_dataset.collate_fn,
                                                    drop_last=drop_last)

    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_dataset = VOCDataSet(VOC_root, "2012", data_transform['val'], train_set='val.txt')
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=nw,
                                                  collate_fn=train_dataset.collate_fn)

    model = create_model(num_classes=parser_data.num_classes + 1)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.0005, momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.3)

    # If a checkpoint from a previous run is specified, resume training from it.
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch {}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []

    # load the validation annotations once up front so they are not re-parsed
    # on every evaluation pass
    val_data = get_coco_api_from_dataset(val_data_loader.dataset)

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        mean_loss, lr = utils.train_one_epoch(model=model, optimizer=optimizer,
                                              data_loader=train_data_loader,
                                              device=device, epoch=epoch, print_freq=50)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update learning rate
        lr_scheduler.step()

        coco_info = utils.evaluate(model=model, data_loader=val_data_loader,
                                   device=device, data_set=val_data)

        # write into txt
        with open(results_file, "a") as f:
            # the recorded data are the coco metrics plus loss and learning rate
            result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item()]] + [str(round(lr, 6))]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/ssd300-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)
def main(parser_data):
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print(device)

    if not os.path.exists("save_weights"):
        os.mkdir("save_weights")

    data_transform = {
        "train": transform.Compose([transform.SSDCropping(),
                                    transform.Resize(),
                                    transform.ColorJitter(),
                                    transform.ToTensor(),
                                    transform.RandomHorizontalFlip(),
                                    transform.Normalization(),
                                    transform.AssignGTtoDefaultBox()]),
        "val": transform.Compose([transform.Resize(),
                                  transform.ToTensor(),
                                  transform.Normalization()])
    }

    XRay_root = parser_data.data_path
    train_dataset = XRayDataset(XRay_root, data_transform['train'], train_set='train.txt')

    # Note that the batch_size must be greater than 1
    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=8,
                                                    shuffle=True,
                                                    num_workers=4,
                                                    collate_fn=utils.collate_fn)

    val_dataset = XRayDataset(XRay_root, data_transform['val'], train_set='val.txt')
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=1,
                                                  shuffle=False,
                                                  num_workers=0,
                                                  collate_fn=utils.collate_fn)

    model = create_model(num_classes=6, device=device)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.0005, momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.3)

    # If the address of the weight file saved by the last training is specified,
    # the training continues from the last result
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []

    val_data = None
    # If your computer has sufficient memory, you can save time by loading the validation
    # set data in advance to avoid having to reload the data each time you validate
    # val_data = get_coco_api_from_dataset(val_data_loader.dataset)

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        utils.train_one_epoch(model=model, optimizer=optimizer,
                              data_loader=train_data_loader,
                              device=device, epoch=epoch, print_freq=50,
                              train_loss=train_loss, train_lr=learning_rate)

        lr_scheduler.step()

        utils.evaluate(model=model, data_loader=val_data_loader,
                       device=device, data_set=val_data, mAP_list=val_map)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/ssd300-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        ap_per_category(self.cocoGt, cocoEval, epoch_index)
        print_txt = cocoEval.stats
        coco_mAP = print_txt[0]
        voc_mAP = print_txt[1]
        if isinstance(mAP_list, list):
            mAP_list.append(voc_mAP)


if __name__ == '__main__':
    train_loss = []
    learning_rate = []
    val_mAP = []

    trainer = _Trainer()
    for epoch_index in range(cfg.max_epoch):
        trainer.train_one_epoch(epoch_index, train_loss=train_loss, train_lr=learning_rate)
        trainer.eval(epoch_index, mAP_list=val_mAP)

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_mAP) != 0:
        plot_map(val_mAP)
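# ap_per_category(self.cocoGt, cocoEval, epoch_index) is called above but not defined
# here. Per-category AP can be read from COCOeval's accumulated precision array; a
# sketch under that assumption (the original helper may format its output differently):
def ap_per_category(coco_gt, coco_eval, epoch_index):
    # precision shape: [num_iou_thrs, num_recall_thrs, num_classes, num_areas, num_max_dets]
    precisions = coco_eval.eval['precision']
    for idx, cat_id in enumerate(coco_gt.getCatIds()):
        name = coco_gt.loadCats(cat_id)[0]['name']
        p = precisions[:, :, idx, 0, -1]  # area=all, highest maxDets setting
        p = p[p > -1]                     # -1 marks entries with no valid data
        ap = p.mean() if p.size else float('nan')
        print("epoch {:d} | {:<20s} AP = {:.3f}".format(epoch_index, name, ap))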
def main(args): # utils.init_distributed_mode(args) # print(args) device = torch.device(args.device if torch.cuda.is_available() else "cpu") print(device) # # 用来保存coco_info的文件 results_file = "results{}.txt".format( datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) # # Data loading code print("Loading data") data_transform = { "train": transforms.Compose( [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } coco_root = args.data_path # check root if os.path.exists(coco_root) is False: raise FileNotFoundError( "coco dose not in path:'{}'.".format(coco_root)) batch_size = args.batch_size nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 0]) # number of workers print('Using %g dataloader workers' % nw) val_dataset = get_coco(coco_root, "val", data_transform["val"]) print("Creating data loaders") # if args.distributed: # train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) # test_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset) # else: # train_sampler = torch.utils.data.RandomSampler(dataset) # test_sampler = torch.utils.data.SequentialSampler(val_dataset) # if args.aspect_ratio_group_factor >= 0: # group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor) # train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) # else: # train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True) val_data_set_loader = torch.utils.data.DataLoader( val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=nw, collate_fn=val_dataset.collate_fn) print("Creating model") model = get_model(num_classes=args.num_classes + 1) model.to(device) print(model) model_without_ddp = model # if args.distributed: # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) # model_without_ddp = model.module train_loss = [] learning_rate = [] val_map = [] params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=args.lr_steps, gamma=args.lr_gamma) if args.resume: checkpoint = torch.load(args.resume, map_location=device) model_without_ddp.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.test_only: train_eval_utils.evaluate(model, val_data_set_loader, device=device) return for epoch in range(args.start_epoch, args.epochs): metric_logger = train_eval_utils.train_one_epoch( model, optimizer, val_data_set_loader, device, epoch, args.print_freq) mean_loss, lr = metric_logger["mloss"], metric_logger["lr"] train_loss.append(mean_loss) learning_rate.append(lr) lr_scheduler.step() # evaluate after every epoch coco_info = train_eval_utils.evaluate(model, val_data_set_loader, device=device) # write into txt with open(results_file, "a") as f: # 写入的数据包括coco指标还有loss和learning rate result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss, lr] ] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") val_map.append(coco_info[1]) # pascal mAP if args.output_dir: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': 
optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'args': args, 'epoch': epoch }, os.path.join(args.output_dir, 'resnet-fpn-model-{}.pth'.format(epoch))) # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_map) != 0: from plot_curve import plot_map plot_map(val_map)
def main():
    device = torch.device(cfg['device_name'])
    print("Using {} device training.".format(device.type))

    if not os.path.exists(cfg['save_dir']):
        os.makedirs(cfg['save_dir'])

    data_transform = {
        "train": Compose([ToTensor(), RandomHorizontalFlip(0.5)]),
        "val": Compose([ToTensor()])
    }

    if not os.path.exists(cfg['data_root_dir']):
        raise FileNotFoundError("dataset root dir does not exist!")

    # load train data set
    train_data_set = coco(cfg['data_root_dir'], 'train', '2017', data_transform["train"])
    batch_size = cfg['batch_size']
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers'.format(nw))
    train_data_loader = torch.utils.data.DataLoader(train_data_set,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    num_workers=nw,
                                                    collate_fn=train_data_set.collate_fn)

    # load validation data set
    val_data_set = coco(cfg['data_root_dir'], 'val', '2017', data_transform["train"])
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=nw,
                                                      collate_fn=train_data_set.collate_fn)

    # create model: num_classes equals background + foreground classes
    model = create_model(num_classes=cfg['num_class'])
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.33)

    # If a checkpoint from a previous run is specified, resume training from it.
    if cfg['resume'] != "":
        checkpoint = torch.load(cfg['resume'])
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        cfg['start_epoch'] = checkpoint['epoch'] + 1
        print("the training process from epoch {}...".format(cfg['start_epoch']))

    train_loss = []
    learning_rate = []
    val_mAP = []

    for epoch in range(cfg['start_epoch'], cfg['num_epochs']):
        # train for one epoch, printing every 50 iterations
        utils.train_one_epoch(model, optimizer, train_data_loader,
                              device, epoch, train_loss=train_loss,
                              train_lr=learning_rate, print_freq=50, warmup=True)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        model_save_dir = cfg['model_save_dir']
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)
        torch.save(save_files,
                   os.path.join(model_save_dir, "{}-model-{}.pth".format(cfg['backbone'], epoch)))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_mAP) != 0:
        from plot_curve import plot_map
        plot_map(val_mAP)
def main(parser_data):
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    Data_root = parser_data.data_path
    # check dataset root (change "TGK_DATASET" to your dataset's folder name)
    if os.path.exists(os.path.join(Data_root, "TGK_DATASET")) is False:
        raise FileNotFoundError("TGK_DATASET does not exist in path: '{}'.".format(Data_root))

    # load train data set
    train_data_set = TGKDataSet(Data_root, data_transform["train"], True)
    # Note: collate_fn is custom because each sample is an (image, targets) pair
    # and cannot be batched with the default collate.
    batch_size = parser_data.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_data_set,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    num_workers=nw,
                                                    collate_fn=train_data_set.collate_fn)

    # load validation data set
    val_data_set = TGKDataSet(Data_root, data_transform["val"], False)
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=nw,
                                                      collate_fn=train_data_set.collate_fn)

    # create model: num_classes equals background + number of annotated classes
    model = create_model(num_classes=2)  # number of annotated classes + 1 for background
    # print(model)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.0025, momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.33)

    # If a checkpoint from a previous run is specified, resume training from it.
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch {}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_mAP = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 10 iterations
        utils.train_one_epoch(model, optimizer, train_data_loader,
                              device, epoch, train_loss=train_loss,
                              train_lr=learning_rate, print_freq=10, warmup=True)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_mAP) != 0:
        from plot_curve import plot_map
        plot_map(val_mAP)
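# Several of these scripts call train_one_epoch(..., warmup=True). A common way to
# implement that flag is a linear LambdaLR ramp over the first iterations of epoch 0,
# sketched below; this is an assumed pattern, not the author's exact utils code.
import torch


def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    # ramps lr from warmup_factor * base_lr up to base_lr over warmup_iters steps
    def f(x):
        if x >= warmup_iters:
            return 1.0
        alpha = float(x) / warmup_iters
        return warmup_factor * (1 - alpha) + alpha

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)

# illustrative use inside the training loop:
#   if epoch == 0 and warmup:
#       warmup_sched = warmup_lr_scheduler(optimizer, min(1000, len(data_loader) - 1), 1.0 / 1000)
#   ... after each optimizer.step():
#   if warmup_sched is not None:
#       warmup_sched.step()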
def main(args): device = torch.device(args.device if torch.cuda.is_available() else "cpu") print(device) # 用来保存coco_info的文件 results_file = "results{}.txt".format( datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) data_transform = { "train": transforms.Compose( [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } VOC_root = args.data_path # check voc root if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False: raise FileNotFoundError( "VOCdevkit dose not in path:'{}'.".format(VOC_root)) batch_size = args.batch_size nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 0]) # number of workers print('Using %g dataloader workers' % nw) # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], "train.txt") train_data_loader = torch.utils.data.DataLoader( train_data_set, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=nw, collate_fn=train_data_set.collate_fn) val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], "val.txt") val_data_set_loader = torch.utils.data.DataLoader( val_data_set, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=nw, collate_fn=train_data_set.collate_fn) model = get_model(num_classes=args.num_classes + 1) print(model) model.to(device) params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.33) # 加载上次保存的权重 # 如果指定了上次训练保存的权重文件地址,则接着上次结果接着训练 if args.resume != "": checkpoint = torch.load(args.resume, map_location=device) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 print("the training process from epoch{}...".format(args.start_epoch)) train_loss = [] learning_rate = [] val_map = [] for epoch in range(args.start_epoch, args.epochs): # train for one epoch, printing every 10 iterations metric_logger = train_eval_utils.train_one_epoch(model, optimizer, train_data_loader, device=device, epoch=epoch, print_freq=50) mean_loss, lr = metric_logger["mloss"], metric_logger["lr"] train_loss.append(mean_loss) learning_rate.append(lr) # update the learning rate lr_scheduler.step() # evaluate on the test dataset coco_info = train_eval_utils.evaluate(model, val_data_set_loader, device=device) # write into txt with open(results_file, "a") as f: # 写入的数据包括coco指标还有loss和learning rate result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss, lr] ] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") val_map.append(coco_info[1]) # pascal mAP # save weights # save_files = { # 'model': model.state_dict(), # 'optimizer': optimizer.state_dict(), # 'lr_scheduler': lr_scheduler.state_dict(), # 'epoch': epoch} # torch.save(save_files, "./weights/resNetFpn-model-{}.pth".format(epoch)) if args.output_dir: utils.save_on_master( { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'args': args, 'epoch': epoch }, os.path.join(args.output_dir, 'resnet-fpn-model-{}.pth'.format(epoch))) # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_map) != 0: from plot_curve import plot_map 
plot_map(val_map)
def main(): GPU_count = torch.cuda.device_count() # 默认使用最后一块 GPU,最大可能保证其空闲 device = torch.device( "cuda:{}".format(GPU_count - 1) if torch.cuda.is_available() else "cpu") print("Using cuda:{0}/{1} device training.".format( GPU_count - 1, [item for item in range(GPU_count)])) # 检查保存权重文件夹是否存在,不存在则创建 if not os.path.exists("save_weights"): os.makedirs("save_weights") data_transform = { "train": transforms.Compose( [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } VOC_root = "/home/shaolun/PYTHON/object-detection/faster-rcnn.pytorch/data/" # check voc root if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False: raise FileNotFoundError( "VOCdevkit dose not in path:'{}'.".format(VOC_root)) # load train data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt train_data_set = VOC2007DataSet(VOC_root, data_transform["train"], "train.txt") # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch batch_size = 8 nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers print('Using %g dataloader workers' % nw) train_data_loader = torch.utils.data.DataLoader( train_data_set, batch_size=batch_size, shuffle=True, num_workers=nw, collate_fn=train_data_set.collate_fn) # load validation data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt val_data_set = VOC2007DataSet(VOC_root, data_transform["val"], "val.txt") val_data_set_loader = torch.utils.data.DataLoader( val_data_set, batch_size=batch_size, shuffle=False, num_workers=nw, collate_fn=train_data_set.collate_fn) # create model num_classes equal background + 20 classes model = create_model(num_classes=21) # print(model) model.to(device) train_loss = [] learning_rate = [] val_mAP = [] # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # first frozen backbone and train 5 epochs # # 首先冻结前置特征提取网络权重(backbone),训练rpn以及最终预测网络部分 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # for param in model.backbone.parameters(): param.requires_grad = False # define optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) num_epochs = 5 # for epoch in range(num_epochs): # # train for one epoch, printing every 10 iterations # utils.train_one_epoch(model, optimizer, train_data_loader, # device, epoch, print_freq=50, # train_loss=train_loss, train_lr=learning_rate) # # # evaluate on the test dataset # utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP) # # torch.save(model.state_dict(), "./save_weights/pretrain.pth") # # # # # # # # # # # # # # # # # # # # # # # # # # # # # second unfrozen backbone and train all network # # 解冻前置特征提取网络权重(backbone),接着训练整个网络权重 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 冻结backbone部分底层权重 for name, parameter in model.backbone.named_parameters(): split_name = name.split(".")[0] if split_name in ["0", "1", "2", "3"]: parameter.requires_grad = False else: parameter.requires_grad = True # define optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) # learning rate scheduler lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.33) num_epochs = 20 for epoch in range(num_epochs): # train for one epoch, printing every 50 iterations utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=50, train_loss=train_loss, 
train_lr=learning_rate) # update the learning rate lr_scheduler.step() # evaluate on the test dataset utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP) # save weights if epoch > 10: save_files = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch } torch.save(save_files, "./save_weights/mobile-model-{}.pth".format(epoch)) # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_mAP) != 0: from plot_curve import plot_map plot_map(val_mAP)
def main(parser_data): device = torch.device( parser_data.device if torch.cuda.is_available() else "cpu") print("Using {} device training.".format(device.type)) results_file = "results{}.txt".format( datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) data_transform = { "train": transforms.Compose( [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]), "val": transforms.Compose([transforms.ToTensor()]) } VOC_root = parser_data.data_path # check voc root if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False: raise FileNotFoundError( "VOCdevkit dose not in path:'{}'.".format(VOC_root)) # load train data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt train_dataset = VOCDataSet(VOC_root, "2012", data_transform["train"], "train.txt") train_sampler = None # 是否按图片相似高宽比采样图片组成batch # 使用的话能够减小训练时所需GPU显存,默认使用 if args.aspect_ratio_group_factor >= 0: train_sampler = torch.utils.data.RandomSampler(train_dataset) # 统计所有图像高宽比例在bins区间中的位置索引 group_ids = create_aspect_ratio_groups( train_dataset, k=args.aspect_ratio_group_factor) # 每个batch图片从同一高宽比例区间中取 train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) # 注意这里的collate_fn是自定义的,因为读取的数据包括image和targets,不能直接使用默认的方法合成batch batch_size = parser_data.batch_size nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers print('Using %g dataloader workers' % nw) if train_sampler: # 如果按照图片高宽比采样图片,dataloader中需要使用batch_sampler train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_sampler=train_batch_sampler, pin_memory=True, num_workers=nw, collate_fn=train_dataset.collate_fn) else: train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=nw, collate_fn=train_dataset.collate_fn) # load validation data set # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt val_dataset = VOCDataSet(VOC_root, "2012", data_transform["val"], "val.txt") val_data_loader = torch.utils.data.DataLoader( val_dataset, batch_size=1, shuffle=False, pin_memory=True, num_workers=nw, collate_fn=val_dataset.collate_fn) # create model # 注意:不包含背景 model = create_model(num_classes=parser_data.num_classes) # print(model) model.to(device) # define optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) scaler = torch.cuda.amp.GradScaler() if args.amp else None # learning rate scheduler lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.33) # 如果指定了上次训练保存的权重文件地址,则接着上次结果接着训练 if parser_data.resume != "": checkpoint = torch.load(parser_data.resume, map_location='cpu') model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) parser_data.start_epoch = checkpoint['epoch'] + 1 if args.amp and "scaler" in checkpoint: scaler.load_state_dict(checkpoint["scaler"]) print("the training process from epoch{}...".format( parser_data.start_epoch)) train_loss = [] learning_rate = [] val_map = [] for epoch in range(parser_data.start_epoch, parser_data.epochs): # train for one epoch, printing every 10 iterations mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=50, warmup=True, scaler=scaler) train_loss.append(mean_loss.item()) learning_rate.append(lr) # update the learning rate lr_scheduler.step() # evaluate on the test dataset coco_info = utils.evaluate(model, val_data_loader, 
device=device) # write into txt with open(results_file, "a") as f: # 写入的数据包括coco指标还有loss和learning rate result_info = [ str(round(i, 4)) for i in coco_info + [mean_loss.item()] ] + [str(round(lr, 6))] txt = "epoch:{} {}".format(epoch, ' '.join(result_info)) f.write(txt + "\n") val_map.append(coco_info[1]) # pascal map # save weights save_files = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch } if args.amp: save_files["scaler"] = scaler.state_dict() torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch)) # plot loss and lr curve if len(train_loss) != 0 and len(learning_rate) != 0: from plot_curve import plot_loss_and_lr plot_loss_and_lr(train_loss, learning_rate) # plot mAP curve if len(val_map) != 0: from plot_curve import plot_map plot_map(val_map)
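# The script above creates a torch.cuda.amp.GradScaler when args.amp is set and hands it
# to train_one_epoch. The per-iteration usage that implies is sketched below (an assumed
# sketch of what utils.train_one_epoch does, not the original code):
import torch


def train_step(model, optimizer, images, targets, device, scaler=None):
    images = [img.to(device) for img in images]
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

    # forward pass under autocast when mixed precision is enabled
    with torch.cuda.amp.autocast(enabled=scaler is not None):
        loss_dict = model(images, targets)  # detection models return a dict of losses
        losses = sum(loss for loss in loss_dict.values())

    optimizer.zero_grad()
    if scaler is not None:
        scaler.scale(losses).backward()
        scaler.step(optimizer)
        scaler.update()
    else:
        losses.backward()
        optimizer.step()
    return losses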
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

    if not os.path.exists("save_weights"):
        os.mkdir("save_weights")

    data_transform = {
        "train": transform.Compose([transform.SSDCropping(),
                                    transform.Resize(),
                                    transform.ColorJitter(),
                                    transform.ToTensor(),
                                    transform.RandomHorizontalFlip(),
                                    transform.Normalization(),
                                    transform.AssignGTtoDefaultBox()]),
        "val": transform.Compose([transform.Resize(),
                                  transform.ToTensor(),
                                  transform.Normalization()])
    }

    voc_path = "../"
    train_dataset = VOC2012DataSet(voc_path, data_transform['train'], True)
    # Note: batch_size must be greater than 1 during training.
    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=8,
                                                    shuffle=True,
                                                    num_workers=0,
                                                    collate_fn=utils.collate_fn)

    val_dataset = VOC2012DataSet(voc_path, data_transform['val'], False)
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=1,
                                                  shuffle=False,
                                                  num_workers=0,
                                                  collate_fn=utils.collate_fn)

    model = create_model(num_classes=21, device=device)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.002, momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.3)

    train_loss = []
    learning_rate = []
    val_map = []

    val_data = None
    # If memory allows, load the validation annotations once up front so they are not
    # re-parsed on every evaluation pass.
    # val_data = get_coco_api_from_dataset(val_data_loader.dataset)

    for epoch in range(20):
        utils.train_one_epoch(model=model, optimizer=optimizer,
                              data_loader=train_data_loader,
                              device=device, epoch=epoch, print_freq=50,
                              train_loss=train_loss, train_lr=learning_rate,
                              warmup=True)

        lr_scheduler.step()

        utils.evaluate(model=model, data_loader=val_data_loader,
                       device=device, data_set=val_data, mAP_list=val_map)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/ssd300-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)
def main():
    # use GPU 0 by default if one is available, otherwise fall back to the CPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # file used to record the coco_info of every epoch
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # create the folder for saving weights if it does not exist yet
    if not os.path.exists("save_weights"):
        os.makedirs("save_weights")

    data_transform = {
        # convert to tensor, then random horizontal flip (the ground-truth boxes are flipped as well)
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = "./"  # the VOC dataset sits in the project root
    # VOC_root = os.getcwd()  # alternatively, the VOC dataset sits in the current working directory
    # check voc root
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError("VOCdevkit does not exist in path:'{}'.".format(VOC_root))

    # load train data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    # the dataset is defined through the VOC2012DataSet class and then wrapped in a DataLoader
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], "train.txt")

    # collate_fn is custom here: each sample contains an image and its targets,
    # so the default batching cannot be used
    batch_size = 8  # set according to the available GPU memory
    # number of workers for multi-process image loading and preprocessing, typically 4 or 8
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size,
                                                    shuffle=True, num_workers=nw,
                                                    collate_fn=train_data_set.collate_fn)

    # load validation data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], "val.txt")
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set, batch_size=batch_size,
                                                      shuffle=False, pin_memory=True,
                                                      num_workers=nw,
                                                      collate_fn=train_data_set.collate_fn)

    # create model: num_classes equals background + 20 classes
    model = create_model(num_classes=21)  # 20 classes plus the background class
    # print(model)
    model.to(device)  # move the model to the chosen device

    train_loss = []
    learning_rate = []
    val_map = []

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    #  first frozen backbone and train 5 epochs                      #
    #  freeze the feature-extraction network (backbone) and train    #
    #  only the RPN and the final prediction heads, since at this    #
    #  point only pretrained backbone weights are available          #
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    for param in model.backbone.parameters():
        param.requires_grad = False

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

    init_epochs = 5  # fine-tune for 5 epochs
    for epoch in range(init_epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
                                              device, epoch, print_freq=50, warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # evaluate on the test dataset
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        # write into txt
        with open(results_file, "a") as f:
            # the recorded data are the coco metrics plus loss and learning rate
            result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr]]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

    torch.save(model.state_dict(), "./save_weights/pretrain.pth")  # save the weights

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    #  second unfrozen backbone and train all network        #
    #  unfreeze the backbone and train the whole network     #
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    # keep only the lowest layers of the backbone frozen:
    # the earliest layers learn generic features, so part of the low-level weights can stay
    # frozen; this speeds up training and usually gives better results.
    # This is also how the official PyTorch training code handles ResNet50 and the RPN.
    for name, parameter in model.backbone.named_parameters():
        split_name = name.split(".")[0]
        if split_name in ["0", "1", "2", "3"]:
            parameter.requires_grad = False
        else:
            parameter.requires_grad = True

    # define optimizer
    # iterate over all model weights and keep those that need training, i.e. p.requires_grad is True
    params = [p for p in model.parameters() if p.requires_grad]
    # pass the parameters to the SGD optimizer together with the initial lr, momentum and weight decay
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler: many schedules are possible, StepLR is used here;
    # every step_size epochs the lr is multiplied by gamma
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.33)

    num_epochs = 20  # train for another 20 epochs
    for epoch in range(init_epochs, num_epochs + init_epochs, 1):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
                                              device, epoch, print_freq=50)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate (tell the scheduler one more epoch has finished)
        lr_scheduler.step()

        # evaluate on the test dataset
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        # write into txt (this logging is optional)
        with open(results_file, "a") as f:
            # the recorded data are the coco metrics plus loss and learning rate
            result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr]]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        # only keep the weights of the last 5 epochs
        if epoch in range(num_epochs + init_epochs)[-5:]:
            # if epoch > 10:  # simpler alternative: save everything from epoch 10 onwards
            save_files = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch}
            torch.save(save_files, "./save_weights/mobile-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)
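When experimenting with the freeze/unfreeze logic above, a quick sanity check of which parameters will actually receive gradients can save a lot of confusion. The helper below is a small hedged sketch; count_trainable is a hypothetical name, not a function from this repository, and it assumes the Faster R-CNN model created by create_model().

def count_trainable(model):
    # parameters of the backbone that were frozen by requires_grad = False
    frozen = sum(p.numel() for p in model.backbone.parameters() if not p.requires_grad)
    # parameters anywhere in the model that will still be optimized
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("frozen backbone params: {:,} | trainable params: {:,}".format(frozen, trainable))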
def main(parser_data):
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    train_data_set = TrainDataset(lines[:num_train], (input_shape[0], input_shape[1]), mosaic=False)

    # collate_fn is custom here: each sample contains an image and its targets,
    # so the default batching cannot be used
    batch_size = parser_data.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size,
                                                    shuffle=True, num_workers=nw,
                                                    drop_last=True,
                                                    collate_fn=train_dataset_collate)

    # load validation data set
    # val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], False)
    val_data_set = TrainDataset(lines[num_train:num_val + num_train],
                                (input_shape[0], input_shape[1]), mosaic=False)
    val_data_loader = torch.utils.data.DataLoader(val_data_set, batch_size=batch_size,
                                                  shuffle=False, num_workers=nw,
                                                  drop_last=True,
                                                  collate_fn=train_dataset_collate)

    # create model: num_classes equals background + 1 class
    model = create_model(num_classes=2)
    model.to(device)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.33)

    # if a checkpoint path from a previous run is given, resume training from it
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    train_loss = []
    val_loss = []
    learning_rate = []
    val_mAP = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, collecting the per-iteration losses
        pro_epoch_total_loss_train = []
        pro_epoch_total_loss_val = []
        utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch,
                              parser_data.epochs,
                              pro_epoch_total_loss_train=pro_epoch_total_loss_train,
                              train_loss=train_loss, train_lr=learning_rate, warmup=True)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        utils.val_one_epoch(model, val_data_loader, device, epoch, parser_data.epochs,
                            pro_epoch_total_loss_val=pro_epoch_total_loss_val,
                            val_loss=val_loss)

        # save weights
        # save_files = {
        #     'model': model.state_dict(),
        #     'optimizer': optimizer.state_dict(),
        #     'lr_scheduler': lr_scheduler.state_dict(),
        #     'epoch': epoch}
        # lr = optimizer.param_groups[0]["lr"]
        # torch.save(save_files, "./save_weights/Epoch_%03d_Loss_%.4f_lr_%.6f.pth"
        #            % ((epoch + 1), np.mean(pro_epoch_total_loss_train), lr))
        save_files = {
            'model': model,
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        lr = optimizer.param_groups[0]["lr"]
        torch.save(save_files,
                   "./save_weights1/Epoch_%03d_Loss_%.4f_lr_%.6f.pth"
                   % ((epoch + 1), np.mean(pro_epoch_total_loss_train), lr))
        print('Loss_train: %.4f || Loss_val: %.4f ' % (np.mean(pro_epoch_total_loss_train),
                                                       np.mean(pro_epoch_total_loss_val)))
        print('Saving state, iter:', str(epoch + 1))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_mAP) != 0:
        from plot_curve import plot_map
        plot_map(val_mAP)
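Note that this script stores the whole model object in the checkpoint ('model': model), which pickles the class definition and ties the file to the exact source layout. The other scripts in this document keep only state_dicts, which is the more portable pattern; below is a hedged sketch of that round trip, assuming the same model/optimizer/scheduler names as above (the file names are purely illustrative).

# saving: keep only state_dicts, not the model object itself
checkpoint = {
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'lr_scheduler': lr_scheduler.state_dict(),
    'epoch': epoch,
}
torch.save(checkpoint, "./save_weights/epoch_{:03d}.pth".format(epoch + 1))

# resuming later: rebuild the objects first, then load the saved state into them
ckpt = torch.load("./save_weights/epoch_001.pth", map_location='cpu')
model.load_state_dict(ckpt['model'])
optimizer.load_state_dict(ckpt['optimizer'])
lr_scheduler.load_state_dict(ckpt['lr_scheduler'])
start_epoch = ckpt['epoch'] + 1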
                      save_folder, net, num_classes, dataset, detect, AP_stats=ap_stats)
    else:
        multi_scale_test_net(target_size, save_folder, net, num_classes, dataset, detect,
                             AP_stats=ap_stats)

    # print the best model.
    max_idx = np.argmax(np.asarray(ap_stats['ap50']))
    print('Best ap50: {:.4f} at epoch {}'.format(ap_stats['ap50'][max_idx],
                                                 ap_stats['epoch'][max_idx]))

    res_file = os.path.join(save_folder, 'ap_stats.json')
    print('Writing ap stats json to {}'.format(res_file))
    with open(res_file, 'w') as fid:
        json.dump(ap_stats, fid)

    # plot curves
    fig_name = 'ap.png'
    metrics = ['ap50']
    legend = ['ap50']
    plot_map(save_folder, ap_stats, metrics, legend, fig_name)
    txt_log = prefix + '/log.txt'
    plot_loss(save_folder, txt_log)
def main(parser_data):
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # file used to record the coco_info of every epoch
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = parser_data.data_path
    # check voc root
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError("VOCdevkit does not exist in path:'{}'.".format(VOC_root))

    # load train data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], "train.txt")

    # collate_fn is custom here: each sample contains an image and its targets,
    # so the default batching cannot be used
    batch_size = parser_data.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size,
                                                    shuffle=True, num_workers=nw,
                                                    collate_fn=train_data_set.collate_fn)

    # load validation data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], "val.txt")
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set, batch_size=batch_size,
                                                      shuffle=False, num_workers=nw,
                                                      collate_fn=train_data_set.collate_fn)

    # create model: num_classes equals background + 20 classes
    model = create_model(num_classes=21, device=device)
    # print(model)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.33)

    # if a checkpoint path from a previous run is given, resume training from it
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_map = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
                                              device=device, epoch=epoch,
                                              print_freq=50, warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        # write into txt
        with open(results_file, "a") as f:
            result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr]]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)
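The results{timestamp}.txt file written above has one line per epoch of the form "epoch:N m1 m2 ... loss lr", where the metrics come from utils.evaluate in COCO order, so the second metric is AP@0.5 (the value also appended to val_map). A hedged sketch of reading that curve back out of the file, assuming this ordering; read_map_curve is a hypothetical helper, not part of the repository.

def read_map_curve(results_path):
    epochs, maps = [], []
    with open(results_path) as f:
        for line in f:
            parts = line.split()
            epochs.append(int(parts[0].split(":")[1]))  # "epoch:N" -> N
            maps.append(float(parts[2]))  # parts[1] is AP@[.5:.95], parts[2] is AP@0.5
    return epochs, maps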
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # file used to record the coco_info of every epoch
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    COCO_root = args.data_path

    # load train data set
    # coco2017 -> annotations -> instances_train2017.json
    train_data_set = CocoDetection(COCO_root, "train", data_transform["train"])

    # collate_fn is custom here: each sample contains an image and its targets,
    # so the default batching cannot be used
    batch_size = args.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size,
                                                    shuffle=True, pin_memory=True,
                                                    num_workers=nw,
                                                    collate_fn=train_data_set.collate_fn)

    # load validation data set
    # coco2017 -> annotations -> instances_val2017.json
    val_data_set = CocoDetection(COCO_root, "val", data_transform["val"])
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set, batch_size=batch_size,
                                                      shuffle=False, pin_memory=True,
                                                      num_workers=nw,
                                                      collate_fn=train_data_set.collate_fn)

    # create model: num_classes equals background + 80 classes
    model = create_model(num_classes=args.num_classes + 1)
    # print(model)
    model.to(device)

    train_loss = []
    learning_rate = []
    val_map = []

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps,
                                                        gamma=args.lr_gamma)

    # if the resume argument (path to the weights of a previous run) is passed,
    # continue training from that checkpoint
    if args.resume:
        # If map_location is missing, torch.load will first load the module to CPU
        # and then copy each parameter to where it was saved,
        # which would result in all processes on the same machine using the same set of devices.
        # load the previously saved checkpoint (including optimizer and lr scheduler state)
        checkpoint = torch.load(args.resume, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
                                              device, epoch, print_freq=50, warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        # write into txt
        with open(results_file, "a") as f:
            # the recorded data are the coco metrics plus loss and learning rate
            result_info = [str(round(i, 4)) for i in coco_info + [mean_loss.item(), lr]]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/mobile-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)
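For reference, MultiStepLR simply multiplies the learning rate by lr_gamma each time a milestone epoch is reached. The tiny self-contained sketch below uses illustrative values (a dummy parameter, lr=0.01, milestones [16, 22]) rather than the script's actual arguments, just to make the schedule visible.

import torch

param = torch.nn.Parameter(torch.zeros(1))  # dummy parameter so the optimizer has something to hold
opt = torch.optim.SGD([param], lr=0.01)
sched = torch.optim.lr_scheduler.MultiStepLR(opt, milestones=[16, 22], gamma=0.1)

for epoch in range(26):
    # ... one epoch of training would happen here ...
    opt.step()
    sched.step()
    print(epoch, opt.param_groups[0]["lr"])
# the lr stays at 0.01, drops to 0.001 once epoch 16 is reached,
# and drops again to 0.0001 once epoch 22 is reached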
def main(parser_data):
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print(device)

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = parser_data.data_path
    assert os.path.exists(os.path.join(VOC_root, "VOCdevkit")), \
        "not found VOCdevkit in path:'{}'".format(VOC_root)

    # load train data set
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], True)
    # collate_fn is custom here: each sample contains an image and its targets,
    # so the default batching cannot be used
    train_data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=1,
                                                    shuffle=False, num_workers=0,
                                                    collate_fn=utils.collate_fn)

    # load validation data set
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], False)
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set, batch_size=2,
                                                      shuffle=False, num_workers=0,
                                                      collate_fn=utils.collate_fn)

    # create model: num_classes equals background + 20 classes
    model = create_model(num_classes=21)
    # print(model)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.33)

    # if a checkpoint path from a previous run is given, resume training from it
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_mAP = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 50 iterations
        utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch,
                              train_loss=train_loss, train_lr=learning_rate,
                              print_freq=50, warmup=True)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_mAP) != 0:
        from plot_curve import plot_map
        plot_map(val_mAP)
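Several of these scripts pass warmup=True to train_one_epoch, which enables a short linear learning-rate warmup during the first epoch. The helper below is a hedged sketch of that idea, modelled on torchvision's reference warmup_lr_scheduler rather than this repository's exact implementation.

import torch

def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    def f(x):
        # after warmup_iters iterations, use the full learning rate
        if x >= warmup_iters:
            return 1.0
        alpha = float(x) / warmup_iters
        # ramp linearly from warmup_factor up to 1.0 over warmup_iters iterations
        return warmup_factor * (1 - alpha) + alpha

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)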
          (epoch + 1, running_loss / step, val_accurate))
    plt.plot(range(len(lr_list)), lr_list, color='r')
    val_map.append(val_accurate)
    df = pd.DataFrame([train_loss, lr_list, val_map], index=clm)
    df.to_csv('Result.csv')

    # plot loss and lr curve
    if len(train_loss) != 0 and len(lr_list) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, lr_list)

    # plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)

    scheduler.step()

print('Finished Training')

# net.eval()
# acc = 0.0
# with torch.no_grad():
#     for val_data in tqdm(validate_loader):
#         val_images, val_labels = val_data
#         outputs = net(val_images.to(device))  # eval model only has the last output layer
#         # loss = loss_function(outputs, test_labels)
#         predict_y = torch.max(outputs, dim=1)[1]
#         acc += (predict_y == val_labels.to(device)).sum().item()
#     val_accurate = acc / val_num
#     print('test_accuracy: %.3f' % (val_accurate))