def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    # segmentation num_classes + background
    num_classes = args.num_classes + 1

    # file used to record information from training and validation
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # VOCdevkit -> VOC2012 -> ImageSets -> Segmentation -> train.txt
    train_dataset = VOCSegmentation(args.data_path,
                                    year="2012",
                                    transforms=get_transform(train=True),
                                    txt_name="train.txt")

    # VOCdevkit -> VOC2012 -> ImageSets -> Segmentation -> val.txt
    val_dataset = VOCSegmentation(args.data_path,
                                  year="2012",
                                  transforms=get_transform(train=False),
                                  txt_name="val.txt")

    num_workers = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=num_workers,
                                               shuffle=True,
                                               pin_memory=True,
                                               collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=num_workers,
                                             pin_memory=True,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(aux=args.aux, num_classes=num_classes)
    model.to(device)

    params_to_optimize = [
        {"params": [p for p in model.backbone.parameters() if p.requires_grad]},
        {"params": [p for p in model.classifier.parameters() if p.requires_grad]}
    ]

    if args.aux:
        params = [p for p in model.aux_classifier.parameters() if p.requires_grad]
        params_to_optimize.append({"params": params, "lr": args.lr * 10})

    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    # create the learning-rate schedule; it is updated once per step (not per epoch)
    lr_scheduler = create_lr_scheduler(optimizer, len(train_loader), args.epochs, warmup=True)

    # import matplotlib.pyplot as plt
    # lr_list = []
    # for _ in range(args.epochs):
    #     for _ in range(len(train_loader)):
    #         lr_scheduler.step()
    #         lr = optimizer.param_groups[0]["lr"]
    #         lr_list.append(lr)
    # plt.plot(range(len(lr_list)), lr_list)
    # plt.show()

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])

    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        mean_loss, lr = train_one_epoch(model, optimizer, train_loader, device, epoch,
                                        lr_scheduler=lr_scheduler,
                                        print_freq=args.print_freq,
                                        scaler=scaler)

        confmat = evaluate(model, val_loader, device=device, num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)

        # write into txt
        with open(results_file, "a") as f:
            # record the train_loss, lr and validation metrics for each epoch
            train_info = f"[epoch: {epoch}]\n" \
                         f"train_loss: {mean_loss:.4f}\n" \
                         f"lr: {lr:.6f}\n"
            f.write(train_info + val_info + "\n\n")

        save_file = {"model": model.state_dict(),
                     "optimizer": optimizer.state_dict(),
                     "lr_scheduler": lr_scheduler.state_dict(),
                     "epoch": epoch,
                     "args": args}
        if args.amp:
            save_file["scaler"] = scaler.state_dict()
        torch.save(save_file, "save_weights/model_{}.pth".format(epoch))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("training time {}".format(total_time_str))
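The helper `create_lr_scheduler` is called with `len(train_loader)` because the schedule is stepped once per training iteration inside `train_one_epoch`, not once per epoch. Its body is not shown here; the following is only a minimal sketch, assuming linear warmup followed by polynomial ("poly", power 0.9) decay, with `warmup_epochs` and `warmup_factor` as illustrative parameter names.

import torch


def create_lr_scheduler(optimizer,
                        num_step: int,      # iterations per epoch, i.e. len(train_loader)
                        epochs: int,
                        warmup=True,
                        warmup_epochs=1,
                        warmup_factor=1e-3):
    """Sketch of a per-step LR schedule: linear warmup, then poly decay (power 0.9)."""
    assert num_step > 0 and epochs > 0
    if warmup is False:
        warmup_epochs = 0

    def f(x):
        # x counts optimizer steps over the whole run
        if warmup and x <= warmup_epochs * num_step:
            alpha = float(x) / (warmup_epochs * num_step)
            # multiplier grows linearly from warmup_factor to 1
            return warmup_factor * (1 - alpha) + alpha
        # afterwards the multiplier decays from 1 towards 0 as (1 - progress) ** 0.9
        progress = (x - warmup_epochs * num_step) / ((epochs - warmup_epochs) * num_step)
        return (1 - progress) ** 0.9

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)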
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    # segmentation num_classes + background
    num_classes = args.num_classes + 1

    train_dataset = VOCSegmentation(args.data_path,
                                    transforms=get_transform(train=True),
                                    txt_name="train.txt")

    val_dataset = VOCSegmentation(args.data_path,
                                  transforms=get_transform(train=False),
                                  txt_name="val.txt")

    num_workers = 8
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=num_workers,
                                               shuffle=True,
                                               pin_memory=True,
                                               collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=num_workers,
                                             pin_memory=True,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(aux=args.aux, num_classes=num_classes)
    model.to(device)

    params_to_optimize = [
        {"params": [p for p in model.backbone.parameters() if p.requires_grad]},
        {"params": [p for p in model.classifier.parameters() if p.requires_grad]}
    ]

    if args.aux:
        params = [p for p in model.aux_classifier.parameters() if p.requires_grad]
        params_to_optimize.append({"params": params, "lr": args.lr * 10})

    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # polynomial ("poly") decay of the learning rate, updated once per epoch
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                     lambda x: (1 - x / args.epochs) ** 0.9)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        warmup=True, print_freq=args.print_freq)

        lr_scheduler.step()

        confmat = evaluate(model, val_loader, device=device, num_classes=num_classes)
        print(confmat)

        save_file = {"model": model.state_dict(),
                     "optimizer": optimizer.state_dict(),
                     "lr_scheduler": lr_scheduler.state_dict(),
                     "epoch": epoch,
                     "args": args}
        torch.save(save_file, "save_weights/model_{}.pth".format(epoch))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("training time {}".format(total_time_str))
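`create_model` is not shown in either listing above. The parameter-group code only relies on the model exposing `backbone`, `classifier` and, when `aux=True`, `aux_classifier`, which matches torchvision's segmentation heads. A plausible stand-in sketch (an assumption, not the project's actual factory function):

import torch
from torchvision.models.segmentation import fcn_resnet50


def create_model(aux: bool, num_classes: int) -> torch.nn.Module:
    # Hypothetical stand-in: any torchvision segmentation model with a
    # backbone / classifier / aux_classifier layout would be used the same way.
    return fcn_resnet50(aux_loss=aux, num_classes=num_classes)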
def main(args):
    init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)
    # segmentation num_classes + background
    num_classes = args.num_classes + 1

    # mean and std computed over the DRIVE training images
    mean = (0.709, 0.381, 0.224)
    std = (0.127, 0.079, 0.043)

    # file used to record information from training and validation
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_root = args.data_path
    # check data root
    if os.path.exists(os.path.join(data_root, "DRIVE")) is False:
        raise FileNotFoundError("DRIVE does not exist in path: '{}'.".format(data_root))

    train_dataset = DriveDataset(args.data_path,
                                 train=True,
                                 transforms=get_transform(train=True, mean=mean, std=std))

    val_dataset = DriveDataset(args.data_path,
                               train=False,
                               transforms=get_transform(train=False, mean=mean, std=std))

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        test_sampler = torch.utils.data.SequentialSampler(val_dataset)

    train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=args.batch_size,
                                                    sampler=train_sampler,
                                                    num_workers=args.workers,
                                                    collate_fn=train_dataset.collate_fn,
                                                    drop_last=True)

    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=1,
                                                  sampler=test_sampler,
                                                  num_workers=args.workers,
                                                  collate_fn=train_dataset.collate_fn)

    print("Creating model")
    # create model; num_classes equals background + foreground classes
    model = create_model(num_classes=num_classes)
    model.to(device)

    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params_to_optimize = [p for p in model_without_ddp.parameters() if p.requires_grad]

    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    # create the learning-rate schedule; it is updated once per step (not per epoch)
    lr_scheduler = create_lr_scheduler(optimizer, len(train_data_loader), args.epochs, warmup=True)

    # if a resume argument (path to the weights of the previous run) is passed,
    # continue training from those parameters
    if args.resume:
        # If map_location is missing, torch.load will first load the module to CPU
        # and then copy each parameter to where it was saved,
        # which would result in all processes on the same machine using the same set of devices.
        checkpoint = torch.load(args.resume, map_location='cpu')  # previously saved weights (including optimizer and lr scheduler states)
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])

    if args.test_only:
        confmat = evaluate(model, val_data_loader, device=device, num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        return

    best_dice = 0.
print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) mean_loss, lr = train_one_epoch(model, optimizer, train_data_loader, device, epoch, num_classes, lr_scheduler=lr_scheduler, print_freq=args.print_freq, scaler=scaler) confmat, dice = evaluate(model, val_data_loader, device=device, num_classes=num_classes) val_info = str(confmat) print(val_info) print(f"dice coefficient: {dice:.3f}") # 只在主进程上进行写操作 if args.rank in [-1, 0]: # write into txt with open(results_file, "a") as f: # 记录每个epoch对应的train_loss、lr以及验证集各指标 train_info = f"[epoch: {epoch}]\n" \ f"train_loss: {mean_loss:.4f}\n" \ f"lr: {lr:.6f}\n" \ f"dice coefficient: {dice:.3f}\n" f.write(train_info + val_info + "\n\n") if args.save_best is True: if best_dice < dice: best_dice = dice else: continue if args.output_dir: # 只在主节点上执行保存权重操作 save_file = { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'args': args, 'epoch': epoch } if args.amp: save_file["scaler"] = scaler.state_dict() if args.save_best is True: save_on_master(save_file, os.path.join(args.output_dir, 'best_model.pth')) else: save_on_master( save_file, os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
if Parallel:
    for param in yolov3_net.module.backbone.parameters():
        param.requires_grad = False
else:
    for param in yolov3_net.backbone.parameters():
        param.requires_grad = False

# 7.3 train for the freeze-stage epochs
for epoch in range(Init_Epoch, Freeze_Epoch):
    train_utils.train_one_epoch(
        Test_Name,
        yolov3_net,                        # network model
        yolov3_loss,                       # loss function
        optimizer,                         # optimizer
        epoch,                             # current epoch
        len(freeze_train_data_loader),     # number of training batches
        len(freeze_validate_data_loader),  # number of validation batches
        Freeze_Epoch,                      # total epochs
        freeze_train_data_loader,          # training set
        freeze_validate_data_loader,       # validation set
        Config["cuda"],
    )

    lr_scheduler.step()  # update the learning rate

    # compute mAP
    if (epoch + 1) % 10 == 0:
        yolov3.net = yolov3_net.eval()

        # print("\nmAP_train_data_loader:")
        train_utils.compute_map(yolov3, mAP_train_data_loader, Config["cuda"])

        # print("\nmAP_validate_data_loader:")
        train_utils.compute_map(yolov3, mAP_validate_data_loader, Config["cuda"])
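The loop above freezes the backbone for the first training stage. The usual counterpart, not shown in this fragment, is to re-enable backbone gradients before the fine-tuning stage. A hedged sketch only; the source does not include this step:

# Hypothetical unfreeze step before the second training stage.
if Parallel:
    for param in yolov3_net.module.backbone.parameters():
        param.requires_grad = True
else:
    for param in yolov3_net.backbone.parameters():
        param.requires_grad = True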
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    # segmentation num_classes + background
    num_classes = args.num_classes + 1

    # mean and std obtained using compute_mean_std.py
    mean = (0.709, 0.381, 0.224)
    std = (0.127, 0.079, 0.043)

    # file used to record information from training and validation
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    train_dataset = DriveDataset(args.data_path,
                                 train=True,
                                 transforms=get_transform(train=True, mean=mean, std=std))

    val_dataset = DriveDataset(args.data_path,
                               train=False,
                               transforms=get_transform(train=False, mean=mean, std=std))

    num_workers = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=num_workers,
                                               shuffle=True,
                                               pin_memory=True,
                                               collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=num_workers,
                                             pin_memory=True,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(num_classes=num_classes)
    model.to(device)

    params_to_optimize = [p for p in model.parameters() if p.requires_grad]

    optimizer = torch.optim.SGD(params_to_optimize,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    # create the learning-rate schedule; it is updated once per step (not per epoch)
    lr_scheduler = create_lr_scheduler(optimizer, len(train_loader), args.epochs, warmup=True)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])

    best_dice = 0.
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        mean_loss, lr = train_one_epoch(model, optimizer, train_loader, device, epoch, num_classes,
                                        lr_scheduler=lr_scheduler,
                                        print_freq=args.print_freq,
                                        scaler=scaler)

        confmat, dice = evaluate(model, val_loader, device=device, num_classes=num_classes)
        val_info = str(confmat)
        print(val_info)
        print(f"dice coefficient: {dice:.3f}")

        # write into txt
        with open(results_file, "a") as f:
            # record the train_loss, lr and validation metrics for each epoch
            train_info = f"[epoch: {epoch}]\n" \
                         f"train_loss: {mean_loss:.4f}\n" \
                         f"lr: {lr:.6f}\n" \
                         f"dice coefficient: {dice:.3f}\n"
            f.write(train_info + val_info + "\n\n")

        if args.save_best is True:
            if best_dice < dice:
                best_dice = dice
            else:
                continue

        save_file = {"model": model.state_dict(),
                     "optimizer": optimizer.state_dict(),
                     "lr_scheduler": lr_scheduler.state_dict(),
                     "epoch": epoch,
                     "args": args}
        if args.amp:
            save_file["scaler"] = scaler.state_dict()

        if args.save_best is True:
            torch.save(save_file, "save_weights/best_model.pth")
        else:
            torch.save(save_file, "save_weights/model_{}.pth".format(epoch))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("training time {}".format(total_time_str))
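`evaluate` returns a confusion matrix together with a dice score, but the dice computation itself is not shown. A minimal sketch of a Dice coefficient for binary masks, which is one common way such a metric is implemented (an assumption, not the project's exact code):

import torch


def dice_coefficient(pred: torch.Tensor, target: torch.Tensor, eps: float = 1e-6) -> float:
    # pred and target are binary masks of the same shape (values in {0, 1})
    pred = pred.flatten().float()
    target = target.flatten().float()
    intersection = (pred * target).sum()
    # 2 * |A ∩ B| / (|A| + |B|), with eps guarding against empty masks
    return ((2. * intersection + eps) / (pred.sum() + target.sum() + eps)).item()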
# Network training
while keep_training:
    # Increment epoch variable
    epoch += 1

    # Time each epoch
    start_time = time.time()

    # Print current epoch and learning rate
    print("Epoch: " + str(epoch) + ", Current Learning Rate: ", current_lr)

    # Re-shuffle the data in each epoch
    X_shuffled, Y_shuffled = tu.shuffle_data(X_shuffled, Y_shuffled)

    model, train_loss, train_acc = tu.train_one_epoch(model, param_dict, input_names,
                                                      output_name, X_shuffled, Y_shuffled)

    val_loss, val_acc = tu.validate_one_epoch(model, param_dict, input_names,
                                              output_name, X_val, Y_val)

    # Write epoch results to the result file
    tu.write_epoch_results(savefile_name, epoch, train_loss, train_acc, val_loss, val_acc)

    # Save weights if the validation loss is a new best.
    # Reload the previous best and reduce the learning rate
    # after x consecutive unimproved epochs.
    tmp = tu.check_learning_rate(optimizer_type, model, savefile_name, output_name,
                                 val_loss, best_validation_loss, epoch, best_epoch,
                                 current_lr, counter, num_lr_changes)
    model, text_to_infofile, best_validation_loss, best_epoch, current_lr, counter, num_lr_changes = tmp
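`tu.shuffle_data` belongs to the author's training-utility module and is not shown. A minimal sketch of what a paired shuffle typically looks like, assuming NumPy arrays (the function body is illustrative, not the original implementation):

import numpy as np


def shuffle_data(x: np.ndarray, y: np.ndarray, seed: int = None):
    # shuffle samples and labels with the same permutation so pairs stay aligned
    rng = np.random.default_rng(seed)
    perm = rng.permutation(len(x))
    return x[perm], y[perm]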