def train(optimizer, num_classes, num_epochs, scheduler, device):
    # dataloaders for the train/val splits
    load = get_dataset()
    model = get_model_instance_segmentation(num_classes)
    model = model.to(device)

    if optimizer == 'Adam':
        exp_optimizer = optim.Adam(model.parameters(), lr=1e-3)
    else:
        exp_optimizer = optim.SGD(model.parameters(), lr=0.005,
                                  momentum=0.9, weight_decay=0.0005)

    if scheduler:
        lr_scheduler = optim.lr_scheduler.StepLR(exp_optimizer, step_size=3, gamma=0.1)

    for epoch in range(num_epochs):
        train_one_epoch(model, exp_optimizer, load['train'], device, epoch, print_freq=10)
        # only step the scheduler if one was created
        if scheduler:
            lr_scheduler.step()
        evaluate(model, load['val'], device=device)

    torch.save(model.state_dict(), 'best_model')
    print('Finished')
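# Example invocation of train() above; a minimal sketch, assuming a dataset with
# one foreground class plus background (the argument values are placeholders,
# not taken from the original code):
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train(optimizer='SGD', num_classes=2, num_epochs=10, scheduler=True, device=device)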
def train(data_root):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # background and person
    num_classes = 2
    dataset = PennFudanDataset(data_root, get_transform(train=True))
    dataset_test = PennFudanDataset(data_root, get_transform(train=False))

    # split the dataset into train and test subsets
    indices = torch.randperm(len(dataset)).tolist()
    dataset = Subset(dataset, indices[:-50])
    dataset_test = Subset(dataset_test, indices[-50:])

    # define data loaders
    data_loader = DataLoader(dataset, batch_size=2, shuffle=True,
                             num_workers=4, collate_fn=tools.collate_fn)
    data_loader_test = DataLoader(dataset_test, batch_size=1, shuffle=False,
                                  num_workers=4, collate_fn=tools.collate_fn)

    # get the model
    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3)

    num_epochs = 10
    for epoch in range(num_epochs):
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        lr_scheduler.step()
        # evaluate(model, data_loader_test, device=device)

    torch.save(model.state_dict(), "masknet.pth")
    print("OK!")
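# The data loaders above pass collate_fn=tools.collate_fn. A minimal sketch of
# what such a detection collate function typically looks like (assumption:
# tools is a local utility module mirroring torchvision's reference utils):
def collate_fn(batch):
    # Detection targets vary in size per image, so the batch is kept as a tuple
    # of images and a tuple of targets instead of being stacked into tensors.
    return tuple(zip(*batch))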
    'loss_bb_regression_max', 'loss_bb_regression_min',
    'loss_classifier_avg', 'loss_classifier_median',
    'loss_classifier_max', 'loss_classifier_min',
    'loss_rpn_bb_regression_avg', 'loss_rpn_bb_regression_median',
    'loss_rpn_bb_regression_max', 'loss_rpn_bb_regression_min']

train_epochs_log = pd.DataFrame(columns=columns_epochs)
train_iterations_log = pd.DataFrame(columns=columns_iterations)

# Train the network (saving the best model)
for epoch in range(0, epochs):
    # train for one epoch, printing every <print_freq> iterations
    training_results, train_iterations_log = train_one_epoch(
        model, optimizer, loader_train, device, epoch,
        print_freq=1, df=train_iterations_log)
    # add epoch logs to the epoch-level DataFrame
    train_epochs_log = helper.df_add_epoch_log(train_epochs_log, epoch, training_results)
    # evaluate on the validation data set
    mAP = evaluate(model, loader_validation, device=device)
    # keep track of the best model so far
    if mAP > best_mAP:
        best_mAP = mAP
        # Save model
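# helper.df_add_epoch_log is not shown here; a minimal sketch of what such a
# helper might look like, assuming training_results is a dict keyed by the
# columns of train_epochs_log (an illustrative guess, not the original helper):
import pandas as pd

def df_add_epoch_log(df, epoch, results):
    # append a one-row frame for this epoch to the running log
    row = {'epoch': epoch, **results}
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)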
# Training loop
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
num_epochs = 10

for e in range(num_epochs):
    # train for one epoch
    train_one_epoch(model, optimizer, train_loader, device, e, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    print('entering eval')
    print(len(val_loader))
    evaluate(model, val_loader, device=device)
    # save a checkpoint every 10 epochs
    if e % 10 == 0:
        torch.save({
            'epoch': e,
            'model_state_dict': model.state_dict()
        }, f'leaf_od{e}EPOCH_checkpoint.pt')
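# Resuming from one of the checkpoints saved above; a minimal sketch, assuming
# the same model architecture has already been constructed (the file name is
# just one of the leaf_od*EPOCH_checkpoint.pt files written by the loop):
import torch

checkpoint = torch.load('leaf_od0EPOCH_checkpoint.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
start_epoch = checkpoint['epoch'] + 1
model.to(device)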
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")
    dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler,
        num_workers=args.workers, collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, sampler=test_sampler,
        num_workers=args.workers, collate_fn=utils.collate_fn)

    print("Creating model")
    # model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes,
    #                                                           pretrained=args.pretrained)
    model = get_model(num_classes=num_classes)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        print("----------------------Resume--------------")
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch
            }, os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
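# main() relies on utils.init_distributed_mode(args) to configure multi-GPU
# training from environment variables. A minimal sketch of a launch command,
# assuming the script is saved as train.py and the argparse flags mirror the
# attribute names used above (the flag names are an assumption, not taken from
# the original script):
#
#   torchrun --nproc_per_node=4 train.py \
#       --dataset coco --data-path /path/to/coco --batch-size 2 --epochs 26 --lr 0.02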
def main(args):
    input_size = (224, 224)
    best_acc = 0.0

    # prepare output folder
    if args.output_dir:
        if not Path(args.output_dir).is_dir():
            Path(args.output_dir).mkdir()

    # read config
    with open(args.cfg, 'r') as f:
        cfg_dict = yaml.load(f, Loader=yaml.FullLoader)
    config_stem = Path(args.cfg).stem
    hyp = cfg_dict['hyp']
    data = cfg_dict['data']
    # sort as sklearn.preprocessing.LabelEncoder.fit_transform() does
    names = np.unique(data['names'])

    # set device mode
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # create model
    model_name = args.model
    nc = data['nc']
    feature_extract = hyp['feature_extract']
    print('[INFO] Creating model ({})'.format(model_name))
    model, input_size = initialize_model(model_name, nc, feature_extract)
    model.to(device)

    # load data
    print('[INFO] Loading data')
    train_csv = data['train']
    val_csv = data['val']
    train_dataset, val_dataset, train_sampler = load_data_from_csv(
        train_csv, val_csv, input_size, args.transform)

    # data loaders
    batch_size = hyp['batch_size']
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              sampler=train_sampler, num_workers=args.workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                            shuffle=False, num_workers=args.workers)

    # criterion + optimizer + scheduler
    learning_rate = hyp['lr']
    momentum = hyp['momentum']
    weight_decay = hyp['weight_decay']
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                          momentum=momentum, weight_decay=weight_decay)
    # scheduler = optim.lr_scheduler.MultiStepLR(
    #     optimizer, milestones=[0.5 * args.total_epochs, 0.8 * args.total_epochs], gamma=0.1)

    # create the TensorBoard writer
    logdir = f'runs/{model_name}_{config_stem}'
    writer = SummaryWriter(log_dir=logdir)

    if args.resume:
        print('[INFO] Loading checkpoint')
        ckpt = torch.load(args.resume, map_location=device)
        model.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        args.start_epoch = ckpt['epoch'] + 1
        best_acc = ckpt['best_acc'] if 'best_acc' in ckpt else ckpt['acc']

    if args.eval:
        ckpt_ = torch.load(args.eval, map_location=device)
        model.load_state_dict(ckpt_['model'])
        evaluate(val_loader, model, names, device)
        return

    # train
    start_epoch = args.start_epoch
    total_epochs = hyp['total_epochs']
    try:
        print('[INFO] Starting training')
        start_time = time.time()
        for epoch in range(start_epoch, total_epochs):
            epoch_info = f'Epoch {epoch}/{total_epochs - 1}'
            print(epoch_info)
            print('-' * len(epoch_info))

            # train and validate for one epoch
            train_acc, train_loss = train_one_epoch(train_loader, model, criterion,
                                                    optimizer, epoch, device)
            val_acc, val_loss = validate(val_loader, model, criterion, device)
            # scheduler.step()

            # logging to tensorboard
            writer.add_scalar('Loss/train', train_loss, epoch)
            writer.add_scalar('Loss/val', val_loss, epoch)
            writer.add_scalar('Acc/train', train_acc, epoch)
            writer.add_scalar('Acc/val', val_acc, epoch)

            # print training info
            info = (f'loss {train_loss:.3f} accuracy {train_acc:.1f}% '
                    f'val_loss {val_loss:.3f} val_accuracy {val_acc:.1f}%\n')
            print(info)

            is_best = val_acc > best_acc
            if is_best:
                best_acc = val_acc
                print('Found new best val_acc: {:6.2f}!\n'.format(best_acc))

            # save a checkpoint after every epoch
            checkpoint = {
                'epoch': epoch,
                'acc': val_acc,
                'model': model,
                'model_state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            filepath = str(Path(args.output_dir).joinpath(f'{model_name}_{config_stem}.pt'))
            save_checkpoint(checkpoint, filepath, epoch, is_best)
    except KeyboardInterrupt:
        print('[INFO] Training interrupted. Saving checkpoint')
        print('[INFO] Best val_acc: {:.2f}'.format(best_acc))
        filepath = str(Path(args.output_dir).joinpath(f'{model_name}_{config_stem}_{epoch - 1}.pt'))
        save_checkpoint(checkpoint, filepath, epoch, force_save=True)
        writer.flush()
        writer.close()
        sys.exit(0)

    # flush and close the tensorboard writer
    writer.flush()
    writer.close()

    elapsed_time = time.time() - start_time
    elapsed_str = str(datetime.timedelta(seconds=int(elapsed_time)))
    print('[INFO] Training complete in: {}'.format(elapsed_str))
    print('[INFO] Best val_acc: {:.2f}'.format(best_acc))
    filepath = str(Path(args.output_dir).joinpath(f'{model_name}_{config_stem}_final.pt'))
    save_checkpoint(checkpoint, filepath, epoch, force_save=True)
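# main() above expects a YAML config with 'hyp' and 'data' sections; a minimal
# sketch of the structure it reads, expressed as the parsed dict (the keys come
# from the code above, the values are placeholders, not the original project's):
cfg_dict = {
    'hyp': {
        'feature_extract': True,   # freeze the backbone, train only the head
        'batch_size': 32,
        'lr': 0.01,
        'momentum': 0.9,
        'weight_decay': 5e-4,
        'total_epochs': 30,
    },
    'data': {
        'nc': 2,                    # number of classes
        'names': ['cat', 'dog'],    # class names
        'train': 'data/train.csv',  # CSV listing training samples
        'val': 'data/val.csv',      # CSV listing validation samples
    },
}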
                            weight_decay=0.0005)
# and a learning rate scheduler which decreases the learning rate by 10x every 15 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

# TRAINING LOOP
save_fr = 1
# make sure that print_freq is smaller than len(dataset) & len(dataset_test)
print_freq = 25
os.makedirs('./maskrcnn_saved_models', exist_ok=True)

for epoch in range(num_epochs):
    # train for one epoch, printing every <print_freq> iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=print_freq)
    # save the model weights every <save_fr> epochs
    if epoch % save_fr == 0:
        torch.save(model.state_dict(),
                   './maskrcnn_saved_models/mask_rcnn_model_epoch_{}.pt'.format(epoch))
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)
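# Quick inference check with the trained Mask R-CNN; a minimal sketch, assuming
# `model` is the detection network trained above and `img` is a 3xHxW float
# tensor with values in [0, 1]:
import torch

model.eval()
with torch.no_grad():
    # torchvision detection models take a list of image tensors in eval mode
    prediction = model([img.to(device)])[0]
# prediction is a dict with 'boxes', 'labels', 'scores' and 'masks' tensors
print(prediction['boxes'].shape, prediction['scores'][:5])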