def val(epoch):
    since = time.time()

    ### Test ###
    val_loss, val_err = train_utils.test(model, val_loader, criterion, epoch)
    print('Val - Loss: {:.4f} | Acc: {:.4f}'.format(val_loss, 1 - val_err))
    time_elapsed = time.time() - since
    print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))

    global best_loss
    if val_loss < best_loss:
        ### Checkpoint ###
        train_utils.save_weights(model, epoch, val_loss, val_err)
        best_loss = val_loss

    ### Adjust Lr ###
    train_utils.adjust_learning_rate(LR, LR_DECAY, optimizer, epoch,
                                     DECAY_EVERY_N_EPOCHS)
    return val_loss, 1 - val_err
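# NOTE: hypothetical sketch of the step-decay helper assumed by the call above;
# the actual train_utils.adjust_learning_rate in this repo may differ.
def adjust_learning_rate(lr, decay, optimizer, cur_epoch, every_n_epochs):
    """Multiply the base lr by `decay` once per `every_n_epochs` elapsed epochs."""
    new_lr = lr * (decay ** (cur_epoch // every_n_epochs))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr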
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--nClasses', type=int, default=10)      # CIFAR
    parser.add_argument('--reduction', type=float, default=1.0)  # no reduction
    parser.add_argument('--bottleneck', type=bool, default=False)
    parser.add_argument('--growthRate', type=int, default=12)
    parser.add_argument('--modelDepth', type=int, default=40)
    parser.add_argument('--batchSize', type=int, default=64)
    parser.add_argument('--nEpochs', type=int, default=2)
    parser.add_argument('--no-cuda', action='store_true')
    parser.add_argument('--save', type=str, default=RESULTS_PATH)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--existingWeights', type=str, default=None)
    parser.add_argument('--sessionName', type=str, default=train_utils.get_rand_str(5))
    parser.add_argument('--opt', type=str, default='sgd',
                        choices=('sgd', 'adam', 'rmsprop'))
    args = parser.parse_args()

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    setproctitle.setproctitle(args.save)  # the process name
    torch.manual_seed(args.seed)
    if args.cuda:
        print("Using CUDA")
        torch.cuda.manual_seed(args.seed)

    # if os.path.exists(args.save):
    #     shutil.rmtree(args.save)
    # os.makedirs(args.save, exist_ok=True)

    normMean = [0.49139968, 0.48215827, 0.44653124]
    normStd = [0.24703233, 0.24348505, 0.26158768]
    normTransform = transforms.Normalize(normMean, normStd)

    trainTransform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normTransform
    ])
    testTransform = transforms.Compose([transforms.ToTensor(), normTransform])

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    print("Kwargs: " + str(kwargs))
    trainLoader = DataLoader(
        dset.CIFAR10(root=CIFAR10_PATH, train=True, download=True,
                     transform=trainTransform),
        batch_size=args.batchSize, shuffle=True, **kwargs)
    testLoader = DataLoader(
        dset.CIFAR10(root=CIFAR10_PATH, train=False, download=True,
                     transform=testTransform),
        batch_size=args.batchSize, shuffle=False, **kwargs)

    net = DenseNet(growthRate=args.growthRate, depth=args.modelDepth,
                   reduction=args.reduction, bottleneck=args.bottleneck,
                   nClasses=args.nClasses)

    if args.existingWeights:
        print("Loading existing weights: %s" % args.existingWeights)
        startEpoch = train_utils.load_weights(net, args.existingWeights)
        endEpoch = startEpoch + args.nEpochs
        print('Resume training at epoch: {}'.format(startEpoch))
        if os.path.exists(os.path.join(args.save, 'train.csv')):  # assume test.csv exists too
            print("Found existing train.csv")
            append_write = 'a'  # append if the file already exists
        else:
            print("Creating new train.csv")
            append_write = 'w'  # make a new file if not
        trainF = open(os.path.join(args.save, 'train.csv'), append_write)
        testF = open(os.path.join(args.save, 'test.csv'), append_write)
    else:
        print("Training new model from scratch")
        startEpoch = 1
        endEpoch = args.nEpochs
        trainF = open(os.path.join(args.save, 'train.csv'), 'w')
        testF = open(os.path.join(args.save, 'test.csv'), 'w')

    print(' + Number of params: {}'.format(
        sum([p.data.nelement() for p in net.parameters()])))
    if args.cuda:
        net = net.cuda()

    if args.opt == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=1e-1,
                              momentum=0.9, weight_decay=1e-4)
    elif args.opt == 'adam':
        optimizer = optim.Adam(net.parameters(), weight_decay=1e-4)
    elif args.opt == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(), weight_decay=1e-4)

    print("Training....")
    for epoch in range(startEpoch, endEpoch + 1):
        since = time.time()
        train_utils.adjust_opt(args.opt, optimizer, epoch)
        train_utils.train(epoch, net, trainLoader, optimizer, trainF,
                          sessionName=args.sessionName)
        train_utils.test(epoch, net, testLoader, optimizer, testF)
        time_elapsed = time.time() - since
        print('Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))
        if epoch != 1:
            os.system('./plot.py {} &'.format(args.save))

    trainF.close()
    testF.close()
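# NOTE: hypothetical sketch of the train_utils.adjust_opt schedule called above,
# assuming the common DenseNet recipe (SGD lr dropped 10x at 50% and 75% of
# training, other optimizers left untouched); the repo's actual implementation
# may differ.
def adjust_opt(optAlg, optimizer, epoch, n_epochs=300):
    if optAlg != 'sgd':
        return
    if epoch < 0.5 * n_epochs:
        lr = 1e-1
    elif epoch < 0.75 * n_epochs:
        lr = 1e-2
    else:
        lr = 1e-3
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr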
alphas = checkpoint['alphas'].clone()
betas = checkpoint['betas'].clone()
biOptimizer.tau = args.tau_min
del checkpoint

if args.test:
    if dbsn:
        print(torch.cat([F.softmax(alphas, 1), betas.exp()], 1).data.cpu().numpy())
    test_loss, test_err, test_iou = train_utils.test(
        model, loaders['test'], criterion, alphas, betas, biOptimizer, ngpus,
        dbsn, ntimes=100, mcdropout=args.mcdropout, dir=args.dir)
    logging.info('Test - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}'.format(
        test_loss, 1 - test_err, test_iou))
    exit(0)

for epoch in range(start_epoch, args.epochs + 1):
    if epoch % 50 == 1 and dbsn:
        print(torch.cat([F.softmax(alphas, 1), betas.exp()], 1).data.cpu().numpy())
writer.add_scalar("params/cov00", cov[0, 0], epoch) writer.add_scalar("params/cov11", cov[1, 1], epoch) writer.add_scalar("params/cov22", cov[2, 2], epoch) print( "Epoch {:d}\nTrain - Loss: {:.4f}, Acc: {:.4f}".format( epoch, trn_loss, 1 - trn_err ) ) time_elapsed = time.time() - since print("Train Time {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60)) if epoch % args.eval_freq == 0: #pass ### Test ### val_loss, val_err, val_iou = train_utils.test(model, loaders["val"], criterion, epoch=epoch, writer=writer) print( "Val - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}".format( val_loss, 1 - val_err, val_iou ) ) writer.add_scalar("val/loss", val_loss, epoch) writer.add_scalar("val/error", val_err, epoch) time_elapsed = time.time() - since print("Total Time {:.0f}m {:.0f}s\n".format(time_elapsed // 60, time_elapsed % 60)) ### Checkpoint ### if epoch % args.save_freq == 0: print("Saving model at Epoch: ", epoch) save_checkpoint(
### Train ###
if epoch == args.ft_start:
    print('Now replacing data loader with fine-tuned data loader.')
    train_loader = loaders['fine_tune']
trn_loss, trn_err = train_utils.train(model, train_loader, optimizer, criterion)
print('Epoch {:d}\nTrain - Loss: {:.4f}, Acc: {:.4f}'.format(
    epoch, trn_loss, 1 - trn_err))
time_elapsed = time.time() - since
print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

if epoch % args.eval_freq == 0:
    ### Test ###
    val_loss, val_err, val_iou = train_utils.test(model, loaders['val'], criterion)
    print('Val - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}'.format(
        val_loss, 1 - val_err, val_iou))
    time_elapsed = time.time() - since
    print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))

if args.swa and (epoch + 1) > args.swa_start and (
        epoch + 1 - args.swa_start) % args.swa_c_epochs == 0:
    print('Saving SWA model at epoch: ', epoch)
    swag_model.collect_model(model)
    if epoch % args.eval_freq == 0:
        swag_model.sample(0.0)
        bn_update(train_loader, swag_model)
model = model_cfg.base(num_classes=num_classes,
                       use_aleatoric=args.loss == "aleatoric").cuda()
checkpoint = torch.load(args.resume)
start_epoch = checkpoint["epoch"]
print(start_epoch)
model.load_state_dict(checkpoint["state_dict"])
print(len(loaders["test"]))

if args.use_test:
    print("Using test dataset")
    test_loader = "test"
else:
    test_loader = "val"

loss, err, mIOU, model_output_targets = test(
    model,
    loaders[test_loader],
    criterion,
    return_outputs=True,
    return_scale=args.loss == "aleatoric",
)
print(loss, 1 - err, mIOU)

outputs = np.concatenate(model_output_targets["outputs"])
targets = np.concatenate(model_output_targets["targets"])
if args.loss == "aleatoric":
    scales = np.concatenate(model_output_targets["scales"])
else:
    scales = None

np.savez(args.output, preds=outputs, targets=targets, scales=scales)
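# NOTE: example of reading the dump back later (args.output is whatever path was
# passed on the command line); allow_pickle is needed when scales was saved as None.
# dump = np.load(args.output, allow_pickle=True)
# preds, targets, scales = dump['preds'], dump['targets'], dump['scales']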
#         val_loss, 1 - val_err, val_iou
#     )
# )
# writer.add_scalar("val/loss", val_loss, epoch)
# writer.add_scalar("val/error", val_err, epoch)

time_elapsed = time.time() - since
print("Total Time {:.0f}m {:.0f}s\n".format(time_elapsed // 60, time_elapsed % 60))

### Checkpoint ###
if epoch % args.save_freq == 0:
    print("Saving model at Epoch: ", epoch)
    save_checkpoint(
        dir=args.dir,
        epoch=epoch,
        state_dict=model.state_dict(),
        optimizer=optimizer.state_dict(),
    )

lr = schedule(epoch, args.lr_init, args.epochs)
adjust_learning_rate(optimizer, lr)
writer.add_scalar("hypers/lr", lr, epoch)

### Test set ###
test_loss, test_err, test_iou = train_utils.test(model, loaders["test"], criterion)
print("SGD Test - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}".format(
    test_loss, 1 - test_err, test_iou))
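# NOTE: hypothetical sketch of the schedule() used above, assuming the SWA-style
# piecewise schedule (hold lr_init for the first half of training, decay linearly
# to 1% of lr_init by 90%, then stay flat); the repo's actual schedule may differ.
def schedule(epoch, lr_init, total_epochs):
    t = epoch / total_epochs
    lr_ratio = 0.01
    if t <= 0.5:
        factor = 1.0
    elif t <= 0.9:
        factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4
    else:
        factor = lr_ratio
    return lr_init * factor

# Assumed companion setter: write the scheduled lr into every param group.
def adjust_learning_rate(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr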
criterion = nn.NLLLoss2d(weight=camvid.class_weight.cuda()).cuda()

for epoch in range(1, N_EPOCHS + 1):
    since = time.time()

    ### Train ###
    trn_loss, trn_err = train_utils.train(model, train_loader, optimizer,
                                          criterion, epoch)
    print('Epoch {:d}\nTrain - Loss: {:.4f}, Acc: {:.4f}'.format(
        epoch, trn_loss, 1 - trn_err))
    time_elapsed = time.time() - since
    print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    ### Test ###
    val_loss, val_err = train_utils.test(model, val_loader, criterion, epoch)
    print('Val - Loss: {:.4f} | Acc: {:.4f}'.format(val_loss, 1 - val_err))
    time_elapsed = time.time() - since
    print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))

    ### Checkpoint ###
    train_utils.save_weights(model, epoch, val_loss, val_err)

    ### Adjust Lr ###
    train_utils.adjust_learning_rate(LR, LR_DECAY, optimizer, epoch,
                                     DECAY_EVERY_N_EPOCHS)

'''Test'''
train_utils.test(model, test_loader, criterion, epoch=1)
# train_utils.view_sample_predictions(model, test_loader, n=1)
def main():
    parser = argparse.ArgumentParser(description='Shallow-CNN Training')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default:64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default:1000)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of epochs to train for (default:100)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='lr',
                        help='learning rate for optimizer (default:0.001)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--early-stopping', type=int, default=10, metavar='N',
                        help='patience for early stopping (default:10)')
    parser.add_argument('--data-dir', type=str, default='../data', metavar='path/to/dir',
                        help='path to directory containing data (default:../data)')
    parser.add_argument('--train-size', type=float, default=0.85, metavar='pct',
                        help='fraction of dataset to use for training (default:0.85)')
    parser.add_argument('--test-size', type=float, default=0.15, metavar='pct',
                        help='fraction of dataset to use for testing (default:0.15)')
    parser.add_argument('--dropout-rate', type=float, default=0.5, metavar='pct',
                        help='dropout rate after convolution layers (default:0.5)')
    parser.add_argument('--conv1-width', type=int, default=10, metavar='w',
                        help='width of 1st convolution kernel (default:10)')
    parser.add_argument('--n_channels', type=int, default=30, metavar='N',
                        help='number of channels output by convolution layers (default:30)')
    parser.add_argument('--max-pool-kernel-size', type=int, default=25, metavar='w',
                        help='width of max-pool kernel after convolution (default:25)')
    parser.add_argument('--max-pool-stride', type=int, default=5, metavar='N',
                        help='stride along 2nd axis for max-pool (default:5)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--checkpoint', type=str, default='checkpoint.pt',
                        metavar='path/to/file',
                        help='file to save checkpoints (default:checkpoint.pt)')
    # TODO add arg to save everything to a specific folder

    # Time id used for saving files
    time_id = int(time())

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    torch.manual_seed(SEED)

    # Load the datasets
    print('loading datasets')
    train_set = RobotNavDataset(args.data_dir)
    submission_set = SubmissionDataset(args.data_dir)
    train_size = floor(0.8 * len(train_set))
    test_size = len(train_set) - train_size  # sizes must sum to len(train_set) for random_split
    train_subset, test_subset = data.random_split(train_set, (train_size, test_size))
    train_loader = torch.utils.data.DataLoader(train_subset,
                                               batch_size=args.batch_size,
                                               shuffle=True)
    # Don't think we actually need shuffle here...
    test_loader = torch.utils.data.DataLoader(test_subset,
                                              batch_size=args.test_batch_size)

    # Initialize objects
    print('creating model')
    model = ShallowCNN(n_channels=args.n_channels,
                       conv1_width=args.conv1_width,
                       max_pool_kernel_size=args.max_pool_kernel_size,
                       max_pool_stride=args.max_pool_stride,
                       dropout_rate=args.dropout_rate)
    model.double()  # TODO: look into if this is actually needed...
    early_stopper = EarlyStopping(patience=args.early_stopping,
                                  check_file=args.checkpoint)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    logfile = '{}.log'.format(time_id)
    logger = setup_logger(logfile=logfile, console_out=True)
    loss_func = F.nll_loss

    # Train the model
    print('training model')
    for epoch in range(1, args.epochs + 1):
        train(model, train_loader, optimizer, loss_func, epoch,
              log_interval=args.log_interval, log_func=logger.info)
        test_loss = test(model, test_loader, loss_func, log_func=logger.info)
        # Early stopper will handle saving the checkpoints
        if early_stopper(test_loss, model):
            break

    print('creating submission')
    make_submission(model, submission_set.data,
                    'submission-{}.csv'.format(time_id))
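# NOTE: hypothetical sketch of the EarlyStopping helper used above, assuming it
# checkpoints the model whenever the test loss improves and returns True once the
# loss has not improved for `patience` consecutive calls; the repo's actual
# implementation may differ.
import torch

class EarlyStopping:
    def __init__(self, patience=10, check_file='checkpoint.pt'):
        self.patience = patience
        self.check_file = check_file
        self.best_loss = float('inf')
        self.counter = 0

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss:
            # Improvement: save a checkpoint and reset the patience counter.
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.check_file)
        else:
            self.counter += 1
        return self.counter >= self.patience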
    'epoch_data': []
}

for epoch in range(1, N_EPOCHS + 1):
    ### Train ###
    trn_loss, reg_loss, trn_err = train_utils.train(
        model, train_loader, optimizer, criterion, epoch, gpu_id, True)
    print('Epoch {:d}\nTrain - Loss: {:.4f}, Reg - Loss: {:.4f}, IOU: {:.4f}'.format(
        epoch, trn_loss, reg_loss, trn_err))
    time_elapsed = time.time() - since
    print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    ### Test ###
    val_loss, val_err = train_utils.test(model, val_loader, criterion, gpu_id, 1,
                                         epoch=epoch)
    print('Val - Loss: {:.4f} | IOU: {:.4f}'.format(val_loss, val_err))
    time_elapsed = time.time() - since
    print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))

    ### Adjust Lr ###
    train_utils.adjust_learning_rate(LR, LR_DECAY, optimizer, epoch,
                                     DECAY_EVERY_N_EPOCHS)

    training_data['epoch_data'].append(
        (trn_err, val_err, time_elapsed, collect_dropout_probabilities(model)))
    with open('training_data/training_data_{}.pk'.format(model_id), 'wb') as dump_file:
        pickle.dump(training_data, dump_file)
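# NOTE: hypothetical sketch of collect_dropout_probabilities used above, assuming
# it walks the model's modules and records the current drop probability of each
# dropout layer; the repo's actual helper may differ.
import torch.nn as nn

def collect_dropout_probabilities(model):
    return [m.p for m in model.modules()
            if isinstance(m, (nn.Dropout, nn.Dropout2d))]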
def test():
    train_utils.test(model, test_loader, criterion, epoch=1)
    train_utils.view_sample_predictions(model, test_loader, n=1)