def main(args):
    global best_acc
    global best_auc

    if not os.path.exists(args.checkpoint):
        os.makedirs(args.checkpoint)

    print("==> Creating model '{}-{}', stacks={}, blocks={}, feats={}".format(
        args.netType, args.pointType, args.nStacks, args.nModules, args.nFeats))
    print("=> Models will be saved at: {}".format(args.checkpoint))

    model = models.__dict__[args.netType](num_stacks=args.nStacks,
                                          num_blocks=args.nModules,
                                          num_feats=args.nFeats,
                                          use_se=args.use_se,
                                          use_attention=args.use_attention,
                                          num_classes=68)
    model = torch.nn.DataParallel(model).cuda()

    # size_average=True in the original maps to reduction='mean' in current PyTorch.
    criterion = torch.nn.MSELoss(reduction='mean').cuda()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    title = args.checkpoint.split('/')[-1] + ' on ' + args.data.split('/')[-1]
    Loader = get_loader(args.data)

    val_loader = torch.utils.data.DataLoader(Loader(args, 'A'),
                                             batch_size=args.val_batch,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> Loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> Loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            title=title, resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Epoch', 'LR', 'Train Loss', 'Valid Loss',
                          'Train Acc', 'Val Acc', 'AUC'])

    cudnn.benchmark = True
    print('=> Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / (1024. * 1024)))

    if args.evaluation:
        print('=> Evaluation only')
        D = args.data.split('/')[-1]
        save_dir = os.path.join(args.checkpoint, D)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        loss, acc, predictions, auc = validate(val_loader, model, criterion,
                                               args.netType, args.debug,
                                               args.flip)
        save_pred(predictions, checkpoint=save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Loader(args, 'train'),
                                               batch_size=args.train_batch,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    lr = args.lr
    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, lr, args.schedule,
                                  args.gamma)
        print('=> Epoch: %d | LR %.8f' % (epoch + 1, lr))

        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, args.netType, args.debug,
                                      args.flip)

        # do not save predictions in model file
        valid_loss, valid_acc, predictions, valid_auc = validate(
            val_loader, model, criterion, args.netType, args.debug, args.flip)

        logger.append([int(epoch + 1), lr, train_loss, valid_loss, train_acc,
                       valid_acc, valid_auc])

        is_best = valid_auc >= best_auc
        best_auc = max(valid_auc, best_auc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'netType': args.netType,
                'state_dict': model.state_dict(),
                # note: the best AUC is stored under the 'best_acc' key,
                # matching what the resume branch reads back.
                'best_acc': best_auc,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            predictions,
            checkpoint=args.checkpoint,
            filename='checkpoint.pth.tar',
            snapshot=args.snapshot)

    logger.close()
    logger.plot(['AUC'])
    savefig(os.path.join(args.checkpoint, 'log.eps'))
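# NOTE: hedged sketch, not the project's actual helper. main() above assumes an
# adjust_learning_rate(optimizer, epoch, lr, schedule, gamma) that returns the
# (possibly decayed) learning rate; a minimal step-decay version consistent with
# that call signature could look like this.
def adjust_learning_rate(optimizer, epoch, lr, schedule, gamma):
    """Decay `lr` by `gamma` whenever `epoch` hits a milestone in `schedule`."""
    if epoch in schedule:
        lr *= gamma
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    return lr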
def train(loader, model, criterion, optimizer, netType, debug=False, flip=True):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    acces = AverageMeter()

    model.train()
    end = time.time()

    gt_win, pred_win = None, None
    bar = Bar('Training', max=len(loader))
    for i, (inputs, target) in enumerate(loader):
        data_time.update(time.time() - end)
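# NOTE: hedged sketch. train() above relies on an AverageMeter utility (a common
# pattern in PyTorch example repos) for tracking running averages of timing, loss,
# and accuracy; a minimal implementation consistent with how it is called:
class AverageMeter(object):
    """Tracks the most recent value, running sum, count, and average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count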
def main():
    global best_acc
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    experimentID = args.experimentID % (args.arch, args.cv)
    # args.data = args.data % ('%s', args.cv)

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)
    if not os.path.isdir(os.path.join(args.checkpoint, experimentID)):
        mkdir_p(os.path.join(args.checkpoint, experimentID))
    checkpoint_dir = os.path.join(args.checkpoint, experimentID)

    # Data loading code
    train_dataset = EnsembleDataset(args, 'train')
    train_distri = train_dataset.get_label_distri()
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.train_batch,
                                               shuffle=not args.serial_batches,
                                               num_workers=int(args.workers))

    valid_dataset = EnsembleDataset(args, 'valid')
    val_loader = torch.utils.data.DataLoader(valid_dataset,
                                             batch_size=args.test_batch,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    test_dataset = EnsembleDataset(args, 'test')
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    elif args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](
            baseWidth=args.base_width,
            cardinality=args.cardinality,
        )
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if use_cuda:
        model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True
    print(' Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # define loss function (criterion) and optimizer
    print(train_distri)
    criterion = focalloss(gamma=10,
                          label_distri=train_distri,
                          model_name=args.arch,
                          cuda_a=use_cuda)
    # criterion = nn.CrossEntropyLoss()
    # criterion = nn.KLDivLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    # optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    if args.test is False:
        # Resume
        title = args.arch
        if args.resume:
            # Load checkpoint.
            print('==> Resuming from checkpoint..')
            checkpoint_path = os.path.join(checkpoint_dir,
                                           args.resume + '.checkpoint.pth.tar')
            print(checkpoint_path)
            assert os.path.isfile(checkpoint_path), \
                'Error: no checkpoint found at %s' % checkpoint_path
            checkpoint = torch.load(checkpoint_path)
            best_acc = checkpoint['best_acc']
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger = Logger(os.path.join(checkpoint_dir, 'log.txt'),
                            title=title, resume=True)
        else:
            logger = Logger(os.path.join(checkpoint_dir, 'log.txt'), title=title)
            logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss',
                              'Train Acc.', 'Valid Acc.'])

    if args.test:
        print('\nTest only')
        if len(args.resume) > 0:
            print('load %s-th checkpoint' % args.resume)
            checkpoint_path = os.path.join(checkpoint_dir,
                                           args.resume + '.checkpoint.pth.tar')
        else:
            print('load best checkpoint')
            checkpoint_path = os.path.join(checkpoint_dir, 'model_best.pth.tar')
        print(checkpoint_path)
        assert os.path.isfile(checkpoint_path), \
            'Error: no checkpoint found at %s' % checkpoint_path
        checkpoint = torch.load(checkpoint_path)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])

        if not os.path.isdir(args.results):
            mkdir_p(args.results)
        if not os.path.isdir(os.path.join(args.results, experimentID)):
            mkdir_p(os.path.join(args.results, experimentID))
        results_dir = os.path.join(args.results, experimentID)

        test_loss, test_acc, pred_d, real_d = test(test_loader, model,
                                                   criterion, start_epoch,
                                                   use_cuda)
        with open(os.path.join(results_dir, 'result_detail.csv'), 'w') as f:
            csv_writer = csv.writer(f)
            for i in range(len(real_d)):
                # one-hot encode the ground-truth label and softmax the logits
                x = np.zeros(len(pred_d[i]))
                x[real_d[i]] = 1
                y = np.exp(pred_d[i]) / np.sum(np.exp(pred_d[i]))
                csv_writer.writerow(list(y) + list(x))
        mr = MeasureR(results_dir, test_loss, test_acc)
        mr.output()
        print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc))
        return

    for epoch in range(start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))

        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, epoch, use_cuda)
        test_loss, test_acc, _, _ = test(val_loader, model, criterion, epoch,
                                         use_cuda)
        l_loss, l_acc, _, _ = test(test_loader, model, criterion, epoch,
                                   use_cuda)
        print(train_loss, train_acc, test_acc, l_acc)

        # append logger file
        logger.append([state['lr'], train_loss.cpu(), test_loss.cpu(),
                       train_acc.cpu(), test_acc.cpu()])

        # save model
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        if is_best or epoch % args.checkpoint_saved_n == 0:
            save_checkpoint({
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'acc': test_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, epoch, is_best, checkpoint=checkpoint_dir)

    logger.close()
    logger.plot()
    savefig(os.path.join(checkpoint_dir, 'log.png'))

    print('Best acc:')
    print(best_acc)
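# NOTE: hedged sketch. The calls above assume a save_checkpoint(state, epoch, is_best,
# checkpoint=...) helper that writes '<epoch>.checkpoint.pth.tar' and copies the best
# model to 'model_best.pth.tar', matching the paths the resume/test branches load.
import os
import shutil

import torch


def save_checkpoint(state, epoch, is_best, checkpoint='checkpoint'):
    """Persist the training state and keep a copy of the best model so far."""
    filepath = os.path.join(checkpoint, '%s.checkpoint.pth.tar' % epoch)
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'model_best.pth.tar'))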
def main():
    global best_acc
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    experimentID = args.experimentID % args.arch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)
    if not os.path.isdir(os.path.join(args.checkpoint, experimentID)):
        mkdir_p(os.path.join(args.checkpoint, experimentID))
    checkpoint_dir = os.path.join(args.checkpoint, experimentID)

    # Data loading code
    train_dataset = XrayDataset(args, 'train')
    train_distri = train_dataset.get_label_distri()
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.train_batch,
                                               shuffle=False,
                                               num_workers=int(args.workers))

    valid_dataset = XrayDataset(args, 'valid')
    val_loader = torch.utils.data.DataLoader(valid_dataset,
                                             batch_size=args.test_batch,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    test_dataset = XrayDataset(args, 'test')
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)

    # loaders = [(test_loader, 'test')]
    loaders = [(train_loader, 'train'), (val_loader, 'valid'),
               (test_loader, 'test')]

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        if args.arch.startswith('resnext'):
            model = models.__dict__[args.arch](
                baseWidth=args.base_width,
                cardinality=args.cardinality,
            )
        else:
            model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # Replace the final classification layer with a 4-way output head.
    if args.arch == 'vgg19_bn':
        model.classifier[6] = torch.nn.Linear(4096, 4, bias=True)
    elif args.arch == 'inception_v3':
        model.fc = torch.nn.Linear(2048, 4, bias=True)
    elif args.arch == 'resnext101_32x8d':
        model.fc = torch.nn.Linear(2048, 4, bias=True)
    elif args.arch == 'alexnet':
        model.classifier[6] = torch.nn.Linear(model.classifier[6].in_features,
                                              4, bias=True)
    elif args.arch == 'resnet18':
        model.fc = torch.nn.Linear(512, 4, bias=True)
    elif args.arch == 'resnet50':
        model.fc = torch.nn.Linear(2048, 4, bias=True)
    elif args.arch == 'resnet101':
        model.fc = torch.nn.Linear(2048, 4, bias=True)
    elif args.arch == 'resnet152':
        model.fc = torch.nn.Linear(2048, 4, bias=True)
    elif args.arch == 'densenet121':
        model.classifier = torch.nn.Linear(1024, 4, bias=True)
    elif args.arch == 'densenet161':
        model.classifier = torch.nn.Linear(2208, 4, bias=True)

    if use_cuda:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True
    print(' Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # define loss function (criterion) and optimizer
    # criterion = focalloss(label_distri=train_distri, model_name=args.arch)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)

    if args.test is False:
        # Resume
        title = args.arch
        if args.resume:
            # Load checkpoint.
            print('==> Resuming from checkpoint..')
            checkpoint_path = os.path.join(checkpoint_dir,
                                           args.resume + '.checkpoint.pth.tar')
            print(checkpoint_path)
            assert os.path.isfile(checkpoint_path), \
                'Error: no checkpoint found at %s' % checkpoint_path
            checkpoint = torch.load(checkpoint_path)
            best_acc = checkpoint['best_acc']
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger = Logger(os.path.join(checkpoint_dir, 'log.txt'),
                            title=title, resume=True)
        else:
            logger = Logger(os.path.join(checkpoint_dir, 'log.txt'), title=title)
            logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss',
                              'Train Acc.', 'Valid Acc.'])

    if args.test:
        print('\nTest only')
        if len(args.resume) > 0:
            print('load %s-th checkpoint' % args.resume)
            checkpoint_path = os.path.join(checkpoint_dir,
                                           args.resume + '.checkpoint.pth.tar')
        else:
            print('load best checkpoint')
            checkpoint_path = os.path.join(checkpoint_dir, 'model_best.pth.tar')
        print(checkpoint_path)
        assert os.path.isfile(checkpoint_path), \
            'Error: no checkpoint found at %s' % checkpoint_path
        checkpoint = torch.load(checkpoint_path)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])

        if not os.path.isdir(args.results):
            mkdir_p(args.results)
        if not os.path.isdir(os.path.join(args.results, experimentID)):
            mkdir_p(os.path.join(args.results, experimentID))
        results_dir = os.path.join(args.results, experimentID)

        for loader, split in loaders:
            test_loss, test_acc, pred_d, real_d = test(loader, model,
                                                       criterion, start_epoch,
                                                       use_cuda)
            with open(os.path.join(results_dir,
                                   'result_detail_%s_%s_cv1.csv' %
                                   (args.arch, split)), 'w') as f:
                csv_writer = csv.writer(f)
                for i in range(len(real_d)):
                    # one-hot encode the ground-truth label
                    x = np.zeros(len(pred_d[i]))
                    x[real_d[i]] = 1
                    # y = np.exp(pred_d[i]) / np.sum(np.exp(pred_d[i]))
                    csv_writer.writerow(list(np.array(pred_d[i])) + list(x))
            # mr = MeasureR(results_dir, test_loss, test_acc)
            # mr.output()
            print(' Test Loss: %.8f, Test Acc: %.4f' % (test_loss, test_acc))
        return

    for epoch in range(start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))

        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, epoch, use_cuda)
        test_loss, test_acc, _, _ = test(val_loader, model, criterion, epoch,
                                         use_cuda)
        l_loss, l_acc, _, _ = test(test_loader, model, criterion, epoch,
                                   use_cuda)
        print(train_loss, train_acc, test_acc, l_acc)

        # append logger file
        logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc])

        # save model
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        if epoch % args.checkpoint_saved_n == 0:
            save_checkpoint({
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'acc': test_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, epoch, is_best, checkpoint=checkpoint_dir)

    logger.close()
    logger.plot()
    savefig(os.path.join(checkpoint_dir, 'log.eps'))

    print('Best acc:')
    print(best_acc)
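# NOTE: hedged sketch. The training loops above call adjust_learning_rate(optimizer,
# epoch) and then read the new rate from a global `state` dict. Assuming `state`,
# args.schedule, and args.gamma follow the usual pytorch-classification convention,
# the helper could look like this (the actual project helper may differ).
def adjust_learning_rate(optimizer, epoch):
    """Decay state['lr'] by args.gamma at the epochs listed in args.schedule."""
    global state
    if epoch in args.schedule:
        state['lr'] *= args.gamma
        for param_group in optimizer.param_groups:
            param_group['lr'] = state['lr']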
def main(args):
    # Seed
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(args.seed)

    if args.featurize_mode:
        msg = "To perform featurization, use evaluation mode"
        assert args.evaluate and args.evaluate_video, msg
        msg = (
            f"Until we fully understand the implications of multi-worker caching, we "
            f"should avoid using multiple workers (requested {args.workers})")
        assert args.workers <= 1, msg

    # create checkpoint dir
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Overload print statement to log to file
    setup_verbose_logging(Path(args.checkpoint))
    logger_name = "train" if not args.evaluate else "eval"
    plog = logging.getLogger(logger_name)

    opts.print_args(args)
    opts.save_args(args, save_folder=args.checkpoint)

    if not args.debug:
        plt.switch_backend("agg")

    # create model
    plog.info(f"==> creating model '{args.arch}', out_dim={args.num_classes}")
    if args.arch == "InceptionI3d":
        model = models.__dict__[args.arch](
            num_classes=args.num_classes,
            spatiotemporal_squeeze=True,
            final_endpoint="Logits",
            name="inception_i3d",
            in_channels=3,
            dropout_keep_prob=0.5,
            num_in_frames=args.num_in_frames,
            include_embds=args.include_embds,
        )
        if args.save_features:
            msg = "Set --include_embds 1 to save_features"
            assert args.include_embds, msg
    elif args.arch == "Pose2Sign":
        model = models.Pose2Sign(num_classes=args.num_classes)
    else:
        model = models.__dict__[args.arch](num_classes=args.num_classes)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Adjust options for multi-GPU training. Note that we also scale the learning
    # rate with the number of GPUs. This if-statement could technically be removed,
    # but it is left in to make the change explicit.
    if args.num_gpus > 1:
        num_gpus = torch.cuda.device_count()
        msg = f"Requested {args.num_gpus}, but {num_gpus} were visible"
        assert num_gpus == args.num_gpus, msg
        args.train_batch = args.train_batch * args.num_gpus
        args.test_batch = args.test_batch * args.num_gpus
        device_ids = list(range(args.num_gpus))
        args.lr = args.lr * args.num_gpus
    else:
        device_ids = [0]

    model = torch.nn.DataParallel(model, device_ids=device_ids)
    model = model.to(device)

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # optionally resume from a checkpoint
    tic = time.time()
    title = f"{args.datasetname} - {args.arch}"
    if args.resume:
        if os.path.isfile(args.resume):
            plog.info(f"=> loading checkpoint '{args.resume}'")
            checkpoint = load_checkpoint(args.resume)
            model.load_state_dict(checkpoint["state_dict"])
            optimizer.load_state_dict(checkpoint["optimizer"])
            args.start_epoch = checkpoint["epoch"]
            plog.info(
                f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']})"
            )
            logger = Logger(os.path.join(args.checkpoint, "log.txt"),
                            title=title, resume=True)
            del checkpoint
        else:
            plog.info(f"=> no checkpoint found at '{args.resume}'")
            raise ValueError(f"Checkpoint not found at {args.resume}!")
    else:
        logger = Logger(os.path.join(args.checkpoint, "log.txt"), title=title)
        logger_names = ["Epoch", "LR", "train_loss", "val_loss"]
        for p in range(0, args.nloss - 1):
            logger_names.append("train_loss%d" % p)
            logger_names.append("val_loss%d" % p)
        for p in range(args.nperf):
            logger_names.append("train_perf%d" % p)
            logger_names.append("val_perf%d" % p)
        logger.set_names(logger_names)

    if args.pretrained:
        load_checkpoint_flexible(model, optimizer, args, plog)

    param_count = humanize.intword(sum(p.numel() for p in model.parameters()))
    plog.info(f"  Total params: {param_count}")
    duration = time.strftime("%Hh%Mm%Ss", time.gmtime(time.time() - tic))
    plog.info(f"Loaded parameters for model in {duration}")

    mdl = MultiDataLoader(
        train_datasets=args.datasetname,
        val_datasets=args.datasetname,
    )
    train_loader, val_loader, meanstd = mdl._get_loaders(args)
    train_mean, train_std, val_mean, val_std = meanstd

    save_feature_dir = args.checkpoint
    save_fig_dir = Path(args.checkpoint) / "figs"
    if args.featurize_mode:
        save_feature_dir = Path(args.checkpoint) / "filtered" / args.featurize_mask
        save_feature_dir.mkdir(exist_ok=True, parents=True)
        save_fig_dir = Path(args.checkpoint) / "figs" / args.featurize_mask
        save_fig_dir.mkdir(exist_ok=True, parents=True)

    # Define criterion
    criterion = torch.nn.CrossEntropyLoss(reduction="mean")
    criterion = criterion.to(device)

    if args.evaluate or args.evaluate_video:
        plog.info("\nEvaluation only")
        loss, acc = do_epoch(
            "val",
            val_loader,
            model,
            criterion,
            num_classes=args.num_classes,
            debug=args.debug,
            checkpoint=args.checkpoint,
            mean=val_mean,
            std=val_std,
            feature_dim=args.feature_dim,
            save_logits=True,
            save_features=args.save_features,
            num_figs=args.num_figs,
            topk=args.topk,
            save_feature_dir=save_feature_dir,
            save_fig_dir=save_fig_dir,
        )
        if args.featurize_mode:
            plog.info("Featurizing without metric evaluation")
            return
        # Summarize/save results
        evaluate.evaluate(args, val_loader.dataset, plog)
        logger_epoch = [0, 0]
        # Fill both the train and val columns with the evaluation values.
        for p in range(len(loss)):
            logger_epoch.append(float(loss[p].avg))
            logger_epoch.append(float(loss[p].avg))
        for p in range(len(acc)):
            logger_epoch.append(float(acc[p].avg))
            logger_epoch.append(float(acc[p].avg))
        # append logger file
        logger.append(logger_epoch)
        return

    lr = args.lr
    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, lr, args.schedule,
                                  args.gamma, num_gpus=args.num_gpus)
        plog.info("\nEpoch: %d | LR: %.8f" % (epoch + 1, lr))

        # train for one epoch
        train_loss, train_perf = do_epoch(
            "train",
            train_loader,
            model,
            criterion,
            epochno=epoch,
            optimizer=optimizer,
            num_classes=args.num_classes,
            debug=args.debug,
            checkpoint=args.checkpoint,
            mean=train_mean,
            std=train_std,
            feature_dim=args.feature_dim,
            save_logits=False,
            save_features=False,
            num_figs=args.num_figs,
            topk=args.topk,
            save_feature_dir=save_feature_dir,
            save_fig_dir=save_fig_dir,
        )

        # evaluate on validation set
        valid_loss, valid_perf = do_epoch(
            "val",
            val_loader,
            model,
            criterion,
            epochno=epoch,
            num_classes=args.num_classes,
            debug=args.debug,
            checkpoint=args.checkpoint,
            mean=val_mean,
            std=val_std,
            feature_dim=args.feature_dim,
            save_logits=False,
            save_features=False,
            num_figs=args.num_figs,
            topk=args.topk,
            save_feature_dir=save_feature_dir,
            save_fig_dir=save_fig_dir,
        )

        logger_epoch = [epoch + 1, lr]
        for p in range(len(train_loss)):
            logger_epoch.append(float(train_loss[p].avg))
            logger_epoch.append(float(valid_loss[p].avg))
        for p in range(len(train_perf)):
            logger_epoch.append(float(train_perf[p].avg))
            logger_epoch.append(float(valid_perf[p].avg))

        # append logger file
        logger.append(logger_epoch)

        # save checkpoint
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            },
            checkpoint=args.checkpoint,
            snapshot=args.snapshot,
        )

    plt.clf()
    plt.subplot(121)
    logger.plot(["train_loss", "val_loss"])
    plt.subplot(122)
    logger.plot(["train_perf0", "val_perf0"])
    savefig(os.path.join(args.checkpoint, "log.pdf"))
    logger.close()
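# NOTE: hedged sketch of the Logger interface the scripts above rely on
# (constructor taking title/resume, plus set_names, append, plot, close writing a
# tab-separated log.txt). The project's own Logger may differ; this is only a
# minimal text-file implementation consistent with that usage.
import matplotlib.pyplot as plt


class Logger(object):
    def __init__(self, fpath, title=None, resume=False):
        self.title = title
        self.names = []
        self.numbers = {}
        if resume:
            # Recover the column names and any previously logged rows.
            with open(fpath, 'r') as f:
                self.names = f.readline().rstrip().split('\t')
                self.numbers = {name: [] for name in self.names}
                for line in f:
                    for name, num in zip(self.names, line.rstrip().split('\t')):
                        self.numbers[name].append(float(num))
        self.file = open(fpath, 'a' if resume else 'w')

    def set_names(self, names):
        self.names = list(names)
        self.numbers = {name: [] for name in self.names}
        self.file.write('\t'.join(self.names) + '\n')
        self.file.flush()

    def append(self, numbers):
        for name, num in zip(self.names, numbers):
            self.numbers[name].append(num)
        self.file.write('\t'.join('{0:.6f}'.format(float(n)) for n in numbers) + '\n')
        self.file.flush()

    def plot(self, names=None):
        names = self.names if names is None else names
        for name in names:
            plt.plot(range(len(self.numbers[name])), self.numbers[name], label=name)
        plt.legend()
        plt.grid(True)

    def close(self):
        self.file.close()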