def train(trainloader, model, criterion, optimizer, epoch, cuda=False, compute_step_variance=False):
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # measure data loading time
        data_time.update(time.time() - end)

        # PyTorch >= 0.4 renamed the `async` kwarg to `non_blocking`; see
        # https://github.com/quark0/darts/pull/25
        if cuda:
            inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)

        # compute output
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # plot progress
        progress_str = 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\
            % (losses.avg, top1.avg, top1.sum, top1.count)
        progress_bar(batch_idx, len(trainloader), progress_str)
        iteration = epoch * len(trainloader) + batch_idx
        track.metric(iteration=iteration, epoch=epoch,
                     avg_train_loss=losses.avg,
                     avg_train_acc=top1.avg,
                     cur_train_loss=loss.item(),
                     cur_train_acc=prec1.item())
    return (losses.avg, top1.avg)
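# `AverageMeter` is used throughout but not defined in this section; a minimal
# sketch of the usual running-average helper (an assumption, not necessarily
# this project's exact implementation -- it just needs .update/.avg/.sum/.count):
class AverageMeter(object):
    """Tracks the running average of a scalar metric."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # `val` is a per-batch average, so weight it by the batch size `n`
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count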
def eval_model(model, env, y_placeholder, obs_placeholder, attack_method,
               attack_ord=2, num_rollouts=5, eps=0.1, trial_num=0,
               render=False, alg_name='ERROR', env_name='ERROR'):
    # cleverhans needs the logits tensor, but expects you to run through and
    # recompute it for the given observation, even though the graph is
    # already created
    cleverhans_model = CallableModelWrapper(lambda o: y_placeholder, "logits")
    attack = ATTACKS[attack_method](cleverhans_model)
    attack_params = {'eps': eps, 'ord': attack_ord}

    # build the attack op once, outside the rollout loop; calling
    # attack.generate() every step keeps adding ops to the TF graph
    attack_op = attack.generate(obs_placeholder, **attack_params)

    # we'll keep tracking metrics here
    prev_done_step = 0
    stats = {}
    rewards = []
    stats['eval_step'] = 0
    stats['episode'] = 0
    stats['episode_reward'] = 0.

    obs = env.reset()
    num_episodes = 0
    while num_episodes < num_rollouts:
        # the attack_op tensor generates the perturbed state
        adv_obs = attack_op.eval({obs_placeholder: obs[None, :]})
        action = model(adv_obs)[0]

        # it's time for my child to act out in this adversarial world
        obs, rew, done, _ = env.step(action)
        reward = rew[0] if isinstance(env, VecEnv) else rew
        if render:
            env.render()
        done = done.any() if isinstance(done, np.ndarray) else done

        # let's get our metrics
        stats['eval_step'] += 1
        stats['episode_reward'] += reward
        stats['episode_len'] = stats['eval_step'] + prev_done_step

        if done:
            rewards.append(stats['episode_reward'])
            obs = env.reset()
            prev_done_step = stats['eval_step']
            track.debug("Finished episode %d!" % (stats['episode']))
            stats['episode'] += 1
            stats['episode_reward'] = 0
            stats['eval_step'] = 0
            num_episodes += 1

        # track metrics to access later through pandas
        track.metric(iteration=stats['eval_step'] + prev_done_step,
                     trial_num=trial_num, **stats)
    env.close()
    np.save('./data/{0}_{1}_{2}_{3}_{4}.npy'.format(alg_name, env_name,
                                                    attack_method,
                                                    attack_ord, eps), rewards)
    print('REWARDS', rewards)
    return stats  # gimme the final stats for the episode
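# `ATTACKS` is assumed to map method names to cleverhans attack classes that
# accept a wrapped model and expose .generate(x, eps=..., ord=...). A
# hypothetical registry along those lines (the key names are an assumption;
# the classes are real cleverhans v2/v3 attacks):
from cleverhans.attacks import FastGradientMethod, MadryEtAl

ATTACKS = {
    'fgsm': FastGradientMethod,  # single-step gradient attack
    'pgd': MadryEtAl,            # iterated FGSM with random restarts
}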
def run(ensemble, proj_df, results_dir='./logs', dataroot='./data',
        batch_size=128, eval_batch_size=100, cuda=False, num_workers=2,
        **unused):
    """
    this evaluates both the ensemble and the baseline model on the full
    test set

    we also evaluate each model and compute their individual losses, so that
    we can plot the variance around the ensemble's dashed horizontal line
    (see top of file)
    """
    trainloader, testloader = build_dataset('cifar10',
                                            dataroot=dataroot,
                                            batch_size=batch_size,
                                            eval_batch_size=eval_batch_size,
                                            num_workers=num_workers)
    ensemble_criterion = SoftmaxNLL()
    track.debug("[baseline] testing the ensemble on full dataset")
    ensemble_loss, ensemble_acc = test(testloader, ensemble,
                                       ensemble_criterion, epoch=-1,
                                       cuda=cuda, metric=False)

    # get the no-noise baseline evaluation
    proj = track.Project(results_dir)
    best_model, best_df = load_trial(proj, noise_scale=0.0)
    track.debug("[baseline] testing no-noise baseline model on full dataset")
    baseline_criterion = torch.nn.CrossEntropyLoss()
    baseline_loss, baseline_acc = test(testloader, best_model,
                                       baseline_criterion, epoch=-1,
                                       cuda=cuda, metric=False)

    # now, test each of the ensemble's models
    model_losses = []
    model_accs = []
    track.debug("[baseline] testing individual models on full dataset")
    for i, model in enumerate(ensemble.models):
        track.debug("[baseline] testing model %d of %d" %
                    (i + 1, len(ensemble.models)))
        model_loss, model_acc = test(testloader, model, baseline_criterion,
                                     epoch=-1, cuda=cuda, metric=False)
        model_losses.append(model_loss)
        model_accs.append(model_acc)

    # we just need to track the scalar results of this evaluation
    # we can access the baseline test *curve* from the jupyter notebook (later)
    track.metric(iteration=0,
                 ensemble_loss=ensemble_loss,
                 ensemble_acc=ensemble_acc,
                 best_baseline_loss=baseline_loss,
                 best_baseline_acc=baseline_acc,
                 model_losses=model_losses,
                 model_accs=model_accs)
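# `SoftmaxNLL` is not defined in this section. Assuming the ensemble outputs
# averaged member probabilities (rather than logits), a matching criterion
# might look like this sketch -- the exact semantics here are an assumption:
import torch
import torch.nn.functional as F

class SoftmaxNLL(torch.nn.Module):
    """Negative log-likelihood of already-softmaxed ensemble probabilities."""

    def forward(self, probs, targets):
        # clamp before log for numerical safety on near-zero probabilities
        return F.nll_loss(torch.log(probs.clamp(min=1e-12)), targets)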
def do_training(args): trainloader, testloader = build_dataset( args.dataset, dataroot=args.dataroot, batch_size=args.batch_size, eval_batch_size=args.eval_batch_size, num_workers=2) model = build_model(args.arch, num_classes=num_classes(args.dataset)) if args.cuda: model = torch.nn.DataParallel(model).cuda() # Calculate total number of model parameters num_params = sum(p.numel() for p in model.parameters()) track.metric(iteration=0, num_params=num_params) if args.optimizer == 'sgd': optimizer = SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) else: optimizer = EKFAC(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, eps=args.eps, update_freq=args.update_freq) criterion = torch.nn.CrossEntropyLoss() best_acc = 0.0 for epoch in range(args.epochs): track.debug("Starting epoch %d" % epoch) args.lr = adjust_learning_rate(epoch, optimizer, args.lr, args.schedule, args.gamma) train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, args.cuda) test_loss, test_acc = test(testloader, model, criterion, epoch, args.cuda) track.debug('Finished epoch %d... | train loss %.3f | train acc %.3f ' '| test loss %.3f | test acc %.3f' % (epoch, train_loss, train_acc, test_loss, test_acc)) # Save model model_fname = os.path.join(track.trial_dir(), "model{}.ckpt".format(epoch)) torch.save(model, model_fname) if test_acc > best_acc: best_acc = test_acc best_fname = os.path.join(track.trial_dir(), "best.ckpt") track.debug("New best score! Saving model") torch.save(model, best_fname)
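# `adjust_learning_rate` is assumed to apply step decay at the epochs listed
# in `schedule`, scaling by `gamma`, and to return the (possibly updated) lr
# so the caller can reassign args.lr. A sketch under that assumption:
def adjust_learning_rate(epoch, optimizer, lr, schedule, gamma):
    if epoch in schedule:
        lr *= gamma
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    return lr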
def train(trainloader, model, criterion, optimizer, epoch):
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # measure data loading time
        data_time.update(time.time() - end)

        # `async` is a reserved word since Python 3.7; the kwarg is now
        # `non_blocking`, and Variable wrappers are unnecessary in
        # PyTorch >= 0.4
        inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)

        # compute output
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # plot progress
        progress_str = 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\
            % (losses.avg, top1.avg, top1.sum, top1.count)
        progress_bar(batch_idx, len(trainloader), progress_str)
        iteration = epoch * len(trainloader) + batch_idx
        track.metric(iteration=iteration, epoch=epoch,
                     avg_train_loss=losses.avg,
                     avg_train_acc=top1.avg,
                     cur_train_loss=loss.item(),
                     cur_train_acc=prec1.item())
    return (losses.avg, top1.avg)
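# `accuracy` is assumed to be the standard top-k precision helper from the
# PyTorch ImageNet example; reproduced here as a sketch for self-containment:
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k."""
    maxk = max(topk)
    batch_size = target.size(0)

    # top-k predicted class indices, one column per example after transpose
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res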
def do_training(args): trainloader, testloader = build_dataset( args.dataset, dataroot=args.dataroot, batch_size=args.batch_size, eval_batch_size=args.eval_batch_size, num_workers=2) model = build_model(args.arch, num_classes=num_classes(args.dataset)) if args.cuda: model = torch.nn.DataParallel(model).cuda() # Calculate total number of model parameters num_params = sum(p.numel() for p in model.parameters()) track.metric(iteration=0, num_params=num_params) num_chunks = max(1, args.batch_size // args.max_samples_per_gpu) optimizer = LARS(params=model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, eta=args.eta, max_epoch=args.epochs) criterion = torch.nn.CrossEntropyLoss() best_acc = 0.0 for epoch in range(args.epochs): track.debug("Starting epoch %d" % epoch) train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, args.cuda, num_chunks=num_chunks) test_loss, test_acc = test(testloader, model, criterion, epoch, args.cuda) track.debug('Finished epoch %d... | train loss %.3f | train acc %.3f ' '| test loss %.3f | test acc %.3f' % (epoch, train_loss, train_acc, test_loss, test_acc)) # Save model model_fname = os.path.join(track.trial_dir(), "model{}.ckpt".format(epoch)) torch.save(model, model_fname) if test_acc > best_acc: best_acc = test_acc best_fname = os.path.join(track.trial_dir(), "best.ckpt") track.debug("New best score! Saving model") torch.save(model, best_fname)
def test(testloader, model, criterion, epoch, cuda=False):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            # measure data loading time
            data_time.update(time.time() - end)

            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            # `volatile=True` Variables were removed in PyTorch >= 0.4; the
            # enclosing torch.no_grad() already disables autograd here

            # compute output
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # plot progress
            progress_str = 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\
                % (losses.avg, top1.avg, top1.sum, top1.count)
            progress_bar(batch_idx, len(testloader), progress_str)
    track.metric(iteration=0, epoch=epoch,
                 avg_test_loss=losses.avg,
                 avg_test_acc=top1.avg)
    return (losses.avg, top1.avg)
def main(args): trainloader, testloader = build_dataset( 'cifar10', dataroot=args.dataroot, batch_size=args.batch_size, eval_batch_size=args.eval_batch_size, num_workers=2) model = build_model('ResNet18', num_classes=10) criterion = torch.nn.CrossEntropyLoss() eigenvals, eigenvecs = compute_hessian_eigenthings(model, testloader, criterion, args.num_eigenthings, args.num_steps, momentum=args.momentum, use_gpu=args.cuda) print("Eigenvecs:") print(eigenvecs) print("Eigenvals:") print(eigenvals) track.metric(iteration=0, eigenvals=eigenvals)
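# `compute_hessian_eigenthings` comes from an external library. As background,
# a simplified sketch of the core idea -- power iteration on the loss Hessian
# using autograd Hessian-vector products (single batch, top eigenpair only;
# the helper name and structure here are illustrative, not the library's API):
import torch

def top_hessian_eigenpair(model, loss, num_steps=20):
    params = [p for p in model.parameters() if p.requires_grad]
    grads = torch.autograd.grad(loss, params, create_graph=True)
    v = [torch.randn_like(p) for p in params]
    norm = torch.sqrt(sum((vi ** 2).sum() for vi in v))
    v = [vi / norm for vi in v]
    eigenval = 0.0
    for _ in range(num_steps):
        # Hessian-vector product: differentiate (grad . v) w.r.t. params
        gv = sum((g * vi).sum() for g, vi in zip(grads, v))
        hv = torch.autograd.grad(gv, params, retain_graph=True)
        # Rayleigh quotient v^T H v estimates the dominant eigenvalue
        eigenval = sum((h * vi).sum() for h, vi in zip(hv, v)).item()
        norm = torch.sqrt(sum((h ** 2).sum() for h in hv))
        v = [h / norm for h in hv]
    return eigenval, v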
def run(ensemble, proj_df, dataroot='./data', batch_size=128,
        eval_batch_size=100, cuda=False, num_workers=2, **unused):
    """ let's compute that entropy baby """
    num_classes = 10  # CIFAR-10; not worth calling build_dataset just for this
    entropy_criterion = Entropy()
    ensemble.models = ensemble.models[::4]
    # iterate over all possible classes in the dataset
    for class_ind in range(num_classes):
        # build a dataset restricted to this class
        track.debug("Evaluating entropy for class id: %d" % (class_ind))
        class_trainloader, class_testloader = build_single_class_dataset(
            'cifar10',
            class_ind=class_ind,
            dataroot=dataroot,
            batch_size=batch_size,
            eval_batch_size=eval_batch_size,
            num_workers=num_workers)

        # compute the entropy of the model post-hoc as well
        entropy = test(class_testloader, ensemble, entropy_criterion,
                       epoch=-1, cuda=cuda, metric=False,
                       criterion_has_labels=False, compute_acc=False)
        track.debug("\n\n\tEntropy: %.2f" % entropy)
        track.metric(cifar_class_id=class_ind, entropy=entropy)
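# `Entropy` is called with outputs only (criterion_has_labels=False), so it
# presumably scores the predictive distribution itself. A sketch assuming the
# ensemble emits probabilities and we want their mean Shannon entropy:
import torch

class Entropy(torch.nn.Module):
    def forward(self, probs):
        # probs: (batch, classes); clamp to avoid log(0)
        p = probs.clamp(min=1e-12)
        return -(p * p.log()).sum(dim=1).mean()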
def _do_training(train, val, shared, training_state):
    batch_size = flags.FLAGS.batch_size
    loss_window = RollingAverageWindow(len(train) // 10 // batch_size)
    acc_window = RollingAverageWindow(len(train) // 10 // batch_size)
    grad_window = RollingAverageWindow(len(train) // 10 // batch_size)

    def _tqdm_postfix():
        return {
            'loss': '{:06.3f}'.format(loss_window.value()),
            'acc': '{:05.1%}'.format(acc_window.value()),
            'gradnorm': '{:08.2e}'.format(grad_window.value())
        }

    shared.set_mode(evaluation=False)
    shared.lr(training_state.lr)
    perm = np.arange(len(train))
    for epoch in range(1 + training_state.epoch, 1 + flags.FLAGS.max_epochs):
        epochfmt = intfmt(flags.FLAGS.max_epochs)
        training_state.epoch = epoch
        track.debug('begin epoch ' + epochfmt, epoch)
        # one sample at a time greatly simplifies pytorch seq2seq!
        np.random.shuffle(perm)
        samples = (train[i] for i in perm)
        with tqdm(total=len(train), postfix=_tqdm_postfix()) as progbar:
            for exs in chunkify(samples, batch_size):
                shared.zero_grad()
                loss, acc, gradnorm = shared.train(exs)
                loss_window.update(loss)
                acc_window.update(acc)
                grad_window.update(gradnorm)
                shared.step()
                progbar.update(len(exs))
                progbar.set_postfix(**_tqdm_postfix())

        shared.set_mode(evaluation=True)
        val_diagnostics = _diagnose(val, shared)
        train_diagnostics = _diagnose(train, shared, min(len(val), len(train)))
        track.metric(iteration=epoch, lr=training_state.lr)
        track.metric(iteration=epoch,
                     **{'val ' + k: v for k, v in val_diagnostics.items()})
        track.metric(iteration=epoch,
                     **{'train ' + k: v for k, v in train_diagnostics.items()})
        shared.set_mode(evaluation=False)
        val_diagnostics_str = _str_diagnostics('val', val_diagnostics)
        train_diagnostics_str = _str_diagnostics('(sampled) train',
                                                 train_diagnostics)
        track.debug('epoch ' + epochfmt + ' of ' + epochfmt + '\n{}\n{}',
                    epoch, flags.FLAGS.max_epochs, val_diagnostics_str,
                    train_diagnostics_str)

        cur_val_loss = val_diagnostics['loss (*total)']
        if cur_val_loss < training_state.best_val_loss:
            training_state.patience = training_state.initial_patience
            training_state.best_val_loss = cur_val_loss
            best_file = _checkpoint_file('best.pth')
            track.debug('updating best model into file {}', best_file)
            _save_checkpoint(best_file, shared.model, training_state)
        else:
            training_state.patience -= 1
            track.debug('val loss not improving; dropping patience')
            shared.lr(training_state.lr)

        if training_state.patience == 0:
            track.debug('out of patience, dropping lr')
            training_state.lr *= flags.FLAGS.lr_decay_rate
            training_state.patience = training_state.initial_patience

        track.debug('lr {} patience {} best val loss so far {}',
                    training_state.lr, training_state.patience,
                    training_state.best_val_loss)

        early_stop = training_state.lr < flags.FLAGS.min_lr
        if early_stop:
            track.debug(
                'lr dropped to {} < min tolerable lr {}, early stopping',
                training_state.lr, flags.FLAGS.min_lr)

        if _check_period(epoch, flags.FLAGS.persist_every) or early_stop:
            epochfmt = intfmt(flags.FLAGS.max_epochs, fill='0')
            checkpoint_file = _checkpoint_file(epochfmt.format(epoch) + '.pth')
            track.debug('persisting model to {}', checkpoint_file)
            _save_checkpoint(checkpoint_file, shared.model, training_state)

        if early_stop:
            break
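# `RollingAverageWindow` is assumed to be a fixed-size moving average over the
# last N updates; a deque-based sketch (the max(1, ...) guard handles tiny
# datasets where len(train) // 10 // batch_size could come out to zero):
from collections import deque

class RollingAverageWindow:
    def __init__(self, window_size):
        self._values = deque(maxlen=max(1, window_size))

    def update(self, x):
        self._values.append(x)

    def value(self):
        return sum(self._values) / len(self._values) if self._values else 0.0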
with track.trial(args.logroot, None, param_map=param_map):
    try:
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train()
            val_loss = evaluate(val_data)
            print('-' * 89)
            track.debug(
                '| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} '
                '| valid loss {:5.2f} | valid ppl {:8.2f}'.format(
                    epoch, (time.time() - epoch_start_time), train_loss,
                    val_loss, math.exp(val_loss)))
            print('-' * 89)
            track.metric(iteration=epoch,
                         train_loss=train_loss,
                         test_loss=val_loss)

            # Log model
            model_fname = os.path.join(track.trial_dir(),
                                       "model{}.ckpt".format(epoch))
            torch.save(model, model_fname)

            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                best_fname = os.path.join(track.trial_dir(), "best.ckpt")
                with open(best_fname, 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss
            else:
                # Anneal the learning rate if no improvement has been seen
                # in the validation dataset.
                lr /= 4.0
    except KeyboardInterrupt:
        # allow ctrl-C to cut training short without losing the logs so far
        track.debug('Exiting from training early')
def do_training(args): hyperparameters = { 'lr': args.lr, 'epochs': args.epochs, 'resume_from': 0, 'coco_version': args.coco_version, #can be either '2014' or '2017' 'batch_size': args.batch_size, 'weight_decay': args.weight_decay, 'momentum': args.momentum, 'optimizer': args.optimizer, 'alpha': args.alpha, 'gamma': args.gamma, 'lcoord': args.lcoord, 'lno_obj': args.lno_obj, 'iou_type': tuple(int(a) for a in tuple(args.iou_type)), 'iou_ignore_thresh': args.iou_ignore_thresh, 'tfidf': args.tfidf, 'idf_weights': True, 'tfidf_col_names': ['img_freq', 'none', 'none', 'none', 'no_softmax'], 'wasserstein': args.wasserstein, 'inf_confidence': args.inf_confidence, 'inf_iou_threshold': args.inf_iou_threshold, 'augment': args.augment, 'workers': 1, 'pretrained': args.is_pretrained, 'path': args.trial_id, 'reduction': args.reduction } mode = { 'bayes_opt': False, 'multi_scale': args.multi_scale, 'show_hp': args.show_hp, 'show_output': args.show_output, 'multi_gpu': False, 'train_subset': args.train_subset, 'test_subset': args.test_subset, 'show_temp_summary': args.show_temp_summary, 'save_summary': False } this_proj = track.Project("./logs/" + args.experimentname) if (args.resume == 'last'): this_proj = track.Project("./logs/" + args.experimentname) most_recent = this_proj.ids["start_time"].nlargest(2).idxmin() most_recent_id = this_proj.ids["trial_id"].iloc[[most_recent]] PATH = os.path.join("./logs/" + args.experimentname, most_recent_id.item()) hyperparameters['path'] = os.path.join(PATH, 'last.tar') args.resume = most_recent_id.item() elif (args.resume == 'best'): ids = this_proj.ids["trial_id"] res = this_proj.results(ids) best_map = res["coco_stats:map_all"].idxmax() best_map_id = res["trial_id"].iloc[[best_map]] PATH = os.path.join("./logs/" + args.experimentname, best_map_id.item()) hyperparameters['path'] = os.path.join(PATH, 'best.tar') args.resume = best_map_id.item() else: PATH = os.path.join("./logs/" + args.experimentname, args.resume) hyperparameters['path'] = os.path.join(PATH, 'last.tar') coco_version = hyperparameters['coco_version'] mAP_best = 0 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model, optimizer, hyperparameters, PATH = init_model.init_model( hyperparameters, mode) model.hp = hyperparameters model.mode = mode if type(model) is nn.DataParallel: inp_dim = model.module.inp_dim else: inp_dim = model.inp_dim if hyperparameters['augment'] > 0: train_dataset = Coco(partition='train', coco_version=coco_version, subset=mode['train_subset'], transform=transforms.Compose([ Augment(hyperparameters['augment']), ResizeToTensor(inp_dim) ])) else: train_dataset = Coco(partition='train', coco_version=coco_version, subset=mode['train_subset'], transform=transforms.Compose( [ResizeToTensor(inp_dim)])) batch_size = hyperparameters['batch_size'] train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=helper.collate_fn, num_workers=hyperparameters['workers'], pin_memory=True) test_dataset = Coco(partition='val', coco_version=coco_version, subset=mode['test_subset'], transform=transforms.Compose([ResizeToTensor(inp_dim) ])) test_dataloader = DataLoader(test_dataset, batch_size=args.eval_batch_size, shuffle=False, collate_fn=helper.collate_fn, num_workers=1, pin_memory=True) # Calculate total number of model parameters num_params = sum(p.numel() for p in model.parameters()) track.metric(iteration=0, num_params=num_params) for epoch in range(args.epochs): track.debug("Starting epoch %d" % epoch) # args.lr = 
        #     adjust_learning_rate(epoch, optimizer, args.lr, args.schedule,
        #                          args.gamma)
        outcome = train(train_dataloader, model, optimizer, epoch)
        mAP = test(test_dataloader, model, epoch, device)
        track.debug(
            'Finished epoch %d... | train loss %.3f | avg_iou %.3f | avg_conf %.3f '
            '| avg_no_conf %.3f | avg_pos %.3f | avg_neg %.5f | mAP %.5f' %
            (epoch, outcome['avg_loss'], outcome['avg_iou'],
             outcome['avg_conf'], outcome['avg_no_conf'], outcome['avg_pos'],
             outcome['avg_neg'], mAP))

        model_fname = os.path.join(track.trial_dir(), "last.tar")
        torch.save(
            {
                'model_state_dict':
                model.module.state_dict()
                if type(model) is nn.DataParallel else model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'avg_loss': outcome['avg_loss'],
                'avg_iou': outcome['avg_iou'],
                'avg_pos': outcome['avg_pos'],
                'avg_neg': outcome['avg_neg'],
                'avg_conf': outcome['avg_conf'],
                'avg_no_conf': outcome['avg_no_conf'],
                'mAP': mAP,
                'hyperparameters': hyperparameters
            }, model_fname)

        if mAP > mAP_best:
            mAP_best = mAP
            best_fname = os.path.join(track.trial_dir(), "best.tar")
            track.debug("New best score! Saving model")
            torch.save(
                {
                    'model_state_dict':
                    model.module.state_dict()
                    if type(model) is nn.DataParallel else model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'avg_loss': outcome['avg_loss'],
                    'avg_iou': outcome['avg_iou'],
                    'avg_pos': outcome['avg_pos'],
                    'avg_neg': outcome['avg_neg'],
                    'avg_conf': outcome['avg_conf'],
                    'avg_no_conf': outcome['avg_no_conf'],
                    'mAP': mAP,
                    'hyperparameters': hyperparameters
                }, best_fname)
def test(testloader, model, epoch, device):
    # FIXME remove this and make paste_masks_in_image run on the GPU
    cpu_device = torch.device("cpu")
    batch_time = AverageMeter()
    data_time = AverageMeter()
    hyperparameters = model.hp
    confidence = hyperparameters['inf_confidence']
    iou_threshold = hyperparameters['inf_iou_threshold']

    if type(model) is nn.DataParallel:
        inp_dim = model.module.inp_dim
        pw_ph = model.module.pw_ph
        cx_cy = model.module.cx_cy
        stride = model.module.stride
    else:
        inp_dim = model.inp_dim
        pw_ph = model.pw_ph
        cx_cy = model.cx_cy
        stride = model.stride
    pw_ph = pw_ph.to(device)
    cx_cy = cx_cy.to(device)
    stride = stride.to(device)

    sys.stdout = open(os.devnull, 'w')  # silence hardcoded printing in the coco tools
    coco = coco_utils.get_coco_api_from_dataset(testloader.dataset)
    iou_types = ["bbox"]
    coco_evaluator = coco_eval.CocoEvaluator(coco, iou_types)
    sys.stdout = sys.__stdout__  # restore normal printing

    # switch to evaluate mode
    model.eval()

    end = time.time()
    with torch.no_grad():
        for batch_idx, (images, targets) in enumerate(testloader):
            # measure data loading time
            data_time.update(time.time() - end)
            images = images.to(device)
            targets2 = []
            for t in targets:
                dd = {}
                for k, v in t.items():
                    if k != 'img_size':
                        dd[k] = v.to(device)
                    else:
                        dd[k] = v
                targets2.append(dd)
            # targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            targets = targets2

            raw_pred = model(images, device)
            true_pred = util.transform(raw_pred.clone().detach(), pw_ph,
                                       cx_cy, stride)
            sorted_pred = torch.sort(true_pred[:, :, 4] *
                                     (true_pred[:, :, 5:].max(axis=2)[0]),
                                     descending=True)
            pred_mask = sorted_pred[0] > confidence
            indices = [(sorted_pred[1][e, :][pred_mask[e, :]])
                       for e in range(pred_mask.shape[0])]
            pred_final = [
                true_pred[i, indices[i], :] for i in range(len(indices))
            ]

            pred_final_coord = [
                util.get_abs_coord(pred_final[i].unsqueeze(-2))
                for i in range(len(pred_final))
            ]

            indices = [
                nms_box.nms(pred_final_coord[i][0], pred_final[i][:, 4],
                            iou_threshold) for i in range(len(pred_final))
            ]
            pred_final = [
                pred_final[i][indices[i], :] for i in range(len(pred_final))
            ]

            abs_pred_final = [
                helper.convert2_abs_xyxy(pred_final[i],
                                         targets[i]['img_size'], inp_dim)
                for i in range(len(pred_final))
            ]

            outputs = [dict() for i in range(len(abs_pred_final))]

            for i, atrbs in enumerate(abs_pred_final):
                outputs[i]['boxes'] = atrbs[:, :4]
                outputs[i]['scores'] = pred_final[i][:, 4]
                try:
                    outputs[i]['labels'] = pred_final[i][:, 5:].max(
                        axis=1)[1] + 1
                except Exception:  # the prediction tensor may be empty
                    outputs[i]['labels'] = torch.tensor([])

            outputs = [{k: v.to(cpu_device)
                        for k, v in t.items()} for t in outputs]
            res = {
                target["image_id"].item(): output
                for target, output in zip(targets, outputs)
            }
            coco_evaluator.update(res)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

    sys.stdout = open(os.devnull, 'w')  # silence hardcoded printing again
    coco_evaluator.synchronize_between_processes()
    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    metrics = coco_evaluator.get_stats()
    sys.stdout = sys.__stdout__  # restore normal printing

    coco_stats = {
        'map_all': metrics[0],
        'map@0.5': metrics[1],
        'map@0.75': metrics[2],
        'map_small': metrics[3],
        'map_med': metrics[4],
        'map_large': metrics[5],
        'recall@1': metrics[6],
        'recall@10': metrics[7],
        'recall@100': metrics[8],
        'recall@small': metrics[9],
        'recall@medium': metrics[10],
        'recall@large': metrics[11]
    }

    track.metric(iteration=0, epoch=epoch, coco_stats=coco_stats)
    return metrics[0]
def train(trainloader, model, optimizer, epoch, cuda=True): # switch to train mode model.train() hyperparameters = model.hp mode = model.mode if type(model) is nn.DataParallel: inp_dim = model.module.inp_dim pw_ph = model.module.pw_ph cx_cy = model.module.cx_cy stride = model.module.stride else: inp_dim = model.inp_dim pw_ph = model.pw_ph cx_cy = model.cx_cy stride = model.stride if cuda: pw_ph = pw_ph.cuda() cx_cy = cx_cy.cuda() stride = stride.cuda() batch_time = AverageMeter() data_time = AverageMeter() avg_loss = AverageMeter() avg_iou = AverageMeter() avg_conf = AverageMeter() avg_no_conf = AverageMeter() avg_pos = AverageMeter() avg_neg = AverageMeter() end = time.time() break_flag = 0 if mode['show_temp_summary'] == True: writer = SummaryWriter(os.path.join(track.trial_dir(), 'temp_vis/')) for batch_idx, (inputs, targets) in enumerate(trainloader): # measure data loading time data_time.update(time.time() - end) if cuda: inputs = inputs.cuda() # compute output raw_pred = model(inputs, torch.cuda.is_available()) true_pred = util.transform(raw_pred.clone().detach(), pw_ph, cx_cy, stride) iou_list = util.get_iou_list(true_pred, targets, hyperparameters, inp_dim) resp_raw_pred, resp_cx_cy, resp_pw_ph, resp_stride, no_obj = util.build_tensors( raw_pred, iou_list, pw_ph, cx_cy, stride, hyperparameters) stats = helper.get_progress_stats(true_pred, no_obj, iou_list, targets) if hyperparameters['wasserstein'] == True: no_obj = util.get_wasserstein_matrices(raw_pred, iou_list, inp_dim) try: loss = util.yolo_loss(resp_raw_pred, targets, no_obj, resp_pw_ph, resp_cx_cy, resp_stride, inp_dim, hyperparameters) except RuntimeError: print('bayes opt failed') break_flag = 1 break # measure accuracy and record loss avg_loss.update(loss.item()) avg_iou.update(stats['iou']) avg_conf.update(stats['pos_conf']) avg_no_conf.update(stats['neg_conf']) avg_pos.update(stats['pos_class']) avg_neg.update(stats['neg_class']) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if mode['show_output'] == True: # plot progress progress_str = 'Loss: %.4f | AvIoU: %.3f | AvPConf: %.3f | AvNConf: %.5f | AvClass: %.3f | AvNClass: %.5f'\ % (loss.item(), stats['iou'], stats['pos_conf'], stats['neg_conf'],stats['pos_class'],stats['neg_class']) progress_bar(batch_idx, len(trainloader), progress_str) iteration = epoch * len(trainloader) + batch_idx if mode['show_temp_summary'] == True: writer.add_scalar('AvLoss/train', avg_loss.avg, iteration) writer.add_scalar('AvIoU/train', avg_iou.avg, iteration) writer.add_scalar('AvPConf/train', avg_conf.avg, iteration) writer.add_scalar('AvNConf/train', avg_no_conf.avg, iteration) writer.add_scalar('AvClass/train', avg_pos.avg, iteration) writer.add_scalar('AvNClass/train', avg_neg.avg, iteration) track.metric(iteration=iteration, epoch=epoch, avg_train_loss=avg_loss.avg, avg_train_iou=avg_iou.avg, avg_train_conf=avg_conf.avg, avg_train_neg_conf=avg_no_conf.avg, avg_train_pos=avg_pos.avg, avg_train_neg=avg_neg.avg) outcome = { 'avg_loss': avg_loss.avg, 'avg_iou': avg_iou.avg, 'avg_pos': avg_pos.avg, 'avg_neg': avg_neg.avg, 'avg_conf': avg_conf.avg, 'avg_no_conf': avg_no_conf.avg, 'broken': break_flag } return outcome
def main(): global args, best_prec1 args = parser.parse_args() args.distributed = args.world_size > 1 if args.distributed: dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size) # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() if not args.distributed: if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() else: model.cuda() model = torch.nn.parallel.DistributedDataParallel(model) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() if args.sqrt_lr: lr = args.lr * math.sqrt(args.batch_size / 32.) else: lr = args.lr optimizer = torch.optim.SGD(model.parameters(), lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) else: train_sampler = None train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=min( args.batch_size, args.max_samples), shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler) val_loader = torch.utils.data.DataLoader(datasets.ImageFolder( valdir, transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.max_samples, shuffle=False, num_workers=args.workers, pin_memory=True) if args.evaluate: validate(val_loader, model, criterion) return with track.trial(args.logroot, None, param_map={'batch_size': args.batch_size}): for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) adjust_learning_rate(optimizer, epoch) # train for one epoch train_loss = train(train_loader, model, criterion, optimizer, epoch) # evaluate on validation set with torch.no_grad(): val_loss, prec1 = validate(val_loader, model, criterion) track.metric(iteration=epoch, train_loss=train_loss, test_loss=val_loss, prec=prec1) # Log model model_fname = os.path.join(track.trial_dir(), "model{}.ckpt".format(epoch)) torch.save(model, model_fname) # Save the model if the validation loss is the best we've seen so far. # remember best prec@1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) if is_best: best_fname = os.path.join(track.trial_dir(), "best.ckpt") with open(best_fname, 'wb') as f: torch.save(model, f)
# Save checkpoint. acc = 100.0 * correct / total if acc > best_acc: print("Saving..") state = {"net": net.state_dict(), "acc": acc, "epoch": epoch} if not os.path.isdir("checkpoint"): os.mkdir("checkpoint") ckpt_path = os.path.join(track.trial_dir(), "ckpt.pth") torch.save(state, ckpt_path) best_acc = acc test_loss = test_loss / len(testloader) return test_loss, acc, best_acc with track.trial(args.logroot, None, param_map=vars(args)): for epoch in range(start_epoch, start_epoch + 200): train_loss, train_acc = train(epoch) test_loss, test_acc, best_acc = test(epoch) track.metric( iteration=epoch, train_loss=train_loss, train_acc=train_acc, test_loss=test_loss, test_acc=test_acc, best_acc=best_acc, ) track.debug( f"epoch {epoch} finished with stats: best_acc = {best_acc} | train_acc = {train_acc} | test_acc = {test_acc} | train_loss = {train_loss} | test_loss = {test_loss}" )
def run(ensemble, trial_df, results_dir='./logs', dataroot='./data',
        class_ind=0, batch_size=128, eval_batch_size=100, cuda=False,
        num_workers=2, start_epoch=160, end_epoch=200, **unused):
    trainloader, testloader = build_dataset('cifar10',
                                            dataroot=dataroot,
                                            batch_size=batch_size,
                                            eval_batch_size=eval_batch_size,
                                            num_workers=num_workers)
    # this will only iterate over examples of one class
    class_trainloader, class_testloader = build_single_class_dataset(
        'cifar10',
        class_ind=class_ind,
        dataroot=dataroot,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_workers=num_workers)

    full_ensemble = ensemble
    track.debug("[ensemble_size] starting to test all ensembles (class = %d)"
                % class_ind)
    for i in range(len(full_ensemble.models)):
        ensemble_size = i + 1
        model_ind = len(full_ensemble.models) - 1 - i
        track.debug("[ensemble_size] starting size %d / %d ensemble" %
                    (ensemble_size, len(full_ensemble.models)))
        ensemble_loss = SoftmaxNLL()
        one_loss = CrossEntropyLoss()
        entropy_criterion = Entropy()

        ensemble = Ensemble(full_ensemble.models[model_ind:])
        single_model = full_ensemble.models[model_ind]

        # we want to do metrics for (a) the ensemble with varying sizes and
        # (b) the individual models corresponding to that epoch
        def _test_dataset(model, testloader, criterion):
            loss, acc = test(testloader, model, criterion, epoch=-1,
                             cuda=cuda, metric=False)
            # compute the entropy of the model post-hoc as well
            entropy = test(testloader, model, entropy_criterion, epoch=-1,
                           cuda=cuda, metric=False,
                           criterion_has_labels=False, compute_acc=False)
            return loss, acc, entropy

        # metrics for both models over both datasets:
        #   (a) on the whole dataset
        #       (i)  for the ensemble
        #       (ii) for the single model from this epoch
        #   (b) on a single class
        #       (i)  for the ensemble
        #       (ii) for the single model from this epoch
        stats = {}
        models = (ensemble, single_model)
        loaders = (testloader, class_testloader)
        losses = (ensemble_loss, one_loss)
        model_names = ['ensemble', 'single_model']
        loader_names = ['full', 'single_class']
        # fresh loop variables so we don't clobber the outer `i`
        for mi, li in itertools.product(range(len(models)),
                                        range(len(loaders))):
            track.debug("[ensemble size: %d] Evaluating loss/acc/entropy for "
                        "%s on %s dataset" %
                        (ensemble_size, model_names[mi], loader_names[li]))
            metric = model_names[mi] + '_' + loader_names[li]
            loss, acc, entropy = _test_dataset(models[mi], loaders[li],
                                               losses[mi])
            stats[metric + '_loss'] = loss
            stats[metric + '_acc'] = acc
            stats[metric + '_entropy'] = entropy
        track.metric(ensemble_size=ensemble_size, **stats)
def test(testloader, model, criterion, epoch, cuda=False, metric=True,
         criterion_has_labels=True, compute_acc=True):
    """
    criterion: a torch.nn loss instance.
    criterion_has_labels (bool): if true, the criterion is called as
        criterion(outputs, labels); otherwise, just criterion(outputs).

    returns (test_loss, test_acc) if compute_acc is True;
    otherwise, returns test_loss alone.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            # measure data loading time
            data_time.update(time.time() - end)

            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda()

            # compute output (the enclosing no_grad() already covers this;
            # no second nested context needed)
            outputs = model(inputs)
            if criterion_has_labels:
                loss = criterion(outputs, targets)
            else:
                loss = criterion(outputs)

            # measure accuracy and record loss
            losses.update(loss.item(), inputs.size(0))
            if compute_acc:
                prec1, prec5 = accuracy(outputs.data, targets.data,
                                        topk=(1, 5))
                top1.update(prec1.item(), inputs.size(0))
                top5.update(prec5.item(), inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # plot progress
            if compute_acc:
                progress_str = 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\
                    % (losses.avg, top1.avg, top1.sum, top1.count)
            else:
                progress_str = 'Loss: %.3f (%d/%d)'\
                    % (losses.avg, batch_idx * inputs.size(0), losses.count)
            progress_bar(batch_idx, len(testloader), progress_str)
    if metric:
        track.metric(iteration=0, epoch=epoch,
                     avg_test_loss=losses.avg,
                     avg_test_acc=top1.avg)
    if compute_acc:
        return (losses.avg, top1.avg)
    else:
        return losses.avg
def train(trainloader, model, criterion, optimizer, epoch, cuda=False,
          num_chunks=4):
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    for batch_idx, (all_inputs, all_targets) in enumerate(trainloader):
        # measure data loading time
        data_time.update(time.time() - end)

        # do mini-mini-batching for large batch sizes
        xs = all_inputs.chunk(num_chunks)
        ys = all_targets.chunk(num_chunks)
        optimizer.zero_grad()
        batch_prec1 = 0.0
        batch_loss = 0.0
        for (inputs, targets) in zip(xs, ys):
            if cuda:
                # `async=True` is a syntax error on Python >= 3.7; use
                # `non_blocking` instead
                inputs = inputs.cuda()
                targets = targets.cuda(non_blocking=True)

            # compute output
            outputs = model(inputs)
            mini_loss = criterion(outputs, targets) / num_chunks
            batch_loss += mini_loss.item()
            mini_loss.backward()

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
            batch_prec1 += prec1.item() / num_chunks
            losses.update(num_chunks * mini_loss.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))

        # the accumulated gradients now cover the full batch; take the step
        optimizer.step(epoch)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # plot progress
        progress_str = 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\
            % (losses.avg, top1.avg, top1.sum, top1.count)
        progress_bar(batch_idx, len(trainloader), progress_str)
        iteration = epoch * len(trainloader) + batch_idx
        track.metric(iteration=iteration, epoch=epoch,
                     avg_train_loss=losses.avg,
                     avg_train_acc=top1.avg,
                     cur_train_loss=batch_loss,
                     cur_train_acc=batch_prec1)
    return (losses.avg, top1.avg)
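# The chunked loop above is plain gradient accumulation: dividing each chunk's
# loss by num_chunks before backward() makes the accumulated gradient match a
# single backward pass over the whole batch (up to batch-norm statistics).
# A minimal standalone sketch of the same pattern with a vanilla optimizer
# (the LARS optimizer above additionally takes the epoch in .step(epoch);
# the helper name here is illustrative):
def accumulate_step(model, criterion, optimizer, all_inputs, all_targets,
                    num_chunks):
    optimizer.zero_grad()
    for x, y in zip(all_inputs.chunk(num_chunks),
                    all_targets.chunk(num_chunks)):
        loss = criterion(model(x), y) / num_chunks
        loss.backward()  # gradients sum across chunks
    optimizer.step()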