def main(args): """ loads the model and trial data and runs the specified experiment(s)! """ # load the project from track proj = track.Project(args.results_dir) # create the ensemble model, trial_df = load_trial(proj, args.start_epoch, args.end_epoch, args.noise_scale) # register svhn so we can load it in OOD sk.datasets.add_dataset(svhn) # run the experiment def _run(experiment): track.debug('Starting to run experiment: %s' % experiment) experiment_module = 'sgld.experiments.' + experiment runner = getattr(importlib.import_module(experiment_module), 'run') runner(model, trial_df, **vars(args)) if args.mode == 'all': for experiment in EXPERIMENTS: _run(experiment) else: _run(args.mode)
def _main(_):
    seed_all(flags.FLAGS.seed)
    if flags.FLAGS.toy:
        print('using toy data subset')
    print('found gpus {}'.format(gpus()))
    dataset_file = os.path.join(
        flags.FLAGS.dataroot, 'wikisql',
        'processed-toy{}.pth'.format(1 if flags.FLAGS.toy else 0))
    print('loading data from {}'.format(dataset_file))
    _train, _val, test = torch.load(dataset_file)

    proj = track.Project(os.getenv('TRACK_DIRECTORY'))
    initial_model_file = proj.fetch_artifact(flags.FLAGS.trial,
                                             'untrained_model.pth')
    model_file = proj.fetch_artifact(flags.FLAGS.trial, 'checkpoints/best.pth')

    print('loading initial model from {}'.format(initial_model_file))
    disable_source_code_warning()
    model = torch.load(initial_model_file)
    model = model.to(get_device())

    print('loading model parameters from {}'.format(model_file))
    state_dict = torch.load(model_file)
    model.load_state_dict(state_dict['model'])
    model = model.share_memory()

    num_workers = flags.FLAGS.workers
    print('initializing {} workers'.format(num_workers))
    with closing(SharedGPU(None, model, num_workers)) as shared:
        shared.set_mode(evaluation=True)
        print('all {} remote workers initialized'.format(num_workers))
        _do_evaluation('test', test, shared)
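# Hedged sketch (not part of the original module): the entry point above takes
# an ignored positional argument and reads flags.FLAGS, which matches the
# absl.app convention. Assuming absl is indeed the flag library in use, the
# module would be launched like this.
if __name__ == '__main__':
    from absl import app
    app.run(_main)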
def df(logroot):
    """Flattens every trial in a track project into a single DataFrame,
    joining each trial's id/hyperparameter columns onto its logged results."""
    proj = track.Project(logroot, None)
    results = []
    for _, trial in proj.ids.iterrows():
        res = proj.results([trial['trial_id']])
        # attach the trial's id/hyperparameter columns to its result rows
        for col in proj.ids.columns:
            res[col] = trial[col]
        results.append(res)
    _df = pd.concat(results)
    return _df
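# Hedged usage sketch (not part of the original module): pulls every trial of
# an experiment into one DataFrame via df() above and ranks trials by the
# final value of a logged metric. The metric name 'acc' is an assumption about
# what was logged with track.metric(); 'iteration' and 'trial_id' are the
# columns used elsewhere in this codebase.
def best_trials_by_metric(logroot, metric='acc', k=5):
    results = df(logroot)
    # keep each trial's last logged row, then take the top-k by `metric`
    final = results.sort_values('iteration').groupby('trial_id').tail(1)
    return final.nlargest(k, metric)['trial_id'].tolist()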
def run(ensemble, proj_df, results_dir='./logs', dataroot='./data',
        batch_size=128, eval_batch_size=100, cuda=False, num_workers=2,
        **unused):
    """
    this evaluates both the ensemble and the baseline model on the full
    test set

    we also evaluate each model and compute their individual losses, so that
    we can plot the variance around the ensemble's dashed horizontal line
    (see top of file)
    """
    trainloader, testloader = build_dataset('cifar10',
                                            dataroot=dataroot,
                                            batch_size=batch_size,
                                            eval_batch_size=eval_batch_size,
                                            num_workers=num_workers)
    ensemble_criterion = SoftmaxNLL()
    track.debug("[baseline] testing the ensemble on full dataset")
    ensemble_loss, ensemble_acc = test(testloader, ensemble,
                                       ensemble_criterion, epoch=-1,
                                       cuda=cuda, metric=False)

    # get the no-noise baseline evaluation
    proj = track.Project(results_dir)
    best_model, best_df = load_trial(proj, noise_scale=0.0)
    track.debug("[baseline] testing no-noise baseline model on full dataset")
    baseline_criterion = torch.nn.CrossEntropyLoss()
    baseline_loss, baseline_acc = test(testloader, best_model,
                                       baseline_criterion, epoch=-1,
                                       cuda=cuda, metric=False)

    # now, test each of the ensemble's models
    model_losses = []
    model_accs = []
    track.debug("[baseline] testing individual models on full dataset")
    for i, model in enumerate(ensemble.models):
        track.debug("[baseline] testing model %d of %d"
                    % (i + 1, len(ensemble.models)))
        model_loss, model_acc = test(testloader, model, baseline_criterion,
                                     epoch=-1, cuda=cuda, metric=False)
        model_losses.append(model_loss)
        model_accs.append(model_acc)

    # we just need to track the scalar results of this evaluation
    # we can access the baseline test *curve* from the jupyter notebook (later)
    track.metric(iteration=0,
                 ensemble_loss=ensemble_loss,
                 ensemble_acc=ensemble_acc,
                 best_baseline_loss=baseline_loss,
                 best_baseline_acc=baseline_acc,
                 model_losses=model_losses,
                 model_accs=model_accs)
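# Hedged sketch (not part of the original module): a small helper summarizing
# the per-model losses logged above into the center and half-width of the
# variance band drawn around the ensemble's dashed line; uses only the
# standard library.
import statistics

def loss_band(model_losses):
    # mean gives the band's center, population std dev its half-width
    return statistics.mean(model_losses), statistics.pstdev(model_losses)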
def proj(experimentname=None, logroot=None, s3=None, proj_dir=None):
    """
    Loads the track.Project object for this experiment directory.
    Gets the flattened dataframe.

    If proj_dir is specified, load directly from there; otherwise, load from
    logroot/experimentname. Loads from s3 if it can via track.
    """
    if not proj_dir:
        if experimentname:
            assert logroot, "must supply logroot with experiment name"
            proj_dir = os.path.join(logroot, experimentname)
    track_proj = track.Project(proj_dir, s3)
    return track_proj
def execute(experiment_fn):
    """
    Launches an experiment using the supplied `experiment_fn(args)` launcher.

    If the config is set, it will use ray to launch all experiments in
    parallel.
    """
    # Parse all arguments (default + user-supplied)
    if not _parser.val:
        supply_args()
    args = _parser.val.parse_args()

    # Launch ray if we need to.
    if args.config and args.ray:
        _launch_ray_experiments(experiment_fn, args)
        _cleanup_ray_experiments(args)
    elif args.config and not args.ray:
        with open(args.config) as f:
            # safe_load avoids PyYAML's deprecated/removed default loader
            config = yaml.safe_load(f)
        args_dict = vars(args)
        args_dict.update(config)
        _experiment(experiment_fn, args)
    # Launch a single experiment otherwise.
    else:
        _experiment(experiment_fn, args)

    # Load resulting experiment data from Track
    local = os.path.join(args.logroot, args.experimentname)
    if args.s3:
        track_remote_dir = os.path.join(args.s3, args.projectname,
                                        args.experimentname)
    else:
        track_remote_dir = None
    proj = track.Project(local, track_remote_dir)

    # Save project to a pickle in <logroot>/<experimentname>.
    if _save_proj.val:
        proj_fname = os.path.join(args.logroot, args.experimentname,
                                  args.experimentname + '.pkl')
        try:
            with open(proj_fname, 'wb') as f:
                pickle.dump(proj, f)
        except Exception as e:
            print('swallowing pickle error: {}'.format(e))

    # Launch postprocessing code.
    if _postprocess_fn.val:
        _postprocess_fn.val(proj)
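# Hedged usage sketch (not part of the original module): a minimal
# `experiment_fn(args)` that could be handed to execute() above. The metric
# name 'loss' is illustrative; a real experiment would read whatever flags
# supply_args() registered onto `args`.
def _toy_experiment(args):
    for step in range(3):
        track.debug('toy experiment step %d' % step)
        track.metric(iteration=step, loss=1.0 / (step + 1))

# Calling execute(_toy_experiment) from a script's __main__ block would parse
# the CLI flags registered by supply_args() and run (or ray-launch) the toy
# experiment above.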
def do_training(args):
    """Trains a detection model on COCO, logging per-epoch metrics and saving
    'last.tar' / 'best.tar' checkpoints through track."""
    hyperparameters = {
        'lr': args.lr,
        'epochs': args.epochs,
        'resume_from': 0,
        'coco_version': args.coco_version,  # can be either '2014' or '2017'
        'batch_size': args.batch_size,
        'weight_decay': args.weight_decay,
        'momentum': args.momentum,
        'optimizer': args.optimizer,
        'alpha': args.alpha,
        'gamma': args.gamma,
        'lcoord': args.lcoord,
        'lno_obj': args.lno_obj,
        'iou_type': tuple(int(a) for a in tuple(args.iou_type)),
        'iou_ignore_thresh': args.iou_ignore_thresh,
        'tfidf': args.tfidf,
        'idf_weights': True,
        'tfidf_col_names': ['img_freq', 'none', 'none', 'none', 'no_softmax'],
        'wasserstein': args.wasserstein,
        'inf_confidence': args.inf_confidence,
        'inf_iou_threshold': args.inf_iou_threshold,
        'augment': args.augment,
        'workers': 1,
        'pretrained': args.is_pretrained,
        'path': args.trial_id,
        'reduction': args.reduction
    }

    mode = {
        'bayes_opt': False,
        'multi_scale': args.multi_scale,
        'show_hp': args.show_hp,
        'show_output': args.show_output,
        'multi_gpu': False,
        'train_subset': args.train_subset,
        'test_subset': args.test_subset,
        'show_temp_summary': args.show_temp_summary,
        'save_summary': False
    }

    this_proj = track.Project("./logs/" + args.experimentname)
    if args.resume == 'last':
        most_recent = this_proj.ids["start_time"].nlargest(2).idxmin()
        most_recent_id = this_proj.ids["trial_id"].iloc[[most_recent]]
        PATH = os.path.join("./logs/" + args.experimentname,
                            most_recent_id.item())
        hyperparameters['path'] = os.path.join(PATH, 'last.tar')
        args.resume = most_recent_id.item()
    elif args.resume == 'best':
        ids = this_proj.ids["trial_id"]
        res = this_proj.results(ids)
        best_map = res["coco_stats:map_all"].idxmax()
        best_map_id = res["trial_id"].iloc[[best_map]]
        PATH = os.path.join("./logs/" + args.experimentname,
                            best_map_id.item())
        hyperparameters['path'] = os.path.join(PATH, 'best.tar')
        args.resume = best_map_id.item()
    else:
        PATH = os.path.join("./logs/" + args.experimentname, args.resume)
        hyperparameters['path'] = os.path.join(PATH, 'last.tar')

    coco_version = hyperparameters['coco_version']
    mAP_best = 0
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model, optimizer, hyperparameters, PATH = init_model.init_model(
        hyperparameters, mode)
    model.hp = hyperparameters
    model.mode = mode

    if type(model) is nn.DataParallel:
        inp_dim = model.module.inp_dim
    else:
        inp_dim = model.inp_dim

    if hyperparameters['augment'] > 0:
        train_dataset = Coco(partition='train',
                             coco_version=coco_version,
                             subset=mode['train_subset'],
                             transform=transforms.Compose([
                                 Augment(hyperparameters['augment']),
                                 ResizeToTensor(inp_dim)
                             ]))
    else:
        train_dataset = Coco(partition='train',
                             coco_version=coco_version,
                             subset=mode['train_subset'],
                             transform=transforms.Compose(
                                 [ResizeToTensor(inp_dim)]))

    batch_size = hyperparameters['batch_size']
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=helper.collate_fn,
                                  num_workers=hyperparameters['workers'],
                                  pin_memory=True)

    test_dataset = Coco(partition='val',
                        coco_version=coco_version,
                        subset=mode['test_subset'],
                        transform=transforms.Compose([ResizeToTensor(inp_dim)]))
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.eval_batch_size,
                                 shuffle=False,
                                 collate_fn=helper.collate_fn,
                                 num_workers=1,
                                 pin_memory=True)

    # Calculate total number of model parameters
    num_params = sum(p.numel() for p in model.parameters())
    track.metric(iteration=0, num_params=num_params)

    for epoch in range(args.epochs):
        track.debug("Starting epoch %d" % epoch)
        # args.lr = adjust_learning_rate(epoch, optimizer, args.lr, args.schedule,
        #                                args.gamma)

        outcome = train(train_dataloader, model, optimizer, epoch)
        mAP = test(test_dataloader, model, epoch, device)

        track.debug(
            'Finished epoch %d... | train loss %.3f | avg_iou %.3f '
            '| avg_conf %.3f | avg_no_conf %.3f | avg_pos %.3f '
            '| avg_neg %.5f | mAP %.5f' %
            (epoch, outcome['avg_loss'], outcome['avg_iou'],
             outcome['avg_conf'], outcome['avg_no_conf'], outcome['avg_pos'],
             outcome['avg_neg'], mAP))

        model_fname = os.path.join(track.trial_dir(), "last.tar")
        torch.save(
            {
                'model_state_dict':
                    model.module.state_dict()
                    if type(model) is nn.DataParallel else model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'avg_loss': outcome['avg_loss'],
                'avg_iou': outcome['avg_iou'],
                'avg_pos': outcome['avg_pos'],
                'avg_neg': outcome['avg_neg'],
                'avg_conf': outcome['avg_conf'],
                'avg_no_conf': outcome['avg_no_conf'],
                'mAP': mAP,
                'hyperparameters': hyperparameters
            }, model_fname)

        if mAP > mAP_best:
            mAP_best = mAP
            best_fname = os.path.join(track.trial_dir(), "best.tar")
            track.debug("New best score! Saving model")
            torch.save(
                {
                    'model_state_dict':
                        model.module.state_dict()
                        if type(model) is nn.DataParallel else model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'avg_loss': outcome['avg_loss'],
                    'avg_iou': outcome['avg_iou'],
                    'avg_pos': outcome['avg_pos'],
                    'avg_neg': outcome['avg_neg'],
                    'avg_conf': outcome['avg_conf'],
                    'avg_no_conf': outcome['avg_no_conf'],
                    'mAP': mAP,
                    'hyperparameters': hyperparameters
                }, best_fname)
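# Hedged sketch (not part of the original training script): reloading a
# checkpoint written by do_training() above. The dict keys mirror exactly what
# torch.save() stores in 'last.tar' / 'best.tar'; the path argument is
# whatever trial directory the caller wants to restore from.
def load_checkpoint(model, optimizer, checkpoint_path):
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # return the validation score and hyperparameters saved alongside the weights
    return checkpoint['mAP'], checkpoint['hyperparameters']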