def get_xp(args, optimizer):
    # various useful information to store
    args.command_line = 'python ' + ' '.join(sys.argv)
    args.pid = os.getpid()
    args.cwd = os.getcwd()
    args.hostname = socket.gethostname()

    xp = logger.Experiment(args.xp_name,
                           use_visdom=args.visdom,
                           visdom_opts={'server': 'http://localhost',
                                        'port': args.port},
                           time_indexing=False, xlabel='Epoch')
    xp.SumMetric(name='epoch', to_plot=False)
    xp.AvgMetric(name='error', tag='train')
    xp.AvgMetric(name='error', tag='val')
    xp.AvgMetric(name='error', tag='test')
    xp.TimeMetric(name='timer', tag='train')
    xp.TimeMetric(name='timer', tag='val')
    xp.TimeMetric(name='timer', tag='test')
    xp.AvgMetric(name='obj', tag='train')
    xp.log_config(vars(args))
    return xp
def get_xp(args, model, optimizer):
    # various useful information to store
    args.command_line = 'python ' + ' '.join(sys.argv)
    args.pid = os.getpid()
    args.cwd = os.getcwd()
    args.hostname = socket.gethostname()

    xp = logger.Experiment(args.xp_name,
                           use_visdom=args.visdom,
                           visdom_opts={'server': args.server,
                                        'port': args.port},
                           time_indexing=False, xlabel='Epoch')
    xp.SumMetric(name='epoch', to_plot=False)
    xp.AvgMetric(name='acc', tag='train')
    xp.AvgMetric(name='acc', tag='val')
    xp.AvgMetric(name='acc', tag='test')
    xp.BestMetric(name='acc', tag='valbest')
    xp.TimeMetric(name='timer', tag='train')
    xp.TimeMetric(name='timer', tag='val')
    xp.TimeMetric(name='timer', tag='test')
    xp.AvgMetric(name='loss', tag='train')
    xp.AvgMetric(name='obj', tag='train')
    xp.AvgMetric(name='reg')
    xp.log_config(vars(args))
    xp.AvgMetric(name='gamma')
    xp.SimpleMetric(name='eta')

    if args.log:
        # log at each epoch
        xp.Epoch.add_hook(
            lambda: xp.to_json('{}/results.json'.format(xp.name_and_dir)))
        # log after final evaluation on test set
        xp.Acc_Test.add_hook(
            lambda: xp.to_json('{}/results.json'.format(xp.name_and_dir)))
        # save with different names at each epoch if needed
        if args.dump_epoch:
            filename = lambda: '{}-{}/model.pkl'.format(xp.name_and_dir,
                                                        int(xp.Epoch.value))
        else:
            filename = lambda: '{}/model.pkl'.format(xp.name_and_dir)
        xp.Epoch.add_hook(lambda: save_state(model, optimizer, filename()))
        # save results and model for best validation performance
        xp.Acc_Valbest.add_hook(
            lambda: xp.to_json('{}/best_results.json'.format(xp.name_and_dir)))
        xp.Acc_Valbest.add_hook(
            lambda: save_state(model, optimizer,
                               '{}/best_model.pkl'.format(xp.name_and_dir)))
    return xp
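# Minimal usage sketch for the get_xp variant above (not part of the original file):
# `train_epoch` and `evaluate` are hypothetical helpers, `xp.Epoch.update(1)` assumes
# the SumMetric accumulates an epoch counter, and the exact moment at which the
# registered hooks fire depends on the logger library; only the metric update and
# log_with_tag calls mirror the API used elsewhere in these snippets.
def run_training(args, model, optimizer):
    xp = get_xp(args, model, optimizer)
    for _ in range(args.epochs):
        train_acc, train_loss, train_obj = train_epoch(model, optimizer)  # hypothetical
        val_acc = evaluate(model, split='val')                            # hypothetical
        xp.Epoch.update(1)               # assumed: counts epochs
        xp.Acc_Train.update(train_acc)
        xp.Loss_Train.update(train_loss)
        xp.Obj_Train.update(train_obj)
        xp.Acc_Val.update(val_acc)
        xp.Acc_Valbest.update(val_acc)   # BestMetric keeps the best validation accuracy
        # store the values (and send them to visdom), then reset the accumulators;
        # the add_hook callbacks above are expected to dump results.json / model.pkl here
        xp.log_with_tag(tag='*', reset=True)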
def logger_from_dict(config, use_visdom=True):
    """
    Create a logger from a dictionary
    :param config: Dictionary containing the experiment hyper-parameters
    :param use_visdom: Whether to use Visdom
    :return: Logger
    """
    experiment_name = datetime.datetime.now().strftime("%Y-%m-%d-")
    ordered_config = collections.OrderedDict(sorted(config.items()))
    for key, value in ordered_config.items():
        key = helpers.to_camel_case(key.replace(' ', '_'))
        if isinstance(value, str):
            value = helpers.to_camel_case(value.replace(' ', '_'))
        if key == 'Transforms' or key == 'trainType':
            continue
        if value is not None and key is not None:
            experiment_name += "{}_{}-".format(key, value)
    # Delete last dash
    experiment_name = experiment_name[:-1]

    # Create logger
    log = logger.Experiment(name=experiment_name,
                            use_visdom=use_visdom,
                            visdom_opts={'server': 'http://localhost',
                                         'port': 8097},
                            time_indexing=False, xlabel='Epoch')
    log.log_config(config)
    # create parent metric for training metrics (easier interface)
    log.ParentWrapper(tag='train', name='parent',
                      children=[log.AvgMetric(name='loss'),
                                log.AvgMetric(name='acc')])
    # same for validation metrics (note all children inherit tag from parent)
    log.ParentWrapper(tag='val', name='parent',
                      children=[log.AvgMetric(name='loss'),
                                log.AvgMetric(name='acc')])
    # Add a best metric for the validation accuracy
    log.ParentWrapper(tag='best', name='parent',
                      children=[log.BestMetric(name='acc')])
    return log
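# Hypothetical usage of logger_from_dict: the config values and the per-epoch numbers
# are placeholders; the Parent_* attribute names follow the
# ParentWrapper(tag=..., name='parent') convention used elsewhere in these snippets.
log = logger_from_dict({'learning rate': 0.1, 'optimizer': 'sgd'})
for epoch in range(10):
    train_loss, train_acc = 0.52, 0.81   # placeholder training statistics
    val_loss, val_acc = 0.61, 0.78       # placeholder validation statistics
    log.Parent_Train.update(loss=train_loss, acc=train_acc)
    log.Parent_Train.log_and_reset()
    log.Parent_Val.update(loss=val_loss, acc=val_acc)
    log.Parent_Val.log_and_reset()
    # keep track of the best validation accuracy seen so far
    log.Parent_Best.update(acc=val_acc)
    log.Parent_Best.log_and_reset()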
def create_experiment(args):
    xp = logger.Experiment(args.out_name,
                           use_visdom=args.visdom,
                           visdom_opts=dict(server=args.server, port=args.port),
                           time_indexing=False, xlabel='Epoch')

    xp.ParentWrapper(tag='train', name='parent',
                     children=(xp.AvgMetric(name='loss'),
                               xp.AvgMetric(name='acc1'),
                               xp.AvgMetric(name='acck'),
                               xp.SimpleMetric(name='obj'),
                               xp.TimeMetric(name='timer')))
    xp.ParentWrapper(tag='val', name='parent',
                     children=(xp.AvgMetric(name='acck'),
                               xp.AvgMetric(name='acc1'),
                               xp.TimeMetric(name='timer')))
    xp.ParentWrapper(tag='test', name='parent',
                     children=(xp.AvgMetric(name='acc1'),
                               xp.AvgMetric(name='acck'),
                               xp.TimeMetric(name='timer')))

    xp.SumMetric(name='epoch', to_plot=False)
    xp.DynamicMetric(name='learning_rate')
    xp.DynamicMetric(name='temperature', to_plot=False)
    xp.DynamicMetric(name='mu', to_plot=False)

    xp.BestMetric(tag='val_best', name='acc1', mode='max')
    xp.BestMetric(tag='val_best', name='acck', mode='max')

    if args.visdom:
        xp.plotter.set_win_opts(name='acc1', opts={'title': 'Accuracy@1'})
        xp.plotter.set_win_opts(name='acck', opts={'title': 'Accuracy@k'})
        xp.plotter.set_win_opts(name='loss', opts={'title': 'Loss Function'})
        xp.plotter.set_win_opts(name='obj', opts={'title': 'Objective Function'})
        xp.plotter.set_win_opts(name='learning_rate',
                                opts={'title': 'Learning Rate'})
        xp.plotter.set_win_opts(name='Timer', opts={'title': 'Time (s) / epoch'})

    xp.log_config(vars(args))
    return xp
def init_visdom(env_name, config):
    assert type(config) == dict
    visdom_opts = {"server": 'http://localhost', "port": 8787}
    stats = logger.Experiment(env_name,
                              log_git_hash=False,
                              use_visdom=True,
                              visdom_opts=visdom_opts,
                              time_indexing=False)
    val_metrics = stats.ParentWrapper(
        tag="validation", name="parent",
        children=(stats.SimpleMetric(name='loss'),
                  stats.SimpleMetric(name="accuracy")))
    train_metrics = stats.ParentWrapper(
        tag="training", name="parent",
        children=(stats.AvgMetric(name='loss'),
                  stats.AvgMetric(name="accuracy")))
    stats.log_config(config)

    def update_metrics(loss, acc, key='train'):
        if key == 'train':
            train_metrics.update(loss=loss, accuracy=acc)
        elif key == "val":
            val_metrics.update(loss=loss, accuracy=acc)

    def log_metrics():
        stats.log_metric(train_metrics)
        stats.log_metric(val_metrics)
        train_metrics.reset()
        val_metrics.reset()

    # separate wrapper used by plot_norm below
    norm = stats.ParentWrapper(tag="norm", name="parent2",
                               children=(stats.SimpleMetric(name="norm"), ))

    def plot_norm(val):
        norm.update(norm=val)
        stats.log_metric(norm)
        norm.reset()

    return update_metrics, log_metrics, plot_norm
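# Hypothetical usage of the closures returned by init_visdom: the numeric values are
# placeholders, but the call signatures come directly from the function above.
update_metrics, log_metrics, plot_norm = init_visdom('my_env', {'lr': 0.01, 'epochs': 5})
for batch_loss, batch_acc in [(0.9, 0.55), (0.7, 0.62)]:   # stand-in for training batches
    update_metrics(batch_loss, batch_acc, key='train')
update_metrics(0.8, 0.60, key='val')
log_metrics()    # send accumulated training/validation metrics to visdom and reset them
plot_norm(3.2)   # log e.g. a gradient norm in its own window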
def set_defaults(args):
    # force no logging in debug mode
    if args.debug:
        args.visdom = False
        args.log = False
        args.out_name = '../xp/debug'
        # remove previous log in debug mode
        if os.path.exists('../xp/debug_log.txt'):
            os.remove('../xp/debug_log.txt')
    elif args.eval:
        args.visdom = False
        args.log = False
        args.out_name = '../xp/{}'.format(args.dataset)
        args.epochs = 0
        # find settings of experiment
        _xp = logger.Experiment("")
        _xp.from_json(args.load_model.replace(".pkl", ".json"))
        for k in ('topk', 'model'):
            setattr(args, k, _xp.config[k])
        if args.multiple_crops:
            args.test_batch_size = 32

    assert args.dataset in ('imagenet', 'cifar100')
    if args.dataset == 'imagenet':
        my_dict = imagenet_defaults
    if args.dataset == 'cifar100':
        my_dict = cifar100_defaults

    # set number of classes
    args.num_classes = my_dict['num_classes']

    # replace None values with default ones
    for (k, v) in my_dict.items():
        if getattr(args, k) is None:
            setattr(args, k, v)

    # store full command line in args
    args.command_line = ' '.join(sys.argv)
    # store current directory and hostname in args
    args.cwd = os.getcwd()
    args.hostname = socket.gethostname()
def create_xp(args):
    logger.set_default_indexing('increment')
    xp_name = "{}_{}_{}".format(args.opt, args.lr, args.weight_decay)
    if args.proj is not None:
        xp_name += "_proj{}".format(args.proj)
    setproctitle.setproctitle(xp_name)

    plotter = logger.Plotter(visdom_opts={'server': 'http://atlas.robots.ox.ac.uk',
                                          'port': 9006,
                                          'env': xp_name},
                             mode='automatic')
    xp = logger.Experiment(name=xp_name, plotter=plotter, track_git=True)

    # log the hyperparameters of the experiment
    xp.config.update(**vars(args))
    xp.config.record()

    xp.gamma = logger.SimpleMetric()
    xp.gamma_unclipped = logger.SimpleMetric()
    xp.epsilon = logger.SimpleMetric()

    return xp
    acck = np.random.rand() + 90
    return loss, acc1, acck


# some hyper-parameters of the experiment
lr = 0.01
n_epochs = 10

#----------------------------------------------------------
# Prepare logging
#----------------------------------------------------------

# create Experiment
xp = logger.Experiment("xp_name", use_visdom=True,
                       visdom_opts={'server': 'http://localhost', 'port': 8097},
                       time_indexing=False, xlabel='Epoch')
# log the hyperparameters of the experiment
xp.log_config({'lr': lr, 'n_epochs': n_epochs})
# create parent metric for training metrics (easier interface)
xp.ParentWrapper(tag='train', name='parent',
                 children=(xp.AvgMetric(name='loss'),
                           xp.AvgMetric(name='acc1'),
                           xp.AvgMetric(name='acck')))
# same for validation metrics (note all children inherit tag from parent)
xp.ParentWrapper(tag='val', name='parent',
                 children=(xp.AvgMetric(name='loss'),
                           xp.AvgMetric(name='acc1'),
                           xp.AvgMetric(name='acck')))
best1 = xp.BestMetric(tag="val-best", name="acc1")
bestk = xp.BestMetric(tag="val-best", name="acck")
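# A minimal sketch of the training loop that the setup above feeds into. `oracle` is a
# hypothetical stand-in for the truncated helper at the top of this snippet; only the
# Parent_Train / Parent_Val / BestMetric calls follow the API used in these snippets.
for epoch in range(n_epochs):
    # accumulate training metrics (averaged over mini-batches by AvgMetric)
    loss, acc1, acck = oracle()
    xp.Parent_Train.update(loss=loss, acc1=acc1, acck=acck)
    xp.Parent_Train.log_and_reset()

    # same for validation
    loss, acc1, acck = oracle()
    xp.Parent_Val.update(loss=loss, acc1=acc1, acck=acck)
    xp.Parent_Val.log_and_reset()

    # keep track of the best validation accuracies seen so far
    best1.update(xp.acc1_val).log()
    bestk.update(xp.acck_val).log()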
np.random.seed(opt.seed)
torch.manual_seed(opt.seed)
if opt.cuda:
    cudnn.benchmark = True
    torch.cuda.manual_seed_all(opt.seed)

# create logger
LOG_DIR = '{0}/logger'.format(opt.experiment)

# some hyperparameters we wish to save for this experiment
hyperparameters = dict(regularization=1, n_epochs=opt.epochs)
# options for the remote visualization backend
visdom_opts = dict(server='http://localhost', port=8097)
# create logger for visdom
xp = logger.Experiment('xp_name', use_visdom=True, visdom_opts=visdom_opts)
# log the hyperparameters of the experiment
xp.log_config(hyperparameters)
# create parent metric for training metrics (easier interface)
train_metrics = xp.ParentWrapper(tag='train', name='parent',
                                 children=(xp.AvgMetric(name='lossD'),
                                           xp.AvgMetric(name='lossG')))

if torch.cuda.is_available() and not opt.cuda:
    print("WARNING: You have a CUDA device, so you should probably run with --cuda")

# setup transformations
transforms = transforms.Compose([
    if len(real_paths[0]) > 0:
        sg = model.graph.subgraph(real_paths[0])
        drawer.draw(sg, vis_opts={'title': 'Real', **draw_ops},
                    weights=1.0, vis_win='Clean')
        for ce_name, ce in cost_evaluators.items():
            if ce_name.startswith('parallel'):
                n_steps = max(ce.node_steps[0].values())
                drawer.draw(sg, vis_opts={'title': 'Alloc ' + ce_name, **draw_ops},
                            weights=ce.node_steps[0],
                            colormap=drawing_utils.get_colormap(n_steps),
                            vis_win='Alloc ' + ce_name)

    drawer.draw(model.graph, vis_opts={'title': 'Mean', **draw_ops},
                weights=path_recorder.get_posterior_weights(), vis_win='mean')

    logger.info('EPOCH DONE')


if __name__ == '__main__':
    logger.info('Executing main from {}'.format(os.getcwd()))
    exp_name = 'Direct Launch'
    args = vars(argument_parser())
    vis_conf = external.get_visdom_conf()
    xp_logger = data_logger.Experiment(exp_name, use_visdom=True,
                                       visdom_opts={'server': vis_conf['url'],
                                                    'port': vis_conf['port'],
                                                    'env': args['draw_env']},
                                       time_indexing=False, xlabel='Epoch')
    main(args, xp_logger)
import logger
import os

for root, dirs, files in os.walk("./runs"):
    for file in files:
        if file.endswith('.json') and 'archive' not in root:
            file_path = os.path.join(root, file)
            print('Loading: ', file_path)
            xp = logger.Experiment('dummy_name')
            xp.from_json(file_path)
            for opts in xp.visdom_win_opts.values():
                if 'legend' in opts:
                    opts.pop('legend')
            xp.to_visdom(visdom_opts={'server': 'http://localhost', 'port': 8098})
            print('Success!')
def main(_run, nepochs, device, use_visdom, visdom_conf, n_classes,
         lambda_reward, r_beta, r_gamma, _config):
    exp_name = format_exp_name(_run._id, _config)
    if use_visdom:
        visdom_conf.update(env=exp_name)
        _run.info['visdom_server'] = "{server}:{port}/env/{env}".format(**visdom_conf)
    else:
        _run.info['visdom_server'] = "No visdom"
    _run.info['exp_name'] = exp_name
    front = _run.info['front'] = {}

    xp_logger = data_logger.Experiment(exp_name, use_visdom=use_visdom,
                                       visdom_opts=visdom_conf, time_indexing=False,
                                       xlabel='Epoch', log_git_hash=False)
    xp_logger.add_log_hook(_run.log_scalar)
    if use_visdom:
        xp_logger.plotter.windows_opts = defaultdict(lambda: dict(showlegend=True))
    viz = Visdom(**visdom_conf) if use_visdom else None

    # Dataset creation
    logger.info('### Dataset ###')
    ds, batch_first, class_w = create_dataset()
    _run.info['class_weights'] = class_w.tolist()
    confusion_matrix_opts = {'columnnames': ds['train'].dataset.ordered_class_names,
                             'rownames': ds['train'].dataset.ordered_class_names}

    # Model creation
    logger.info('### Model ###')
    adaptive_model = create_model()
    adaptive_model.loss = torch.nn.CrossEntropyLoss(weight=class_w,
                                                    reduction='none',
                                                    ignore_index=-7)
    path_recorder = PathRecorder(adaptive_model.stochastic_model)
    cost_evaluator = ComputationCostEvaluator(node_index=path_recorder.node_index,
                                              bw=False)
    # cost_evaluator = SimpleEdgeCostEvaluator(node_index=path_recorder.node_index, bw=False)
    cost_evaluator.init_costs(adaptive_model.stochastic_model)
    logger.info('Cost: {:.5E}'.format(cost_evaluator.total_cost))
    adaptive_model.to(device)

    # Optim creation
    logger.info('### Optim ###')
    optimizer, scheduler = create_optim(params=adaptive_model.get_param_groups())
    # Check the param_groups order, to be sure to get the learning rates
    # in the right order for logging
    assert [pg['name'] for pg in optimizer.param_groups] == ['arch_params', 'pred_params']

    def optim_closure(loss):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Logger creation
    splits = ['train', 'validation', 'test']
    metrics = ['classif_loss', 'arch_loss', 'reward', 'lambda_reward',
               'silence_ratio', 'accuracy', 'average_cost',
               'learning_rate_pred', 'learning_rate_arch']
    for split in splits:
        xp_logger.ParentWrapper(tag=split, name='parent',
                                children=[xp_logger.SimpleMetric(name=metric)
                                          for metric in metrics])

    train_cost_loggers = dict(
        (i, xp_logger.AvgMetric(name='train_cost', tag=name))
        for i, name in enumerate(ds['train'].dataset.ordered_class_names))
    train_cost_loggers_perc = dict(
        (i, xp_logger.AvgMetric(name='train_cost_perceived', tag=name))
        for i, name in enumerate(ds['train'].dataset.ordered_class_names))

    node_names = adaptive_model.stochastic_model.ordered_node_names
    # entropy_loggers = [xp_logger.SimpleMetric(name='entropy', tag=name) for name in node_names]
    entropy_loggers = OrderedDict(
        (i, xp_logger.SimpleMetric(name='entropy_per_node', tag=name))
        for i, name in enumerate(node_names))
    # proba_loggers = [xp_logger.SimpleMetric(name='proba', tag=name) for name in node_names]
    proba_loggers = OrderedDict(
        (i, xp_logger.SimpleMetric(name='proba_per_node', tag=name))
        for i, name in enumerate(node_names))

    val_cost_loggers = dict(
        (i, xp_logger.AvgMetric(name='val_cost', tag=name))
        for i, name in enumerate(ds['validation'].dataset.ordered_class_names))
    val_cost_loggers_perc = dict(
        (i, xp_logger.AvgMetric(name='val_cost_perceived', tag=name))
        for i, name in enumerate(ds['validation'].dataset.ordered_class_names))
    test_cost_loggers = dict(
        (i, xp_logger.AvgMetric(name='test_cost', tag=name))
        for i, name in enumerate(ds['test'].dataset.ordered_class_names))
    test_cost_loggers_perc = dict(
        (i, xp_logger.AvgMetric(name='test_cost_perceived', tag=name))
        for i, name in enumerate(ds['test'].dataset.ordered_class_names))

    if use_visdom:
        print_properties(viz, _config)
        print_properties(viz, _run.info)

    ema_reward = EMA(r_beta)  # init the exponential moving average of the reward

    for n in range(1, nepochs + 1):
        logger.info('### Starting epoch n°{} ### {}'.format(n, _run.info['visdom_server']))
        logger.info(' '.join(sys.argv))
        if scheduler:
            scheduler.step(n)
            arch_lr, pred_lr = scheduler.get_lr()
            xp_logger.Parent_Train.update(learning_rate_pred=pred_lr,
                                          learning_rate_arch=arch_lr)

        # Training
        adaptive_model.train()
        (train_cm, train_costcm, train_costcm_norm, train_cost_per_step, logs,
         train_cost_per_signal_level, train_stats) = evaluate_model(
            adaptive_model, ds['train'], batch_first, device, path_recorder,
            cost_evaluator, train_cost_loggers, train_cost_loggers_perc, n_classes,
            lambda_reward, ema_reward, r_gamma, optim_closure, name='Train')
        xp_logger.Parent_Train.update(**dict((k, v.value()[0]) for k, v in logs.items()))
        for node_idx, ent in train_stats['en'].items():
            entropy_loggers[node_idx].update(ent.value()[0])
        for node_idx, prob in train_stats['pn'].items():
            proba_loggers[node_idx].update(prob.value()[0])

        # Evaluation
        adaptive_model.eval()
        (val_cm, val_costcm, val_costcm_norm, val_cost_per_step, logs,
         val_cost_per_signal_level, val_stats) = evaluate_model(
            adaptive_model, ds['validation'], batch_first, device, path_recorder,
            cost_evaluator, val_cost_loggers, val_cost_loggers_perc, n_classes,
            lambda_reward, ema_reward, r_gamma, name='Validation')
        xp_logger.Parent_Validation.update(**dict((k, v.value()[0]) for k, v in logs.items()))

        (test_cm, test_costcm, test_costcm_norm, test_cost_per_step, logs,
         test_cost_per_signal_level, test_stats) = evaluate_model(
            adaptive_model, ds['test'], batch_first, device, path_recorder,
            cost_evaluator, test_cost_loggers, test_cost_loggers_perc, n_classes,
            lambda_reward, ema_reward, r_gamma, name='Test')
        xp_logger.Parent_Test.update(**dict((k, v.value()[0]) for k, v in logs.items()))

        if use_visdom:
            # Log entropy / probability per step
            plot_(viz, train_stats['es'], node_names,
                  f'Entropy per step {n} - Train', 'train_eps', log_func=_run.log_scalar)
            plot_(viz, train_stats['ps'], node_names,
                  f'Probability per step {n} - Train', 'train_pps', log_func=_run.log_scalar)
            try:
                viz.heatmap(train_cm, win='train_cm',
                            opts={**confusion_matrix_opts, 'title': 'Train Confusion matrix'})
                viz.heatmap(val_cm, win='val_cm',
                            opts={**confusion_matrix_opts, 'title': 'Val Confusion matrix'})
                viz.heatmap(test_cm, win='test_cm',
                            opts={**confusion_matrix_opts, 'title': 'Test Confusion matrix'})
                # viz.heatmap(train_costcm, win='train_cost_matrix',
                #             opts={**confusion_matrix_opts, 'title': 'Train cost matrix'})
                # viz.heatmap(val_costcm, win='val_cost_matrix',
                #             opts={**confusion_matrix_opts, 'title': 'Val cost matrix'})
                # viz.heatmap(test_costcm, win='test_cost_matrix',
                #             opts={**confusion_matrix_opts, 'title': 'Test cost matrix'})
                viz.heatmap(train_costcm_norm, win='train_cost_matrix_norm',
                            opts={**confusion_matrix_opts, 'title': 'Train cost matrix Normalized'})
                viz.heatmap(val_costcm_norm, win='val_cost_matrix_norm',
                            opts={**confusion_matrix_opts, 'title': 'Val cost matrix Normalized'})
                viz.heatmap(test_costcm_norm, win='test_cost_matrix_norm',
                            opts={**confusion_matrix_opts, 'title': 'Test cost matrix Normalized'})
            except ConnectionError as err:
                logger.warning('Error in heatmaps:')
                logger.warning(err)
                traceback.print_exc()

            plot_meters(viz, train_cost_per_step, 'train_cps',
                        'Cost per step {}'.format(n), win='cps', log_func=_run.log_scalar)
            plot_meters(viz, val_cost_per_step, 'val_cps', win='cps', log_func=_run.log_scalar)
            plot_meters(viz, test_cost_per_step, 'test_cps', win='cps', log_func=_run.log_scalar)
            plot_meters(viz, train_cost_per_signal_level, 'cost/sig_train',
                        'Cost per signal {}'.format(n), win='cpsig', error_bars=False,
                        log_func=_run.log_scalar)
            plot_meters(viz, val_cost_per_signal_level, 'cost/sig_val', win='cpsig',
                        error_bars=False, log_func=_run.log_scalar)
            plot_meters(viz, test_cost_per_signal_level, 'cost/sig_test', win='cpsig',
                        error_bars=False, log_func=_run.log_scalar)

        xp_logger.log_with_tag(tag='*', reset=True)

        msg = ('Losses: {:.3f}({:.3E})-{:.3f}-{:.3f}, '
               'Accuracies: {:.3f}-{:.3f}-{:.3f}, '
               'Avg cost: {:.3E}-{:.3E}-{:.3E}')
        msg = msg.format(xp_logger.classif_loss_train, xp_logger.reward_train,
                         xp_logger.classif_loss_validation, xp_logger.classif_loss_test,
                         xp_logger.accuracy_train, xp_logger.accuracy_validation,
                         xp_logger.accuracy_test, xp_logger.average_cost_train,
                         xp_logger.average_cost_validation, xp_logger.average_cost_test)
        logger.info(msg)

        pareto_data = {'cost': xp_logger.logged['average_cost_validation'].values(),
                       'acc': xp_logger.logged['accuracy_validation'].values(),
                       '_orig_': xp_logger.logged['average_cost_validation'].keys()}
        pareto = paretize_exp(pareto_data, x_name='cost', crit_name='acc')
        if n in pareto['_orig_']:
            logger.info('New on front !')
            front.update(**pareto)
            save_checkpoint(adaptive_model, ex, n)
        elif n > 0 and n % 50 == 0:
            logger.info('Checkpointing')
            save_checkpoint(adaptive_model, ex, n)

    logger.info(pareto['_orig_'])
    best_epoch = pareto['_orig_'][-1]
    logger.info('Best \tVal: {:.3f} - Test: {:.3f} (Epoch {})\n'.format(
        xp_logger.logged['accuracy_validation'][best_epoch],
        xp_logger.logged['accuracy_test'][best_epoch], best_epoch))
def train(cfg, writer, logger_old, args):
    # Setup seeds
    torch.manual_seed(cfg.get('seed', 1337))
    torch.cuda.manual_seed(cfg.get('seed', 1337))
    np.random.seed(cfg.get('seed', 1337))
    random.seed(cfg.get('seed', 1337))

    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Setup Augmentations
    augmentations = cfg['training'].get('augmentations', None)
    data_aug = get_composed_augmentations(augmentations)

    # Setup Dataloader
    data_loader = get_loader(cfg['data']['dataset'])
    data_path = cfg['data']['path']

    if isinstance(cfg['training']['loss']['superpixels'], int):
        use_superpixels = True
        cfg['data']['train_split'] = 'train_super'
        cfg['data']['val_split'] = 'val_super'
        setup_superpixels(cfg['training']['loss']['superpixels'])
    elif cfg['training']['loss']['superpixels'] is not None:
        raise Exception("cfg['training']['loss']['superpixels'] is of the wrong type")
    else:
        use_superpixels = False

    t_loader = data_loader(data_path,
                           is_transform=True,
                           split=cfg['data']['train_split'],
                           superpixels=cfg['training']['loss']['superpixels'],
                           img_size=(cfg['data']['img_rows'], cfg['data']['img_cols']),
                           augmentations=data_aug)
    v_loader = data_loader(data_path,
                           is_transform=True,
                           split=cfg['data']['val_split'],
                           superpixels=cfg['training']['loss']['superpixels'],
                           img_size=(cfg['data']['img_rows'], cfg['data']['img_cols']))

    n_classes = t_loader.n_classes
    trainloader = data.DataLoader(t_loader,
                                  batch_size=cfg['training']['batch_size'],
                                  num_workers=cfg['training']['n_workers'],
                                  shuffle=True)
    valloader = data.DataLoader(v_loader,
                                batch_size=cfg['training']['batch_size'],
                                num_workers=cfg['training']['n_workers'])

    # Setup Metrics
    running_metrics_val = runningScore(n_classes)
    running_metrics_train = runningScore(n_classes)

    # Setup Model
    model = get_model(cfg['model'], n_classes).to(device)
    model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))

    # Setup optimizer, lr_scheduler and loss function
    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {k: v for k, v in cfg['training']['optimizer'].items()
                        if k != 'name'}
    optimizer = optimizer_cls(model.parameters(), **optimizer_params)
    logger_old.info("Using optimizer {}".format(optimizer))

    scheduler = get_scheduler(optimizer, cfg['training']['lr_schedule'])
    loss_fn = get_loss_function(cfg)
    logger_old.info("Using loss {}".format(loss_fn))

    start_iter = 0
    if cfg['training']['resume'] is not None:
        if os.path.isfile(cfg['training']['resume']):
            logger_old.info("Loading model and optimizer from checkpoint '{}'".format(
                cfg['training']['resume']))
            checkpoint = torch.load(cfg['training']['resume'])
            model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            scheduler.load_state_dict(checkpoint["scheduler_state"])
            start_iter = checkpoint["epoch"]
            logger_old.info("Loaded checkpoint '{}' (iter {})".format(
                cfg['training']['resume'], checkpoint["epoch"]))
        else:
            logger_old.info("No checkpoint found at '{}'".format(cfg['training']['resume']))

    val_loss_meter = averageMeter()
    train_loss_meter = averageMeter()
    time_meter = averageMeter()
    train_len = t_loader.train_len
    val_static = 0
    best_iou = -100.0
    i = start_iter
    j = 0
    flag = True

    # Prepare logging
    xp_name = cfg['model']['arch'] + '_' + \
        cfg['training']['loss']['name'] + '_' + args.name
    xp = logger.Experiment(xp_name,
                           use_visdom=True,
                           visdom_opts={'server': 'http://localhost', 'port': 8098},
                           time_indexing=False, xlabel='Epoch')
    # log the hyperparameters of the experiment
    xp.log_config(flatten(cfg))
    # create parent metric for training metrics (easier interface)
    xp.ParentWrapper(tag='train', name='parent',
                     children=(xp.AvgMetric(name='loss'),
                               xp.AvgMetric(name='acc'),
                               xp.AvgMetric(name='acccls'),
                               xp.AvgMetric(name='fwavacc'),
                               xp.AvgMetric(name='meaniu')))
    xp.ParentWrapper(tag='val', name='parent',
                     children=(xp.AvgMetric(name='loss'),
                               xp.AvgMetric(name='acc'),
                               xp.AvgMetric(name='acccls'),
                               xp.AvgMetric(name='fwavacc'),
                               xp.AvgMetric(name='meaniu')))
    best_loss = xp.BestMetric(tag='val-best', name='loss', mode='min')
    best_acc = xp.BestMetric(tag='val-best', name='acc')
    best_acccls = xp.BestMetric(tag='val-best', name='acccls')
    best_fwavacc = xp.BestMetric(tag='val-best', name='fwavacc')
    best_meaniu = xp.BestMetric(tag='val-best', name='meaniu')

    xp.plotter.set_win_opts(name="loss", opts={'title': 'Loss'})
    xp.plotter.set_win_opts(name="acc", opts={'title': 'Micro-Average'})
    xp.plotter.set_win_opts(name="acccls", opts={'title': 'Macro-Average'})
    xp.plotter.set_win_opts(name="fwavacc", opts={'title': 'FreqW Accuracy'})
    xp.plotter.set_win_opts(name="meaniu", opts={'title': 'Mean IoU'})

    it_per_step = cfg['training']['acc_batch_size']
    eff_batch_size = cfg['training']['batch_size'] * it_per_step

    while i <= train_len * (cfg['training']['epochs']) and flag:
        for (images, labels, labels_s, masks) in trainloader:
            i += 1
            j += 1
            start_ts = time.time()
            scheduler.step()
            model.train()
            images = images.to(device)
            labels = labels.to(device)
            labels_s = labels_s.to(device)
            masks = masks.to(device)

            outputs = model(images)
            if use_superpixels:
                outputs_s, labels_s, sizes = convert_to_superpixels(outputs, labels_s, masks)
                loss = loss_fn(input=outputs_s, target=labels_s, size=sizes)
                outputs = convert_to_pixels(outputs_s, outputs, masks)
            else:
                loss = loss_fn(input=outputs, target=labels)

            # accumulate train metrics during train
            pred = outputs.data.max(1)[1].cpu().numpy()
            gt = labels.data.cpu().numpy()
            running_metrics_train.update(gt, pred)
            train_loss_meter.update(loss.item())

            if args.evaluate:
                decoded = t_loader.decode_segmap(np.squeeze(pred, axis=0))
                misc.imsave("./{}.png".format(i), decoded)
                image_save = np.transpose(np.squeeze(images.data.cpu().numpy(), axis=0),
                                          (1, 2, 0))
                misc.imsave("./{}.jpg".format(i), image_save)

            # accumulate gradients based on the accumulation batch size
            if i % it_per_step == 1 or it_per_step == 1:
                optimizer.zero_grad()
            grad_rescaling = torch.tensor(1. / it_per_step).type_as(loss)
            loss.backward(grad_rescaling)
            if (i + 1) % it_per_step == 1 or it_per_step == 1:
                optimizer.step()
                optimizer.zero_grad()

            time_meter.update(time.time() - start_ts)

            # training logs
            if (j + 1) % (cfg['training']['print_interval'] * it_per_step) == 0:
                fmt_str = "Epoch [{}/{}] Iter [{}/{:d}] Loss: {:.4f} Time/Image: {:.4f}"
                total_iter = int(train_len / eff_batch_size)
                total_epoch = int(cfg['training']['epochs'])
                current_epoch = ceil((i + 1) / train_len)
                current_iter = int((j + 1) / it_per_step)
                print_str = fmt_str.format(current_epoch, total_epoch, current_iter,
                                           total_iter, loss.item(),
                                           time_meter.avg / cfg['training']['batch_size'])
                print(print_str)
                logger_old.info(print_str)
                writer.add_scalar('loss/train_loss', loss.item(), i + 1)
                time_meter.reset()

            # end of epoch evaluation
            if (i + 1) % train_len == 0 or \
                    (i + 1) == train_len * (cfg['training']['epochs']):
                optimizer.step()
                optimizer.zero_grad()
                model.eval()
                with torch.no_grad():
                    for i_val, (images_val, labels_val, labels_val_s,
                                masks_val) in tqdm(enumerate(valloader)):
                        images_val = images_val.to(device)
                        labels_val = labels_val.to(device)
                        labels_val_s = labels_val_s.to(device)
                        masks_val = masks_val.to(device)

                        outputs = model(images_val)
                        if use_superpixels:
                            outputs_s, labels_val_s, sizes_val = convert_to_superpixels(
                                outputs, labels_val_s, masks_val)
                            val_loss = loss_fn(input=outputs_s, target=labels_val_s,
                                               size=sizes_val)
                            outputs = convert_to_pixels(outputs_s, outputs, masks_val)
                        else:
                            val_loss = loss_fn(input=outputs, target=labels_val)

                        pred = outputs.data.max(1)[1].cpu().numpy()
                        gt = labels_val.data.cpu().numpy()
                        running_metrics_val.update(gt, pred)
                        val_loss_meter.update(val_loss.item())

                writer.add_scalar('loss/val_loss', val_loss_meter.avg, i + 1)
                writer.add_scalar('loss/train_loss', train_loss_meter.avg, i + 1)
                logger_old.info("Epoch %d Val Loss: %.4f" %
                                (int((i + 1) / train_len), val_loss_meter.avg))
                logger_old.info("Epoch %d Train Loss: %.4f" %
                                (int((i + 1) / train_len), train_loss_meter.avg))

                score, class_iou = running_metrics_train.get_scores()
                print("Training metrics:")
                for k, v in score.items():
                    print(k, v)
                    logger_old.info('{}: {}'.format(k, v))
                    writer.add_scalar('train_metrics/{}'.format(k), v, i + 1)
                for k, v in class_iou.items():
                    logger_old.info('{}: {}'.format(k, v))
                    writer.add_scalar('train_metrics/cls_{}'.format(k), v, i + 1)
                xp.Parent_Train.update(loss=train_loss_meter.avg,
                                       acc=score['Overall Acc: \t'],
                                       acccls=score['Mean Acc : \t'],
                                       fwavacc=score['FreqW Acc : \t'],
                                       meaniu=score['Mean IoU : \t'])

                score, class_iou = running_metrics_val.get_scores()
                print("Validation metrics:")
                for k, v in score.items():
                    print(k, v)
                    logger_old.info('{}: {}'.format(k, v))
                    writer.add_scalar('val_metrics/{}'.format(k), v, i + 1)
                for k, v in class_iou.items():
                    logger_old.info('{}: {}'.format(k, v))
                    writer.add_scalar('val_metrics/cls_{}'.format(k), v, i + 1)
                xp.Parent_Val.update(loss=val_loss_meter.avg,
                                     acc=score['Overall Acc: \t'],
                                     acccls=score['Mean Acc : \t'],
                                     fwavacc=score['FreqW Acc : \t'],
                                     meaniu=score['Mean IoU : \t'])

                xp.Parent_Val.log_and_reset()
                xp.Parent_Train.log_and_reset()
                best_loss.update(xp.loss_val).log()
                best_acc.update(xp.acc_val).log()
                best_acccls.update(xp.acccls_val).log()
                best_fwavacc.update(xp.fwavacc_val).log()
                best_meaniu.update(xp.meaniu_val).log()
                visdir = os.path.join('runs', cfg['training']['loss']['name'],
                                      args.name, 'plots.json')
                xp.to_json(visdir)

                val_loss_meter.reset()
                train_loss_meter.reset()
                running_metrics_val.reset()
                running_metrics_train.reset()
                j = 0

                if score["Mean IoU : \t"] >= best_iou:
                    val_static = 0
                    best_iou = score["Mean IoU : \t"]
                    state = {
                        "epoch": i + 1,
                        "model_state": model.state_dict(),
                        "optimizer_state": optimizer.state_dict(),
                        "scheduler_state": scheduler.state_dict(),
                        "best_iou": best_iou,
                    }
                    save_path = os.path.join(
                        writer.file_writer.get_logdir(),
                        "{}_{}_best_model.pkl".format(cfg['model']['arch'],
                                                      cfg['data']['dataset']))
                    torch.save(state, save_path)
                else:
                    val_static += 1

            if (i + 1) == train_len * (cfg['training']['epochs']) or val_static == 10:
                flag = False
                break

    return best_iou