def main(args):
    if not os.path.isfile(MONGO_CONF_PATH):
        raise ValueError('File {} must exist'.format(MONGO_CONF_PATH))
    runs = get_runs(args.sacred_ids, args.slurm_ids, MONGO_CONF_PATH)

    results = {}
    index = None
    for r in runs:
        res = r['result']
        if index is None:
            index = res['model']
        # All runs are expected to report results for the same list of models.
        assert res['model'] == index
        results[(get_key(r['config']), r['_id'])] = res['accuracy']

    res = pd.DataFrame(results, index=index)
    res.sort_index(axis=1, inplace=True)
    res.to_clipboard()

    viz = visdom.Visdom(args.host, port=args.port)
    viz.text(res.to_html(classes=['table', 'table-bordered', 'table-hover']))
    logger.info(get_env_url(viz))
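# A minimal sketch of the argument parser this entry point assumes. Only the
# attribute names (sacred_ids, slurm_ids, host, port) are taken from the usages
# above; the defaults and help strings are assumptions.
def build_arg_parser():
    import argparse
    parser = argparse.ArgumentParser(
        description='Aggregate Sacred run results and push them to Visdom.')
    parser.add_argument('--sacred-ids', type=int, nargs='*', default=None,
                        help='Sacred run ids to fetch.')
    parser.add_argument('--slurm-ids', type=int, nargs='*', default=None,
                        help='SLURM job ids to fetch.')
    parser.add_argument('--host', default='localhost',
                        help='Visdom server host.')
    parser.add_argument('--port', type=int, default=8097,
                        help='Visdom server port.')
    return parser

# Possible usage:
# main(build_arg_parser().parse_args())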
def train_model_on_task(self, task, task_viz, exp_dir, use_ray,
                        use_ray_logging, grace_period, num_hp_samplings,
                        local_mode, redis_address, lca_n, **training_params):
    logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

    t_id = task['id']

    trainable = self.get_trainable(use_ray_logging=use_ray_logging)
    past_tasks = training_params.pop('past_tasks')
    normalize = training_params.pop('normalize')
    augment_data = training_params.pop('augment_data')

    transformations = []
    if augment_data:
        transformations.extend([
            transforms.ToPILImage(),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, 4),
            transforms.ToTensor()
        ])
    t_trans = [[] for _ in range(len(task['split_names']))]
    t_trans[0] = transformations

    datasets = trainable._load_datasets(task, task['loss_fn'], past_tasks,
                                        t_trans, normalize)
    train_loader, eval_loaders = get_classic_dataloaders(
        datasets, training_params.pop('batch_sizes'))

    model = self.get_model(task_id=t_id, x_dim=task['x_dim'],
                           n_classes=task['n_classes'],
                           descriptor=task['descriptor'],
                           dataset=eval_loaders[:2])

    if use_ray:
        if not ray.is_initialized():
            ray.init(address=redis_address)

        scheduler = None

        training_params['loss_fn'] = tune.function(training_params['loss_fn'])
        training_params['optim_func'] = tune.function(self.optim_func)

        init_model_path = os.path.join(exp_dir, 'model_initializations')
        model_file_name = '{}_init.pth'.format(training_params['name'])
        model_path = os.path.join(init_model_path, model_file_name)
        torch.save(model, model_path)

        training_params['model_path'] = model_path
        config = {**self.get_search_space(),
                  'training-params': training_params}

        if use_ray_logging:
            stop_condition = {'training_iteration': training_params['n_it_max']}
            checkpoint_at_end = False
            keep_checkpoints_num = 1
            checkpoint_score_attr = 'min-Val nll'
        else:
            stop_condition = None
            # loggers = [JsonLogger, MyCSVLogger]
            checkpoint_at_end = False
            keep_checkpoints_num = None
            checkpoint_score_attr = None

        trainable = rename_class(trainable, training_params['name'])
        experiment = Experiment(
            name=training_params['name'],
            run=trainable,
            stop=stop_condition,
            config=config,
            resources_per_trial=self.ray_resources,
            num_samples=num_hp_samplings,
            local_dir=exp_dir,
            loggers=(JsonLogger, CSVLogger),
            checkpoint_at_end=checkpoint_at_end,
            keep_checkpoints_num=keep_checkpoints_num,
            checkpoint_score_attr=checkpoint_score_attr)

        analysis = tune.run(experiment,
                            scheduler=scheduler,
                            verbose=1,
                            raise_on_failed_trial=True,
                            # max_failures=-1,
                            # with_server=True,
                            # server_port=4321
                            )

        os.remove(model_path)
        logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

        all_trials = {t.logdir: t for t in analysis.trials}
        best_logdir = analysis.get_best_logdir('Val nll', 'min')
        best_trial = all_trials[best_logdir]

        # picked_metric = 'accuracy_0'
        # metric_names = {s: '{} {}'.format(s, picked_metric) for s in
        #                 ['Train', 'Val', 'Test']}

        logger.info('Best trial: {}'.format(best_trial))
        best_res = best_trial.checkpoint.result
        best_point = (best_res['training_iteration'], best_res['Val nll'])

        # y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss']
        y_keys = ['Val nll', 'Train nll']

        epoch_key = 'training_epoch'
        it_key = 'training_iteration'
        plot_res_dataframe(analysis, training_params['name'], best_point,
                           task_viz, epoch_key, it_key, y_keys)
        if 'entropy' in next(iter(analysis.trial_dataframes.values())):
            plot_res_dataframe(analysis, training_params['name'], None,
                               task_viz, epoch_key, it_key, ['entropy'])

        best_model = self.get_model(task_id=t_id)
        best_model.load_state_dict(torch.load(best_trial.checkpoint.value))

        train_accs = analysis.trial_dataframes[best_logdir]['Train accuracy_0']
        best_t = best_res['training_iteration']
        t = best_trial.last_result['training_iteration']
    else:
        search_space = self.get_search_space()
        rand_config = list(generate_variants(search_space))[0][1]
        learner_params = rand_config.pop('learner-params', {})
        optim_params = rand_config.pop('optim')
        split_optims = training_params.pop('split_optims')

        if hasattr(model, 'set_h_params'):
            model.set_h_params(**learner_params)
        if hasattr(model, 'train_loader_wrapper'):
            train_loader = model.train_loader_wrapper(train_loader)

        loss_fn = task['loss_fn']
        if hasattr(model, 'loss_wrapper'):
            loss_fn = model.loss_wrapper(task['loss_fn'])

        prepare_batch = _prepare_batch
        if hasattr(model, 'prepare_batch_wrapper'):
            prepare_batch = model.prepare_batch_wrapper(prepare_batch, t_id)

        optim_fact = partial(set_optim_params,
                             optim_func=self.optim_func,
                             optim_params=optim_params,
                             split_optims=split_optims)
        if hasattr(model, 'train_func'):
            f = model.train_func
            t, metrics, b_state_dict = f(train_loader=train_loader,
                                         eval_loaders=eval_loaders,
                                         optim_fact=optim_fact,
                                         loss_fn=loss_fn,
                                         split_names=task['split_names'],
                                         viz=task_viz,
                                         prepare_batch=prepare_batch,
                                         **training_params)
        else:
            optim = optim_fact(model=model)
            t, metrics, b_state_dict = train(model=model,
                                             train_loader=train_loader,
                                             eval_loaders=eval_loaders,
                                             optimizer=optim,
                                             loss_fn=loss_fn,
                                             split_names=task['split_names'],
                                             viz=task_viz,
                                             prepare_batch=prepare_batch,
                                             **training_params)
        train_accs = metrics['Train accuracy_0']
        best_t = b_state_dict['iter']

        if 'training_archs' in metrics:
            plot_trajectory(model.ssn.graph, metrics['training_archs'],
                            model.ssn.stochastic_node_ids, task_viz)
            weights = model.arch_sampler().squeeze()
            archs = model.ssn.get_top_archs(weights, 5)
            list_top_archs(archs, task_viz)
            list_arch_scores(self.arch_scores[t_id], task_viz)
            update_summary(self.arch_scores[t_id], task_viz, 'scores')

    if len(train_accs) > lca_n:
        lca_accs = []
        for i in range(lca_n + 1):
            if i in train_accs:
                lca_accs.append(train_accs[i])
            else:
                logger.warning('Missing step {}/{} for LCA computation'
                               .format(i, lca_n))
        lca = np.mean(lca_accs)
    else:
        lca = float('nan')

    stats = {}
    start = time.time()
    # train_idx = task['split_names'].index('Train')
    # train_path = task['data_path'][train_idx]
    # train_dataset = _load_datasets([train_path])[0]
    train_dataset = _load_datasets(task, 'Train')[0]
    stats.update(self.finish_task(train_dataset, t_id, task_viz,
                                  path='drawings'))
    stats['duration'] = {'iterations': t,
                         'finish': time.time() - start,
                         'best_iterations': best_t}
    stats['params'] = {'total': self.n_params(t_id),
                       'new': self.new_params(t_id)}
    stats['lca'] = lca
    return stats
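# `rename_class` is only referenced above. A minimal sketch of what it is
# assumed to do: build a uniquely named subclass of the trainable so that
# registering several experiments does not collide in Tune's global registry
# (see the comment in the second `train_model_on_task` variant below).
def rename_class(cls, new_name):
    """Return a subclass of `cls` whose __name__ is `new_name`."""
    return type(str(new_name), (cls,), {'__module__': cls.__module__})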
def process_final_results(main_vis, res_dict, exp_name, visdom_conf,
                          task_envs_str, n_task, best_task_envs_str,
                          simplify_pareto=True, traces_folder=None, plot=True):
    global_summary = defaultdict(list)
    first_plot = True
    for ll_name, (best_traj, exp_summary) in res_dict.items():
        if plot:
            exp_env = '{}_{}'.format(exp_name, ll_name)
            if traces_folder is not None:
                log_file = '{}/{}'.format(traces_folder, exp_env)
            else:
                log_file = None
            exp_viz = visdom.Visdom(env=exp_env, log_to_filename=log_file,
                                    **visdom_conf)
            env_url = get_env_url(exp_viz)
            task_envs_str[ll_name].append(env_url)

            update_summary(exp_summary, main_vis, ll_name)

            val_accs_detailed_summary = defaultdict(list)
            val_accs_detailed_summary['Tag'] = exp_summary['model']
            for trial_accs in exp_summary['Acc Val']:
                for i, acc in enumerate(trial_accs):
                    val_accs_detailed_summary[f'T{i}'].append(acc)
            update_summary(val_accs_detailed_summary, exp_viz,
                           ll_name + 'vaccs')

            pareto_mem = paretize_exp(exp_summary, 'Params', 'Avg acc Val',
                                      ['Avg acc Test', 'model', 'paths'])
            if simplify_pareto:
                pareto_mem = {k: v[-1:] for k, v in pareto_mem.items()}

            update_pareto(exp_summary['Params'].tolist(),
                          exp_summary['Avg acc Test'].tolist(),
                          ll_name, main_vis, first_plot,
                          exp_summary['model'].tolist(), 'All')
            update_pareto(pareto_mem['Params'], pareto_mem['Avg acc Test'],
                          ll_name, main_vis, first_plot, pareto_mem['model'])

            update_pareto(exp_summary['Steps'].tolist(),
                          exp_summary['Avg acc Test'].tolist(),
                          ll_name, main_vis, first_plot,
                          exp_summary['model'].tolist(), 'Steps', 'steps')
            pareto_steps = paretize_exp(exp_summary, 'Steps', 'Avg acc Val',
                                        ['Avg acc Test', 'model', 'paths'])
            if simplify_pareto:
                pareto_steps = {k: v[-1:] for k, v in pareto_steps.items()}
            update_pareto(pareto_steps['Steps'], pareto_steps['Avg acc Test'],
                          ll_name, main_vis, first_plot,
                          pareto_steps['model'], 'Steps_clean', 'steps')

        all_test_accuracies = []
        sum_acc_t = 0
        sum_durations = 0
        sum_lca = 0

        if n_task is not None and len(best_traj) != n_task:
            logger.warning('There was an issue with the results: received {} '
                           'results while the stream contains {} tasks.'
                           .format(len(best_traj), n_task))
            raise RuntimeError

        for t_id, result in best_traj.iterrows():
            # Accuracies on all tasks of the stream after training on task t_id.
            arr = [result['Test_T{}'.format(eval_t)]
                   for eval_t in range(len(best_traj))]
            all_test_accuracies.append(arr)

            durations = {'iterations': result['duration_iterations'],
                         'finish': result['duration_finish'],
                         'seconds': result['duration_seconds'],
                         'best_iterations': result['duration_best_it']}
            if 'duration_model_creation' in result:
                durations['model_creation'] = result['duration_model_creation']
            params = {'total': result['total_params'],
                      'new': result['new_params']}
            sum_acc_t += result['test_acc']
            sum_durations += result['duration_best_it']
            sum_lca += result['lca']
            avg_duration = sum_durations / (t_id + 1)
            avg_acc_t = sum_acc_t / (t_id + 1)
            avg_lca = sum_lca / (t_id + 1)
            if plot:
                update_plots(ll_name, t_id, main_vis, None, False,
                             all_test_accuracies, avg_duration, avg_acc_t,
                             result['avg_acc_test'], {}, durations,
                             result.get('entropy'), None, result['test_acc'],
                             params, result['lca'], avg_lca)

                # if isinstance(best_ll_model, ProgressiveSSN):
                #     for i, trial_tag in enumerate(pareto_mem['model']):
                #         tag = trial_tag.split('_')[0]
                #         env = '{}_Pareto_{}-{}_T{}'.format(self.exp_name,
                #                                            ll_name, tag, t_id)
                #         log_file = '{}/{}'.format(self.visdom_traces_folder,
                #                                   env)
                #         viz = visdom.Visdom(env=env, log_to_filename=log_file,
                #                             **self.visdom_conf)
                #         trial_path = pareto_mem['paths'][i]
                #         viz.text('<pre>{}</pre>'.format(trial_tag))
                #         self.task_envs_str[ll_name].append(get_env_url(viz))
                #         files = ['trained', 'pruned', 'cleaned',
                #                  'full', 'newget']
                #         for f in files:
                #             file = path.join(trial_path, 'model_T{}_{}.svg'
                #                              .format(t_id, f))
                #             if path.isfile(file):
                #                 plot_svg(str(open(file).readlines()), f, viz)

                task_envs = exp_summary['envs']
                for trial_envs in task_envs:
                    params = {**visdom_conf, 'env': trial_envs[t_id]}
                    task_envs_str[ll_name].append(get_env_url(params))
                best_task_envs_str[ll_name].append(result['env_url'])

        ### Update the global summary with this learner's final metrics
        global_summary['model'].append(ll_name)
        global_summary['speed'].append(avg_duration)
        global_summary['LCA'].append(avg_lca)
        global_summary['Acc now'].append(result['avg_acc_test'])
        global_summary['Acc t'].append(avg_acc_t)
        global_summary['Params'].append(result['total_params'])
        global_summary['Steps'].append(result['total_steps'])
        update_summary(global_summary, main_vis)

        # best_ll_model = torch.load(path.join(exp_summary['paths'][0],
        #                                      'learner.pth'))
        #
        # # if isinstance(best_ll_model, ProgressiveSSN):
        # if 'ProgressiveSSN' in type(best_ll_model).__name__:
        #     for t_id, _ in best_traj.iterrows():
        #         viz_params = training_envs[t_id][ll_name]
        #         viz = visdom.Visdom(**viz_params)
        #         best_model = best_ll_model.get_model(t_id)
        #         if 'ZeroModel' in type(best_model).__name__:
        #             continue
        #         svg = graph_to_svg(best_model.get_graph())
        #         viz.svg(svgstr=str(svg),
        #                 win='best_{}'.format(t_id),
        #                 opts=dict(title='best_{}'.format(t_id)))

        if plot:
            plot_para_coord(exp_summary['evaluated_params'],
                            exp_summary['Avg acc Val'],
                            ll_name, exp_viz)
        first_plot = False
    return global_summary
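# `paretize_exp` is assumed to keep, for increasing values of the cost column
# `x_name` (e.g. 'Params' or 'Steps'), only the runs that improve the selection
# metric `crit_name` (e.g. 'Avg acc Val'), returning those columns plus the ones
# listed in `extra_cols`. This is a sketch of that assumption, not the original
# implementation.
def paretize_exp(summary, x_name, crit_name, extra_cols=()):
    cols = {c: list(summary[c]) for c in {x_name, crit_name, *extra_cols}}
    # Visit candidates by increasing cost and keep only strict improvements.
    order = sorted(range(len(cols[x_name])), key=lambda i: cols[x_name][i])
    kept, best = [], float('-inf')
    for i in order:
        if cols[crit_name][i] > best:
            best = cols[crit_name][i]
            kept.append(i)
    return {c: [vals[i] for i in kept] for c, vals in cols.items()}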
def train_single_task(t_id, task, tasks, vis_p, learner, config,
                      transfer_matrix, total_steps):
    training_params = config.pop('training-params')
    learner_params = config.pop('learner-params', {})
    assert 'model-params' not in config, \
        "Can't have model-specific parameters while tuning at the stream level."

    if learner_params:
        learner.set_h_params(**learner_params)

    batch_sizes = training_params.pop('batch_sizes')
    # optim_func = training_params.pop('optim_func')
    optim_func = learner.optim_func
    optim_params = config.pop('optim')
    schedule_mode = training_params.pop('schedule_mode')
    split_optims = training_params.pop('split_optims')

    dropout = config.pop('dropout') if 'dropout' in config else None

    stream_setting = training_params.pop('stream_setting')
    plot_all = training_params.pop('plot_all')
    normalize = training_params.pop('normalize')
    augment_data = training_params.pop('augment_data')
    transformations = []
    if augment_data:
        transformations.extend([
            transforms.ToPILImage(),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, 4),
            transforms.ToTensor()
        ])
    lca_n = training_params.pop('lca')

    if plot_all:
        vis_p = get_training_vis_conf(vis_p, tune.get_trial_dir())
        # print('NEW vis: ', vis_p)
        task_vis = visdom.Visdom(**vis_p)
        # env = [env[0], env[-1]]
        # vis_p['env'] = '_'.join(env)
        # vis_p['log_to_filename'] = os.path.join(vis_logdir, vis_p['env'])
        # g_task_vis = visdom.Visdom(**vis_p)
        logger.info(get_env_url(task_vis))
    else:
        task_vis = None

    t_trans = [[] for _ in range(len(task['split_names']))]
    t_trans[0] = transformations.copy()
    datasets_p = dict(task=task, transforms=t_trans, normalize=normalize)
    datasets = _load_datasets(**datasets_p)
    train_loader, eval_loaders = get_classic_dataloaders(datasets, batch_sizes)

    assert t_id == task['id']

    start1 = time.time()
    model = learner.get_model(task['id'], x_dim=task['x_dim'],
                              n_classes=task['n_classes'],
                              descriptor=task['descriptor'],
                              dataset=eval_loaders[:2])
    model_creation_time = time.time() - start1

    loss_fn = task['loss_fn']
    training_params['loss_fn'] = loss_fn

    prepare_batch = _prepare_batch
    if hasattr(model, 'prepare_batch_wrapper'):
        prepare_batch = model.prepare_batch_wrapper(prepare_batch, t_id)
    if hasattr(model, 'loss_wrapper'):
        training_params['loss_fn'] = \
            model.loss_wrapper(training_params['loss_fn'])
    # if hasattr(model, 'backward_hook'):
    #     training_params[]

    # optim = set_optim_params(optim_func, optim_params, model, split_optims)
    optim_fact = partial(set_optim_params,
                         optim_func=optim_func,
                         optim_params=optim_params,
                         split_optims=split_optims)
    # if schedule_mode == 'steps':
    #     lr_scheduler = torch.optim.lr_scheduler.\
    #         MultiStepLR(optim[0], milestones=[25, 40])
    # elif schedule_mode == 'cos':
    #     lr_scheduler = torch.optim.lr_scheduler.\
    #         CosineAnnealingLR(optim[0], T_max=200, eta_min=0.001)
    # elif schedule_mode is None:
    #     lr_scheduler = None
    # else:
    #     raise NotImplementedError()
    if dropout is not None:
        set_dropout(model, dropout)

    assert not config, config

    start2 = time.time()
    rescaled, t, metrics, b_state_dict = train_model(model, datasets_p,
                                                     batch_sizes, optim_fact,
                                                     prepare_batch, task,
                                                     train_loader, eval_loaders,
                                                     training_params, config)
    training_time = time.time() - start2

    start3 = time.time()
    if not isinstance(model, ExhaustiveSearch):
        # TODO: handle the state-dict loading uniformly for all learners.
        # Right now only the exhaustive search models load the best state dict
        # after training.
        model.load_state_dict(b_state_dict['state_dict'])

    iterations = list(metrics.pop('training_iteration').values())
    epochs = list(metrics.pop('training_epoch').values())
    assert len(iterations) == len(epochs)

    index = dict(epochs=epochs, iterations=iterations)
    update_summary(index, task_vis, 'index', 0.5)

    grouped_xs = dict()
    grouped_metrics = defaultdict(list)
    grouped_legends = defaultdict(list)
    for metric_n, metric_v in metrics.items():
        split_n = metric_n.split()
        if len(split_n) < 2:
            continue
        name = ' '.join(split_n[:-1])
        grouped_metrics[split_n[-1]].append(list(metric_v.values()))
        grouped_legends[split_n[-1]].append(name)
        if split_n[-1] in grouped_xs:
            if len(metric_v) > len(grouped_xs[split_n[-1]]):
                longer_xs = list(metric_v.keys())
                assert all(a == b for a, b in zip(longer_xs,
                                                  grouped_xs[split_n[-1]]))
                grouped_xs[split_n[-1]] = longer_xs
        else:
            grouped_xs[split_n[-1]] = list(metric_v.keys())

    for (plot_name, val), (_, legends) in sorted(zip(grouped_metrics.items(),
                                                     grouped_legends.items())):
        assert plot_name == _
        val = fill_matrix(val)
        if len(val) == 1:
            val = np.array(val[0])
        else:
            val = np.array(val).transpose()
        x = grouped_xs[plot_name]
        task_vis.line(val, X=x, win=plot_name,
                      opts={'title': plot_name, 'showlegend': True,
                            'width': 500, 'legend': legends,
                            'xlabel': 'iterations', 'ylabel': plot_name})

    avg_data_time = list(metrics['data time_ps'].values())[-1]
    avg_forward_time = list(metrics['forward time_ps'].values())[-1]
    avg_epoch_time = list(metrics['epoch time_ps'].values())[-1]
    avg_eval_time = list(metrics['eval time_ps'].values())[-1]
    total_time = list(metrics['total time'].values())[-1]

    entropies, ent_legend = [], []
    for metric_n, metric_v in metrics.items():
        if metric_n.startswith('Trainer entropy'):
            entropies.append(list(metric_v.values()))
            ent_legend.append(metric_n)

    if entropies:
        task_vis.line(np.array(entropies).transpose(), X=iterations,
                      win='ENT',
                      opts={'title': 'Arch entropy', 'showlegend': True,
                            'width': 500, 'legend': ent_legend,
                            'xlabel': 'Iterations', 'ylabel': 'Loss'})

    if hasattr(learner, 'arch_scores') and hasattr(learner, 'get_top_archs'):
        update_summary(learner.arch_scores[t_id], task_vis, 'scores')
        archs = model.get_top_archs(5)
        list_top_archs(archs, task_vis)

    if 'training_archs' in metrics:
        plot_trajectory(model.ssn.graph, metrics['training_archs'],
                        model.ssn.stochastic_node_ids, task_vis)

    postproc_time = time.time() - start3

    start4 = time.time()
    save_path = tune.get_trial_dir()
    finish_res = learner.finish_task(datasets[0], t_id, task_vis, save_path)
    finish_time = time.time() - start4

    start5 = time.time()
    eval_tasks = tasks
    # eval_tasks = tasks[:t_id + 1] if stream_setting else tasks
    evaluation = evaluate_on_tasks(eval_tasks, learner, batch_sizes[1],
                                   training_params['device'],
                                   ['Val', 'Test'], normalize,
                                   cur_task=t_id)
    assert evaluation['Val']['accuracy'][t_id] == b_state_dict['value']
    stats = {}
    eval_time = time.time() - start5

    stats.update(finish_res)

    test_accs = metrics['Test accuracy_0']
    if not test_accs:
        lca = float('nan')
    else:
        if len(test_accs) <= lca_n:
            last_key = max(test_accs.keys())
            assert len(test_accs) == last_key + 1, \
                f"Can't compute LCA@{lca_n} if steps were skipped " \
                f"(got {list(test_accs.keys())})"
            # Pad with the last available accuracy so the mean is over lca_n+1 steps.
            test_accs = test_accs.copy()
            last_acc = test_accs[last_key]
            for i in range(last_key + 1, lca_n + 1):
                test_accs[i] = last_acc
        lca = np.mean([test_accs[i] for i in range(lca_n + 1)])

    accs = {}
    key = 'accuracy'
    # logger.warning(evaluation)
    for split in evaluation.keys():
        transfer_matrix[split].append(evaluation[split][key])
        for i in range(len(tasks)):
            split_acc = evaluation[split][key]
            if i < len(split_acc):
                accs['{}_T{}'.format(split, i)] = split_acc[i]
            else:
                accs['{}_T{}'.format(split, i)] = float('nan')

    plot_heatmaps(list(transfer_matrix.keys()),
                  list(map(fill_matrix, transfer_matrix.values())),
                  task_vis)
    # logger.warning(t_id)
    # logger.warning(transfer_matrix)

    avg_val = np.mean(evaluation['Val']['accuracy'])
    avg_val_so_far = np.mean(evaluation['Val']['accuracy'][:t_id + 1])
    avg_test = np.mean(evaluation['Test']['accuracy'])
    avg_test_so_far = np.mean(evaluation['Test']['accuracy'][:t_id + 1])

    step_time_s = time.time() - start1
    step_sum = model_creation_time + training_time + postproc_time + \
               finish_time + eval_time
    best_it = b_state_dict.get('cum_best_iter', b_state_dict['iter'])
    tune.report(t=t_id,
                best_val=b_state_dict['value'],
                avg_acc_val=avg_val,
                avg_acc_val_so_far=avg_val_so_far,
                avg_acc_test_so_far=avg_test_so_far,
                lca=lca,
                avg_acc_test=avg_test,
                test_acc=evaluation['Test']['accuracy'][t_id],
                duration_seconds=step_time_s,
                duration_iterations=t,
                duration_best_it=best_it,
                duration_finish=finish_time,
                duration_model_creation=model_creation_time,
                duration_training=training_time,
                duration_postproc=postproc_time,
                duration_eval=eval_time,
                duration_sum=step_sum,
                # entropy=stats.pop('entropy'),
                new_params=learner.new_params(t_id),
                total_params=learner.n_params(t_id),
                total_steps=total_steps + t,
                fw_t=round(avg_forward_time * 1000) / 1000,
                data_t=round(avg_data_time * 1000) / 1000,
                epoch_t=round(avg_epoch_time * 1000) / 1000,
                eval_t=round(avg_eval_time * 1000) / 1000,
                total_t=round(total_time * 1000) / 1000,
                env_url=get_env_url(vis_p),
                **accs, **stats)
    return rescaled, t, metrics, b_state_dict, stats
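# `fill_matrix` is used above to turn ragged lists of per-task results into
# rectangular arrays (e.g. the transfer matrix after task t only has t+1
# accuracies per row). A minimal sketch under that assumption: pad every row
# with NaNs up to the length of the longest one.
def fill_matrix(rows):
    width = max(len(r) for r in rows)
    return [list(r) + [float('nan')] * (width - len(r)) for r in rows]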
def run(self):
    if self.task_gen.concept_pool.attribute_similarities is not None:
        attr_sim = self.task_gen.concept_pool.attribute_similarities
        self.main_viz.heatmap(attr_sim,
                              opts={'title': 'Attribute similarities'})
    if self.plot_tasks:
        self.task_gen.concept_pool.draw_tree(viz=self.main_viz,
                                             title='Full tree')
        self.task_gen.concept_pool.draw_attrs(viz=self.main_viz)
        self.task_gen.concept_pool.plot_concepts(self.main_viz)

    self.init_tasks()
    self.init_sims()
    self.clean_tasks()

    if not self.stream_setting:
        self.init_models(True)
    else:
        details = self.init_models(False)
        logger.info('Architecture details for the first models:')
        for learner, det in details.items():
            # Total parameter count and approximate size in MB (4 bytes/param).
            logger.info(f'{learner}: {det} ({sum(det.values())}, '
                        f'{4 * sum(det.values()) / 1e6})')

    self.init_plots()

    logger.info("General dashboard: {}".format(get_env_url(self.main_viz)))
    logger.info('Tasks: {}'.format(get_env_url(self.task_env)))

    # if self.use_ray and not self.use_processes:
    #     if self.redis_address and not self.local_mode:
    #         ray.init(redis_address=self.redis_address)
    #     else:
    #         logger.warning('Launching a new ray cluster')
    #         ray.init(object_store_memory=int(1e7), include_webui=True,
    #                  local_mode=self.local_mode, num_gpus=0)

    train_calls = []
    for model_name, ll_model in self.ll_models.items():
        vis_params = [env_params[model_name]
                      for env_params in self.training_envs]
        params = dict(learner=ll_model,
                      stream=self.task_gen.stream_infos(True),
                      task_level_tuning=self.val_per_task,
                      learner_name=model_name,
                      # exp_name=self.exp_name,
                      vis_params=vis_params,
                      plot_all=self.plot_all,
                      batch_sizes=self.batch_sizes,
                      n_it_max=self.n_it_max,
                      n_ep_max=self.n_ep_max,
                      augment_data=self.augment_data,
                      normalize=self.normalize,
                      schedule_mode=self.schedule_mode,
                      patience=self.patience,
                      # grace_period=self.grace_period,
                      num_hp_samplings=self.num_hp_samplings,
                      device=self.device,
                      log_steps=self.log_steps,
                      log_epoch=self.log_epoch,
                      exp_dir=self.exp_dir,
                      lca=self.lca,
                      single_pass=self.single_pass,
                      stream_setting=self.stream_setting,
                      split_optims=self.split_optims,
                      # use_ray=self.use_ray,
                      # use_ray_logging=self.use_ray_logging,
                      local_mode=self.local_mode,
                      redis_address=self.redis_address,
                      seed=self.seed)
        train_calls.append(partial(tune_learner_on_stream, **params))

    ctx = torch.multiprocessing.get_context('spawn')
    # ctx = None
    results_array = execute_step(train_calls, self.use_processes, ctx=ctx)

    res = dict(zip(self.ll_models.keys(), results_array))
    summ = process_final_results(self.main_viz, res, self.exp_name,
                                 self.visdom_conf, self.task_envs_str,
                                 len(self.task_gen.task_pool),
                                 self.best_task_envs_str, self.val_per_task,
                                 self.visdom_traces_folder)

    plot_tasks_env_urls(self.task_envs_str, self.main_viz, 'all')
    plot_tasks_env_urls(self.best_task_envs_str, self.main_viz, 'best')
    self.save_traces()

    res_py = {k: [itm.to_dict('list') for itm in v] for k, v in res.items()}
    # res_2 = {k: [pandas.DataFrame(itm) for itm in v] for k, v in res_py.items()}
    # for (k1, v1), (k2, v2) in zip(res.items(), res_2.items()):
    #     assert k1 == k2
    #     print([i1.equals(i2) for i1, i2 in zip(v1, v2)])

    logger.info(f'Args {" ".join(sys.argv[2:])}')
    print(pandas.DataFrame(summ).set_index('model'))
    return [res_py, self.task_gen.stream_infos(full=False)]
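# `execute_step` (the module-level helper used in `run` above) is assumed to run
# a list of zero-argument callables either sequentially or in separate processes
# using the provided multiprocessing context ('spawn' to stay CUDA-safe), and to
# return the results in call order. A sketch under that assumption:
def execute_step(calls, use_processes, ctx=None):
    if not use_processes:
        return [call() for call in calls]
    import multiprocessing
    ctx = ctx or multiprocessing.get_context('spawn')
    with ctx.Pool(processes=len(calls)) as pool:
        async_results = [pool.apply_async(call) for call in calls]
        return [r.get() for r in async_results]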
def train_on_task(self, task):
    logger.info('###############')
    logger.info('## Task {}/{} ##'.format(task.id, self.n_tasks))
    logger.info('###############')
    training_calls = []
    all_train_viz = []
    main_env_url = get_env_url(self.main_viz)
    logger.info("General dashboard: {}".format(main_env_url))
    logger.info('Tasks: {}'.format(get_env_url(self.task_env)))

    for j, (name, ll_model) in enumerate(self.ll_models.items()):
        ###
        # Init
        ###
        training_name = '{}_{}-{}'.format(self.exp_name, name, task.name)
        log_folder = '{}/{}'.format(self.visdom_traces_folder, training_name)
        task_viz = visdom.Visdom(env=training_name,
                                 log_to_filename=log_folder,
                                 **self.visdom_conf)
        task_viz.text('<pre>{}</pre>'.format(task), win='task_descr',
                      opts={'width': 800, 'height': 250})
        self.task_envs_str[name].append(get_env_url(task_viz))
        all_train_viz.append(task_viz)

        task_names = [t.name for t in self.task_gen.task_pool]
        ideal_tp, current_tp = plot_transfers(self.all_perfs[j],
                                              self.sims[task.id],
                                              task_names, task_viz)
        self.ideal_potentials[j].append(ideal_tp)
        self.current_potentials[j].append(current_tp)
        if self.plot_tasks:
            task.plot_task(task_viz, training_name)

        ###
        # Prepare a call to train on task & evaluate on all tasks
        ###
        past_tasks = self.task_gen.task_pool[:task.id]
        params = dict(task=task, past_tasks=past_tasks, task_viz=task_viz,
                      learner=ll_model, training_name=training_name)
        training_calls.append(partial(self.give_task_to_learner, **params))

    # Execute all the training calls
    plot_tasks_env_urls(self.task_envs_str, self.main_viz)
    training_results = self.execute_step(training_calls)

    ### Handle the results
    if self.norm_models:
        perf_min = np.array(training_results[self.norm_models_idx[0]][1])
        perf_max = np.array(training_results[self.norm_models_idx[1]][1])

    for j, ((train_time, all_tests, all_confs), train_viz, learner_name) in \
            enumerate(zip(training_results, all_train_viz,
                          self.ll_models.keys())):
        self.training_times_it[j].append(train_time['iterations'])
        self.training_times_s[j].append(train_time['seconds'])
        for key, val in train_time.items():
            self.metrics['Train time {}'.format(key)][j].append(val)

        if self.norm_models:
            norm_tests = np.array(all_tests) - perf_min
            norm_tests = (norm_tests / (perf_max - perf_min)).tolist()
            self.all_perfs_normalized[j].append(norm_tests)
        self.all_perfs[j].append(all_tests)

        ###
        # Plot
        ###
        plot_heatmaps([learner_name], [self.all_perfs[j]], train_viz)
        categories = list(map(str,
                              self.task_gen.task_pool[task.id].src_concepts))
        plot_heatmaps([learner_name], [all_confs[task.id]], train_viz,
                      title='Confusion matrix', width=600, height=600,
                      xlabel='Predicted category',
                      ylabel='Real category',
                      # rownames=categories,
                      # columnnames=categories
                      )

        update_acc_plots(self.all_perfs[j], learner_name, self.main_viz)
        self.sacred_run.info['transfers'][learner_name] = self.all_perfs[j]
        name = '{} Accuracies'.format(learner_name)
        self.sacred_run.log_scalar(name, all_tests)
def main(args):
    if not os.path.isfile(MONGO_CONF_PATH):
        raise ValueError('File {} must exist'.format(MONGO_CONF_PATH))
    runs = get_runs(args.sacred_ids, args.slurm_ids, MONGO_CONF_PATH)

    viz = visdom.Visdom(args.host, port=args.port)

    envs = []
    tot_time = 0
    n_replayed = 0
    configs = {}
    results = {}
    failed = []
    index = None
    for run in runs:
        slurm_id = run['host']['ENV'].get('SLURM_JOB_ID', None)
        logger.info('\nProcessing run {} ({})'.format(run['_id'], slurm_id))

        if args.replay:
            # Replay part
            env, n, replay_time = replay_run(run, viz, args, MONGO_CONF_PATH,
                                             logger)
            envs.append(env)
            tot_time += replay_time
            n_replayed += n

        # Results aggregation part
        res = run['result']
        if not res:
            failed.append(run)
            continue
        configs[run['_id']] = run['config']
        if index is None:
            index = res['model']
        assert res['model'] == index

        # results[run['_id']] = {'Avg acc': res['accuracy']}
        metrics = ['accuracy', 'speed']
        accs = [((metric, mod), val) for metric in metrics
                for mod, val in zip(res['model'], res[metric])]
        results[run['_id']] = dict(accs)
        # results[(get_key(run['config']), run['_id'])] = res['accuracy']

    logger.info('Done.')
    logger.info('Replayed {} envs in {:.3f} seconds.'.format(n_replayed,
                                                             tot_time))
    logger.info(envs)
    log_failed(failed)

    key, values = validate_configurations(configs)

    # res = pd.DataFrame(results, index=index)
    res = pd.DataFrame(results)
    # res.loc['ids'] = res.columns
    ref_val = next(iter(values.values()))
    new_cols = [(values.get(k, ref_val), k) for k in res.columns]
    new_cols = pd.MultiIndex.from_tuples(new_cols, names=[key, '_id'])
    res.columns = new_cols
    res.sort_index(axis=1, inplace=True)

    if args.group:
        # Group runs sharing the same configuration and compute statistics.
        res = res.transpose().groupby(level=key) \
                 .agg([np.mean, np.std]).transpose()
        # agg_funcs = {'accuracy': lambda grp: grp.groupby(level=0).apply(lambda x: x.round(3).astype(str).apply('±'.join, 0)),
        #              'speed': lambda grp: grp.groupby(level=0).apply(lambda x: x.round(1).astype(str).apply('±'.join, 0))}
        # Format every (mean, std) pair as a single "mean±std" string.
        res = res.groupby(level=[0, 1]).apply(
            lambda x: x.round(3).astype(str).apply('±'.join, 0))

    sacred_ids = list(results.keys())
    print(sacred_ids)
    print(res)

    id_str = ' '.join(map(str, sacred_ids))
    id_row = 'ids\t{}'.format(id_str)
    buf = io.StringIO()
    res.to_csv(buf, sep='\t', encoding='utf-8')
    txt = buf.getvalue()
    txt += id_row
    clipboard_set(txt)

    viz.text(res.to_html(classes=['table', 'table-bordered', 'table-hover']))
    logger.info(get_env_url(viz))
    log_failed(failed)
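# `clipboard_set` is only referenced above. A possible implementation, assuming
# the `pyperclip` package is available (any xclip/pbcopy wrapper would do):
def clipboard_set(text):
    import pyperclip
    pyperclip.copy(text)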
def train_model_on_task(self, task, task_viz, exp_dir, use_ray,
                        use_ray_logging, smoke_test, n_it_max, grace_period,
                        num_hp_samplings, local_mode, tune_register_lock,
                        resources, **training_params):
    logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

    model = self.get_model(task_id=task.id)
    trainable = self.get_trainable(use_ray_logging=use_ray_logging)
    self.prepare_task(task, training_params)

    if use_ray:
        # Required to avoid collisions in Tune's global Registry:
        # https://github.com/ray-project/ray/blob/master/python/ray/tune/registry.py
        trainable = rename_class(trainable, training_params['name'])

        scheduler = None

        training_params['loss_fn'] = tune.function(training_params['loss_fn'])
        training_params['optim_func'] = tune.function(self.optim_func)
        training_params['n_it_max'] = n_it_max

        init_model_path = os.path.join(exp_dir, 'model_initializations')
        model_file_name = '{}_init.pth'.format(training_params['name'])
        model_path = os.path.join(init_model_path, model_file_name)
        torch.save(model, model_path)

        training_params['model_path'] = model_path
        config = {'hyper-params': self.get_search_space(smoke_test),
                  'tp': training_params}

        if use_ray_logging:
            stop_condition = {'training_iteration': n_it_max}
            loggers = None
        else:
            stop_condition = None
            loggers = [JsonLogger, MyCSVLogger]

        # We need to create the experiment using a lock here to avoid issues
        # with Tune's global registry, more specifically with the
        # `_to_flush` dict that may change during the iteration over it.
        # https://github.com/ray-project/ray/blob/e3c9f7e83a6007ded7ae7e99fcbe9fcaa371bad3/python/ray/tune/registry.py#L91-L93
        tune_register_lock.acquire()
        experiment = Experiment(
            name=training_params['name'],
            run=trainable,
            stop=stop_condition,
            config=config,
            resources_per_trial=resources,
            num_samples=num_hp_samplings,
            local_dir=exp_dir,
            loggers=loggers,
            keep_checkpoints_num=1,
            checkpoint_score_attr='min-mean_loss')
        tune_register_lock.release()

        analysis = tune.run(experiment,
                            scheduler=scheduler,
                            verbose=1,
                            raise_on_failed_trial=True,
                            # max_failures=-1,
                            # with_server=True,
                            # server_port=4321
                            )

        os.remove(model_path)
        logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

        all_trials = {t.logdir: t for t in analysis.trials}
        best_logdir = analysis.get_best_logdir('mean_loss', 'min')
        best_trial = all_trials[best_logdir]

        # picked_metric = 'accuracy_0'
        # metric_names = {s: '{} {}'.format(s, picked_metric) for s in
        #                 ['Train', 'Val', 'Test']}

        logger.info('Best trial: {}'.format(best_trial))
        best_res = best_trial._checkpoint.last_result
        best_point = (best_res['training_iteration'], best_res['mean_loss'])

        y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss']

        epoch_key = 'training_epoch'
        it_key = 'training_iteration' if use_ray_logging \
            else 'training_iterations'
        plot_res_dataframe(analysis, training_params['name'], best_point,
                           task_viz, epoch_key, it_key, y_keys)

        best_model = self.get_model(task_id=task.id)
        best_model.load_state_dict(torch.load(best_trial._checkpoint.value))

        t = best_trial._checkpoint.last_result['training_iteration']
    else:
        data_path = training_params.pop('data_path')
        past_tasks = training_params.pop('past_tasks')
        datasets = trainable._load_datasets(data_path,
                                            training_params['loss_fn'],
                                            past_tasks)
        train_loader, eval_loaders = get_classic_dataloaders(
            datasets, training_params.pop('batch_sizes'))
        optim = self.optim_func(model.parameters())

        t, accs, best_state_dict = train(model, train_loader, eval_loaders,
                                         optimizer=optim, viz=task_viz,
                                         n_it_max=n_it_max, **training_params)

    logger.info('Finishing task ...')
    t1 = time.time()
    self.finish_task(task.datasets[0])
    logger.info('done in {}s'.format(time.time() - t1))

    return t
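# `get_env_url` is used throughout to log a clickable link to a Visdom
# environment. A sketch assuming it accepts either a visdom.Visdom instance or a
# dict of Visdom constructor kwargs ('server'/'port'/'env'); the exact URL
# scheme below is an assumption.
def get_env_url(viz_or_conf, default_env='main'):
    if isinstance(viz_or_conf, dict):
        server = viz_or_conf.get('server', 'http://localhost')
        port = viz_or_conf.get('port', 8097)
        env = viz_or_conf.get('env', default_env)
    else:
        server, port, env = viz_or_conf.server, viz_or_conf.port, viz_or_conf.env
    return '{}:{}/env/{}'.format(server, port, env)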