def main(args):
    if not os.path.isfile(MONGO_CONF_PATH):
        raise ValueError('File {} must exist'.format(MONGO_CONF_PATH))

    runs = get_runs(args.sacred_ids, args.slurm_ids, MONGO_CONF_PATH)

    results = {}
    index = None
    for r in runs:
        res = r['result']
        if index is None:
            index = res['model']
        assert res['model'] == index
        results[(get_key(r['config']), r['_id'])] = res['accuracy']

    res = pd.DataFrame(results, index=index)
    res.sort_index(axis=1, inplace=True)
    print(res.to_clipboard())

    viz = visdom.Visdom(args.host, port=args.port)
    viz.text(res.to_html(classes=['table', 'table-bordered', 'table-hover']))

    logger.info(get_env_url(viz))
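# A minimal usage sketch (assumption: the CLI flag names simply mirror the
# attributes read above -- sacred_ids, slurm_ids, host and port are not
# defined in this snippet):
#
# if __name__ == '__main__':
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--sacred_ids', nargs='*')
#     parser.add_argument('--slurm_ids', nargs='*')
#     parser.add_argument('--host', default='localhost')
#     parser.add_argument('--port', type=int, default=8097)
#     main(parser.parse_args())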
Example #2
    def train_model_on_task(self, task, task_viz, exp_dir, use_ray,
                            use_ray_logging, grace_period,
                            num_hp_samplings, local_mode,
                            redis_address, lca_n, **training_params):
        logger.info("Training dashboard: {}".format(get_env_url(task_viz)))
        t_id = task['id']

        trainable = self.get_trainable(use_ray_logging=use_ray_logging)
        past_tasks = training_params.pop('past_tasks')
        normalize = training_params.pop('normalize')
        augment_data = training_params.pop('augment_data')

        transformations = []
        if augment_data:
            transformations.extend([
                transforms.ToPILImage(),
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(32, 4),
                transforms.ToTensor()
            ])
        t_trans = [[] for _ in range(len(task['split_names']))]
        t_trans[0] = transformations
        datasets = trainable._load_datasets(task,
                                            task['loss_fn'],
                                            past_tasks, t_trans, normalize)
        train_loader, eval_loaders = get_classic_dataloaders(datasets,
                                                             training_params.pop(
                                                                 'batch_sizes'))
        model = self.get_model(task_id=t_id, x_dim=task['x_dim'],
                               n_classes=task['n_classes'],
                               descriptor=task['descriptor'],
                               dataset=eval_loaders[:2])

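        # Hyper-parameter search with Ray Tune: the freshly initialized model
        # is saved to disk so that every trial can reload the same weights.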
        if use_ray:
            if not ray.is_initialized():
                ray.init(address=redis_address)

            scheduler = None

            training_params['loss_fn'] = tune.function(
                training_params['loss_fn'])
            training_params['optim_func'] = tune.function(self.optim_func)

            init_model_path = os.path.join(exp_dir, 'model_initializations')
            model_file_name = '{}_init.pth'.format(training_params['name'])
            model_path = os.path.join(init_model_path, model_file_name)
            torch.save(model, model_path)

            training_params['model_path'] = model_path
            config = {**self.get_search_space(),
                      'training-params': training_params}
            if use_ray_logging:
                stop_condition = {'training_iteration':
                                      training_params['n_it_max']}
                checkpoint_at_end = False
                keep_checkpoints_num = 1
                checkpoint_score_attr = 'min-Val nll'
            else:
                stop_condition = None
                # loggers = [JsonLogger, MyCSVLogger]
                checkpoint_at_end = False
                keep_checkpoints_num = None
                checkpoint_score_attr = None

            trainable = rename_class(trainable, training_params['name'])
            experiment = Experiment(
                name=training_params['name'],
                run=trainable,
                stop=stop_condition,
                config=config,
                resources_per_trial=self.ray_resources,
                num_samples=num_hp_samplings,
                local_dir=exp_dir,
                loggers=(JsonLogger, CSVLogger),
                checkpoint_at_end=checkpoint_at_end,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr)

            analysis = tune.run(experiment,
                                scheduler=scheduler,
                                verbose=1,
                                raise_on_failed_trial=True,
                                # max_failures=-1,
                                # with_server=True,
                                # server_port=4321
                                )
            os.remove(model_path)
            logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

            all_trials = {t.logdir: t for t in analysis.trials}
            best_logdir = analysis.get_best_logdir('Val nll', 'min')
            best_trial = all_trials[best_logdir]

            # picked_metric = 'accuracy_0'
            # metric_names = {s: '{} {}'.format(s, picked_metric) for s in
            #                 ['Train', 'Val', 'Test']}

            logger.info('Best trial: {}'.format(best_trial))
            best_res = best_trial.checkpoint.result
            best_point = (best_res['training_iteration'], best_res['Val nll'])

            # y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss']
            y_keys = ['Val nll', 'Train nll']

            epoch_key = 'training_epoch'
            it_key = 'training_iteration'
            plot_res_dataframe(analysis, training_params['name'], best_point,
                               task_viz, epoch_key, it_key, y_keys)
            if 'entropy' in next(iter(analysis.trial_dataframes.values())):
                plot_res_dataframe(analysis, training_params['name'], None,
                                    task_viz, epoch_key, it_key, ['entropy'])
            best_model = self.get_model(task_id=t_id)
            best_model.load_state_dict(torch.load(best_trial.checkpoint.value))

            train_accs = analysis.trial_dataframes[best_logdir]['Train accuracy_0']
            best_t = best_res['training_iteration']
            t = best_trial.last_result['training_iteration']
        else:
            search_space = self.get_search_space()
            rand_config = list(generate_variants(search_space))[0][1]
            learner_params = rand_config.pop('learner-params', {})
            optim_params = rand_config.pop('optim')


            split_optims = training_params.pop('split_optims')
            if hasattr(model, 'set_h_params'):
                model.set_h_params(**learner_params)
            if hasattr(model, 'train_loader_wrapper'):
                train_loader = model.train_loader_wrapper(train_loader)

            loss_fn = task['loss_fn']
            if hasattr(model, 'loss_wrapper'):
                loss_fn = model.loss_wrapper(task['loss_fn'])

            prepare_batch = _prepare_batch
            if hasattr(model, 'prepare_batch_wrapper'):
                prepare_batch = model.prepare_batch_wrapper(prepare_batch, t_id)

            optim_fact = partial(set_optim_params,
                                 optim_func=self.optim_func,
                                 optim_params=optim_params,
                                 split_optims=split_optims)
            if hasattr(model, 'train_func'):
                f = model.train_func
                t, metrics, b_state_dict = f(train_loader=train_loader,
                                                eval_loaders=eval_loaders,
                                                optim_fact=optim_fact,
                                                loss_fn=loss_fn,
                                                split_names=task['split_names'],
                                                viz=task_viz,
                                                prepare_batch=prepare_batch,
                                                **training_params)
            else:
                optim = optim_fact(model=model)
                t, metrics, b_state_dict = train(model=model,
                                                 train_loader=train_loader,
                                                 eval_loaders=eval_loaders,
                                                 optimizer=optim,
                                                 loss_fn=loss_fn,
                                                 split_names=task['split_names'],
                                                 viz=task_viz,
                                                 prepare_batch=prepare_batch,
                                                 **training_params)
            train_accs = metrics['Train accuracy_0']
            best_t = b_state_dict['iter']
            if 'training_archs' in metrics:
                plot_trajectory(model.ssn.graph, metrics['training_archs'],
                                model.ssn.stochastic_node_ids, task_viz)
                weights = model.arch_sampler().squeeze()
                archs = model.ssn.get_top_archs(weights, 5)
                list_top_archs(archs, task_viz)
                list_arch_scores(self.arch_scores[t_id], task_viz)
                update_summary(self.arch_scores[t_id], task_viz, 'scores')

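        # LCA: average training accuracy over the first lca_n + 1 logged steps.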
        if len(train_accs) > lca_n:
            lca_accs = []
            for i in range(lca_n + 1):
                if i in train_accs:
                    lca_accs.append(train_accs[i])
                else:
                    logger.warning('Missing step {}/{} for LCA computation'
                                   .format(i, lca_n))
            lca = np.mean(lca_accs)
        else:
            lca = float('nan')
        stats = {}
        start = time.time()
        # train_idx = task['split_names'].index('Train')
        # train_path = task['data_path'][train_idx]
        # train_dataset = _load_datasets([train_path])[0]
        train_dataset = _load_datasets(task, 'Train')[0]
        stats.update(self.finish_task(train_dataset, t_id, task_viz,
                                      path='drawings'))
        stats['duration'] = {'iterations': t,
                             'finish': time.time() - start,
                             'best_iterations': best_t}
        stats['params'] = {'total': self.n_params(t_id),
                           'new': self.new_params(t_id)}
        stats['lca'] = lca
        return stats
Example #3
def process_final_results(main_vis,
                          res_dict,
                          exp_name,
                          visdom_conf,
                          task_envs_str,
                          n_task,
                          best_task_envs_str,
                          simplify_pareto=True,
                          traces_folder=None,
                          plot=True):
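    """Aggregate the per-learner results from `res_dict`, optionally update the
    Visdom summary and Pareto plots, and return a global summary with one row
    per lifelong-learning model.
    """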

    global_summary = defaultdict(list)
    first_plot = True
    for ll_name, (best_traj, exp_summary) in res_dict.items():
        if plot:
            exp_env = '{}_{}'.format(exp_name, ll_name)
            if traces_folder is not None:
                log_file = '{}/{}'.format(traces_folder, exp_env)
            else:
                log_file = None
            exp_viz = visdom.Visdom(env=exp_env,
                                    log_to_filename=log_file,
                                    **visdom_conf)
            env_url = get_env_url(exp_viz)
            task_envs_str[ll_name].append(env_url)

            update_summary(exp_summary, main_vis, ll_name)

            val_accs_detailed_summary = defaultdict(list)
            val_accs_detailed_summary['Tag'] = exp_summary['model']
            for trial_accs in exp_summary['Acc Val']:
                for i, acc in enumerate(trial_accs):
                    val_accs_detailed_summary[f'T{i}'].append(acc)

            update_summary(val_accs_detailed_summary, exp_viz,
                           ll_name + 'vaccs')

            parto_mem = paretize_exp(
                exp_summary,
                'Params',
                'Avg acc Val',
                ['Avg acc Test', 'model', 'paths'],
            )
            if simplify_pareto:
                parto_mem = {k: v[-1:] for k, v in parto_mem.items()}

            update_pareto(exp_summary['Params'].tolist(),
                          exp_summary['Avg acc Test'].tolist(), ll_name,
                          main_vis, first_plot, exp_summary['model'].tolist(),
                          'All')
            update_pareto(parto_mem['Params'], parto_mem['Avg acc Test'],
                          ll_name, main_vis, first_plot, parto_mem['model'])

            update_pareto(exp_summary['Steps'].tolist(),
                          exp_summary['Avg acc Test'].tolist(), ll_name,
                          main_vis, first_plot, exp_summary['model'].tolist(),
                          'Steps', 'steps')
            pareto_steps = paretize_exp(exp_summary, 'Steps', 'Avg acc Val',
                                        ['Avg acc Test', 'model', 'paths'])
            if simplify_pareto:
                pareto_steps = {k: v[-1:] for k, v in pareto_steps.items()}
            update_pareto(pareto_steps['Steps'], pareto_steps['Avg acc Test'],
                          ll_name, main_vis, first_plot, pareto_steps['model'],
                          'Steps_clean', 'steps')

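        # Walk through the best trajectory task by task, accumulating running
        # averages of accuracy, duration and LCA.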
        all_test_accuracies = []
        sum_acc_t = 0
        sum_durations = 0
        sum_lca = 0
        if n_task is not None and len(best_traj) != n_task:
            logger.warning(
                'There was an issue with the results: received '
                '{} results while the stream contains {} tasks.'.format(
                    len(best_traj), n_task))
            raise RuntimeError
        for t_id, result in best_traj.iterrows():
            # for eval_t in range(t_id + 1):
            arr = [
                result['Test_T{}'.format(eval_t)]
                for eval_t in range(len(best_traj))
            ]
            all_test_accuracies.append(arr)

            durations = {
                'iterations': result['duration_iterations'],
                'finish': result['duration_finish'],
                'seconds': result['duration_seconds'],
                'best_iterations': result['duration_best_it']
            }
            if 'duration_model_creation' in result:
                durations['model_creation'] = result[
                    'duration_model_creation']
            params = {
                'total': result['total_params'],
                'new': result['new_params']
            }
            sum_acc_t += result['test_acc']
            sum_durations += result['duration_best_it']
            sum_lca += result['lca']
            avg_duration = sum_durations / (t_id + 1)
            avg_acc_t = sum_acc_t / (t_id + 1)
            avg_lca = sum_lca / (t_id + 1)
            if plot:
                update_plots(ll_name, t_id, main_vis, None, False,
                             all_test_accuracies, avg_duration, avg_acc_t,
                             result['avg_acc_test'], {}, durations,
                             result.get('entropy'), None, result['test_acc'],
                             params, result['lca'], avg_lca)
                # if isinstance(best_ll_model, ProgressiveSSN):
                #     for i, trial_tag in enumerate(parto_mem['model']):
                #         tag = trial_tag.split('_')[0]
                #         env = '{}_Pareto_{}-{}_T{}'.format(self.exp_name, ll_name,
                #                                     tag, t_id)
                #         log_file = '{}/{}'.format(self.visdom_traces_folder,
                #                                   env)
                #         viz = visdom.Visdom(env=env, log_to_filename=log_file,
                #                             **self.visdom_conf)
                #         trial_path = parto_mem['paths'][i]
                #         viz.text('<pre>{}</pre>'.format(trial_tag))
                #         self.task_envs_str[ll_name].append(
                #             get_env_url(viz))
                #         files = ['trained', 'pruned', 'cleaned',
                #                  'full', 'newget']
                #         for f in files:
                #             file = path.join(trial_path, 'model_T{}_{}.svg'
                #                              .format(t_id, f))
                #             if path.isfile(file):
                #                 plot_svg(str(open(file).readlines()), f, viz)
                task_envs = exp_summary['envs']
                for trial_envs in task_envs:
                    params = {**visdom_conf, 'env': trial_envs[t_id]}
                    task_envs_str[ll_name].append(get_env_url(params))

                best_task_envs_str[ll_name].append(result['env_url'])
            ### Update task plots

        global_summary['model'].append(ll_name)
        global_summary['speed'].append(avg_duration)
        global_summary['LCA'].append(avg_lca)
        global_summary['Acc now'].append(result['avg_acc_test'])
        global_summary['Acc t'].append(avg_acc_t)
        global_summary['Params'].append(result['total_params'])
        global_summary['Steps'].append(result['total_steps'])
        update_summary(global_summary, main_vis)

        # best_ll_model = torch.load(path.join(exp_summary['paths'][0],
        #                                      'learner.pth'))
        #
        # # if isinstance(best_ll_model, ProgressiveSSN):
        # if 'ProgressiveSSN' in type(best_ll_model).__name__:
        #     for t_id, _ in best_traj.iterrows():
        #         viz_params = training_envs[t_id][ll_name]
        #         viz = visdom.Visdom(**viz_params)
        #         best_model = best_ll_model.get_model(t_id)
        #         if 'ZeroModel' in type(best_model).__name__:
        #             continue
        #         svg = graph_to_svg(best_model.get_graph())
        #         viz.svg(svgstr=str(svg),
        #                 win='best_{}'.format(t_id),
        #                 opts=dict(title='best_{}'.format(t_id)))

        if plot:
            plot_para_coord(exp_summary['evaluated_params'],
                            exp_summary['Avg acc Val'], ll_name, exp_viz)

            first_plot = False
    return global_summary
Example #4
def train_single_task(t_id, task, tasks, vis_p, learner, config, transfer_matrix,
                      total_steps):

    training_params = config.pop('training-params')
    learner_params = config.pop('learner-params', {})
    assert 'model-params' not in config, "Can't have model-specific " \
                                         "parameters while tuning at the " \
                                         "stream level."

    if learner_params:
        learner.set_h_params(**learner_params)

    batch_sizes = training_params.pop('batch_sizes')
    # optim_func = training_params.pop('optim_func')
    optim_func = learner.optim_func
    optim_params = config.pop('optim')
    schedule_mode = training_params.pop('schedule_mode')
    split_optims = training_params.pop('split_optims')

    dropout = config.pop('dropout') if 'dropout' in config else None

    stream_setting = training_params.pop('stream_setting')
    plot_all = training_params.pop('plot_all')
    normalize = training_params.pop('normalize')
    augment_data = training_params.pop('augment_data')
    transformations = []
    if augment_data:
        transformations.extend([
            transforms.ToPILImage(),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, 4),
            transforms.ToTensor()
        ])
    lca_n = training_params.pop('lca')

    if plot_all:
        vis_p = get_training_vis_conf(vis_p, tune.get_trial_dir())
        # print('NEW vis: ', vis_p)
        task_vis = visdom.Visdom(**vis_p)
        # env = [env[0], env[-1]]
        # vis_p['env'] = '_'.join(env)
        # vis_p['log_to_filename'] = os.path.join(vis_logdir, vis_p['env'])
        # g_task_vis = visdom.Visdom(**vis_p)

        logger.info(get_env_url(task_vis))
    else:
        task_vis = None

    t_trans = [[] for _ in range(len(task['split_names']))]
    t_trans[0] = transformations.copy()

    datasets_p = dict(task=task,
                      transforms=t_trans,
                      normalize=normalize)
    datasets = _load_datasets(**datasets_p)
    train_loader, eval_loaders = get_classic_dataloaders(datasets,
                                                         batch_sizes)

    assert t_id == task['id']

    start1 = time.time()
    model = learner.get_model(task['id'], x_dim=task['x_dim'],
                              n_classes=task['n_classes'],
                              descriptor=task['descriptor'],
                              dataset=eval_loaders[:2])
    model_creation_time = time.time() - start1

    loss_fn = task['loss_fn']
    training_params['loss_fn'] = loss_fn

    prepare_batch = _prepare_batch
    if hasattr(model, 'prepare_batch_wrapper'):
        prepare_batch = model.prepare_batch_wrapper(prepare_batch, t_id)

    if hasattr(model, 'loss_wrapper'):
        training_params['loss_fn'] = \
            model.loss_wrapper(training_params['loss_fn'])

    # if hasattr(model, 'backward_hook'):
    #     training_params[]

    # optim = set_optim_params(optim_func, optim_params, model, split_optims)
    optim_fact = partial(set_optim_params,
                         optim_func=optim_func,
                         optim_params=optim_params,
                         split_optims=split_optims)
    # if schedule_mode == 'steps':
    #     lr_scheduler = torch.optim.lr_scheduler.\
    #         MultiStepLR(optim[0], milestones=[25, 40])
    # elif schedule_mode == 'cos':
    #     lr_scheduler = torch.optim.lr_scheduler.\
    #         CosineAnnealingLR(optim[0], T_max=200, eta_min=0.001)
    # elif schedule_mode is None:
    #     lr_scheduler = None
    # else:
    #     raise NotImplementedError()
    if dropout is not None:
        set_dropout(model, dropout)

    assert not config, config
    start2 = time.time()
    rescaled, t, metrics, b_state_dict = train_model(model, datasets_p,
                                                     batch_sizes, optim_fact,
                                                     prepare_batch, task,
                                                     train_loader, eval_loaders,
                                                     training_params, config)

    training_time = time.time() - start2
    start3 = time.time()
    if not isinstance(model, ExhaustiveSearch):
        # TODO: handle the state dict loading uniformly for all learners.
        # Right now only the exhaustive search models load the best state dict
        # after training.
        model.load_state_dict(b_state_dict['state_dict'])

    iterations = list(metrics.pop('training_iteration').values())
    epochs = list(metrics.pop('training_epoch').values())

    assert len(iterations) == len(epochs)
    index = dict(epochs=epochs, iterations=iterations)
    update_summary(index, task_vis, 'index', 0.5)

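    # Group the logged metrics by their last token (e.g. 'accuracy_0') so that
    # all splits sharing a metric end up on the same Visdom line plot.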
    grouped_xs = dict()
    grouped_metrics = defaultdict(list)
    grouped_legends = defaultdict(list)
    for metric_n, metric_v in metrics.items():
        split_n = metric_n.split()
        if len(split_n) < 2:
            continue
        name = ' '.join(split_n[:-1])
        grouped_metrics[split_n[-1]].append(list(metric_v.values()))
        grouped_legends[split_n[-1]].append(name)
        if split_n[-1] in grouped_xs:
            if len(metric_v) > len(grouped_xs[split_n[-1]]):
                longer_xs = list(metric_v.keys())
                assert all(a == b for a, b in zip(longer_xs,
                                                  grouped_xs[split_n[-1]]))
                grouped_xs[split_n[-1]] = longer_xs
        else:
            grouped_xs[split_n[-1]] = list(metric_v.keys())

    for (plot_name, val), (_, legends) in sorted(zip(grouped_metrics.items(),
                                                     grouped_legends.items())):
        assert plot_name == _
        val = fill_matrix(val)
        if len(val) == 1:
            val = np.array(val[0])
        else:
            val = np.array(val).transpose()
        x = grouped_xs[plot_name]
        task_vis.line(val, X=x, win=plot_name,
                      opts={'title': plot_name, 'showlegend': True,
                            'width': 500, 'legend': legends,
                            'xlabel': 'iterations', 'ylabel': plot_name})

    avg_data_time = list(metrics['data time_ps'].values())[-1]
    avg_forward_time = list(metrics['forward time_ps'].values())[-1]
    avg_epoch_time = list(metrics['epoch time_ps'].values())[-1]
    avg_eval_time = list(metrics['eval time_ps'].values())[-1]
    total_time = list(metrics['total time'].values())[-1]

    entropies, ent_legend = [], []
    for metric_n, metric_v in metrics.items():
        if metric_n.startswith('Trainer entropy'):
            entropies.append(list(metric_v.values()))
            ent_legend.append(metric_n)

    if entropies:
        task_vis.line(np.array(entropies).transpose(), X=iterations,
                      win='ENT',
                      opts={'title': 'Arch entropy', 'showlegend': True,
                            'width': 500, 'legend': ent_legend,
                            'xlabel': 'Iterations', 'ylabel': 'Loss'})

    if hasattr(learner, 'arch_scores') and hasattr(learner, 'get_top_archs'):
        update_summary(learner.arch_scores[t_id], task_vis, 'scores')
        archs = model.get_top_archs(5)
        list_top_archs(archs, task_vis)

    if 'training_archs' in metrics:
        plot_trajectory(model.ssn.graph, metrics['training_archs'],
                        model.ssn.stochastic_node_ids, task_vis)

    postproc_time = time.time() - start3
    start4 = time.time()
    save_path = tune.get_trial_dir()
    finish_res = learner.finish_task(datasets[0], t_id,
                                     task_vis, save_path)
    finish_time = time.time() - start4

    start5 = time.time()
    eval_tasks = tasks
    # eval_tasks = tasks[:t_id + 1] if stream_setting else tasks
    evaluation = evaluate_on_tasks(eval_tasks, learner, batch_sizes[1],
                                   training_params['device'],
                                   ['Val', 'Test'], normalize,
                                   cur_task=t_id)
    assert evaluation['Val']['accuracy'][t_id] == b_state_dict['value']

    stats = {}
    eval_time = time.time() - start5

    stats.update(finish_res)

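    # LCA@lca_n: mean of the first lca_n + 1 test accuracies; if training
    # stopped early, the last recorded accuracy is repeated for the missing
    # steps.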
    test_accs = metrics['Test accuracy_0']
    if not test_accs:
        lca = float('nan')
    else:
        if len(test_accs) <= lca_n:
            last_key = max(test_accs.keys())
            assert len(test_accs) == last_key + 1,\
                f"Can't compute LCA@{lca_n} if steps were skipped " \
                f"(got {list(test_accs.keys())})"
            test_accs = test_accs.copy()
            last_acc = test_accs[last_key]
            for i in range(last_key + 1, lca_n+1):
                test_accs[i] = last_acc
        lca = np.mean([test_accs[i] for i in range(lca_n + 1)])

    accs = {}
    key = 'accuracy'
    # logger.warning(evaluation)
    for split in evaluation.keys():
        transfer_matrix[split].append(evaluation[split][key])
        for i in range(len(tasks)):
            split_acc = evaluation[split][key]
            if i < len(split_acc):
                accs['{}_T{}'.format(split, i)] = split_acc[i]
            else:
                accs['{}_T{}'.format(split, i)] = float('nan')
    plot_heatmaps(list(transfer_matrix.keys()),
                  list(map(fill_matrix, transfer_matrix.values())),
                  task_vis)


    # logger.warning(t_id)
    # logger.warning(transfer_matrix)

    avg_val = np.mean(evaluation['Val']['accuracy'])
    avg_val_so_far = np.mean(evaluation['Val']['accuracy'][:t_id+1])
    avg_test = np.mean(evaluation['Test']['accuracy'])
    avg_test_so_far = np.mean(evaluation['Test']['accuracy'][:t_id+1])

    step_time_s = time.time() - start1
    step_sum = model_creation_time + training_time + postproc_time + \
               finish_time + eval_time
    best_it = b_state_dict.get('cum_best_iter', b_state_dict['iter'])
    tune.report(t=t_id,
                best_val=b_state_dict['value'],
                avg_acc_val=avg_val,
                avg_acc_val_so_far=avg_val_so_far,
                avg_acc_test_so_far=avg_test_so_far,
                lca=lca,
                avg_acc_test=avg_test,
                test_acc=evaluation['Test']['accuracy'][t_id],
                duration_seconds=step_time_s,
                duration_iterations=t,
                duration_best_it=best_it,
                duration_finish=finish_time,
                duration_model_creation=model_creation_time,
                duration_training=training_time,
                duration_postproc=postproc_time,
                duration_eval=eval_time,
                duration_sum=step_sum,
                # entropy=stats.pop('entropy'),
                new_params=learner.new_params(t_id),
                total_params=learner.n_params(t_id),
                total_steps=total_steps + t,
                fw_t=round(avg_forward_time * 1000) / 1000,
                data_t=round(avg_data_time * 1000) / 1000,
                epoch_t=round(avg_epoch_time * 1000) / 1000,
                eval_t=round(avg_eval_time * 1000) / 1000,
                total_t=round(total_time * 1000) / 1000,
                env_url=get_env_url(vis_p),
                **accs, **stats)
    return rescaled, t, metrics, b_state_dict, stats
Example #5
    def run(self):
        if self.task_gen.concept_pool.attribute_similarities is not None:
            attr_sim = self.task_gen.concept_pool.attribute_similarities
            self.main_viz.heatmap(attr_sim,
                                  opts={'title': 'Attribute similarities'})
        if self.plot_tasks:
            self.task_gen.concept_pool.draw_tree(viz=self.main_viz,
                                                 title='Full tree')
            self.task_gen.concept_pool.draw_attrs(viz=self.main_viz)
            self.task_gen.concept_pool.plot_concepts(self.main_viz)

        self.init_tasks()
        self.init_sims()
        self.clean_tasks()
        if not self.stream_setting:
            self.init_models(True)
        else:
            details = self.init_models(False)
            logger.info('Architecture details for the first models:')
            for learner, det in details.items():
                logger.info(f'{learner}: {det} ({sum(det.values())}, '
                            f'{4*sum(det.values())/1e6})')
        self.init_plots()

        logger.info("General dashboard: {}".format(get_env_url(self.main_viz)))
        logger.info('Tasks: {}'.format(get_env_url(self.task_env)))
        # if self.use_ray and not self.use_processes:
        #     if self.redis_address and not self.local_mode:
        #         ray.init(redis_address=self.redis_address)
        #     else:
        #         logger.warning('Launching a new ray cluster')
        #         ray.init(object_store_memory=int(1e7), include_webui=True,
        #                  local_mode=self.local_mode, num_gpus=0)

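        # Build one training call per lifelong-learning model; execute_step
        # runs them, possibly in separate processes.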
        train_calls = []
        for model_name, ll_model in self.ll_models.items():
            vis_params = [vis_params[model_name]
                          for vis_params in self.training_envs]
            params = dict(learner=ll_model,
                          stream=self.task_gen.stream_infos(True),
                          task_level_tuning=self.val_per_task,
                          learner_name=model_name,
                          # exp_name=self.exp_name,
                          vis_params=vis_params,
                          plot_all=self.plot_all,
                          batch_sizes=self.batch_sizes,
                          n_it_max=self.n_it_max,
                          n_ep_max=self.n_ep_max,
                          augment_data=self.augment_data,
                          normalize=self.normalize,
                          schedule_mode=self.schedule_mode,
                          patience=self.patience,
                          # grace_period=self.grace_period,
                          num_hp_samplings=self.num_hp_samplings,
                          device=self.device,
                          log_steps=self.log_steps,
                          log_epoch=self.log_epoch,
                          exp_dir=self.exp_dir,
                          lca=self.lca,
                          single_pass=self.single_pass,
                          stream_setting=self.stream_setting,
                          split_optims=self.split_optims,
                          # use_ray=self.use_ray,
                          # use_ray_logging=self.use_ray_logging,
                          local_mode=self.local_mode,
                          redis_address=self.redis_address,
                          seed=self.seed
                          )
            train_calls.append(partial(tune_learner_on_stream, **params))

        ctx = torch.multiprocessing.get_context('spawn')
        # ctx = None
        results_array = execute_step(train_calls, self.use_processes, ctx=ctx)
        res = dict(zip(self.ll_models.keys(), results_array))

        summ = process_final_results(self.main_viz, res, self.exp_name,
                                     self.visdom_conf, self.task_envs_str,
                                     len(self.task_gen.task_pool),
                                     self.best_task_envs_str, self.val_per_task,
                                     self.visdom_traces_folder)

        plot_tasks_env_urls(self.task_envs_str, self.main_viz, 'all')
        plot_tasks_env_urls(self.best_task_envs_str, self.main_viz, 'best')
        self.save_traces()

        res_py = {k: [itm.to_dict('list') for itm in v] for k, v in res.items()}
        # res_2 = {k: [pandas.DataFrame(itm) for itm in v] for k, v in res_py.items()}

        # for (k1, v1), (k2, v2) in zip(res.items(), res_2.items()):
            # assert k1 == k2
            # print([i1.equals(i2) for i1, i2 in zip(v1, v2)])
        logger.info(f'Args {" ".join(sys.argv[2:])}')
        print(pandas.DataFrame(summ).set_index('model'))
        return [res_py, self.task_gen.stream_infos(full=False)]
    def train_on_task(self, task):
        logger.info('###############')
        logger.info('## Task {}/{} ##'.format(task.id, self.n_tasks))
        logger.info('###############')

        training_calls = []
        all_train_viz = []
        main_env_url = get_env_url(self.main_viz)
        logger.info("General dashboard: {}".format(main_env_url))
        logger.info('Tasks: {}'.format(get_env_url(self.task_env)))
        for j, (name, ll_model) in enumerate(self.ll_models.items()):
            ###
            # Init
            ###
            training_name = '{}_{}-{}'.format(self.exp_name, name,
                                              task.name)
            log_folder = '{}/{}'.format(self.visdom_traces_folder,
                                        training_name)
            task_viz = visdom.Visdom(env=training_name,
                                     log_to_filename=log_folder,
                                     **self.visdom_conf)
            task_viz.text('<pre>{}</pre>'.format(task), win='task_descr',
                          opts={'width': 800, 'height': 250})
            self.task_envs_str[name].append(get_env_url(task_viz))
            all_train_viz.append(task_viz)

            task_names = [t.name for t in self.task_gen.task_pool]
            ideal_tp, current_tp = plot_transfers(self.all_perfs[j],
                                                  self.sims[task.id],
                                                  task_names,
                                                  task_viz)
            self.ideal_potentials[j].append(ideal_tp)
            self.current_potentials[j].append(current_tp)

            if self.plot_tasks:
                task.plot_task(task_viz, training_name)
            ###
            # Prepare a call to train on task & Evaluate on all tasks
            ###
            past_tasks = self.task_gen.task_pool[:task.id]
            params = dict(task=task, past_tasks=past_tasks,
                          task_viz=task_viz, learner=ll_model,
                          training_name=training_name)
            training_calls.append(partial(self.give_task_to_learner,
                                          **params))

        # Execute all the training calls
        plot_tasks_env_urls(self.task_envs_str, self.main_viz)

        training_results = self.execute_step(training_calls)

        ### Handle the results
        if self.norm_models:
            min = np.array(training_results[self.norm_models_idx[0]][1])
            max = np.array(training_results[self.norm_models_idx[1]][1])

        for j, ((train_time, all_tests, all_confs), train_viz, learner_name) in \
                enumerate(zip(training_results, all_train_viz,
                              self.ll_models.keys())):
            self.training_times_it[j].append(train_time['iterations'])
            self.training_times_s[j].append(train_time['seconds'])
            for key, val in train_time.items():
                self.metrics['Train time {}'.format(key)][j].append(val)

            if self.norm_models:
                norm_tests = np.array(all_tests) - min
                norm_tests = (norm_tests / (max - min)).tolist()
                self.all_perfs_normalized[j].append(norm_tests)

            self.all_perfs[j].append(all_tests)

            ###
            # Plot
            ###
            plot_heatmaps([learner_name], [self.all_perfs[j]], train_viz)
            categories = list(
                map(str, self.task_gen.task_pool[task.id].src_concepts))
            plot_heatmaps([learner_name], [all_confs[task.id]], train_viz,
                          title='Confusion matrix', width=600, height=600,
                          xlabel='Predicted category',
                          ylabel='Real category',
                          # rownames=categories,
                          # columnnames=categories
                          )
            update_acc_plots(self.all_perfs[j], learner_name, self.main_viz)
            self.sacred_run.info['transfers'][learner_name] = self.all_perfs[j]
            name = '{} Accuracies'.format(learner_name)
            self.sacred_run.log_scalar(name, all_tests)
def main(args):
    if not os.path.isfile(MONGO_CONF_PATH):
        raise ValueError('File {} must exist'.format(MONGO_CONF_PATH))

    runs = get_runs(args.sacred_ids, args.slurm_ids, MONGO_CONF_PATH)

    viz = visdom.Visdom(args.host, port=args.port)

    envs = []
    tot_time = 0
    n_replayed = 0

    configs = {}
    results = {}
    failed = []
    index = None
    for run in runs:
        slurm_id = run['host']['ENV'].get('SLURM_JOB_ID', None)
        logger.info('\nProcessing run {} ({})'.format(run['_id'], slurm_id))
        if args.replay:
            # Replay part
            env, n, time = replay_run(run, viz, args, MONGO_CONF_PATH, logger)
            envs.append(env)
            tot_time += time
            n_replayed += n

        # Results aggregation part
        res = run['result']
        if not res:
            failed.append(run)
            continue
        configs[run['_id']] = run['config']
        if index is None:
            index = res['model']
        assert res['model'] == index
        # results[run['_id']] = {'Avg acc': res['accuracy']}
        metrics = ['accuracy', 'speed']
        accs = [((metric, mod), val) for metric in metrics
                for mod, val in zip(res['model'], res[metric])]
        results[run['_id']] = dict(accs)
        # results[(get_key(run['config']), run['_id'])] = res['accuracy']

    logger.info('Done.')
    logger.info('Replayed {} envs in {:.3f} seconds.'.format(
        n_replayed, tot_time))
    logger.info(envs)

    log_failed(failed)

    key, values = validate_configurations(configs)
    # res = pd.DataFrame(results, index=index)
    res = pd.DataFrame(results)
    # res.loc['ids'] = res.columns

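    # Index the result columns by (config value, run id) so that runs sharing
    # the same configuration can be grouped together.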
    ref_val = next(iter(values.values()))
    new_cols = [(values.get(k, ref_val), k) for k in res.columns]
    new_cols = pd.MultiIndex.from_tuples(new_cols, names=[key, '_id'])
    res.columns = new_cols
    res.sort_index(axis=1, inplace=True)
    if args.group:
        # Group and compute statistics over runs
        res = res.transpose().groupby(level=key).agg([np.mean,
                                                      np.std]).transpose()

        # agg_funcs = {'accuracy': lambda grp: grp.groupby(level=0).apply(lambda x: x.round(3).astype(str).apply('±'.join, 0)),
        #              'speed': lambda grp: grp.groupby(level=0).apply(lambda x: x.round(1).astype(str).apply('±'.join, 0))}
        # Process the results to get everything in the "mean±std" format
        res = res.groupby(level=[0, 1]).apply(
            lambda x: x.round(3).astype(str).apply('±'.join, 0))

    sacred_ids = list(results.keys())
    print(sacred_ids)
    print(res)

    id_str = ' '.join(map(str, sacred_ids))
    id_row = 'ids\t{}'.format(id_str)

    buf = io.StringIO()
    res.to_csv(
        buf,
        sep='\t',
        encoding='utf-8',
    )
    txt = buf.getvalue()
    txt += id_row
    clipboard_set(txt)
    viz = visdom.Visdom(args.host, port=args.port)
    viz.text(res.to_html(classes=['table', 'table-bordered', 'table-hover']))
    logger.info(get_env_url(viz))

    log_failed(failed)
Example #8
    def train_model_on_task(self, task, task_viz, exp_dir, use_ray,
                            use_ray_logging, smoke_test, n_it_max, grace_period,
                            num_hp_samplings, local_mode, tune_register_lock,
                            resources, **training_params):
        logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

        model = self.get_model(task_id=task.id)
        trainable = self.get_trainable(use_ray_logging=use_ray_logging)

        self.prepare_task(task, training_params)

        if use_ray:
            # Required to avoid collisions in Tune's global Registry:
            # https://github.com/ray-project/ray/blob/master/python/ray/tune/registry.py
            trainable = rename_class(trainable, training_params['name'])

            scheduler = None


            training_params['loss_fn'] = tune.function(
                training_params['loss_fn'])
            training_params['optim_func'] = tune.function(self.optim_func)
            training_params['n_it_max'] = n_it_max

            init_model_path = os.path.join(exp_dir, 'model_initializations')
            model_file_name = '{}_init.pth'.format(training_params['name'])
            model_path = os.path.join(init_model_path, model_file_name)
            torch.save(model, model_path)

            training_params['model_path'] = model_path
            config = {'hyper-params': self.get_search_space(smoke_test),
                      'tp': training_params}
            if use_ray_logging:
                stop_condition = {'training_iteration': n_it_max}
                loggers = None
            else:
                stop_condition = None
                loggers = [JsonLogger, MyCSVLogger]

            # We need to create the experiment using a lock here to avoid issues
            # with Tune's global registry, more specifically with the
            # `_to_flush` dict that may change during the iteration over it.
            # https://github.com/ray-project/ray/blob/e3c9f7e83a6007ded7ae7e99fcbe9fcaa371bad3/python/ray/tune/registry.py#L91-L93
            tune_register_lock.acquire()
            experiment = Experiment(
                name=training_params['name'],
                run=trainable,
                stop=stop_condition,
                config=config,
                resources_per_trial=resources,
                num_samples=num_hp_samplings,
                local_dir=exp_dir,
                loggers=loggers,
                keep_checkpoints_num=1,
                checkpoint_score_attr='min-mean_loss')
            tune_register_lock.release()

            analysis = tune.run(experiment,
                                scheduler=scheduler,
                                verbose=1,
                                raise_on_failed_trial=True,
                                # max_failures=-1,
                                # with_server=True,
                                # server_port=4321
                                )
            os.remove(model_path)
            logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

            all_trials = {t.logdir: t for t in analysis.trials}
            best_logdir = analysis.get_best_logdir('mean_loss', 'min')
            best_trial = all_trials[best_logdir]

            # picked_metric = 'accuracy_0'
            # metric_names = {s: '{} {}'.format(s, picked_metric) for s in
            #                 ['Train', 'Val', 'Test']}

            logger.info('Best trial: {}'.format(best_trial))
            best_res = best_trial._checkpoint.last_result
            best_point = (best_res['training_iteration'], best_res['mean_loss'])

            y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss']
            epoch_key = 'training_epoch'
            it_key = 'training_iteration' if use_ray_logging else 'training_iterations'
            plot_res_dataframe(analysis, training_params['name'], best_point,
                               task_viz, epoch_key, it_key, y_keys)
            best_model = self.get_model(task_id=task.id)
            best_model.load_state_dict(torch.load(best_trial._checkpoint.value))

            t = best_trial._checkpoint.last_result['training_iteration']
        else:
            data_path = training_params.pop('data_path')
            past_tasks = training_params.pop('past_tasks')
            datasets = trainable._load_datasets(data_path,
                                                training_params['loss_fn'],
                                                past_tasks)
            train_loader, eval_loaders = get_classic_dataloaders(datasets,
                                                                 training_params.pop('batch_sizes'))
            optim = self.optim_func(model.parameters())

            t, accs, best_state_dict = train(model, train_loader, eval_loaders,
                                             optimizer=optim, viz=task_viz,
                                             n_it_max=n_it_max, **training_params)
        logger.info('Finishing task ...')
        t1 = time.time()
        self.finish_task(task.datasets[0])
        logger.info('done in {}s'.format(time.time() - t1))

        return t