def compare_posteriors_with_different_data(cfg_name, model, t, replace_indices,
                                           params) -> None:
    plt.clf()
    plt.close()
    fig, axarr = plt.subplots(nrows=1, ncols=len(params))
    colours = cm.viridis(np.linspace(0.2, 0.8, len(replace_indices)))

    for j, replace_index in enumerate(replace_indices):
        for i, p in enumerate(params):
            samples = results_utils.get_posterior_samples(
                cfg_name,
                iter_range=(t, t + 1),
                model=model,
                replace_index=replace_index,
                params=[p])
            sns.distplot(samples,
                         ax=axarr[i],
                         color=to_hex(colours[j]),
                         label=str(replace_index),
                         kde=False)
    # label the axes

    for i, p in enumerate(params):
        axarr[i].set_xlabel('parameter ' + p)

    axarr[0].set_title('iteration ' + str(t))
    axarr[-1].legend()
    vis_utils.beautify_axes(axarr)

    return
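
# Usage sketch (the cfg name, model, and parameter names here are hypothetical,
# not taken from the code above): compare posteriors at step 500 for two
# replace indices:
# compare_posteriors_with_different_data('my_cfg', 'logistic', t=500,
#                                        replace_indices=[0, 1],
#                                        params=['#0', '#1'])
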
def weight_posterior(cfg_name,
                     model,
                     replace_indices='random',
                     t=500,
                     param='#0',
                     n_bins=25):
    """
    """
    iter_range = (t, t + 1)
    nolegend = False

    if replace_indices == 'random':
        print('Picking two *random* replace indices for this setting...')
        df = results_utils.get_available_results(cfg_name, model)
        replace_counts = df['replace'].value_counts()
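        # only keep replace indices with more than two completed runs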
        replaces = replace_counts[replace_counts > 2].index.values
        replace_indices = np.random.choice(replaces, 2, replace=False).tolist()
    elif isinstance(replace_indices, int):
        replace_indices = [replace_indices]
        nolegend = True

    assert isinstance(replace_indices, list)
    # Set up the plot
    fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(4, 2.5))
    # now load the data!
    for replace_index in replace_indices:
        df = results_utils.get_posterior_samples(cfg_name,
                                                 iter_range,
                                                 model,
                                                 replace_index=replace_index,
                                                 params=[param],
                                                 seeds='all')
        sns.distplot(df[param],
                     ax=axarr,
                     label=rf'D\{replace_index}',  # raw string avoids invalid '\{' escape
                     kde=True,
                     bins=n_bins,
                     norm_hist=True)

    axarr.set_xlabel('weight ' + param)
    if not nolegend:
        axarr.legend()
    axarr.set_ylabel('density')  # histogram is normalised (norm_hist=True)
    vis_utils.beautify_axes(np.array([axarr]))
    plt.tight_layout()

    plot_identifier = f'weight_posterior_{cfg_name}_{param}'
    plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.png'))
    plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.pdf'))

    return
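
# Usage sketch: replace_indices='random' samples two indices from the
# available results; an int plots a single posterior without a legend
# ('my_cfg' is a hypothetical config name):
# weight_posterior('my_cfg', 'logistic', replace_indices='random', t=500)
# weight_posterior('my_cfg', 'logistic', replace_indices=3, param='#2')
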
def plot_sigmas_distribution(model, cfg_names=None, ylim=None) -> None:
    if model == 'logistic':
        convergence_points = em.lr_convergence_points
        title = 'Logistic regression'
    else:
        convergence_points = em.nn_convergence_points
        title = 'Neural network'

    if cfg_names is None:
        cfg_names = convergence_points.keys()
    fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(3, 3))

    for ds in cfg_names:
        t = convergence_points[ds]
        # now just the sigmas distribution
        all_sigmas = dr.Sigmas(ds, model, t).load(diffinit=True)['sigmas']
        # lose the nans
        all_sigmas = all_sigmas[~np.isnan(all_sigmas)]
        min_sigma = np.min(all_sigmas)  # nans already removed
        sns.distplot(all_sigmas - min_sigma,
                     ax=axarr,
                     norm_hist=True,
                     label=em.dataset_names[ds],
                     color=to_rgba(em.dataset_colours[ds]),
                     kde=False,
                     bins=50)
        # np.percentile expects values in [0, 100]: these are the quartiles
        percentiles = np.percentile(all_sigmas, [0, 25, 50, 75, 100])
        print(ds, len(all_sigmas))
        print(percentiles)

    axarr.set_xlabel('variability estimate')
    axarr.set_ylabel('density')
    axarr.set_title(title)
    axarr.legend()

    if ylim is not None:
        axarr.set_ylim(ylim)
    axarr.set_xlim(0, None)
    vis_utils.beautify_axes(np.array([axarr]))
    plt.tight_layout()

    plot_identifier = f'stability_sigmas_dist_{model}'
    plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.png'))
    plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.pdf'))
    plt.clf()
    plt.close()

    return
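
# Usage sketch: plot sigma distributions at each dataset's convergence point
# (assumes em.lr_convergence_points is populated; 'my_cfg' is hypothetical):
# plot_sigmas_distribution('logistic')
# plot_sigmas_distribution('logistic', cfg_names=['my_cfg'], ylim=(0, 5))
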
def multivariate_normal_test_vis(df, logscale: bool = False) -> None:
    fig, axarr = plt.subplots(nrows=3, ncols=1, sharex=True)
    axarr[-1].set_xlabel('N')
    ns = df['n'].unique()
    ds = df['d'].unique()
    colours = cm.viridis(np.linspace(0, 1, len(ds)))
    for i, d in enumerate(ds):
        df_d = df[df['d'] == d]
        for j, label in enumerate(
            ['pval_diagonal_gauss', 'pval_nondiag_gauss', 'pval_laplace']):
            val_mean = df_d[[label, 'n']].groupby('n').mean()
            val_std = df_d[[label, 'n']].groupby('n').std()
            axarr[j].plot(val_mean.index,
                          val_mean.values[:, 0],
                          color=colours[i],
                          label=d)
            axarr[j].fill_between(val_mean.index,
                                  (val_mean - val_std).values[:, 0],
                                  (val_mean + val_std).values[:, 0],
                                  color=colours[i],
                                  alpha=0.1)
    fig.colorbar(plt.cm.ScalarMappable(plt.Normalize(vmin=min(ds),
                                                     vmax=max(ds)),
                                       cmap='viridis'),
                 ax=axarr,
                 label='dimension',
                 drawedges=False,
                 ticks=ds)
    axarr[0].set_ylabel('pval\nMVN')
    axarr[1].set_ylabel('pval\nMVNd')
    axarr[2].set_ylabel('pval\nlaplace')

    for ax in axarr:
        ax.axhline(y=0.05, ls='--', color='red', alpha=0.5)
        if logscale:
            ax.set_yscale('log')
            ax.set_ylim(1e-5, 1)
        else:
            ax.set_ylim(0, 1)
    vis_utils.beautify_axes(axarr)

    plt.savefig(PLOTS_DIR / f'multivar_test{"_log"*logscale}.png')
    plt.savefig(PLOTS_DIR / f'multivar_test{"_log"*logscale}.pdf')
    plt.clf()
    plt.close()
    return
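
# A sketch of the input this function assumes: a long-format DataFrame with
# one row per test run (several per (n, d) pair in practice) and columns 'n',
# 'd', and the three p-value columns. The numbers below are made-up, purely
# illustrative:
# import pandas as pd
# df = pd.DataFrame({'n': [100, 100, 1000, 1000],
#                    'd': [2, 10, 2, 10],
#                    'pval_diagonal_gauss': [0.40, 0.20, 0.60, 0.10],
#                    'pval_nondiag_gauss': [0.50, 0.30, 0.70, 0.20],
#                    'pval_laplace': [0.01, 0.02, 0.01, 0.03]})
# multivariate_normal_test_vis(df, logscale=True)
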
def visualise_weight_trajectory(cfg_name,
                                identifiers,
                                df=None,
                                save=True,
                                iter_range=(None, None),
                                params=['#4', '#2'],
                                include_optimum=False,
                                include_autocorrelation=False,
                                diffinit=False) -> None:
    """
    """
    df_list = []

    for identifier in identifiers:
        model = identifier['model']
        replace_index = identifier['replace']
        seed = identifier['seed']
        experiment = results_utils.ExperimentIdentifier(
            cfg_name, model, replace_index, seed, diffinit)
        df = experiment.load_weights(iter_range=iter_range, params=params)
        df_list.append(df)
    colors = cm.viridis(np.linspace(0.2, 0.8, len(df_list)))
    labels = [f"{x['model']}:{x['replace']}:{x['seed']}" for x in identifiers]

    if params is None:
        if len(df.columns) > 6:
            print('WARNING: No parameters indicated, choosing randomly...')
            params = np.random.choice(df_list[0].columns[1:], 4, replace=False)
        else:
            print('WARNING: No parameters indicated, selecting all')
            params = df_list[0].columns[1:]

    for p in params:
        for df in df_list:
            assert p in df.columns

    if include_optimum:
        # hack!
        optimum, hessian = data_utils.solve_with_linear_regression(cfg_name)

    if include_autocorrelation:
        ncols = 2
    else:
        ncols = 1
    fig, axarr = plt.subplots(nrows=len(params),
                              ncols=ncols,
                              sharex='col',
                              figsize=(4 * ncols, 1.5 * len(params) + 1))

    firstcol = axarr[:, 0] if include_autocorrelation else axarr

    for k, df in enumerate(df_list):
        color = to_hex(colors[k])

        for i, p in enumerate(params):
            firstcol[i].scatter(df['t'],
                                df[p],
                                c=color,
                                alpha=1,
                                s=4,
                                label=labels[k])
            firstcol[i].plot(df['t'],
                             df[p],
                             c=color,
                             alpha=0.75,
                             label='_nolegend_')
            firstcol[i].set_ylabel('param: ' + str(p))

            if include_optimum:
                firstcol[i].axhline(y=optimum[int(p[1:])],
                                    ls='--',
                                    color='red',
                                    alpha=0.5)
        firstcol[0].set_title('weight trajectory')
        firstcol[-1].set_xlabel('training steps')
        firstcol[0].legend()

        if include_autocorrelation:
            n_lags = 100
            autocorr = np.zeros(n_lags)
            axarr[0, 1].set_title('autocorrelation of weight trajectory')

            for i, p in enumerate(params):
                for lag in range(n_lags):
                    autocorr[lag] = df[p].autocorr(lag=lag)

                axarr[i, 1].plot(range(n_lags),
                                 autocorr,
                                 alpha=0.5,
                                 color=color)
                axarr[i, 1].scatter(range(n_lags),
                                    autocorr,
                                    s=4,
                                    zorder=2,
                                    color=color)
                axarr[i, 1].set_ylabel(p)
                axarr[i, 1].axhline(y=0, ls='--', alpha=0.5, color='black')
            axarr[-1, 1].set_xlabel('lag')

    vis_utils.beautify_axes(axarr)
    plt.tight_layout()

    if save:
        plot_identifier = f'weights_{cfg_name}_{"_".join(labels)}'
        plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.png'))
        plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.pdf'))
    plt.clf()
    plt.close()

    return
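
# Usage sketch: each identifier is a dict with 'model', 'replace' and 'seed'
# keys (all values below are hypothetical):
# visualise_weight_trajectory('my_cfg',
#                             identifiers=[{'model': 'logistic', 'replace': 0,
#                                           'seed': 1},
#                                          {'model': 'logistic', 'replace': 1,
#                                           'seed': 1}],
#                             params=['#4', '#2'])
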
def visualise_trace(cfg_names,
                    models,
                    replaces,
                    seeds,
                    privacys,
                    save=True,
                    include_batches=False,
                    iter_range=(None, None),
                    include_convergence=True,
                    diffinit=False,
                    convergence_tolerance=3,
                    include_vali=True,
                    labels=None) -> None:
    """
    Show the full training set loss as well as the gradient (at our element) over training
    """
    identifiers = vis_utils.process_identifiers(cfg_names, models, replaces,
                                                seeds, privacys)
    print(identifiers)

    if len(identifiers) > 1:
        print('WARNING: When more than one experiment is included, we turn '
              'off visualisation of batches to avoid cluttering the plot')
        include_batches = False

    if labels is None:
        labels = [
            f'{x["cfg_name"]}-{x["model"]}-{x["replace"]}-{x["seed"]}'
            for x in identifiers
        ]
    else:
        assert len(labels) == len(identifiers)
    loss_list = []
    kept_labels = []

    for identifier, label in zip(identifiers, labels):
        cfg_name = identifier['cfg_name']
        model = identifier['model']
        replace_index = identifier['replace']
        seed = identifier['seed']
        data_privacy = identifier['data_privacy']
        experiment = results_utils.ExperimentIdentifier(
            cfg_name,
            model,
            replace_index,
            seed,
            data_privacy=data_privacy,
            diffinit=diffinit)
        df_loss = experiment.load_loss(iter_range=iter_range)

        if df_loss is False:
            # skip this experiment (and its label) rather than appending an
            # empty placeholder, which would break the plotting below
            print('No fit data available for identifier:', identifier)
            continue
        loss_list.append(df_loss)
        kept_labels.append(label)
    labels = kept_labels

    if len(loss_list) == 0:
        print('Error: no valid data')

        return False

    if include_batches:
        minibatch_ids = loss_list[0]['minibatch_id'].unique()
        colormap = dict(
            zip(minibatch_ids, cm.viridis(np.linspace(0, 1,
                                                      len(minibatch_ids)))))
    colours = cm.viridis(np.linspace(0.2, 0.8, len(loss_list)))

    # what metrics were recorded for this run?
    metrics = loss_list[0].columns[2:]
    print('Visualising trace of', identifiers, 'with metrics', metrics)

    nrows = len(metrics)
    fig, axarr = plt.subplots(nrows=nrows,
                              ncols=1,
                              sharex='col',
                              figsize=(4, 3.2))

    if nrows == 1:
        axarr = np.array([axarr])

    for j, df in enumerate(loss_list):
        # this is just for the purpose of plotting the overall, not batches
        df_train = df.loc[df['minibatch_id'] == 'ALL', :]
        df_vali = df.loc[df['minibatch_id'] == 'VALI', :]

        # plot all

        for i, metric in enumerate(metrics):
            axarr[i].scatter(df_train['t'],
                             df_train[metric],
                             s=4,
                             color=colours[j],
                             zorder=2,
                             label='_nolegend_',
                             alpha=0.5)
            axarr[i].plot(df_train['t'],
                          df_train[metric],
                          alpha=0.25,
                          color=colours[j],
                          zorder=2,
                          label=labels[j])

            if include_vali:
                axarr[i].plot(df_vali['t'],
                              df_vali[metric],
                              ls='--',
                              color=colours[j],
                              zorder=2,
                              label='_nolegend_',
                              alpha=0.5)
            axarr[i].legend()

            if metric in ['mse']:
                axarr[i].set_yscale('log')
            axarr[i].set_ylabel(re.sub('_', '\n', metric))

            if include_batches:
                axarr[i].scatter(df['t'],
                                 df[metric],
                                 c=[colormap[x] for x in df['minibatch_id']],
                                 s=4,
                                 alpha=0.2,
                                 zorder=0)

                for minibatch_idx in df['minibatch_id'].unique():
                    df_temp = df.loc[df['minibatch_id'] == minibatch_idx, :]
                    axarr[i].plot(df_temp['t'],
                                  df_temp[metric],
                                  c=colormap[minibatch_idx],
                                  alpha=0.1,
                                  zorder=0)

    if include_convergence:
        for j, identifier in enumerate(identifiers):
            cfg_name = identifier['cfg_name']
            model = identifier['model']
            replace_index = identifier['replace']
            seed = identifier['seed']
            data_privacy = identifier['data_privacy']
            convergence_point = dr.find_convergence_point_for_single_experiment(
                cfg_name,
                model,
                replace_index,
                seed,
                diffinit,
                tolerance=convergence_tolerance,
                metric=metrics[0],
                data_privacy=data_privacy)
            print('Convergence point:', convergence_point)

            for ax in axarr:
                ax.axvline(x=convergence_point, ls='--', color=colours[j])
    axarr[-1].set_xlabel('training steps')

    vis_utils.beautify_axes(axarr)
    plt.tight_layout()

    if save:
        plot_label = '__'.join(
            [f'r{x["replace"]}-s{x["seed"]}' for x in identifiers])
        plot_identifier = f'trace_{cfg_name}_{plot_label}'
        plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.png'))
        plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.pdf'))
    plt.clf()
    plt.close()

    return
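
# Usage sketch: the parallel lists are expanded into experiment identifiers by
# vis_utils.process_identifiers (all values below are hypothetical):
# visualise_trace(cfg_names=['my_cfg'], models=['logistic'], replaces=[0],
#                 seeds=[1], privacys=['all'])
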
def fit_pval_histogram(what,
                       cfg_name,
                       model,
                       t,
                       n_experiments=3,
                       diffinit=False,
                       xlim=None,
                       seed=1) -> None:
    """
    histogram of p-values (across parameters-?) for a given model etc.
    """
    assert what in ['weights', 'gradients']
    # set some stuff up
    iter_range = (t, t + 1)
    fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(3.5, 2.1))
    pval_colour = '#b237c4'
    # sample experiments
    df = results_utils.get_available_results(cfg_name,
                                             model,
                                             diffinit=diffinit)
    replace_indices = df['replace'].unique()
    replace_indices = np.random.choice(replace_indices,
                                       n_experiments,
                                       replace=False)
    print('Looking at replace indices...', replace_indices)
    all_pvals = []

    for i, replace_index in enumerate(replace_indices):
        experiment = results_utils.ExperimentIdentifier(
            cfg_name, model, replace_index, seed, diffinit)

        if what == 'gradients':
            print('Loading gradients...')
            df = experiment.load_gradients(noise=True,
                                           iter_range=iter_range,
                                           params=None)
            second_col = df.columns[1]
        elif what == 'weights':
            df = results_utils.get_posterior_samples(
                cfg_name,
                iter_range=iter_range,
                model=model,
                replace_index=replace_index,
                params=None,
                seeds='all')
            second_col = df.columns[1]
        params = df.columns[2:]
        n_params = len(params)
        print(n_params)

        if n_params < 50:
            print(
                'ERROR: Insufficient parameters for this kind of visualisation, please try something else'
            )

            return False
        print('Identified', n_params, 'parameters, proceeding with analysis')
        p_vals = np.zeros(shape=(n_params))

        for j, p in enumerate(params):
            print('getting fit for parameter', p)
            df_fit = dr.estimate_statistics_through_training(
                what=what,
                cfg_name=None,
                model=None,
                replace_index=None,
                seed=None,
                df=df.loc[:, ['t', second_col, p]],
                params=None,
                iter_range=None)
            p_vals[j] = df_fit.loc[t, 'norm_p']
            del df_fit
        log_pvals = np.log(p_vals)
        all_pvals.append(log_pvals)
    log_pvals = np.concatenate(all_pvals)

    if xlim is not None:
        # remove values below the limit
        number_below = (log_pvals < xlim[0]).sum()
        print('There are', number_below, 'p-values below the limit of',
              xlim[0])
        log_pvals = log_pvals[log_pvals > xlim[0]]
        print('Remaining pvals:', len(log_pvals))
    sns.distplot(log_pvals,
                 kde=True,
                 bins=min(100, int(len(log_pvals) * 0.25)),
                 ax=axarr,
                 color=pval_colour,
                 norm_hist=True)
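    # nominal p = 0.05 threshold, plus the Bonferroni-corrected threshold
    # 0.05 / n_params (accounting for testing n_params parameters)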
    axarr.axvline(x=np.log(0.05),
                  ls=':',
                  label='p = 0.05',
                  color='black',
                  alpha=0.75)
    axarr.axvline(x=np.log(0.05 / n_params),
                  ls='--',
                  label='p = 0.05/' + str(n_params),
                  color='black',
                  alpha=0.75)
    axarr.legend()
    axarr.set_xlabel(r'$\log(p)$')
    axarr.set_ylabel('density')

    if xlim is not None:
        axarr.set_xlim(xlim)
    else:
        axarr.set_xlim((None, 0.01))

    vis_utils.beautify_axes(np.array([axarr]))
    plt.tight_layout()
    plot_identifier = f'pval_histogram_{cfg_name}_{model}_{what}'
    plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.png'))
    plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.pdf'))

    return
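
# Usage sketch (hypothetical cfg name): histogram the log p-values of the
# normality fit across weights from three sampled experiments:
# fit_pval_histogram('weights', 'my_cfg', 'logistic', t=500, n_experiments=3)
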
def weight_evolution(cfg_name,
                     model,
                     n_seeds=50,
                     replace_indices=None,
                     iter_range=(None, None),
                     params=['#4', '#2'],
                     diffinit=False,
                     aggregate=False):
    """
    Plot the evolution of weights over training, either aggregated across
    seeds (min/mean/max and std envelopes per replace index) or as individual
    seed trajectories for a single replace index.
    """
    plt.clf()
    plt.close()
    fig, axarr = plt.subplots(nrows=len(params),
                              ncols=1,
                              sharex=True,
                              figsize=(4, 3))

    if aggregate:
        colours = cm.get_cmap('Set1')(np.linspace(0.2, 0.8,
                                                  len(replace_indices)))
        assert n_seeds > 1

        for i, replace_index in enumerate(replace_indices):
            vary_S = results_utils.get_posterior_samples(
                cfg_name,
                iter_range,
                model,
                replace_index=replace_index,
                params=params,
                seeds='all',
                n_seeds=n_seeds,
                diffinit=diffinit)
            vary_S_min = vary_S.groupby('t').min()
            vary_S_std = vary_S.groupby('t').std()
            vary_S_max = vary_S.groupby('t').max()
            vary_S_mean = vary_S.groupby('t').mean()

            for j, p in enumerate(params):
                axarr[j].fill_between(vary_S_min.index,
                                      vary_S_min[p],
                                      vary_S_max[p],
                                      alpha=0.1,
                                      color=colours[i],
                                      label='_nolegend_')
                axarr[j].fill_between(vary_S_mean.index,
                                      vary_S_mean[p] - vary_S_std[p],
                                      vary_S_mean[p] + vary_S_std[p],
                                      alpha=0.1,
                                      color=colours[i],
                                      label='_nolegend_',
                                      linestyle='--')
                axarr[j].plot(vary_S_min.index,
                              vary_S_mean[p],
                              color=colours[i],
                              alpha=0.7,
                              label='D -' + str(replace_index))
                axarr[j].set_ylabel('weight ' + p)
    else:
        colours = cm.get_cmap('plasma')(np.linspace(0.2, 0.8, n_seeds))
        assert len(replace_indices) == 1
        replace_index = replace_indices[0]
        vary_S = results_utils.get_posterior_samples(
            cfg_name,
            iter_range,
            model,
            replace_index=replace_index,
            params=params,
            seeds='all',
            n_seeds=n_seeds,
            diffinit=diffinit)
        seeds = vary_S['seed'].unique()

        for i, s in enumerate(seeds):
            vary_Ss = vary_S.loc[vary_S['seed'] == s, :]

            for j, p in enumerate(params):
                axarr[j].plot(vary_Ss['t'],
                              vary_Ss[p],
                              color=colours[i],
                              label='seed ' + str(s),
                              alpha=0.8)

                if i == 0:
                    axarr[j].set_ylabel(r'$\mathbf{w}^{' + p[1:] + '}$')

    axarr[-1].set_xlabel('training steps')
    vis_utils.beautify_axes(np.array([axarr]))
    plt.tight_layout()
    # use '_' (not '.') so that with_suffix does not clobber the model name
    plot_identifier = f'weight_trajectory_{cfg_name}_{model}'
    plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.png'))
    plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.pdf'))

    return
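
# Usage sketch (hypothetical values): aggregate=True draws a min/mean/max
# envelope across seeds per replace index; aggregate=False draws each seed's
# trajectory for a single replace index:
# weight_evolution('my_cfg', 'logistic', n_seeds=50,
#                  replace_indices=[0, 1], aggregate=True)
# weight_evolution('my_cfg', 'logistic', n_seeds=10,
#                  replace_indices=[0], aggregate=False)
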
def qq_plot(what: str,
            cfg_name: str,
            model: str,
            replace_index: int,
            seed: int,
            times=[50],
            params='random') -> None:
    """
    grab trace file, do qq plot for gradient noise at specified time-point
    """
    plt.clf()
    plt.close()
    assert what in ['gradients', 'weights']

    if what == 'weights':
        print('Looking at weights, this means we consider all seeds!')
    colours = cm.viridis(np.linspace(0.2, 0.8, len(times)))

    experiment = results_utils.ExperimentIdentifier(cfg_name, model,
                                                    replace_index, seed)

    if params == 'random':
        if what == 'gradients':
            df = experiment.load_gradients(noise=True,
                                           params=None,
                                           iter_range=(min(times),
                                                       max(times) + 1))
        else:
            df = results_utils.get_posterior_samples(
                cfg_name,
                model=model,
                replace_index=replace_index,
                iter_range=(min(times), max(times) + 1),
                params=None)
        params = np.random.choice(df.columns[2:], 1)
        print('picking random parameter', params)
        first_two_cols = df.columns[:2].tolist()
        df = df.loc[:, first_two_cols + list(params)]
    else:
        if what == 'gradients':
            df = experiment.load_gradients(noise=True,
                                           params=params,
                                           iter_range=(min(times),
                                                       max(times) + 1))
        else:
            df = results_utils.get_posterior_samples(
                cfg_name,
                model=model,
                replace_index=replace_index,
                iter_range=(min(times), max(times) + 1),
                params=params)

    if df is False:
        print('ERROR: No data available')

        return False
    fig, axarr = plt.subplots(nrows=1, ncols=2, figsize=(7, 3.5))

    for i, t in enumerate(times):
        df_t = df.loc[df['t'] == t, :]
        X = df_t.iloc[:, 2:].values.flatten()
        print('number of samples:', X.shape[0])
        sns.distplot(X,
                     ax=axarr[0],
                     kde=False,
                     color=to_hex(colours[i]),
                     label=str(t))
        sm.qqplot(X,
                  line='45',
                  fit=True,
                  ax=axarr[1],
                  c=colours[i],
                  alpha=0.5,
                  label=str(t))
    plt.suptitle(f'cfg_name: {cfg_name}, model: {model}, {what}')
    axarr[0].legend()
    axarr[1].legend()
    axarr[0].set_xlabel('parameter: ' + '.'.join(params))
    vis_utils.beautify_axes(axarr)
    plt.tight_layout()

    plot_identifier = f'qq_{what}_{cfg_name}_{model}_{"_".join(params)}'
    plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.png'))
    plt.savefig((PLOTS_DIR / plot_identifier).with_suffix('.pdf'))

    return
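
# Usage sketch (hypothetical values): QQ-plot the gradient noise of one
# randomly chosen parameter at steps 50 and 100:
# qq_plot('gradients', 'my_cfg', 'logistic', replace_index=0, seed=1,
#         times=[50, 100], params='random')
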
def overlay_pval_plot(model='logistic',
                      xlim=None,
                      n_experiments=50,
                      cfg_names=None,
                      ylim=None) -> None:
    """
    want to overlay pvals from the four datasets in one plot
    """
    what = 'weights'
    figsize = (3.7, 3.05)

    if model == 'logistic':
        convergence_points = em.lr_convergence_points
        title = 'Logistic regression'
    else:
        convergence_points = em.nn_convergence_points
        title = 'Neural network'

    if cfg_names is None:
        cfg_names = em.dataset_colours.keys()
        plot_label = '_'
    else:
        plot_label = '_'.join(cfg_names) + '_'
    fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    vertical_lines_we_already_have = set()

    for ds in cfg_names:
        print(ds)
        log_pvals, n_params = vis_utils.fit_pval_histogram(
            what=what,
            dataset=ds,
            model=model,
            t=convergence_points[ds],
            n_experiments=n_experiments,
            plot=False)
        sns.distplot(log_pvals,
                     kde=True,
                     bins=min(100, int(len(log_pvals) * 0.25)),
                     ax=axarr,
                     color=em.dataset_colours[ds],
                     norm_hist=True,
                     label=em.get_dataset_name(ds),
                     kde_kws={'alpha': 0.6})

        if n_params not in vertical_lines_we_already_have:
            axarr.axvline(x=np.log(0.05 / (n_params * n_experiments)),
                          ls='--',
                          label='p = 0.05/' + str(n_params * n_experiments),
                          color=em.dataset_colours[ds],
                          alpha=0.75)
            vertical_lines_we_already_have.add(n_params)
    axarr.axvline(x=np.log(0.05),
                  ls=':',
                  label='p = 0.05',
                  color='black',
                  alpha=0.75)
    axarr.legend()
    axarr.set_xlabel(r'$\log(p)$')
    axarr.set_ylabel('density')
    axarr.set_title(title)

    if ylim is not None:
        axarr.set_ylim(ylim)

    if xlim is not None:
        axarr.set_xlim(xlim)
    else:
        axarr.set_xlim((None, 0.01))
    vis_utils.beautify_axes(np.array([axarr]))
    plt.tight_layout()
    figure_identifier = f'pval_histogram_{plot_label}_{model}'
    plt.savefig((FIGS_DIR / figure_identifier).with_suffix('.png'))
    plt.savefig((FIGS_DIR / figure_identifier).with_suffix('.pdf'))

    return
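
# Usage sketch: overlay p-value histograms at each dataset's convergence point
# (assumes em.lr_convergence_points covers the plotted datasets):
# overlay_pval_plot(model='logistic', n_experiments=50, xlim=(-30, 0.01))
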
def plot_stability_of_estimated_values(cfg_name, model, t) -> None:
    """
    Show how the estimated sigma and sensitivity stabilise as the number of
    seeds / dataset comparisons grows.
    """
    stability = dr.Stability(cfg_name, model, t)
    stability_dict = stability.load()

    # let's just do separate plots
    figsize = (3.5, 2.8)
    size = 6
    # SIGMA V N SEEDS
    print('Plotting sigma v seeds')
    sigma_df = stability_dict['sigma']
    sigma_v_seed = sigma_df[['num_seeds', 'sigma']]
    fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    axarr.scatter(sigma_v_seed['num_seeds'],
                  sigma_v_seed['sigma'],
                  s=size,
                  c=em.dp_colours['augment_diffinit'])
    sigma_we_use = dr.estimate_variability(cfg_name, model, t, diffinit=True)
    axarr.axhline(y=sigma_we_use,
                  ls='--',
                  c=em.dp_colours['augment_diffinit'],
                  alpha=0.4)
    axarr.set_xlabel('number of random seeds')
    axarr.set_ylabel(r'estimated $\sigma_i(\mathcal{D})$')
    axarr.set_title(
        em.get_dataset_name(cfg_name) + ' (' + em.model_names[model] + ')')
    upper_y = 1.05 * max(np.max(sigma_v_seed['sigma']), sigma_we_use)
    lower_y = 0.95 * np.min(sigma_v_seed['sigma'])
    axarr.set_ylim(lower_y, upper_y)
    vis_utils.beautify_axes(np.array([axarr]))

    plt.tight_layout()
    figure_identifier = f'stability_sigma_v_seeds_{cfg_name}_{model}_t{t}'
    plt.savefig((FIGS_DIR / figure_identifier).with_suffix('.png'))
    plt.savefig((FIGS_DIR / figure_identifier).with_suffix('.pdf'))

    plt.clf()
    plt.close()

    # With fixed num_deltas, sensitivity
    print('Plotting sens v num deltas')
    sens_df = stability_dict['sens']
    sens_v_deltas = sens_df[['num_deltas', 'sens']].drop_duplicates()
    fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    axarr.scatter(sens_v_deltas['num_deltas'],
                  sens_v_deltas['sens'],
                  s=size,
                  c=em.dp_colours['bolton'])
    sens_we_use = dr.estimate_sensitivity_empirically(cfg_name,
                                                      model,
                                                      t,
                                                      num_deltas='max',
                                                      diffinit=True,
                                                      data_privacy='all')
    axarr.axhline(y=sens_we_use, ls='--', c=em.dp_colours['bolton'], alpha=0.4)
    axarr.set_xlabel('number of dataset comparisons')
    axarr.set_ylabel('estimated sensitivity')
    axarr.set_ylim(0, None)
    axarr.set_xscale('log')
    axarr.set_title(
        em.get_dataset_name(cfg_name) + ' (' + em.model_names[model] + ')')
    vis_utils.beautify_axes(np.array([axarr]))
    plt.tight_layout()

    figure_identifier = f'stability_sens_v_deltas_{cfg_name}_{model}_t{t}'
    plt.savefig((FIGS_DIR / figure_identifier).with_suffix('.png'))
    plt.savefig((FIGS_DIR / figure_identifier).with_suffix('.pdf'))

    plt.clf()
    plt.close()

    return
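
# Usage sketch (hypothetical values):
# plot_stability_of_estimated_values('my_cfg', 'logistic', t=500)
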
def plot_distance_v_time(cfg_name,
                         model,
                         num_pairs='max',
                         convergence_point=None) -> None:
    """
    This will take precedence over the normal sens_var_over_time one
    """
    df = dr.VersusTime(cfg_name, model).load()

    # Get distance (vary seed)
    distance_columns = [x for x in df.columns if 'distance' in x]
    df_distance = df[['t'] + distance_columns].copy()  # copy to avoid SettingWithCopyWarning
    df_distance.dropna(axis=0, inplace=True)

    # Get sensitivity (vary data)
    df_sens = df[['t', 'theoretical_sensitivity',
                  'empirical_sensitivity']].copy()
    if model in ['mlp', 'cnn']:
        df_sens.drop(columns='theoretical_sensitivity', inplace=True)
    else:
        # discretise the sensitivity
        ds = [np.nan] * df_sens.shape[0]
        for i, ts in enumerate(df_sens['theoretical_sensitivity'].values):
            ds[i] = test_private_model.discretise_theoretical_sensitivity(
                cfg_name, model, ts)
        df_sens['theoretical_sensitivity_discretised'] = ds
    df_sens.dropna(axis=0, inplace=True)

    # Now plot
    size = 6
    fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(4, 2.1))

    # First distance (vary seed)
    t = df_distance['t']
    which_colours = {
        'fixinit': em.dp_colours['augment'],
        'diffinit': em.dp_colours['augment_diffinit']
    }
    which_labels = {'fixinit': np.nan, 'diffinit': r'$\Delta_V^{vary}$'}

    for which in ['diffinit']:  # not interested in fixinit
        min_dist = df_distance[f'min_{which}_distance']
        mean_dist = df_distance[f'mean_{which}_distance']
        max_dist = df_distance[f'max_{which}_distance']
        std_dist = df_distance[f'std_{which}_distance']
        axarr.plot(t,
                   mean_dist,
                   label=which_labels[which],
                   color=which_colours[which],
                   alpha=0.5)
        axarr.scatter(t,
                      mean_dist,
                      color=which_colours[which],
                      label='_nolegend_',
                      s=size)
        axarr.fill_between(t,
                           mean_dist - std_dist,
                           mean_dist + std_dist,
                           alpha=0.2,
                           label='_nolegend_',
                           color=which_colours[which])
        axarr.fill_between(t,
                           min_dist,
                           max_dist,
                           alpha=0.1,
                           label='_nolegend_',
                           color=which_colours[which])

    # Now sensitivity (vary data)
    t = df_sens['t']
    if 'theoretical_sensitivity_discretised' in df_sens:
        axarr.plot(t,
                   df_sens['theoretical_sensitivity_discretised'],
                   label=r'$\hat{\Delta}_S$',
                   alpha=0.5,
                   c=em.dp_colours['bolton'],
                   ls='--')
    axarr.scatter(t,
                  df_sens['empirical_sensitivity'],
                  label='_nolegend_',
                  s=size,
                  c=em.dp_colours['bolton'])
    axarr.plot(t,
               df_sens['empirical_sensitivity'],
               label=r'$\hat{\Delta}^*_S$',
               alpha=0.5,
               c=em.dp_colours['bolton'])

    if convergence_point is not None:
        # add a vertical line
        axarr.axvline(x=convergence_point, ls='--', alpha=0.5, color='black')
    # Now save and stuff
    axarr.legend()
    axarr.set_ylabel(r'$\|w - w^\prime\|$')
    axarr.set_xlabel('training steps')
    xmin, _ = axarr.get_xlim()  # this is a hack for mnist
    axarr.set_xlim(xmin, t.max())

    vis_utils.beautify_axes(np.array([axarr]))
    plt.tight_layout()

    figure_identifier = f'distance_v_time_{cfg_name}'
    plt.savefig((FIGS_DIR / figure_identifier).with_suffix('.png'))
    plt.savefig((FIGS_DIR / figure_identifier).with_suffix('.pdf'))

    return
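
# Usage sketch (hypothetical values):
# plot_distance_v_time('my_cfg', 'logistic', convergence_point=500)
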
def plot_delta_histogram(cfg_name: str,
                         model: str,
                         num_deltas='max',
                         t=500,
                         include_bounds=False,
                         xlim=None,
                         ylim=None,
                         data_privacy='all',
                         multivariate=False) -> None:
    """
    Histogram of the pairwise distances between trained weights when varying
    the dataset (vary_S), the seed (vary_r), or both, under fixed and varied
    initialisation.
    """
    if multivariate:
        raise NotImplementedError('Multivariate plotting is not implemented')
    delta_histogram = dr.DeltaHistogram(cfg_name, model, num_deltas, t,
                                        data_privacy, multivariate)
    plot_data = delta_histogram.load(diffinit=False)
    plot_data_diffinit = delta_histogram.load(diffinit=True)

    vary_both = plot_data['vary_both']
    vary_S = plot_data['vary_S']
    vary_r = plot_data['vary_r']

    vary_both_diffinit = plot_data_diffinit['vary_both']
    vary_S_diffinit = plot_data_diffinit['vary_S']
    vary_r_diffinit = plot_data_diffinit['vary_r']

    # remove NANs
    vary_both = vary_both[~np.isnan(vary_both)]
    vary_S = vary_S[~np.isnan(vary_S)]
    vary_r = vary_r[~np.isnan(vary_r)]
    vary_both_diffinit = vary_both_diffinit[~np.isnan(vary_both_diffinit)]
    vary_S_diffinit = vary_S_diffinit[~np.isnan(vary_S_diffinit)]
    vary_r_diffinit = vary_r_diffinit[~np.isnan(vary_r_diffinit)]
    # merge vary_S for the different initialisations
    vary_S = np.concatenate([vary_S, vary_S_diffinit])

    plt.clf()
    plt.close()
    fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(4, 2.1))
    print('Plotting varying S... number of deltas:', vary_S.shape[0])
    sns.distplot(vary_S,
                 ax=axarr,
                 color=em.dp_colours['bolton'],
                 label=r'$\Delta_S$',
                 kde=True,
                 norm_hist=True)
    print('Plotting varying r... number of deltas:', vary_r.shape[0])
    sns.distplot(vary_r,
                 ax=axarr,
                 color=em.dp_colours['augment'],
                 label=r'$\Delta_V^{fix}$',
                 kde=True,
                 norm_hist=True)
    sns.distplot(vary_r_diffinit,
                 ax=axarr,
                 color=em.dp_colours['augment_diffinit'],
                 label=r'$\Delta_V^{vary}$',
                 kde=True,
                 norm_hist=True)

    print('Plotting varying both... number of deltas:', vary_both.shape[0])
    sns.distplot(vary_both,
                 ax=axarr,
                 color=em.dp_colours['both'],
                 label=r'$\Delta_{S+V}^{fix}$',
                 kde=True,
                 hist=False,
                 kde_kws={'linestyle': '--'})
    sns.distplot(vary_both_diffinit,
                 ax=axarr,
                 color=em.dp_colours['both_diffinit'],
                 label=r'$\Delta_{S+V}^{vary}$',
                 kde=True,
                 hist=False,
                 kde_kws={
                     'linestyle': ':',
                     'lw': 2
                 })

    if include_bounds:
        assert model == 'logistic'
        lipschitz_constant = np.sqrt(2.0)
        _, batch_size, lr, _, N = em.get_experiment_details(cfg_name,
                                                            model,
                                                            verbose=True)
        wu_bound = test_private_model.compute_wu_bound(lipschitz_constant,
                                                       t=t,
                                                       N=N,
                                                       batch_size=batch_size,
                                                       eta=lr)
        axarr.axvline(x=wu_bound,
                      ls='--',
                      color=em.dp_colours['bolton'],
                      label=r'$\hat{\Delta}_S$')

    axarr.legend()
    axarr.set_xlabel(r'$\|w - w^\prime\|$')
    axarr.set_ylabel('density')

    if xlim is not None:
        axarr.set_xlim(xlim)

    if ylim is not None:
        axarr.set_ylim(ylim)

    vis_utils.beautify_axes(np.array([axarr]))
    plt.tight_layout()

    figure_identifier = f'delta_histogram_{cfg_name}_{data_privacy}_{model}_t{t}'

    plt.savefig((FIGS_DIR / figure_identifier).with_suffix('.png'))
    plt.savefig((FIGS_DIR / figure_identifier).with_suffix('.pdf'))

    return
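
# Usage sketch (hypothetical values): include_bounds adds the theoretical
# sensitivity bound from test_private_model.compute_wu_bound, which assumes a
# logistic model:
# plot_delta_histogram('my_cfg', 'logistic', t=500, include_bounds=True)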