Example #1
def benchmark_correlations(best_layer=True):
    data = []
    # Pereira internal
    Pereira_experiment2_scores, Pereira_experiment3_scores = collect_Pereira_experiment_scores(
        best_layer=best_layer)
    Pereira_experiment2_scores = Pereira_experiment2_scores['score'].values
    Pereira_experiment3_scores = Pereira_experiment3_scores['score'].values
    correlation_Pereira, p_Pereira = pearsonr(Pereira_experiment2_scores,
                                              Pereira_experiment3_scores)
    data.append(
        dict(benchmark1='Pereira Exp. 2',
             benchmark2='Pereira Exp. 3',
             r=correlation_Pereira,
             p=p_Pereira))
    # cross-benchmark
    benchmarks = ('Pereira2018-encoding', 'Blank2014fROI-encoding',
                  'Fedorenko2016-encoding')
    for benchmark1, benchmark2 in itertools.combinations(benchmarks, 2):
        benchmark1_scores = collect_scores(benchmark=benchmark1,
                                           models=all_models)
        benchmark2_scores = collect_scores(benchmark=benchmark2,
                                           models=all_models)
        benchmark1_scores = average_adjacent(benchmark1_scores).dropna()
        benchmark2_scores = average_adjacent(benchmark2_scores).dropna()
        if best_layer:
            benchmark1_scores = choose_best_scores(benchmark1_scores)
            benchmark2_scores = choose_best_scores(benchmark2_scores)
        benchmark1_scores, benchmark2_scores = align_scores(
            benchmark1_scores,
            benchmark2_scores,
            identifier_set=('model', ) if best_layer else ('model', 'layer'))
        benchmark1_scores, benchmark2_scores = benchmark1_scores[
            'score'].values, benchmark2_scores['score'].values
        r, p = pearsonr(benchmark1_scores, benchmark2_scores)
        data.append(
            dict(benchmark1=benchmark1, benchmark2=benchmark2, r=r, p=p))
    data = pd.DataFrame(data)
    # plot
    fig, ax = pyplot.subplots(figsize=(3, 4))
    x = np.arange(len(data))
    ax.bar(x, data['r'])
    ax.set_xticks(x)
    ax.set_xticklabels([
        f"{benchmark1[:5]} / {benchmark2[:5]}" for benchmark1, benchmark2 in
        zip(data['benchmark1'].values, data['benchmark2'].values)
    ],
                       rotation=90)
    for _x, r, p in zip(x, data['r'].values, data['p'].values):
        ax.text(_x,
                r + .05,
                significance_stars(p) if p < .05 else 'n.s.',
                fontsize=12,
                horizontalalignment='center',
                verticalalignment='center')
    savefig(
        fig,
        # concatenate the suffix onto the filename string before building the Path,
        # since Path + str is not a valid operation
        Path(__file__).parent / ('benchmark-correlations' +
                                 ('-best' if best_layer else '-layers')))
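A minimal, self-contained sketch of the cross-benchmark pairing used above: two hypothetical score tables are aligned on the model identifier before Pearson's r is computed. The data and column names below are illustrative stand-ins, not the project's actual scores.

import pandas as pd
from scipy.stats import pearsonr

scores_a = pd.DataFrame({'model': ['m1', 'm2', 'm3'], 'score': [0.31, 0.45, 0.52]})
scores_b = pd.DataFrame({'model': ['m3', 'm1', 'm2'], 'score': [0.48, 0.35, 0.40]})
# align the two tables on the shared model identifier, analogous to align_scores
aligned = scores_a.merge(scores_b, on='model', suffixes=('_a', '_b'))
r, p = pearsonr(aligned['score_a'], aligned['score_b'])
print(f"r={r:.2f}, p={p:.3f}")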
Example #2
def shortcomings(model):
    benchmarks = [
        'Futrell2018-encoding', 'Futrell2018sentences-encoding',
        'Futrell2018stories-encoding'
    ]
    scores = [
        collect_scores(models=[model], benchmark=benchmark)
        for benchmark in benchmarks
    ]
    scores = reduce(lambda left, right: pd.concat([left, right]), scores)
    scores = average_adjacent(scores).dropna()
    scores = choose_best_scores(scores)
    fig, ax = pyplot.subplots(figsize=(3.5, 5))
    x = np.arange(len(benchmarks))
    ax.bar(x,
           height=scores['score'],
           yerr=scores['error'],
           color=model_colors[model],
           edgecolor='none',
           ecolor='gray',
           error_kw=dict(elinewidth=1, alpha=.5))
    ax.set_xticks(x)
    ax.set_xticklabels(['words', 'sentences', 'stories'], rotation=45)
    ax.yaxis.set_major_locator(MultipleLocator(base=0.2))
    ax.yaxis.set_major_formatter(score_formatter)
    ax.set_ylim([0, 1.2])
    ax.set_ylabel('Normalized Predictivity')
    ax.set_title('Futrell2018 variations')
    savefig(fig, Path(__file__).parent / "bars-generalization")
Example #3
def random_embedding():
    models = ['gpt2-xl', 'gpt2-xl-untrained', 'random-embedding']
    benchmarks = [
        'Pereira2018-encoding', 'Fedorenko2016v3-encoding',
        'Blank2014fROI-encoding', 'Futrell2018-encoding'
    ]
    scores = [
        collect_scores(benchmark=benchmark, models=models)
        for benchmark in benchmarks
    ]
    scores = [
        average_adjacent(benchmark_scores) for benchmark_scores in scores
    ]
    scores = [
        choose_best_scores(benchmark_scores).dropna()
        for benchmark_scores in scores
    ]
    scores = reduce(lambda left, right: pd.concat([left, right]), scores)

    fig, ax = pyplot.subplots(figsize=(5, 4))
    colors = {
        'gpt2-xl': model_colors['gpt2-xl'],
        'gpt2-xl-untrained': '#284343',
        'random-embedding': '#C3CCCC'
    }
    offsets = {0: -.2, 1: 0, 2: +.2}
    width = 0.5 / 3
    text_kwargs = dict(fontdict=dict(fontsize=7), color='white')
    base_x = np.arange(len(benchmarks))
    for i, model in enumerate(models):
        model_scores = scores[scores['model'] == model]
        x = base_x + offsets[i]
        ax.bar(x,
               height=model_scores['score'],
               yerr=model_scores['error'],
               width=width,
               align='center',
               color=colors[model],
               edgecolor='none',
               ecolor='gray',
               error_kw=dict(elinewidth=1, alpha=.5))
        for xpos in x:
            text = ax.text(x=xpos + .6 * width / 2,
                           y=.05,
                           s=model_label_replace[model],
                           rotation=90,
                           rotation_mode='anchor',
                           **text_kwargs)
            text.set_path_effects(
                [patheffects.withStroke(linewidth=0.75, foreground='black')])
    ax.set_xticks(base_x)
    ax.set_xticklabels(
        [benchmark_label_replace[benchmark] for benchmark in benchmarks],
        fontsize=9)
    ax.yaxis.set_major_locator(MultipleLocator(base=0.2))
    ax.yaxis.set_major_formatter(score_formatter)
    ax.set_ylim([0, 1.2])
    ax.set_ylabel('Normalized Predictivity')
    savefig(fig, Path(__file__).parent / "bars-random_embedding")
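The white bar labels above stay legible on light bars thanks to a thin dark outline; a minimal standalone sketch of that patheffects.withStroke technique (figure contents below are arbitrary, for illustration only):

from matplotlib import patheffects, pyplot

fig, ax = pyplot.subplots()
ax.bar([0, 1], [0.6, 0.9], color=['#284343', '#C3CCCC'])
label = ax.text(0, 0.05, 'model name', rotation=90, color='white', fontsize=8)
# a thin dark stroke keeps white text readable against light bars
label.set_path_effects([patheffects.withStroke(linewidth=0.75, foreground='black')])
fig.savefig('patheffect-demo.png')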
Example #4
def retrieve_scores(benchmark, models=all_models):
    scores = collect_scores(benchmark, models)
    scores = average_adjacent(
        scores)  # average each model+layer's score per experiment and atlas
    scores = scores.fillna(0)  # nan scores are 0
    scores = choose_best_scores(scores)
    nan = scores[scores.isna().any(axis=1)]
    if len(nan) > 0:
        _logger.warning(f"Dropping nan rows: {nan}")
        scores = scores.dropna()
    return scores
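The pipeline above relies on choose_best_scores; assuming it keeps, for each model, the layer with the highest score, a standalone pandas analogue could look like the sketch below (the DataFrame contents are hypothetical).

import pandas as pd

scores = pd.DataFrame({
    'model': ['m1', 'm1', 'm2', 'm2'],
    'layer': ['layer0', 'layer1', 'layer0', 'layer1'],
    'score': [0.20, 0.35, 0.40, 0.32],
})
# per model, keep the row whose score is highest
best = scores.loc[scores.groupby('model')['score'].idxmax()]
print(best)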
Example #5
def Fedorenko2016(best_layer=True):
    scores_lang = collect_scores(benchmark='Fedorenko2016v3-encoding',
                                 models=models,
                                 normalize=True)
    scores_nonlang = collect_scores(
        benchmark='Fedorenko2016v3nonlang-encoding',
        models=models,
        normalize=True)
    scores_lang, scores_nonlang = average_adjacent(
        scores_lang).dropna(), average_adjacent(scores_nonlang).dropna()
    if best_layer:
        scores_lang, scores_nonlang = choose_best_scores(
            scores_lang), choose_best_scores(scores_nonlang)
    scores_lang, scores_nonlang = align_scores(
        scores_lang,
        scores_nonlang,
        identifier_set=['model'] if best_layer else ['model', 'layer'])
    diffs = scores_lang['score'] - scores_nonlang['score']
    print(f"median drop {np.nanmedian(diffs)}+-{np.std(diffs)}")
    mults = scores_lang['score'] / scores_nonlang['score']
    print(f"median multiplicative drop {np.nanmedian(mults)}+-{np.std(mults)}")
Example #6
def layer_preference_single(
    model='gpt2-xl',
    benchmarks=('Pereira2018-encoding', 'Fedorenko2016v3-encoding',
                'Blank2014fROI-encoding'),
    smoothing=False,
):
    data = [
        collect_scores(benchmark=benchmark, models=[model])
        for benchmark in benchmarks
    ]
    data = [average_adjacent(d) for d in data]

    fig, axes = pyplot.subplots(figsize=(15, 6),
                                nrows=1,
                                ncols=len(benchmarks),
                                sharey=True)
    for benchmark_iter, (ax, benchmark_name, benchmark_data) in enumerate(
            zip(axes.flatten(), benchmarks, data)):
        ax.set_title(benchmark_name)
        num_layers = len(
            benchmark_data['layer'])  # assume layers are correctly ordered
        relative_position = np.arange(num_layers) / (num_layers - 1)
        y, error = benchmark_data['score'], benchmark_data['error']
        if smoothing:
            window_size = int(len(y) * 2 / 3)
            if window_size % 2 == 0:  # if even
                window_size += 1  # make odd (required for filter)
            y = savgol_filter(y, window_size, 3)
        shaded_errorbar(x=relative_position,
                        y=y,
                        error=error,
                        label=model,
                        ax=ax,
                        alpha=0.4,
                        color=model_colors[model],
                        linewidth=7.0 if model == 'gpt2-xl' else 1.0,
                        shaded_kwargs=dict(alpha=0.2,
                                           color=model_colors[model]))
        if benchmark_iter > 0:
            ax.set_yticklabels([])
        else:
            ax.set_ylabel('score')
        ax.set_ylim([0, 1.2])
    # xlabel
    fig.text(0.5, 0.01, 'relative layer position', ha='center')
    # save
    fig.tight_layout()
    savefig(fig, Path(__file__).parent / f'layer_ordering-{model}')
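The smoothing branch above requires an odd Savitzky-Golay window; a self-contained sketch of that constraint with scipy (toy signal, arbitrary parameters apart from the odd-window rule):

import numpy as np
from scipy.signal import savgol_filter

rng = np.random.RandomState(0)
y = np.sin(np.linspace(0, 3, 30)) + rng.normal(scale=0.1, size=30)
window_size = int(len(y) * 2 / 3)
if window_size % 2 == 0:  # savgol_filter requires an odd window length
    window_size += 1
smoothed = savgol_filter(y, window_size, 3)  # polynomial order 3, as above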
Example #7
def layer_preference(benchmark='Pereira2018-encoding'):
    models = [model for model in all_models if len(model_layers[model]) > 1
              ]  # need at least 2 layers to plot
    data = collect_scores(benchmark=benchmark, models=models)
    data = average_adjacent(data)

    groups = copy.deepcopy(model_groups)
    groups['other'] = [
        model for model in models
        if not any(model in group for group in groups.values())
    ]
    _logger.debug(f"Non-assigned models: {groups['other']}")
    fig, axes = pyplot.subplots(figsize=(20, 6),
                                nrows=1,
                                ncols=len(groups),
                                sharey=True)
    for model_group_iter, (ax, (group_name, models)) in enumerate(
            zip(axes.flatten(), groups.items())):
        ax.set_title(group_name)
        for model in models:
            group = data[data['model'] == model]
            num_layers = len(
                group['layer'])  # assume layers are correctly ordered
            relative_position = np.arange(num_layers) / (num_layers - 1)
            shaded_errorbar(x=relative_position,
                            y=group['score'],
                            error=group['error'],
                            label=model,
                            ax=ax,
                            alpha=0.4,
                            color=model_colors[model],
                            shaded_kwargs=dict(alpha=0.2,
                                               color=model_colors[model]))
        if model_group_iter > 0:
            ax.set_yticklabels([])
        else:
            ax.set_ylabel('score')
        ax.set_ylim([0, 1.2])
    # xlabel
    fig.text(0.5, 0.01, 'relative layer position', ha='center')
    # save
    fig.tight_layout()
    savefig(fig, Path(__file__).parent / f'layer_ordering-{benchmark}')
Example #8
def first_last_layer_scores(benchmarks=('Pereira2018-encoding',
                                        'Fedorenko2016v3-encoding',
                                        'Blank2014fROI-encoding')):
    models = all_models
    data = [
        collect_scores(benchmark=benchmark, models=models)
        for benchmark in benchmarks
    ]
    data = [average_adjacent(d) for d in data]

    @matplotlib.ticker.FuncFormatter
    def score_formatter(score, pos):
        if score < 0 or score > 1:
            return ""
        return f"{score:.2f}"

    fig, axes = pyplot.subplots(figsize=(15, 15),
                                nrows=len(benchmarks),
                                ncols=1,
                                sharey=False)
    width = 0.5
    for benchmark_iter, (benchmark,
                         benchmark_data) in enumerate(zip(benchmarks, data)):
        ax = axes[benchmark_iter]
        for model_iter, model in enumerate(models):
            model_data = benchmark_data[benchmark_data['model'] == model]
            best_score = model_data['score'].max()
            best_score_error = model_data[model_data['score'] ==
                                          best_score]['error']
            ax.errorbar(x=model_iter,
                        y=best_score,
                        yerr=best_score_error,
                        marker='.',
                        color='black',
                        label='best layer' if model_iter == len(models) -
                        1 else None)
            if len(model_data) > 1:
                first_score, first_score_error = model_data['score'].values[
                    0], model_data['error'].values[0]
                ax.errorbar(x=model_iter - 0.2 * width,
                            y=first_score,
                            yerr=first_score_error,
                            marker='.',
                            color='lightgray',
                            label='first layer' if model_iter == len(models) -
                            1 else None)
                last_score, last_score_error = model_data['score'].values[
                    -1], model_data['error'].values[-1]
                ax.errorbar(x=model_iter + 0.2 * width,
                            y=last_score,
                            yerr=last_score_error,
                            marker='.',
                            color='gray',
                            label='last layer' if model_iter == len(models) -
                            1 else None)
        if benchmark_iter < len(benchmarks) - 1:
            ax.set_xticks([])
        else:
            ax.set_xticks(np.arange(len(models)))
            ax.set_xticklabels(
                [model_label_replace[model] for model in models], rotation=90)
        if benchmark_iter == 0:
            ax.legend()
        ax.set_ylabel('Normalized Predictivity')
        ax.set_title(benchmark_label_replace[benchmark])
        ax.yaxis.set_major_formatter(score_formatter)
    savefig(fig, Path(__file__).parent / 'first_last_layer_scores')
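The score_formatter above hides tick labels outside the normalized [0, 1] range; a minimal standalone sketch of the same FuncFormatter pattern (plot contents are arbitrary placeholders):

import matplotlib.ticker
from matplotlib import pyplot

@matplotlib.ticker.FuncFormatter
def score_formatter(score, pos):
    # hide tick labels outside the normalized [0, 1] score range
    return "" if score < 0 or score > 1 else f"{score:.2f}"

fig, ax = pyplot.subplots()
ax.plot([0, 1, 2], [0.2, 0.8, 1.1])
ax.yaxis.set_major_formatter(score_formatter)
fig.savefig('formatter-demo.png')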
Example #9
def _model_scores(benchmark, models):
    scores = collect_scores(benchmark=benchmark, models=models, normalize=True)
    scores = average_adjacent(scores).dropna()
    scores = choose_best_scores(scores)
    return scores