def benchmark_correlations(best_layer=True):
    data = []
    # Pereira internal
    Pereira_experiment2_scores, Pereira_experiment3_scores = collect_Pereira_experiment_scores(
        best_layer=best_layer)
    Pereira_experiment2_scores = Pereira_experiment2_scores['score'].values
    Pereira_experiment3_scores = Pereira_experiment3_scores['score'].values
    correlation_Pereira, p_Pereira = pearsonr(Pereira_experiment2_scores, Pereira_experiment3_scores)
    data.append(dict(benchmark1='Pereira Exp. 2', benchmark2='Pereira Exp. 3',
                     r=correlation_Pereira, p=p_Pereira))
    # cross-benchmark
    benchmarks = ('Pereira2018-encoding', 'Blank2014fROI-encoding', 'Fedorenko2016-encoding')
    for benchmark1, benchmark2 in itertools.combinations(benchmarks, 2):
        benchmark1_scores = collect_scores(benchmark=benchmark1, models=all_models)
        benchmark2_scores = collect_scores(benchmark=benchmark2, models=all_models)
        benchmark1_scores = average_adjacent(benchmark1_scores).dropna()
        benchmark2_scores = average_adjacent(benchmark2_scores).dropna()
        if best_layer:
            benchmark1_scores = choose_best_scores(benchmark1_scores)
            benchmark2_scores = choose_best_scores(benchmark2_scores)
        benchmark1_scores, benchmark2_scores = align_scores(
            benchmark1_scores, benchmark2_scores,
            identifier_set=('model',) if best_layer else ('model', 'layer'))
        benchmark1_scores, benchmark2_scores = benchmark1_scores['score'].values, benchmark2_scores['score'].values
        r, p = pearsonr(benchmark1_scores, benchmark2_scores)
        data.append(dict(benchmark1=benchmark1, benchmark2=benchmark2, r=r, p=p))
    data = pd.DataFrame(data)
    # plot
    fig, ax = pyplot.subplots(figsize=(3, 4))
    x = np.arange(len(data))
    ax.bar(x, data['r'])
    ax.set_xticks(x)
    ax.set_xticklabels([f"{benchmark1[:5]} / {benchmark2[:5]}" for benchmark1, benchmark2
                        in zip(data['benchmark1'].values, data['benchmark2'].values)], rotation=90)
    for _x, r, p in zip(x, data['r'].values, data['p'].values):
        ax.text(_x, r + .05, significance_stars(p) if p < .05 else 'n.s.',
                fontsize=12, horizontalalignment='center', verticalalignment='center')
    savefig(fig, Path(__file__).parent /
            ('benchmark-correlations' + ('-best' if best_layer else '-layers')))
def shortcomings(model):
    # plot the given model's scores on the word-, sentence-, and story-level variants of Futrell2018
    benchmarks = ['Futrell2018-encoding', 'Futrell2018sentences-encoding', 'Futrell2018stories-encoding']
    scores = [collect_scores(models=[model], benchmark=benchmark) for benchmark in benchmarks]
    scores = reduce(lambda left, right: pd.concat([left, right]), scores)
    scores = average_adjacent(scores).dropna()
    scores = choose_best_scores(scores)
    fig, ax = pyplot.subplots(figsize=(3.5, 5))
    x = np.arange(len(benchmarks))
    ax.bar(x, height=scores['score'], yerr=scores['error'], color=model_colors[model],
           edgecolor='none', ecolor='gray', error_kw=dict(elinewidth=1, alpha=.5))
    ax.set_xticks(x)
    ax.set_xticklabels(['words', 'sentences', 'stories'], rotation=45)
    ax.yaxis.set_major_locator(MultipleLocator(base=0.2))
    ax.yaxis.set_major_formatter(score_formatter)
    ax.set_ylim([0, 1.2])
    ax.set_ylabel('Normalized Predictivity')
    ax.set_title('Futrell2018 variations')
    savefig(fig, Path(__file__).parent / "bars-generalization")
def random_embedding():
    models = ['gpt2-xl', 'gpt2-xl-untrained', 'random-embedding']
    benchmarks = ['Pereira2018-encoding', 'Fedorenko2016v3-encoding',
                  'Blank2014fROI-encoding', 'Futrell2018-encoding']
    scores = [collect_scores(benchmark=benchmark, models=models) for benchmark in benchmarks]
    scores = [average_adjacent(benchmark_scores) for benchmark_scores in scores]
    scores = [choose_best_scores(benchmark_scores).dropna() for benchmark_scores in scores]
    scores = reduce(lambda left, right: pd.concat([left, right]), scores)
    fig, ax = pyplot.subplots(figsize=(5, 4))
    colors = {'gpt2-xl': model_colors['gpt2-xl'],
              'gpt2-xl-untrained': '#284343',
              'random-embedding': '#C3CCCC'}
    offsets = {0: -.2, 1: 0, 2: +.2}
    width = 0.5 / 3
    text_kwargs = dict(fontdict=dict(fontsize=7), color='white')
    base_x = np.arange(len(benchmarks))
    for i, model in enumerate(models):
        model_scores = scores[scores['model'] == model]
        x = base_x + offsets[i]
        ax.bar(x, height=model_scores['score'], yerr=model_scores['error'], width=width, align='center',
               color=colors[model], edgecolor='none', ecolor='gray', error_kw=dict(elinewidth=1, alpha=.5))
        for xpos in x:
            text = ax.text(x=xpos + .6 * width / 2, y=.05, s=model_label_replace[model],
                           rotation=90, rotation_mode='anchor', **text_kwargs)
            text.set_path_effects([patheffects.withStroke(linewidth=0.75, foreground='black')])
    ax.set_xticks(base_x)
    ax.set_xticklabels([benchmark_label_replace[benchmark] for benchmark in benchmarks], fontsize=9)
    ax.yaxis.set_major_locator(MultipleLocator(base=0.2))
    ax.yaxis.set_major_formatter(score_formatter)
    ax.set_ylim([0, 1.2])
    ax.set_ylabel('Normalized Predictivity')
    savefig(fig, Path(__file__).parent / "bars-random_embedding")
def retrieve_scores(benchmark, models=all_models):
    scores = collect_scores(benchmark, models)
    scores = average_adjacent(scores)  # average each model+layer's score per experiment and atlas
    scores = scores.fillna(0)  # nan scores are 0
    scores = choose_best_scores(scores)
    nan = scores[scores.isna().any(axis=1)]
    if len(nan) > 0:
        _logger.warning(f"Dropping nan rows: {nan}")
        scores = scores.dropna()
    return scores
def Fedorenko2016(best_layer=True):
    # compare scores on the Fedorenko2016 language benchmark vs. its non-language counterpart
    # and print the additive and multiplicative drops
    scores_lang = collect_scores(benchmark='Fedorenko2016v3-encoding', models=models, normalize=True)
    scores_nonlang = collect_scores(benchmark='Fedorenko2016v3nonlang-encoding', models=models, normalize=True)
    scores_lang, scores_nonlang = average_adjacent(scores_lang).dropna(), average_adjacent(scores_nonlang).dropna()
    if best_layer:
        scores_lang, scores_nonlang = choose_best_scores(scores_lang), choose_best_scores(scores_nonlang)
    scores_lang, scores_nonlang = align_scores(
        scores_lang, scores_nonlang,
        identifier_set=['model'] if best_layer else ['model', 'layer'])
    diffs = scores_lang['score'] - scores_nonlang['score']
    print(f"median drop {np.nanmedian(diffs)}+-{np.std(diffs)}")
    mults = scores_lang['score'] / scores_nonlang['score']
    print(f"median multiplicative drop {np.nanmedian(mults)}+-{np.std(mults)}")
def layer_preference_single(model='gpt2-xl',
                            benchmarks=('Pereira2018-encoding', 'Fedorenko2016v3-encoding',
                                        'Blank2014fROI-encoding'),
                            smoothing=False):
    data = [collect_scores(benchmark=benchmark, models=[model]) for benchmark in benchmarks]
    data = [average_adjacent(d) for d in data]
    fig, axes = pyplot.subplots(figsize=(15, 6), nrows=1, ncols=len(benchmarks), sharey=True)
    for benchmark_iter, (ax, benchmark_name, benchmark_data) in enumerate(
            zip(axes.flatten(), benchmarks, data)):
        ax.set_title(benchmark_name)
        num_layers = len(benchmark_data['layer'])  # assume layers are correctly ordered
        relative_position = np.arange(num_layers) / (num_layers - 1)
        y, error = benchmark_data['score'], benchmark_data['error']
        if smoothing:
            window_size = int(len(y) * 2 / 3)
            if window_size % 2 == 0:  # if even
                window_size += 1  # make odd (required for filter)
            y = savgol_filter(y, window_size, 3)
        shaded_errorbar(x=relative_position, y=y, error=error, label=model, ax=ax, alpha=0.4,
                        color=model_colors[model], linewidth=7.0 if model == 'gpt2-xl' else 1.0,
                        shaded_kwargs=dict(alpha=0.2, color=model_colors[model]))
        if benchmark_iter > 0:
            ax.set_yticklabels([])
        else:
            ax.set_ylabel('score')
        ax.set_ylim([0, 1.2])
    # xlabel
    fig.text(0.5, 0.01, 'relative layer position', ha='center')
    # save
    fig.tight_layout()
    savefig(fig, Path(__file__).parent / f'layer_ordering-{model}')
def layer_preference(benchmark='Pereira2018-encoding'):
    models = [model for model in all_models if len(model_layers[model]) > 1]  # need at least 2 layers to plot
    data = collect_scores(benchmark=benchmark, models=models)
    data = average_adjacent(data)
    groups = copy.deepcopy(model_groups)
    groups['other'] = [model for model in models
                       if not any(model in group for group in groups.values())]
    _logger.debug(f"Non-assigned models: {groups['other']}")
    fig, axes = pyplot.subplots(figsize=(20, 6), nrows=1, ncols=len(groups), sharey=True)
    for model_group_iter, (ax, (group_name, models)) in enumerate(
            zip(axes.flatten(), groups.items())):
        ax.set_title(group_name)
        for model in models:
            group = data[data['model'] == model]
            num_layers = len(group['layer'])  # assume layers are correctly ordered
            relative_position = np.arange(num_layers) / (num_layers - 1)
            shaded_errorbar(x=relative_position, y=group['score'], error=group['error'], label=model,
                            ax=ax, alpha=0.4, color=model_colors[model],
                            shaded_kwargs=dict(alpha=0.2, color=model_colors[model]))
        if model_group_iter > 0:
            ax.set_yticklabels([])
        else:
            ax.set_ylabel('score')
        ax.set_ylim([0, 1.2])
    # xlabel
    fig.text(0.5, 0.01, 'relative layer position', ha='center')
    # save
    fig.tight_layout()
    savefig(fig, Path(__file__).parent / f'layer_ordering-{benchmark}')
def first_last_layer_scores(benchmarks=('Pereira2018-encoding', 'Fedorenko2016v3-encoding',
                                        'Blank2014fROI-encoding')):
    models = all_models
    data = [collect_scores(benchmark=benchmark, models=models) for benchmark in benchmarks]
    data = [average_adjacent(d) for d in data]

    @matplotlib.ticker.FuncFormatter
    def score_formatter(score, pos):
        if score < 0 or score > 1:
            return ""
        return f"{score:.2f}"

    fig, axes = pyplot.subplots(figsize=(15, 15), nrows=len(benchmarks), ncols=1, sharey=False)
    width = 0.5
    for benchmark_iter, (benchmark, benchmark_data) in enumerate(zip(benchmarks, data)):
        ax = axes[benchmark_iter]
        for model_iter, model in enumerate(models):
            model_data = benchmark_data[benchmark_data['model'] == model]
            best_score = model_data['score'].max()
            best_score_error = model_data[model_data['score'] == best_score]['error']
            ax.errorbar(x=model_iter, y=best_score, yerr=best_score_error, marker='.', color='black',
                        label='best layer' if model_iter == len(models) - 1 else None)
            if len(model_data) > 1:
                first_score, first_score_error = model_data['score'].values[0], model_data['error'].values[0]
                ax.errorbar(x=model_iter - 0.2 * width, y=first_score, yerr=first_score_error, marker='.',
                            color='lightgray', label='first layer' if model_iter == len(models) - 1 else None)
                last_score, last_score_error = model_data['score'].values[-1], model_data['error'].values[-1]
                ax.errorbar(x=model_iter + 0.2 * width, y=last_score, yerr=last_score_error, marker='.',
                            color='gray', label='last layer' if model_iter == len(models) - 1 else None)
        if benchmark_iter < len(benchmarks) - 1:
            ax.set_xticks([])
        else:
            ax.set_xticks(np.arange(len(models)))
            ax.set_xticklabels([model_label_replace[model] for model in models], rotation=90)
        if benchmark_iter == 0:
            ax.legend()
        ax.set_ylabel('Normalized Predictivity')
        ax.set_title(benchmark_label_replace[benchmark])
        ax.yaxis.set_major_formatter(score_formatter)
    savefig(fig, Path(__file__).parent / 'first_last_layer_scores')
def _model_scores(benchmark, models):
    scores = collect_scores(benchmark=benchmark, models=models, normalize=True)
    scores = average_adjacent(scores).dropna()
    scores = choose_best_scores(scores)
    return scores
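# Illustrative entry point (not part of the original analysis code): a minimal sketch of how the
# plotting routines above might be invoked, assuming the surrounding module provides the project
# helpers and globals they rely on (collect_scores, savefig, all_models, model_colors, ...).
if __name__ == '__main__':
    benchmark_correlations(best_layer=True)   # cross-benchmark consistency bars
    shortcomings(model='gpt2-xl')             # Futrell2018 words / sentences / stories bars
    random_embedding()                        # trained vs. untrained vs. random-embedding bars
    Fedorenko2016(best_layer=True)            # language vs. non-language comparison (prints stats)
    layer_preference_single(model='gpt2-xl')  # per-layer scores for a single model
    layer_preference()                        # per-layer scores grouped by model family
    first_last_layer_scores()                 # best vs. first vs. last layer per model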