def layer_deviations():
    original_scores = collect_scores(benchmark='Pereira2018-encoding', models=models)
    experiment2_scores, experiment3_scores = collect_Pereira_experiment_scores(best_layer=False)
    # filter trained only
    experiment2_scores = experiment2_scores[~experiment2_scores['model'].str.endswith('-untrained')]
    experiment3_scores = experiment3_scores[~experiment3_scores['model'].str.endswith('-untrained')]
    # compute deviation between the exp3 score of the layer chosen on exp2
    # and the max exp3 score (layer chosen on exp3)
    deviations = []
    assert (experiment2_scores['model'].values == experiment3_scores['model'].values).all()
    for model in set(experiment2_scores['model']):
        model_data2 = experiment2_scores[experiment2_scores['model'] == model]
        model_data3 = experiment3_scores[experiment3_scores['model'] == model]
        best_layer2 = model_data2['layer'][model_data2['score'] == max(model_data2['score'])].values[0]
        best_layer3 = model_data3['layer'][model_data3['score'] == max(model_data3['score'])].values[0]
        best3 = model_data3[model_data3['layer'] == best_layer3]
        chosen = model_data3[model_data3['layer'] == best_layer2]
        deviations.append({'model': model,
                           'best_layer': best_layer3, 'chosen_layer': best_layer2,
                           'max_score': best3['score'].values[0], 'error1': best3['error'].values[0],
                           'chosen_score': chosen['score'].values[0], 'error2': chosen['error'].values[0],
                           'reference_error': original_scores['error']})
    deviations = pd.DataFrame(deviations)
    deviations['deviation'] = deviations['max_score'] - deviations['chosen_score']
    deviations['avg_error'] = deviations.loc[:, ["error1", "error2"]].mean(axis=1)

    # plot
    fig, ax = pyplot.subplots()
    width = 0.5
    step = (len(models) + 1) * width
    offset = len(models) / 2
    for model_iter, model in enumerate(models):
        model_score = deviations[deviations['model'] == model]
        y, yerr = model_score['deviation'], model_score['avg_error']
        x = np.arange(start=0, stop=len(y) * step, step=step)
        model_x = x - offset * width + model_iter * width
        ax.bar(model_x, height=y, yerr=yerr, width=width, edgecolor='none',
               color=model_colors[model], ecolor='gray', error_kw=dict(elinewidth=1, alpha=.5))
        for xpos in model_x:
            ax.text(x=xpos + .6 * width / 2, y=.005, s=model_label_replace[model],
                    rotation=90, rotation_mode='anchor', fontdict=dict(fontsize=6.5), color='gray')
    ax.set_xticks([])
    ax.set_ylim([-.15, 1])
    ax.set_ylabel('train/test deviation of layer choice')
    savefig(fig, savename=Path(__file__).parent / 'layer_deviations')


def metric_generalizations():
    data_identifiers = ['Pereira2018', 'Fedorenko2016v3', 'Blank2014fROI']
    base_metric = 'encoding'
    comparison_metrics = ['rdm']  # , 'cka']
    fig = pyplot.figure(figsize=(20, 10 * len(comparison_metrics)), constrained_layout=True)
    gridspec = fig.add_gridspec(nrows=len(data_identifiers) * len(comparison_metrics), ncols=4)
    for benchmark_index, benchmark_prefix in enumerate(data_identifiers):
        settings = dict(xlim=[0, 1.2], ylim=[0, 1.8]) if benchmark_prefix.startswith("Pereira") \
            else dict(xlim=[0, .4], ylim=[0, .4]) if benchmark_prefix.startswith('Blank') \
            else dict()
        for metric_index, comparison_metric in enumerate(comparison_metrics):
            grid_row = (benchmark_index * len(comparison_metrics)) + metric_index
            correlations = []
            # all as well as trained and untrained separately
            for train_index, include_untrained in enumerate([True, False, 'only']):
                gridpos = gridspec[grid_row, 1 + train_index]
                ax = fig.add_subplot(gridpos)
                train_description = 'all' if include_untrained is True \
                    else 'trained' if include_untrained is False \
                    else 'untrained'
                ax.set_title(f"{comparison_metric} {train_description}")
                _, _, info = compare(benchmark1=f"{benchmark_prefix}-{base_metric}",
                                     benchmark2=f"{benchmark_prefix}-{comparison_metric}",
                                     **settings, plot_significance_p=False, plot_significance_stars=True,
                                     include_untrained=include_untrained, ax=ax)
                ax.set_xlabel(ax.get_xlabel() + '-' + base_metric)
                correlations.append({'trained': train_description, 'r': info['r'], 'p': info['p'],
                                     'index': train_index})
            # plot bars
            correlations = pd.DataFrame(correlations).sort_values(by='index')
            ax = fig.add_subplot(gridspec[grid_row, 0])
            ticks = list(reversed(correlations['index']))
            bar_width = 0.5
            ax.barh(y=ticks, width=correlations['r'], height=bar_width, color='#ababab')
            ax.set_yticks([])
            for ypos, label, pvalue in zip(ticks, correlations['trained'], correlations['p']):
                ax.text(y=ypos + .15 * bar_width / 2, x=.01, s=label,
                        verticalalignment='center', fontdict=dict(fontsize=20), color='black')
                ax.text(y=ypos, x=0, s=significance_stars(pvalue) if pvalue < .05 else 'n.s.',
                        rotation=90, rotation_mode='anchor', horizontalalignment='center',
                        fontdict=dict(fontsize=14, fontweight='normal'))
            ax.set_title(comparison_metric)
        fig.text(x=0.007, y=1 - (.33 * benchmark_index + .15), s=benchmark_prefix,
                 rotation=90, horizontalalignment='center', verticalalignment='center',
                 fontdict=dict(fontsize=20, fontweight='bold'))
    savefig(fig, savename=Path(__file__).parent / "metric_generalizations")


def untrained_vs_trained(benchmark='Pereira2018-encoding', layer_mode='best', model_selection=None,
                         analyze_only=False, **kwargs):
    """
    :param layer_mode: 'best' to select the best layer per model,
        'group' to keep all layers and color them based on their model,
        'pos' to keep all layers and color them based on their relative position.
    """
    all_models = model_selection or models
    all_models = [[model, f"{model}-untrained"] for model in all_models]
    all_models = [model for model_tuple in all_models for model in model_tuple]
    scores = collect_scores(benchmark=benchmark, models=all_models)
    scores = average_adjacent(scores)  # average experiments & atlases
    scores = scores.dropna()  # embedding layers in xlnets and t5s have nan scores
    if layer_mode == 'best':
        scores = choose_best_scores(scores)
    elif layer_mode == 'pos':
        scores['layer_position'] = [model_layers[model].index(layer) / len(model_layers[model])
                                    for model, layer in zip(scores['model'].values, scores['layer'].values)]
    # separate into trained / untrained
    untrained_rows = np.array([model.endswith('-untrained') for model in scores['model']])
    # copy the untrained slice to avoid chained-assignment warnings when renaming models below
    scores_trained, scores_untrained = scores[~untrained_rows], scores[untrained_rows].copy()
    # align
    scores_untrained['model'] = [model.replace('-untrained', '') for model in scores_untrained['model'].values]
    scores_trained, scores_untrained = align_scores(
        scores_trained, scores_untrained,
        identifier_set=('model',) if layer_mode == 'best' else ('model', 'layer'))
    if layer_mode != 'best':
        assert (scores_trained['layer'].values == scores_untrained['layer'].values).all()
    # analyze
    average_trained, average_untrained = np.mean(scores_trained['score']), np.mean(scores_untrained['score'])
    _, p_diff = pearsonr(scores_trained['score'], scores_untrained['score'])
    logger.info(f"Trained/untrained on {benchmark}: "
                f"score trained={average_trained:.2f}, untrained={average_untrained:.2f} | "
                f"diff {average_trained - average_untrained:.2f} "
                f"({average_trained / average_untrained * 100:.0f}%, p={p_diff})")
    if analyze_only:
        return
    # plot
    if layer_mode in ('best', 'group'):
        colors = [model_colors[model] for model in scores_trained['model']]
        colors = [to_rgba(named_color) for named_color in colors]
    else:
        cmap = matplotlib.cm.get_cmap('binary')
        colors = cmap(scores_trained['layer_position'].values)
    fig, ax = pyplot.subplots(figsize=(6, 6))
    _plot_scores1_2(scores_untrained, scores_trained, alpha=None if layer_mode == 'best' else 0.4, color=colors,
                    xlabel="architecture (no training)", ylabel="architecture + training",
                    plot_significance_stars=False, ax=ax, **kwargs)
    lims = [-.05, 1.1] if benchmark.startswith('Fedorenko') else [-.05, 1.2] if benchmark.startswith('Pereira') \
        else [8, 4] if benchmark.startswith('wikitext-2') else [0, 1.335] if benchmark.startswith('Futrell') \
        else [-.05, 1.]
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.plot(ax.get_xlim(), ax.get_xlim(), linestyle='dashed', color='darkgray')
    ax.set_title(benchmark_label_replace[benchmark])
    savefig(fig, savename=Path(__file__).parent / f"untrained_trained-{benchmark}")


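# Example usage (a minimal sketch, not part of the analysis pipeline; it assumes the scores for
# `models` on the named benchmarks have already been computed so that `collect_scores` can load them):
#
#   untrained_vs_trained(benchmark='Pereira2018-encoding', layer_mode='best')       # one point per model
#   untrained_vs_trained(benchmark='Fedorenko2016v3-encoding', layer_mode='pos')    # all layers, colored by relative depth
#   untrained_vs_trained(benchmark='Blank2014fROI-encoding', analyze_only=True)     # log averages only, no figure

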
def num_features_vs_score(benchmark='Pereira2018-encoding', per_layer=True, include_untrained=True):
    if include_untrained:
        all_models = [(model, f"{model}-untrained") for model in models]
        all_models = [model for model_tuple in all_models for model in model_tuple]
    else:
        all_models = models
    scores = collect_scores(benchmark=benchmark, models=all_models)
    scores = average_adjacent(scores)
    scores = scores.dropna()
    if not per_layer:
        scores = choose_best_scores(scores)
    # count number of features
    store_file = Path(__file__).parent / "num_features.csv"
    if store_file.is_file():
        num_features = pd.read_csv(store_file)
    else:
        num_features = []
        for model in tqdm(ordered_set(scores['model'].values), desc='models'):
            # mock-run stimuli that are already stored
            mock_extractor = ActivationsExtractorHelper(get_activations=None, reset=None)
            features = mock_extractor._from_sentences_stored(
                layers=model_layers[model.replace('-untrained', '')], sentences=None,
                identifier=model.replace('-untrained', ''),
                stimuli_identifier='Pereira2018-243sentences.astronaut')
            # store the feature count in the 'score' column so the shared plotting helper can use it directly
            if per_layer:
                for layer in scores['layer'].values[scores['model'] == model]:
                    num_features.append({'model': model, 'layer': layer,
                                         'score': len(features.sel(layer=layer)['neuroid'])})
            else:
                num_features.append({'model': model, 'score': len(features['neuroid'])})
        num_features = pd.DataFrame(num_features)
        num_features['error'] = np.nan
        num_features.to_csv(store_file, index=False)
    if per_layer:
        assert (scores['layer'].values == num_features['layer'].values).all()
    # plot
    colors = [model_colors[model.replace('-untrained', '')] for model in scores['model'].values]
    fig, ax = _plot_scores1_2(num_features, scores, color=colors, xlabel="number of features", ylabel=benchmark)
    savefig(fig, savename=Path(__file__).parent /
                          (f"num_features-{benchmark}" + ("-layerwise" if per_layer else "")))


def compare(benchmark1='wikitext-2', benchmark2='Blank2014fROI-encoding', include_untrained=False,
            best_layer=True, normalize=True, reference_best=False, identity_line=False, annotate=False,
            plot_ceiling=False, xlim=None, ylim=None, ax=None, **kwargs):
    ax_given = ax is not None
    all_models = models
    if include_untrained:
        all_models = [([model] if include_untrained != 'only' else []) + [f"{model}-untrained"]
                      for model in all_models]
        all_models = [model for model_tuple in all_models for model in model_tuple]
    scores1 = collect_scores(benchmark=benchmark1, models=all_models, normalize=normalize)
    scores2 = collect_scores(benchmark=benchmark2, models=all_models, normalize=normalize)
    scores1, scores2 = average_adjacent(scores1).dropna(), average_adjacent(scores2).dropna()
    if best_layer:
        choose_best = choose_best_scores if not reference_best else reference_best_scores
        scores1, scores2 = choose_best(scores1), choose_best(scores2)
    scores1, scores2 = align_scores(scores1, scores2,
                                    identifier_set=['model'] if best_layer else ['model', 'layer'])
    colors = [model_colors[model.replace('-untrained', '')] for model in scores1['model'].values]
    colors = [to_rgba(named_color) for named_color in colors]
    if not best_layer or not annotate:
        score_annotations = None
    elif annotate is True:
        score_annotations = scores1['model'].values
    else:
        score_annotations = [model if model in annotate else None for model in scores1['model'].values]
    fig, ax, info = _plot_scores1_2(scores1, scores2, color=colors, alpha=None if best_layer else .2,
                                    score_annotations=score_annotations, xlabel=benchmark1, ylabel=benchmark2,
                                    loss_xaxis=benchmark1.startswith('wikitext'), ax=ax, return_info=True,
                                    **kwargs)
    xlim_given, ylim_given = xlim is not None, ylim is not None
    xlim = xlim if xlim_given else ax.get_xlim()
    ylim = ylim if ylim_given else ax.get_ylim()
    normalize_x = normalize and not any(benchmark1.startswith(perf_prefix) for perf_prefix in performance_benchmarks)
    normalize_y = normalize and not any(benchmark2.startswith(perf_prefix) for perf_prefix in performance_benchmarks)
    if normalize_x and not xlim_given:
        xlim = [0, 1.1]
    if normalize_y and not ylim_given:
        ylim = [0, 1.1]
    if normalize_x and plot_ceiling:
        ceiling_err = get_ceiling(benchmark1)
        shaded_errorbar(y=ylim, x=np.array([1, 1]), error=ceiling_err, ax=ax, vertical=True, alpha=0,
                        shaded_kwargs=dict(color='gray', alpha=.5))
    if normalize_y and plot_ceiling:
        ceiling_err = get_ceiling(benchmark2)
        shaded_errorbar(x=xlim, y=np.array([1, 1]), error=ceiling_err, ax=ax, alpha=0,
                        shaded_kwargs=dict(color='gray', alpha=.5))
    if identity_line:
        lim = [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])]
        if not xlim_given:
            xlim = lim
        if not ylim_given:
            ylim = lim
        ax.plot(lim, lim, linestyle='dashed', color='gray')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    if not ax_given:
        savefig(fig, savename=Path(__file__).parent /
                              (f"{benchmark1}__{benchmark2}" + ('-best' if best_layer else '-layers')))
    return fig, ax, info
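

# Example usage (a minimal sketch; benchmark identifiers follow those used above, and the
# corresponding scores are assumed to already be available to `collect_scores`):
#
#   compare(benchmark1='Pereira2018-encoding', benchmark2='Pereira2018-rdm',            # metric generalization,
#           include_untrained=True, plot_significance_stars=True)                       # cf. metric_generalizations()
#   compare(benchmark1='wikitext-2', benchmark2='Blank2014fROI-encoding')               # language-modeling loss vs. brain score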