def __call__(self, assembly, *args, **kwargs):
    result = Score([assembly.values[0]], dims=['dim'])
    raw = Score(result.copy(), coords={
        'dim_id': ('dim', [assembly.values[1]]),
        'division_coord': ('dim', [assembly.values[2]])})
    result.attrs['raw'] = raw
    return result
def __init__(self, noise_type, parent_category):
    self._noise_type = noise_type
    ceiling = Score([1, np.nan],
                    coords={'aggregation': ['center', 'error']},
                    dims=['aggregation'])
    super(Imagenet_C_Group, self).__init__(
        identifier=f'dietterich.Hendrycks2019-{noise_type}-top1', version=1,
        ceiling_func=lambda: ceiling,
        parent=f'dietterich.Hendrycks2019-{parent_category}-top1',
        bibtex=BIBTEX)
def fROI_correlation():
    assembly = load_voxels()
    stories = list(sorted(set(assembly['story'].values)))
    subjects = list(sorted(set(assembly['subject_UID'].values)))
    split_scores = []
    correlate = pearsonr_correlation(xarray_kwargs=dict(
        correlation_coord='stimulus_id', neuroid_coord='fROI_area'))
    cross_stories_subjects = list(itertools.product(stories, subjects))
    for story, heldout_subject in tqdm(cross_stories_subjects, desc='cross-{story,subject}'):
        story_assembly = assembly[{'presentation': [coord_story == story
                                                    for coord_story in assembly['story'].values]}]
        subject_pool = story_assembly[{'neuroid': [subject != heldout_subject
                                                   for subject in story_assembly['subject_UID'].values]}]
        subject_pool = average_subregions(subject_pool)
        heldout = story_assembly[{'neuroid': [subject == heldout_subject
                                              for subject in story_assembly['subject_UID'].values]}]
        heldout = average_subregions(heldout)
        split_score = correlate(subject_pool, heldout)
        split_score = type(split_score)(split_score.values, coords={
            coord: (dims, values) for coord, dims, values in walk_coords(split_score)
            if not coord.startswith('subject_') and coord != 'neuroid_id'}, dims=split_score.dims)
        split_score = split_score.expand_dims('heldout_subject').expand_dims('story')
        split_score['heldout_subject'], split_score['story'] = [heldout_subject], [story]
        split_scores.append(split_score)
    correlation = Score.merge(*split_scores)
    correlation = apply_aggregate(lambda scores: scores.mean('neuroid').mean('story'), correlation)
    center = correlation.mean('heldout_subject')
    error = correlation.std('heldout_subject')
    score = Score([center, error],
                  coords={**{'aggregation': ['center', 'error']},
                          **{coord: (dims, values) for coord, dims, values in walk_coords(center)}},
                  dims=('aggregation',) + center.dims)
    score.attrs[Score.RAW_VALUES_KEY] = correlation.attrs[Score.RAW_VALUES_KEY]
    return score
def __call__(self, prediction, target):
    # align
    prediction = prediction.sortby([self._correlation_coord, self._neuroid_coord])
    target = target.sortby([self._correlation_coord, self._neuroid_coord])
    assert np.array(prediction[self._correlation_coord].values == target[self._correlation_coord].values).all()
    assert np.array(prediction[self._neuroid_coord].values == target[self._neuroid_coord].values).all()
    # compute correlation per neuroid
    neuroid_dims = target[self._neuroid_coord].dims
    assert len(neuroid_dims) == 1
    correlations = []
    for i, coord_value in enumerate(target[self._neuroid_coord].values):
        target_neuroids = target.isel(**{neuroid_dims[0]: i})  # `isel` is about 10x faster than `sel`
        prediction_neuroids = prediction.isel(**{neuroid_dims[0]: i})
        r, p = self._correlation(target_neuroids, prediction_neuroids)
        correlations.append(r)
    # package
    result = Score(correlations,
                   coords={coord: (dims, values) for coord, dims, values in walk_coords(target)
                           if dims == neuroid_dims},
                   dims=neuroid_dims)
    return result
def test_sel(self):
    score = Score([1, 2], coords={'a': [1, 2]}, dims=['a'])
    score.attrs['raw'] = DataAssembly([0, 2, 1, 3], coords={'a': [1, 1, 2, 2]}, dims=['a'])
    sel_score = score.sel(a=1)
    np.testing.assert_array_equal(sel_score.raw['a'], [1, 1])
def test_mean_apply_raw(self):
    score = Score([1, 2], coords={'a': [1, 2]}, dims=['a'])
    score.attrs['raw'] = DataAssembly([0, 2, 1, 3], coords={'a': [1, 1, 2, 2]}, dims=['a'])
    mean_score = score.mean('a', _apply_raw=True)
    assert mean_score.raw == 1.5  # the mean is also applied to the raw values
def __init__(self):
    ceiling = Score([1, np.nan],
                    coords={'aggregation': ['center', 'error']},
                    dims=['aggregation'])
    assembly_repetition = get_assembly()
    assert len(np.unique(assembly_repetition['region'])) == 1
    assert hasattr(assembly_repetition, 'repetition')
    self.region = 'IT'
    self.assembly = average_repetition(assembly_repetition)
    self._assembly = self.assembly
    self.timebins = timebins_from_assembly(self.assembly)
    self._similarity_metric = CrossRegressedCorrelation(
        regression=pls_regression(), correlation=pearsonr_correlation(),
        crossvalidation_kwargs=dict(
            stratification_coord=Split.Defaults.stratification_coord
            if hasattr(self.assembly, Split.Defaults.stratification_coord) else None))
    identifier = f'{assembly_repetition.name}-layer_selection'
    ceiler = InternalConsistency()
    super(_MockBenchmark, self).__init__(
        identifier=identifier, ceiling_func=lambda: ceiler(assembly_repetition), version='1.0')
def __init__(self):
    stimulus_set = pd.read_csv(os.path.join(os.path.dirname(__file__), 'imagenet2012.csv'))
    stimulus_set = StimulusSet(stimulus_set)
    stimulus_set.image_paths = {row.image_id: row.filepath for row in stimulus_set.itertuples()}
    self._stimulus_set = stimulus_set
    self._similarity_metric = Accuracy()
    ceiling = Score([1, np.nan],
                    coords={'aggregation': ['center', 'error']},
                    dims=['aggregation'])
    super(Imagenet2012, self).__init__(identifier='fei-fei.Deng2009-top1', version=1,
                                       ceiling_func=lambda: ceiling,
                                       parent='ImageNet',
                                       bibtex="""@INPROCEEDINGS{5206848,
  author={J. {Deng} and W. {Dong} and R. {Socher} and L. {Li} and {Kai Li} and {Li Fei-Fei}},
  booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition},
  title={ImageNet: A large-scale hierarchical image database},
  year={2009},
  volume={},
  number={},
  pages={248-255},
  url={https://ieeexplore.ieee.org/document/5206848}
}""")
def __call__(self, candidate):
    self._logger.info('Computing activations')
    model_activations = read_words(candidate, self._target_assembly.attrs['stimulus_set'],
                                   reset_column='story_id',
                                   copy_columns=('stimulus_id', 'word_id', 'sentence_id'))
    assert set(model_activations['stimulus_id'].values) == set(self._target_assembly['stimulus_id'].values)
    self._logger.info('Scoring model')
    cross_subject_scores = self._cross_subject(
        self._target_assembly,
        apply=lambda cross_assembly: self._apply_within_subject(model_activations, cross_assembly))
    # normalize by ceiling
    # Note that we normalize by an overall ceiling, so the scores per subject are not normalized wrt. that subject
    # and should thus not be used by themselves. Only the aggregate makes sense to report
    normalized_subject_scores = consistency(cross_subject_scores.sel(aggregation='center'),
                                            self.ceiling.sel(aggregation='center'))
    score = normalized_subject_scores.median('subject_id')
    std = normalized_subject_scores.std('subject_id')
    std['aggregation'] = 'error'
    # the MultiIndex tends to mess things up, so we get rid of it here
    score, std = xr.DataArray(score).expand_dims('aggregation'), xr.DataArray(std).expand_dims('aggregation')
    score = Score(Score.merge(score, std))
    score.attrs['raw'] = cross_subject_scores
    score.attrs['ceiling'] = self.ceiling
    return score
def _apply_cross(self, source_assembly, cross_assembly):
    # some subjects have only done one experiment which leads to nans
    cross_assembly = cross_assembly.dropna('neuroid')
    if len(cross_assembly['neuroid']) == 0:
        return Score([np.nan, np.nan],
                     coords={'aggregation': ['center', 'error']},
                     dims=['aggregation'])
    return super(_PereiraSubjectWise, self)._apply_cross(
        source_assembly=source_assembly, cross_assembly=cross_assembly)
def test_mean(self):
    score = Score([1, 2], coords={'a': [1, 2]}, dims=['a'])
    score.attrs['raw'] = DataAssembly([0, 2, 1, 3], coords={'a': [1, 1, 2, 2]}, dims=['a'])
    mean_score = score.mean('a')
    np.testing.assert_array_equal(mean_score.raw['a'], [1, 1, 2, 2])
def __init__(self):
    ceiling = Score([.79, np.nan],  # following private conversation with Kohitij Kar
                    coords={'aggregation': ['center', 'error']},
                    dims=['aggregation'])
    super(DicarloKar2019OST, self).__init__(identifier='dicarlo.Kar2019-ost', version=2,
                                            ceiling_func=lambda: ceiling,
                                            parent='IT-temporal',
                                            paper_link='https://www.nature.com/articles/s41593-019-0392-5')
    assembly = brainscore.get_assembly('dicarlo.Kar2019')
    # drop duplicate images
    _, index = np.unique(assembly['image_id'], return_index=True)
    assembly = assembly.isel(presentation=index)
    assembly.attrs['stimulus_set'] = assembly.stimulus_set.drop_duplicates('image_id')
    assembly = assembly.sel(decoder='svm')
    self._assembly = assembly
    self._assembly['truth'] = self._assembly['image_label']
    self._assembly.stimulus_set['truth'] = self._assembly.stimulus_set['image_label']
    self._similarity_metric = OSTCorrelation()
    self._visual_degrees = VISUAL_DEGREES
    self._number_of_trials = 44
def __call__(self, candidate: BrainModel):
    candidate.start_recording('IT', time_bins=self._time_bins)
    stimulus_set = place_on_screen(self._assembly.stimulus_set,
                                   target_visual_degrees=candidate.visual_degrees(),
                                   source_visual_degrees=self._visual_degrees)
    # Temporal recordings from large candidates take up a lot of memory and compute time.
    # In order to quickly reject recordings that are static over time,
    # we will show one image and check whether the recordings vary over time at all or not.
    # If they don't we can quickly score the candidate with a failure state
    # since it will not be able to predict temporal differences with the OST metric
    check_stimulus_set = stimulus_set[:1]
    check_stimulus_set.identifier = None  # unset identifier to avoid storing (interferes with actual stimulus_set)
    check_recordings = candidate.look_at(check_stimulus_set, number_of_trials=self._number_of_trials)
    if not temporally_varying(check_recordings):
        score = Score([np.nan, np.nan],
                      coords={'aggregation': ['center', 'error']},
                      dims=['aggregation'])
    else:
        recordings = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials)
        score = self._similarity_metric(recordings, self._assembly)
    score = ceil_score(score, self.ceiling)
    return score
def run_evaluation(return_score=False):
    scores = []
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    for eval_task in eval_task_names:
        examples, label_list, output_mode = get_examples(data_dir=data_dir, task=eval_task, evaluate=True)
        eval_dataset = model.glue_dataset(task=eval_task, examples=examples, label_list=label_list,
                                          output_mode=output_mode, max_seq_length=max_seq_length)
        result = evaluate(features_model=model, decoder_head=decoder_head, eval_dataset=eval_dataset,
                          task_name=eval_task, output_mode=output_mode, device=device)
        if not return_score:
            return result  # we're ignoring mnli-mm here, but this return is just for progress logging anyway
        score = Score([[value for key, value in result.items()]],
                      coords={'eval_task': [eval_task], 'measure': list(result.keys())},
                      dims=['eval_task', 'measure'])
        score.attrs['data_dir'] = data_dir
        score.attrs['benchmark_identifier'] = f"glue-{self.task_name}"
        score.attrs['eval_task'] = eval_task
        score.attrs['model_identifier'] = model.identifier
        scores.append(score)
    scores = Score.merge(*scores)
    return scores
def _repeat(self, func):
    random_state = self._initialize_random_state()
    repetitions = list(range(self._repetitions))
    scores = [func(random_state=random_state) for repetition in repetitions]
    score = Score(scores, coords={'split': repetitions}, dims=['split'])
    self._save_matrix()
    return apply_aggregate(self.aggregate, score)
def correlate(self, predicted_osts, target_osts):
    non_nan = np.logical_and(~np.isnan(predicted_osts), ~np.isnan(target_osts))
    predicted_osts, target_osts = predicted_osts[non_nan], target_osts[non_nan]
    # use Spearman over Pearson since it tests whether the rank orders are the same,
    # which allows for nonlinear correlates whereas Pearson assumes linearity.
    correlation, p = spearmanr(predicted_osts, target_osts)
    return Score(correlation)
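# The comment above motivates Spearman over Pearson. A minimal, self-contained
# illustration (not part of the metric; assumes only numpy and scipy): for a
# monotonic but nonlinear relationship, Spearman stays at 1 while Pearson drops below 1.
import numpy as np
from scipy.stats import pearsonr, spearmanr

x = np.linspace(1, 10, 50)
y = x ** 3  # monotonic but nonlinear transform of x
print(pearsonr(x, y)[0])   # < 1: Pearson penalizes the nonlinearity
print(spearmanr(x, y)[0])  # == 1: the rank order is fully preserved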
def __call__(self, model: TaskModel):
    model.mode = TaskModel.Modes.tokens_to_features
    set_seed(self.seed)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.debug(f"Using block size {self.block_size} for {model.identifier}")
    # Data
    vocab_size = min(model.vocab_size, 250000)
    train_tokens = TextDataset(model_identifier=model.identifier, model=model, block_size=self.block_size,
                               vocab_size=vocab_size, file_path=self.train_data_file)
    val_tokens = TextDataset(model_identifier=model.identifier, model=model, block_size=self.block_size,
                             vocab_size=vocab_size, file_path=self.val_data_file)
    test_tokens = TextDataset(model_identifier=model.identifier, model=model, block_size=self.block_size,
                              vocab_size=vocab_size, file_path=self.eval_data_file)
    # Decoder
    logger.info(f"Vocab size: {vocab_size}")
    features_sample, _ = train_tokens[0]
    lm_head = LMHeadModel(features_size=features_sample.shape[-1], vocab_size=vocab_size,
                          embedding_weights=model.get_embedding_weights() if self.tied else None)
    lm_head = lm_head.to(device)
    # Train
    train(model=lm_head, train_dataset=train_tokens, val_dataset=val_tokens,
          device=device, seed=self.seed, **self.kwargs)
    # Evaluation
    test_result = evaluate(model=lm_head, eval_dataset=test_tokens, device=device)
    score = Score([test_result[key] for key in ['perplexity', 'loss']],
                  coords={'measure': ['test_perplexity', 'test_loss']}, dims=['measure'])
    score.attrs['datasets'] = {'train': self.train_data_file,
                               'val': self.val_data_file,
                               'test': self.eval_data_file}
    score.attrs['benchmark_identifier'] = self.identifier
    score.attrs['model_identifier'] = model.identifier
    return score
def __call__(self, source_recordings, target_osts):
    if len(set(source_recordings['time_bin'].values)) <= 1:  # short-cut for non-temporal models
        return Score([np.nan, np.nan],
                     coords={'aggregation': ['center', 'error']},
                     dims=['aggregation'])
    score = self._cross_validation(source_recordings, target_osts, apply=self.apply)
    return score
def __call__(self, source, target):
    values = source == target
    center = np.mean(values)
    error = np.std(values)
    score = Score([center, error],
                  coords={'aggregation': ['center', 'error']},
                  dims=('aggregation',))
    score.attrs[Score.RAW_VALUES_KEY] = values
    return score
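# Hedged usage sketch for the label-matching metric above (illustrative names and
# values; assumes `Score` is importable from `brainscore.metrics`, consistent with the
# other snippets here). Per-trial correctness is reduced to a center/error aggregation
# while the raw trial-level values are kept on the score.
import numpy as np
from brainscore.metrics import Score

source = np.array(['dog', 'cat', 'dog', 'bird'])  # hypothetical model choices
target = np.array(['dog', 'cat', 'cat', 'bird'])  # hypothetical ground-truth labels
values = source == target
score = Score([np.mean(values), np.std(values)],
              coords={'aggregation': ['center', 'error']}, dims=('aggregation',))
score.attrs[Score.RAW_VALUES_KEY] = values
print(score)  # center = 0.75, error ~ 0.43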
def aggregate_neuroid_scores(neuroid_scores, subject_column):
    subject_scores = neuroid_scores.groupby(subject_column).median()
    center = subject_scores.median(subject_column)
    subject_values = np.nan_to_num(subject_scores.values, nan=0)  # mad cannot deal with all-nan in one axis, treat as 0
    subject_axis = subject_scores.dims.index(subject_scores[subject_column].dims[0])
    error = median_absolute_deviation(subject_values, axis=subject_axis)
    score = Score([center, error], coords={'aggregation': ['center', 'error']}, dims=['aggregation'])
    score.attrs['raw'] = neuroid_scores
    score.attrs['description'] = "score aggregated by taking median of neuroids per subject, " \
                                 "then median of subject scores"
    return score
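# Hedged usage sketch for `aggregate_neuroid_scores` (made-up values; assumes
# `DataAssembly` comes from `brainio_base.assemblies` as referenced elsewhere in this
# codebase, and that the function above plus its imports are in scope).
import numpy as np
from brainio_base.assemblies import DataAssembly

neuroid_scores = DataAssembly([.2, .3, .5, .4],
                              coords={'neuroid_id': ('neuroid', [1, 2, 3, 4]),
                                      'subject': ('neuroid', ['A', 'A', 'B', 'B'])},
                              dims=['neuroid'])
score = aggregate_neuroid_scores(neuroid_scores, subject_column='subject')
# center is the median of per-subject medians, here median([.25, .45]) = .35;
# error is the median absolute deviation across those subject medians.
print(score)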
def extrapolate_neuroid(self, ceilings):
    # figure out how many extrapolation x points we have. E.g. for Pereira, not all combinations are possible
    subject_subsamples = list(sorted(set(ceilings['num_subjects'].values)))
    rng = RandomState(0)
    bootstrap_params = []
    for bootstrap in range(self.num_bootstraps):
        bootstrapped_scores = []
        for num_subjects in subject_subsamples:
            num_scores = ceilings.sel(num_subjects=num_subjects)
            # the sub_subjects dimension creates nans, get rid of those
            num_scores = num_scores.dropna(f'sub_{self.subject_column}')
            assert set(num_scores.dims) == {f'sub_{self.subject_column}', 'split'} or \
                   set(num_scores.dims) == {f'sub_{self.subject_column}'}
            # choose from subject subsets and the splits therein, with replacement for variance
            choices = num_scores.values.flatten()
            bootstrapped_score = rng.choice(choices, size=len(choices), replace=True)
            bootstrapped_scores.append(np.mean(bootstrapped_score))
        try:
            params = self.fit(subject_subsamples, bootstrapped_scores)
        except RuntimeError:  # optimal parameters not found
            params = [np.nan, np.nan]
        params = DataAssembly([params],
                              coords={'bootstrap': [bootstrap], 'param': ['v0', 'tau0']},
                              dims=['bootstrap', 'param'])
        bootstrap_params.append(params)
    bootstrap_params = merge_data_arrays(bootstrap_params)
    # find endpoint and error
    asymptote_threshold = .0005
    interpolation_xs = np.arange(1000)
    ys = np.array([v(interpolation_xs, *params) for params in bootstrap_params.values
                   if not np.isnan(params).any()])
    median_ys = np.median(ys, axis=0)
    diffs = np.diff(median_ys)
    end_x = np.where(diffs < asymptote_threshold)[0].min()  # first x where increase smaller than threshold
    # put together
    center = np.median(np.array(bootstrap_params)[:, 0])
    error = ci_error(ys[:, end_x], center=center)
    score = Score([center] + list(error),
                  coords={'aggregation': ['center', 'error_low', 'error_high']},
                  dims=['aggregation'])
    score.attrs['raw'] = ceilings
    score.attrs['bootstrapped_params'] = bootstrap_params
    score.attrs['endpoint_x'] = DataAssembly(end_x)
    return score
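# `self.fit` and `v` are defined elsewhere in the repository. A hedged sketch of the
# assumed pieces: `v` as a saturating function with parameters ['v0', 'tau0'] (matching
# the `param` coordinate above) and `fit` as a curve fit that raises RuntimeError when
# optimal parameters are not found. The exact functional form used upstream may differ.
import numpy as np
from scipy.optimize import curve_fit

def v(x, v0, tau0):
    # saturating growth toward asymptote v0, governed by rate constant tau0 (assumed form)
    return v0 * (1 - np.exp(-np.asarray(x, dtype=float) / tau0))

def fit(subject_subsamples, bootstrapped_scores):
    params, _ = curve_fit(v, subject_subsamples, bootstrapped_scores,
                          p0=[max(bootstrapped_scores), 1.0], maxfev=1000)
    return params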
def __call__(self, assembly1, assembly2):
    """
    :param brainio_base.assemblies.NeuroidAssembly assembly1:
    :param brainio_base.assemblies.NeuroidAssembly assembly2:
    :return: brainscore.metrics.Score
    """
    rdm1 = self._rdm(assembly1)
    rdm2 = self._rdm(assembly2)
    similarity = self._similarity(rdm1, rdm2)
    return Score(similarity)
def __call__(self, candidate):
    scores = xr.concat([Imagenet_C_Group(group, parent_category=self._category)(candidate)
                        for group in self._groups],
                       dim='presentation')
    assert len(set(scores['noise_type'].values)) == len(self._groups)
    center = np.mean(scores)
    error = np.std(scores)
    score = Score([center, error],
                  coords={'aggregation': ['center', 'error']},
                  dims=('aggregation',))
    score.attrs[Score.RAW_VALUES_KEY] = scores
    return score
def __init__(self, identifier_suffix, noise_type):
    identifier = f'dietterich.Hendrycks2019.{identifier_suffix}'
    stimulus_set = brainscore.get_stimulus_set(identifier)
    self._stimulus_set = stimulus_set
    self._similarity_metric = Accuracy()
    self._benchmark_name = identifier
    self._noise_type = noise_type
    ceiling = Score([1, np.nan],
                    coords={'aggregation': ['center', 'error']},
                    dims=['aggregation'])
    super(Imagenet_C_Individual, self).__init__(
        identifier=f"{identifier}-top1", version=1,
        ceiling_func=lambda: ceiling,
        parent=f'dietterich.Hendrycks2019-{noise_type}-top1',
        bibtex=BIBTEX)
def __init__(self, category):
    category_groups = {
        'noise': ['gaussian_noise', 'shot_noise', 'impulse_noise'],
        'blur': ['glass_blur', 'motion_blur', 'zoom_blur', 'defocus_blur'],
        'weather': ['snow', 'frost', 'fog', 'brightness'],
        'digital': ['pixelate', 'contrast', 'elastic_transform', 'jpeg_compression']}
    self._category = category
    self._groups = category_groups[category]
    ceiling = Score([1, np.nan],
                    coords={'aggregation': ['center', 'error']},
                    dims=['aggregation'])
    super(Imagenet_C_Category, self).__init__(
        identifier=f'dietterich.Hendrycks2019-{category}-top1', version=1,
        ceiling_func=lambda: ceiling,
        parent='dietterich.Hendrycks2019-top1',
        bibtex=BIBTEX)
def test_squeeze(self):
    score = Score([[1, 2]], coords={'s': [0], 'a': [1, 2]}, dims=['s', 'a'])
    score.attrs['raw'] = DataAssembly([[0, 2, 1, 3]],
                                      coords={'s': [0], 'a': [1, 1, 2, 2]},
                                      dims=['s', 'a'])
    sel_score = score.squeeze('s')
    np.testing.assert_array_equal(sel_score.raw.dims, ['a'])
def aggregate(cls, values):
    center = values.mean('split')
    error = standard_error_of_the_mean(values, 'split')
    return Score([center, error],
                 coords={**{'aggregation': ['center', 'error']},
                         **{coord: (dims, values) for coord, dims, values in walk_coords(center)}},
                 dims=('aggregation',) + center.dims)
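# Minimal sketch of what `aggregate` produces (assumes `standard_error_of_the_mean`
# amounts to std / sqrt(n) over the split dimension; the real helper is defined
# elsewhere in the repository, and `Score` is imported as in the other snippets).
import numpy as np
from brainscore.metrics import Score

split_scores = Score([.50, .55, .45], coords={'split': [0, 1, 2]}, dims=['split'])
center = split_scores.mean('split')
error = split_scores.std('split') / np.sqrt(len(split_scores['split']))  # SEM over splits
aggregated = Score([center, error],
                   coords={'aggregation': ['center', 'error']},
                   dims=('aggregation',) + center.dims)  # here center.dims == ()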
def __init__(self, stimulus_set, noise_type, noise_category):
    self.stimulus_set = stimulus_set[stimulus_set['noise_type'] == noise_type]
    self.noise_type = noise_type
    self.noise_category = noise_category
    ceiling = Score([1, np.nan],
                    coords={'aggregation': ['center', 'error']},
                    dims=['aggregation'])
    super(Imagenet_C_Type, self).__init__(
        identifier=f'dietterich.Hendrycks2019-{noise_category}-{noise_type}-top1', version=2,
        ceiling_func=lambda: ceiling,
        parent=f'dietterich.Hendrycks2019-{noise_category}-top1',
        bibtex=BIBTEX)
def __call__(self, candidate: BrainModel):
    self._metric = ScanMatchPy.initialize()
    self._logger.info("## Starting visual search task...")
    candidate.start_task(BrainModel.Task.visual_search, max_fix=self.max_fix,
                         data_len=self.data_len, ior_size=self.ior_size)
    self.cumm_perf, self.saccades = candidate.look_at(self._stimuli)
    # in saccades the last index denotes the index at which the target was found
    fix_model = self.saccades[:, :self.max_fix + 1, :]  # first n saccades
    I_fix_model = self.saccades[:, self.max_fix + 1, :1]  # index at which the target was found
    fix1 = matlab.int32(fix_model.tolist())
    I_fix1 = matlab.int32(I_fix_model.tolist())
    self._logger.info("## Search task done...\n")
    self._logger.info("## Calculating score...")
    scores = []
    for sub_id in tqdm(range(self.num_sub), desc="comparing with human data: "):
        data_human = self._assemblies.values[sub_id * self.data_len:(sub_id + 1) * self.data_len]
        fix_human = data_human[:, :self.max_fix + 1, :]
        I_fix_human = data_human[:, self.max_fix + 1, :1]
        fix2 = matlab.int32(fix_human.tolist())
        I_fix2 = matlab.int32(I_fix_human.tolist())
        score = self._metric.findScore(fix1, fix2, I_fix1, I_fix2)
        scores.append(score)
    scores = np.asarray(scores)
    self.raw_score = np.mean(scores)
    self.std = np.std(scores) / np.sqrt(scores.shape[0])
    self.model_score = Score([self.raw_score, self.std],
                             coords={'aggregation': ['center', 'error']},
                             dims=['aggregation'])
    self._metric.terminate()
    ceiled_score = ceil_score(self.model_score, self.ceiling)
    self._logger.info("## Score calculated...\n")
    return ceiled_score
def __init__(self):
    ceiling = Score([.79, np.nan],  # following private conversation with Kohitij Kar
                    coords={'aggregation': ['center', 'error']},
                    dims=['aggregation'])
    super(DicarloKar2019OST, self).__init__(identifier='dicarlo.Kar2019-ost', version=2,
                                            ceiling_func=lambda: ceiling,
                                            parent='IT-temporal',
                                            bibtex="""@Article{Kar2019,
  author={Kar, Kohitij and Kubilius, Jonas and Schmidt, Kailyn and Issa, Elias B. and DiCarlo, James J.},
  title={Evidence that recurrent circuits are critical to the ventral stream's execution of core object recognition behavior},
  journal={Nature Neuroscience},
  year={2019},
  month={Jun},
  day={01},
  volume={22},
  number={6},
  pages={974-983},
  abstract={Non-recurrent deep convolutional neural networks (CNNs) are currently the best at modeling core object recognition, a behavior that is supported by the densely recurrent primate ventral stream, culminating in the inferior temporal (IT) cortex. If recurrence is critical to this behavior, then primates should outperform feedforward-only deep CNNs for images that require additional recurrent processing beyond the feedforward IT response. Here we first used behavioral methods to discover hundreds of these `challenge' images. Second, using large-scale electrophysiology, we observed that behaviorally sufficient object identity solutions emerged {\textasciitilde}30{\thinspace}ms later in the IT cortex for challenge images compared with primate performance-matched `control' images. Third, these behaviorally critical late-phase IT response patterns were poorly predicted by feedforward deep CNN activations. Notably, very-deep CNNs and shallower recurrent CNNs better predicted these late IT responses, suggesting that there is a functional equivalence between additional nonlinear transformations and recurrence. Beyond arguing that recurrent circuits are critical for rapid object identification, our results provide strong constraints for future recurrent model development.},
  issn={1546-1726},
  doi={10.1038/s41593-019-0392-5},
  url={https://doi.org/10.1038/s41593-019-0392-5}
}""")
    assembly = brainscore.get_assembly('dicarlo.Kar2019')
    # drop duplicate images
    _, index = np.unique(assembly['image_id'], return_index=True)
    assembly = assembly.isel(presentation=index)
    assembly.attrs['stimulus_set'] = assembly.stimulus_set.drop_duplicates('image_id')
    assembly = assembly.sel(decoder='svm')
    self._assembly = assembly
    self._assembly['truth'] = self._assembly['image_label']
    self._assembly.stimulus_set['truth'] = self._assembly.stimulus_set['image_label']
    self._similarity_metric = OSTCorrelation()
    self._visual_degrees = VISUAL_DEGREES
    self._number_of_trials = 44