def fROI_correlation():
    assembly = load_voxels()
    stories = list(sorted(set(assembly['story'].values)))
    subjects = list(sorted(set(assembly['subject_UID'].values)))
    split_scores = []
    correlate = pearsonr_correlation(xarray_kwargs=dict(
        correlation_coord='stimulus_id', neuroid_coord='fROI_area'))
    cross_stories_subjects = list(itertools.product(stories, subjects))
    for story, heldout_subject in tqdm(cross_stories_subjects, desc='cross-{story,subject}'):
        story_assembly = assembly[{'presentation': [coord_story == story
                                                    for coord_story in assembly['story'].values]}]
        subject_pool = story_assembly[{'neuroid': [subject != heldout_subject
                                                   for subject in story_assembly['subject_UID'].values]}]
        subject_pool = average_subregions(subject_pool)
        heldout = story_assembly[{'neuroid': [subject == heldout_subject
                                              for subject in story_assembly['subject_UID'].values]}]
        heldout = average_subregions(heldout)
        split_score = correlate(subject_pool, heldout)
        split_score = type(split_score)(split_score.values, coords={
            coord: (dims, values) for coord, dims, values in walk_coords(split_score)
            if not coord.startswith('subject_') and coord != 'neuroid_id'},
            dims=split_score.dims)
        split_score = split_score.expand_dims('heldout_subject').expand_dims('story')
        split_score['heldout_subject'], split_score['story'] = [heldout_subject], [story]
        split_scores.append(split_score)
    correlation = Score.merge(*split_scores)
    correlation = apply_aggregate(lambda scores: scores.mean('neuroid').mean('story'), correlation)
    center = correlation.mean('heldout_subject')
    error = correlation.std('heldout_subject')
    score = Score([center, error],
                  coords={**{'aggregation': ['center', 'error']},
                          **{coord: (dims, values) for coord, dims, values in walk_coords(center)}},
                  dims=('aggregation',) + center.dims)
    score.attrs[Score.RAW_VALUES_KEY] = correlation.attrs[Score.RAW_VALUES_KEY]
    return score

def test_sel(self):
    score = Score([1, 2], coords={'a': [1, 2]}, dims=['a'])
    score.attrs['raw'] = DataAssembly([0, 2, 1, 3], coords={'a': [1, 1, 2, 2]}, dims=['a'])
    sel_score = score.sel(a=1)
    np.testing.assert_array_equal(sel_score.raw['a'], [1, 1])

def test_mean_apply_raw(self):
    score = Score([1, 2], coords={'a': [1, 2]}, dims=['a'])
    score.attrs['raw'] = DataAssembly([0, 2, 1, 3], coords={'a': [1, 1, 2, 2]}, dims=['a'])
    mean_score = score.mean('a', _apply_raw=True)
    assert mean_score.raw == 1.5

def test_mean(self):
    score = Score([1, 2], coords={'a': [1, 2]}, dims=['a'])
    score.attrs['raw'] = DataAssembly([0, 2, 1, 3], coords={'a': [1, 1, 2, 2]}, dims=['a'])
    mean_score = score.mean('a')
    np.testing.assert_array_equal(mean_score.raw['a'], [1, 1, 2, 2])

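# --- Illustrative sketch (not part of the test suite above) ---
# Summarizes the `raw` bookkeeping the three tests above exercise: selections are forwarded to the
# raw attribute, while aggregations leave raw untouched unless `_apply_raw=True` is passed.
# The import paths are assumptions and may differ across brain-score versions.
from brainscore.metrics import Score            # assumed import path
from brainscore.assemblies import DataAssembly  # assumed import path

score = Score([1, 2], coords={'a': [1, 2]}, dims=['a'])
score.attrs['raw'] = DataAssembly([0, 2, 1, 3], coords={'a': [1, 1, 2, 2]}, dims=['a'])
score.sel(a=1).raw                    # raw filtered to the two values with a == 1
score.mean('a').raw                   # raw kept as-is: [0, 2, 1, 3]
score.mean('a', _apply_raw=True).raw  # raw reduced along 'a' as well, here to 1.5
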
def __call__(self, assembly, *args, **kwargs):
    result = Score([assembly.values[0]], dims=['dim'])
    raw = result.copy()
    raw['dim_id'] = 'dim', [assembly.values[1]]
    raw['division_coord'] = 'dim', [assembly.values[2]]
    result.attrs['raw'] = raw
    return result

def __call__(self, candidate):
    self._logger.info('Computing activations')
    model_activations = read_words(candidate, self._target_assembly.attrs['stimulus_set'],
                                   reset_column='story_id',
                                   copy_columns=('stimulus_id', 'word_id', 'sentence_id'))
    assert set(model_activations['stimulus_id'].values) == set(self._target_assembly['stimulus_id'].values)
    self._logger.info('Scoring model')
    cross_subject_scores = self._cross_subject(
        self._target_assembly,
        apply=lambda cross_assembly: self._apply_within_subject(model_activations, cross_assembly))
    # normalize by ceiling
    # Note that we normalize by an overall ceiling, so the scores per subject are not normalized w.r.t. that subject
    # and should thus not be used by themselves. Only the aggregate makes sense to report.
    normalized_subject_scores = consistency(cross_subject_scores.sel(aggregation='center'),
                                            self.ceiling.sel(aggregation='center'))
    score = normalized_subject_scores.median('subject_id')
    std = normalized_subject_scores.std('subject_id')
    std['aggregation'] = 'error'
    # the MultiIndex tends to mess things up, so we get rid of it here
    score, std = xr.DataArray(score).expand_dims('aggregation'), xr.DataArray(std).expand_dims('aggregation')
    score = Score(Score.merge(score, std))
    score.attrs['raw'] = cross_subject_scores
    score.attrs['ceiling'] = self.ceiling
    return score

def __call__(self, model: TaskModel):
    model.mode = TaskModel.Modes.tokens_to_features
    set_seed(self.seed)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.debug(f"Using block size {self.block_size} for {model.identifier}")
    # Data
    vocab_size = min(model.vocab_size, 250000)
    train_tokens = TextDataset(model_identifier=model.identifier, model=model, block_size=self.block_size,
                               vocab_size=vocab_size, file_path=self.train_data_file)
    val_tokens = TextDataset(model_identifier=model.identifier, model=model, block_size=self.block_size,
                             vocab_size=vocab_size, file_path=self.val_data_file)
    test_tokens = TextDataset(model_identifier=model.identifier, model=model, block_size=self.block_size,
                              vocab_size=vocab_size, file_path=self.eval_data_file)
    # Decoder
    logger.info(f"Vocab size: {vocab_size}")
    features_sample, _ = train_tokens[0]
    lm_head = LMHeadModel(features_size=features_sample.shape[-1], vocab_size=vocab_size,
                          embedding_weights=model.get_embedding_weights() if self.tied else None)
    lm_head = lm_head.to(device)
    # Train
    train(model=lm_head, train_dataset=train_tokens, val_dataset=val_tokens,
          device=device, seed=self.seed, **self.kwargs)
    # Evaluation
    test_result = evaluate(model=lm_head, eval_dataset=test_tokens, device=device)
    score = Score([test_result[key] for key in ['perplexity', 'loss']],
                  coords={'measure': ['test_perplexity', 'test_loss']}, dims=['measure'])
    score.attrs['datasets'] = {'train': self.train_data_file,
                               'val': self.val_data_file,
                               'test': self.eval_data_file}
    score.attrs['benchmark_identifier'] = self.identifier
    score.attrs['model_identifier'] = model.identifier
    return score

def extrapolate_neuroid(self, ceilings):
    # figure out how many extrapolation x points we have. E.g. for Pereira, not all combinations are possible
    subject_subsamples = list(sorted(set(ceilings['num_subjects'].values)))
    rng = RandomState(0)
    bootstrap_params = []
    for bootstrap in range(self.num_bootstraps):
        bootstrapped_scores = []
        for num_subjects in subject_subsamples:
            num_scores = ceilings.sel(num_subjects=num_subjects)
            # the sub_subjects dimension creates nans, get rid of those
            num_scores = num_scores.dropna(f'sub_{self.subject_column}')
            assert set(num_scores.dims) == {f'sub_{self.subject_column}', 'split'} or \
                   set(num_scores.dims) == {f'sub_{self.subject_column}'}
            # choose from subject subsets and the splits therein, with replacement for variance
            choices = num_scores.values.flatten()
            bootstrapped_score = rng.choice(choices, size=len(choices), replace=True)
            bootstrapped_scores.append(np.mean(bootstrapped_score))
        try:
            params = self.fit(subject_subsamples, bootstrapped_scores)
        except RuntimeError:  # optimal parameters not found
            params = [np.nan, np.nan]
        params = DataAssembly([params], coords={'bootstrap': [bootstrap], 'param': ['v0', 'tau0']},
                              dims=['bootstrap', 'param'])
        bootstrap_params.append(params)
    bootstrap_params = merge_data_arrays(bootstrap_params)
    # find endpoint and error
    asymptote_threshold = .0005
    interpolation_xs = np.arange(1000)
    ys = np.array([v(interpolation_xs, *params) for params in bootstrap_params.values
                   if not np.isnan(params).any()])
    median_ys = np.median(ys, axis=0)
    diffs = np.diff(median_ys)
    end_x = np.where(diffs < asymptote_threshold)[0].min()  # first x where increase smaller than threshold
    # put together
    center = np.median(np.array(bootstrap_params)[:, 0])
    error = ci_error(ys[:, end_x], center=center)
    score = Score([center] + list(error),
                  coords={'aggregation': ['center', 'error_low', 'error_high']}, dims=['aggregation'])
    score.attrs['raw'] = ceilings
    score.attrs['bootstrapped_params'] = bootstrap_params
    score.attrs['endpoint_x'] = DataAssembly(end_x)
    return score

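# The saturating function `v` used for the extrapolation above is defined outside this excerpt.
# For reference, it is typically the exponential-saturation form sketched below (treat this as an
# assumption, since the actual definition is not shown here); v0 is the asymptotic ceiling and
# tau0 the growth constant, matching the 'v0'/'tau0' params fitted above. Assumes numpy as np.
def v(x, v0, tau0):
    return v0 * (1 - np.exp(-x / tau0))
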
def aggregate_neuroid_scores(neuroid_scores, subject_column):
    subject_scores = neuroid_scores.groupby(subject_column).median()
    center = subject_scores.median(subject_column)
    subject_values = np.nan_to_num(subject_scores.values, nan=0)  # mad cannot deal with all-nan in one axis, treat as 0
    subject_axis = subject_scores.dims.index(subject_scores[subject_column].dims[0])
    error = median_absolute_deviation(subject_values, axis=subject_axis)
    score = Score([center, error], coords={'aggregation': ['center', 'error']}, dims=['aggregation'])
    score.attrs['raw'] = neuroid_scores
    score.attrs['description'] = "score aggregated by taking median of neuroids per subject, " \
                                 "then median of subject scores"
    return score

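# Usage sketch for aggregate_neuroid_scores (illustrative only; the assembly, its coordinate names,
# and the import path below are assumptions, not taken from the original code):
from brainscore.assemblies import DataAssembly  # assumed import path

neuroid_scores = DataAssembly([.1, .3, .2, .4],
                              coords={'neuroid_id': ('neuroid', [1, 2, 3, 4]),
                                      'subject': ('neuroid', ['A', 'A', 'B', 'B'])},
                              dims=['neuroid'])
aggregated = aggregate_neuroid_scores(neuroid_scores, subject_column='subject')
aggregated.sel(aggregation='center')  # median over per-subject medians: median(.2, .3) = .25
aggregated.sel(aggregation='error')   # median absolute deviation across the per-subject medians
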
def test_squeeze(self):
    score = Score([[1, 2]], coords={'s': [0], 'a': [1, 2]}, dims=['s', 'a'])
    score.attrs['raw'] = DataAssembly([[0, 2, 1, 3]], coords={'s': [0], 'a': [1, 1, 2, 2]}, dims=['s', 'a'])
    sel_score = score.squeeze('s')
    np.testing.assert_array_equal(sel_score.raw.dims, ['a'])

def _call(self, model_identifier, benchmark_identifier,  # storage fields
          model, benchmark, layers, prerun=False):
    if prerun:
        # pre-run activations together to avoid running every layer separately
        model(layers=layers, stimuli=benchmark._assembly.stimulus_set)
    layer_scores = []
    for layer in tqdm(layers, desc="layers"):
        layer_model = LayerMappedModel(identifier=f"{model_identifier}-{layer}",  # per-layer identifier to avoid overlap
                                       activations_model=model,
                                       region_layer_map={benchmark.region: layer})
        layer_model = TemporalIgnore(layer_model)
        score = benchmark(layer_model)
        score = score.expand_dims('layer')
        score['layer'] = [layer]
        layer_scores.append(score)
    layer_scores = Score.merge(*layer_scores)
    layer_scores = layer_scores.sel(layer=layers)  # preserve layer ordering
    return layer_scores

def _call(self, model_identifier, benchmark_identifier,  # storage fields
          model, benchmark_builder, layer_and_params):
    all_scores = []
    all_layer_param_str = []
    for layer, param_str in tqdm(layer_and_params, desc="layers"):
        bench_args, bench_kwargs = json.loads(param_str)
        benchmark = benchmark_builder(*bench_args, **bench_kwargs)
        layer_model = self.build_layer_model(
            identifier=f"{model_identifier}-{layer}-layer-param",
            model=model, benchmark=benchmark, layer=layer)
        score = benchmark(layer_model)
        score = score.expand_dims('layer_param')
        layer_param_str = '%s-%s' % (layer, param_str)
        score['layer_param'] = [layer_param_str]
        all_scores.append(score)
        all_layer_param_str.append(layer_param_str)
    all_scores = Score.merge(*all_scores)
    all_scores = all_scores.sel(layer_param=all_layer_param_str)
    return all_scores

def score(benchmark, model, layers=None, model_impl=None, subsample=None):
    model_impl = model_impl or model_pool[model]
    if subsample:
        SubsamplingHook.hook(model, subsample)
    layers = layers or model_layers[model]

    _logger.info('Loading benchmark')
    benchmark_impl = benchmark_pool[benchmark]

    _logger.info('Running')
    # shortcut for performance benchmarks
    if any(benchmark.startswith(performance_prefix) for performance_prefix in ['wikitext', 'glue']):
        return benchmark_impl(model_impl)

    # only last layer for behavioral benchmarks
    if benchmark.startswith('Futrell2018'):
        layers = layers[-1:]

    layer_scores = []
    for i, layer in enumerate(tqdm(layers, desc='layers')):
        if any(benchmark.startswith(performance_prefix) for performance_prefix in ['wikitext', 'glue']):
            candidate = StripLayersAfter(model_impl, layer=layer)
        else:  # prerun everything for 1st layer
            candidate = FixedLayer(model_impl, layer, prerun=layers if i == 0 else None)
        layer_score = benchmark_impl(candidate)
        layer_score = layer_score.expand_dims('layer')
        layer_score['layer'] = [layer]
        layer_scores.append(layer_score)
    layer_scores = Score.merge(*layer_scores)
    layer_scores = layer_scores.sel(layer=layers)  # preserve layer ordering
    layer_scores.attrs['model'] = model
    layer_scores.attrs['benchmark'] = benchmark
    return layer_scores

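# The per-layer loop above uses a storage pattern that recurs throughout this file: tag each
# partial Score with a new dimension, merge the pieces, then re-select to restore ordering.
# Minimal sketch with made-up values; the import path is an assumption.
from brainscore.metrics import Score  # assumed import path

partial_scores = []
for layer_name, value in [('layer1', .1), ('layer2', .2)]:
    partial = Score([value], coords={'layer': [layer_name]}, dims=['layer'])
    partial_scores.append(partial)
merged = Score.merge(*partial_scores)            # a single Score with a 'layer' dimension
merged = merged.sel(layer=['layer1', 'layer2'])  # merge may reorder coords; re-select to preserve ordering
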
def _repeat(self, func):
    random_state = self._initialize_random_state()
    repetitions = list(range(self._repetitions))
    scores = [func(random_state=random_state) for repetition in repetitions]
    score = Score(scores, coords={'split': repetitions}, dims=['split'])
    self._save_matrix()
    return apply_aggregate(self.aggregate, score)

def _call(self, model_identifier, benchmark_identifier, visual_degrees,  # storage fields
          model, benchmark, layers, prerun=False):
    layer_scores = []
    for i, layer in enumerate(tqdm(layers, desc="layers")):
        layer_model = LayerMappedModel(identifier=f"{model_identifier}-{layer}",  # per-layer identifier to avoid overlap
                                       visual_degrees=visual_degrees,
                                       activations_model=model,
                                       region_layer_map={benchmark.region: layer})
        layer_model = TemporalIgnore(layer_model)
        if i == 0 and prerun:
            # pre-run activations together to avoid running every layer separately
            # we can only pre-run stimuli in response to the benchmark, since we might otherwise be missing
            # visual_degrees resizing.
            layer_model = PreRunLayers(model=model, layers=layers, forward=layer_model)
        score = benchmark(layer_model)
        score = score.expand_dims('layer')
        score['layer'] = [layer]
        layer_scores.append(score)
    layer_scores = Score.merge(*layer_scores)
    layer_scores = layer_scores.sel(layer=layers)  # preserve layer ordering
    return layer_scores

def _apply_cross(self, source_assembly, cross_assembly):
    # some subjects have only done one experiment which leads to nans
    cross_assembly = cross_assembly.dropna('neuroid')
    if len(cross_assembly['neuroid']) == 0:
        return Score([np.nan, np.nan], coords={'aggregation': ['center', 'error']}, dims=['aggregation'])
    return super(_PereiraSubjectWise, self)._apply_cross(
        source_assembly=source_assembly, cross_assembly=cross_assembly)

def __call__(self, candidate: BrainModel):
    candidate.start_recording('IT', time_bins=self._time_bins)
    stimulus_set = place_on_screen(self._assembly.stimulus_set,
                                   target_visual_degrees=candidate.visual_degrees(),
                                   source_visual_degrees=self._visual_degrees)
    # Temporal recordings from large candidates take up a lot of memory and compute time.
    # To quickly reject recordings that are static over time, we show one image and check
    # whether the recordings vary over time at all. If they do not, we can immediately score
    # the candidate with a failure state, since it will not be able to predict temporal
    # differences with the OST metric.
    check_stimulus_set = stimulus_set[:1]
    check_stimulus_set.identifier = None  # unset identifier to avoid storing (interferes with actual stimulus_set)
    check_recordings = candidate.look_at(check_stimulus_set, number_of_trials=self._number_of_trials)
    if not temporally_varying(check_recordings):
        score = Score([np.nan, np.nan], coords={'aggregation': ['center', 'error']}, dims=['aggregation'])
    else:
        recordings = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials)
        score = self._similarity_metric(recordings, self._assembly)
    score = ceil_score(score, self.ceiling)
    return score

def collect(self, identifier, assembly, metric):
    """
    Instead of iterating over subject combinations and then afterwards over holdout subjects,
    we here iterate over holdout subjects and then over electrode sub-combinations of the remaining pool.
    """
    subjects = set(assembly[self.subject_column].values)
    scores = []
    for holdout_subject in tqdm(subjects, desc='subjects'):
        subject_pool = subjects - {holdout_subject}
        subject_pool_assembly = assembly[{'neuroid': [subject in subject_pool
                                                      for subject in assembly[self.subject_column].values]}]
        holdout_subject_assembly = assembly[{'neuroid': [subject == holdout_subject
                                                         for subject in assembly[self.subject_column].values]}]
        electrodes = subject_pool_assembly['neuroid_id'].values
        electrodes_range = np.arange(5, len(electrodes), 5)
        for num_electrodes in tqdm(electrodes_range, desc='num electrodes'):
            electrodes_combinations = self._choose_electrodes(electrodes, num_electrodes,
                                                              num_choices=self._num_samples)
            for electrodes_split, electrodes_selection in enumerate(electrodes_combinations):
                electrodes_assembly = subject_pool_assembly[{'neuroid': [
                    neuroid_id in electrodes_selection
                    for neuroid_id in subject_pool_assembly['neuroid_id'].values]}]
                score = metric(electrodes_assembly, holdout_subject_assembly)
                # store scores
                score = score.expand_dims(f"sub_{self.subject_column}")
                score[f"sub_{self.subject_column}"] = [holdout_subject]
                score = score.expand_dims('num_electrodes').expand_dims('electrodes_split')
                score['num_electrodes'] = [num_electrodes]
                score['electrodes_split'] = [electrodes_split]
                scores.append(score)
    scores = Score.merge(*scores)
    ceilings = scores.raw
    ceilings = ceilings.rename({'split': 'subsplit'}).stack(split=['electrodes_split', 'subsplit'])
    ceilings.attrs['raw'] = scores
    return ceilings

def pipe(self, assembly):
    """
    :param brainscore.assemblies.NeuroidAssembly assembly:
    :return: brainscore.assemblies.DataAssembly
    """
    dividers = self.dividers(assembly, dividing_coords=self._dividers)
    scores = []
    progress = tqdm(enumerate_done(dividers), total=len(dividers), desc='cartesian product')
    for i, divider, done in progress:
        progress.set_description(str(divider))
        divided_assembly = assembly.multisel(**divider)
        # squeeze dimensions if necessary
        for divider_coord in divider:
            dims = assembly[divider_coord].dims
            assert len(dims) == 1
            if dims[0] in divided_assembly.dims and len(divided_assembly[dims[0]]) == 1:
                divided_assembly = divided_assembly.squeeze(dims[0])
        result = yield from self._get_result(divided_assembly, done=done)
        for coord_name, coord_value in divider.items():
            result = result.expand_dims(coord_name)
            result[coord_name] = [coord_value]
        scores.append(result)
    scores = Score.merge(*scores)
    yield scores

def __init__(self):
    ceiling = Score([1, np.nan], coords={'aggregation': ['center', 'error']}, dims=['aggregation'])
    assembly_repetition = get_assembly()
    assert len(np.unique(assembly_repetition['region'])) == 1
    assert hasattr(assembly_repetition, 'repetition')
    self.region = 'IT'
    self.assembly = average_repetition(assembly_repetition)
    self._assembly = self.assembly
    self.timebins = timebins_from_assembly(self.assembly)

    self._similarity_metric = CrossRegressedCorrelation(
        regression=pls_regression(), correlation=pearsonr_correlation(),
        crossvalidation_kwargs=dict(stratification_coord=Split.Defaults.stratification_coord
                                    if hasattr(self.assembly, Split.Defaults.stratification_coord) else None))
    identifier = f'{assembly_repetition.name}-layer_selection'
    ceiler = InternalConsistency()
    super(_MockBenchmark, self).__init__(identifier=identifier,
                                         ceiling_func=lambda: ceiler(assembly_repetition),
                                         version='1.0')

def _call(self, model_identifier, layer, benchmark_identifier,  # storage fields
          model, benchmark_builder, params):
    param_scores = []
    for param_str in tqdm(params, desc="params"):
        bench_args, bench_kwargs = json.loads(param_str)
        benchmark = benchmark_builder(*bench_args, **bench_kwargs)
        layer_model = self.build_layer_model(
            identifier=f"{model_identifier}-{layer}",
            model=model, benchmark=benchmark, layer=layer)
        score = benchmark(layer_model)
        score = score.expand_dims('param')
        score['param'] = [param_str]
        param_scores.append(score)
    param_scores = Score.merge(*param_scores)
    param_scores = param_scores.sel(param=params)  # preserve param ordering
    return param_scores

def __call__(self, prediction, target):
    # align
    prediction = prediction.sortby([self._correlation_coord, self._neuroid_coord])
    target = target.sortby([self._correlation_coord, self._neuroid_coord])
    assert np.array(prediction[self._correlation_coord].values == target[self._correlation_coord].values).all()
    assert np.array(prediction[self._neuroid_coord].values == target[self._neuroid_coord].values).all()
    # compute correlation per neuroid
    neuroid_dims = target[self._neuroid_coord].dims
    assert len(neuroid_dims) == 1
    correlations = []
    for i, coord_value in enumerate(target[self._neuroid_coord].values):
        target_neuroids = target.isel(**{neuroid_dims[0]: i})  # `isel` is about 10x faster than `sel`
        prediction_neuroids = prediction.isel(**{neuroid_dims[0]: i})
        r, p = self._correlation(target_neuroids, prediction_neuroids)
        correlations.append(r)
    # package
    result = Score(correlations,
                   coords={coord: (dims, values) for coord, dims, values in walk_coords(target)
                           if dims == neuroid_dims},
                   dims=neuroid_dims)
    return result

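# Sketch of how this per-neuroid correlation is typically invoked (cf. fROI_correlation at the top
# of this file). The assemblies and import paths below are assumptions for illustration only.
from brainscore.assemblies import NeuroidAssembly                # assumed import path
from brainscore.metrics.regression import pearsonr_correlation   # assumed import path

prediction = NeuroidAssembly([[1., 2.], [2., 3.], [3., 5.]],
                             coords={'stimulus_id': ('presentation', ['a', 'b', 'c']),
                                     'neuroid_id': ('neuroid', [1, 2])},
                             dims=['presentation', 'neuroid'])
target = prediction.copy()
correlate = pearsonr_correlation(xarray_kwargs=dict(correlation_coord='stimulus_id',
                                                    neuroid_coord='neuroid_id'))
per_neuroid = correlate(prediction, target)  # one Pearson r per neuroid, with neuroid coords attached
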
def __init__(self, noise_type, parent_category):
    self._noise_type = noise_type
    ceiling = Score([1, np.nan], coords={'aggregation': ['center', 'error']}, dims=['aggregation'])
    super(Imagenet_C_Group, self).__init__(identifier=f'dietterich.Hendrycks2019-{noise_type}-top1',
                                           version=1,
                                           ceiling_func=lambda: ceiling,
                                           parent=f'dietterich.Hendrycks2019-{parent_category}-top1',
                                           bibtex=BIBTEX)

def __init__(self):
    ceiling = Score([.79, np.nan],  # following private conversation with Kohitij Kar
                    coords={'aggregation': ['center', 'error']}, dims=['aggregation'])
    super(DicarloKar2019OST, self).__init__(identifier='dicarlo.Kar2019-ost', version=2,
                                            ceiling_func=lambda: ceiling,
                                            parent='IT-temporal',
                                            paper_link='https://www.nature.com/articles/s41593-019-0392-5')
    assembly = brainscore.get_assembly('dicarlo.Kar2019')
    # drop duplicate images
    _, index = np.unique(assembly['image_id'], return_index=True)
    assembly = assembly.isel(presentation=index)
    assembly.attrs['stimulus_set'] = assembly.stimulus_set.drop_duplicates('image_id')
    assembly = assembly.sel(decoder='svm')
    self._assembly = assembly
    self._assembly['truth'] = self._assembly['image_label']
    self._assembly.stimulus_set['truth'] = self._assembly.stimulus_set['image_label']

    self._similarity_metric = OSTCorrelation()
    self._visual_degrees = VISUAL_DEGREES
    self._number_of_trials = 44

def collect(self, identifier, assembly, metric):
    subjects = set(assembly[self.subject_column].values)
    subject_subsamples = self.build_subject_subsamples(subjects)
    scores = []
    for num_subjects in tqdm(subject_subsamples, desc='num subjects'):
        selection_combinations = self.iterate_subsets(assembly, num_subjects=num_subjects)
        for selections, sub_assembly in tqdm(selection_combinations, desc='selections'):
            try:
                score = self.holdout_ceiling(assembly=sub_assembly, metric=metric)
                score = score.expand_dims('num_subjects')
                score['num_subjects'] = [num_subjects]
                for key, selection in selections.items():
                    expand_dim = f'sub_{key}'
                    score = score.expand_dims(expand_dim)
                    score[expand_dim] = [str(selection)]
                scores.append(score.raw)
            except KeyError as e:  # nothing to merge
                if str(e) == "'z'":
                    self._logger.debug(f"Ignoring merge error {e}")
                    continue
                else:
                    raise e
    scores = Score.merge(*scores)
    scores = self.post_process(scores)
    return scores

def decode_voxels():
    assembly = load_voxels()
    cross_validation = CrossValidationSingle(splits=1, split_coord='stimulus_id', stratification_coord='story')
    subjects = list(sorted(set(assembly['subject_UID'].values)))
    scores = []
    for subject in subjects:
        subject_index = [subject == coord_subject for coord_subject in assembly['subject_UID'].values]
        subject_assembly = assembly[{'neuroid': subject_index}]
        subject_score = cross_validation(subject_assembly, apply=fit_decode)
        subject_score = subject_score.sel(aggregation='center')  # since we're only doing one split
        print(f"subject {subject}: "
              f"{subject_score.sel(accuracy_aggregation='center', train_test='train').values:.2f} train, "
              f"{subject_score.sel(accuracy_aggregation='center', train_test='test').values:.2f} test")
        subject_score = subject_score.expand_dims('subject')
        subject_score['subject'] = [subject]
        scores.append(subject_score)
    scores = Score.merge(*scores)
    return scores

def __init__(self):
    stimulus_set = pd.read_csv(os.path.join(os.path.dirname(__file__), 'imagenet2012.csv'))
    stimulus_set = StimulusSet(stimulus_set)
    stimulus_set.image_paths = {row.image_id: row.filepath for row in stimulus_set.itertuples()}
    self._stimulus_set = stimulus_set
    self._similarity_metric = Accuracy()
    ceiling = Score([1, np.nan], coords={'aggregation': ['center', 'error']}, dims=['aggregation'])
    super(Imagenet2012, self).__init__(identifier='fei-fei.Deng2009-top1', version=1,
                                       ceiling_func=lambda: ceiling,
                                       parent='ImageNet',
                                       bibtex="""@INPROCEEDINGS{5206848,
  author={J. {Deng} and W. {Dong} and R. {Socher} and L. {Li} and {Kai Li} and {Li Fei-Fei}},
  booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition},
  title={ImageNet: A large-scale hierarchical image database},
  year={2009},
  volume={},
  number={},
  pages={248-255},
  url = {https://ieeexplore.ieee.org/document/5206848}
}""")

def run_evaluation(return_score=False):
    scores = []
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    for eval_task in eval_task_names:
        examples, label_list, output_mode = get_examples(data_dir=data_dir, task=eval_task, evaluate=True)
        eval_dataset = model.glue_dataset(task=eval_task, examples=examples, label_list=label_list,
                                          output_mode=output_mode, max_seq_length=max_seq_length)
        result = evaluate(features_model=model, decoder_head=decoder_head, eval_dataset=eval_dataset,
                          task_name=eval_task, output_mode=output_mode, device=device)
        if not return_score:
            return result  # we're ignoring mnli-mm here, but this return is just for progress logging anyway
        score = Score([[value for key, value in result.items()]],
                      coords={'eval_task': [eval_task], 'measure': list(result.keys())},
                      dims=['eval_task', 'measure'])
        score.attrs['data_dir'] = data_dir
        score.attrs['benchmark_identifier'] = f"glue-{self.task_name}"
        score.attrs['eval_task'] = eval_task
        score.attrs['model_identifier'] = model.identifier
        scores.append(score)
    scores = Score.merge(*scores)
    return scores

def __call__(self, assembly):
    ceilings = []
    for time_bin in tqdm(assembly['time_bin'].values, desc='time-ceiling'):
        ceiling = self.ceiling(assembly.sel(time_bin=time_bin))
        ceiling = ceiling.expand_dims('time_bin')
        ceiling['time_bin'] = [str(time_bin)]
        ceilings.append(ceiling)
    ceiling = Score.merge(*ceilings)
    return ceiling

def correlate(self, predicted_osts, target_osts):
    non_nan = np.logical_and(~np.isnan(predicted_osts), ~np.isnan(target_osts))
    predicted_osts, target_osts = predicted_osts[non_nan], target_osts[non_nan]
    # use Spearman over Pearson since it tests whether the rank orders are the same,
    # which allows for nonlinear correlates whereas Pearson assumes linearity.
    correlation, p = spearmanr(predicted_osts, target_osts)
    return Score(correlation)