def predict_proba(self, X):
    import tensorflow as tf
    assert len(X.shape) == 2, "expected 2-dimensional input"
    if self._zscore_feats:
        scaled_X = self._scaler.transform(X)
    else:
        scaled_X = X
    with self._graph.as_default():
        preds = []
        for batch in self._iterate_minibatches(scaled_X, batchsize=self._eval_batch_size, shuffle=False):
            feed_dict = {self._input_placeholder: batch, self._fc_keep_prob: 1.0}
            softmax = self._sess.run([tf.nn.softmax(self._predictions)], feed_dict=feed_dict)
            preds.append(np.squeeze(softmax))
        proba = np.concatenate(preds, axis=0)
    # we take only the 0th dimension because the 1st dimension is just the features
    X_coords = {coord: (dims, value) for coord, dims, value in walk_coords(X)
                if array_is_element(dims, X.dims[0])}
    proba = BehavioralAssembly(proba,
                               coords={**X_coords, **{'choice': list(self._label_mapping.values())}},
                               dims=[X.dims[0], 'choice'])
    return proba
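# Usage sketch for predict_proba (hedged: `classifier` is assumed to be a trained
# instance of this TensorFlow-backed class, and the BehavioralAssembly import path
# may differ by brain-score version; the feature matrix and coordinate values below
# are made up for illustration).
import numpy as np
from brainio.assemblies import BehavioralAssembly  # import path is an assumption

features = BehavioralAssembly(
    np.random.rand(20, 512),  # 20 stimuli x 512 features
    coords={'stimulus_id': ('presentation', [f'stim{i}' for i in range(20)])},
    dims=['presentation', 'neuroid'])
proba = classifier.predict_proba(features)  # -> assembly with dims ['presentation', 'choice']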
def average_subregions(self, bold_shift, assembly):
    attrs = assembly.attrs
    del assembly['threshold']
    # group by stimuli, fROI, subject after one another.
    # this gets rid of adjacent coords unfortunately, but we accept that for now.
    averaged_assembly = assembly.groupby('stimulus_id').apply(
        lambda stimulus_group: stimulus_group.groupby('fROI_area').apply(
            lambda fROI_group: fROI_group.groupby('subject_UID').mean()))
    averaged_assembly = averaged_assembly.stack(presentation=['stimulus_id'],
                                                neuroid=['fROI_area', 'subject_UID'])
    # copy presentation coords back since those are needed for e.g. metric stratification
    order = [averaged_assembly['stimulus_id'].values.tolist().index(stimulus_id)
             for stimulus_id in assembly['stimulus_id'].values]
    for copy_coord, dims, copy_value in walk_coords(assembly):
        if not array_is_element(dims, 'presentation') or hasattr(averaged_assembly, copy_coord):
            continue
        averaged_assembly[copy_coord] = dims, copy_value[order]
    averaged_assembly.attrs = attrs
    averaged_assembly['neuroid_id'] = 'neuroid', [
        ".".join([str(value) for value in values]) for values in zip(*[
            averaged_assembly[coord].values for coord in ['subject_UID', 'fROI_area']])]
    return averaged_assembly
def _package_prediction(self, predicted_values, source):
    coords = {coord: (dims, values) for coord, dims, values in walk_coords(source)
              if not array_is_element(dims, self._neuroid_dim)}
    # re-package neuroid coords
    dims = source.dims
    # if there is only one neuroid coordinate, it would get discarded and the dimension would be
    # used as coordinate. to avoid this, we build the assembly first and then stack on the
    # neuroid dimension.
    neuroid_level_dim = None
    if len(self._target_neuroid_values) == 1:  # extract single key: https://stackoverflow.com/a/20145927/2225200
        (neuroid_level_dim, _), = self._target_neuroid_values.items()
        dims = [dim if dim != self._neuroid_dim else neuroid_level_dim for dim in dims]
    for target_coord, target_value in self._target_neuroid_values.items():
        # this might overwrite values which is okay
        coords[target_coord] = (neuroid_level_dim or self._neuroid_dim), target_value
    prediction = NeuroidAssembly(predicted_values, coords=coords, dims=dims)
    if neuroid_level_dim:
        prediction = prediction.stack(**{self._neuroid_dim: [neuroid_level_dim]})
    return prediction
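# Minimal plain-xarray sketch of the single-coordinate stacking trick used above
# (toy data, illustrative names): building the array with the lone neuroid-level
# coordinate as its own dimension and then stacking keeps the coordinate, instead
# of xarray discarding it and reusing the dimension name as coordinate.
import numpy as np
import xarray as xr

prediction = xr.DataArray(np.zeros((2, 3)),
                          coords={'stimulus_id': ('presentation', ['a', 'b']),
                                  'subject': ('subject', ['s1', 's2', 's3'])},
                          dims=['presentation', 'subject'])
prediction = prediction.stack(neuroid=['subject'])  # 'subject' survives as a neuroid level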
def avg_repr(assembly):
    presentation_coords = [coord for coord, dims, values in walk_coords(assembly)
                           if array_is_element(dims, 'presentation') and coord != 'repetition']
    assembly = assembly.multi_groupby(presentation_coords).mean(dim='presentation', skipna=True)
    return assembly
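# Usage sketch for avg_repr (hedged: toy assembly; the NeuroidAssembly import path
# is an assumption and may differ by brain-score version). Grouping over all
# presentation coords except 'repetition' collapses repetitions of each stimulus:
import numpy as np
from brainio.assemblies import NeuroidAssembly

assembly = NeuroidAssembly(
    np.random.rand(4, 3),
    coords={'stimulus_id': ('presentation', ['a', 'a', 'b', 'b']),
            'repetition': ('presentation', [0, 1, 0, 1]),
            'neuroid_id': ('neuroid', [0, 1, 2])},
    dims=['presentation', 'neuroid'])
averaged = avg_repr(assembly)  # -> 2 presentations ('a', 'b') x 3 neuroids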
def fit(self, source, target):
    source, target = self._align(source), self._align(target)
    source, target = source.sortby(self._stimulus_coord), target.sortby(self._stimulus_coord)
    self._regression.fit(source, target)
    self._target_neuroid_values = {}
    for name, dims, values in walk_coords(target):
        if self._neuroid_dim in dims:
            assert array_is_element(dims, self._neuroid_dim)
            self._target_neuroid_values[name] = values
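# Usage sketch for fit (hedged: `regression` is assumed to be an instance of this
# wrapper class around e.g. a scikit-learn estimator, and `source_assembly` /
# `target_assembly` are assemblies sharing the configured stimulus coordinate):
#   regression.fit(source_assembly, target_assembly)  # aligns, sorts by stimulus, fits
# afterwards, _target_neuroid_values holds the target's neuroid coords so that the
# prediction step can re-attach them to predicted values (see _package_prediction above)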
def manual_merge(*elements, on='neuroid'):
    dims = elements[0].dims
    assert all(element.dims == dims for element in elements[1:])
    merge_index = dims.index(on)
    # the coordinates in the merge index should have the same keys
    assert _coords_match(elements, dim=on, match_values=False), \
        f"coords in {[element[on] for element in elements]} do not match"
    # all other dimensions, their coordinates and values should already align
    for dim in set(dims) - {on}:
        assert _coords_match(elements, dim=dim, match_values=True), \
            f"coords in {[element[dim] for element in elements]} do not match"
    # merge values without meta
    merged_values = np.concatenate([element.values for element in elements], axis=merge_index)
    # piece together with meta
    result = type(elements[0])(
        merged_values,
        coords={
            **{coord: (dims, values) for coord, dims, values in walk_coords(elements[0])
               if not array_is_element(dims, on)},
            **{coord: (dims, np.concatenate([element[coord].values for element in elements]))
               for coord, dims, _ in walk_coords(elements[0]) if array_is_element(dims, on)}},
        dims=elements[0].dims)
    return result
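# Usage sketch for manual_merge (hedged: toy assemblies; the import path is an
# assumption and may differ by brain-score version). Two assemblies with identical
# presentation coords are concatenated along `neuroid`, keeping both sets of
# neuroid meta:
import numpy as np
from brainio.assemblies import NeuroidAssembly

a = NeuroidAssembly(np.zeros((2, 3)),
                    coords={'stimulus_id': ('presentation', ['s1', 's2']),
                            'neuroid_id': ('neuroid', [0, 1, 2])},
                    dims=['presentation', 'neuroid'])
b = NeuroidAssembly(np.ones((2, 4)),
                    coords={'stimulus_id': ('presentation', ['s1', 's2']),
                            'neuroid_id': ('neuroid', [3, 4, 5, 6])},
                    dims=['presentation', 'neuroid'])
merged = manual_merge(a, b, on='neuroid')  # -> 2 presentations x 7 neuroids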
def average_repetition(self, assembly):
    attrs = assembly.attrs  # workaround to keeping attrs
    presentation_coords = [coord for coord, dims, values in walk_coords(assembly)
                           if array_is_element(dims, 'presentation')]
    presentation_coords = set(presentation_coords) - {'repetition_id', 'id'}
    assembly = assembly.multi_groupby(presentation_coords).mean(dim='presentation', skipna=True)
    assembly, stimulus_set = self.dropna(assembly, stimulus_set=attrs['stimulus_set'])
    attrs['stimulus_set'] = stimulus_set
    assembly.attrs = attrs
    return assembly
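# Usage sketch for average_repetition (hedged: `benchmark` stands for whatever
# object this method lives on; the assembly is expected to carry repetition_id/id
# coords on `presentation` and a stimulus_set in its attrs):
#   averaged = benchmark.average_repetition(raw_assembly)
# repetitions are averaged out, and NaN-only entries are dropped from both the
# assembly and the attached stimulus_set via self.dropna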
def predict_proba(self, X):
    assert len(X.shape) == 2, "expected 2-dimensional input"
    scaled_X = self._scaler.transform(X)
    proba = self._classifier.predict_proba(scaled_X)
    # we take only the 0th dimension because the 1st dimension is just the features
    X_coords = {coord: (dims, value) for coord, dims, value in walk_coords(X)
                if array_is_element(dims, X.dims[0])}
    proba = BehavioralAssembly(proba,
                               coords={**X_coords, **{'choice': list(self._label_mapping.values())}},
                               dims=[X.dims[0], 'choice'])
    return proba
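# Usage sketch, analogous to the TensorFlow variant above (hedged: `classifier` is
# assumed to be a trained instance of this scikit-learn-backed class, and
# `features` a 2-d BehavioralAssembly as constructed earlier):
proba = classifier.predict_proba(features)                       # dims ['presentation', 'choice']
choices = proba['choice'].values[proba.argmax('choice').values]  # most likely label per stimulus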
def add_neuroid_meta(self, target, source):
    target = target.expand_dims(self.extrapolation_dimension)
    for coord, dims, values in walk_coords(source):
        if array_is_element(dims, self.extrapolation_dimension):
            target[coord] = dims, values
    return target
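# Minimal plain-xarray sketch of the pattern above (toy data, illustrative names):
# a scalar score gains the extrapolation dimension, then coords living on that
# dimension in the source are copied over.
import xarray as xr

score = xr.DataArray(0.5)              # scalar value
target = score.expand_dims('neuroid')  # gains a length-1 'neuroid' dimension
source = xr.DataArray([1], coords={'neuroid_id': ('neuroid', ['v1'])}, dims=['neuroid'])
target['neuroid_id'] = 'neuroid', source['neuroid_id'].values  # meta copied over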
def _merge_voxel_meta(data, meta, bold_shift_seconds):
    data_missing = set(meta['story'].values) - set(data['story'].values)
    if data_missing:
        warnings.warn(f"Stories missing from the data: {data_missing}")
    meta_missing = set(data['story'].values) - set(meta['story'].values)
    if meta_missing:
        warnings.warn(f"Stories missing from the meta: {meta_missing}")

    ignored_words = [None, '', '<s>', '</s>', '<s']
    annotated_data = []
    for story in tqdm(ordered_set(data['story'].values), desc='merge meta'):
        if story not in meta['story'].values:
            continue
        story_meta = meta.sel(story=story)
        story_meta = story_meta.sortby('time_end')

        story_data = data.sel(story=story).stack(timepoint=['timepoint_value'])
        story_data = story_data.sortby('timepoint_value')
        timepoints = story_data['timepoint_value'].values.tolist()
        assert is_sorted(timepoints)
        timepoints = [timepoint - bold_shift_seconds for timepoint in timepoints]
        sentences = []
        last_timepoint = -np.inf
        for timepoint in timepoints:
            if last_timepoint >= max(story_meta['time_end'].values):
                break
            if timepoint <= 0:
                sentences.append(None)
                continue  # ignore fixation period
            timebin_meta = [last_timepoint < end <= timepoint for end in story_meta['time_end'].values]
            timebin_meta = story_meta[{'time_bin': timebin_meta}]
            sentence = ' '.join(word.strip() for word in timebin_meta.values if word not in ignored_words)
            sentence = sentence.lower().strip()
            # quick-fixes
            if story == 'Boar' and sentence == 'interactions the the':  # Boar duplicate
                sentence = 'interactions the'
            if story == 'KingOfBirds' and sentence == 'the fact that the larger':  # missing word in TextGrid
                sentence = 'earth ' + sentence
            if story == 'MrSticky' and sentence == 'worry don\'t worry i went extra slowly since it\'s':
                sentence = 'don\'t worry i went extra slowly since it\'s'
            sentences.append(sentence)
            last_timepoint = timebin_meta['time_end'].values[-1]
        sentence_index = [i for i, sentence in enumerate(sentences) if sentence]
        sentences = np.array(sentences)[sentence_index]
        if story not in ['Boar', 'KingOfBirds', 'MrSticky']:  # ignore quick-fixes
            annotated_sentence = ' '.join(sentences)
            meta_sentence = ' '.join(word.strip() for word in story_meta.values
                                     if word not in ignored_words).lower().strip()
            assert annotated_sentence == meta_sentence

        # re-interpret timepoints as stimuli
        coords = {}
        for coord_name, dims, coord_value in walk_coords(story_data):
            dims = [dim if not dim.startswith('timepoint') else 'presentation' for dim in dims]
            # discard the timepoints for which the stimulus did not change (empty word)
            coord_value = coord_value if not array_is_element(dims, 'presentation') \
                else coord_value[sentence_index]
            coords[coord_name] = dims, coord_value
        coords = {**coords, **{'stimulus_sentence': ('presentation', sentences)}}
        story_data = story_data[{dim: slice(None) if dim != 'timepoint' else sentence_index
                                 for dim in story_data.dims}]
        dims = [dim if not dim.startswith('timepoint') else 'presentation' for dim in story_data.dims]
        story_data = xr.DataArray(story_data.values, coords=coords, dims=dims)
        story_data['story'] = 'presentation', [story] * len(story_data['presentation'])
        gather_indexes(story_data)
        annotated_data.append(story_data)
    annotated_data = merge_data_arrays(annotated_data)
    return annotated_data
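# Minimal sketch of the time-binning above (values made up): each shifted BOLD
# timepoint collects the words whose time_end falls in (last_timepoint, timepoint].
time_end = [0.5, 1.2, 1.8, 2.6]   # word offsets from the meta
words = ['the', 'fox', 'jumped', 'far']
timepoints = [2.0, 3.0]           # BOLD timepoints after the bold shift
last_timepoint = float('-inf')
for timepoint in timepoints:
    in_bin = [word for word, end in zip(words, time_end) if last_timepoint < end <= timepoint]
    print(timepoint, ' '.join(in_bin))  # 2.0 'the fox jumped', then 3.0 'far'
    last_timepoint = max(end for end in time_end if end <= timepoint)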
def package(features_path='/braintree/data2/active/users/qbilius/computed/hvm/ait'):
    assert os.path.isdir(features_path)
    features_paths = [os.path.join(features_path, 'basenets_hvm_feats_V4'),
                      os.path.join(features_path, 'basenets_hvm_feats_pIT'),
                      os.path.join(features_path, 'basenets_hvm_feats')]
    # alignment
    meta = pd.read_pickle(os.path.join(os.path.dirname(__file__), 'basenets-meta.pkl'))
    meta = meta[meta['var'] == 6]
    meta_ids = meta['id'].values.tolist()
    hvm = brainscore.get_assembly('dicarlo.Majaj2015') \
        .sel(variation=6) \
        .multi_groupby(['category_name', 'object_name', 'image_id']) \
        .mean(dim="presentation") \
        .squeeze("time_bin")
    hvm_ids = hvm['image_id'].values.tolist()
    assert len(hvm_ids) == len(meta_ids)
    indexes = [meta_ids.index(id) for id in hvm_ids]

    basenets = []
    for activations_path_v4 in glob.glob(os.path.join(features_paths[0], '*.npy')):
        activations_path_pit = os.path.abspath(
            os.path.join(features_paths[1], os.path.basename(activations_path_v4)))
        activations_path_ait = os.path.abspath(
            os.path.join(features_paths[2], os.path.basename(activations_path_v4)))
        assert os.path.isfile(activations_path_pit)
        assert os.path.isfile(activations_path_ait)
        print(activations_path_v4, activations_path_pit, activations_path_ait, end='')
        activations_v4 = np.load(activations_path_v4)
        activations_pit = np.load(activations_path_pit)
        activations_ait = np.load(activations_path_ait)
        assert activations_v4.shape[0] == activations_pit.shape[0] \
               == activations_ait.shape[0] == len(indexes)
        # re-order each layer's activations to match the hvm image order
        activations_v4 = activations_v4[indexes, :]
        activations_pit = activations_pit[indexes, :]
        activations_ait = activations_ait[indexes, :]
        coords = {coord: (dims, values) for coord, dims, values in walk_coords(hvm)
                  if array_is_element(dims, 'presentation')}
        coords['neuroid_id'] = 'neuroid', list(range(3000))
        coords['layer'] = 'neuroid', np.concatenate([np.repeat('basenet-layer_v4', 1000),
                                                     np.repeat('basenet-layer_pit', 1000),
                                                     np.repeat('basenet-layer_ait', 1000)])
        activations = np.concatenate([activations_v4, activations_pit, activations_ait], axis=1)
        print(activations.shape, end='')
        assert activations.shape[0] == len(indexes)
        assembly = NeuroidAssembly(activations, coords=coords, dims=['presentation', 'neuroid'])
        model_name = os.path.splitext(os.path.basename(activations_path_pit))[0]
        basenets.append(model_name)
        target_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..',
            'output/candidate_models.models.model_activations',
            'model={},stimulus_set=dicarlo.hvm,weights=imagenet,image_size=224,pca_components=1000.pkl'
            .format(model_name)))
        print("-->", target_path)
        with open(target_path, 'wb') as target_file:
            pickle.dump({'data': assembly}, target_file)
    print(" ".join(basenets))