def read_words(candidate, stimulus_set, reset_column='sentence_id', copy_columns=(), average_sentence=False):
    """
    Pass a `stimulus_set` through a model `candidate`.
    In contrast to the `listen_to` function, this function operates on a word-based `stimulus_set`.
    """
    # Input: stimulus_set is a pandas DataFrame with a sentence-id column (`reset_column`) and a word column.
    activations = []
    for i, reset_id in enumerate(ordered_set(stimulus_set[reset_column].values)):
        part_stimuli = stimulus_set[stimulus_set[reset_column] == reset_id]
        # stimulus_ids = part_stimuli['stimulus_id']
        sentence_stimuli = StimulusSet({'sentence': ' '.join(part_stimuli['word']),
                                        reset_column: list(set(part_stimuli[reset_column]))})
        sentence_stimuli.name = f"{stimulus_set.name}-{reset_id}"
        sentence_activations = candidate(stimuli=sentence_stimuli, average_sentence=average_sentence)
        for column in copy_columns:
            sentence_activations[column] = ('presentation', part_stimuli[column])
        activations.append(sentence_activations)

    model_activations = merge_data_arrays(activations)
    # merging does not maintain stimulus order; the following reorders
    idx = [model_activations['stimulus_id'].values.tolist().index(stimulus_id) for stimulus_id in
           itertools.chain.from_iterable(s['stimulus_id'].values for s in activations)]
    assert len(set(idx)) == len(idx), "Found duplicate indices to order activations"
    model_activations = model_activations[{'presentation': idx}]

    return model_activations
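# Hedged usage sketch for `read_words` above. The stimulus values and the `model`
# argument are illustrative assumptions, not fixtures from this repo; `model`
# stands for any candidate accepting `stimuli=` and `average_sentence=`.
def _example_read_words(model):
    stimulus_set = StimulusSet({'sentence_id': [1, 1, 2, 2],
                                'word': ['The', 'boar', 'It', 'ran'],
                                'stimulus_id': [0, 1, 2, 3]})
    stimulus_set.name = 'toy-words'
    # copies the word-level 'stimulus_id' column onto the returned activations
    return read_words(model, stimulus_set, reset_column='sentence_id',
                      copy_columns=['stimulus_id'], average_sentence=False)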
def _load_rdms(self, roi_filter='from90to100', bold_shift_seconds=4):
    assemblies = {}
    for story in ['Boar', 'KingOfBirds', 'Elvis', 'HighSchool', 'MatchstickSeller']:
        assembly = load_rdm_sentences(story=story, roi_filter=roi_filter, bold_shift_seconds=bold_shift_seconds)
        assembly = assembly.mean(dim='subject')
        stimulus_set_identifier = f'naturalistic-neural-reduced.{story}'
        stimulus_set = load_stimuli(stimulus_set_identifier)
        stimulus_set = StimulusSet({'sentence': stimulus_set})
        stimulus_set.name = stimulus_set_identifier
        assembly.attrs['stimulus_set'] = stimulus_set
        assemblies[story] = assembly
    return assemblies
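# Hedged sketch of consuming the per-story assemblies returned by `_load_rdms`;
# `benchmark` is a hypothetical object exposing the method above.
def _example_iterate_rdms(benchmark):
    assemblies = benchmark._load_rdms(roi_filter='from90to100', bold_shift_seconds=4)
    for story, assembly in assemblies.items():
        # each assembly is averaged over subjects and carries its StimulusSet in attrs
        print(story, assembly.dims, assembly.attrs['stimulus_set'].name)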
def read_words(candidate, stimulus_set):
    # This is a new version of the listen_to_stories function.
    # Input: stimulus_set is a pandas DataFrame with a sentence-id column and a word column.
    activations = []
    for i, sentence_id in enumerate(ordered_set(stimulus_set['sentence_id'].values)):
        sentence_stimuli = stimulus_set[stimulus_set['sentence_id'] == sentence_id]
        sentence_stimuli = StimulusSet({'sentence': ' '.join(sentence_stimuli['word']),
                                        'sentence_id': list(set(sentence_stimuli['sentence_id']))})
        sentence_stimuli.name = f"{stimulus_set.name}-{sentence_id}"
        sentence_activations = candidate(stimuli=sentence_stimuli)
        # every sentence consists of exactly 8 words, so each sentence gets 8 consecutive stimulus ids
        sentence_activations['stimulus_id'] = ('presentation', 8 * i + np.arange(0, 8))
        sentence_activations['sentence_id'] = ('presentation', [sentence_id] * 8)
        activations.append(sentence_activations)

    model_activations = merge_data_arrays(activations)
    # merging does not maintain stimulus order; the following reorders
    idx = [model_activations['stimulus_id'].values.tolist().index(stimulus_id) for stimulus_id in
           itertools.chain.from_iterable(s['stimulus_id'].values for s in activations)]
    assert len(set(idx)) == len(idx), "Found duplicate indices to order activations"
    model_activations = model_activations[{'presentation': idx}]

    return model_activations
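# Minimal sketch of the `ordered_set` helper that both `read_words` variants
# above assume: deduplicate while preserving first-occurrence order (a plain
# `set` would scramble sentence order). This describes the assumed behavior,
# not necessarily the repo's actual implementation.
def ordered_set(values):
    return list(dict.fromkeys(values))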
def load_naturalStories():
    ressources_dir = Path(__file__).parent.parent.parent / 'ressources'
    data_path = ressources_dir / 'neural_data' / 'naturalstories_RTS'
    data_file = data_path / 'processed_RTs.csv'
    _logger.info(f'Data file: {data_file}')

    # get data
    data = pd.read_csv(data_file)

    # get unique (item, zone) word-identifier tuples, ordered by story and position within story
    item_ID = np.array(data['item'])
    zone_ID = np.array(data['zone'])
    zpd_lst = list(zip(item_ID, zone_ID))
    unique_zpd_lst = list(set(zpd_lst))
    unique_zpd_lst = sorted(unique_zpd_lst, key=lambda tup: (tup[0], tup[1]))

    # get unique WorkerIds
    subjects = data.WorkerId.unique()

    # ====== create matrix ======
    r_dim = len(unique_zpd_lst)
    c_dim = len(subjects)
    # default value when a subject has no RT for a story/word is NaN
    matrix = np.empty((r_dim, c_dim))
    matrix[:] = np.nan

    # set row and column indices for matrix
    r_indices = {unique_zpd_lst[i]: i for i in range(r_dim)}
    c_indices = {subjects[i]: i for i in range(c_dim)}

    # populate the matrix and the meta-information dictionary for the subjects xarray dimension
    metaInfo_subjects = {}
    for index, d in tqdm(data.iterrows(), total=len(data), desc='indices'):
        r = r_indices[(d['item'], d['zone'])]
        c = c_indices[d['WorkerId']]
        matrix[r][c] = d['RT']
        key = d['WorkerId']
        if key not in metaInfo_subjects:
            metaInfo_subjects[key] = (d['correct'], d['WorkTimeInSeconds'])
    matrix = np.array(matrix)

    # get subjects' metadata
    correct_meta = [v[0] for v in metaInfo_subjects.values()]
    WorkTimeInSeconds_meta = [v[1] for v in metaInfo_subjects.values()]

    # get metadata for presentation dimension
    word_df = pd.read_csv(f'{data_path}/all_stories.tok', sep='\t')
    voc_item_ID = np.array(word_df['item'])
    voc_zone_ID = np.array(word_df['zone'])
    voc_word = np.array(word_df['word'])

    # get sentence_IDs (finds 481 sentences)
    sentence_ID = []
    idx = 1
    for i, elm in enumerate(voc_word):
        sentence_ID.append(idx)
        if elm.endswith((".", "?", "!", ".'", "?'", "!'", ";'")):
            if i + 1 < len(voc_word):
                if not (voc_word[i + 1].islower() or voc_word[i] == "Mr."):
                    idx += 1

    # get IDs of words within a sentence
    word_within_a_sentence_ID = []
    idx = 0
    for i, elm in enumerate(voc_word):
        idx += 1
        word_within_a_sentence_ID.append(idx)
        if elm.endswith((".", "?", "!", ".'", "?'", "!'", ";'")):
            if i + 1 < len(voc_word):
                if not (voc_word[i + 1].islower() or voc_word[i] == "Mr."):
                    idx = 0
            else:
                idx = 0

    # stimulus_ID: unique index of each word across all stories
    stimulus_ID = list(range(1, len(voc_word) + 1))

    # set df_stimulus_set for attributes
    df_stimulus_set = word_df[['word', 'item', 'zone']]
    df_stimulus_set = StimulusSet(df_stimulus_set)
    df_stimulus_set['story_id'] = df_stimulus_set['item']
    df_stimulus_set['stimulus_id'] = stimulus_ID
    df_stimulus_set['sentence_id'] = sentence_ID
    df_stimulus_set['word_id'] = voc_zone_ID
    df_stimulus_set['word_within_sentence_id'] = word_within_a_sentence_ID
    df_stimulus_set.name = 'naturalStories'

    # build xarray
    # voc_word = word
    # voc_item_ID = index of story (1-10)
    # voc_zone_ID = index of a word within a story
    # sentence_ID = index of the sentence a word belongs to
    # word_within_a_sentence_ID = index of a word within its sentence
    # stimulus_ID = unique index of a word across all stories
    # subjects = WorkerIDs
    # correct_meta = number of correct answers in comprehension questions
    assembly = xr.DataArray(matrix,
                            dims=('presentation', 'subjects'),
                            coords={'word': ('presentation', voc_word),
                                    'story_id': ('presentation', voc_item_ID),
                                    'word_id': ('presentation', voc_zone_ID),
                                    'word_within_sentence_id': ('presentation', word_within_a_sentence_ID),
                                    'sentence_id': ('presentation', sentence_ID),
                                    'stimulus_id': ('presentation', stimulus_ID),
                                    'subject_id': ('subjects', subjects),
                                    'correct': ('subjects', correct_meta),
                                    'WorkTimeInSeconds': ('subjects', WorkTimeInSeconds_meta)})

    assembly.attrs['stimulus_set'] = df_stimulus_set  # add the stimulus_set dataframe
    return assembly
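# Hedged usage sketch for `load_naturalStories`: the assembly is a
# (presentation x subjects) matrix of reading times; missing RTs are NaN.
def _example_natural_stories():
    assembly = load_naturalStories()
    # mean reading time per word across subjects, skipping missing entries
    mean_rts = assembly.mean(dim='subjects', skipna=True)
    return mean_rts, assembly.attrs['stimulus_set']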
def _align_stimuli_recordings(stimulus_set, assembly):
    aligned_stimulus_set = []
    partial_sentences = assembly['stimulus_sentence'].values
    partial_sentences = [compare_ignore(sentence) for sentence in partial_sentences]
    assembly_stimset = {}
    stimulus_set_index = 0

    stories = ordered_set(assembly['story'].values.tolist())
    for story in tqdm(sorted(stories), desc='align stimuli', total=len(stories)):
        story_partial_sentences = [(sentence, i) for i, (sentence, sentence_story) in enumerate(
            zip(partial_sentences, assembly['story'].values)) if sentence_story == story]

        story_stimuli = stimulus_set[stimulus_set['story'] == story]
        stimuli_story = ' '.join(story_stimuli['sentence'])
        stimuli_story_sentence_starts = [0] + [len(sentence) + 1 for sentence in story_stimuli['sentence']]
        stimuli_story_sentence_starts = np.cumsum(stimuli_story_sentence_starts)
        assert ' '.join(s for s, i in story_partial_sentences) == compare_ignore(stimuli_story)
        stimulus_index = 0
        Stimulus = namedtuple('Stimulus', ['story', 'sentence', 'sentence_num', 'sentence_part'])
        sentence_parts = defaultdict(int)
        for partial_sentence, assembly_index in story_partial_sentences:
            full_partial_sentence = ''
            partial_sentence_index = 0
            while partial_sentence_index < len(partial_sentence) \
                    or stimulus_index < len(stimuli_story) \
                    and stimuli_story[stimulus_index] in compare_characters + [' ']:
                if partial_sentence_index < len(partial_sentence) \
                        and partial_sentence[partial_sentence_index].lower() \
                        == stimuli_story[stimulus_index].lower():
                    full_partial_sentence += stimuli_story[stimulus_index]
                    stimulus_index += 1
                    partial_sentence_index += 1
                elif stimuli_story[stimulus_index] in compare_characters + [' ']:
                    # this case leads to a potential issue: beginning quotations (') are always appended to
                    # the current instead of the next sentence. For now, I'm hoping this won't lead to issues.
                    full_partial_sentence += stimuli_story[stimulus_index]
                    stimulus_index += 1
                elif stimuli_story[stimulus_index] == '-':
                    full_partial_sentence += '-'
                    stimulus_index += 1
                    if partial_sentence[partial_sentence_index] == ' ':
                        partial_sentence_index += 1
                else:
                    raise NotImplementedError()
            sentence_num = next(index for index, start in enumerate(stimuli_story_sentence_starts)
                                if start >= stimulus_index) - 1
            sentence_part = sentence_parts[sentence_num]
            sentence_parts[sentence_num] += 1
            row = Stimulus(sentence=full_partial_sentence, story=story,
                           sentence_num=sentence_num, sentence_part=sentence_part)
            aligned_stimulus_set.append(row)
            assembly_stimset[assembly_index] = stimulus_set_index
            stimulus_set_index += 1
        # check
        aligned_story = "".join(row.sentence for row in aligned_stimulus_set if row.story == story)
        assert aligned_story == stimuli_story

    # build StimulusSet
    aligned_stimulus_set = StimulusSet(aligned_stimulus_set)
    aligned_stimulus_set['stimulus_id'] = [".".join([str(value) for value in values]) for values in zip(*[
        aligned_stimulus_set[coord].values for coord in ['story', 'sentence_num', 'sentence_part']])]
    aligned_stimulus_set.name = f"{stimulus_set.name}-aligned"

    # align assembly
    alignment = [stimset_idx for assembly_idx, stimset_idx
                 in sorted(assembly_stimset.items(), key=operator.itemgetter(0))]
    assembly_coords = {**{coord: (dims, values) for coord, dims, values in walk_coords(assembly)},
                       **{'stimulus_id': ('presentation', aligned_stimulus_set['stimulus_id'].values[alignment]),
                          'meta_sentence': ('presentation', assembly['stimulus_sentence'].values),
                          'stimulus_sentence': ('presentation', aligned_stimulus_set['sentence'].values[alignment])}}
    assembly = type(assembly)(assembly.values, coords=assembly_coords, dims=assembly.dims)

    return aligned_stimulus_set, assembly
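# The aligner above depends on two helpers not shown here. A plausible minimal
# sketch under the following assumptions: `compare_ignore` strips punctuation
# that is irrelevant for matching, and `compare_characters` lists exactly those
# characters; '-' is excluded because the loop above handles it separately.
# The real definitions in the repo may differ.
compare_characters = [',', '"', "'", ':', ';', '(', ')', '`']

def compare_ignore(sentence):
    return ''.join(character for character in sentence
                   if character not in compare_characters).strip()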
def load_Pereira2018():
    data_dir = neural_data_dir / "Pereira2018"
    experiment2, experiment3 = "243sentences.mat", "384sentences.mat"
    stimuli = {}  # experiment -> stimuli
    assemblies = []
    subject_directories = [d for d in data_dir.iterdir() if d.is_dir()]
    for subject_directory in tqdm(subject_directories, desc="subjects"):
        for experiment_filename in [experiment2, experiment3]:
            data_file = subject_directory / f"examples_{experiment_filename}"
            if not data_file.is_file():
                _logger.debug(f"{subject_directory} does not contain {experiment_filename}")
                continue
            data = scipy.io.loadmat(str(data_file))

            # assembly
            assembly = data['examples']
            meta = data['meta']
            assembly = NeuroidAssembly(assembly, coords={
                'experiment': ('presentation', [os.path.splitext(experiment_filename)[0]] * assembly.shape[0]),
                'stimulus_num': ('presentation', list(range(assembly.shape[0]))),
                'passage_index': ('presentation', data['labelsPassageForEachSentence'][:, 0]),
                'passage_label': ('presentation', [data['keyPassages'][index - 1, 0][0]
                                                   for index in data['labelsPassageForEachSentence'][:, 0]]),
                'passage_category': ('presentation', [
                    data['keyPassageCategory'][0, data['labelsPassageCategory'][index - 1, 0] - 1][0][0]
                    for index in data['labelsPassageForEachSentence'][:, 0]]),
                'subject': ('neuroid', [subject_directory.name] * assembly.shape[1]),
                'voxel_num': ('neuroid', list(range(assembly.shape[1]))),
                'AAL_roi_index': ('neuroid', meta[0][0]['roiMultimaskAAL'][:, 0]),
            }, dims=['presentation', 'neuroid'])
            stimulus_id = _build_id(assembly, ['experiment', 'stimulus_num'])
            assembly['stimulus_id'] = 'presentation', stimulus_id
            # set story for compatibility
            assembly['story'] = 'presentation', _build_id(assembly, ['experiment', 'passage_category'])
            assembly['neuroid_id'] = 'neuroid', _build_id(assembly, ['subject', 'voxel_num'])
            assemblies.append(assembly)

            # stimuli
            if experiment_filename not in stimuli:
                sentences = data['keySentences']
                sentences = [sentence[0] for sentence in sentences[:, 0]]
                stimuli[experiment_filename] = {
                    'sentence': sentences,
                    'sentence_num': list(range(len(sentences))),
                    'stimulus_id': stimulus_id,
                    'experiment': assembly['experiment'].values,
                    'story': assembly['story'].values,
                }
                for copy_coord in ['experiment', 'story', 'passage_index', 'passage_label', 'passage_category']:
                    stimuli[experiment_filename][copy_coord] = assembly[copy_coord].values

    _logger.debug(f"Merging {len(assemblies)} assemblies")
    assembly = merge_data_arrays(assemblies)

    _logger.debug("Creating StimulusSet")
    combined_stimuli = {}
    for key in stimuli[experiment2]:
        combined_stimuli[key] = np.concatenate((stimuli[experiment2][key], stimuli[experiment3][key]))
    stimuli = StimulusSet(combined_stimuli)
    stimuli.name = "Pereira2018"
    assembly.attrs['stimulus_set'] = stimuli
    return assembly
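# Hedged sketch of the `_build_id` helper used above, assuming it joins the
# values of the given coordinates into one dot-separated id per entry, e.g.
# '243sentences.17' for coords ['experiment', 'stimulus_num'].
def _build_id(assembly, coords):
    return [".".join([str(value) for value in values])
            for values in zip(*[assembly[coord].values for coord in coords])]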
def load_Fedorenko2016(electrodes, version):
    ressources_dir = Path(__file__).parent.parent.parent / 'ressources'
    neural_data_dir = ressources_dir / 'neural_data' / 'ecog-Fedorenko2016/'
    stim_data_dir = ressources_dir / 'stimuli' / 'sentences_8'
    _logger.info(f'Neural data directory: {neural_data_dir}')
    filepaths_stim = glob(os.path.join(stim_data_dir, '*.txt'))

    # ECoG
    data = None

    # For language-responsive electrodes:
    if electrodes == 'language':
        # Create a subject ID list corresponding to language electrodes
        subject1 = np.repeat(1, 47)
        subject2 = np.repeat(2, 9)
        subject3 = np.repeat(3, 9)
        subject4 = np.repeat(4, 15)
        subject5 = np.repeat(5, 18)

        if version == 1:
            filepath_neural = glob(os.path.join(neural_data_dir, '*ecog.mat'))
        if version == 2:
            filepath_neural = glob(os.path.join(neural_data_dir, '*metadata_lang.mat'))
        if version == 3:
            subject1 = np.repeat(1, 47)
            subject2 = np.repeat(2, 8)
            subject3 = np.repeat(3, 9)
            subject4 = np.repeat(4, 15)
            subject5 = np.repeat(5, 18)
            filepath_neural = glob(os.path.join(neural_data_dir, '*g_lang_v3.mat'))
        if version == 4:
            subject1 = np.repeat(1, 49)
            subject2 = np.repeat(2, 8)
            subject3 = np.repeat(3, 10)
            subject4 = np.repeat(4, 16)
            subject5 = np.repeat(5, 19)
            subject6 = np.repeat(6, 3)
            filepath_neural = glob(os.path.join(neural_data_dir, '*g_lang_v4.mat'))

        _logger.debug(f'Running Fedorenko2016 benchmark with language-responsive electrodes, data version: {version}')

    # For non-noisy electrodes
    if electrodes == 'all':
        # Create a subject ID list corresponding to all electrodes
        subject1 = np.repeat(1, 70)
        subject2 = np.repeat(2, 35)
        subject3 = np.repeat(3, 20)
        subject4 = np.repeat(4, 29)
        subject5 = np.repeat(5, 26)

        if version == 1:
            filepath_neural = glob(os.path.join(neural_data_dir, '*ecog_all.mat'))
        if version == 2:
            filepath_neural = glob(os.path.join(neural_data_dir, '*metadata_all.mat'))
        if version == 3:
            subject1 = np.repeat(1, 67)
            subject2 = np.repeat(2, 35)
            subject3 = np.repeat(3, 20)
            subject4 = np.repeat(4, 29)
            subject5 = np.repeat(5, 26)
            filepath_neural = glob(os.path.join(neural_data_dir, '*all_v3.mat'))
        if version == 4:
            subject1 = np.repeat(1, 63)
            subject2 = np.repeat(2, 35)
            subject3 = np.repeat(3, 21)
            subject4 = np.repeat(4, 29)
            subject5 = np.repeat(5, 27)
            subject6 = np.repeat(6, 9)
            filepath_neural = glob(os.path.join(neural_data_dir, '*all_v4.mat'))

        _logger.debug(f'Running Fedorenko2016 benchmark with non-noisy electrodes, data version: {version}')

    # For non-language electrodes
    if electrodes == 'non-language':
        if version == 1 or version == 2:
            filepath_neural = glob(os.path.join(neural_data_dir, '*nonlang.mat'))
            # Create a subject ID list corresponding to non-language electrodes
            subject1 = np.repeat(1, 28)
            subject2 = np.repeat(2, 31)
            subject3 = np.repeat(3, 14)
            subject4 = np.repeat(4, 19)
            subject5 = np.repeat(5, 16)
        if version == 3:
            filepath_neural = glob(os.path.join(neural_data_dir, '*nonlang_v3.mat'))
            # Create a subject ID list corresponding to non-language electrodes
            subject1 = np.repeat(1, 25)  # 47 lang selective,
            subject2 = np.repeat(2, 31)
            subject3 = np.repeat(3, 14)
            subject4 = np.repeat(4, 19)
            subject5 = np.repeat(5, 16)  # 10 lang electrodes in the non-noisy
        if version == 4:
            filepath_neural = glob(os.path.join(neural_data_dir, '*nonlang_v4.mat'))
            # Create a subject ID list corresponding to non-language electrodes
            subject1 = np.repeat(1, 22)
            subject2 = np.repeat(2, 31)
            subject3 = np.repeat(3, 15)
            subject4 = np.repeat(4, 19)
            subject5 = np.repeat(5, 18)
            subject6 = np.repeat(6, 6)

        _logger.debug(f'Running Fedorenko2016 benchmark with non-language electrodes, data version: {version}')

    ecog_mat = sio.loadmat(filepath_neural[0])
    ecog_mtrix = ecog_mat['ecog']

    if version == 1:  # manually z-score the version-1 data
        ecog_z = stats.zscore(ecog_mtrix, axis=1)
    if version in (2, 3, 4):
        ecog_z = ecog_mtrix

    ecog_mtrix_T = np.transpose(ecog_z)

    num_words = list(range(np.shape(ecog_mtrix_T)[0]))
    new_sent_idx = num_words[::8]

    # Average across word representations
    sent_avg_ecog = []
    for i in new_sent_idx:
        eight_words = ecog_mtrix_T[i:i + 8, :]
        sent_avg = np.mean(eight_words, 0)
        sent_avg_ecog.append(sent_avg)

    # Stimuli
    for filepath in filepaths_stim:
        with open(filepath, 'r') as file1:
            f1 = file1.readlines()
        _logger.debug(f1)

        sentences = []
        sentence_words, word_nums = [], []
        for sentence in f1:
            sentence = sentence.split(' ')
            sentences.append(sentence)
            word_counter = 0
            for word in sentence:
                if word == '\n':
                    continue
                word = word.rstrip('\n')
                sentence_words.append(word)
                word_nums.append(word_counter)
                word_counter += 1
        _logger.debug(sentence_words)

    # Create sentence-ID list (52 sentences, 8 words each)
    sentence_lst = list(range(0, 52))
    sentenceID = np.repeat(sentence_lst, 8)

    if version in (1, 2, 3):
        subjectID = np.concatenate([subject1, subject2, subject3, subject4, subject5], axis=0)
    if version == 4:
        subjectID = np.concatenate([subject1, subject2, subject3, subject4, subject5, subject6], axis=0)

    # Create a list for each word number
    word_number = list(range(np.shape(ecog_mtrix_T)[0]))

    # Add a pd df as the stimulus_set
    zipped_lst = list(zip(sentenceID, word_number, sentence_words))
    df_stimulus_set = StimulusSet(zipped_lst, columns=['sentence_id', 'stimulus_id', 'word'])
    df_stimulus_set.name = 'Fedorenko2016.ecog'

    # xarray
    electrode_numbers = list(range(np.shape(ecog_mtrix_T)[1]))
    assembly = xr.DataArray(ecog_mtrix_T,
                            dims=('presentation', 'neuroid'),
                            coords={'stimulus_id': ('presentation', word_number),
                                    'word': ('presentation', sentence_words),
                                    'word_num': ('presentation', word_nums),
                                    'sentence_id': ('presentation', sentenceID),
                                    'electrode': ('neuroid', electrode_numbers),
                                    'neuroid_id': ('neuroid', electrode_numbers),
                                    'subject_UID': ('neuroid', subjectID),  # name is subject_UID for consistency
                                    })

    assembly.attrs['stimulus_set'] = df_stimulus_set  # add the stimulus_set dataframe
    # xr.concat takes a sequence of objects plus a concatenation dimension
    data = assembly if data is None else xr.concat([data, assembly], dim='presentation')

    return NeuroidAssembly(data)
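# Hedged usage sketch for `load_Fedorenko2016`: `electrodes` is one of
# 'language', 'all', or 'non-language', and `version` is 1-4 (see above).
def _example_fedorenko2016():
    assembly = load_Fedorenko2016(electrodes='language', version=4)
    # presentation: 52 sentences x 8 words = 416 rows; neuroid: electrodes across subjects
    print(assembly.dims, assembly.shape)
    return assembly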