def main(mask: str = 'hcp'):
    """Run the full preprocessing pipeline for one mask.

    Masks all studies into ``masked_<mask>`` then reduces them onto the
    512-component dictionary into ``reduced_512_<mask>``.
    """
    data_dir = get_data_dir()
    masked_dir = join(data_dir, 'masked_%s' % mask)
    reduced_dir = join(data_dir, 'reduced_512_%s' % mask)
    mask_all(output_dir=masked_dir, n_jobs=30, mask=mask)
    reduce_all(output_dir=reduced_dir, masked_dir=masked_dir,
               n_jobs=30, mask=mask)
def fetch_reduced_loadings(data_dir=None, url=None, verbose=False,
                           resume=True):
    """Download reduced z-map loadings for the benchmark studies.

    Returns a dict mapping each study key in ``STUDY_LIST`` to the local
    path of its ``data_<study>.pt`` file, plus ``description`` and
    ``data_dir`` entries.
    """
    if url is None:
        url = 'http://cogspaces.github.io/assets/data/loadings/'
    dataset_dir = _get_dataset_dir('loadings',
                                   data_dir=get_data_dir(data_dir),
                                   verbose=verbose)
    filenames = ['data_%s.pt' % key for key in STUDY_LIST]
    to_fetch = [(name, url + name, {}) for name in filenames]
    fetched = _fetch_files(dataset_dir, to_fetch, resume=resume,
                           verbose=verbose)
    params = dict(zip(STUDY_LIST, fetched))
    params['description'] = (
        "Z-statistic loadings over a dictionary of 453 components covering "
        "grey-matter `modl_atlas['components_512_gm']` "
        "for 35 different task fMRI studies.")
    params['data_dir'] = dataset_dir
    return params
def fetch_hcp(data_dir=None, n_subjects=None, subjects=None, from_file=True):
    """Load HCP900 rest/task/contrast metadata via hcp_builder.

    Restricts contrasts to the base HCP task battery and re-indexes them
    as (study, subject, task, contrast, direction) so the result is
    compatible with the other study fetchers in this file.
    Raises ValueError if the HCP900 source directory is missing.
    """
    data_dir = get_data_dir(data_dir)
    # Contrast names kept from the HCP task battery (motor, WM, social, ...).
    BASE_CONTRASTS = [
        'FACES', 'SHAPES', 'PUNISH', 'REWARD', 'MATH', 'STORY', 'MATCH',
        'REL', 'RANDOM', 'TOM', 'LF', 'RF', 'LH', 'RH', 'CUE',
        '0BK_BODY', '0BK_FACE', '0BK_PLACE', '0BK_TOOL',
        '2BK_BODY', '2BK_FACE', '2BK_PLACE', '2BK_TOOL',
    ]
    source_dir = join(data_dir, 'HCP900')
    if not os.path.exists(source_dir):
        raise ValueError('Please ensure that %s contains all required data.'
                         % source_dir)
    res = hcp_builder_fetch_hcp(data_dir=source_dir, n_subjects=n_subjects,
                                from_file=from_file, subjects=subjects,
                                on_disk=True)
    # Add a null 'confounds' column so rest/task share the schema used by
    # other studies.
    rest = res.rest.assign(confounds=[None] * res.rest.shape[0])
    task = res.task.assign(confounds=[None] * res.task.shape[0])
    task.sort_index(inplace=True)
    rest.sort_index(inplace=True)
    # Make it compatible with the other studies
    # NOTE(review): `idx` is presumably pandas.IndexSlice imported at file
    # top -- selects all subjects/sessions for the base contrasts.
    contrasts = res.contrasts.loc[idx[:, :, BASE_CONTRASTS, :], :]
    contrasts = contrasts[['z_map']]
    # Replace symlink: z_map paths were recorded under a fixed storage
    # prefix; rewrite them to the local source directory.
    contrasts['z_map'] = contrasts['z_map'].map(
        lambda x: x.replace('/storage/store/data/HCP900', source_dir))
    contrasts.reset_index(inplace=True)
    contrasts['study'] = 'hcp'
    contrasts.set_index(['study', 'subject', 'task', 'contrast', 'direction'],
                        inplace=True)
    contrasts.sort_index(inplace=True)
    return Bunch(rest=rest, contrasts=contrasts, task=task,
                 behavioral=res.behavioral, mask=res.mask, root=res.root)
def fetch_la5c(data_dir=None):
    """Collect LA5c z-maps into a (study, subject, task, contrast,
    direction)-indexed dataframe of file paths.

    Raises ValueError if the GLM source directory is missing.
    """
    data_dir = get_data_dir(data_dir)
    source_dir = join(data_dir, 'la5c', 'ds000030', 'glm')
    if not os.path.exists(source_dir):
        raise ValueError('Please ensure that %s contains all required data.'
                         % source_dir)
    subjects, tasks, contrasts, kept_maps = [], [], [], []
    for z_map in glob.glob(join(source_dir, '*/*/*', 'z_*.nii.gz')):
        head, filename = os.path.split(z_map)
        # Strip the 'z_' prefix and '.nii.gz' suffix.
        contrast = filename[2:-7]
        head, _ = os.path.split(head)
        head, task = os.path.split(head)
        _, subject_dir = os.path.split(head)
        subjects.append(int(subject_dir[-3:]))
        contrasts.append(contrast)
        tasks.append(task)
        kept_maps.append(z_map)
    df = pd.DataFrame(
        data={
            'study': 'la5c',
            'subject': subjects,
            'task': tasks,
            'contrast': contrasts,
            'direction': 'level1',
            'z_map': kept_maps,
        })
    df.set_index(['study', 'subject', 'task', 'contrast', 'direction'],
                 inplace=True)
    df.sort_index(inplace=True)
    return df
def fetch_mask(data_dir=None, url=None, resume=True, verbose=1):
    """Download the HCP grey-matter mask and return its local path."""
    if url is None:
        url = 'http://cogspaces.github.io/assets/data/hcp_mask.nii.gz'
    dataset_dir = _get_dataset_dir('mask', data_dir=get_data_dir(data_dir),
                                   verbose=verbose)
    fetched = _fetch_files(dataset_dir, [('hcp_mask.nii.gz', url, {})],
                           resume=resume, verbose=verbose)
    return fetched[0]
def load_from_directory(dataset, data_dir=None):
    """Load every ``data_<study>.pt`` file found under <data_dir>/<dataset>.

    Returns (Xs, ys): two dicts keyed by study name; ys is post-processed
    by ``add_study_contrast``.
    """
    dataset_dir = join(get_data_dir(data_dir), dataset)
    Xs, ys = {}, {}
    pattern = re.compile(r'data_(.*).pt')
    for filename in os.listdir(dataset_dir):
        match = pattern.match(filename)
        if match is None:
            continue
        study = match.group(1)
        Xs[study], ys[study] = load(join(dataset_dir, filename))
    ys = add_study_contrast(ys)
    return Xs, ys
def baseline():
    """Sacred config: every local defined here becomes a config entry."""
    system = {'device': -1, 'seed': 0, 'verbose': 100}
    data = {'source_dir': join(get_data_dir(), 'reduced_512_icbm_gm'),
            'studies': 'archi'}
    model = {'normalize': True, 'estimator': 'logistic', 'max_iter': 10000}
    logistic = {'l2_penalty': 1e-6}
def get_study_info():
    """Aggregate per-(study, task, contrast) statistics and display names.

    Returns a dataframe with subject/contrast cardinalities, chance
    levels, citation metadata and formatted names (plain and LaTeX)
    for each contrast of each study.
    """
    input_data, targets = load_reduced_loadings(data_dir=get_data_dir())
    targets = pd.concat(targets.values(), axis=0)
    # Cardinalities from which chance levels are derived.
    targets['#subjects'] = targets.groupby(
        by=['study', 'task', 'contrast'])['subject'].transform('nunique')
    targets['#contrasts_per_task'] = targets.groupby(
        by=['study', 'task'])['contrast'].transform('nunique')
    targets['#contrasts_per_study'] = targets.groupby(
        by='study')['contrast'].transform('nunique')
    targets['chance_study'] = 1 / targets['#contrasts_per_study']
    targets['chance_task'] = 1 / targets['#contrasts_per_task']
    citations = _get_citations()
    targets = pd.merge(targets, citations, on='study', how='left')
    # Keep a single row per (study, task, contrast).
    # NOTE(review): assumes an 'index' column exists after the merge --
    # presumably carried by the loadings targets or _get_citations; confirm.
    targets = targets.groupby(
        by=['study', 'task', 'contrast']).first().sort_index().drop(
        columns='index').reset_index()
    targets['study__task'] = targets.apply(
        lambda x: f'{x["study"]}__{x["task"]}', axis='columns')
    targets['name_task'] = targets.apply(
        lambda x: f'[{x["bibkey"]}] {x["task"]}', axis='columns')

    def apply(x):
        # Build one display name per study; an explicit 'comment' overrides
        # the joined task list, which is truncated when too long.
        comment = x['comment'].iloc[0]
        if comment != '':
            tasks = comment
            tasks_lim = comment
        else:
            tasks_list = x['task'].unique()
            tasks = ' & '.join(tasks_list)
            if len(tasks) > 50:
                tasks_lim = tasks_list[0] + ' & ...'
            else:
                tasks_lim = tasks
        name = f'[{x["bibkey"].iloc[0]}] {tasks_lim}'
        # Fix: use raw strings -- '\c' and '\&' are invalid escape
        # sequences (SyntaxWarning since Python 3.12, future error).
        # The resulting strings are unchanged.
        latex_name = rf'\cite{{{x["citekey"].iloc[0]}}} {tasks}'.replace(
            '&', r'\&')
        name = pd.DataFrame(data={
            'name': name,
            'latex_name': latex_name
        }, index=x.index)
        return name

    name = targets.groupby(by='study').apply(apply)
    targets = pd.concat([targets, name], axis=1)
    return targets
def fetch_archi(data_dir=None):
    """Collect Archi z-maps for a whitelist of contrasts into a
    (study, subject, task, contrast, direction)-indexed dataframe.

    Raises ValueError if the GLM source directory is missing.
    """
    # Contrasts retained for the benchmark; all others are dropped.
    KEPT_CONTRASTS = [
        "expression_control", "expression_intention", "expression_sex",
        "face_control", "face_sex", "face_trusty", "audio", "calculaudio",
        "calculvideo", "clicDaudio", "clicDvideo", "clicGaudio",
        "clicGvideo", "computation", "damier_H", "damier_V", "object_grasp",
        "object_orientation", "rotation_hand", "rotation_side", "saccade",
        "motor-cognitive", "false_belief_audio", "false_belief_video",
        "mecanistic_audio", "mecanistic_video", "non_speech", "speech",
        "triangle_intention", "triangle_random"
    ]
    data_dir = get_data_dir(data_dir)
    source_dir = join(data_dir, 'archi', 'glm')
    if not os.path.exists(source_dir):
        raise ValueError('Please ensure that %s contains all required data.'
                         % source_dir)
    subjects, tasks, contrasts, kept_maps = [], [], [], []
    for z_map in glob.glob(join(source_dir, '*/*/*', 'z_*.nii.gz')):
        head, filename = os.path.split(z_map)
        # Strip the 'z_' prefix and '.nii.gz' suffix.
        contrast = filename[2:-7]
        if contrast not in KEPT_CONTRASTS:
            continue
        head, _ = os.path.split(head)
        head, task = os.path.split(head)
        _, subject_dir = os.path.split(head)
        subjects.append(int(subject_dir[-3:]))
        contrasts.append(contrast)
        tasks.append(task)
        kept_maps.append(z_map)
    df = pd.DataFrame(
        data={
            'subject': subjects,
            'task': tasks,
            'contrast': contrasts,
            'direction': 'level1',
            'study': 'archi',
            'z_map': kept_maps,
        })
    df.set_index(['study', 'subject', 'task', 'contrast', 'direction'],
                 inplace=True)
    df.sort_index(inplace=True)
    return df
def fetch_atlas_modl(data_dir=None, url=None, resume=True, verbose=1):
    """Download and load a multi-scale atlas computed using MODL over HCP900.

    Parameters
    ----------
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a non-
        standard location. Default: None (meaning: default)
    url: string, optional
        Download URL of the dataset. Overwrite the default URL.
    resume: bool, optional
        Whether to resume partially downloaded files.
    verbose: int, optional
        Verbosity level forwarded to the fetching helpers.

    Returns
    -------
    Bunch mapping component keys to local file paths, with extra
    'description' and 'data_dir' entries.
    """
    if url is None:
        url = 'http://cogspaces.github.io/assets/data/modl/'
    dataset_dir = _get_dataset_dir('modl', data_dir=get_data_dir(data_dir),
                                   verbose=verbose)
    key_to_file = {
        'components_64': 'components_64.nii.gz',
        'components_128': 'components_128.nii.gz',
        'components_453_gm': 'components_453_gm.nii.gz',
        'loadings_128_gm': 'loadings_128_gm.npy',
    }
    to_fetch = [(filename, url + filename, {})
                for filename in key_to_file.values()]
    fetched = _fetch_files(dataset_dir, to_fetch, resume=resume,
                           verbose=verbose)
    params = dict(zip(key_to_file, fetched))
    params['description'] = ('Components computed using the MODL package,'
                             ' at various scale,from HCP900 data')
    params['data_dir'] = dataset_dir
    return Bunch(**params)
def default():
    """Sacred default config: every local defined here becomes a config
    entry."""
    seed = 0
    system = {'device': -1, 'verbose': 100}
    data = {'source_dir': join(get_data_dir(), 'reduced_512_lstsq'),
            'studies': ['archi', 'hcp']}
    model = {'normalize': True, 'estimator': 'factored', 'max_iter': 100}
    factored = {'shared_embedding_size': 100, 'batch_size': 32,
                'dropout': 0.75, 'lr': 1e-3, 'input_dropout': 0.}
    logistic = {'l2_penalty': 1e-3}
def factored():
    """Sacred config for the factored estimator over all studies."""
    system = {'device': -1, 'seed': 0, 'verbose': 50}
    data = {'source_dir': join(get_data_dir(), 'reduced_512'),
            'studies': 'all'}
    model = {'normalize': True, 'estimator': 'factored',
             'study_weight': 'study', 'max_iter': 500}
    factored = {'shared_embedding_size': 100, 'batch_size': 32,
                'dropout': 0.75, 'lr': 1e-3, 'input_dropout': 0.25}
def fetch_camcan(data_dir=None):
    """Collect CamCAN sensorimotor z-maps into a (study, subject, task,
    contrast, direction)-indexed dataframe.

    Audio/visual-only contrasts are grouped under the 'audio-video' task,
    frequency contrasts under 'AV-freq'.
    Raises ValueError if the source directory is missing.
    """
    data_dir = get_data_dir(data_dir)
    source_dir = join(data_dir, 'camcan', 'camcan_smt_maps')
    if not os.path.exists(source_dir):
        raise ValueError('Please ensure that %s contains all required data.'
                         % source_dir)
    subjects, tasks, contrasts, kept_maps = [], [], [], []
    for z_map in glob.glob(join(source_dir, '*', '*_z_score.nii.gz')):
        head, filename = os.path.split(z_map)
        _, subject_dir = os.path.split(head)
        # Strip the subject prefix and '_z_score.nii.gz' suffix.
        contrast = filename[13:-15]
        if contrast not in ['AudOnly', 'VidOnly', 'AudVid1200', 'AudVid300',
                            'AudVid600']:
            continue
        subjects.append(int(subject_dir[6:]))
        contrasts.append(contrast)
        if contrast in ['AudOnly', 'VidOnly']:
            tasks.append('audio-video')
        else:
            tasks.append('AV-freq')
        kept_maps.append(z_map)
    df = pd.DataFrame(
        data={
            'subject': subjects,
            'task': tasks,
            'contrast': contrasts,
            'direction': 'level1',
            'study': 'camcan',
            'z_map': kept_maps,
        })
    df.set_index(['study', 'subject', 'task', 'contrast', 'direction'],
                 inplace=True)
    df.sort_index(inplace=True)
    return df
def fetch_brainomics(data_dir=None):
    """Collect Brainomics localizer z-maps into a (study, subject, task,
    contrast, direction)-indexed dataframe of file paths.

    Raises ValueError if the source directory is missing.
    """
    data_dir = get_data_dir(data_dir)
    source_dir = join(data_dir, 'brainomics')
    if not os.path.exists(source_dir):
        raise ValueError('Please ensure that %s contains all required data.'
                         % source_dir)
    z_maps = glob.glob(join(source_dir, '*', 'c_*.nii.gz'))
    subjects = []
    contrasts = []
    tasks = []
    filtered_z_maps = []
    # Skip any map whose *full path* contains 'vs' (versus-contrasts).
    regex = re.compile('.*vs.*')
    for z_map in z_maps:
        match = re.match(regex, z_map)
        if match is None:
            dirname, contrast = os.path.split(z_map)
            # NOTE(review): at this point `contrast` is still the raw
            # filename (e.g. 'c_..._.nii.gz'), so it can never equal
            # 'effects_of_interest' literally -- the guard looks intended
            # to run after the slicing below; confirm against the data.
            if contrast != 'effects_of_interest':
                # Strip a 6-char prefix and the '.nii.gz' suffix.
                contrast = contrast[6:-7]
                subject = int(dirname[-2:])
                subjects.append(subject)
                contrasts.append(contrast)
                tasks.append('localizer')
                filtered_z_maps.append(z_map)
    df = pd.DataFrame(
        data={
            'subject': subjects,
            'task': tasks,
            'contrast': contrasts,
            'direction': 'level1',
            'study': 'brainomics',
            'z_map': filtered_z_maps,
        })
    df.set_index(['study', 'subject', 'task', 'contrast', 'direction'],
                 inplace=True)
    df.sort_index(inplace=True)
    return df
def init_fetch_mask() -> str:
    """Return the expected local path of the HCP mask (mask bootstrapping)."""
    data_dir = get_data_dir()
    return join(data_dir, 'mask', 'hcp_mask.nii.gz')
def run_exp(output_dir, config_updates, _id):
    """Boiler plate function that has to be put in every multiple
    experiment script, as exp does not pickle."""
    exp.run_command('print_config', config_updates=config_updates)
    run = exp._create_run(config_updates=config_updates)
    run._id = _id
    run.observers.append(OurFileStorageObserver.create(basedir=output_dir))
    run()


if __name__ == '__main__':
    # Grid-search the logistic l2 penalty independently for every study,
    # dispatching one sacred run per (penalty, study) pair.
    source_dir = join(get_data_dir(), 'reduced_512')
    data, target = load_data_from_dir(data_dir=source_dir)
    studies = list(data.keys())
    config_updates = ParameterGrid({
        'logistic.l2_penalty': np.logspace(-4, -1, 20),
        'data.studies': studies,
    })
    output_dir = join(get_output_dir(), 'baseline_logistic_icbm_gm')
    first_id = get_id(output_dir)
    Parallel(n_jobs=40, verbose=100)(
        delayed(run_exp)(output_dir, update, _id=first_id + offset)
        for offset, update in enumerate(config_updates))
# NOTE(review): this chunk begins mid-scope -- `mask`, `dictionary`,
# `studies`, `masked_dir`, `output_dir`, `batch_size`, `lstsq` and the
# first `n_jobs` are defined above this view (this looks like the tail of
# a reduce function); confirm against the full file before refactoring.
masker = NiftiMasker(mask_img=mask).fit()
components = masker.transform(dictionary)
for study in studies:
    this_data, targets = load(join(masked_dir, 'data_%s.pt' % study))
    n_samples = this_data.shape[0]
    batches = list(gen_batches(n_samples, batch_size))
    # Project each batch of masked maps onto the dictionary components in
    # parallel; mmap_mode='r' avoids copying the arrays per worker.
    this_data = Parallel(n_jobs=n_jobs, verbose=10,
                         backend='multiprocessing', mmap_mode='r')(
        delayed(single_reduce)(components, this_data[batch], lstsq=lstsq)
        for batch in batches)
    this_data = np.concatenate(this_data, axis=0)
    dump((this_data, targets), join(output_dir, 'data_%s.pt' % study))

# Script-level driver: mask all study contrasts, then reduce them onto the
# 453-component grey-matter dictionary.
n_jobs = 65
mask_contrasts(studies='all', use_raw=True,
               output_dir=join(get_data_dir(), 'loadings'),
               n_jobs=n_jobs)
reduce_contrasts(studies='all',
                 masked_dir=join(get_data_dir(), 'masked'),
                 output_dir=join(get_data_dir(), 'loadings'),
                 components='components_453_gm',
                 n_jobs=n_jobs,
                 lstsq=False)