def fetch_openfmri(data_dir, dataset_id, force_download=False, verbose=1):
    files = {
        'ds001': ['ds001_raw'], 'ds002': ['ds002_raw'], 'ds003': ['ds003_raw'],
        'ds005': ['ds005_raw'], 'ds006A': ['ds006A_raw'], 'ds007': ['ds007_raw'],
        'ds008': ['ds008_raw'], 'ds011': ['ds011_raw'], 'ds017A': ['ds017A_raw'],
        'ds017B': ['ds017B_raw'], 'ds051': ['ds051_raw'], 'ds052': ['ds052_raw'],
        'ds101': ['ds101_raw'], 'ds102': ['ds102_raw'], 'ds105': ['ds105_raw'],
        'ds107': ['ds107_raw'],
        'ds108': ['ds108_raw_part1', 'ds108_raw_part2', 'ds108_raw_part3'],
        'ds109': ['ds109_raw'],
        'ds110': ['ds110_raw_part1', 'ds110_raw_part2', 'ds110_raw_part3',
                  'ds110_raw_part4', 'ds110_raw_part5', 'ds110_raw_part6'],
    }
    if dataset_id not in files:
        raise ValueError('Unknown dataset %s' % dataset_id)

    base_url = 'http://openfmri.s3.amazonaws.com/tarballs/%s.tgz'
    urls = [(dataset_id, base_url % f, {'uncompress': True})
            for f in files[dataset_id]]
    output_dir = os.path.join(data_dir, dataset_id)
    # Download if the dataset is not already on disk, or if the caller
    # explicitly asks to re-download it.
    if force_download or not os.path.exists(output_dir):
        _fetch_files(data_dir, urls, verbose=verbose)
    return output_dir
def fetch_hcp_standards(data_dir=None, url=None, resume=True, verbose=1):
    """ Fetches HCP standard mesh atlases for converting between FreeSurfer and HCP

    Parameters
    ----------
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    standards : str
        Filepath to standard_mesh_atlases directory
    """
    if url is None:
        url = 'http://brainvis.wustl.edu/workbench/standard_mesh_atlases.zip'
    dataset_name = 'standard_mesh_atlases'
    data_dir = _get_data_dir(data_dir=data_dir)
    opts = {'uncompress': True, 'move': '{}.zip'.format(dataset_name)}
    filenames = ['L.sphere.32k_fs_LR.surf.gii', 'R.sphere.32k_fs_LR.surf.gii']
    files = [(op.join(dataset_name, f), url, opts) for f in filenames]
    _fetch_files(data_dir, files=files, resume=resume, verbose=verbose)

    return op.join(data_dir, dataset_name)
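# Usage sketch (illustrative, not part of the original module): assumes nibabel
# is installed and that fetch_hcp_standards above has completed; the two
# filenames are the fs_LR spheres listed in that function.
def _example_load_hcp_spheres():
    """Load the left/right fs_LR spheres from the fetched standards folder."""
    import os.path as op
    import nibabel as nib
    standards = fetch_hcp_standards()
    lh = nib.load(op.join(standards, 'L.sphere.32k_fs_LR.surf.gii'))
    rh = nib.load(op.join(standards, 'R.sphere.32k_fs_LR.surf.gii'))
    return lh, rh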
def fetch_fsl_feeds(data_dir=None, data_name="fsl_feeds", verbose=1):
    """Function to fetch FSL FEEDS dataset (single-subject)

    Parameters
    ----------
    data_dir: string
        path of the data directory. Used to force data storage in a
        specified location. If the data is already present there, it will
        simply be globbed.

    Returns
    -------
    data: sklearn.utils.Bunch
        Dictionary-like object, the interest attributes are:
        - 'func': string list. Paths to functional images
        - 'anat': string list. Path to anat image
    """
    data_dir = _get_dataset_dir(data_name, data_dir=data_dir, verbose=verbose)

    def _glob_fsl_feeds_data(subject_dir):
        """Glob data from subject_dir."""
        if not os.path.exists(subject_dir):
            return None
        for file_name in FSL_FEEDS_DATA_FILES:
            file_path = os.path.join(subject_dir, file_name)
            if os.path.exists(file_path) or os.path.exists(
                    file_path.rstrip(".gz")):
                file_name = re.sub(r"(?:\.nii\.gz|\.txt)", "", file_name)
            else:
                if not os.path.basename(subject_dir) == 'data':
                    return _glob_fsl_feeds_data(
                        os.path.join(subject_dir, 'feeds/data'))
                else:
                    print("%s missing from filelist!" % file_name)
                    return None
        return Bunch(
            data_dir=data_dir,
            func=os.path.join(subject_dir, "fmri.nii.gz"),
            anat=os.path.join(subject_dir, "structural_brain.nii.gz"))

    # Maybe data_dir already contains the data?
    data = _glob_fsl_feeds_data(data_dir)
    if data is not None:
        return data

    # Download the data.
    print("Data absent, downloading...")
    url = ("http://fsl.fmrib.ox.ac.uk/fsldownloads/oldversions/"
           "fsl-4.1.0-feeds.tar.gz")
    _fetch_files(data_dir, [("feeds", url, {"uncompress": True,
                                            "move": "fsl.tar"})])
    return _glob_fsl_feeds_data(data_dir)
def fetch_rat_waxholm(template_dir="~/.samri_files/templates/rat/waxholm/",
                      verbose=1):
    """Download and load the Waxholm atlas for the Sprague Dawley rat.

    Returns
    -------
    dict
        Dictionary containing template, atlas, labels:
        template - MRI anatomy file; NIfTI file
        atlas - pixel values of regions are grouped together, with
            corresponding labels in the labels file; NIfTI file
        labels - labels annotating brain regions for pixel groups in the
            atlas; CSV file

    Notes
    -----
    Please deprecate this function. We should distribute dependencies such as
    atlases in the form of unambiguously managed packages, and not data
    fetched from the web at runtime.

    References
    ----------
    .. [1] 'Papp, Eszter A., et al. "Waxholm Space atlas of the Sprague Dawley
       rat brain." NeuroImage 97 (2014): 374-386.'
    .. [2] https://www.nitrc.org/projects/whs-sd-atlas
    """
    from nilearn.datasets.utils import _fetch_files

    template_dir = path.abspath(path.expanduser(template_dir))

    # Get template
    url_template = 'https://www.nitrc.org/frs/download.php/9423/WHS_SD_rat_T2star_v1.01.nii.gz'
    template = _fetch_files(
        template_dir,
        [('WHS_SD_rat_T2star_v1.01.nii.gz', url_template, {})],
        verbose=verbose)[0]

    # Get atlas
    url_atlas = 'https://www.nitrc.org/frs/download.php/9438/WHS_SD_rat_atlas_v2.nii.gz'
    atlas = _fetch_files(
        template_dir,
        [('WHS_SD_rat_atlas_v2.nii.gz', url_atlas, {})],
        verbose=verbose)[0]

    # Get labels
    url_labels = 'https://www.nitrc.org/frs/download.php/9439/WHS_SD_rat_atlas_v2.label'
    labels = _fetch_files(
        template_dir,
        [('WHS_SD_rat_atlas_v2.label', url_labels, {})],
        verbose=verbose)[0]

    # Resample template and atlas to 200 micron isotropic resolution.
    commands = [
        "ResampleImage 3 WHS_SD_rat_T2star_v1.01.nii.gz _200micron_WHS_SD_rat_T2star_v1.01.nii.gz 0.2x0.2x0.2 size=1 spacing=0 4",
        "SmoothImage 3 _200micron_WHS_SD_rat_T2star_v1.01.nii.gz 0.4 200micron_WHS_SD_rat_T2star_v1.01.nii.gz",
        "rm _200micron_WHS_SD_rat_T2star_v1.01.nii.gz",
        "ResampleImage 3 WHS_SD_rat_atlas_v2.nii.gz _200micron_WHS_SD_rat_atlas_v2.nii.gz 0.2x0.2x0.2 size=1 spacing=0 4",
        "SmoothImage 3 _200micron_WHS_SD_rat_atlas_v2.nii.gz 0.4 200micron_WHS_SD_rat_atlas_v2.nii.gz",
        "rm _200micron_WHS_SD_rat_atlas_v2.nii.gz",
    ]
    for command in commands:
        p = subprocess.Popen(command.split(), cwd=template_dir,
                             stdout=subprocess.PIPE)
        p.wait()

    return {
        "template": path.join(
            template_dir, "200micron_WHS_SD_rat_T2star_v1.01.nii.gz"),
        "atlas": path.join(
            template_dir, "200micron_WHS_SD_rat_atlas_v2.nii.gz"),
        "labels": labels,
    }
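# Usage sketch (illustrative, not part of the original module): assumes nibabel
# is installed and that the ANTs-based resampling in fetch_rat_waxholm above
# succeeded, so the 200 micron files exist on disk.
def _example_load_waxholm():
    """Load the fetched Waxholm template and atlas as nibabel images."""
    import nibabel as nib
    data = fetch_rat_waxholm()
    template_img = nib.load(data["template"])
    atlas_img = nib.load(data["atlas"])
    return template_img, atlas_img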
def _fetch_hbnssi_participants(data_dir, url, verbose):
    """Helper function to fetch_hbnssi.

    This function helps in downloading and loading participants data from a
    .csv file uploaded on the Open Science Framework (OSF).

    Parameters
    ----------
    data_dir: str
        Path of the data directory. Used to force data storage in a specified
        location. If None is given, data are stored in home directory.
    url: str, optional
        Override download URL. Used for test only (or if you set up a mirror
        of the data). Default: None
    verbose: int
        Defines the level of verbosity of the output.

    Returns
    -------
    participants: numpy.ndarray
        Contains data of each subject: age, gender, handedness.
    """
    if url is None:
        url = 'https://osf.io/wtvh3/download'

    files = [('participants.csv', url, {'move': 'participants.csv'})]
    path_to_participants = _fetch_files(data_dir, files, verbose=verbose)[0]

    # Load path to participants
    dtype = [('sid', 'U12'), ('age', '<f8'), ('Gender', 'U4'),
             ('Handedness', 'U4')]
    names = ['sid', 'age', 'gender', 'handedness']
    participants = csv_to_array(path_to_participants, skip_header=True,
                                dtype=dtype, names=names)
    return participants
def _fetch_hbnssi_brain_mask(data_dir, url, verbose):
    """Helper function to fetch_hbnssi.

    This function helps in downloading and loading the brain mask from the
    Open Science Framework (OSF).

    Parameters
    ----------
    data_dir: str
        Path of the data directory. Used to force data storage in a specified
        location. If None is given, data are stored in home directory.
    url: str, optional
        Override download URL. Used for test only (or if you set up a mirror
        of the data). Default: None
    verbose: int
        Defines the level of verbosity of the output.

    Returns
    -------
    path_to_mask: str
        File path for the appropriate brain mask
    """
    if url is None:
        url = 'https://osf.io/kp6m9/download'

    target_fname = ('tpl-MNI152NLin2009cAsym_res-3mm_label-GM_'
                    'desc-thr02_probseg.nii.gz')
    files = [(target_fname, url, {'move': target_fname})]
    path_to_mask = _fetch_files(data_dir, files, verbose=verbose)[0]
    return path_to_mask
def _fetch_ibc_surf_masks(data_dir, url, resume, verbose): """Helper function to fetch_ibc. This function helps in downloading brain and ROI masks for use with IBC functional alignment and inter-subject decoding. The files are downloaded from Open Science Framework (OSF). Parameters ---------- data_dir: str Path of the data directory. Used to force data storage in a specified location. If None is given, data are stored in home directory. url: str, optional Override download URL. Used for test only (or if you setup a mirror of the data). Default: None resume: bool, optional (default True) Whether to resume download of a partly-downloaded file. verbose: int Defines the level of verbosity of the output. Returns ------- derivatives_dir: str Path on disk to the IBC masks data directory. """ if url is None: # Download from the relevant OSF project, using hashes generated # from the OSF API. Note the trailing slash. For more info, see: # https://gist.github.com/emdupre/3cb4d564511d495ea6bf89c6a577da74 url = 'https://osf.io/download/{}/' # The gzip contains unique download keys per Nifti file and CSV # pre-extracted from OSF. Required for downloading files. package_directory = os.path.dirname(os.path.abspath(__file__)) dtype = [('filename', 'U52'), ('uid', 'U24')] names = ['filename', 'uid'] # csv file contains download information osf_data = csv_to_array(os.path.join(package_directory, "ibc_surf_masks.csv"), skip_header=True, dtype=dtype, names=names) derivatives_dir = Path(data_dir, 'masks') masks = [] for this_osf_id in osf_data: # Download mask mask_url = url.format(this_osf_id['uid']) mask_target = Path(derivatives_dir, this_osf_id['filename']) mask_file = [(mask_target, mask_url, {'move': mask_target})] path_to_mask = _fetch_files(data_dir, mask_file, verbose=verbose)[0] masks.append(path_to_mask) return derivatives_dir
def fetch_bids_langloc_dataset(data_dir=None, verbose=1):
    """Download language localizer example bids dataset.

    Parameters
    ----------
    data_dir: string, optional
        Path to store the downloaded dataset. If None, the default nilearn
        datasets download directory is used.
    verbose: int, optional
        verbosity level (0 means no message).

    Returns
    -------
    data_dir: string
        Path to downloaded dataset
    downloaded_files: list of string
        Absolute paths of downloaded files on disk
    """
    url = ('https://files.osf.io/v1/resources/9q7dv/providers/osfstorage/'
           '5888d9a76c613b01fc6acc4e')
    dataset_name = 'bids_langloc_example'
    main_folder = 'bids_langloc_dataset'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    # The files_spec needed for _fetch_files
    files_spec = [(main_folder + '.zip', url, {'move': main_folder + '.zip'})]
    if not os.path.exists(os.path.join(data_dir, main_folder)):
        downloaded_files = _fetch_files(data_dir, files_spec, resume=True,
                                        verbose=verbose)
        _uncompress_file(downloaded_files[0])
    main_path = os.path.join(data_dir, main_folder)
    file_list = [os.path.join(path, f)
                 for path, dirs, files in os.walk(main_path)
                 for f in files]
    return os.path.join(data_dir, main_folder), sorted(file_list)
def fetch_craddock_adhd_200_parcellations(data_dir=None, verbose=1):
    """These are the parcellations from the Athena Pipeline of the ADHD 200
    preprocessing initiative. 200 and 400 ROI atlases were generated using
    2-level parcellation of 650 individuals from the ADHD 200 Sample.

    Parameters
    ----------
    data_dir : str
        Directory where the data should be downloaded.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object, keys are: parcellations_200, parcellations_400
    """
    url = ('http://www.nitrc.org/frs/download.php/5906/'
           'ADHD200_parcellations.tar.gz')
    opts = {'uncompress': True}
    dataset_name = 'craddock_ADHD200_parcellations'
    filenames = [("ADHD200_parcellate_200.nii.gz", url, opts),
                 ("ADHD200_parcellate_400.nii.gz", url, opts)]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files = _fetch_files(data_dir, filenames, verbose=verbose)
    keys = ("parcellations_200", "parcellations_400")
    params = dict(list(zip(keys, files)))
    return Bunch(**params)
def fetch_mist():
    """Download the MIST parcellation (n=122).

    https://mniopenresearch.org/articles/1-3

    Returns
    -------
    maps : str
        Path to MIST parcellation
    labels : list of str
        Anatomical labels assigned to each parcel
    """
    url = 'https://ndownloader.figshare.com/files/9811081'
    opts = {'uncompress': True}

    data_dir = _get_dataset_dir('mist', data_dir=None, verbose=1)
    files = [(join('Release', 'Parcel_Information', 'MIST_122.csv'),
              url, opts),
             (join('Release', 'Parcellations', 'MIST_122.nii.gz'),
              url, opts)]
    files = _fetch_files(data_dir, files, resume=True, verbose=1)

    parcel_info = pd.read_csv(files[0], sep=';')
    names = parcel_info['name']
    # Prepend a 'Background' entry for label value 0.
    labels = pd.DataFrame(['Background'] + names.tolist(), columns=['name'])

    return Bunch(maps=files[1], labels=labels)
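# Usage sketch (illustrative, not part of the original module): assumes nilearn
# is installed; `func_img` is any 4D functional image supplied by the caller.
# The import path is nilearn.maskers in recent releases (nilearn.input_data in
# older ones).
def _example_extract_mist_timeseries(func_img):
    """Extract per-parcel time series using the MIST 122 parcellation."""
    from nilearn.maskers import NiftiLabelsMasker
    mist = fetch_mist()
    masker = NiftiLabelsMasker(labels_img=mist.maps, standardize=True)
    return masker.fit_transform(func_img)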
def fetch_localizer_first_level(data_dir=None, verbose=1):
    """ Download a first-level localizer fMRI dataset

    Parameters
    ----------
    data_dir: string
        directory where data should be downloaded and unpacked.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, keys are:
        epi_img: the input 4D image
        paradigm: a csv file describing the paradigm
    """
    url = 'ftp://ftp.cea.fr/pub/dsv/madic/download/nipy'

    dataset_name = "localizer_first_level"
    files = dict(epi_img="s12069_swaloc1_corr.nii.gz",
                 paradigm="localizer_paradigm.csv")
    # The options needed for _fetch_files
    options = [(filename, os.path.join(url, filename), {})
               for _, filename in sorted(files.items())]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    sub_files = _fetch_files(data_dir, options, resume=True, verbose=verbose)

    params = dict(zip(sorted(files.keys()), sub_files))
    return Bunch(**params)
def fetch_localizer_first_level(data_dir=None, verbose=1):
    """ Download a first-level localizer fMRI dataset

    Parameters
    ----------
    data_dir: string
        directory where data should be downloaded and unpacked.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, with the keys:
        epi_img: the input 4D image
        events: a tsv file describing the paradigm
    """
    url = 'https://osf.io/2bqxn/download'
    epi_img = 'sub-12069_task-localizer_space-MNI305.nii.gz'
    events = 'sub-12069_task-localizer_events.tsv'
    opts = {'uncompress': True}
    options = ('epi_img', 'events')
    dir_ = 'localizer_first_level'
    filenames = [(os.path.join(dir_, name), url, opts)
                 for name in [epi_img, events]]

    dataset_name = 'localizer_first_level'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files = _fetch_files(data_dir, filenames, verbose=verbose)

    params = dict(list(zip(options, files)))
    return Bunch(**params)
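# Usage sketch (illustrative, not part of the original module): assumes nibabel
# and pandas are installed; the events file is a BIDS-style .tsv, so it is
# read with a tab separator.
def _example_load_localizer():
    """Load the localizer EPI image and its events table."""
    import nibabel as nib
    import pandas as pd
    data = fetch_localizer_first_level()
    bold = nib.load(data.epi_img)
    events = pd.read_csv(data.events, sep='\t')
    return bold, events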
def fetch_reduced_loadings(data_dir=None, url=None, verbose=False,
                           resume=True):
    if url is None:
        url = 'http://cogspaces.github.io/assets/data/loadings/'
    data_dir = get_data_dir(data_dir)
    dataset_name = 'loadings'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)

    keys = STUDY_LIST
    paths = ['data_%s.pt' % key for key in keys]
    urls = [url + path for path in paths]
    files = [(path, url, {}) for path, url in zip(paths, urls)]
    files = _fetch_files(data_dir, files, resume=resume, verbose=verbose)
    params = {key: file for key, file in zip(keys, files)}

    fdescr = (
        "Z-statistic loadings over a dictionary of 453 components covering "
        "grey-matter `modl_atlas['components_512_gm']` "
        "for 35 different task fMRI studies.")

    params['description'] = fdescr
    params['data_dir'] = data_dir

    return params
def fetch_craddock_parcellation(data_dir=None, url=None, resume=True,
                                verbose=1):
    """Download and load the Craddock parcellation.

    Parameters
    ----------
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a non-
        standard location. Default: None (meaning: default)

    url: string, optional
        Download URL of the dataset. Overwrite the default URL.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, contains:
        - 200-components parcellation (parcellate200)
        - 400-components parcellation (parcellate400)

    References
    ----------
    ?
    """
    if url is None:
        url = 'http://www.amensch.fr/data/craddock_parcellation/'

    files = [
        'ADHD200_parcellate_200.nii.gz',
        'ADHD200_parcellate_400.nii.gz',
    ]

    if isinstance(url, str):
        url = [url] * len(files)

    files = [(f, u + f, {}) for f, u in zip(files, url)]

    dataset_name = 'craddock_parcellation'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files_ = _fetch_files(data_dir, files, resume=resume, verbose=verbose)

    fdescr = 'Components from Craddock clustering atlas'
    keys = ['parcellate200', 'parcellate400']
    params = dict(zip(keys, files_))
    params['description'] = fdescr
    return Bunch(**params)
def fetch_voneconomo(data_dir=None, url=None, resume=True, verbose=1):
    """ Fetches von-Economo Koskinas probabilistic FreeSurfer atlas

    Parameters
    ----------
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    filenames : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['gcs', 'ctab', 'info'], where the
        'gcs' and 'ctab' values hold the left/right hemisphere annotation
        files and 'info' is the accompanying CSV file

    References
    ----------
    Scholtens, L. H., de Reus, M. A., de Lange, S. C., Schmidt, R., & van den
    Heuvel, M. P. (2018). An MRI von Economo–Koskinas atlas. NeuroImage, 170,
    249-256.

    Notes
    -----
    License: CC-BY-NC-SA 4.0
    """
    dataset_name = 'atl-voneconomo_koskinas'
    keys = ['gcs', 'ctab', 'info']

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)
    if url is None:
        url = info['url']
    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }
    filenames = [
        'atl-vonEconomoKoskinas_hemi-{}_probabilistic.{}'.format(hemi, suff)
        for hemi in ['L', 'R'] for suff in ['gcs', 'ctab']
    ] + ['atl-vonEconomoKoskinas_info.csv']

    files = [(op.join(dataset_name, f), url, opts) for f in filenames]
    data = _fetch_files(data_dir, files=files, resume=resume, verbose=verbose)
    data = [ANNOT(*data[:-1:2])] + [ANNOT(*data[1:-1:2])] + [data[-1]]

    return Bunch(**dict(zip(keys, data)))
def fetch_fsaverage(data_dir=None, url=None, resume=True, verbose=1):
    """ Downloads files for fsaverage FreeSurfer template

    Parameters
    ----------
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    filenames : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['orig', 'white', 'smoothwm',
        'pial', 'inflated', 'sphere'], where corresponding values are
        length-2 lists of downloaded template files (each list composed of
        files for the left and right hemisphere).

    References
    ----------
    """
    dataset_name = 'tpl-fsaverage'
    keys = ['orig', 'white', 'smoothwm', 'pial', 'inflated', 'sphere']

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)
    if url is None:
        url = info['url']
    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }
    filenames = [
        'fsaverage/surf/{}.{}'.format(hemi, surf)
        for surf in keys for hemi in ['lh', 'rh']
    ]

    data = _fetch_files(data_dir, files=[(f, url, opts) for f in filenames],
                        resume=resume, verbose=verbose)
    data = [data[i:i + 2] for i in range(0, len(keys) * 2, 2)]

    return Bunch(**dict(zip(keys, data)))
def fetch_vazquez_rodriguez2019(data_dir=None, url=None, resume=True,
                                verbose=1):
    """ Downloads files from Vazquez-Rodriguez et al., 2019, PNAS

    Parameters
    ----------
    data_dir : str, optional
        Path to use as data directory. If not specified, will check for
        environmental variable 'NNT_DATA'; if that is not set, will use
        `~/nnt-data` instead. Default: None
    url : str, optional
        URL from which to download data. Default: None
    resume : bool, optional
        Whether to attempt to resume partial download, if possible.
        Default: True
    verbose : int, optional
        Modifies verbosity of download, where higher numbers mean more
        updates. Default: 1

    Returns
    -------
    data : :class:`sklearn.utils.Bunch`
        Dictionary-like object with keys ['rsquared', 'gradient'], where
        corresponding values are arrays loaded from the downloaded CSV file

    References
    ----------
    See `ref` key of returned dictionary object for relevant dataset
    reference
    """
    dataset_name = 'ds-vazquez_rodriguez2019'

    data_dir = _get_data_dir(data_dir=data_dir)
    info = _get_dataset_info(dataset_name)
    if url is None:
        url = info['url']
    opts = {
        'uncompress': True,
        'md5sum': info['md5'],
        'move': '{}.tar.gz'.format(dataset_name)
    }

    filenames = [op.join(dataset_name, 'rsquared_gradient.csv')]
    data = _fetch_files(data_dir, files=[(f, url, opts) for f in filenames],
                        resume=resume, verbose=verbose)

    # load data
    rsq, grad = np.loadtxt(data[0], delimiter=',', skiprows=1).T

    return Bunch(rsquared=rsq, gradient=grad)
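# Usage sketch (illustrative, not part of the original module): assumes scipy
# is installed; it simply relates the two arrays returned by the fetcher
# above, without any claim about the analysis used in the original study.
def _example_rsq_gradient_correlation():
    """Correlate the per-parcel R-squared values with the gradient values."""
    from scipy.stats import pearsonr
    data = fetch_vazquez_rodriguez2019()
    r, p = pearsonr(data.rsquared, data.gradient)
    return r, p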
def fetch_mask(data_dir=None, url=None, resume=True, verbose=1):
    if url is None:
        url = 'http://cogspaces.github.io/assets/data/hcp_mask.nii.gz'
    files = [('hcp_mask.nii.gz', url, {})]

    dataset_name = 'mask'
    data_dir = get_data_dir(data_dir)
    dataset_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                   verbose=verbose)
    files = _fetch_files(dataset_dir, files, resume=resume,
                         verbose=verbose)
    return files[0]
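# Usage sketch (illustrative, not part of the original module): assumes nilearn
# is installed; `func_img` is any image in the same space as the HCP mask
# fetched by the fetch_mask variant above. The import path is nilearn.maskers
# in recent releases (nilearn.input_data in older ones).
def _example_apply_hcp_mask(func_img):
    """Mask an image with the downloaded HCP mask and return a 2D array."""
    from nilearn.maskers import NiftiMasker
    masker = NiftiMasker(mask_img=fetch_mask())
    return masker.fit_transform(func_img)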
def fetch_rat_waxholm(template_dir="~/.samri_files/templates/rat/waxholm/",
                      verbose=1):
    """Download and load the Waxholm atlas for the Sprague Dawley rat.

    Returns
    -------
    dict
        Dictionary containing template, atlas, labels:
        template - MRI anatomy file; NIfTI file
        atlas - pixel values of regions are grouped together, with
            corresponding labels in the labels file; NIfTI file
        labels - labels annotating brain regions for pixel groups in the
            atlas; CSV file

    References
    ----------
    .. [1] 'Papp, Eszter A., et al. "Waxholm Space atlas of the Sprague Dawley
       rat brain." NeuroImage 97 (2014): 374-386.'
    .. [2] https://www.nitrc.org/projects/whs-sd-atlas
    """
    template_dir = path.abspath(path.expanduser(template_dir))

    # Get template
    url_template = 'https://www.nitrc.org/frs/download.php/9423/WHS_SD_rat_T2star_v1.01.nii.gz'
    template = _fetch_files(
        template_dir,
        [('WHS_SD_rat_T2star_v1.01.nii.gz', url_template, {})],
        verbose=verbose)[0]

    # Get atlas
    url_atlas = 'https://www.nitrc.org/frs/download.php/9438/WHS_SD_rat_atlas_v2.nii.gz'
    atlas = _fetch_files(
        template_dir,
        [('WHS_SD_rat_atlas_v2.nii.gz', url_atlas, {})],
        verbose=verbose)[0]

    # Get labels
    url_labels = 'https://www.nitrc.org/frs/download.php/9439/WHS_SD_rat_atlas_v2.label'
    labels = _fetch_files(
        template_dir,
        [('WHS_SD_rat_atlas_v2.label', url_labels, {})],
        verbose=verbose)[0]

    return {"template": template, "atlas": atlas, "labels": labels}
def fetch_pauli2018(data_dir=None, url=None, resume=True, verbose=1): """ Downloads files for Pauli et al., 2018 subcortical parcellation Parameters ---------- data_dir : str, optional Path to use as data directory. If not specified, will check for environmental variable 'NNT_DATA'; if that is not set, will use `~/nnt-data` instead. Default: None url : str, optional URL from which to download data. Default: None resume : bool, optional Whether to attempt to resume partial download, if possible. Default: True verbose : int, optional Modifies verbosity of download, where higher numbers mean more updates. Default: 1 Returns ------- filenames : :class:`sklearn.utils.Bunch` Dictionary-like object with keys ['probabilistic', 'deterministic'], where corresponding values are filepaths to downloaded atlas files. References ---------- Pauli, W. M., Nili, A. N., & Tyszka, J. M. (2018). A high-resolution probabilistic in vivo atlas of human subcortical brain nuclei. Scientific Data, 5, 180063. Notes ----- License: CC-BY Attribution 4.0 International """ dataset_name = 'atl-pauli2018' keys = ['probabilistic', 'deterministic', 'info'] data_dir = _get_data_dir(data_dir=data_dir) info = _get_dataset_info(dataset_name) # format the query how _fetch_files() wants things and then download data files = [ (i['name'], i['url'], dict(md5sum=i['md5'], move=i['name'])) for i in info ] data = _fetch_files(data_dir, files=files, resume=resume, verbose=verbose) return Bunch(**dict(zip(keys, data)))
def fetch_openfmri(data_dir, dataset_id, force_download=False, verbose=1):
    files = {
        'ds001': ['ds001_raw_6'], 'ds002': ['ds002_raw'],
        'ds003': ['ds003_raw_1'], 'ds005': ['ds005_raw_0'],
        'ds006A': ['ds006A_raw'], 'ds007': ['ds007_raw'],
        'ds008': ['ds008_raw_4'], 'ds011': ['ds011_raw_0'],
        'ds017A': ['ds017A_raw_0'], 'ds017B': ['ds017B_raw_0'],
        'ds051': ['ds051_raw_0'], 'ds052': ['ds052_raw_0'],
        'ds101': ['ds101_raw_0'], 'ds102': ['ds102_raw_0'],
        'ds105': ['ds105_raw_6'], 'ds107': ['ds107_raw_0'],
        'ds108': ['ds108_raw_part1', 'ds108_raw_part2', 'ds108_raw_part3'],
        'ds109': ['ds109_raw_4'],
        'ds110': ['ds110_raw_part1', 'ds110_raw_part2', 'ds110_raw_part3',
                  'ds110_raw_part4', 'ds110_raw_part5', 'ds110_raw_part6'],
    }
    if dataset_id not in files:
        raise ValueError('Unknown dataset %s' % dataset_id)

    base_url = 'https://openfmri.org/system/files/%s.tgz'
    urls = [base_url % f for f in files[dataset_id]]
    temp_dir = os.path.join(data_dir, '_%s' % dataset_id, dataset_id)
    output_dir = os.path.join(data_dir, dataset_id)
    # Download into a temporary folder, then move the extracted dataset into
    # place and clean up.
    if force_download or not os.path.exists(output_dir):
        _fetch_files('_%s' % dataset_id, urls, data_dir, verbose=verbose)
        shutil.move(temp_dir, output_dir)
        shutil.rmtree(os.path.split(temp_dir)[0])
    return output_dir
def fetch_openneuro_dataset_index(data_dir=None, dataset_version='ds000030_R1.0.4', verbose=1): """ Download a file with OpenNeuro BIDS dataset index. Downloading the index allows to explore the dataset directories to select specific files to download. The index is a sorted list of urls. Parameters ---------- data_dir: string, optional Path to store the downloaded dataset. if None employ nilearn datasets default download directory. dataset_version: string, optional dataset version name. Assumes it is of the form [name]_[version]. verbose: int, optional verbosity level (0 means no message). Returns ------- urls_path: string Path to downloaded dataset index urls: list of string Sorted list of dataset directories """ data_prefix = '{}/{}/uncompressed'.format( dataset_version.split('_')[0], dataset_version, ) data_dir = _get_dataset_dir(data_prefix, data_dir=data_dir, verbose=verbose) file_url = 'https://osf.io/86xj7/download' final_download_path = os.path.join(data_dir, 'urls.json') downloaded_file_path = _fetch_files(data_dir=data_dir, files=[(final_download_path, file_url, { 'move': final_download_path })], resume=True) urls_path = downloaded_file_path[0] with open(urls_path, 'r') as json_file: urls = json.load(json_file) return urls_path, urls
def fetch_atlas_modl(data_dir=None, url=None, resume=True, verbose=1):
    """Download and load a multi-scale atlas computed using MODL over HCP900.

    Parameters
    ----------
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a non-
        standard location. Default: None (meaning: default)
    url: string, optional
        Download URL of the dataset. Overwrite the default URL.
    """
    if url is None:
        url = 'http://cogspaces.github.io/assets/data/modl/'

    data_dir = get_data_dir(data_dir)
    dataset_name = 'modl'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)

    keys = ['components_64',
            'components_128',
            'components_453_gm',
            'loadings_128_gm']

    paths = ['components_64.nii.gz',
             'components_128.nii.gz',
             'components_453_gm.nii.gz',
             'loadings_128_gm.npy']

    urls = [url + path for path in paths]
    files = [(path, url, {}) for path, url in zip(paths, urls)]

    files = _fetch_files(data_dir, files, resume=resume, verbose=verbose)

    params = {key: file for key, file in zip(keys, files)}

    fdescr = ('Components computed using the MODL package, at various '
              'scales, from HCP900 data.')

    params['description'] = fdescr
    params['data_dir'] = data_dir

    return Bunch(**params)
def fetch_atlas_modl(data_dir=None, url=None, resume=True, verbose=1):
    """Download and load a multi-scale atlas computed using MODL over HCP900.

    Parameters
    ----------
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a non-
        standard location. Default: None (meaning: default)
    url: string, optional
        Download URL of the dataset. Overwrite the default URL.
    """
    if url is None:
        url = 'http://www.amensch.fr/data/cogspaces/modl/'

    files = [
        'components_16.nii.gz',
        'components_64.nii.gz',
        'components_128.nii.gz',
        'components_512.nii.gz',
    ]

    if isinstance(url, str):
        url = [url] * len(files)

    files = [(f, u + f, {}) for f, u in zip(files, url)]

    dataset_name = 'modl'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files_ = _fetch_files(data_dir, files, resume=resume, verbose=verbose)

    fdescr = ('Components computed using the MODL package, at various '
              'scales, from HCP900 data.')

    keys = [
        'components16',
        'components64',
        'components128',
        'components512',
    ]
    params = dict(zip(keys, files_))
    params['description'] = fdescr
    return Bunch(**params)
def fetch_mask(data_dir=None, url=None, resume=True, verbose=1):
    if url is None:
        url = 'http://www.amensch.fr/data/mask/'
    files = ['mask_img.nii.gz']

    if isinstance(url, str):
        url = [url] * len(files)

    files = [(f, u + f, {}) for f, u in zip(files, url)]

    dataset_name = 'mask'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files_ = _fetch_files(data_dir, files, resume=resume, verbose=verbose)
    return files_[0]
def fetch_mask(data_dir=None, url=None, resume=True, verbose=1):
    if url is None:
        url = 'http://www.amensch.fr/data/cogspaces/mask/'
    files = ['hcp_mask.nii.gz', 'icbm_gm_mask.nii.gz',
             'contrast_mask.nii.gz']

    if isinstance(url, str):
        url = [url] * len(files)

    files = [(f, u + f, {}) for f, u in zip(files, url)]

    dataset_name = 'mask'
    data_dir = get_data_dir(data_dir)
    dataset_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                   verbose=verbose)
    files = _fetch_files(dataset_dir, files, resume=resume,
                         verbose=verbose)
    return {'hcp': files[0], 'icbm_gm': files[1], 'contrast': files[2]}
def fetch_language_localizer_demo_dataset(data_dir=None, verbose=1):
    """Download language localizer demo dataset.

    Parameters
    ----------
    data_dir: string, optional
        Path to store the downloaded dataset. If None, the default nilearn
        datasets download directory is used.
    verbose: int, optional
        verbosity level (0 means no message).

    Returns
    -------
    data_dir: string
        Path to downloaded dataset
    downloaded_files: list of string
        Absolute paths of downloaded files on disk
    """
    url = 'https://osf.io/nh987/download'
    main_folder = 'fMRI-language-localizer-demo-dataset'

    data_dir = _get_dataset_dir(main_folder, data_dir=data_dir,
                                verbose=verbose)
    # The files_spec needed for _fetch_files
    files_spec = [(main_folder + '.zip', url, {'move': main_folder + '.zip'})]
    # Only download if directory is empty.
    # Directory will have been created by the call to _get_dataset_dir above.
    if not os.listdir(data_dir):
        downloaded_files = _fetch_files(data_dir, files_spec, resume=True,
                                        verbose=verbose)
        _uncompress_file(downloaded_files[0])

    file_list = [os.path.join(path, f)
                 for path, dirs, files in os.walk(data_dir)
                 for f in files]
    return data_dir, sorted(file_list)
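# Usage sketch (illustrative, not part of the original module): the
# '_bold.nii.gz' suffix is the standard BIDS naming assumed here for the
# functional runs in the downloaded dataset.
def _example_list_bold_files():
    """Return the functional (BOLD) files from the demo dataset."""
    data_dir, file_list = fetch_language_localizer_demo_dataset()
    return [f for f in file_list if f.endswith('_bold.nii.gz')]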
def fetch_atlas_gordon_2014(coordinate_system='MNI', resolution=2,
                            data_dir=None, url=None, resume=True, verbose=1):
    """Download and return the Gordon et al. 2014 atlas.

    References
    ----------
    Gordon, E. M., Laumann, T. O., Adeyemo, B., Huckins, J. F., Kelley, W. M.,
    & Petersen, S. E., "Generation and evaluation of a cortical area
    parcellation from resting-state correlations", 2014, Cerebral cortex,
    bhu239.

    See http://www.nil.wustl.edu/labs/petersen/Resources.html for more
    information on this parcellation.
    """
    if url is None:
        url = ("https://sites.wustl.edu/petersenschlaggarlab/files/"
               "2018/06/Parcels-19cwpgu.zip")
    dataset_name = "gordon_2014"
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)

    valid_coordinate_systems = ['MNI', '711-2b']
    if coordinate_system not in valid_coordinate_systems:
        raise ValueError('Unknown coordinate system {0}. '
                         'Valid options are {1}'.format(
                             coordinate_system, valid_coordinate_systems))

    if resolution not in [1, 2, 3]:
        raise ValueError('Invalid resolution {0}. '
                         'Valid options are 1, 2 or 3.'.format(resolution))

    target_file = os.path.join('Parcels',
                               'Parcels_{0}_{1}.nii'.format(
                                   coordinate_system, str(resolution) * 3))

    atlas = _fetch_files(data_dir, [(target_file, url, {"uncompress": True})],
                         resume=resume, verbose=verbose)

    return atlas
def fetch_kirby(subjects=range(2), sessions=[1], data_dir=None, url=None, resume=True, verbose=1): """Download and load the KIRBY multi-modal dataset. Parameters ---------- subjects : sequence of int or None, optional ids of subjects to load, default to loading 2 subjects. sessions: iterable of int, optional The sessions to load. Load only the first session by default. data_dir: string, optional Path of the data directory. Used to force data storage in a specified location. Default: None url: string, optional Override download URL. Used for test only (or if you setup a mirror of the data). Default: None Returns ------- data: sklearn.datasets.base.Bunch Dictionary-like object, the interest attributes are : - 'anat': Paths to structural MPRAGE images - 'asl': Paths to ASL images - 'm0': Paths to ASL M0 images Notes ------ This dataset is composed of 2 sessions of 21 participants (11 males) at 3T. Imaging modalities include MPRAGE, FLAIR, DTI, resting state fMRI, B0 and B1 field maps, ASL, VASO, quantitative T1 mapping, quantitative T2 mapping, and magnetization transfer imaging. For each session, we only download MPRAGE and ASL data. More details about this dataset can be found here : https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3020263 http://mri.kennedykrieger.org/databases.html Paper to cite ------------- `Multi-Parametric Neuroimaging Reproducibility: A 3T Resource Study <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3020263>`_ Bennett. A. Landman, Alan J. Huang, Aliya Gifford,Deepti S. Vikram, Issel Anne L. Lim, Jonathan A.D. Farrell, John A. Bogovic, Jun Hua, Min Chen, Samson Jarso, Seth A. Smith, Suresh Joel, Susumu Mori, James J. Pekar, Peter B. Barker, Jerry L. Prince, and Peter C.M. van Zijl. NeuroImage. (2010) NIHMS/PMC:252138 doi:10.1016/j.neuroimage.2010.11.047 Licence ------- `BIRN Data License <http://www.nbirn.net/bdr/Data_Use_Agreement_09_19_07-1.pdf>`_ """ if url is None: url = 'https://www.nitrc.org/frs/downloadlink.php/' # Preliminary checks and declarations dataset_name = 'kirby' data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) subject_ids = np.array([ '849', '934', '679', '906', '913', '142', '127', '742', '422', '815', '906', '239', '916', '959', '814', '505', '959', '492', '239', '142', '815', '679', '800', '916', '849', '814', '800', '656', '742', '113', '913', '502', '113', '127', '505', '502', '934', '492', '346', '656', '346', '422']) nitrc_ids = np.arange(2201, 2243) ids = np.arange(1, 43) # Group indices by session _, indices1 = np.unique(subject_ids, return_index=True) subject_ids1 = subject_ids[sorted(indices1)] nitrc_ids1 = nitrc_ids[sorted(indices1)] ids1 = ids[sorted(indices1)] tuple_indices = [np.where(subject_ids == s)[0] for s in subject_ids1] indices2 = [idx1 if idx1 not in indices1 else idx2 for (idx1, idx2) in tuple_indices] subject_ids2 = subject_ids[indices2] nitrc_ids2 = nitrc_ids[indices2] ids2 = ids[indices2] # Check arguments max_subjects = len(subject_ids) if max(subjects) > max_subjects: warnings.warn('Warning: there are only {0} subjects'.format( max_subjects)) subjects = range(max_subjects) unique_subjects, indices = np.unique(subjects, return_index=True) if len(unique_subjects) < len(subjects): warnings.warn('Warning: Duplicate subjects, removing them.') subjects = unique_subjects[np.argsort(indices)] n_subjects = len(subjects) archives = [ [url + '{0}/KKI2009-{1:02}.tar.bz2'.format(nitrc_id, id) for (nitrc_id, id) in zip(nitrc_ids1, ids1)], [url + '{0}/KKI2009-{1:02}.tar.bz2'.format(nitrc_id, id) for (nitrc_id, 
id) in zip(nitrc_ids2, ids2)] ] anat1 = [os.path.join('session1', subject, 'KKI2009-{0:02}-MPRAGE.nii'.format(i)) for subject, i in zip(subject_ids1, ids1)] anat2 = [os.path.join('session2', subject, 'KKI2009-{0:02}-MPRAGE.nii'.format(i)) for subject, i in zip(subject_ids2, ids2)] asl1 = [os.path.join('session1', subject, 'KKI2009-{0:02}-ASL.nii'.format(i)) for subject, i in zip(subject_ids1, ids1)] asl2 = [os.path.join('session2', subject, 'KKI2009-{0:02}-ASL.nii'.format(i)) for subject, i in zip(subject_ids2, ids2)] m01 = [os.path.join('session1', subject, 'KKI2009-{0:02}-ASLM0.nii'.format(i)) for subject, i in zip(subject_ids1, ids1)] m02 = [os.path.join('session2', subject, 'KKI2009-{0:02}-ASLM0.nii'.format(i)) for subject, i in zip(subject_ids2, ids2)] target = [ [os.path.join('session1', subject, 'KKI2009-{0:02}.tar.bz2'.format(id)) for (subject, id) in zip(subject_ids1, ids1)], [os.path.join('session2', subject, 'KKI2009-{0:02}.tar.bz2'.format(id)) for (subject, id) in zip(subject_ids2, ids2)] ] anat = [anat1, anat2] asl = [asl1, asl2] m0 = [m01, m02] source_anat = [] source_asl = [] source_m0 = [] source_archives = [] session = [] target_archives = [] for i in sessions: if not (i in [1, 2]): raise ValueError('KIRBY dataset session id must be in [1, 2]') source_anat += [anat[i - 1][subject] for subject in subjects] source_asl += [asl[i - 1][subject] for subject in subjects] source_m0 += [m0[i - 1][subject] for subject in subjects] source_archives += [archives[i - 1][subject] for subject in subjects] target_archives += [target[i - 1][subject] for subject in subjects] session += [i] * n_subjects # Dataset description fdescr = _get_dataset_descr(dataset_name) # Call fetch_files once per subject. asl = [] m0 = [] anat = [] for anat_u, asl_u, m0_u, archive, target in zip(source_anat, source_asl, source_m0, source_archives, target_archives): n, a, m = _fetch_files( data_dir, [(anat_u, archive, {'uncompress': True, 'move': target}), (asl_u, archive, {'uncompress': True, 'move': target}), (m0_u, archive, {'uncompress': True, 'move': target})], verbose=verbose) anat.append(n) asl.append(a) m0.append(m) return Bunch(anat=anat, asl=asl, m0=m0, session=session, description=fdescr)
def fetch_openneuro_dataset( urls=None, data_dir=None, dataset_version='ds000030_R1.0.4', verbose=1): """Download openneuro bids dataset. Note: This function requires boto3 to be installed. Parameters ---------- urls: list of string, optional Openneuro url list of dataset files to download. If not specified all files of the specified dataset will be downloaded. data_dir: string, optional Path to store the downloaded dataset. if None employ nilearn datasets default download directory. dataset_version: string, optional dataset version name. Assumes it is of the form [name]_[version]. verbose: int, optional verbosity level (0 means no message). Returns ------- data_dir: string Path to downloaded dataset downloaded_files: list of string Absolute paths of downloaded files on disk """ boto3 = _check_import_boto3("boto3") data_prefix = '{}/{}/uncompressed'.format( dataset_version.split('_')[0], dataset_version) data_dir = _get_dataset_dir(data_prefix, data_dir=data_dir, verbose=verbose) # if urls are not specified we download the complete dataset index if urls is None: _, urls = fetch_openneuro_dataset_index( data_dir=data_dir, dataset_version=dataset_version, verbose=verbose) # The files_spec needed for _fetch_files files_spec = [] files_dir = [] for url in urls: url_path = url.split(data_prefix + '/')[1] file_dir = os.path.join(data_dir, url_path) files_spec.append((os.path.basename(file_dir), url, {})) files_dir.append(os.path.dirname(file_dir)) # download the files downloaded = [] for file_spec, file_dir in zip(files_spec, files_dir): # Timeout errors are common in the s3 connection so we try to avoid # failure of the dataset download for a transient instability success = False download_attempts = 4 while download_attempts > 0 and not success: try: downloaded_files = _fetch_files( file_dir, [file_spec], resume=True, verbose=verbose) downloaded += downloaded_files success = True except Exception: download_attempts -= 1 if not success: raise Exception('multiple failures downloading %s' % file_spec[1]) return data_dir, sorted(downloaded)
def fetch_microarray(data_dir=None, donors=['9861'], resume=True, verbose=1, convert=True): """ Downloads the Allen Human Brain Atlas microarray expression dataset Parameters ---------- data_dir : str, optional Directory where data should be downloaded and unpacked. Default: current directory donors : list, optional List of donors to download; can be either donor number or UID. Can also specify 'all' to download all available donors. Default: 9861 resume : bool, optional Whether to resume download of a partly-downloaded file. Default: True verbose : int, optional Verbosity level (0 means no message). Default: 1 convert : bool, optional Whether to convert downloaded CSV files into parquet format for faster loading in the future; only available if ``fastparquet`` and ``python- snappy`` are installed. Default: True Returns ------- data : :class:`sklearn.utils.Bunch` Dictionary-like object with keys ['microarray', 'ontology', 'pacall', 'probes', 'annotation'], where corresponding values are lists of filepaths to downloaded CSV files. References ---------- Hawrylycz, M. J., Lein, E. S., Guillozet-Bongaarts, A. L., Shen, E. H., Ng, L., Miller, J. A., ... & Abajian, C. (2012). An anatomically comprehensive atlas of the adult human brain transcriptome. Nature, 489(7416), 391. """ url = "https://human.brain-map.org/api/v2/well_known_file_download/{}" dataset_name = 'allenbrain' data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) sub_files = ('MicroarrayExpression.csv', 'Ontology.csv', 'PACall.csv', 'Probes.csv', 'SampleAnnot.csv') n_files = len(sub_files) if donors is not None and (isinstance(donors, (list, tuple))): for n, sub_id in enumerate(donors): if sub_id not in VALID_DONORS: raise ValueError( 'You provided invalid subject id {0} in a' 'list. Subjects must be selected in {1}.'.format( sub_id, VALID_DONORS)) donors[n] = WELL_KNOWN_IDS[sub_id] # convert to ID system elif donors == 'all': donors = WELL_KNOWN_IDS.value_set('subj') else: donors = [] donors = sorted(set(donors), key=lambda x: int(x)) # avoid duplicates files = [ (os.path.join('normalized_microarray_donor{}'.format(sub), fname), url.format(WELL_KNOWN_IDS.url[sub]), dict(uncompress=True, move=os.path.join('normalized_microarray_donor{}'.format(sub), 'donor{}.zip'.format(sub)))) for sub in donors for fname in sub_files ] files = _fetch_files(data_dir, files, resume=resume, verbose=verbose) # if we want to convert files to parquet format it's good to do that now # this step is _already_ super long, so an extra 1-2 minutes is negligible if convert and io.use_parq: for fn in files[0::n_files] + files[2::n_files]: io._make_parquet(fn, convert_only=True) return Bunch(microarray=files[0::n_files], ontology=files[1::n_files], pacall=files[2::n_files], probes=files[3::n_files], annotation=files[4::n_files])
def fetch_tutorial_data(n_subjects=20, data_dir=None, resume=True, verbose=1): """Download and load the surfstat tutorial dataset. Parameters ---------- n_subjects: int, optional The number of subjects to load from maximum of 100 subjects. By default, 20 subjects will be loaded. If None is given, all 100 subjects will be loaded. data_dir: string, optional Path of the data directory. Used to force data storage in a specified location. If None, data will be download to ~ (home directory). Default: None resume: bool, optional If true, try resuming download if possible Returns ------- data: sklearn.datasets.base.Bunch Dictionary-like object, the interest attributes are : - 'image_files': Paths to image files in mgh format - 'demographics': Path to CSV file containing demographic information References ---------- :Download: https://box.bic.mni.mcgill.ca/s/wMPF2vj7EoYWELV """ # set dataset url url = "https://box.bic.mni.mcgill.ca/s/wMPF2vj7EoYWELV" # set data_dir, if not directly set use ~ as default if data_dir is None: data_dir = str(Path.home()) # set dataset name and get its corresponding directory dataset_name = "brainstat_tutorial" data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) # set download information for demographic file files = [( "brainstat_tutorial_df.csv", url + "/download?path=%2FSurfStat_tutorial_data&files=myStudy.csv", { "move": "brainstat_tutorial_df.csv" }, )] # download demographic file path_to_demographics = _fetch_files(data_dir, files, verbose=verbose)[0] # set ids based on complete dataset from demographic file ids = pd.read_csv(path_to_demographics)["ID2"].tolist() # set and check subjects, in total and subset max_subjects = len(ids) if n_subjects is None: n_subjects = max_subjects if n_subjects > max_subjects: warnings.warn("Warning: there are only %d subjects" % max_subjects) n_subjects = max_subjects ids = ids[:n_subjects] # restrict demographic information to subset of subjects df_tmp = pd.read_csv(path_to_demographics) df_tmp = df_tmp[df_tmp["ID2"].isin(ids)] # set download information for image files and download them # for hemi in ['lh', 'rh']: image_files = _fetch_files( data_dir, [( "thickness/{}_{}2fsaverage5_20.mgh".format(subj, hemi), url + "/download?path=%2F&files=brainstat_tutorial.zip", { "uncompress": True, "move": "brainstat_tutorial.zip" }, ) for subj in ids for hemi in ["lh", "rh"]], ) # pack everything in a scikit-learn bunch and return it return Bunch(demographics=df_tmp, image_files=image_files)
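# Usage sketch (illustrative, not part of the original module): assumes nibabel
# and numpy are installed; each downloaded .mgh file holds one hemisphere's
# cortical thickness map, so every subject contributes two files.
def _example_stack_thickness(n_subjects=5):
    """Stack the downloaded thickness maps into one (files x vertices) array."""
    import nibabel as nib
    import numpy as np
    data = fetch_tutorial_data(n_subjects=n_subjects)
    maps = [np.squeeze(nib.load(f).get_fdata()) for f in data.image_files]
    return np.vstack(maps)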