def main():
    # cammoun2012 is used to generate cortical thickness measures
    cammoun2012 = datasets.fetch_cammoun2012(data_dir=directories.rois)
    del cammoun2012['info']
    for scale, mask in cammoun2012.items():
        outfile = op.basename(mask).replace('.nii.gz', '_corticalthickness')
        outpath = op.join(directories.parcels, outfile)
        extract_data(mask, 'sscorticalthickness', outpath)

    # pauli2018 is used to generate subcortical volume measures
    pauli2018 = datasets.fetch_pauli2018(data_dir=directories.rois)
    del pauli2018['info'], pauli2018['probabilistic']
    for scale, mask in pauli2018.items():
        outfile = op.basename(mask).replace('.nii.gz', '_subcorticalvolume')
        outpath = op.join(directories.parcels, outfile)
        extract_data(mask, 'brainsegmentation', outpath)
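

# a minimal sketch of what an `extract_data`-style helper might look like,
# assuming a recent nilearn and a flat list of subject images; the helper
# name, `images`, and the mean-within-parcel strategy are illustrative
# assumptions, not the implementation used above
def extract_parcel_means(atlas, images, outpath):
    import numpy as np
    from nilearn.maskers import NiftiLabelsMasker

    # average voxels within each atlas parcel, one row per subject image
    masker = NiftiLabelsMasker(labels_img=atlas, strategy='mean')
    parcel_means = np.vstack([masker.fit_transform(img) for img in images])
    np.save(outpath, parcel_means)
    return parcel_means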
def test_fetch_pauli2018(tmpdir):
    pauli = datasets.fetch_pauli2018(data_dir=tmpdir, verbose=0)
    assert all(hasattr(pauli, k) and os.path.isfile(pauli[k])
               for k in ['probabilistic', 'deterministic', 'info'])
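

# a companion sketch for the cammoun2012 fetcher, assuming it returns an
# analogous bunch of filepaths; the scale keys listed here are an assumption
# mirroring the test above, not a guaranteed API
def test_fetch_cammoun2012(tmpdir):
    cammoun = datasets.fetch_cammoun2012(data_dir=tmpdir, verbose=0)
    keys = ['scale033', 'scale060', 'scale125', 'scale250', 'scale500',
            'info']
    assert all(hasattr(cammoun, k) and os.path.isfile(cammoun[k])
               for k in keys)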
def main():
    # N.B. this will NOT work unless you set the environment variables
    # $PPMI_USER and $PPMI_PASSWORD prior to running this script.
    # these variables must be the username and password you received when
    # registering for the PPMI. for more information on data access see:
    # https://www.ppmi-info.org/access-data-specimens/download-data/
    pypmi.fetch_studydata('all', path=directories.ppmi, overwrite=False)

    # load demographic data and keep only individuals with PD and healthy
    # controls. we'll use the information in this data frame to residualize
    # our data against different variables (e.g., age, gender)
    print('Loading demographics information...')
    demographics = pypmi.load_demographics(directories.ppmi) \
                        .query('diagnosis in ["pd", "hc"]') \
                        .set_index('participant')
    demographics['family_history'] = \
        demographics['family_history'].astype(bool)

    # load all non-MRI data
    print('Loading all non-MRI data (this step may take some time)...')
    datscan = pypmi.load_datscan(directories.ppmi, measures='all')
    biospec = pypmi.load_biospecimen(directories.ppmi, measures='all')
    behavior = pypmi.load_behavior(directories.ppmi, measures='all')

    # sometimes, because of how PPMI data were collected, there are slight
    # variations in the recorded date for the same visit, resulting in
    # scores for a single visit being split across two or more rows in the
    # dataframe (i.e., one row might have MoCA scores for visit "V01" and
    # the other has UPDRS scores for visit "V01")
    # to remedy this we use the pandas `DataFrame.combine_first()` method,
    # merging scores from both rows and retaining the earliest date as the
    # "true" date (dates were generally only ~1 month different and if that
    # difference makes a significant impact on our results then I quit)
    print('Wrangling non-MRI data into a usable format...')
    first = behavior.drop_duplicates(['participant', 'visit'], 'first') \
                    .reset_index(drop=True)
    last = behavior.drop_duplicates(['participant', 'visit'], 'last') \
                   .reset_index(drop=True)
    behavior = first.combine_first(last)
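
    # a minimal illustration of the `combine_first()` fix above, with
    # hypothetical values: two rows for the same (participant, visit) pair,
    # each holding a different subset of scores, collapse into one row
    #
    #     first:    moca=26.0, updrs=NaN,  date=2011-01-03
    #     last:     moca=NaN,  updrs=35.0, date=2011-02-01
    #     combined: moca=26.0, updrs=35.0, date=2011-01-03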
    # get first visit scores for non-MRI data
    datscan, dat_date = get_visit(datscan, list(demographics.index),
                                  visit='SC')
    biospec, bio_date = get_visit(biospec, list(demographics.index),
                                  visit='BL')

    # behavioral data acquisition was split across screening + baseline
    # visits so we need to take the earliest visit for each measure
    # that is, not all measures were collected at screening so we need to
    # use the baseline visit scores for those measures
    # unfortunately which visit various measures were initially collected
    # at DIFFERED for PD and HC individuals, so we need to do this
    # separately for the two groups and then merge them back together...
    # ¯\_(ツ)_/¯
    beh, beh_dates = [], []
    for diagnosis in ['pd', 'hc']:
        participants = demographics.query(f'diagnosis == "{diagnosis}"').index
        beh_sc, beh_date = get_visit(behavior, list(participants), visit='SC')
        beh_bl, _ = get_visit(behavior, list(participants), visit='BL')
        drop = np.intersect1d(beh_sc.columns, beh_bl.columns)
        beh += [pd.merge(beh_sc, beh_bl.drop(drop, axis=1), on='participant')]
        beh_dates += [beh_date]
    behavior = pd.concat(beh, join='inner')
    beh_date = pd.concat(beh_dates, join='inner')

    # iterate through all combinations of cortical + subcortical
    # parcellations
    # note: there's only one subcortical parcellation (we had considered
    # doing more but the number of good subcortical parcellations
    # is...limited)
    cth_data = sorted(glob.glob(op.join(directories.parcels,
                                        '*thickness.npy')))
    vol_data = sorted(glob.glob(op.join(directories.parcels,
                                        '*volume.npy')))
    for cth, vol in itertools.product(cth_data, vol_data):
        # determine what cortical / subcortical parcellation combo we're
        # using; this will determine the name of the output file
        # the specific details include the resolution of the cortical
        # parcellation and the datatype of the subcortical parcellation
        (scale,) = re.search(r'res-(\d+)', cth).groups()
        (dtype,) = re.search(r'_hemi-both_(\S+)_', vol).groups()
        hdf = structures.Frog(op.join(directories.snf,
                                      f'scale{scale}_{dtype}.h5'))
        print(f'Loading MRI data for {op.basename(hdf.filename)}...')
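
        # e.g., given a hypothetical pair of input files
        #   ..._res-125_..._corticalthickness.npy
        #   ..._hemi-both_deterministic_..._subcorticalvolume.npy
        # the regexes above yield scale='125' and dtype='deterministic', so
        # results for this combo are stored in 'scale125_deterministic.h5'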
        # load parcellated cortical thickness data
        ct_parc = nndata.fetch_cammoun2012(data_dir=directories.rois,
                                           verbose=0)['info']
        ct_parc = pd.read_csv(ct_parc).query(f'scale == "scale{scale}" '
                                             '& structure == "cortex"')
        ct_parc['label'] = (ct_parc['label'] + '_'
                            + ct_parc['hemisphere'].apply(str.lower))
        cortthick, cth_date = get_parcels(cth, session=1, return_date=True,
                                          parcellation=ct_parc)

        # load parcellated subcortical volume data
        sv_parc = nndata.fetch_pauli2018(data_dir=directories.rois,
                                         verbose=0)['info']
        sv_parc = pd.read_csv(sv_parc)
        subvolume, vol_date = get_parcels(vol, session=1, return_date=True,
                                          parcellation=sv_parc)

        # perform batch correction on MRI data
        # first, grab the demographics of subjects for whom we have neuro
        # data. then, remove all sites where we only have data from one
        # subject since we cannot generate batch correction parameters in
        # these instances. finally, perform the actual batch correction
        # using `neurocombat`
        cortthick, subvolume, demo = \
            preprocess.intersect_subjects(cortthick, subvolume, demographics)
        sites, counts = np.unique(demo['site'], return_counts=True)
        demo = demo[demo['site'].isin(sites[counts > 1])]
        cortthick, subvolume, demo = \
            preprocess.intersect_subjects(cortthick, subvolume, demo)
        cortthick.iloc[:, :] = batch_correct(cortthick, demo)
        subvolume.iloc[:, :] = batch_correct(subvolume, demo)

        # only keep subjects for whom we have all datatypes
        # we preprocess HC and PD data separately because part of the
        # process involves imputation and we want to impute missing data
        # using values from each diagnostic group, separately
        data = [cortthick, subvolume, datscan, biospec, behavior]
        *data, demo = preprocess.intersect_subjects(*data, demo)
        hc_data, hc_demo = snfprep(data, demo.query('diagnosis == "hc"'))
        pd_data, pd_demo = snfprep(data, demo.query('diagnosis == "pd"'))

        # only keep features for which we have both PD and HC data
        for n, (hc_dtype, pd_dtype) in enumerate(zip(hc_data, pd_data)):
            cols = np.intersect1d(hc_dtype.columns, pd_dtype.columns)
            hc_data[n], pd_data[n] = hc_data[n][cols], pd_data[n][cols]

        # "regress out" age, gender, age x gender interactions (and total
        # estimated intracranial volume, if MRI data) from all data.
        # we also want to save all this data to disk so we can load it
        # easily in the future! do that for all the raw data, regressor
        # matrices, and processed (i.e., residualized) data
        # we do this because we don't want these sorts of things to bias
        # our initial analyses when creating the fused networks
        keys = [
            'cortical_thickness', 'subcortical_volume', 'dat_scans',
            'csf_assays', 'behavioral_measures'
        ]
        dates = [cth_date, vol_date, dat_date, bio_date, beh_date]
        for grp, dataset, demo in zip(['pd', 'hc'],
                                      [pd_data, hc_data],
                                      [pd_demo, hc_demo]):
            hdf.save(demo, f'/raw/{grp}_demographics', overwrite=False)
            for n, (df, key, date) in enumerate(zip(dataset, keys, dates)):
                reg = gen_regressors(date, demo)

                # get comparative regressors / data (this is always healthy
                # individuals -- we use them to estimate the betas for the
                # residualization process)
                comp_reg, comp_df = gen_regressors(date, hc_demo), hc_data[n]
                resid = nnstats.residualize(reg, df, comp_reg, comp_df,
                                            normalize=False)
                resid = pd.DataFrame(resid, index=df.index,
                                     columns=df.columns)

                hdf.save(df, f'/raw/{grp}_{key}', overwrite=False)
                hdf.save(reg, f'/regressors/{grp}_{key}', overwrite=False)
                hdf.save(resid, f'/processed/{grp}_{key}', overwrite=False)
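

# a minimal numpy sketch of the residualization step above, assuming the
# same "estimate betas in the comparison group, apply them to the target
# group" logic that `nnstats.residualize` is used for here; this standalone
# helper (and its name) is illustrative, not the implementation used above
def residualize_sketch(X, Y, Xc, Yc):
    import numpy as np

    # add an intercept column so the regression can absorb group means
    Xi = np.column_stack([X, np.ones(len(X))])
    Xci = np.column_stack([Xc, np.ones(len(Xc))])

    # betas are estimated from the comparison (here, healthy) group only...
    betas, *_ = np.linalg.lstsq(Xci, Yc, rcond=None)

    # ...and then used to remove regressor effects from the target group
    return Y - Xi @ betas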