# assumed imports for these excerpts (a sketch; the original files are not
# shown in full). `directories`, `structures`, `preprocess`, and helpers like
# `extract_data`, `get_visit`, `get_parcels`, `snfprep`, `gen_regressors`,
# and `batch_correct` are project-local and omitted here
import glob
import itertools
import os
import os.path as op
import re

import numpy as np
import pandas as pd
import pypmi

from netneurotools import datasets
from netneurotools import datasets as nndata
from netneurotools import stats as nnstats


def main():
    # cammoun2012 is used to generate cortical thickness measures
    cammoun2012 = datasets.fetch_cammoun2012(data_dir=directories.rois)
    del cammoun2012['info']
    for scale, mask in cammoun2012.items():
        outfile = op.basename(mask).replace('.nii.gz', '_corticalthickness')
        outpath = op.join(directories.parcels, outfile)
        extract_data(mask, 'sscorticalthickness', outpath)

    # pauli2018 is used to generate subcortical volume measures
    pauli2018 = datasets.fetch_pauli2018(data_dir=directories.rois)
    del pauli2018['info'], pauli2018['probabilistic']
    for scale, mask in pauli2018.items():
        outfile = op.basename(mask).replace('.nii.gz', '_subcorticalvolume')
        outpath = op.join(directories.parcels, outfile)
        extract_data(mask, 'brainsegmentation', outpath)
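
# (this excerpt is presumably executed as a standalone script, e.g. guarded
#  by `if __name__ == '__main__': main()` in the original file)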
def test_fetch_pauli2018(tmpdir):
    pauli = datasets.fetch_pauli2018(data_dir=tmpdir, verbose=0)
    assert all(
        hasattr(pauli, k) and os.path.isfile(pauli[k])
        for k in ['probabilistic', 'deterministic', 'info'])
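
# the test above is meant to be run under pytest, which injects the `tmpdir`
# fixture automatically, e.g.:
#   $ pytest -k test_fetch_pauli2018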
def main():
    # N.B. this will NOT work unless you set the environment variables
    #      $PPMI_USER and $PPMI_PASSWORD prior to running this script.
    #      these variables must be the username and password you received
    #      when registering for the PPMI. for more information on data
    #      access see:
    #      https://www.ppmi-info.org/access-data-specimens/download-data/
    pypmi.fetch_studydata('all', path=directories.ppmi, overwrite=False)

    # load demographic data, keeping only PD patients and healthy controls.
    # we'll use the information in this data frame to residualize our data
    # against different variables (e.g., age, gender)
    print('Loading demographics information...')
    demographics = pypmi.load_demographics(directories.ppmi) \
                        .query('diagnosis in ["pd", "hc"]') \
                        .set_index('participant')
    demographics['family_history'] = demographics['family_history'].astype(bool)

    # load all non-MRI data
    print('Loading all non-MRI data (this step may take some time)...')
    datscan = pypmi.load_datscan(directories.ppmi, measures='all')
    biospec = pypmi.load_biospecimen(directories.ppmi, measures='all')
    behavior = pypmi.load_behavior(directories.ppmi, measures='all')

    # sometimes, because of how PPMI data were collected, there are slight
    # variations in the recorded date for the same visit, resulting in scores
    # for a single visit being split across two or more rows in the dataframe
    # (e.g., one row might have MoCA scores for visit "V01" and another has
    # UPDRS scores for visit "V01")
    # to remedy this we use the pandas `DataFrame.combine_first()` method,
    # merging scores from both rows and retaining the earliest date as the
    # "true" date (dates were generally only ~1 month apart, and if that
    # difference has a significant impact on our results then I quit)
    print('Wrangling non-MRI data into a usable format...')
    first = behavior.drop_duplicates(['participant', 'visit'], keep='first') \
                    .reset_index(drop=True)
    last = behavior.drop_duplicates(['participant', 'visit'], keep='last') \
                   .reset_index(drop=True)
    behavior = first.combine_first(last)
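
    # a minimal sketch of the `combine_first` behavior relied on above: for
    # rows aligned by index, non-null values from `first` take precedence and
    # nulls are filled in from `last`, e.g.
    #   first: moca=26.0, updrs_iii=NaN    last: moca=NaN, updrs_iii=33.0
    #   combined: moca=26.0, updrs_iii=33.0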

    # get first visit scores for non-MRI data
    datscan, dat_date = get_visit(datscan, list(demographics.index), visit='SC')
    biospec, bio_date = get_visit(biospec, list(demographics.index), visit='BL')

    # behavioral data acquisition was split across the screening + baseline
    # visits, so we need to take the earliest available visit for each
    # measure; that is, not all measures were collected at screening, so we
    # use the baseline visit scores for those measures
    # unfortunately, the visit at which various measures were initially
    # collected DIFFERED for PD and HC individuals, so we need to do this
    # separately for the two groups and then merge them back together... ¯\_(ツ)_/¯
    beh, beh_dates = [], []
    for diagnosis in ['pd', 'hc']:
        participants = demographics.query(f'diagnosis == "{diagnosis}"').index
        beh_sc, beh_date = get_visit(behavior, list(participants), visit='SC')
        beh_bl, _ = get_visit(behavior, list(participants), visit='BL')
        drop = np.intersect1d(beh_sc.columns, beh_bl.columns)
        beh += [pd.merge(beh_sc, beh_bl.drop(drop, axis=1), on='participant')]
        beh_dates += [beh_date]
    behavior = pd.concat(beh, join='inner')
    beh_date = pd.concat(beh_dates, join='inner')
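
    # at this point `behavior` should hold one row per participant, with each
    # measure drawn from its earliest available visit (SC where collected,
    # BL otherwise)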

    # iterate through all combinations of cortical + subcortical parcellations
    # note: there's only one subcortical parcellation (we had considered doing
    # more but the number of good subcortical parcellations is...limited)
    cth_data = sorted(glob.glob(op.join(directories.parcels, '*thickness.npy')))
    vol_data = sorted(glob.glob(op.join(directories.parcels, '*volume.npy')))
    for cth, vol in itertools.product(cth_data, vol_data):

        # determine which cortical / subcortical parcellation combo we're
        # using; this will determine the name of the output file, which
        # encodes the resolution of the cortical parcellation and the
        # datatype of the subcortical parcellation
        (scale, ) = re.search(r'res-(\d+)', cth).groups()
        (dtype, ) = re.search(r'_hemi-both_(\S+)_', vol).groups()
        hdf = structures.Frog(op.join(directories.snf,
                                      f'scale{scale}_{dtype}.h5'))
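
        # (`structures.Frog` appears to be a project-local HDF5 wrapper; it
        #  is used below to store raw, regressor, and residualized data under
        #  separate keys in a single .h5 file)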
        print(f'Loading MRI data for {op.basename(hdf.filename)}...')

        # load parcellated cortical thickness data
        ct_parc = nndata.fetch_cammoun2012(data_dir=directories.rois,
                                           verbose=0)['info']
        ct_parc = pd.read_csv(ct_parc).query(f'scale == "scale{scale}" '
                                             '& structure == "cortex"')
        ct_parc['label'] = (ct_parc['label'] + '_'
                            + ct_parc['hemisphere'].apply(str.lower))
        cortthick, cth_date = get_parcels(cth, session=1, return_date=True,
                                          parcellation=ct_parc)

        # load parcellated subcortical volume data
        sv_parc = nndata.fetch_pauli2018(data_dir=directories.rois,
                                         verbose=0)['info']
        sv_parc = pd.read_csv(sv_parc)
        subvolume, vol_date = get_parcels(vol, session=1, return_date=True,
                                          parcellation=sv_parc)
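
        # (`get_parcels` presumably returns a (participants x parcels)
        #  DataFrame plus the acquisition date for each scan; the dates are
        #  fed to `gen_regressors` below)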

        # perform batch correction on MRI data
        # first, grab the demographics of subjects for whom we have neuro data.
        # then, remove all sites where we only have data from one subject since
        # we cannot generate batch correction parameters in these instances.
        # finally, perform the actual batch correction using `neurocombat`
        cortthick, subvolume, demo = \
            preprocess.intersect_subjects(cortthick, subvolume, demographics)
        sites, counts = np.unique(demo['site'], return_counts=True)
        demo = demo[demo['site'].isin(sites[counts > 1])]
        cortthick, subvolume, demo = \
            preprocess.intersect_subjects(cortthick, subvolume, demo)
        cortthick.iloc[:, :] = batch_correct(cortthick, demo)
        subvolume.iloc[:, :] = batch_correct(subvolume, demo)
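
        # (`batch_correct` is a project helper; presumably it wraps a
        #  ComBat-style harmonization, estimating site ("batch") parameters
        #  from `demo['site']` and returning the corrected data array)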

        # only keep subjects for whom we have all datatypes
        # we preprocess HC and PD data separately because part of the process
        # involves imputation and we want to impute missing data using values
        # from each diagnostic group, separately
        data = [cortthick, subvolume, datscan, biospec, behavior]
        *data, demo = preprocess.intersect_subjects(*data, demo)
        hc_data, hc_demo = snfprep(data, demo.query('diagnosis == "hc"'))
        pd_data, pd_demo = snfprep(data, demo.query('diagnosis == "pd"'))

        # only keep features for which we have both PD and HC data
        for n, (hc_dtype, pd_dtype) in enumerate(zip(hc_data, pd_data)):
            cols = np.intersect1d(hc_dtype.columns, pd_dtype.columns)
            hc_data[n], pd_data[n] = hc_data[n][cols], pd_data[n][cols]

        # "regress out" age, gender, age x gender interactions (and total
        # estimated intracranial volume, if MRI data) from all data.
        # we also want to save all this data to disk so we can load it easily
        # in the future! do that for all the raw data, regressor matrices, and
        # processed (i.e., residualized) data
        # we do this because we don't want these sorts of things to bias our
        # initial analyses when creating the fused networks
        keys = [
            'cortical_thickness',
            'subcortical_volume',
            'dat_scans',
            'csf_assays',
            'behavioral_measures'
        ]
        dates = [cth_date, vol_date, dat_date, bio_date, beh_date]
        for grp, dataset, demo in zip(['pd', 'hc'],
                                      [pd_data, hc_data],
                                      [pd_demo, hc_demo]):
            hdf.save(demo, f'/raw/{grp}_demographics', overwrite=False)
            for n, (df, key, date) in enumerate(zip(dataset, keys, dates)):
                reg = gen_regressors(date, demo)

                # get comparative regressors / data (this is always healthy
                # individuals -- we use them to estimate the betas for the
                # residualization process)
                comp_reg, comp_df = gen_regressors(date, hc_demo), hc_data[n]

                resid = nnstats.residualize(reg, df, comp_reg, comp_df,
                                            normalize=False)
                resid = pd.DataFrame(resid, index=df.index, columns=df.columns)

                hdf.save(df, f'/raw/{grp}_{key}', overwrite=False)
                hdf.save(reg, f'/regressors/{grp}_{key}', overwrite=False)
                hdf.save(resid, f'/processed/{grp}_{key}', overwrite=False)
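

# a minimal sketch of the residualization step above (an assumption for
# illustration -- see `netneurotools.stats.residualize` for the actual
# implementation): betas are estimated from the comparison (HC) group via
# least squares and then removed from the target group's data
def _residualize_sketch(reg, data, comp_reg, comp_data):
    # fit regressor betas on the comparison group only
    betas, *_ = np.linalg.lstsq(np.asarray(comp_reg), np.asarray(comp_data),
                                rcond=None)
    # subtract the fitted component from the target group's data
    return np.asarray(data) - np.asarray(reg) @ betas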