Ejemplo n.º 1
0
def generate_df(metaroi_vals, outdir):
    """Build and save a table of meta-ROI FDG values, one row per scan.

    Parameters
    ----------
    metaroi_vals : dict
        Maps each scan's filename to its mean FDG value
    outdir : string
        Full path where the FDG output files should be saved

    Returns
    -------
    metaroi_df : pandas DataFrame
        One row per FDG scan, with columns 'FDG_val' (the value as a float)
        and 'codea' (whatever cf.get_id returns for the scan's filename)
    """
    # dict keys land in the index; move them into a regular 'path' column
    scans = pd.DataFrame.from_dict(metaroi_vals, orient="index").reset_index(level=0)
    scans = scans.rename(columns={"index": "path", 0: "FDG_val"})

    scans["FDG_val"] = list(map(float, scans["FDG_val"]))
    scans["codea"] = list(map(cf.get_id, scans["path"]))

    # the filename is only needed to derive the id, so it is dropped
    metaroi_df = scans.drop("path", axis=1)

    cf.save_xls_and_pkl(metaroi_df, "fdg_metaroi", outdir)

    return metaroi_df
def codetranslator_run(codetblpath, outdir):
    """Takes an excel file as input and generates a pandas dataframe containing only
    matched pairs of codea and codeb.

    Parameters
    ----------
    codetblpath : string
        Hard path to the excel file holding code data. Expects two columns named
        'codeaGRAB' and 'codeb'.
    outdir : string
        Hard path to the directory to save the output file

    Returns
    -------
    codetbl : pandas DataFrame
        DataFrame with columns 'codea' and 'codeb'; rows missing either code
        are removed

    Renames 'codeaGRAB' to 'codea' and saves xls and pkl files named 'codetranslator'
    to outdir.
    """

    codetblin = pd.read_excel(codetblpath)
    codetbl = codetblin[['codeaGRAB', 'codeb']]
    # 'codeaGRAB' must become 'codea'; mapping it to 'codeb' would leave two
    # columns called 'codeb' and no 'codea' column to join other tables on
    codetbl = codetbl.rename(columns={'codeaGRAB': 'codea'})

    # keep only rows where both codes are present (matched pairs)
    codetbl = codetbl.dropna()

    cf.save_xls_and_pkl(codetbl, 'codetranslator', outdir)

    return codetbl
Ejemplo n.º 3
0
def datamerge_run(filenames, outdir, roc_cols):
    """Main function to merge all data

    Parameters
    ----------
    filenames : list
        List of strings corresponding to filename prefixes of pickle format in outdir
    outdir : string
        Full path of directory containing files in filenames
    roc_cols : list
        List of strings corresponding to column names for which rate of change should be
        calculated

    Returns
    -------
    tbldict : dict
        Dictionary of DataFrames each containing a different type of BACS data
    NPtbl : pandas DataFrame
        DataFrame where each row is a single subject's single cognitive testing session
    subjtbl : pandas DataFrame
        DataFrame where each row is a single subject
    """

    tbldict = collect2dict(filenames, outdir)
    tbldict = cogtest_manipulation(tbldict, roc_cols)

    # count number of timepoints (table name, id column, name of the count column)
    for tblname, idcol, countcol in (('cogtests', 'codeb', 'NP_NoTps'),
                                     ('aseg_change', 'codea', 'MRI_NoTps'),
                                     ('pibparams', 'codea', 'PIB_NoTps')):
        tbldict[tblname] = count_instances(tbldict[tblname], idcol, countcol)

    # flatten every table that has a timepoint ('*_Tp*') column; results are
    # collected separately so the dict is not resized while being iterated.
    # .items() (rather than .iteritems()) works on both Python 2 and 3.
    new_tbldict = {}
    for key, tbl in tbldict.items():
        tpcols = [col for col in tbl.columns if '_Tp' in col]
        if tpcols:
            # only the first matching column is used;
            # NOTE(review): [1, '1'] presumably selects the first timepoint - confirm in flatten
            tblflat, tblflatnm = flatten(tbl, tpcols[0], key, [1, '1'])
            new_tbldict[tblflatnm] = tblflat
    tbldict.update(new_tbldict)

    # make sure each table contains SubjID and BAC# fields; iterate over a
    # snapshot of the keys because entries are reassigned inside the loop
    for key in list(tbldict):
        tbldict[key] = addcodes(tbldict[key], tbldict['codetranslator'])

    joincol = ['codea', 'codeb']

    # merge tables for the subject table (starts from cogtests_flat)
    tblstojoin = ['cogtests_flat', 'pibparams_flat', 'aseg_change_flat', 'fdg_metaroi_flat', 'subjinfo']
    subjtbl = mergelots(tbldict, tblstojoin, joincol)

    # merge tables for the cognitive-session table (starts from unflattened cogtests)
    tblstojoin = ['cogtests', 'subjinfo', 'pibparams_flat', 'aseg_change_flat', 'fdg_metaroi_flat']
    NPtbl = mergelots(tbldict, tblstojoin, joincol)

    cf.save_xls_and_pkl(subjtbl, 'subjtbl', outdir)
    cf.save_xls_and_pkl(NPtbl, 'NPtbl', outdir)

    return tbldict, NPtbl, subjtbl
Ejemplo n.º 4
0
def mri_run(datadir, outdir, rois):
    """Main function to collect MRI volume data

    Parameters
    ----------
    datadir : string
        Full path to root directory of freesurfer processed data. Expect file tree
        to be datadir/subcode/stats/aseg.stats
    outdir : string
        Full path to directory where data will be saved
    rois : list of strings
        List of freesurfer rois of interest. These volumes of these rois will be
        inserted in aseg_change along with their rates of change

    Returns
    -------
    aseg_stats : pandas DataFrame
        DataFrame where each row is a scan, and columns are volumes of all
        freesurfer processed regions
    aseg_change : pandas DataFrame
        DataFrame where each row is a scan, and columns are the volumes of interest,
        their rates of change, and icv correction
    """
    import os  # local import: only needed to build the stats-file path

    #get aseg_stats data from freesurfer processed data
    # os.path.join gives the same path as string concatenation when outdir ends
    # with a separator, and a correct one when it does not
    outfile = os.path.join(outdir, 'FS_aseg_stats.txt')
    subs, _asegout, _output = extractFSasegstats(datadir, outfile)
    # sep=r'\s+' is the whitespace-delimited form that replaces delim_whitespace=True
    aseg_stats = pd.read_csv(outfile, header=0, sep=r'\s+')

    #add columns for SubjID and MRI_TP
    aseg_stats['codea'] = [cf.get_id(sub) for sub in subs]
    aseg_stats['MRI_Tp'] = [cf.get_tp(sub) for sub in subs]
    aseg_stats = aseg_stats.drop('Measure:volume', axis=1)

    #get dates of MRI scans that were processed with freesurfer
    mridates = bacs_pet_mri_date_batch(datadir)

    # aseg_change starts from the columns needed downstream (ids, intracranial
    # volume, rois of interest); it must be built from aseg_stats here because
    # nothing has been assigned to it before this point
    keep_cols = ['codea', 'MRI_Tp', 'IntraCranialVol']
    keep_cols.extend(roi for roi in rois if roi not in keep_cols)
    aseg_change = pd.merge(aseg_stats[keep_cols], mridates, on=['codea', 'MRI_Tp'])

    # map each roi to the name of its icv-corrected column
    rois_icvcorr = dict((roi, '%s_icvcorr' % roi) for roi in rois)

    aseg_change = icvcorr(aseg_change, rois_icvcorr, 'IntraCranialVol')

    #calculate rate of change in years
    for roi in rois:
        aseg_change = cf.rate_of_change(aseg_change, 'codea', 'MRI_Tp',
                                        'MRI_Scandate', roi, '%s_sl' % roi)

    cf.save_xls_and_pkl(aseg_stats, 'aseg_stats', outdir)
    cf.save_xls_and_pkl(aseg_change, 'aseg_change', outdir)

    return aseg_stats, aseg_change
Ejemplo n.º 5
0
def pibparams_run(path_pib, pibrename, outdir, pibcutoff):
    """Reads data from the spreadsheet, does some calculations, and
    returns a Pandas dataframe with PIB data.

    Parameters
    ----------
    path_pib : string
        String of full path to *.xls
    pibrename : dict
        Dictionary of name:rename pairs, where the keys are columns in the
        PIB spreadsheet and values are what to rename the keys to
    outdir : string
        Full path where final dataframe will be saved
    pibcutoff : float
        PIB cutoff value; scans with PIB_Index >= pibcutoff get PIB_Pos = 1

    Returns
    -------
    pib_df : pandas dataframe
        Dataframe containing all PIB data
    """

    # The sheet name is passed positionally because the keyword was renamed
    # from `sheetname` to `sheet_name` between pandas releases.
    #read in pib data from old sheet
    pib_old = pd.read_excel(path_pib, 'i')
    #read in PIB data from longitudinal timepoints
    pib_long = pd.read_excel(path_pib, 'j')

    #concatenate PIB tables
    pib_df = pd.concat([pib_long, pib_old])
    # plain list of the source column names (keys() is a view on Python 3)
    pib_df = pib_df[list(pibrename)]
    pib_df = pib_df.rename(columns=pibrename)

    #make binary PIB value
    pib_df['PIB_Pos'] = pib_df['PIB_Index'].apply(lambda x: 1 if x >= pibcutoff else 0)

    #calculate rate of change of PIB_Index in years
    pib_df = cf.rate_of_change(pib_df, 'codea', 'PIB_Tp', 'PIB_Scandate',
                               'PIB_Index', 'PIB_sl')

    # order scans by subject and timepoint; sort_values replaces the removed
    # DataFrame.sort(columns=...) and needs pandas >= 0.17
    pib_df = pib_df.sort_values(by=['codea', 'PIB_Tp'])

    #calculate age of PIB positivity
    pib_df['PIB_agepos'] = float('nan')
    # `f` is defined elsewhere in this module and is applied to each subject's rows;
    # NOTE(review): presumably it fills in PIB_agepos - confirm against its definition
    by_subject = pib_df.groupby(by='codea')
    pib_df = by_subject.apply(f)

    cf.save_xls_and_pkl(pib_df, 'pibparams', outdir)

    return pib_df
def factoranalysis_run(cogpth, blpth, wpth, outdir, rowind, cogtests_master, **kwargs):
    """Prepare cognitive session data, z-score it and apply factor weights.

    The three stages are delegated to cogprep, zscore and factorscores; the
    final factor-score table is saved to outdir under the name 'cogdata'.

    Parameters
    ----------
    cogpth : string
        Path pattern handed directly to glob to locate the excel sheets of
        cognitive data; matches are processed in sorted order
    blpth : string
        Full path to excel sheet holding cognitive data for a reference population
    wpth : string
        Full path to excel file holding factor weights
    outdir : string
        Full path where output files should be saved
    rowind : string
        Name of column representing the index value. Accepted for interface
        compatibility; not used inside this function
    cogtests_master : list
        List of strings naming the cognitive tests to include in the
        factor analysis
    **kwargs
        Accepted and ignored by this function

    Returns
    -------
    subjdata
        Output of cogprep for the located session files
    subjdata_z
        Output of zscore for subjdata against the reference population
    cogdata
        Output of factorscores for subjdata_z; this is the table that is saved
    """
    session_files = glob(cogpth)
    session_files.sort()

    sessions = cogprep(session_files, cogtests_master)
    sessions_z = zscore(blpth, sessions, cogtests_master)
    scores = factorscores(sessions_z, wpth, cogtests_master)

    cf.save_xls_and_pkl(scores, 'cogdata', outdir)

    return sessions, sessions_z, scores
def cogtestdates_run(path_cogdates, staticrename, outdir):
    """Reads cognitive testing dates into a dataframe

    Parameters
    ----------
    path_cogdates : string
        Path to the excel sheets holding the dates of cognitive testing
    staticrename : dict
        Dictionary where keys are existing names of columns in the cogdates
        spreadsheet, and values are what to rename the keys
    outdir : string
        Full path where output files should be saved

    Returns
    -------
    testing_out : pandas DataFrame
        DataFrame holding the dates of the cognitive tests for each subject
        at each timepoint
    subjinfo : pandas DataFrame
        DataFrame holding basic subject information
    """

    #import data with neuropsych test dates
    cogdates = pd.read_excel(path_cogdates)
    cogdates = cogdates.rename(columns=staticrename)
    # list() so it can be concatenated with a list on Python 3, where
    # dict.values() is a view
    staticcols = list(staticrename.values())

    #split table into basic subject variables, and ones that change with testing session
    # .copy() so the APOE columns below are added to an independent frame
    subjinfo = cogdates[['codea'] + staticcols].copy()
    testing = cogdates.drop(staticcols, axis=1)

    #make columns for APOE presence and dose
    subjinfo['APOE_presence'] = subjinfo.apply(APOE_presence, axis=1)
    subjinfo['APOE_dose'] = subjinfo.apply(APOE_dose, axis=1)

    #reconfigure testing table to put tp as row values
    testing_melted = pd.melt(testing, id_vars='codea', var_name='NP_Exam')

    #initiate regex statements to draw tp and test type from column names
    # column names are expected to look like '<digit>::<test type>'
    tp_regex = re.compile(r'(\d)::')
    type_regex = re.compile(r'::(.*)')
    refcol = testing_melted['NP_Exam'].tolist()

    #rename NP_Tp and NP_Type columns based on regex statements
    # names that do not match are skipped, so a non-conforming column name makes
    # the list shorter than the table and the assignment fails loudly
    testing_melted['NP_Tp'] = [m.group(1) for m in map(tp_regex.search, refcol) if m]
    testing_melted['NP_Type'] = [m.group(1) for m in map(type_regex.search, refcol) if m]
    testing_melted['subtp'] = testing_melted['codea'] + testing_melted['NP_Tp'].map(str)

    # strip digits, whitespace and underscores from the test type names
    pattern = re.compile(r'[\d\s_]+')
    testing_melted['NP_Type'] = [pattern.sub('', s) for s in testing_melted['NP_Type']]

    #reconfigure table to put tests in columns
    testing_piv = testing_melted.pivot(index='subtp', columns='NP_Type', values='value')
    testing_out = pd.merge(testing_piv.reset_index(), testing_melted, on='subtp')
    testing_out = testing_out.drop(['NP_Type', 'subtp', 'value', 'NP_Exam'], axis=1)
    testing_out = testing_out.drop_duplicates()
    testing_out = testing_out.rename(columns={'AgeatSession': 'NP_Age',
                                              'NeuropsychExamTestDate': 'NP_Date'})
    testing_out = testing_out.dropna(axis=0, subset=['NP_Date'])

    #add column for years relative to baseline
    # .copy() so the baseline rows can be renamed/dropped without touching testing_out
    timecalc = testing_out[testing_out['NP_Tp'] == '1'].copy()
    timecalc = timecalc.rename(columns={'NP_Age': 'NP_AgeBL', 'NP_Date': 'NP_DateBL'})
    timecalc = timecalc.drop(['NP_Tp'], axis=1)
    # NOTE(review): any other columns present in both frames get _x/_y suffixes
    # from this merge - confirm that is intended
    testing_out = pd.merge(testing_out, timecalc, on='codea')
    elapsed = pd.to_datetime(testing_out['NP_Date']) - pd.to_datetime(testing_out['NP_DateBL'])
    # whole days via the .dt accessor; astype('timedelta64[D]') is not accepted
    # by current pandas
    testing_out['NP_YrsRelBL'] = elapsed.dt.days / 365.25

    cf.save_xls_and_pkl(testing_out, 'cogtestdates', outdir)
    cf.save_xls_and_pkl(subjinfo, 'subjinfo', outdir)

    return testing_out, subjinfo