Example #1
import warnings

import numpy as np
import pandas as pd
import six
from nltools.data import Design_Matrix


def onsets_to_dm(F,
                 sampling_freq,
                 run_length,
                 header='infer',
                 sort=False,
                 keep_separate=True,
                 add_poly=None,
                 unique_cols=[],
                 fill_na=None,
                 **kwargs):
    """
    This function can assist in reading in one or several 2-3 column onsets files, specified in seconds, and converting them to a Design Matrix organized as samples X Stimulus Classes. Onsets files **must** be organized with columns in one of the following 4 formats:

    1) 'Stim, Onset'
    2) 'Onset, Stim'
    3) 'Stim, Onset, Duration'
    4) 'Onset, Duration, Stim'

    No other file organizations are currently supported

    Args:
        F (filepath/DataFrame/list): path to file, pandas dataframe, or list of files or pandas dataframes
        sampling_freq (float): sampling frequency in hertz; for TRs use (1 / TR)
        run_length (int): number of TRs in the run these onsets came from
        sort (bool, optional): whether to sort the columns of the resulting
                                design matrix alphabetically; defaults to
                                False
        add_poly (int, optional): what order polynomial terms to add as new columns (e.g. 0 for intercept, 1 for linear trend and intercept, etc.); defaults to None
        header (str,optional): None if missing header, otherwise pandas
                                header keyword; defaults to 'infer'
        keep_separate (bool): whether to keep polynomial columns separate if reading a list of files and using the add_poly option
        unique_cols (list): additional columns to keep separate across files (e.g. spikes)
        fill_na (str/int/float): what value to fill NaNs with if reading in a list of files
        kwargs: additional inputs to pandas.read_csv

    Returns:
        Design_Matrix class

    """
    if not isinstance(F, list):
        F = [F]

    out = []
    TR = 1. / sampling_freq
    for f in F:
        if isinstance(f, six.string_types):
            df = pd.read_csv(f, header=header, **kwargs)
        elif isinstance(f, pd.core.frame.DataFrame):
            df = f.copy()
        else:
            raise TypeError("Input needs to be file path or pandas dataframe!")
        if df.shape[1] == 2:
            warnings.warn(
                "Only 2 columns in file, assuming all stimuli are the same duration"
            )
        elif df.shape[1] == 1 or df.shape[1] > 3:
            raise ValueError("Can only handle files with 2 or 3 columns!")

        # Try to infer the header
        if header is None:
            possibleHeaders = ['Stim', 'Onset', 'Duration']
            if isinstance(df.iloc[0, 0], six.string_types):
                df.columns = possibleHeaders[:df.shape[1]]
            elif isinstance(df.iloc[0, df.shape[1] - 1], six.string_types):
                # handles both 'Onset, Stim' and 'Onset, Duration, Stim'
                df.columns = possibleHeaders[1:df.shape[1]] + [possibleHeaders[0]]
            else:
                raise ValueError(
                    "Can't figure out onset file organization. Make sure file has no more than 3 columns specified as 'Stim,Onset,Duration' or 'Onset,Duration,Stim'"
                )
        df['Onset'] = df['Onset'].apply(lambda x: int(np.floor(x / TR)))

        # Build dummy codes
        X = Design_Matrix(np.zeros([run_length,
                                    len(df['Stim'].unique())]),
                          columns=df['Stim'].unique(),
                          sampling_freq=sampling_freq)
        for i, row in df.iterrows():
            if df.shape[1] == 3:
                dur = int(np.ceil(row['Duration'] / TR))
                X.loc[row['Onset'] - 1:row['Onset'] + dur - 1, row['Stim']] = 1
            elif df.shape[1] == 2:
                X.loc[row['Onset'], row['Stim']] = 1
        if sort:
            X = X.reindex(sorted(X.columns), axis=1)

        out.append(X)
    if len(out) > 1:
        out_dm = out[0].append(out[1:],
                               keep_separate=keep_separate,
                               add_poly=add_poly,
                               unique_cols=unique_cols,
                               fill_na=fill_na)
    else:
        if add_poly is not None:
            out_dm = out[0].add_poly(add_poly)
        else:
            out_dm = out[0]

    return out_dm
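
A minimal usage sketch for the function above, assuming a hypothetical in-memory onsets table (the stimulus names, onset times, and run length are illustrative only):

# Hypothetical 'Stim, Onset, Duration' table, with times in seconds.
onsets = pd.DataFrame({'Stim': ['face', 'house', 'face'],
                       'Onset': [0.0, 10.0, 22.0],
                       'Duration': [4.0, 4.0, 4.0]})

# TR = 2s, so sampling_freq = 1 / 2.0 = 0.5 Hz; run_length is the number of TRs.
dm = onsets_to_dm(onsets, sampling_freq=1 / 2.0, run_length=20, sort=True)
print(dm.shape)  # (20, 2): 20 TRs x 2 stimulus classes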
Example #2
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from nltools.data import Design_Matrix


def onsets_to_dm(
    F,
    sampling_freq,
    run_length,
    header="infer",
    sort=False,
    keep_separate=True,
    add_poly=None,
    unique_cols=None,
    fill_na=None,
    **kwargs,
):
    """
    This function can assist in reading in one or several 2-3 column onsets files, specified in seconds, and converting them to a Design Matrix organized as samples X Stimulus Classes. sampling_freq should be specified in hertz; for TRs use hertz = 1 / TR. Onsets files **must** be organized with columns in one of the following 4 formats:

    1) 'Stim, Onset'
    2) 'Onset, Stim'
    3) 'Stim, Onset, Duration'
    4) 'Onset, Duration, Stim'

    No other file organizations are currently supported. *Note:* Stimulus offsets (onset + duration) that fall inside an adjacent TR's window include that full TR. E.g. an offset of 10.16s with TR = 2 ends at TR 5, which spans 10-12s, rather than at TR 4, which spans 8-10s.

    Args:
        F (filepath/DataFrame/list): path to file, pandas dataframe, or list of files or pandas dataframes
        sampling_freq (float): sampling frequency in hertz; for TRs use (1 / TR)
        run_length (int): number of TRs in the run these onsets came from
        sort (bool, optional): whether to sort the columns of the resulting
                                design matrix alphabetically; defaults to
                                False
        add_poly (int, optional): what order polynomial terms to add as new columns (e.g. 0 for intercept, 1 for linear trend and intercept, etc.); defaults to None
        header (str,optional): None if missing header, otherwise pandas
                                header keyword; defaults to 'infer'
        keep_separate (bool): whether to keep polynomial columns separate if reading a list of files and using the add_poly option; defaults to True
        unique_cols (list, optional): additional columns to keep separate across files (e.g. spikes); defaults to None
        fill_na (str/int/float, optional): what value to fill NaNs with if reading in a list of files; defaults to None
        kwargs: additional inputs to pandas.read_csv

    Returns:
        Design_Matrix class

    """

    if not isinstance(F, list):
        F = [F]

    if not isinstance(sampling_freq, (float, np.floating)):
        raise TypeError("sampling_freq must be a float")

    out = []
    TR = 1.0 / sampling_freq
    for f in F:
        if isinstance(f, str) or isinstance(f, Path):
            df = pd.read_csv(f, header=header, **kwargs)
        elif isinstance(f, pd.core.frame.DataFrame):
            df = f.copy()
        else:
            raise TypeError("Input needs to be file path or pandas dataframe!")
        # Keep an unaltered copy of the original dataframe for checking purposes below
        data = df.copy()

        if df.shape[1] == 2:
            warnings.warn(
                "Only 2 columns in file, assuming all stimuli are the same duration"
            )
        elif df.shape[1] == 1 or df.shape[1] > 3:
            raise ValueError("Can only handle files with 2 or 3 columns!")

        # Try to infer the header
        if header is None:
            possibleHeaders = ["Stim", "Onset", "Duration"]
            if isinstance(df.iloc[0, 0], str):
                df.columns = possibleHeaders[: df.shape[1]]
            elif isinstance(df.iloc[0, df.shape[1] - 1], str):
                # handles both 'Onset, Stim' and 'Onset, Duration, Stim'
                df.columns = possibleHeaders[1 : df.shape[1]] + [possibleHeaders[0]]
            else:
                raise ValueError(
                    "Can't figure out onset file organization. Make sure file has no more than 3 columns specified as 'Stim,Onset,Duration' or 'Onset,Duration,Stim'"
                )

        # Compute an offset in seconds if a Duration is provided
        if df.shape[1] == 3:
            df["Offset"] = df["Onset"] + df["Duration"]
        # Onset always starts at the closest TR rounded down, e.g.
        # with TR = 2, and onset = 10.1 or 11.7 will both have onset of TR 5 as it spans the window 10-12s
        df["Onset"] = df["Onset"].apply(lambda x: int(np.floor(x / TR)))

        # Offset includes the subsequent TR if the offset falls within the window covered by that TR,
        # but not if it falls exactly on the subsequent TR boundary, e.g. if TR = 2 and offset = 10.16, then TR 5 will be included, but if offset = 10.00, TR 5 will not be included, as TR 5 covers the window 10-12s
        if "Offset" in df.columns:

            def conditional_round(x, TR):
                """Conditional rounding to the next TR if offset falls within window, otherwise not"""
                dur_in_TRs = x / TR
                dur_in_TRs_rounded_down = np.floor(dur_in_TRs)
                # If in the future we wanted to enable the ability to include a TR based on a % of that TR we can change the next line to compare to some value, e.g. at least 0.5s into that TR: dur_in_TRs - dur_in_TRs_rounded_down > 0.5
                # cast to int so that .loc slicing on the integer index gets integer labels
                if dur_in_TRs > dur_in_TRs_rounded_down:
                    return int(dur_in_TRs_rounded_down)
                else:
                    return int(dur_in_TRs_rounded_down) - 1

            # Apply function
            df["Offset"] = df["Offset"].apply(conditional_round, args=(TR,))

        # Build dummy codes
        X = Design_Matrix(
            np.zeros([run_length, df["Stim"].nunique()]),
            columns=df["Stim"].unique(),
            sampling_freq=sampling_freq,
        )
        for i, row in df.iterrows():
            if "Offset" in df.columns:
                X.loc[row["Onset"] : row["Offset"], row["Stim"]] = 1
            else:
                X.loc[row["Onset"], row["Stim"]] = 1
        # Run a check
        if "Offset" in df.columns:
            onsets = X.sum().values
            stim_counts = data.Stim.value_counts(sort=False)[X.columns]
            # reorder to match X.columns, since groupby sorts stimuli alphabetically
            durations = data.groupby("Stim").Duration.mean()[X.columns].values
            for i, (o, c, d) in enumerate(zip(onsets, stim_counts, durations)):
                if c * (d / TR) <= o <= c * ((d / TR) + 1):
                    pass
                else:
                    warnings.warn(
                        f"Computed onsets for {data.Stim.unique()[i]} are inconsistent with expected values. Please manually verify the outputted Design_Matrix!"
                    )

        if sort:
            X = X.reindex(sorted(X.columns), axis=1)

        out.append(X)
    if len(out) > 1:
        if add_poly is not None:
            out = [e.add_poly(add_poly) for e in out]

        out_dm = out[0].append(
            out[1:],
            keep_separate=keep_separate,
            unique_cols=unique_cols,
            fill_na=fill_na,
        )
    else:
        out_dm = out[0]
        if add_poly is not None:
            out_dm = out_dm.add_poly(add_poly)
        if fill_na is not None:
            out_dm = out_dm.fill_na(fill_na)

    return out_dm
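
A quick numeric check of the rounding rule described in the docstring above (the TR and event times are illustrative):

import numpy as np

TR = 2.0
# Onsets always round down to the TR whose window contains them:
print(int(np.floor(10.1 / TR)))  # -> 5 (10.1s falls in TR 5's 10-12s window)
print(int(np.floor(11.7 / TR)))  # -> 5 (same window)

# Offsets include a TR only if they fall strictly inside its window,
# mirroring conditional_round above:
for offset in (10.16, 10.0):
    dur_in_TRs = offset / TR
    rounded_down = np.floor(dur_in_TRs)
    last_TR = rounded_down if dur_in_TRs > rounded_down else rounded_down - 1
    print(offset, '->', int(last_TR))  # 10.16 -> 5, 10.0 -> 4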
Example #3
def BIDS_to_dm(F,
               sampling_freq,
               run_length,
               trial_col='trial_type',
               parametric_cols=None,
               sort=False,
               keep_separate=True,
               add_poly=None,
               unique_cols=[],
               fill_na=None,
               **kwargs):
    """
        **
        Modified from nltools.file_reader.onsets_to_dm to accommodate BIDS files,
        customize naming of the trial_type column, and allow parametric modulators.
        **
    This function can assist in reading in one or several BIDS-formatted events files, specified in seconds, and converting them to a Design Matrix organized as samples X Stimulus Classes.
    Onsets files **must** be organized with columns in the following format:
        1) 'onset, duration, trial_type'

    This can handle multiple runs being given at once (if F is a list), and by default uses separate contrasts for each run.

    Args:
        F (filepath/DataFrame/list): path to file, pandas dataframe, or list of files or pandas dataframes
        sampling_freq (float): sampling frequency in hertz; for TRs use (1 / TR)
        run_length (int): number of TRs in the run these onsets came from
        trial_col (string): which column to use to specify stimuli/trials; defaults to 'trial_type'
        parametric_cols (list of lists of strings):
            e.g. [['condition1', 'parametric1', 'no_cent', 'no_norm'],
                  ['condition2', 'parametric2', 'cent', 'norm']]
            in each entry:
                entry 1 is a condition within the trial_col
                entry 2 is a column in the events file referenced by F
                entry 3 is either 'no_cent' or 'cent', indicating whether to center the parametric variable
                entry 4 is either 'no_norm' or 'norm', indicating whether to normalize the parametric variable
            The condition column specified by entry 1 will be multiplied by the
            parametric weighting specified by entry 2, scaled/centered as specified, then
            appended to the design matrix.
        sort (bool, optional): whether to sort the columns of the resulting
                                design matrix alphabetically; defaults to
                                False
        keep_separate (bool): whether to keep polynomial columns separate if reading a list of files and using the add_poly option
        add_poly (int, optional): what order polynomial terms to add as new columns (e.g. 0 for intercept, 1 for linear trend and intercept, etc.); defaults to None
        unique_cols (list): additional columns to keep separate across files (e.g. spikes)
        fill_na (str/int/float): what value to fill NaNs with if reading in a list of files
        kwargs: additional inputs to pandas.read_csv
    Returns:
        Design_Matrix class
    """
    import pandas as pd
    import numpy as np
    import six
    from nltools.data import Design_Matrix
    from sklearn.preprocessing import scale
    import warnings

    if not isinstance(F, list):
        F = [F]
    out = []
    TR = 1.0 / sampling_freq  # convert sampling frequency (hertz) to TR (seconds)

    for f in F:  ## Loading event files.
        if isinstance(f, six.string_types):  # load if file.
            if f.split('.')[-1] == 'tsv':
                # if .tsv, load with tab separation.
                df = pd.read_csv(f, sep='\t', **kwargs)
            else:
                df = pd.read_csv(f, **kwargs)  # TODO, replace in final code.
        elif isinstance(f, pd.core.frame.DataFrame):  #copy if dataframe.
            df = f.copy()
        else:
            raise TypeError("Input needs to be file path or pandas dataframe!")
        # Set onset to closest prior TR.
        df['onset'] = df['onset'].apply(lambda x: int(np.floor(x / TR)))
        ### Build dummy codes for trial column
        X = Design_Matrix(np.zeros([run_length,
                                    len(df[trial_col].unique())]),
                          columns=df[trial_col].unique(),
                          sampling_freq=sampling_freq)
        # For each event, mark a contrast for its duration in the design matrix.
        for i, row in df.iterrows():
            dur = int(np.ceil(row['duration'] / TR))  # round duration up to the next TR.
            X.loc[row['onset'] - 1:row['onset'] + dur - 1, row[trial_col]] = 1
        if sort:
            X = X.reindex(sorted(X.columns), axis=1)  # sort columns.
        ## Parametric modulation, if necessary.
        if parametric_cols:
            # combine parametric_col indicators to generate new column names.
            par_names = [var[0] + '_' + var[1] for var in parametric_cols]
            XP = Design_Matrix(np.zeros([run_length,
                                         len(par_names)]),
                               columns=par_names,
                               sampling_freq=sampling_freq)
            for idx, cond_par in enumerate(parametric_cols):
                cond = cond_par[0]  # get condition to parametrically modulate
                par = cond_par[1]  # get name of parametric modulator
                print('modulating condition', cond, 'by parametric modulator', par)
                if cond_par[2] == 'cent':
                    with_mean = True
                elif cond_par[2] == 'no_cent':
                    with_mean = False
                else:  # guard against typos, which would otherwise leave with_mean unset
                    raise ValueError("entry 3 must be 'cent' or 'no_cent'")
                if cond_par[3] == 'norm':
                    with_std = True
                elif cond_par[3] == 'no_norm':
                    with_std = False
                else:  # same guard for with_std
                    raise ValueError("entry 4 must be 'norm' or 'no_norm'")
                # scale/center the parametric modulator as requested.
                df[par_names[idx]] = scale(df[par], with_mean=with_mean, with_std=with_std)
                for i, row in df.iterrows():
                    if row[trial_col] == cond:
                        dur = int(np.ceil(row['duration'] / TR))  # round duration up to the next TR.
                        if np.isnan(row[par]):  # check for missing data.
                            print('NaN found in parameter', par, 'at onset:', row['onset'])
                            # zero out all data within the missing area
                            XP.loc[row['onset'] - 1:row['onset'] + dur - 1] = 0
                        else:
                            # multiply the dummy code by the parametric modulator.
                            XP.loc[row['onset'] - 1:row['onset'] + dur - 1,
                                   par_names[idx]] = 1 * row[par_names[idx]]
            # join the parametric variables to the design.
            X = Design_Matrix(pd.concat([X, XP], axis=1),
                              sampling_freq=sampling_freq)
        out.append(X)  # append to other runs, if multiple runs.
    if len(out) > 1:
        out_dm = out[0].append(out[1:],
                               keep_separate=keep_separate,
                               add_poly=add_poly,
                               unique_cols=unique_cols,
                               fill_na=fill_na)
    else:
        if add_poly is not None:
            out_dm = out[0].add_poly(add_poly)
        else:
            out_dm = out[0]
    return out_dm
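
A minimal usage sketch for BIDS_to_dm, assuming a hypothetical BIDS-style events table (the 'rating' modulator column, event times, and run length are illustrative only):

import pandas as pd

# Hypothetical BIDS-style events table, with onset/duration in seconds.
events = pd.DataFrame({'onset': [0.0, 10.0, 22.0],
                       'duration': [4.0, 4.0, 4.0],
                       'trial_type': ['gain', 'loss', 'gain'],
                       'rating': [2.0, 3.5, 1.0]})

# Modulate the 'gain' condition by 'rating', centered but not normalized; TR = 2s.
dm = BIDS_to_dm(events,
                sampling_freq=1 / 2.0,
                run_length=20,
                parametric_cols=[['gain', 'rating', 'cent', 'no_norm']])
print(list(dm.columns))  # ['gain', 'loss', 'gain_rating']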