import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

from nltools.data import Design_Matrix


def onsets_to_dm(
    F,
    sampling_freq,
    run_length,
    header="infer",
    sort=False,
    keep_separate=True,
    add_poly=None,
    unique_cols=None,
    fill_na=None,
    **kwargs,
):
    """
    This function can assist in reading one or several 2-3 column onset files,
    specified in seconds, and converting them to a Design Matrix organized as
    samples X Stimulus Classes. sampling_freq should be specified in hertz; for
    TRs use hertz = 1/TR.

    Onset files **must** be organized with columns in one of the following 4 formats:

    1) 'Stim, Onset'
    2) 'Onset, Stim'
    3) 'Stim, Onset, Duration'
    4) 'Onset, Duration, Stim'

    No other file organizations are currently supported.

    *Note:* Stimulus offsets (onset + duration) that fall into an adjacent TR
    include that full TR. E.g. an offset of 10.16s with TR = 2 has an offset of
    TR 5, which spans 10-12s, rather than an offset of TR 4, which spans 8-10s.

    Args:
        F (filepath/DataFrame/list): path to file, pandas dataframe, or list of files or pandas dataframes
        sampling_freq (float): sampling frequency in hertz; for TRs use (1 / TR)
        run_length (int): number of TRs in the run these onsets came from
        header (str, optional): None if missing header, otherwise pandas header keyword; defaults to 'infer'
        sort (bool, optional): whether to sort the columns of the resulting design matrix alphabetically; defaults to False
        keep_separate (bool, optional): whether to keep polynomial columns separate if reading a list of files and using the add_poly option; defaults to True
        add_poly (int, optional): what order polynomial terms to add as new columns (e.g. 0 for intercept, 1 for linear trend and intercept, etc.); defaults to None
        unique_cols (list, optional): additional columns to keep separate across files (e.g. spikes); defaults to None
        fill_na (str/int/float, optional): what value to fill NaNs with if reading in a list of files; defaults to None
        kwargs: additional inputs to pandas.read_csv

    Returns:
        Design_Matrix class
    """

    if not isinstance(F, list):
        F = [F]

    if not isinstance(sampling_freq, (float, np.floating)):
        raise TypeError("sampling_freq must be a float")

    out = []
    TR = 1.0 / sampling_freq
    for f in F:
        if isinstance(f, (str, Path)):
            df = pd.read_csv(f, header=header, **kwargs)
        elif isinstance(f, pd.core.frame.DataFrame):
            df = f.copy()
        else:
            raise TypeError("Input needs to be file path or pandas dataframe!")

        # Keep an unaltered copy of the original dataframe for checking purposes below
        data = df.copy()

        if df.shape[1] == 2:
            warnings.warn(
                "Only 2 columns in file, assuming all stimuli are the same duration"
            )
        elif df.shape[1] == 1 or df.shape[1] > 3:
            raise ValueError("Can only handle files with 2 or 3 columns!")

        # Try to infer the header
        if header is None:
            possibleHeaders = ["Stim", "Onset", "Duration"]
            if isinstance(df.iloc[0, 0], str):
                df.columns = possibleHeaders[: df.shape[1]]
            elif isinstance(df.iloc[0, df.shape[1] - 1], str):
                df.columns = possibleHeaders[1:] + [possibleHeaders[0]]
            else:
                raise ValueError(
                    "Can't figure out onset file organization. Make sure file has no more than 3 columns specified as 'Stim,Onset,Duration' or 'Onset,Duration,Stim'"
                )

        # Compute an offset in seconds if a Duration is provided
        if df.shape[1] == 3:
            df["Offset"] = df["Onset"] + df["Duration"]

        # Onset always starts at the closest TR rounded down, e.g. with TR = 2,
        # onsets of 10.1 and 11.7 both map to TR 5, as it spans the window 10-12s
        df["Onset"] = df["Onset"].apply(lambda x: int(np.floor(x / TR)))

        # Offset includes the subsequent TR if the Offset falls within the window covered
        # by that TR, but not if it falls exactly on the subsequent TR, e.g. if TR = 2 and
        # offset = 10.16, then TR 5 will be included, but if offset = 10.00, TR 5 will not
        # be included, as it covers the window 10-12s
        if "Offset" in df.columns:

            def conditional_round(x, TR):
                """Conditionally include the next TR if the offset falls within its window, otherwise not."""
                dur_in_TRs = x / TR
                dur_in_TRs_rounded_down = np.floor(dur_in_TRs)
                # If in the future we wanted to include a TR based on a % of that TR, we
                # could change the next comparison to some threshold, e.g. at least 0.5
                # of a TR: dur_in_TRs - dur_in_TRs_rounded_down > 0.5
                if dur_in_TRs > dur_in_TRs_rounded_down:
                    return dur_in_TRs_rounded_down
                else:
                    return dur_in_TRs_rounded_down - 1

            # Apply function
            df["Offset"] = df["Offset"].apply(conditional_round, args=(TR,))

        # Build dummy codes
        X = Design_Matrix(
            np.zeros([run_length, df["Stim"].nunique()]),
            columns=df["Stim"].unique(),
            sampling_freq=sampling_freq,
        )
        for i, row in df.iterrows():
            if "Offset" in df.columns:
                X.loc[row["Onset"] : row["Offset"], row["Stim"]] = 1
            else:
                X.loc[row["Onset"], row["Stim"]] = 1

        # Run a check: the number of 1s per column should match the stimulus count
        # times the duration in TRs, give or take one TR per presentation
        if "Offset" in df.columns:
            onsets = X.sum().values
            stim_counts = data.Stim.value_counts(sort=False)[X.columns]
            # Mean durations aligned to X's column order (groupby sorts by key)
            durations = data.groupby("Stim").Duration.mean()[X.columns].values
            for i, (o, c, d) in enumerate(zip(onsets, stim_counts, durations)):
                if not c * (d / TR) <= o <= c * ((d / TR) + 1):
                    warnings.warn(
                        f"Computed onsets for {data.Stim.unique()[i]} are inconsistent with expected values. Please manually verify the outputted Design_Matrix!"
                    )

        if sort:
            X = X.reindex(sorted(X.columns), axis=1)

        out.append(X)

    if len(out) > 1:
        if add_poly is not None:
            out = [e.add_poly(add_poly) for e in out]
        out_dm = out[0].append(
            out[1:],
            keep_separate=keep_separate,
            unique_cols=unique_cols,
            fill_na=fill_na,
        )
    else:
        out_dm = out[0]
        if add_poly is not None:
            out_dm = out_dm.add_poly(add_poly)
        if fill_na is not None:
            out_dm = out_dm.fill_na(fill_na)

    return out_dm
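
# A minimal usage sketch for onsets_to_dm. The onsets dataframe below is
# hypothetical; any 'Stim,Onset' or 'Stim,Onset,Duration' style input described
# in the docstring works the same way. Kept as a comment so importing this
# module has no side effects.
#
#   onsets = pd.DataFrame({
#       "Stim": ["face", "house", "face"],
#       "Onset": [0.0, 10.1, 20.5],    # seconds; 10.1s with TR = 2 maps to TR 5
#       "Duration": [2.0, 2.0, 2.0],   # seconds
#   })
#   dm = onsets_to_dm(onsets, sampling_freq=1 / 2.0, run_length=100, add_poly=1)
#   # dm is a Design_Matrix with 100 rows, one dummy-coded column per stimulus
#   # class ('face', 'house'), plus intercept and linear-trend poly columns
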
def BIDS_to_dm(
    F,
    sampling_freq,
    run_length,
    trial_col="trial_type",
    parametric_cols=None,
    sort=False,
    keep_separate=True,
    add_poly=None,
    unique_cols=None,
    fill_na=None,
    **kwargs,
):
    """
    ** Modified from nltools.file_reader.onsets_to_dm to accommodate BIDS files,
    customize the naming of the trial_type column, and allow parametric modulators. **

    This function can assist in reading one or several BIDS-formatted events
    files, specified in seconds, and converting them to a Design Matrix organized
    as samples X Stimulus Classes.

    Events files **must** be organized with columns in the following format:

    1) 'onset, duration, trial_type'

    This can handle multiple runs being given at once (if F is a list), and by
    default uses separate contrasts for each run.

    Args:
        F (filepath/DataFrame/list): path to file, pandas dataframe, or list of files or pandas dataframes
        sampling_freq (float): sampling frequency in hertz; for TRs use (1 / TR)
        run_length (int): number of TRs in the run these onsets came from
        trial_col (string): which column should be used to specify stimuli/trials?
        parametric_cols (list of lists of strings, optional): e.g.
            [['condition1', 'parametric1', 'no_cent', 'no_norm'],
             ['condition2', 'parametric2', 'cent', 'norm']]
            In each entry:
                entry 1 is a condition within the trial_col
                entry 2 is a column in the events file referenced by F
                entry 3 is either 'no_cent' or 'cent', indicating whether to center the parametric variable
                entry 4 is either 'no_norm' or 'norm', indicating whether to normalize the parametric variable
            The condition column specified by entry 1 will be multiplied by the
            parametric weighting specified by entry 2, scaled/centered as
            specified, then appended to the design matrix.
        sort (bool, optional): whether to sort the columns of the resulting design matrix alphabetically; defaults to False
        keep_separate (bool, optional): whether to keep polynomial columns separate if reading a list of files and using the add_poly option; defaults to True
        add_poly (int, optional): what order polynomial terms to add as new columns (e.g. 0 for intercept, 1 for linear trend and intercept, etc.); defaults to None
        unique_cols (list, optional): additional columns to keep separate across files (e.g. spikes); defaults to None
        fill_na (str/int/float, optional): what value to fill NaNs with if reading in a list of files; defaults to None
        kwargs: additional inputs to pandas.read_csv

    Returns:
        Design_Matrix class
    """

    if not isinstance(F, list):
        F = [F]

    out = []
    TR = 1.0 / sampling_freq
    for f in F:
        # Load the events file
        if isinstance(f, (str, Path)):
            if str(f).endswith(".tsv"):
                # BIDS events files are tab-separated
                df = pd.read_csv(f, sep="\t", **kwargs)
            else:
                df = pd.read_csv(f, **kwargs)  # TODO, replace in final code.
        elif isinstance(f, pd.core.frame.DataFrame):
            df = f.copy()
        else:
            raise TypeError("Input needs to be file path or pandas dataframe!")

        # Set onset to the closest prior TR
        df["onset"] = df["onset"].apply(lambda x: int(np.floor(x / TR)))

        # Build dummy codes for the trial column
        X = Design_Matrix(
            np.zeros([run_length, df[trial_col].nunique()]),
            columns=df[trial_col].unique(),
            sampling_freq=sampling_freq,
        )
        # For each entry in the events file, mark a contrast for the duration
        # in the design matrix
        for i, row in df.iterrows():
            dur = np.ceil(row["duration"] / TR)  # round duration up to the next TR
            X.loc[row["onset"] - 1 : row["onset"] + dur - 1, row[trial_col]] = 1

        if sort:
            X = X.reindex(sorted(X.columns), axis=1)  # sort columns

        # Parametric modulation, if requested
        if parametric_cols:
            # Combine parametric_col indicators to generate new column names
            par_names = [var[0] + "_" + var[1] for var in parametric_cols]
            XP = Design_Matrix(
                np.zeros([run_length, len(par_names)]),
                columns=par_names,
                sampling_freq=sampling_freq,
            )
            for idx, cond_par in enumerate(parametric_cols):
                cond = cond_par[0]  # condition to parametrically modulate
                par = cond_par[1]  # name of the parametric modulator
                print("modulating condition", cond, "by parametric modulator", par)
                if cond_par[2] == "cent":
                    with_mean = True
                elif cond_par[2] == "no_cent":
                    with_mean = False
                else:
                    raise ValueError("Entry 3 of each parametric_cols item must be 'cent' or 'no_cent'")
                if cond_par[3] == "norm":
                    with_std = True
                elif cond_par[3] == "no_norm":
                    with_std = False
                else:
                    raise ValueError("Entry 4 of each parametric_cols item must be 'norm' or 'no_norm'")
                # Scale/center the parametric modulator
                df[par_names[idx]] = scale(df[par], with_mean=with_mean, with_std=with_std)
                for i, row in df.iterrows():
                    if row[trial_col] == cond:
                        dur = np.ceil(row["duration"] / TR)  # round duration up to the next TR
                        if np.isnan(row[par]):  # check for missing data
                            print("NaN found in parameter", par, "at onset:", row["onset"])
                            # Zero out all data within the missing area
                            XP.loc[row["onset"] - 1 : row["onset"] + dur - 1] = 0
                        else:
                            # Multiply the dummy code by the parametric modulator
                            XP.loc[
                                row["onset"] - 1 : row["onset"] + dur - 1, par_names[idx]
                            ] = 1 * row[par_names[idx]]
            # Join the parametric variables to the design
            X = Design_Matrix(pd.concat([X, XP], axis=1), sampling_freq=sampling_freq)

        out.append(X)  # append to other runs, if multiple runs

    if len(out) > 1:
        if add_poly is not None:
            out = [e.add_poly(add_poly) for e in out]
        out_dm = out[0].append(
            out[1:],
            keep_separate=keep_separate,
            unique_cols=unique_cols,
            fill_na=fill_na,
        )
    else:
        out_dm = out[0]
        if add_poly is not None:
            out_dm = out_dm.add_poly(add_poly)

    return out_dm
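
# A minimal usage sketch for BIDS_to_dm. The events dataframe and the 'rt'
# modulator column below are hypothetical; real BIDS events.tsv files are
# tab-separated with at least 'onset', 'duration', and 'trial_type' columns.
#
#   events = pd.DataFrame({
#       "onset": [0.0, 12.0, 24.0],          # seconds
#       "duration": [2.0, 2.0, 2.0],         # seconds
#       "trial_type": ["gain", "loss", "gain"],
#       "rt": [0.61, 0.84, 0.55],            # per-trial reaction times
#   })
#   dm = BIDS_to_dm(events, sampling_freq=1 / 2.0, run_length=50,
#                   parametric_cols=[["gain", "rt", "cent", "no_norm"]])
#   # dm holds one dummy-coded column per trial_type ('gain', 'loss') plus a
#   # centered, unnormalized 'gain_rt' parametrically modulated column.
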