Example #1
def test_regression():
    # Test Adjacency Regression
    m1 = block_diag(np.ones((4, 4)), np.zeros((4, 4)), np.zeros((4, 4)))
    m2 = block_diag(np.zeros((4, 4)), np.ones((4, 4)), np.zeros((4, 4)))
    m3 = block_diag(np.zeros((4, 4)), np.zeros((4, 4)), np.ones((4, 4)))
    Y = Adjacency(m1 * 1 + m2 * 2 + m3 * 3, matrix_type='similarity')
    X = Adjacency([m1, m2, m3], matrix_type='similarity')

    stats = Y.regress(X)
    assert np.allclose(stats['beta'], np.array([1, 2, 3]))

    # Test Design_Matrix Regression
    n = 10
    d = Adjacency(
        [
            block_diag(np.ones((4, 4)) + np.random.randn(4, 4) * .1, np.zeros((8, 8)))
            for x in range(n)
        ],
        matrix_type='similarity')
    X = Design_Matrix(np.ones(n))
    stats = d.regress(X)
    out = stats['beta'].within_cluster_mean(clusters=['Group1'] * 4 +
                                            ['Group2'] * 8)
    assert np.allclose(
        np.array([out['Group1'], out['Group2']]), np.array([1, 0]),
        rtol=1e-01)  # np.allclose(np.sum(stats['beta']-np.array([1,2,3])),0)
Example #2
def test_regression():
    # Test Adjacency Regression
    m1 = block_diag(np.ones((4, 4)), np.zeros((4, 4)), np.zeros((4, 4)))
    m2 = block_diag(np.zeros((4, 4)), np.ones((4, 4)), np.zeros((4, 4)))
    m3 = block_diag(np.zeros((4, 4)), np.zeros((4, 4)), np.ones((4, 4)))
    Y = Adjacency(m1 * 1 + m2 * 2 + m3 * 3, matrix_type="similarity")
    X = Adjacency([m1, m2, m3], matrix_type="similarity")

    stats = Y.regress(X)
    assert np.allclose(stats["beta"], np.array([1, 2, 3]))

    # Test Design_Matrix Regression
    n = 10
    d = Adjacency(
        [
            block_diag(
                np.ones((4, 4)) + np.random.randn(4, 4) * 0.1, np.zeros(
                    (8, 8))) for _ in range(n)
        ],
        matrix_type="similarity",
    )
    X = Design_Matrix(np.ones(n))
    stats = d.regress(X)
    out = stats["beta"].cluster_summary(clusters=["Group1"] * 4 +
                                        ["Group2"] * 8,
                                        summary="within")
    assert np.allclose(
        np.array([out["Group1"], out["Group2"]]), np.array([1, 0]),
        rtol=1e-01)  # np.allclose(np.sum(stats['beta']-np.array([1,2,3])),0)
Example #3
@pytest.fixture
def sim_design_matrix():
    # pytest fixture consumed by the tests below (requires `import pytest`)
    # Design matrices are specified in terms of sampling frequency
    TR = 2.0
    sampling_freq = 1.0 / TR
    return Design_Matrix(np.random.randint(2, size=(500, 4)),
                         columns=['face_A', 'face_B', 'house_A', 'house_B'],
                         sampling_freq=sampling_freq)
def test_append(sim_design_matrix):
    mats = sim_design_matrix.append(sim_design_matrix)
    assert mats.shape[0] == sim_design_matrix.shape[0] * 2
    # Keep polys separate by default

    assert (mats.shape[1] - 4) == (sim_design_matrix.shape[1] - 4) * 2
    # Otherwise stack them
    mats = sim_design_matrix.append(sim_design_matrix, keep_separate=False)
    assert mats.shape[1] == sim_design_matrix.shape[1]
    assert mats.shape[0] == sim_design_matrix.shape[0] * 2

    # Keep a single stimulus column separate
    assert (sim_design_matrix.append(sim_design_matrix,
                                     unique_cols=["face_A"]).shape[1] == 5)

    # Keep a common stimulus class separate
    assert (sim_design_matrix.append(sim_design_matrix,
                                     unique_cols=["face*"]).shape[1] == 6)
    # Keep a common stimulus class and a different single stim separate
    assert (sim_design_matrix.append(sim_design_matrix,
                                     unique_cols=["face*",
                                                  "house_A"]).shape[1] == 7)
    # Keep multiple stimulus class separate
    assert (sim_design_matrix.append(sim_design_matrix,
                                     unique_cols=["face*",
                                                  "house*"]).shape[1] == 8)

    # Growing a multi-run design matrix; keeping things separate
    num_runs = 4
    all_runs = Design_Matrix(sampling_freq=0.5)
    for i in range(num_runs):
        run = Design_Matrix(
            np.array([
                [1, 0, 0, 0],
                [1, 0, 0, 0],
                [0, 0, 0, 0],
                [0, 1, 0, 0],
                [0, 1, 0, 0],
                [0, 0, 0, 0],
                [0, 0, 1, 0],
                [0, 0, 1, 0],
                [0, 0, 0, 0],
                [0, 0, 0, 1],
                [0, 0, 0, 1],
            ]),
            sampling_freq=0.5,
            columns=["stim_A", "stim_B", "cond_C", "cond_D"],
        )
        run = run.add_poly(2)
        all_runs = all_runs.append(run, unique_cols=["stim*", "cond*"])
    assert all_runs.shape == (44, 28)
Example #5
def test_append(sim_design_matrix):
    mats = sim_design_matrix.append(sim_design_matrix)
    assert mats.shape[0] == sim_design_matrix.shape[0] * 2
    # Keep polys separate by default

    assert (mats.shape[1] - 4) == (sim_design_matrix.shape[1] - 4) * 2
    # Otherwise stack them
    assert sim_design_matrix.append(sim_design_matrix,
                                    keep_separate=False).shape[1] == sim_design_matrix.shape[1]
    # Keep a single stimulus column separate
    assert sim_design_matrix.append(sim_design_matrix,
                                    unique_cols=['face_A']).shape[1] == 5

    # Keep a common stimulus class separate
    assert sim_design_matrix.append(sim_design_matrix,
                                    unique_cols=['face*']).shape[1] == 6
    # Keep a common stimulus class and a different single stim separate
    assert sim_design_matrix.append(sim_design_matrix,
                                    unique_cols=['face*', 'house_A']).shape[1] == 7
    # Keep multiple stimulus class separate
    assert sim_design_matrix.append(sim_design_matrix,
                                    unique_cols=['face*', 'house*']).shape[1] == 8

    # Growing a multi-run design matrix; keeping things separate
    num_runs = 4
    all_runs = Design_Matrix(sampling_freq=.5)
    for i in range(num_runs):
        run = Design_Matrix(np.array([
                                [1, 0, 0, 0],
                                [1, 0, 0, 0],
                                [0, 0, 0, 0],
                                [0, 1, 0, 0],
                                [0, 1, 0, 0],
                                [0, 0, 0, 0],
                                [0, 0, 1, 0],
                                [0, 0, 1, 0],
                                [0, 0, 0, 0],
                                [0, 0, 0, 1],
                                [0, 0, 0, 1]
                                ]),
                            sampling_freq=.5,
                            columns=['stim_A', 'stim_B', 'cond_C', 'cond_D']
                            )
        run = run.add_poly(2)
        all_runs = all_runs.append(run, unique_cols=['stim*', 'cond*'])
    assert all_runs.shape == (44, 28)
Example #6
def onsets_to_dm(
    F,
    sampling_freq,
    run_length,
    header="infer",
    sort=False,
    keep_separate=True,
    add_poly=None,
    unique_cols=None,
    fill_na=None,
    **kwargs,
):
    """
    This function can assist in reading in one or several 2-3 column onsets files, specified in seconds, and converting them to a Design_Matrix organized as samples X Stimulus Classes. sampling_freq should be specified in hertz; for TRs use hertz = 1/TR. Onsets files **must** be organized with columns in one of the following 4 formats:

    1) 'Stim, Onset'
    2) 'Onset, Stim'
    3) 'Stim, Onset, Duration'
    4) 'Onset, Duration, Stim'

    No other file organizations are currently supported. *Note:* Stimulus offsets (onset + duration) that fall into an adjacent TR include that full TR, e.g. an offset of 10.16s with TR = 2 falls in TR 5, which spans 10-12s, rather than TR 4, which spans 8-10s.

    Args:
        F (filepath/DataFrame/list): path to file, pandas dataframe, or list of files or pandas dataframes
        sampling_freq (float): sampling frequency in hertz; for TRs use (1 / TR)
        run_length (int): number of TRs in the run these onsets came from
        header (str, optional): None if missing header, otherwise pandas
                                header keyword; defaults to 'infer'
        sort (bool, optional): whether to sort the columns of the resulting
                                design matrix alphabetically; defaults to
                                False
        keep_separate (bool): whether to keep polynomial columns separate if
                                reading a list of files and using the add_poly
                                option; defaults to True
        add_poly (int, optional): what order polynomial terms to add as new
                                columns (e.g. 0 for intercept, 1 for linear
                                trend and intercept, etc); defaults to None
        unique_cols (list, optional): additional columns to keep separate
                                across files (e.g. spikes); defaults to None
        fill_na (str/int/float, optional): what value to fill NaNs with if
                                reading in a list of files; defaults to None
        kwargs: additional inputs to pandas.read_csv

    Returns:
        Design_Matrix class

    """

    if not isinstance(F, list):
        F = [F]

    if not isinstance(sampling_freq, (float, np.floating)):
        raise TypeError("sampling_freq must be a float")

    out = []
    TR = 1.0 / sampling_freq
    for f in F:
        if isinstance(f, str) or isinstance(f, Path):
            df = pd.read_csv(f, header=header, **kwargs)
        elif isinstance(f, pd.core.frame.DataFrame):
            df = f.copy()
        else:
            raise TypeError("Input needs to be file path or pandas dataframe!")
        # Keep an unaltered copy of the original dataframe for checking purposes below
        data = df.copy()

        if df.shape[1] == 2:
            warnings.warn(
                "Only 2 columns in file, assuming all stimuli are the same duration"
            )
        elif df.shape[1] == 1 or df.shape[1] > 3:
            raise ValueError("Can only handle files with 2 or 3 columns!")

        # Try to infer the header
        if header is None:
            possibleHeaders = ["Stim", "Onset", "Duration"]
            if isinstance(df.iloc[0, 0], str):
                df.columns = possibleHeaders[: df.shape[1]]
            elif isinstance(df.iloc[0, df.shape[1] - 1], str):
                df.columns = possibleHeaders[1:] + [possibleHeaders[0]]
            else:
                raise ValueError(
                    "Can't figure out onset file organization. Make sure file has no more than 3 columns specified as 'Stim,Onset,Duration' or 'Onset,Duration,Stim'"
                )

        # Compute an offset in seconds if a Duration is provided
        if df.shape[1] == 3:
            df["Offset"] = df["Onset"] + df["Duration"]
        # Onset always starts at the closest TR rounded down, e.g.
        # with TR = 2, and onset = 10.1 or 11.7 will both have onset of TR 5 as it spans the window 10-12s
        df["Onset"] = df["Onset"].apply(lambda x: int(np.floor(x / TR)))

        # The Offset includes the subsequent TR if the Offset falls within the window covered by that TR,
        # but not if it falls exactly on the subsequent TR, e.g. if TR = 2, and offset = 10.16, then TR 5 will be included but if offset = 10.00, TR 5 will not be included, as it covers the window 10-12s
        if "Offset" in df.columns:

            def conditional_round(x, TR):
                """Conditional rounding to the next TR if offset falls within window, otherwise not"""
                dur_in_TRs = x / TR
                dur_in_TRs_rounded_down = np.floor(dur_in_TRs)
                # If in the future we wanted to enable the ability to include a TR based on a % of that TR we can change the next line to compare to some value, e.g. at least 0.5s into that TR: dur_in_TRs - dur_in_TRs_rounded_down > 0.5
                if dur_in_TRs > dur_in_TRs_rounded_down:
                    return dur_in_TRs_rounded_down
                else:
                    return dur_in_TRs_rounded_down - 1

            # Apply function
            df["Offset"] = df["Offset"].apply(conditional_round, args=(TR,))

        # Build dummy codes
        X = Design_Matrix(
            np.zeros([run_length, df["Stim"].nunique()]),
            columns=df["Stim"].unique(),
            sampling_freq=sampling_freq,
        )
        for i, row in df.iterrows():
            if "Offset" in df.columns:
                X.loc[row["Onset"] : row["Offset"], row["Stim"]] = 1
            else:
                X.loc[row["Onset"], row["Stim"]] = 1
        # Run a check
        if "Offset" in df.columns:
            onsets = X.sum().values
            stim_counts = data.Stim.value_counts(sort=False)[X.columns]
            durations = data.groupby("Stim").Duration.mean().values
            for i, (o, c, d) in enumerate(zip(onsets, stim_counts, durations)):
                if c * (d / TR) <= o <= c * ((d / TR) + 1):
                    pass
                else:
                    warnings.warn(
                        f"Computed onsets for {data.Stim.unique()[i]} are inconsistent with expected values. Please manually verify the outputted Design_Matrix!"
                    )

        if sort:
            X = X.reindex(sorted(X.columns), axis=1)

        out.append(X)
    if len(out) > 1:
        if add_poly is not None:
            out = [e.add_poly(add_poly) for e in out]

        out_dm = out[0].append(
            out[1:],
            keep_separate=keep_separate,
            unique_cols=unique_cols,
            fill_na=fill_na,
        )
    else:
        out_dm = out[0]
        if add_poly is not None:
            out_dm = out_dm.add_poly(add_poly)
        if fill_na is not None:
            out_dm = out_dm.fill_na(fill_na)

    return out_dm
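# A minimal usage sketch (hypothetical stimulus names and timings; assumes nltools,
# pandas, and numpy are installed). With TR = 2s (sampling_freq = 0.5 Hz), the 10.1s
# onset lands in TR 5 (the 10-12s window), per the rounding rule described above.
import pandas as pd
from nltools.file_reader import onsets_to_dm

onsets = pd.DataFrame({
    "Stim": ["face", "house", "face"],
    "Onset": [0.0, 10.1, 20.0],
    "Duration": [4.0, 4.0, 4.0],
})
dm = onsets_to_dm(onsets, sampling_freq=0.5, run_length=20)
print(dm.shape)  # (20, 2): samples x stimulus classes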
Example #7

# ## Prepare brain data

# In[13]:


# Get data and covariates file and create nuisance design matrix
print(preproc_dir, subject_id, episode)
sub_cov, sub_epi = fileGetter(preproc_dir,subject_id,episode)

#Load run data
print("Loading brain data: {}".format(smooth))
dat = Brain_Data(sub_epi)

cov_mat = Design_Matrix(pd.read_csv(sub_cov[0]).fillna(0), sampling_rate=TR)
# Add Intercept
cov_mat['Intercept'] = 1
# Add Linear Trend
cov_mat['LinearTrend'] = np.arange(cov_mat.shape[0]) - np.mean(np.arange(cov_mat.shape[0]))
cov_mat['QuadraticTrend'] = cov_mat['LinearTrend']**2
cov_mat['CSF'] = dat.extract_roi(csf.threshold(.85,binarize=True))

assert cov_mat.shape[0] == dat.shape()[0]
spikeless_idx =  np.logical_not( startswith(cov_mat.columns.values.astype(str), "spike") | startswith(cov_mat.columns.values.astype(str), "FD") )
#dat.X = cov_mat
dat.X = cov_mat.loc[:,spikeless_idx]
datcln = dat.regress()['residual']


# ## Loop through voxels to produce STPs
Example #8
X = Adjacency([m1, m2, m3], matrix_type='similarity')
stats = dat.regress(X)
print(stats['beta'])

#########################################################################
# In addition to decomposing a single adjacency matrix, we can also estimate a model that predicts the variance over each voxel.  This is equivalent to a univariate regression in imaging analyses. Remember that just like in imaging these tests are non-independent and may require correcting for multiple comparisons.  Here we create some data that varies over matrices and identify pixels that follow a particular on-off-on pattern.  We plot the t-values that exceed 2.

from nltools.data import Design_Matrix
import matplotlib.pyplot as plt

data = Adjacency(
    [m1 + np.random.randn(12, 12) * .5 for x in range(5)] +
    [np.zeros((12, 12)) + np.random.randn(12, 12) * .5 for x in range(5)] +
    [m1 + np.random.randn(12, 12) * .5 for x in range(5)])

X = Design_Matrix([1] * 5 + [0] * 5 + [1] * 5)
f = X.plot()
f.set_title('Model', fontsize=18)

stats = data.regress(X)
t = stats['t'].plot(vmin=2)
plt.title('Significant Pixels', fontsize=18)

#########################################################################
# Similarity/Distance
# -------------------
#
# We can calculate similarity between two Adjacency matrices using `.similarity()`.

stats = dat.similarity(m1)
print(stats)
Example #9
def onsets_to_dm(F, TR, runLength, header='infer', sort=False,
                addIntercept=False, **kwargs):
    """Function to read in a 2 or 3 column onsets file, specified in seconds,
        organized as: 'Stimulus,Onset','Onset,Stimulus','Stimulus,Onset,
        Duration', or 'Onset,Duration,Stimulus'.

        Args:
            F (str or dataframe): path to file or pandas dataframe
            TR (float): length of TR in seconds the run was collected at
            runLength (int): number of TRs in the run these onsets came from
            sort (bool, optional): whether to sort the columns of the resulting
                                    design matrix alphabetically; defaults to
                                    False
            addIntercept (bool, optional): whether to add an intercept to the
                                    resulting dataframe; defaults to False
            header (str,optional): None if missing header, otherwise pandas
                                    header keyword; defaults to 'infer'
            kwargs: additional inputs to pandas.read_csv

        Returns:
            Design_Matrix class

    """
    if isinstance(F,six.string_types):
        df = pd.read_csv(F,header=header,**kwargs)
    elif isinstance(F,pd.core.frame.DataFrame):
        df = F.copy()
    else:
        raise TypeError("Input needs to be file path or pandas dataframe!")
    if df.shape[1] == 2:
        warnings.warn("Only 2 columns in file, assuming all stimuli are the same duration")
    elif df.shape[1] == 1 or df.shape[1] > 3:
        raise ValueError("Can only handle files with 2 or 3 columns!")

    #Try to infer the header
    if header is None:
        possibleHeaders = ['Stim','Onset','Duration']
        if isinstance(df.iloc[0,0],six.string_types):
            df.columns = possibleHeaders[:df.shape[1]]
        elif isinstance(df.iloc[0,df.shape[1]-1],six.string_types):
            df.columns = possibleHeaders[1:] + [possibleHeaders[0]]
        else:
            raise ValueError("Can't figure out data organization. Make sure file has no more than 3 columns specified as 'Stim,Onset,Duration' or 'Onset,Duration,Stim'")
    df['Onset'] = df['Onset'].apply(lambda x: int(np.floor(x/TR)))

    #Build dummy codes
    X = Design_Matrix(data=np.zeros([runLength, len(df['Stim'].unique())]),
                      columns=df['Stim'].unique())
    for i, row in df.iterrows():
        if df.shape[1] == 3:
            dur = np.ceil(row['Duration']/TR)
            X.ix[row['Onset']-1:row['Onset']+dur-1, row['Stim']] = 1
        elif df.shape[1] == 2:
            X.ix[row['Onset'], row['Stim']] = 1
    X.TR = TR
    if sort:
        X = X.reindex_axis(sorted(X.columns), axis=1)

    if addIntercept:
        X['intercept'] = 1
        X.hasIntercept = True

    return X
Example #10
def onsets_to_dm(F,
                 sampling_freq,
                 run_length,
                 header='infer',
                 sort=False,
                 keep_separate=True,
                 add_poly=None,
                 unique_cols=[],
                 fill_na=None,
                 **kwargs):
    """
    This function can assist in reading in one or several 2-3 column onsets files, specified in seconds, and converting them to a Design_Matrix organized as samples X Stimulus Classes. Onsets files **must** be organized with columns in one of the following 4 formats:

    1) 'Stim, Onset'
    2) 'Onset, Stim'
    3) 'Stim, Onset, Duration'
    4) 'Onset, Duration, Stim'

    No other file organizations are currently supported

    Args:
        F (filepath/DataFrame/list): path to file, pandas dataframe, or list of files or pandas dataframes
        sampling_freq (float): sampling frequency in hertz; for TRs use (1 / TR)
        run_length (int): number of TRs in the run these onsets came from
        header (str, optional): None if missing header, otherwise pandas
                                header keyword; defaults to 'infer'
        sort (bool, optional): whether to sort the columns of the resulting
                                design matrix alphabetically; defaults to
                                False
        keep_separate (bool): whether to keep polynomial columns separate if
                                reading a list of files and using the add_poly option
        add_poly (int, optional): what order polynomial terms to add as new
                                columns (e.g. 0 for intercept, 1 for linear
                                trend and intercept, etc); defaults to None
        unique_cols (list): additional columns to keep separate across files (e.g. spikes)
        fill_na (str/int/float): what value to fill NaNs with if reading in a list of files
        kwargs: additional inputs to pandas.read_csv
        kwargs: additional inputs to pandas.read_csv

        Returns:
            Design_Matrix class

    """
    if not isinstance(F, list):
        F = [F]

    out = []
    TR = 1. / sampling_freq
    for f in F:
        if isinstance(f, six.string_types):
            df = pd.read_csv(f, header=header, **kwargs)
        elif isinstance(f, pd.core.frame.DataFrame):
            df = f.copy()
        else:
            raise TypeError("Input needs to be file path or pandas dataframe!")
        if df.shape[1] == 2:
            warnings.warn(
                "Only 2 columns in file, assuming all stimuli are the same duration"
            )
        elif df.shape[1] == 1 or df.shape[1] > 3:
            raise ValueError("Can only handle files with 2 or 3 columns!")

        # Try to infer the header
        if header is None:
            possibleHeaders = ['Stim', 'Onset', 'Duration']
            if isinstance(df.iloc[0, 0], six.string_types):
                df.columns = possibleHeaders[:df.shape[1]]
            elif isinstance(df.iloc[0, df.shape[1] - 1], six.string_types):
                df.columns = possibleHeaders[1:] + [possibleHeaders[0]]
            else:
                raise ValueError(
                    "Can't figure out onset file organization. Make sure file has no more than 3 columns specified as 'Stim,Onset,Duration' or 'Onset,Duration,Stim'"
                )
        df['Onset'] = df['Onset'].apply(lambda x: int(np.floor(x / TR)))

        # Build dummy codes
        X = Design_Matrix(np.zeros([run_length,
                                    len(df['Stim'].unique())]),
                          columns=df['Stim'].unique(),
                          sampling_freq=sampling_freq)
        for i, row in df.iterrows():
            if df.shape[1] == 3:
                dur = np.ceil(row['Duration'] / TR)
                X.ix[row['Onset'] - 1:row['Onset'] + dur - 1, row['Stim']] = 1
            elif df.shape[1] == 2:
                X.ix[row['Onset'], row['Stim']] = 1
        if sort:
            X = X.reindex(sorted(X.columns), axis=1)

        out.append(X)
    if len(out) > 1:
        out_dm = out[0].append(out[1:],
                               keep_separate=keep_separate,
                               add_poly=add_poly,
                               unique_cols=unique_cols,
                               fill_na=fill_na)
    else:
        if add_poly is not None:
            out_dm = out[0].add_poly(add_poly)
        else:
            out_dm = out[0]

    return out_dm
Example #11
def test_designmat(tmpdir):

    mat1 = Design_Matrix(
        {
            'X': [1, 4, 2, 7, 5, 9, 2, 1, 3, 2],
            'Y': [3, 0, 0, 6, 9, 9, 10, 10, 1, 10],
            'Z': [2, 2, 2, 2, 7, 0, 1, 3, 3, 2],
            'intercept': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        },
        sampling_rate=2.0,
        polys=['intercept'])

    mat2 = Design_Matrix(
        {
            'X': [9, 9, 2, 7, 5, 0, 1, 1, 1, 2],
            'Y': [3, 3, 3, 6, 9, 0, 1, 10, 1, 10],
            'Z': [2, 6, 3, 2, 7, 0, 1, 7, 8, 8],
            'intercept': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        },
        sampling_rate=2.0,
        polys=['intercept'])

    # Appending
    # Basic horz cat
    new_mat = mat1.append(mat1,axis=1)
    assert new_mat.shape == (mat1.shape[0], mat1.shape[1] + mat2.shape[1])
    both_cols = list(mat1.columns) + list(mat1.columns)
    assert all(new_mat.columns == both_cols)
    # Basic vert cat
    new_mat = mat1.append(mat1,axis=0)
    assert new_mat.shape == (mat1.shape[0]*2, mat1.shape[1]+1)
    # Advanced vert cat
    new_mat = mat1.append(mat1,axis=0,keep_separate=False)
    assert new_mat.shape == (mat1.shape[0]*2,mat1.shape[1])
    # More advanced vert cat
    new_mat = mat1.append(mat1,axis=0,add_poly=2)
    assert new_mat.shape == (mat1.shape[0]*2, 9)

    #convolution doesn't affect intercept
    assert all(mat1.convolve().iloc[:, -1] == mat1.iloc[:, -1])
    #but it still works
    assert (mat1.convolve().iloc[:, :3].values != mat1.iloc[:, :3].values).any()

    #Test vifs
    expectedVifs = np.array([ 1.03984251, 1.02889877, 1.02261945])
    assert np.allclose(expectedVifs,mat1.vif())

    #poly
    mat1.add_poly(order=4).shape[1] == mat1.shape[1]+4
    mat1.add_poly(order=4, include_lower=False).shape[1] == mat1.shape[1]+1

    #zscore
    z = mat1.zscore(columns=['X', 'Z'])
    assert (z['Y'] == mat1['Y']).all()
    assert z.shape == mat1.shape

    # clean
    mat = Design_Matrix(
        {
            'X': [1, 4, 2, 7, 5, 9, 2, 1, 3, 2],
            'A': [1, 4, 2, 7, 5, 9, 2, 1, 3, 2],
            'Y': [3, 0, 0, 6, 9, 9, 10, 10, 1, 10],
            'Z': [2, 2, 2, 2, 7, 0, 1, 3, 3, 2],
            'C': [1, 4, 2, 7, 5, 9, 2, 1, 3, 2],
            'intercept': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        },
        sampling_rate=2.0,
        polys=['intercept'])
    mat = mat[['X','A','Y','Z','C','intercept']]
    assert all(mat.clean().columns == ['X','Y','Z','intercept'])

    # replace data
    mat = Design_Matrix(
        {
            'X': [1, 4, 2, 7, 5, 9, 2, 1, 3, 2],
            'A': [1, 4, 2, 7, 5, 9, 2, 1, 3, 2],
            'Y': [3, 0, 0, 6, 9, 9, 10, 10, 1, 10],
            'Z': [2, 2, 2, 2, 7, 0, 1, 3, 3, 2],
            'C': [1, 4, 2, 7, 5, 9, 2, 1, 3, 2],
        },
        sampling_rate=2.0)

    mat = mat.replace_data(np.ones((mat.shape[0],mat.shape[1]-1)),column_names=['a','b','c','d'])

    assert(np.allclose(mat.values,1))
    assert(all(mat.columns == ['a','b','c','d']))

    #DCT basis_mat
    mat = Design_Matrix(np.random.randint(2,size=(500,3)),sampling_rate=2.0)
    mat = mat.add_dct_basis()
    assert len(mat.polys) == 11
    assert mat.shape[1] == 14

    #Up and down sampling
    mat = Design_Matrix(np.random.randint(2,size=(500,4)),sampling_rate=2.0,columns=['a','b','c','d'])
    target = 1
    assert mat.upsample(target).shape[0] == mat.shape[0]*2 - target*2
    target = 4
    assert mat.downsample(target).shape[0] == mat.shape[0]/2
Example #12
We are going to remove the mean from our vmPFC signal. We are also going to include the average activity in CSF as an additional nuisance regressor to remove physiological artifacts. Finally, we will be including our 24 motion covariates as well as linear and quadratic trends. We need to be a little careful about filtering as the normal high pass filter for an event related design might be too short and will remove potential signals of interest.

Resting state researchers also often remove the global signal, which can reduce physiological and motion related artifacts and also increase the likelihood of observing negative relationships with your seed regressor (i.e., anticorrelated). This procedure has remained quite controversial in practice (see [here](https://www.physiology.org/doi/full/10.1152/jn.90777.2008), [here](https://www.sciencedirect.com/science/article/pii/S1053811908010264), [here](https://www.pnas.org/content/107/22/10238.short), and [here](https://www.sciencedirect.com/science/article/pii/S1053811916306711) for a more in-depth discussion). We think that in general including covariates like CSF should be sufficient. It is also common to additionally include covariates from white matter masks, and also multiple principal components of this signal rather than just the mean (see more details about [compcorr](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2214855/)).

Overall, this code should seem very familiar as it is pretty much the same procedure we used in the single subject GLM tutorial. However, instead of modeling the task design, we are interested in calculating the functional connectivity with the vmPFC.

tr = layout.get_tr()
fwhm = 6
n_tr = len(data)

def make_motion_covariates(mc, tr):
    z_mc = zscore(mc)
    all_mc = pd.concat([z_mc, z_mc**2, z_mc.diff(), z_mc.diff()**2], axis=1)
    all_mc.fillna(value=0, inplace=True)
    return Design_Matrix(all_mc, sampling_freq=1/tr)


vmpfc = zscore(pd.DataFrame(vmpfc, columns=['vmpfc']))

csf_mask = Brain_Data(os.path.join(base_dir, 'masks', 'csf.nii.gz'))
csf = zscore(pd.DataFrame(smoothed.extract_roi(mask=csf_mask).T, columns=['csf']))

spikes = smoothed.find_spikes(global_spike_cutoff=3, diff_spike_cutoff=3)
covariates = pd.read_csv(layout.get(subject=sub, scope='derivatives', extension='.tsv')[0].path, sep='\t')
mc = covariates[['trans_x','trans_y','trans_z','rot_x', 'rot_y', 'rot_z']]
mc_cov = make_motion_covariates(mc, tr)
dm = Design_Matrix(pd.concat([vmpfc, csf, mc_cov, spikes.drop(labels='TR', axis=1)], axis=1), sampling_freq=1/tr)
dm = dm.add_poly(order=2, include_lower=True)

smoothed.X = dm
Example #13
def make_motion_covariates(mc, tr):
    z_mc = zscore(mc)
    all_mc = pd.concat([z_mc, z_mc**2, z_mc.diff(), z_mc.diff()**2], axis=1)
    all_mc.fillna(value=0, inplace=True)
    return Design_Matrix(all_mc, sampling_freq=1 / tr)
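# A quick usage sketch for the helper above (toy data; the fMRIPrep-style column names
# and the tr value are hypothetical, and the names used inside the function -- zscore,
# pd, Design_Matrix -- are assumed to already be imported). Six realignment parameters
# expand to 24 columns: the z-scored values, their squares, their temporal derivatives,
# and the squared derivatives.
import numpy as np
import pandas as pd

toy_mc = pd.DataFrame(np.random.randn(100, 6),
                      columns=['trans_x', 'trans_y', 'trans_z', 'rot_x', 'rot_y', 'rot_z'])
toy_cov = make_motion_covariates(toy_mc, tr=2.0)
print(toy_cov.shape)  # (100, 24)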
Example #14
def make_motion_covariates(mc):
    z_mc = zscore(mc)
    all_mc = pd.concat([z_mc, z_mc**2, z_mc.diff(), z_mc.diff()**2], axis=1)
    all_mc.fillna(value=0, inplace=True)
    return Design_Matrix(all_mc, sampling_freq=1/tr)

for sub in layout.get_subjects(scope='derivatives'):
    data = Brain_Data([x for x in layout.get(subject=sub, scope='derivatives', suffix='bold', extension='nii.gz', return_type='file') if 'denoised' not in x][0])
    smoothed = data.smooth(fwhm=fwhm)

    dm = load_bids_events(layout, sub)
    covariates = pd.read_csv(layout.get(subject=sub, scope='derivatives', extension='.tsv')[0].path, sep='\t')
    mc_cov = make_motion_covariates(covariates[['trans_x','trans_y','trans_z','rot_x', 'rot_y', 'rot_z']])
    spikes = data.find_spikes(global_spike_cutoff=spike_cutoff, diff_spike_cutoff=spike_cutoff)
    dm_cov = dm.convolve().add_dct_basis(duration=128).add_poly(order=1, include_lower=True)
    dm_cov = dm_cov.append(mc_cov, axis=1).append(Design_Matrix(spikes.iloc[:, 1:], sampling_freq=1/tr), axis=1)
    smoothed.X = dm_cov
    stats = smoothed.regress()
    file_name = layout.get(subject=sub, scope='derivatives', suffix='bold', extension='nii.gz', return_type='file')[0]
    stats['beta'].write(os.path.join(os.path.dirname(file_name), f"sub-{sub}_betas_denoised_{file_name.split('_')[1]}_{file_name.split('_')[2]}_smoothed{fwhm}_{file_name.split('_')[-1]}"))

    for i, name in enumerate([x[:-3] for x in dm_cov.columns[:10]]):
        stats['beta'][i].write(os.path.join(os.path.dirname(file_name), f"sub-{sub}_{name}_denoised_{file_name.split('_')[2]}_smoothed{fwhm}_{file_name.split('_')[-1]}"))

Now, we are ready to run our first group analyses! 

Let's load our design matrix to remind ourselves of the various conditions

### One Sample t-test
print(dm.info())
dm.heatmap()

#########################################################################
# Load and Z-score a Covariates File
# ----------------------------------
#
# Now we're going to handle a covariates file that's been generated by a preprocessing routine.
# First we'll read in the text file using pandas and convert it to a design matrix. 
# To be explicit with the meta-data we're going to change some default attributes during conversion.

import pandas as pd

covariatesFile = os.path.join(get_resource_path(),'covariates_example.csv')
cov = pd.read_csv(covariatesFile)
cov = Design_Matrix(cov,hasIntercept=False,hrf=[])
cov.heatmap()

#########################################################################
# The class has several methods for basic data scaling and manipulation. Others can likely be found in pandas core functionality.
# Here we fill NaN values with 0 and zscore all columns except the last. Because the class has all of pandas functionality, method-chaining is built-in.

cov = cov.fillna(0).zscore(cov.columns[:-1])
cov.heatmap()

#########################################################################
# Concatenate Multiple Design Matrices
# ----------------------------------
#
# A really nice feature of this class is simplified, but intelligent, matrix concatenation. Here it's trivial to horizontally concatenate our convolved onsets and covariates, while keeping our column names and order.
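#########################################################################
# A rough sketch of that concatenation (assuming a convolved onsets matrix `dm_conv`
# spanning the same number of TRs as `cov`; horizontal appends require matching rows).
# nltools' `.append()` with `axis=1` keeps the Design_Matrix meta-data intact.

full = dm_conv.append(cov, axis=1)
full.heatmap()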
Example #16
# Design Matrix Basics
# --------------------
#
# Let's create a basic toy design matrix by hand corresponding to a single participant's data from an experiment with 11 TRs, collected at a temporal resolution of 1.5s. For this example we'll have 4 unique "stimulus conditions" that each occur for 2 TRs (3s) with 1 TR (1.5s) of rest between events.

from nltools.data import Design_Matrix
import numpy as np

dm = Design_Matrix(np.array([
                            [1,0,0,0],
                            [1,0,0,0],
                            [0,0,0,0],
                            [0,1,0,0],
                            [0,1,0,0],
                            [0,0,0,0],
                            [0,0,1,0],
                            [0,0,1,0],
                            [0,0,0,0],
                            [0,0,0,1],
                            [0,0,0,1]
                            ]),
                            sampling_rate = 1.5,
                            columns=['stim_A','stim_B','stim_C','stim_D']
                            )
#########################################################################
# Notice how this looks exactly like a pandas dataframe. That's because design matrices are *subclasses* of dataframes with some extra attributes and methods.

print(dm)

#########################################################################
# Let's take a look at some of that meta-data. We can see that no columns have been convolved as of yet and this design matrix has no polynomial terms (such as an intercept or linear trend).
There are a total of 94 subjects available, but we will primarily only be working with a smaller subset of 10-20 participants. See our tutorial on how to download the data if you are not taking the Psych60 version of the class.

## Building a Design Matrix

First, we will learn the basics of how to build a design matrix for our GLM.

Let's load all of the python modules we will need to complete this tutorial.

%matplotlib inline

import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nibabel as nib
from nltools.file_reader import onsets_to_dm
from nltools.stats import regress, zscore
from nltools.data import Brain_Data, Design_Matrix
from nltools.stats import find_spikes 
from nilearn.plotting import view_img, glass_brain, plot_stat_map
from bids import BIDSLayout, BIDSValidator

data_dir = '../data/localizer'
layout = BIDSLayout(data_dir, derivatives=True)

To build the design matrix, we will be using the Design_Matrix class from the nltools toolbox.  First, we use pandas to load the text file that contains the onset and duration for each condition of the task. Rows reflect measurements in time sampled at 1/tr cycles per second. Columns reflect distinct conditions. Conditions are either on or off. We then cast this Pandas DataFrame as a Design_Matrix object. Be sure to specify the sampling frequency, which is $\frac{1}{tr}$. 

def load_bids_events(layout, subject):
    '''Create a design_matrix instance from BIDS event file'''
    
    tr = layout.get_tr()
    n_tr = nib.load(layout.get(subject=subject, scope='raw', suffix='bold')[0].path).shape[-1]

    onsets = pd.read_csv(layout.get(subject=subject, suffix='events')[0].path, sep='\t')
    onsets.columns = ['Onset', 'Duration', 'Stim']
    return onsets_to_dm(onsets, sampling_freq=1/tr, run_length=n_tr)

dm = load_bids_events(layout, 'S01')

The Design_Matrix class is built on top of Pandas DataFrames and retains most of that functionality. There are additional methods to help with building design matrices. Be sure to check out this [tutorial](https://neurolearn.readthedocs.io/en/latest/auto_examples/01_DataOperations/plot_design_matrix.html#sphx-glr-auto-examples-01-dataoperations-plot-design-matrix-py) for more information about how to use this tool. 

We can check out details about the data using the `.info()` method.

dm.info()

We can also view the raw design matrix as a dataframe just like pd.Dataframe.  We use the `.head()` method to just post the first few rows.

dm.head()

We can plot each regressor's time course using the `.plot()` method.

f,a = plt.subplots(figsize=(20,3))
dm.plot(ax=a)

This plot can be useful sometimes, but here there are too many regressors, which makes it difficult to see what is going on.

Often,  `.heatmap()` method provides a more useful visual representation of the design matrix.

dm.heatmap()

### HRF Convolution
Recall what we learned about convolution in our signal processing tutorial. We can now convolve all of the onset regressors with an HRF function using the `.convolve()` method. By default it will convolve all regressors with the standard double gamma HRF function, though you can specify custom ones and also specific regressors to convolve. Check out the docstrings for more information by adding a `?` after the function name. If you are interested in learning more about different ways to model the HRF using temporal basis functions, watch this [video](https://www.youtube.com/watch?v=YfeMIcDWwko&t=9s).
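Under the hood, this amounts to convolving each column with a canonical double-gamma HRF. As a rough illustration (these are not the exact HRF parameters nltools uses), we could build a similar kernel with scipy and convolve a single column by hand before calling the built-in method below.

from scipy.stats import gamma

t = np.arange(0, 30, layout.get_tr())
hrf = gamma.pdf(t, 6) - 1 / 6.0 * gamma.pdf(t, 16)  # early peak plus a small late undershoot
hrf /= hrf.max()
manually_convolved = np.convolve(dm.iloc[:, 0].values, hrf)[:dm.shape[0]]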

dm_conv = dm.convolve()
dm_conv.heatmap()

You can see that each of the regressors is now a bit blurrier and has the shape of an HRF. We can plot a single regressor to see this more clearly using the `.plot()` method.

f,a = plt.subplots(figsize=(15,3))
dm_conv['horizontal_checkerboard_c0'].plot(ax=a)

Maybe we want to plot both of the checkerboard regressors.

f,a = plt.subplots(figsize=(15,3))
dm_conv[['horizontal_checkerboard_c0','vertical_checkerboard_c0']].plot(ax=a)

### Multicollinearity
In statistics, collinearity or multicollinearity is when one regressor can be strongly linearly predicted from the others. While this does not actually impact the model's ability to predict data as a whole, it will impact our ability to accurately attribute variance to a single regressor. Recall that in multiple regression, we are estimating the independent variance of each regressor in `X` on `Y`. If there is substantial overlap between the regressors, then the estimator cannot attribute the correct amount of variance that each regressor accounts for in `Y`, and the coefficients can become unstable. A more intuitive depiction of this problem can be seen in the Venn diagram below. The dark orange area in the center at the confluence of all 3 circles reflects the shared variance between `X1` and `X2` on `Y`. If this area becomes bigger, the unique variances become smaller and individually reflect less of the total variance in `Y`.

![MultipleRegression.png](../images/single_subject/MultipleRegression.png)

One way to evaluate multicollinearity is to examine the pairwise correlations between each regressor. We plot the correlation matrix as a heatmap.

sns.heatmap(dm_conv.corr(), vmin=-1, vmax=1, cmap='RdBu_r')

#### Variance Inflation Factor
Pairwise correlations will let you know if any regressor is correlated with another regressor. However, we are even more concerned about being able to explain any regressor as a linear combination of the other regressors. For example, *can one regressor be explained by three or more of the remaining regressors?* The variance inflation factor (VIF) is a metric that can help us detect multicollinearity. Specifically, it is simply the ratio of variance in a model with multiple terms, divided by the variance of a model with only a single term. This ratio reduces to the following formula:

$$VIF_j=\frac{1}{1-R_j^2}$$

Where $R_j^2$ is the $R^2$ value obtained by regressing the $jth$ predictor on the remaining predictors. This means that each regressor $j$ will have its own variance inflation factor.
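To make the formula concrete, here is a rough sketch of computing a VIF for each column by hand (a simplification that mean-centers the columns rather than fitting an explicit intercept):

def manual_vif(X):
    X = X - X.mean(axis=0)
    vifs = []
    for j in range(X.shape[1]):
        y, others = X[:, j], np.delete(X, j, axis=1)
        beta = np.linalg.lstsq(others, y, rcond=None)[0]
        r_squared = 1 - ((y - others @ beta) ** 2).sum() / (y ** 2).sum()
        vifs.append(1 / (1 - r_squared))
    return np.array(vifs)

print(manual_vif(dm_conv.values))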

How should we interpret the VIF values?  

A VIF of 1 indicates that there is no correlation among the $jth$ predictor and the remaining variables. Values greater than 4 should be investigated further, while VIFs exceeding 10 indicate significant multicollinearity and will likely require intervention.

Here we will use the `.vif()` method to calculate the variance inflation factor for our design matrix.

See this [overview](https://newonlinecourses.science.psu.edu/stat501/node/347/) for more details on VIFs.

plt.plot(dm_conv.columns, dm_conv.vif(), linewidth=3)
plt.xticks(rotation=90)
plt.ylabel('Variance Inflation Factor')

#### Orthogonalization
There are many ways to deal with collinearity. In practice, don't worry about collinearity between your covariates. The more pernicious issue is collinearity in your experimental design.

It is commonly thought that a procedure called orthogonalization should be used to address issues of multicollinearity. In linear algebra, orthogonalization is the process of assigning the shared variance between regressors to a single regressor. Recall that the standard GLM already accounts for shared variance by removing it from individual regressors. Orthogonalization allows a user to assign that variance to a specific regressor. However, the process of performing this procedure can introduce artifact into the model and often changes the interpretation of the beta weights in unanticipated ways.

![Orthogonalization.png](../images/single_subject/Orthogonalization.png)

In general, we do not recommend using orthogonalization in most use cases, with the exception of centering regressor variables. We encourage the interested reader to review this very useful [overview](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0126255) of collinearity and orthogonalization by Jeanette Mumford and colleagues.

## Nuisance Variables

from IPython.display import YouTubeVideo

YouTubeVideo('DEtwsFdFwYc')

### Filtering
Recall from our signal processing tutorial that there are often other types of artifacts in our signal that might take the form of slow or fast oscillations. It is common to apply a high pass filter to the data to remove low frequency artifacts. Often this can also be addressed by simply using a few polynomials to model these types of trends. If we were to directly filter the brain data using something like a butterworth filter as we did in our signal processing tutorial, we would also need to apply it to our design matrix to make sure that we don't have any low frequency drift in our experimental design. One easy way to simultaneously perform both of these procedures is to simply build a filter into the design matrix. We will be using a discrete cosine transform (DCT), which is a basis set of cosine regressors of varying frequencies up to a filter cutoff of a specified number of seconds. Many software packages use 100s or 128s as a default cutoff, but be careful that the filter cutoff isn't too short for your specific experimental design. Longer trials will require longer filter cutoffs. See this [paper](https://www.sciencedirect.com/science/article/pii/S1053811900906098) for a more technical treatment of using the DCT as a high pass filter in fMRI data analysis. In addition, here is a more detailed discussion about [filtering](http://mindhive.mit.edu/node/116).
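For intuition, an SPM-style DCT basis can be written out directly: the number of cosine regressors depends on the run length, the TR, and the cutoff, and each regressor is a cosine of increasing frequency (a sketch, not the exact implementation used by nltools' `.add_dct_basis()` below).

cutoff = 128  # seconds
tr = layout.get_tr()
n = dm_conv.shape[0]
n_basis = int(np.floor(2 * n * tr / cutoff))
timepoints = np.arange(n)
dct_basis = np.column_stack([np.sqrt(2.0 / n) * np.cos(np.pi * (2 * timepoints + 1) * k / (2 * n))
                             for k in range(1, n_basis + 1)])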

dm_conv_filt = dm_conv.add_dct_basis(duration=128)

dm_conv_filt.iloc[:,10:].plot()

dm_conv_filt = dm_conv.add_dct_basis(duration=128)
dm_conv_filt.heatmap()

### Intercepts
We almost always want to include an intercept in our model. This will usually reflect the baseline, or the average voxel response during the times that are not being modeled as a regressor. It is important to note that you must have some sparsity to your model, meaning that you can't model every point in time, as this will make your model rank deficient and unestimable.
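As a quick illustration of the rank deficiency point (a toy example, not the localizer design): if condition columns jointly cover every TR, they sum to a column of ones, so adding an intercept makes the design matrix rank deficient.

toy = np.zeros((6, 3))
toy[[0, 1], 0] = 1  # condition A models TRs 0-1
toy[[2, 3], 1] = 1  # condition B models TRs 2-3
toy[[4, 5], 2] = 1  # condition C models TRs 4-5, so every TR is modeled
toy = np.hstack([toy, np.ones((6, 1))])  # add an intercept column
print(np.linalg.matrix_rank(toy))  # 3, despite having 4 columns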

If you are concatenating runs and modeling them all together, it is recommended to include a separate intercept for each run, but not for the entire model. This means that the average response within a voxel might differ across runs. You can add an intercept by simply creating a new column of ones (e.g., `dm['Intercept'] = 1`). Here we provide an example using the `.add_poly()` method, which adds an intercept by default.

dm_conv_filt_poly = dm_conv_filt.add_poly()
dm_conv_filt_poly.heatmap()
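When runs are concatenated, one simple way to build run-specific intercepts is a block of ones per run, for example (a hypothetical sketch with two runs of 128 TRs each):

n_runs, run_length = 2, 128
run_intercepts = pd.DataFrame(np.kron(np.eye(n_runs), np.ones((run_length, 1))),
                              columns=[f'intercept_run{r + 1}' for r in range(n_runs)])

In practice, the Design_Matrix `.append()` method with `keep_separate=True` (its default) does this kind of per-run bookkeeping for you when stacking runs, as shown in the test examples earlier.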

### Linear Trends
We also often want to remove any slow drifts in our data.  This might include a linear trend and a quadratic trend. We can also do this with the `.add_poly()` method and adding all trends up to an order of 2 (e.g., quadratic). We typically use this approach rather than applying a high pass filter when working with naturalistic viewing data.

Notice that these do not appear to be very different from the high pass filter basis set. It's actually okay if there is collinearity in our covariate regressors. Collinearity is only a problem when it correlates with the task regressors as it means that we will not be able to uniquely model the variance. The DCT can occasionally run into edge artifacts, which can be addressed by the linear trend.

dm_conv_filt_poly = dm_conv_filt.add_poly(order=3, include_lower=True)
dm_conv_filt_poly.heatmap()

### Noise Covariates
Another important thing to consider is removing variance associated with head motion. Remember the preprocessed data has already realigned each TR in space, but head motion itself can nonlinearly distort the magnetic field. There are several common strategies for trying to remove artifacts associated with head motion. One is using a data driven denoising algorithm like ICA and combining it with a classifier such as FSL's [FIX](https://fsl.fmrib.ox.ac.uk/fsl/fslwiki/FIX) module. Another approach is to include the amount of correction that needed to be applied to align each TR. For example, if someone moved a lot in a single TR, there will be a strong change in their realignment parameters. It is common to include the 6 parameters as covariates in your regression model. However, as we already noted, often motion can have a nonlinear relationship with signal intensity, so it is often good to include other transformations of these signals to capture nonlinear signal changes resulting from head motion. We typically center the six realignment parameters (or zscore) and then additionally add a quadratic version, a derivative, and the square of the derivatives, which becomes 24 additional regressors.

In addition, it is common to model out big changes using a regressor with a single value indicating the timepoint of the movement. This will be zeros along time, with a single value of one at the time point of interest. This effectively removes any variance associated with this single time point. It is important to model each "spike" as a separate regressor as there might be distinct spatial patterns associated with different types of head motions. We strongly recommend against using a single continuous frame displacement metric as is often recommended by the fMRIprep team. This assumes (1) that there is a *linear* relationship between displacement and voxel activity, and (2) that there is a *single* spatial generator or pattern associated with frame displacement. As we saw in the ICA noise lab, there might be many different types of head motion artifacts. This procedure of including spikes as nuisance regressors is mathematically equivalent to censoring your data and removing the bad TRs. We think it is important to do this in the context of the GLM as it will also reduce the impact if it happens to covary with your task.
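For example, a spike regressor for a bad volume is just a vector of zeros with a one at that timepoint (the run length and volume index here are hypothetical):

n_tr = 128  # assumed number of volumes in the run
spike = np.zeros(n_tr)
spike[60] = 1  # flag a hypothetical bad volume at TR 60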

First, let's load preprocessed data from one participant.

sub = 'S01'
data = Brain_Data(layout.get(subject=sub, task='localizer', scope='derivatives', suffix='bold', extension='nii.gz', return_type='file')[1])

Now let's inspect the realignment parameters for this participant. These pertain to how much each volume had to be moved in the (X,Y,Z) planes and rotations around each axis. We are standardizing the data so that rotations and translations are on the same scale.

covariates = pd.read_csv(layout.get(subject='S01', scope='derivatives', extension='.tsv')[0].path, sep='\t')

mc = covariates[['trans_x','trans_y','trans_z','rot_x', 'rot_y', 'rot_z']]

plt.figure(figsize=(15,5))
plt.plot(zscore(mc))

Now, let's build the 24 covariates related to head motion. We include the 6 realignment parameters that have been standardized. In addition, we add their quadratic, their derivative, and the square of their derivative.

We can create a quick visualization to see what the overall pattern is across the different regressors.

def make_motion_covariates(mc, tr):
    z_mc = zscore(mc)
    all_mc = pd.concat([z_mc, z_mc**2, z_mc.diff(), z_mc.diff()**2], axis=1)
    all_mc.fillna(value=0, inplace=True)
    return Design_Matrix(all_mc, sampling_freq=1/tr)

tr = layout.get_tr()
mc_cov = make_motion_covariates(mc, tr)

sns.heatmap(mc_cov)

Now let's try to find some spikes in the data. This is performed by finding TRs that exceed a global mean threshold and also that exceed an overall average intensity change by a threshold.  We are using an arbitrary cutoff of 3 standard deviations as a threshold.

First, let's plot the average signal intensity across all voxels over time.

plt.figure(figsize=(15,3))
plt.plot(np.mean(data.data, axis=1), linewidth=3)
plt.xlabel('Time', fontsize=18)
plt.ylabel('Intensity', fontsize=18)

Notice there is a clear slow drift in the signal that we will need to remove with our high pass filter.

Now, let's see if there are any spikes in the data that exceed our threshold. What happens if we use a different threshold?

spikes = data.find_spikes(global_spike_cutoff=2.5, diff_spike_cutoff=2.5)

f, a = plt.subplots(figsize=(15,3))
spikes = Design_Matrix(spikes.iloc[:,1:], sampling_freq=1/tr)
spikes.plot(ax = a, linewidth=2)

For this subject, our spike identification procedure only found a single spike. Let's add all of these covariates to our design matrix.

In this example, we will append each of these additional matrices to our main design matrix. 

**Note**: `.append()` requires that all matrices are a design_matrix with the same sampling frequency.

dm_conv_filt_poly_cov = pd.concat([dm_conv_filt_poly, mc_cov, spikes], axis=1)
dm_conv_filt_poly_cov.heatmap(cmap='RdBu_r', vmin=-1,vmax=1)

## Smoothing

To increase the signal to noise ratio and clean up the data, it is common to apply spatial smoothing to the image.

Here we will convolve the image with a 3-D gaussian kernel, with a 6mm full width half maximum (FWHM) using the `.smooth()` method. 

fwhm=6
smoothed = data.smooth(fwhm=fwhm)

Let's take a look and see how this changes the image.

data.mean().plot()

smoothed.mean().plot()

## Estimate GLM for all voxels
Now we are ready to estimate the regression model for all voxels.

We will assign the design_matrix object to the `.X` attribute of our `Brain_Data` instance.

Then we simply need to run the `.regress()` method.

smoothed.X = dm_conv_filt_poly_cov
stats = smoothed.regress()

print(stats.keys())

Ok, it's done! Let's take a look at the results.

The stats variable is a dictionary with the main results from the regression: a brain image with all of the betas for each voxel, a corresponding image of t-values, p-values, standard error of the estimate, and residuals.

Remember we have run the same regression model separately on each voxel of the brain.

Let's take a look at one of the regressors. The names of each of them are in the column names of the design matrix, which is in the `data.X` field.  We can print them to see the names. Let's plot the first one, which is a horizontal checkerboard.

print(smoothed.X.columns)

Brain_Data instances have their own plotting methods. We will be using `.iplot()` here, which can allow us to interactively look at all of the values.

If you would like to see the top values, we can quickly apply a threshold. Try using a `95`% threshold, and be sure to click the `percentile_threshold` option.

stats['beta'][0].iplot()

### Save Image
We will frequently want to save different brain images we are working with to a nifti file. This is useful for saving intermediate work, or sharing our results with others. This is easy with the `.write()` method. Be sure to specify a path and file name for the file.

**Note**: You can only write to folders where you have permission. Try changing the path to your own directory.

smoothed.write(f'{sub}_betas_denoised_smoothed{fwhm}_preprocessed_fMRI_bold.nii.gz')

## Contrasts

Now that we have estimated our model, we will likely want to create contrasts to examine brain activation to different conditions.

This procedure is identical to those introduced in our GLM tutorial.

Let's watch another video by Tor Wager to better understand contrasts at the first-level model stage.

YouTubeVideo('7MibM1ATai4')

Now, let's try making a simple contrast where we average only the regressors pertaining to motor. This is essentially summing all of the motor regressors. To take the mean we need to divide by the number of regressors. 

print(smoothed.X.columns)

c1 = np.zeros(len(stats['beta']))
c1[[2,4,5,6]] = 1/4
print(c1)

motor = stats['beta'] * c1

motor.iplot()

Ok, now we can clearly see regions specifically involved in motor processing.

Now let's see which regions are more active when making motor movements with our right hand compared to our left hand.

c_rvl = np.zeros(len(stats['beta']))
c_rvl[[2,4,5,6]] = [.5, .5, -.5, -.5]

motor_rvl = stats['beta'] * c_rvl

motor_rvl.iplot()

What do you see?

## Exercises

For homework, let's get a better handle on how to play with our data and test different hypotheses.

### 1. Which regions are more involved with visual compared to auditory sensory processing?
 - Create a contrast to test this hypothesis
 - plot the results
 - write the file to your output folder.



### 2. Which regions are more involved in processing numbers compared to words?
 - Create a contrast to test this hypothesis
 - plot the results
 - write the file to your output folder.



### 3. Which regions are more involved with motor compared to cognitive processes (e.g., language and math)?
 - Create a contrast to test this hypothesis
 - plot the results
 - write the file to your output folder.



### 4. How are your results impacted by different smoothing kernels?
 - Pick two different sized smoothing kernels and create two new brain images with each smoothing kernel
 - Pick any contrast of interest to you and evaluate the impact of smoothing on the contrast.
 - plot the results
 - write the file to your output folder.
Example #18
def test_designmat(tmpdir):
    mat1 = Design_Matrix(
        {'X': [1, 4, 2, 7, 5, 9, 2, 1, 3, 2],
         'Y': [3, 0, 0, 6, 9, 9, 10, 10, 1, 10],
         'Z': [2, 2, 2, 2, 7, 0, 1, 3, 3, 2],
         'intercept': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
        TR=2.0, hasIntercept=True)
    mat2 = Design_Matrix(
        {'X': [9, 9, 2, 7, 5, 0, 1, 1, 1, 2],
         'Y': [3, 3, 3, 6, 9, 0, 1, 10, 1, 10],
         'Z': [2, 6, 3, 2, 7, 0, 1, 7, 8, 8],
         'intercept': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
        TR=2.0, hasIntercept=True)

    #appending
    assert mat1.append(mat1,axis=1).shape == (mat1.shape[0],mat1.shape[1]+mat2.shape[1])
    assert mat1.append(mat2,axis=0).shape == (mat1.shape[0]+mat2.shape[0],mat1.shape[1]+1)

    #convolution doesn't affect intercept
    assert all(mat1.convolve().iloc[:,-1] == mat1.iloc[:,-1])
    #but it still works
    assert (mat1.convolve().iloc[:,:3].values != mat1.iloc[:,:3].values).any()

    #Test vifs
    expectedVifs =  np.array([ 1.03984251,  1.02889877,  1.02261945])
    assert np.allclose(expectedVifs,mat1.vif())

    #poly
    mat1.addpoly(order=4).shape[1] == mat1.shape[1]+4
    mat1.addpoly(order=4,include_lower=False).shape[1] == mat1.shape[1]+1

    #zscore
    z = mat1.zscore(colNames=['X','Z'])
    assert (z['Y'] == mat1['Y']).all()
    assert z.shape == mat1.shape
    def get_terms(confound_file, noise_transforms, noise_regressors, TR, options):
        '''
        Gathers confounds (and transformations) into a pandas dataframe.
        Input [Mandatory]:
            confound_file [string]: path to confound.tsv file, given by fmriprep.
            noise_transforms [list of strings]:
                noise transforms to be applied to select noise_regressors above. Possible values are 'quad', 'tderiv', and 'quadtderiv', standing for quadratic function of value, temporal derivative of value, and quadratic function of temporal derivative.
                e.g. model_wf.inputs.inputspec.noise_transforms = ['quad', 'tderiv', 'quadtderiv']
            noise_regressors [list of strings]:
                column names in confounds.tsv, specifying desired noise regressors for model.
                IF noise_transforms are to be applied to a regressor, add '*' to the name.
                e.g. model_wf.inputs.inputspec.noise_regressors = ['CSF', 'WhiteMatter', 'GlobalSignal', 'X*', 'Y*', 'Z*', 'RotX*', 'RotY*', 'RotZ*']
            TR [float]:
                Scanner TR value in seconds.
            options: dictionary with the following entries
                remove_steadystateoutlier [boolean]:
                    Should always be True. Remove steady state outliers from bold timecourse, specified in fmriprep confounds file.
                ICA_AROMA [boolean]:
                    Use AROMA error components, from fmriprep confounds file.
                poly_trend [integer. Use None to skip]:
                    If given, polynomial trends will be added to run confounds, up to the order of the integer
                    e.g. "0", gives an intercept, "1" gives intercept + linear trend,
                    "2" gives intercept + linear trend + quadratic.
                dct_basis [integer. Use None to skip]:
                    If given, adds a discrete cosine transform, with a length (in seconds) of the integer specified.
                        Adds unit scaled cosine basis functions to Design_Matrix columns,
                        based on spm-style discrete cosine transform for use in
                        high-pass filtering. Does not add intercept/constant.
        '''
        import numpy as np
        import pandas as pd
        from nltools.data import Design_Matrix

        df_cf = pd.DataFrame(pd.read_csv(confound_file, sep='\t', parse_dates=False))
        transfrm_list = []
        for idx, entry in enumerate(noise_regressors): # get entries marked with *, indicating they should be transformed.
            if '*' in entry:
                transfrm_list.append(entry.replace('*', '')) # add entry to transformation list if it has *.
                noise_regressors[idx] = entry.replace('*', '')

        confounds = df_cf[noise_regressors]
        transfrmd_cnfds = df_cf[transfrm_list] # for transforms
        TR_time = pd.Series(np.arange(0.0, TR*transfrmd_cnfds.shape[0], TR)) # time series for derivatives.
        if 'quad' in noise_transforms:
            quad = np.square(transfrmd_cnfds)
            confounds = confounds.join(quad, rsuffix='_quad')
        if 'tderiv' in noise_transforms:
            tderiv = pd.DataFrame(pd.Series(np.gradient(transfrmd_cnfds[col]), TR_time)
                                  for col in transfrmd_cnfds).T
            tderiv.columns = transfrmd_cnfds.columns
            tderiv.index = confounds.index
            confounds = confounds.join(tderiv, rsuffix='_tderiv')
        if 'quadtderiv' in noise_transforms:
            quadtderiv = np.square(tderiv)
            confounds = confounds.join(quadtderiv, rsuffix='_quadtderiv')
        if options['remove_steadystateoutlier']:
            if not df_cf[df_cf.columns[df_cf.columns.to_series().str.contains('^non_steady_state_outlier')]].empty:
                confounds = confounds.join(df_cf[df_cf.columns[df_cf.columns.to_series().str.contains('^non_steady_state_outlier')]])
            elif not df_cf[df_cf.columns[df_cf.columns.to_series().str.contains('^NonSteadyStateOutlier')]].empty:
                confounds = confounds.join(df_cf[df_cf.columns[df_cf.columns.to_series().str.contains('^NonSteadyStateOutlier')]]) # old syntax
        if options['ICA_AROMA']:
            if not df_cf[df_cf.columns[df_cf.columns.to_series().str.contains('^aroma_motion')]].empty:
                confounds = confounds.join(df_cf[df_cf.columns[df_cf.columns.to_series().str.contains('^aroma_motion')]])
            elif not df_cf[df_cf.columns[df_cf.columns.to_series().str.contains('^AROMAAggrComp')]].empty:
                confounds = confounds.join(df_cf[df_cf.columns[df_cf.columns.to_series().str.contains('^AROMAAggrComp')]]) # old syntax
        confounds = Design_Matrix(confounds, sampling_freq=1/TR)
        if isinstance(options['poly_trend'], int):
            confounds = confounds.add_poly(order = options['poly_trend']) # these do not play nice with high pass filters.
        if isinstance(options['dct_basis'], int):
            confounds = confounds.add_dct_basis(duration=options['dct_basis']) # these do not play nice with high pass filters.
        return confounds
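
A hedged usage sketch for `get_terms` (this assumes the function is available at module scope rather than nested as above; the confounds filename is a hypothetical fmriprep-style path, the regressor names and transforms are taken from the docstring examples, and the option values are illustrative — newer fmriprep releases use different confound column names):

options = {'remove_steadystateoutlier': True,   # per the docstring, should always be True
           'ICA_AROMA': False,
           'poly_trend': None,                  # skip polynomial trends
           'dct_basis': 128}                    # ~128 s discrete cosine high-pass basis

confounds_dm = get_terms(
    confound_file='sub-01_task-rest_desc-confounds_regressors.tsv',  # hypothetical fmriprep confounds file
    noise_transforms=['quad', 'tderiv', 'quadtderiv'],
    noise_regressors=['CSF', 'WhiteMatter', 'GlobalSignal',
                      'X*', 'Y*', 'Z*', 'RotX*', 'RotY*', 'RotZ*'],
    TR=2.0,
    options=options)
print(confounds_dm.columns)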
Example #20
0
def BIDS_to_dm(F,
               sampling_freq,
               run_length,
               trial_col='trial_type',
               parametric_cols=None,
               sort=False,
               keep_separate=True,
               add_poly=None,
               unique_cols=[],
               fill_na=None,
               **kwargs):
    """
        **
        Modified from nltools.file_reader.onsets_to_dm to accommodate BIDS files,
        customize naming of the trial_type column, and allow parametric modulators.
        **
    This function can assist in reading in one or several BIDS-formatted events files (onsets specified in seconds) and converting them to a Design Matrix organized as samples X stimulus classes.
    Onsets files **must** be organized with columns in the following format:
        1) 'onset, duration, trial_type'

    This can handle multiple runs being given at once (if F is a list), and by default uses separate contrasts for each run.

    Args:
        F (filepath/DataFrame/list): path to file, pandas dataframe, or list of files or pandas dataframes
        sampling_freq (float): sampling frequency of the run in hertz (i.e., 1/TR).
        run_length (int): number of TRs in the run these onsets came from
        trial_col (string): which column should be used to specify stimuli/trials?
        parametric_cols (list of lists of strings):
        e.g. [['condition1', 'parametric1', 'no_cent', 'no_norm'],
             ['condition2', 'parametric2', 'cent', 'norm']]
             in each entry:
                 entry 1 is a condition within the trial_col
                 entry 2 is a column in the events folder referenced by F.
                 entry 3 is either 'no_cent', or 'cent', indicating whether to center the parametric variable.
                 entry 4 is either 'no_norm', or 'norm', indicating whether to normalize the parametric variable.
             The condition column specified by entry 1 will be multiplied by the
             parametric weighting specified by entry 2, scaled/centered as specified, then
            appended to the design matrix.
        sort (bool, optional): whether to sort the columns of the resulting
                                design matrix alphabetically; defaults to
                                False
        keep_separate (bool): whether to keep polynomial columns separate if reading a list of files and using the add_poly option
        add_poly (int, optional): what order polynomial terms to add as new columns (e.g. 0 for intercept, 1 for linear trend and intercept, etc); defaults to None
        unique_cols (list): additional columns to keep separate across files (e.g. spikes)
        fill_na (str/int/float): what value to fill NaNs with if reading in a list of files
        kwargs: additional inputs to pandas.read_csv
    Returns:
        Design_Matrix class
    """
    import pandas as pd
    import numpy as np
    import six
    from nltools.data import Design_Matrix
    from sklearn.preprocessing import scale
    import warnings

    if not isinstance(F, list):
        F = [F]
    out = []
    TR = 1 / sampling_freq  # convert sampling frequency (Hz) back to the TR in seconds; TR is used below for onset/duration rounding.

    for f in F:  ## Loading event files.
        if isinstance(f, six.string_types):  # load if file.
            if f.split('.')[-1] == 'tsv':
                df = pd.read_csv(
                    f, **kwargs,
                    sep='\t')  # if .tsv, load with tab separation.
            else:
                df = pd.read_csv(f, **kwargs)  # TODO, replace in final code.
        elif isinstance(f, pd.core.frame.DataFrame):  #copy if dataframe.
            df = f.copy()
        else:
            raise TypeError("Input needs to be file path or pandas dataframe!")
        # Set onset to closest prior TR.
        df['onset'] = df['onset'].apply(lambda x: int(np.floor(x / TR)))
        ### Build dummy codes for trial column
        X = Design_Matrix(np.zeros([run_length,
                                    len(df[trial_col].unique())]),
                          columns=df[trial_col].unique(),
                          sampling_freq=sampling_freq)
        for i, row in df.iterrows(
        ):  # for each entry in the .tsv file, mark a contrast for the duration in the design matrix.
            dur = np.ceil(row['duration'] / TR)  # round duration to ceiling.
            X.loc[row['onset'] - 1:row['onset'] + dur - 1, row[trial_col]] = 1
        if sort:
            X = X.reindex(sorted(X.columns), axis=1)  # sort columns.
        ## Parametric modulation, if necessary.
        if parametric_cols:
            par_names = [
                var[0] + '_' + var[1] for var in parametric_cols
            ]  # combine parametric_col indicators to generate new column names.
            XP = Design_Matrix(np.zeros([run_length,
                                         len(par_names)]),
                               columns=par_names,
                               sampling_freq=sampling_freq)
            for idx, cond_par in enumerate(parametric_cols):
                cond = cond_par[0]  # get condition to parametrically modulate
                par = cond_par[1]  # get name of parametric modulator
                print('modulating condition', cond, 'by parametric modulator',
                      par)
                if cond_par[2] == 'cent':
                    with_mean = True
                elif cond_par[2] == 'no_cent':
                    with_mean = False
                if cond_par[3] == 'norm':
                    with_std = True
                elif cond_par[3] == 'no_norm':
                    with_std = False
                df[par_names[idx]] = scale(
                    df[par], with_mean=with_mean, with_std=with_std
                )  # scale/center the parametric modulator
                for i, row in df.iterrows():
                    if row[trial_col] == cond:
                        dur = np.ceil(row['duration'] /
                                      TR)  # round duration to ceiling.
                        if np.isnan(row[par]):  # check for missing data.
                            print('NaN found in parameter', par, 'at onset:',
                                  row['onset'])
                            XP.loc[
                                row['onset'] - 1:row['onset'] + dur -
                                1] = 0  # remove all data within missing area
                        else:
                            XP.loc[
                                row['onset'] - 1:row['onset'] + dur - 1,
                                par_names[idx]] = 1 * row[par_names[
                                    idx]]  # multiply dummy code by parametric modulator.
            X = Design_Matrix(pd.concat([X, XP], axis=1),
                              sampling_freq=sampling_freq
                              )  # join parametric variables to the design.
        out.append(X)  # append each run's design matrix (outside the parametric block so runs without modulators are kept too).
    if len(out) > 1:
        out_dm = out[0].append(out[1:],
                               keep_separate=keep_separate,
                               add_poly=add_poly,
                               unique_cols=unique_cols,
                               fill_na=fill_na)
    else:
        if add_poly is not None:
            out_dm = out[0].add_poly(add_poly)
        else:
            out_dm = out[0]
    return out_dm
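
A hedged usage sketch for `BIDS_to_dm`, assuming the fixes above (the events filename, run length, and the 'gain'/'value' condition and modulator names are illustrative placeholders that follow the docstring format, not values from any real dataset):

events_dm = BIDS_to_dm(
    'sub-01_task-mytask_run-01_events.tsv',        # hypothetical BIDS events file with onset, duration, trial_type, value columns
    sampling_freq=1 / 2.0,                         # TR = 2 s
    run_length=300,                                # 300 TRs in this run
    trial_col='trial_type',
    parametric_cols=[['gain', 'value', 'cent', 'no_norm']],  # modulate 'gain' trials by a mean-centered 'value' column
    add_poly=1)                                    # intercept + linear trend
print(events_dm.details())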
Example #21
0
plt.plot(np.mean(data.data, axis=1), linewidth=3)
plt.xlabel('Time', fontsize=18)
plt.ylabel('Intensity', fontsize=18)


# Notice there is a clear slow drift in the signal that we will need to remove with our high pass filter.
# 
# Now, let's see if there are any spikes in the data that exceed our threshold. What happens if we use a different threshold?
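
# Before moving on to spike detection, here is a minimal, self-contained sketch
# (not from this notebook) of the high-pass filtering idea mentioned above:
# adding a discrete cosine basis to a Design_Matrix and including it in the
# regression removes slow drift. The toy dimensions and the 128 s cutoff are
# illustrative assumptions.

from nltools.data import Design_Matrix
import numpy as np

toy = Design_Matrix(np.random.randn(200, 1), columns=['signal'], sampling_freq=1 / 2.0)  # 200 TRs at TR = 2 s
toy_hp = toy.add_dct_basis(duration=128)  # adds cosine_* regressors up to a ~1/128 Hz cutoff
print(toy_hp.details())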

# In[37]:


spikes = data.find_spikes(global_spike_cutoff=2.5, diff_spike_cutoff=2.5)

f, a = plt.subplots(figsize=(15,3))
spikes = Design_Matrix(spikes.iloc[:,1:], sampling_freq=1/tr)
spikes.plot(ax = a, linewidth=2)


# For this subject, our spike identification procedure only found a single spike. Let's add all of these covariates to our design matrix.
# 
# In this example, we will append each of these additional matrices to our main design matrix. 
# 
# **Note**: `.append()` requires that all matrices are `Design_Matrix` objects with the same sampling frequency.

# In[38]:


dm_conv_filt_poly_cov = pd.concat([dm_conv_filt_poly, mc_cov, spikes], axis=1)
dm_conv_filt_poly_cov.heatmap(cmap='RdBu_r', vmin=-1,vmax=1)
Example #22
0
#########################################################################
# Design Matrix Basics
# --------------------
#
# Let's just create a basic toy design matrix by hand corresponding to a single participant's data from an experiment with 22 TRs, collected at a temporal resolution of 1.5s. For this example we'll have 4 unique "stimulus conditions" that each occur for 2 TRs (3s) with 1 TR (1.5s) of rest between events.

from nltools.data import Design_Matrix
import numpy as np

TR = 1.5  # Design Matrices take a sampling_freq argument specified in hertz which can be converted as 1./TR

dm = Design_Matrix(np.array([[0, 0, 0, 0], [0, 0, 0, 0], [1, 0, 0, 0],
                             [1, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 0],
                             [0, 1, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0],
                             [0, 0, 1, 0], [0, 0, 0, 0], [0, 0, 0, 1],
                             [0, 0, 0, 1], [0, 0, 0, 0], [0, 0, 0, 0],
                             [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0],
                             [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0],
                             [0, 0, 0, 0]]),
                   sampling_freq=1. / TR,
                   columns=['face_A', 'face_B', 'house_A', 'house_B'])
#########################################################################
# Notice how this looks exactly like a pandas dataframe. That's because design matrices are *subclasses* of dataframes with some extra attributes and methods.

print(dm)

#########################################################################
# Let's take a look at some of that meta-data. We can see that no columns have been convolved yet and this design matrix has no polynomial terms (e.g. an intercept or linear trend).

print(dm.details())
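
#########################################################################
# A short follow-up sketch (not part of the original gallery example):
# convolving with the canonical HRF and adding polynomial terms updates the
# meta-data reported by .details(); the order=1 choice here is illustrative.

dm_conv = dm.convolve()                   # convolve the stimulus columns with the HRF
dm_conv_poly = dm_conv.add_poly(order=1)  # add an intercept and a linear trend
print(dm_conv_poly.details())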

#########################################################################
tr = 1.5
outlier_cutoff = 3

file_list = [x for x in glob.glob(os.path.join(base_dir, '*/func/*preproc*gz')) if 'denoised' not in x] 
for f in file_list:
    sub = os.path.basename(f).split('_')[0]

    data = Brain_Data(f)
    smoothed = data.smooth(fwhm=fwhm)

    spikes = smoothed.find_spikes(global_spike_cutoff=outlier_cutoff, diff_spike_cutoff=outlier_cutoff)
    covariates = pd.read_csv(glob.glob(os.path.join(base_dir, sub, 'func', '*tsv'))[0], sep='\t')
    mc = covariates[['trans_x','trans_y','trans_z','rot_x', 'rot_y', 'rot_z']]
    mc_cov = make_motion_covariates(mc, tr)
    csf = covariates['csf'] # Use CSF from fmriprep output
    dm = Design_Matrix(pd.concat([csf, mc_cov, spikes.drop(labels='TR', axis=1)], axis=1), sampling_freq=1/tr)
    dm = dm.add_poly(order=2, include_lower=True) # Add Intercept, Linear and Quadratic Trends

    smoothed.X = dm
    stats = smoothed.regress()
    stats['residual'].data = np.float32(stats['residual'].data) # cast as float32 to reduce storage space
    stats['residual'].write(os.path.join(base_dir, sub, 'func', f'{sub}_denoise_smooth{fwhm}mm_task-sherlockPart1_space-MNI152NLin2009cAsym_desc-preproc_bold.nii.gz'))


We also saved the cropped denoised viewing data as an hdf5 file to speed up loading times when using nltools.
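
For reference, before the full loop below, a minimal sketch of the HDF5 round trip itself (the paths are illustrative, and this assumes an nltools version whose `Brain_Data` supports reading and writing `.hdf5` files):

from nltools.data import Brain_Data

data = Brain_Data('sub-01_denoise_smooth6mm_bold.nii.gz')   # hypothetical denoised NIfTI from the step above
data.write('sub-01_denoise_smooth6mm_bold.hdf5')            # hdf5 reloads much faster than NIfTI
data = Brain_Data('sub-01_denoise_smooth6mm_bold.hdf5')     # reload from hdf5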

data_dir = '/Volumes/Engram/Data/Sherlock/fmriprep'

for scan in ['Part1', 'Part2']:
    file_list = glob.glob(os.path.join(data_dir, '*', 'func', f'*crop*{scan}*nii.gz'))
    for f in file_list: