Example #1
import numpy as np
import pandas as pd
import scipy.spatial.distance as sd
import hypertools as hyp

# isfc, laplace_weights, laplace_params, mean_combine, folding_levels,
# get_xval_assignments, and decoder are helpers defined elsewhere in this module
def pca_decoder(data, nfolds=2, dims=10, cfun=isfc, weights_fun=laplace_weights,
                weights_params=laplace_params, combine=mean_combine, rfun=None):
    """
    :param data: a list of number-of-observations by number-of-features matrices
    :param nfolds: number of cross-validation folds (train using out-of-fold data;
                   test using in-fold data)
    :param dims: maximum number of PCA dimensions to retain; decoding is evaluated
                 using 1 through dims dimensions (default: 10)
    :param cfun: function for transforming the group data (default: isfc)
    :param weights_fun: used to compute per-timepoint weights for cfun (default: laplace_weights)
    :param weights_params: parameters passed to weights_fun (default: laplace_params)
    :param combine: function for combining data within each group, or a list of such functions (default: mean_combine)
    :param rfun: function for reducing output (default: None)
    :return: results dictionary with the following keys:
       'rank': mean percentile rank (across all timepoints and folds) in the
               decoding distribution of the true timepoint
       'accuracy': mean percent accuracy (across all timepoints and folds)
       'error': mean estimation error (across all timepoints and folds) between
                the decoded and actual window numbers, expressed as a percentage
                of the total number of windows
    """

    assert len(np.unique(
        list(map(lambda x: x.shape[0], data)))) == 1, 'all data matrices must have the same number of timepoints'
    assert len(np.unique(
        list(map(lambda x: x.shape[1], data)))) == 1, 'all data matrices must have the same number of features'


    # project all datasets into a shared dims-dimensional space via PCA
    pca_data = np.asarray(hyp.reduce(list(data), ndims=dims))

    group_assignments = get_xval_assignments(len(pca_data), nfolds)
    results_pd = pd.DataFrame()

    for i in range(nfolds):
        for d in range(1, dims + 1):

            # select the first d principal components for the in-fold and out-of-fold groups
            in_data = pca_data[group_assignments == i][:, :, :d]
            out_data = pca_data[group_assignments != i][:, :, :d]

            in_smooth, out_smooth, in_raw, out_raw = folding_levels(in_data, out_data, level=0, cfun=cfun,
                                                                    rfun=[rfun], combine=[combine],
                                                                    weights_fun=weights_fun,
                                                                    weights_params=weights_params)

            # pad with zero columns so correlations are computed over at least 3 features
            if d < 3:
                in_smooth = np.hstack((in_smooth, np.zeros((in_smooth.shape[0], 3 - in_smooth.shape[1]))))
                out_smooth = np.hstack((out_smooth, np.zeros((out_smooth.shape[0], 3 - out_smooth.shape[1]))))
            # timepoint-by-timepoint correlations between in-fold and out-of-fold patterns
            corrs = 1 - sd.cdist(in_smooth, out_smooth, 'correlation')
            next_results_pd = decoder(corrs)
            next_results_pd['dims'] = d
            next_results_pd['folds'] = i

            results_pd = pd.concat([results_pd, next_results_pd])

    return results_pd
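
A minimal usage sketch (hypothetical: the synthetic data and parameter choices below are illustrative, not from the original source):

# hypothetical usage: decode timepoints from five synthetic "subjects",
# each with 100 timepoints and 20 features
synthetic_data = [np.random.randn(100, 20) for _ in range(5)]

# evaluate decoding with 1 through 5 PCA dimensions and 2 cross-validation folds
results = pca_decoder(synthetic_data, nfolds=2, dims=5)

# one row per (fold, dims) combination; average the decoding metrics by dimensionality
print(results.groupby('dims').mean())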
Example #2
import numpy as np
import hypertools as hyp

# graph_measures and apply_by_row are helpers defined elsewhere in this module
def reduce(corrs, rfun=None):
    '''
    :param corrs: a matrix of vectorized correlation matrices (output of mat2vec), or a list
                  of such matrices

    :param rfun: function to use for dimensionality reduction.  All hypertools and
        scikit-learn functions are supported: PCA, IncrementalPCA, SparsePCA,
        MiniBatchSparsePCA, KernelPCA, FastICA, FactorAnalysis, TruncatedSVD,
        DictionaryLearning, MiniBatchDictionaryLearning, TSNE, Isomap,
        SpectralEmbedding, LocallyLinearEmbedding, MDS, and UMAP.

        Can be passed as a string, but for finer control of the model
        parameters, pass as a dictionary, e.g.
        reduction={'model': 'PCA', 'params': {'whiten': True}}.

        See scikit-learn specific model docs for details on parameters supported
        for each model.

        Another option is to use graph theoretic measures computed for each node.
        The following measures are supported (via the brainconn toolbox):
        eigenvector_centrality, pagerank_centrality, and strength.  (Each
        of these must be specified as a string; dictionaries not supported.)

        Default: None (no dimensionality reduction)

    :return: dimensionality-reduced (or original) correlation matrices
    '''

    if rfun is None:
        return corrs

    # recover the number of nodes V from the length x of a vectorized correlation
    # matrix (diagonal included): x = V * (V + 1) / 2, so V = (sqrt(8x + 1) - 1) / 2
    get_V = lambda x: int(np.divide(np.sqrt(8 * x + 1) - 1, 2))

    if type(corrs) is list:
        V = get_V(corrs[0].shape[1])
    else:
        V = get_V(corrs.shape[1])

    # graph measures must be specified as strings; dictionary-valued rfuns fall
    # through to hyp.reduce below
    if isinstance(rfun, str) and rfun in graph_measures:
        return apply_by_row(corrs, graph_measures[rfun])
    else:
        red_corrs = hyp.reduce(corrs, reduce=rfun, ndims=V)

        D = np.shape(red_corrs)[-1]

        if D < V:
            # pad with zero columns so every row keeps V features
            red_corrs = np.hstack((red_corrs, np.zeros((red_corrs.shape[0], V - D))))

        return red_corrs
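
A quick hedged sketch of how reduce might be called (random vectors stand in for real vectorized correlation matrices; shapes follow the docstring's conventions):

# hypothetical usage: T timepoints, V nodes, so each vectorized correlation
# matrix (diagonal included) has V * (V + 1) / 2 entries
T, V = 50, 10
corrs = np.random.rand(T, V * (V + 1) // 2)

reduced = reduce(corrs, rfun='PCA')         # scikit-learn reducer via hypertools
strengths = reduce(corrs, rfun='strength')  # graph-theoretic measure (brainconn)
passthrough = reduce(corrs)                 # rfun=None returns the input unchanged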
# models, normalized_weights, and dw (the datawrangler package) are fixtures
# defined elsewhere in the original test module
def test_reduce():
    n_components = 10
    for m in models:
        if m == 'SparseCoder':
            dictionary = hyp.reduce(dw.stack(normalized_weights).T.values,
                                    'IncrementalPCA',
                                    n_components=n_components).values.T
            next_model = {
                'model': m,
                'args': [],
                'kwargs': {
                    'dictionary': dictionary
                }
            }
        else:
            next_model = {
                'model': m,
                'args': [],
                'kwargs': {
                    'n_components': n_components
                }
            }

        reduced_weights = hyp.reduce(normalized_weights, model=next_model)
        assert type(reduced_weights) is list
        assert len(reduced_weights) == len(normalized_weights)
        assert all([
            r.shape[0] == w.shape[0]
            for r, w in zip(reduced_weights, normalized_weights)
        ])
        assert all([r.shape[1] == n_components for r in reduced_weights])

        x = hyp.reduce(normalized_weights[0], model=next_model)
        assert type(x) is pd.DataFrame
        assert x.shape[0] == normalized_weights[0].shape[0]
        assert x.shape[1] == n_components

    # assumed values for names not defined in this excerpt
    from sklearn.datasets import make_classification
    n_samples, n_features, reduction_method = 1000, 20, 'PCA'

    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=3,
                               n_redundant=17,
                               n_repeated=0,
                               n_classes=2,
                               n_clusters_per_class=2,
                               flip_y=0.01,
                               class_sep=3.0,
                               hypercube=True,
                               shift=0.0,
                               scale=1.0,
                               shuffle=True,
                               random_state=42)
    X_plot = hyp.reduce(X, ndims=2, reduce=reduction_method)
    data1 = (X, y, X_plot)

    datasets = [data1]

    # ============

    # hyp.plot(X, group=y,  align='hyper')
    # hyp.describe(X, reduce='UMAP', max_dims=5)
    # hyp.describe(X, reduce='UMAP')
    # hyp.plot(X, '.', reduce='UMAP', hue=y, ndims=2)
    # X = hyp.reduce(X_raw, ndims=n_features, reduce=reduction_method)
    # X_reduce = hyp.reduce(X, ndims=n_reduced_features, reduce=reduction_method)
    # X_plot = hyp.reduce(X, ndims=2, reduce=reduction_method)

    # data1 = make_box_data_random(n_samples=num_samples, min_val=-0.3, max_val=1.3, stratify=True, shuffle=True)
## Getting some intuitions for the Sherlock data using HyperTools

### Examining brain responses in visual, auditory, and motor cortex during movie watching

Participants in the Sherlock experiment all watched the same audiovisual movie.  Therefore, to the extent that their brain responses were driven by the movie, we might expect responses in primary auditory and visual cortex to follow similar or related patterns across participants.  In contrast, non-sensory regions like primary motor cortex should not show this sort of agreement.

We can test this intuition qualitatively by projecting the ROI data from visual, auditory, and motor cortex into a shared low-dimensional space.  Each participant's trajectory will be plotted in a different color.  Regions that show greater agreement across participants will have more similarly shaped (overlapping) trajectories when plotted using the HyperTools pipeline.

# hyp is hypertools; tc is timecorr (used below for temporal smoothing)
def plot_aligned_ROI_trajectories(data, reduce='UMAP', align='hyper', n_iter=5, ndims=500, internal_reduce='IncrementalPCA', **kwargs):
    if isinstance(data, dict):
        for r in data.keys():  # one call (and plot) per ROI
            plot_aligned_ROI_trajectories(data[r], reduce=reduce, align=align, n_iter=n_iter,
                                          ndims=ndims, internal_reduce=internal_reduce, title=r, **kwargs)
    else:
        #step 1: reduce dataset before aligning (runs much faster)
        reduced_data = hyp.reduce([x.data for x in data], reduce=internal_reduce, ndims=ndims)

        #step 2: smooth trajectories so they look prettier
        smoothed_data = tc.smooth(reduced_data, kernel_fun=tc.helpers.gaussian_weights, kernel_params={'var': 500})
        
        #step 3: align trajectories
        aligned_data = smoothed_data
        for i in range(n_iter):
            aligned_data = hyp.align(aligned_data, align=align)

        #now generate a plot
        hyp.plot(aligned_data, reduce=reduce, **kwargs)

plot_aligned_ROI_trajectories(data['Part1'])

We can see strong agreement across people in V1 and A1, whereas precentral gyrus responses are much more variable.  Now let's see if these patterns also hold for the second half of the dataset:
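Following the pattern above (the 'Part2' key is an assumption inferred from the 'Part1' naming; the original call is not shown in this excerpt):

plot_aligned_ROI_trajectories(data['Part2'])
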
# seed, to_reduce, and order are defined earlier in the original notebook
for n_neighbors in list(range(100, 220, 10)) + [161]:
    for min_dist in np.arange(.1, 1, .2):
        for spread in range(1, 11, 2):
            params = {
                'metric': 'correlation',
                'random_state': seed,
                'n_neighbors': n_neighbors,
                'min_dist': min_dist,
                'spread': spread
            }

            np.random.seed(seed)
            embeddings = hyp.reduce(to_reduce,
                                    reduce={
                                        'model': 'UMAP',
                                        'params': params
                                    },
                                    ndims=2)

            if order == 1:
                video_embedding = embeddings[-2]
                recall_embeddings = embeddings[:-2]
                avg_recall_embedding = embeddings[-1]

            elif order == 2:
                video_embedding = embeddings[-1]
                recall_embeddings = embeddings[:-2]
                avg_recall_embedding = embeddings[-2]

            elif order == 3:
                video_embedding = embeddings[0]
Example #7
import logging

import numpy as np
import hypertools as hyp
def parseLogFile(fname):
    # assumed head for this truncated excerpt: parse rows of whitespace-separated numbers
    lines = [list(map(float, l.split())) for l in open(fname) if l.strip()]
    losses = np.array([[line[0]] for line in lines])
    positions = np.array([line[1:] for line in lines])
    return {'name': fname, 'losses': losses, 'positions': positions}


logs = [parseLogFile(fname) for fname in logFileNames]  # logFileNames: list of log paths defined earlier in the original script

########################################################################################################################
# 2. Reduce dimensions

# We want to apply the same transformation to all points. So:

# 2a. Combine points into single list, and reduce

combinedPositions = np.concatenate([log['positions'] for log in logs])
combinedPositionsReduced = hyp.reduce(combinedPositions, ndims=2, reduce='PCA')

# 2b. Separate combined points

separatedPositions = []
for log in logs:
    losses = log['losses']
    # peel this log's rows off the front of the combined reduced array
    positionsReduced = combinedPositionsReduced[:len(log['positions'])]
    combinedPositionsReduced = combinedPositionsReduced[len(log['positions']):]

    spa = np.concatenate((positionsReduced, losses), axis=1)
    logging.info(spa.shape)
    separatedPositions.append({'name': log['name'], 'prl': spa})

# 2c. Only keep large log files, and only after they stabilize

# import
from scipy.linalg import toeplitz
import numpy as np
from copy import copy
import hypertools as hyp

# simulate data
K = 10 - toeplitz(np.arange(10))
data1 = np.cumsum(np.random.multivariate_normal(np.zeros(10), K, 250), axis=0)
data2 = copy(data1)

# randomly remove 1% of the data
missing = .01
inds = [(i, j) for i in range(data1.shape[0]) for j in range(data1.shape[1])]
missing_data = [
    inds[i] for i in np.random.choice(int(len(inds)), int(len(inds) * missing))
]
for i, j in missing_data:
    data2[i, j] = np.nan

# reduce the data
data1_r, data2_r = hyp.reduce([data1, data2], ndims=3)

# pull out missing inds
missing_inds = hyp.tools.missing_inds(data2)
missing_data = data2_r[missing_inds, :]

# plot
hyp.plot([data1_r, data2_r, missing_data], ['-', '--', '*'],
         legend=['Full', 'Missing', 'Missing Points'])