def pca_decoder(data, nfolds=2, dims=10, cfun=isfc, weights_fun=laplace_weights,
                weights_params=laplace_params, combine=mean_combine, rfun=None):
    """
    :param data: a list of number-of-observations by number-of-features matrices
    :param nfolds: number of cross-validation folds (train using out-of-fold data;
        test using in-fold data)
    :param dims: maximum number of PCA dimensions to evaluate; decoding is run for
        every dimensionality from 1 through dims (default: 10)
    :param cfun: function for transforming the group data (default: isfc)
    :param weights_fun: used to compute per-timepoint weights for cfun; default: laplace_weights
    :param weights_params: parameters passed to weights_fun; default: laplace_params
    :param combine: function for combining data within each group, or a list of such
        functions (default: mean_combine)
    :param rfun: function for reducing output (default: None)
    :return: a pandas DataFrame of decoding results with the following columns:
        'rank': mean percentile rank (across all timepoints) in the decoding
            distribution of the true timepoint
        'accuracy': mean percent accuracy (across all timepoints)
        'error': mean estimation error (across all timepoints) between the decoded
            and actual window numbers, expressed as a percentage of the total
            number of windows
        'dims': number of PCA dimensions used
        'folds': cross-validation fold
    """
    assert len(np.unique(list(map(lambda x: x.shape[0], data)))) == 1, \
        'all data matrices must have the same number of timepoints'
    assert len(np.unique(list(map(lambda x: x.shape[1], data)))) == 1, \
        'all data matrices must have the same number of features'

    pca_data = np.asarray(hyp.reduce(list(data), ndims=dims))
    group_assignments = get_xval_assignments(len(pca_data), nfolds)
    results_pd = pd.DataFrame()

    for i in range(0, nfolds):
        for d in range(1, dims + 1):
            # split the PCA-reduced data into in-fold and out-of-fold groups,
            # keeping only the first d dimensions
            in_data = np.asarray([x for x in pca_data[group_assignments == i]])[:, :, :d]
            out_data = np.asarray([x for x in pca_data[group_assignments != i]])[:, :, :d]

            in_smooth, out_smooth, in_raw, out_raw = folding_levels(
                in_data, out_data, level=0, cfun=cfun, rfun=[rfun], combine=[combine],
                weights_fun=weights_fun, weights_params=weights_params)

            # zero-pad to at least 3 columns so the correlation-based distance is
            # well defined for low-dimensional embeddings
            if d < 3:
                in_smooth = np.hstack(
                    (in_smooth, np.zeros((in_smooth.shape[0], 3 - in_smooth.shape[1]))))
                out_smooth = np.hstack(
                    (out_smooth, np.zeros((out_smooth.shape[0], 3 - out_smooth.shape[1]))))

            corrs = np.array(1 - sd.cdist(in_smooth, out_smooth, 'correlation'))

            next_results_pd = decoder(corrs)
            next_results_pd['dims'] = d
            next_results_pd['folds'] = i
            results_pd = pd.concat([results_pd, next_results_pd])

    return results_pd
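
A minimal usage sketch for pca_decoder. The synthetic dataset, subject count, and dimensionality below are illustrative assumptions, and the 'rank'/'accuracy'/'error' columns follow the docstring above:

import numpy as np

# simulate a small multi-subject dataset: 4 subjects, 100 timepoints, 25 features
np.random.seed(0)
data = [np.random.randn(100, 25) for _ in range(4)]

# decode with 2 cross-validation folds, evaluating 1 through 5 PCA dimensions
results = pca_decoder(data, nfolds=2, dims=5)

# summarize decoding performance for each dimensionality
print(results.groupby('dims')[['rank', 'accuracy', 'error']].mean())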
def reduce(corrs, rfun=None):
    '''
    :param corrs: a matrix of vectorized correlation matrices (output of mat2vec), or a
        list of such matrices

    :param rfun: function to use for dimensionality reduction. All hypertools and
        scikit-learn functions are supported: PCA, IncrementalPCA, SparsePCA,
        MiniBatchSparsePCA, KernelPCA, FastICA, FactorAnalysis, TruncatedSVD,
        DictionaryLearning, MiniBatchDictionaryLearning, TSNE, Isomap,
        SpectralEmbedding, LocallyLinearEmbedding, MDS, and UMAP.

        Can be passed as a string, but for finer control of the model parameters, pass
        as a dictionary, e.g. rfun={'model': 'PCA', 'params': {'whiten': True}}. See
        the scikit-learn documentation for the parameters supported by each model.

        Another option is to use graph theoretic measures computed for each node. The
        following measures are supported (via the brainconn toolbox):
        eigenvector_centrality, pagerank_centrality, and strength. (Each of these must
        be specified as a string; dictionaries are not supported.)

        Default: None (no dimensionality reduction)

    :return: dimensionality-reduced (or original) correlation matrices
    '''
    if rfun is None:
        return corrs

    # recover the number of nodes, V, from the length of a vectorized correlation
    # matrix (V * (V + 1) / 2 entries in the upper triangle, including the diagonal)
    get_V = lambda x: int(np.divide(np.sqrt(8 * x + 1) - 1, 2))

    if type(corrs) is list:
        V = get_V(corrs[0].shape[1])
    else:
        V = get_V(corrs.shape[1])

    if rfun in graph_measures.keys():
        return apply_by_row(corrs, graph_measures[rfun])
    else:
        red_corrs = hyp.reduce(corrs, reduce=rfun, ndims=V)

        D = np.shape(red_corrs)[-1]
        if D < V:
            # zero-pad the reduced data out to V columns, one padding row per observation
            red_corrs = np.hstack((red_corrs, np.zeros((red_corrs.shape[0], V - D))))

        return red_corrs
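
A brief usage sketch, assuming the reduce function above is in scope. The random array merely stands in for real mat2vec output so the expected shapes are visible (50 rows for a 10-node network, i.e. 10 * 11 / 2 = 55 columns):

import numpy as np

vec_corrs = np.random.randn(50, 55)  # placeholder for vectorized correlation matrices

reduced_str = reduce(vec_corrs, rfun='PCA')                                       # string form
reduced_dict = reduce(vec_corrs, rfun={'model': 'PCA', 'params': {'whiten': True}})  # dictionary form
unchanged = reduce(vec_corrs, rfun=None)                                          # pass-through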
def test_reduce():
    n_components = 10

    for m in models:
        if m == 'SparseCoder':
            dictionary = hyp.reduce(dw.stack(normalized_weights).T.values,
                                    'IncrementalPCA',
                                    n_components=n_components).values.T
            next_model = {'model': m, 'args': [], 'kwargs': {'dictionary': dictionary}}
        else:
            next_model = {'model': m, 'args': [], 'kwargs': {'n_components': n_components}}

        reduced_weights = hyp.reduce(normalized_weights, model=next_model)
        assert type(reduced_weights) is list
        assert len(reduced_weights) == len(normalized_weights)
        assert all([r.shape[0] == w.shape[0]
                    for r, w in zip(reduced_weights, normalized_weights)])
        assert all([r.shape[1] == n_components for r in reduced_weights])

        x = hyp.reduce(normalized_weights[0], model=next_model)
        assert type(x) is pd.DataFrame
        assert x.shape[0] == normalized_weights[0].shape[0]
        assert x.shape[1] == n_components
X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=3,
                           n_redundant=17, n_repeated=0, n_classes=2,
                           n_clusters_per_class=2, flip_y=0.01, class_sep=3.0,
                           hypercube=True, shift=0.0, scale=1.0, shuffle=True,
                           random_state=42)
X_plot = hyp.reduce(X, ndims=2, reduce=reduction_method)

data1 = (X, y, X_plot)
datasets = [data1]

# ============
# hyp.plot(X, group=y, align='hyper')
# hyp.describe(X, reduce='UMAP', max_dims=5)
# hyp.describe(X, reduce='UMAP')
# hyp.plot(X, '.', reduce='UMAP', hue=y, ndims=2)
# X = hyp.reduce(X_raw, ndims=n_features, reduce=reduction_method)
# X_reduce = hyp.reduce(X, ndims=n_reduced_features, reduce=reduction_method)
# X_plot = hyp.reduce(X, ndims=2, reduce=reduction_method)
# data1 = make_box_data_random(n_samples=num_samples, min_val=-0.3, max_val=1.3, stratify=True, shuffle=True)
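
To eyeball the 2D projection computed above, a plotting call mirroring the commented-out examples could be used (illustrative only; hue colors the points by class label):

hyp.plot(X_plot, '.', hue=y)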
## Getting some intuitions for the Sherlock data using HyperTools

### Examining brain responses in visual, auditory, and motor cortex during movie watching

Participants in the Sherlock experiment all watched the same audiovisual movie. Therefore, to the extent that participants' brain responses were driven by the movie, we might expect their responses in primary auditory and visual cortex to follow similar or related patterns. In contrast, non-sensory regions like primary motor cortex should not show this sort of agreement.

We can test this intuition qualitatively by projecting the ROI data from visual, auditory, and motor cortex into a shared low-dimensional space. Each participant's trajectory will be plotted in a different color. Regions that show greater agreement across participants will have more similarly shaped (overlapping) trajectories when plotted using the HyperTools pipeline.

def plot_aligned_ROI_trajectories(data, reduce='UMAP', align='hyper', n_iter=5, ndims=500,
                                  internal_reduce='IncrementalPCA', **kwargs):
    if type(data) == dict:
        for r in data.keys():  # roi
            plot_aligned_ROI_trajectories(data[r], reduce=reduce, align=align, n_iter=n_iter,
                                          ndims=ndims, internal_reduce=internal_reduce,
                                          title=r, **kwargs)
    else:
        # step 1: reduce dataset before aligning (runs much faster)
        reduced_data = hyp.reduce([x.data for x in data], reduce=internal_reduce, ndims=ndims)

        # step 2: smooth trajectories so they look prettier
        smoothed_data = tc.smooth(reduced_data, kernel_fun=tc.helpers.gaussian_weights,
                                  kernel_params={'var': 500})

        # step 3: align trajectories
        aligned_data = smoothed_data
        for i in range(n_iter):
            aligned_data = hyp.align(aligned_data, align=align)

        # now generate a plot
        hyp.plot(aligned_data, reduce=reduce, **kwargs)

plot_aligned_ROI_trajectories(data['Part1'])

We can see strong agreement across people in V1 and A1, whereas precentral gyrus responses are much more variable. Now let's see if these patterns also hold for the second half of the dataset:
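
(Assuming the second half of the data is stored under a 'Part2' key, mirroring 'Part1' above, the same call produces the corresponding trajectories.)

plot_aligned_ROI_trajectories(data['Part2'])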
for n_neighbors in list(range(100, 220, 10)) + [161]:
    for min_dist in np.arange(.1, 1, .2):
        for spread in range(1, 11, 2):
            params = {
                'metric': 'correlation',
                'random_state': seed,
                'n_neighbors': n_neighbors,
                'min_dist': min_dist,
                'spread': spread
            }

            np.random.seed(seed)
            embeddings = hyp.reduce(to_reduce,
                                    reduce={'model': 'UMAP', 'params': params},
                                    ndims=2)

            if order == 1:
                video_embedding = embeddings[-2]
                recall_embeddings = embeddings[:-2]
                avg_recall_embedding = embeddings[-1]
            elif order == 2:
                video_embedding = embeddings[-1]
                recall_embeddings = embeddings[:-2]
                avg_recall_embedding = embeddings[-2]
            elif order == 3:
                video_embedding = embeddings[0]
    losses = np.array([[line[0]] for line in lines])
    positions = np.array([line[1:] for line in lines])
    return {'name': fname, 'losses': losses, 'positions': positions}

logs = [parseLogFile(fname) for fname in logFileNames]

########################################################################################################################
# 2. Reduce dimensions
# We want to apply the same transformation to all points. So:

# 2a. Combine points into a single list, and reduce
combinedPositions = np.concatenate([log['positions'] for log in logs])
combinedPositionsReduced = hyp.reduce(combinedPositions, ndims=2, reduce='PCA')

# 2b. Separate the combined points
separatedPositions = []
for log in logs:
    losses = log['losses']
    positionsReduced = combinedPositionsReduced[:len(log['positions'])]
    combinedPositionsReduced = combinedPositionsReduced[len(log['positions']):]
    spa = np.concatenate((positionsReduced, losses), axis=1)
    logging.info(spa.shape)
    separatedPositions.append({'name': log['name'], 'prl': spa})

# 2c. Only keep large log files, and only after they stabilize
# import
from scipy.linalg import toeplitz
import numpy as np
from copy import copy
import hypertools as hyp

# simulate data
K = 10 - toeplitz(np.arange(10))
data1 = np.cumsum(np.random.multivariate_normal(np.zeros(10), K, 250), axis=0)
data2 = copy(data1)

# randomly remove 1% of the data
missing = .01
inds = [(i, j) for i in range(data1.shape[0]) for j in range(data1.shape[1])]
missing_data = [inds[i] for i in np.random.choice(int(len(inds)), int(len(inds) * missing))]
for i, j in missing_data:
    data2[i, j] = np.nan

# reduce the data
data1_r, data2_r = hyp.reduce([data1, data2], ndims=3)

# pull out missing inds
missing_inds = hyp.tools.missing_inds(data2)
missing_data = data2_r[missing_inds, :]

# plot
hyp.plot([data1_r, data2_r, missing_data], ['-', '--', '*'],
         legend=['Full', 'Missing', 'Missing Points'])