def canonical_correlation_analysis(occurences_a, occurences_b):
    """Fit a one-component CCA between two one-hot encoded inputs and score it.

    Args:
        occurences_a: Values accepted by ``pd.Series``; converted to a
            categorical series and one-hot encoded.
        occurences_b: Sequence of ``(name, values)`` pairs (the input format of
            the former ``pd.DataFrame.from_items``); each pair becomes one
            column before one-hot encoding.

    Returns:
        The value of ``CCA.score`` evaluated on the same data the model was
        fitted on.
    """
    # Function-scope import keeps the block self-contained.
    from collections import OrderedDict

    dummies_a = pd.get_dummies(pd.Series(occurences_a, dtype="category"))

    # DataFrame.from_items was removed in pandas 1.0; from_dict on an ordered
    # mapping is the documented replacement and keeps the column order.
    frame_b = pd.DataFrame.from_dict(OrderedDict(occurences_b))
    dummies_b = pd.get_dummies(frame_b)

    cca = CCA(n_components=1)
    cca.fit(dummies_a, dummies_b)
    return cca.score(dummies_a, dummies_b)
def cca_fit(X, Y):
    """Fit a one-component CCA on ``X``/``Y`` and score it on a short prefix.

    The model is fitted on the full inputs, but the returned ``CCA.score`` is
    evaluated only on the first ten items of each input.
    """
    model = CCA(n_components=1)
    model.fit(X, Y)

    # Only the leading ten samples of each block are used for scoring.
    head_x, head_y = (list(itertools.islice(block, 10)) for block in (X, Y))
    return model.score(head_x, head_y)
def get_cca(chip_cors, rna_vec):
    """Score a one-component CCA between two max-normalised vectors.

    Each input is divided element-wise by its own maximum, turned into a
    single-column matrix, and used to fit ``CCA(n_components=1)``.

    Args:
        chip_cors: Sequence of numbers; becomes the Y block.
        rna_vec: Sequence of numbers; becomes the X block.

    Returns:
        ``CCA.score`` evaluated on the fitted data.
    """
    # Compute each maximum once; the original re-evaluated max() for every
    # element inside the comprehension, which is quadratic in the length.
    chip_max = max(chip_cors)
    rna_max = max(rna_vec)

    # Shape (1, n) transposed to (n, 1): one sample per row, one feature.
    Y_vec = np.array([[value / chip_max for value in chip_cors]]).transpose()
    X_vec = np.array([[value / rna_max for value in rna_vec]]).transpose()

    cca_obj = CCA(n_components=1)
    cca_obj.fit(X_vec, Y_vec)
    r_squared_canonical = cca_obj.score(X_vec, Y_vec)
    return r_squared_canonical
def cca_for_ssvep(input_data, sampling_rate, compared_frequencies):
    """Correlate a recording with SSVEP reference signals using CCA.

    For every candidate frequency a reference matrix is built that holds a
    sine and a cosine row for each harmonic (two harmonics). A CCA is fitted
    between the recording and that reference, and the correlation between the
    first pair of canonical variates is stored.

    Args:
        input_data: 2-D array whose second axis is time (``shape[1]`` is used
            as the number of time points); rows are presumably EEG channels.
        sampling_rate: Samples per second, used to convert sample indices to
            seconds.
        compared_frequencies: Sequence of target frequencies.

    Returns:
        ``numpy`` array of shape ``(1, len(compared_frequencies))`` with one
        canonical correlation (Rho) per target frequency.
    """
    # TODO: Strict input checks and explicit exceptions for malformed input.
    number_time_points = input_data.shape[1]
    number_harmonics = 2

    # Time stamps in seconds; identical for every frequency, so built once.
    time_points_count = numpy.arange(number_time_points, dtype='float')
    time_points_count = time_points_count / sampling_rate

    # CCA expects samples on the first axis, so time must come first.
    recording_samples = numpy.transpose(input_data)

    cca_rho_values = numpy.zeros([1, len(compared_frequencies)], dtype='float')

    for frequency_index, frequency in enumerate(compared_frequencies):
        # Rows 2h and 2h + 1 hold the sine and cosine of harmonic h + 1.
        y_matrix = numpy.zeros([number_harmonics * 2, number_time_points])
        for harmonic in range(number_harmonics):
            base_constant = 2 * numpy.pi * (harmonic + 1) * frequency
            y_matrix[2 * harmonic, :] = numpy.sin(base_constant * time_points_count)
            y_matrix[2 * harmonic + 1, :] = numpy.cos(base_constant * time_points_count)
        reference_samples = numpy.transpose(y_matrix)

        cca_object = CCA(n_components=number_harmonics)
        cca_object.fit(recording_samples, reference_samples)

        # Transform with the same orientation used for fitting. The original
        # passed the untransposed matrices here and handed the transformed Y
        # to score() as sample weights.
        values_x, values_y = cca_object.transform(recording_samples,
                                                  reference_samples)

        # Rho is the Pearson correlation of the first canonical variate pair;
        # CCA.score() would return an R^2 of the prediction instead.
        cca_rho_values[0, frequency_index] = numpy.corrcoef(
            values_x[:, 0], values_y[:, 0])[0, 1]

    return cca_rho_values
def doCCA(metrics, color):
    """Fit, plot and log a one-component CCA on the rows with first column 90.

    The metrics named in the module-level ``metricsInput2`` and
    ``metricsOutput2`` are stacked as columns, an all-zero row is appended, and
    only rows whose first column equals 90 are kept. Column 1 is the CCA input
    and columns 2-5 are the CCA output. The projected data and a linear fit
    through it are drawn with ``plt`` in the given colour.

    Args:
        metrics: Mapping from metric name to a sequence of values.
        color: Colour passed to the matplotlib scatter and line calls.

    Returns:
        ``coef_`` of the fitted CCA model.
    """
    def _columns_with_zero_row(names):
        # One column per metric, followed by a single row of zeros.
        values = np.array([metrics[name] for name in names]).T.astype(float)
        return np.vstack((values, np.zeros(len(names))))

    table = np.concatenate(
        (_columns_with_zero_row(metricsInput2),
         _columns_with_zero_row(metricsOutput2)),
        axis=1)

    # fixed cache: keep only the rows whose first column equals 90
    selected = table[table[:, 0] == 90]
    features = selected[:, 1:2]
    targets = selected[:, 2:6]

    # Degree-1 expansion without bias.
    expander = PolynomialFeatures(1, include_bias=False, interaction_only=False)
    features = expander.fit_transform(features)

    model = CCA(n_components=1, scale=False)
    model.fit(features, targets)
    print(model.score(features, targets))

    projected_in = features.dot(model.x_rotations_)
    projected_out = targets.dot(model.y_rotations_)

    # Straight line through the projected input/output pairs.
    line = linear_model.LinearRegression()
    line.fit(projected_in, projected_out)
    line_values = line.predict(projected_in)
    print('Coefficients: \n', line.coef_)

    plt.scatter(projected_in, projected_out, c=color)
    plt.plot(projected_in, line_values, color=color, linewidth=0.5)

    logging.info('cca')
    logging.info(model.x_loadings_)
    logging.info(model.y_loadings_)
    logging.info(model.coef_)
    return model.coef_
def cca_for_ssvep(input_data, sampling_rate, compared_frequencies):
    """Correlate a recording with SSVEP reference signals using CCA.

    For every candidate frequency a reference matrix is built that holds a
    sine and a cosine row for each harmonic (two harmonics). A CCA is fitted
    between the recording and that reference, and the correlation between the
    first pair of canonical variates is stored.

    Args:
        input_data: 2-D array whose second axis is time (``shape[1]`` is used
            as the number of time points); rows are presumably EEG channels.
        sampling_rate: Samples per second, used to convert sample indices to
            seconds.
        compared_frequencies: Sequence of target frequencies.

    Returns:
        ``numpy`` array of shape ``(1, len(compared_frequencies))`` with one
        canonical correlation (Rho) per target frequency.
    """
    # TODO: Strict input checks and explicit exceptions for malformed input.
    number_time_points = input_data.shape[1]
    number_harmonics = 2

    # Time stamps in seconds; identical for every frequency, so built once.
    time_points_count = numpy.arange(number_time_points, dtype='float')
    time_points_count = time_points_count / sampling_rate

    # CCA expects samples on the first axis, so time must come first.
    recording_samples = numpy.transpose(input_data)

    cca_rho_values = numpy.zeros([1, len(compared_frequencies)], dtype='float')

    for frequency_index, frequency in enumerate(compared_frequencies):
        # Rows 2h and 2h + 1 hold the sine and cosine of harmonic h + 1.
        y_matrix = numpy.zeros([number_harmonics * 2, number_time_points])
        for harmonic in range(number_harmonics):
            base_constant = 2 * numpy.pi * (harmonic + 1) * frequency
            y_matrix[2 * harmonic, :] = numpy.sin(base_constant * time_points_count)
            y_matrix[2 * harmonic + 1, :] = numpy.cos(base_constant * time_points_count)
        reference_samples = numpy.transpose(y_matrix)

        cca_object = CCA(n_components=number_harmonics)
        cca_object.fit(recording_samples, reference_samples)

        # Transform with the same orientation used for fitting. The original
        # passed the untransposed matrices here and handed the transformed Y
        # to score() as sample weights.
        values_x, values_y = cca_object.transform(recording_samples,
                                                  reference_samples)

        # Rho is the Pearson correlation of the first canonical variate pair;
        # CCA.score() would return an R^2 of the prediction instead.
        cca_rho_values[0, frequency_index] = numpy.corrcoef(
            values_x[:, 0], values_y[:, 0])[0, 1]

    return cca_rho_values
def SVCCA_distance(checkpoint_1, checkpoint_2, R=32):
    """Return one minus the CCA score between two checkpoints' test data.

    Both checkpoints are indexed with ``'test_data'``; an ``R``-component CCA
    (at most 1000 iterations) is fitted on the two arrays and scored on the
    same arrays. No SVD step is applied: the arrays go into the CCA directly.
    """
    activations_1 = checkpoint_1['test_data']
    activations_2 = checkpoint_2['test_data']

    model = CCA(n_components=R, max_iter=1000)
    model.fit(activations_1, activations_2)

    similarity = model.score(activations_1, activations_2)
    return 1 - similarity
def grid_cca(activations1, act_labels1, activations2, act_labels2, n_clusters):
    """Score a CCA for every pair of clusters taken from two labelled sets.

    Cell ``[i, j]`` of the result is the ``CCA.score`` between the rows of
    ``activations1`` labelled ``i`` and the rows of ``activations2`` labelled
    ``j`` (both transposed before fitting). Cells for which either cluster has
    no members stay 0.

    Args:
        activations1: Array indexable by a boolean row mask.
        act_labels1: Cluster label for each row of ``activations1``.
        activations2: Array indexable by a boolean row mask.
        act_labels2: Cluster label for each row of ``activations2``.
        n_clusters: Number of cluster ids (``0 .. n_clusters - 1``) to compare.

    Returns:
        ``(n_clusters, n_clusters)`` float array of scores.
    """
    cca_grid = np.zeros((n_clusters, n_clusters))
    labels1 = np.asarray(act_labels1)
    labels2 = np.asarray(act_labels2)

    for clust_i in range(n_clusters):
        i_mask = labels1 == clust_i
        i_count = int(np.count_nonzero(i_mask))
        if i_count == 0:
            # The grid is zero-initialised, so an empty cluster needs no write.
            # The original also zeroed the mirrored cell [j, i], which erased
            # valid scores that had already been computed.
            continue
        block_i = activations1[i_mask].T

        for clust_j in range(n_clusters):
            j_mask = labels2 == clust_j
            j_count = int(np.count_nonzero(j_mask))
            if j_count == 0:
                continue
            block_j = activations2[j_mask].T

            # After transposing, the cluster sizes are the feature counts, so
            # they bound the number of components.
            cca = CCA(n_components=min(i_count, j_count))
            cca.fit(block_i, block_j)
            cca_grid[clust_i, clust_j] = cca.score(block_i, block_j)

    return cca_grid
def PLS_CCA(csv_data, point_index, sub_index, var_name, train=None, components=None):
    """Print the one-component CCA score of a column window against 7 columns.

    The predictor block is the nine columns
    ``point_index - 1 .. point_index + 7`` of ``csv_data``. For each of the
    seven columns starting at ``sub_index - 1`` a separate ``CCA`` is fitted
    and its score, rounded to two decimals, is printed after the matching
    entry of ``var_name``.

    Args:
        csv_data: 2-D array supporting ``csv_data[:, k]`` indexing.
        point_index: 1-based index of the first predictor column.
        sub_index: 1-based index of the first target column.
        var_name: Sequence of labels, read at ``sub_index + i``.
        train: Unused; kept for interface compatibility.
        components: Defaults to the number of predictor columns when ``None``.
            NOTE(review): the value is not passed to ``CCA``, which is always
            built with ``n_components=1`` - confirm whether that is intended.

    Returns:
        None. Results are printed.
    """
    # One slice replaces the original row-by-row slicing loop; np.array keeps
    # the copy semantics.
    X_array = np.array(csv_data[:, point_index - 1:point_index + 8])

    if components is None:
        components = np.shape(X_array)[1]

    for i in range(7):
        Y_array = np.array(csv_data[:, sub_index - 1 + i])
        ccaModel = CCA(n_components=1)
        ccaModel.fit(X_array, Y_array)
        # NOTE(review): the label index (sub_index + i) is one past the data
        # column index (sub_index - 1 + i) - confirm var_name's offset.
        print(var_name[sub_index + i])
        print("R^2 =", np.around(ccaModel.score(X_array, Y_array), decimals=2))
# Get CCA transformation U_c, V_c = cca.x_scores_, cca.y_scores_ #= cca.transform(sat_data, y_data) # From: https://stackoverflow.com/questions/37398856/ rho_cca = np.corrcoef(U_c.T, V_c.T).diagonal(offset=n_cca_comp) #score = np.diag(np.corrcoef(cca.x_scores_, cca.y_scores_, rowvar=False)[:n_cca_comp, n_cca_comp:]) # Use function definition cod_cca2 = rsquare(U_c, y_data) print(cod_cca2) # Add to output dict cca_plc_r2[dataset_use] = cod_cca2[0] # TODO: set index programatically cca_pdc_r2[dataset_use] = cod_cca2[1] # TODO: set index programatically # Calculate Coefficient of Determination (COD) = R² cod_cca = cca.score(sat_data, y_data) print(cod_cca) # Plot number of CCA U and V if 'CCA'.lower() in plot_list: legend_list = [] fig = plt.figure() for i_comp in range(n_cca_comp): plt.scatter(U_c[:,i_comp], V_c[:,i_comp], c=c_vec[i_comp]) legend_list.append('Comp. nr. '+str(i_comp)+ r' $\rho$ = ' +'{:.3f}'.format(rho_cca[i_comp])) plt.title(dataset_use+' CCA: R^2 = ' +'{:.3f}'.format(cod_cca)) plt.legend(legend_list) plt.show() # display it # Plot number of CCA U and PLC if 'PxCvsU'.lower() in plot_list:
def compute_correlation(directory, blob, num_samples=None, num_components=1,
                        out_file=None, verbose=False):
    """Fit a CCA between a blob's centre activations and concept data.

    Opens the ``'upsampled'`` memory map of ``blob`` and the
    ``'concept_data'`` memory map of the experiment directory, takes the
    value at the spatial centre ``(Hs // 2, Ws // 2)`` of every selected
    sample, fits a CCA and scores it on the same data.

    Args:
        directory: Path handed to ``expdir.ExperimentDirectory``.
        blob: Blob name whose upsampled activations are used.
        num_samples: Number of samples drawn at random without replacement;
            ``None`` uses all of them.
        num_components: Number of CCA components.
        out_file: If given, the result dictionary is pickled to this path.
        verbose: Print progress and timings.

    Returns:
        Dictionary with the keys ``'model'``, ``'idx'`` (sampled indices, or
        ``None`` when every sample was used), ``'directory'``, ``'blob'``,
        ``'num_samples'``, ``'num_components'`` and ``'score'``.
    """
    ed = expdir.ExperimentDirectory(directory)
    info = ed.load_info()
    ds = loadseg.SegmentationData(info.dataset)
    L = ds.label_size()
    N = ds.size()
    blob_info = ed.load_info(blob=blob)
    K = blob_info.shape[1]
    (Hs, Ws) = get_seg_size(info.input_dim)

    if verbose:
        start = time.time()
        print('Loading data...')
    upsampled_data = ed.open_mmap(blob=blob, part='upsampled', mode='r',
                                  shape=(N, K, Hs, Ws))
    concept_data = ed.open_mmap(part='concept_data', mode='r',
                                shape=(N, L, Hs, Ws))
    if verbose:
        print('Finished loading data in %d secs.' % (time.time() - start))

    if verbose:
        start = time.time()
        print('Selecting data...')
    # Floor division keeps the centre index an integer on Python 2 and 3.
    center_row = Hs // 2
    center_col = Ws // 2
    # Defined up front so no try/except is needed to detect the unsampled case.
    rand_idx = None
    if num_samples is not None:
        rand_idx = np.random.choice(N, num_samples, replace=False)
        X = upsampled_data[rand_idx, :, center_row, center_col]
        Y = concept_data[rand_idx, :, center_row, center_col]
    else:
        X = upsampled_data[:, :, center_row, center_col]
        Y = concept_data[:, :, center_row, center_col]
    if verbose:
        print('Finished selecting data in %d secs.' % (time.time() - start))

    cca = CCA(n_components=num_components)
    if num_samples is None:
        num_samples = N
    if verbose:
        start = time.time()
        print('Fitting %d-component CCA with N = %d samples...'
              % (num_components, num_samples))
    cca.fit(X, Y)
    if verbose:
        print('Fitted %d-component CCA with N = %d samples in %d secs.'
              % (num_components, num_samples, time.time() - start))
    score = cca.score(X, Y)

    results = {
        'model': cca,
        'idx': rand_idx,
        'directory': directory,
        'blob': blob,
        'num_samples': num_samples,
        'num_components': num_components,
        'score': score,
    }

    if out_file is not None:
        if verbose:
            start = time.time()
            print('Saving results...')
        # The context manager closes the handle even if pickling fails.
        with open(out_file, 'wb') as handle:
            pkl.dump(results, handle)
        if verbose:
            print('Saved results at %s in %d secs.'
                  % (out_file, time.time() - start))

    return results
# Fit separate one-component CCAs on two row subsets, `Sat` and `nSat`, plot
# the projected input/output pairs and log the loadings.
# `all`, `nSat` and `metrics2` are defined before this fragment; here `all`
# is indexed as a 2-D array, so it is not the builtin.

# Rows whose third column is greater than 40.
Sat = all[all[:, 2] > 40]
# The first two columns are the CCA inputs, the remaining columns the outputs.
inpSat = Sat[:, 0:2]
outSat = Sat[:, 2:]
inpnSat = nSat[:, 0:2]
outnSat = nSat[:, 2:]
scale = False

# CCA on the nSat rows.
ccanSat = CCA(n_components=1, scale=scale)
ccanSat.fit(inpnSat, outnSat)
# Project the raw data onto the fitted weight vectors.
inp_ccanSat = inpnSat.dot(ccanSat.x_weights_)
out_ccanSat = outnSat.dot(ccanSat.y_weights_)
plt.scatter(inp_ccanSat, out_ccanSat, c='orange', s=50)
logging.info('ccanSat')
logging.info(ccanSat.x_loadings_)
logging.info(ccanSat.y_loadings_)
print(ccanSat.score(inpnSat, outnSat))
# NOTE(review): coef_[:, 0] is used as a vector over the two input columns,
# i.e. coef_ is taken to be (n_features, n_targets). scikit-learn 1.3 changed
# it to (n_targets, n_features) - confirm the installed version.
out_pred0 = inpnSat.dot(ccanSat.coef_[:, 0])
# Predicted values plotted against the first output column.
plt.scatter(outnSat[:, 0], out_pred0, c='r', marker='+')

# CCA on the Sat rows.
ccaSat = CCA(n_components=1, scale=scale)
ccaSat.fit(inpSat, outSat)
inp_ccaSat = inpSat.dot(ccaSat.x_weights_)
out_ccaSat = outSat.dot(ccaSat.y_weights_)
plt.scatter(inp_ccaSat, out_ccaSat, c='purple')
logging.info('ccaSat')
#logging.info(ccaSat.x_rotations_)
#logging.info(ccaSat.y_rotations_)

# compare with second measurement
# Same column layout as the first measurement: one column per metric name.
inp2 = np.array([metrics2[m] for m in metricsInput2]).T.astype(float)
out2 = np.array([metrics2[m] for m in metricsOutput2]).T.astype(float)
def calculate_sklearn_var(cca: CCA, X: np.ndarray, Y: np.ndarray, X_encoder):
    """Return ``cca.score(X, Y)`` as the shared-variance (R^2) estimate.

    ``X_encoder`` is accepted for interface compatibility and is not used.
    """
    return cca.score(X, Y)
# Frequency-bin slices (last axis after the axis swap) for the named bands.
_ERSP_BAND_SLICES = {
    'theta': slice(0, 4),
    'alpha': slice(4, 8),
    'beta': slice(8, 26),
    'gamma': slice(26, None),
    'topgamma': slice(41, 51),
}
# Bands averaged and concatenated by the 'cannonical' clustering mode.
_ERSP_CANONICAL_BANDS = ('theta', 'alpha', 'beta', 'gamma')


def _cluster_frequency_bins(stim, freq_clustering, bin_size, band, pair_bin,
                            random_groups):
    """Reduce the last (frequency) axis of one window per clustering mode."""
    if freq_clustering == 'cannonical':
        # Average within each canonical band, then join the band means.
        return np.concatenate(
            [np.mean(stim[:, :, _ERSP_BAND_SLICES[name]], axis=-1)
             for name in _ERSP_CANONICAL_BANDS],
            axis=-1)
    if freq_clustering == 'equal':
        # Drop the lowest bin so the bin count is no longer prime.
        stim = stim[..., 1::]
        stim = np.mean(
            stim.reshape((stim.shape[0], stim.shape[1], -1, bin_size)),
            axis=-1)
        return stim.reshape((stim.shape[0], stim.shape[1] * stim.shape[2]))
    if freq_clustering == 'random':
        stim = stim[..., 1::]
        # One mean per random group. The original hard-coded five groups,
        # which only matches the default bin_size.
        return np.concatenate(
            [np.mean(stim[:, :, group], axis=-1) for group in random_groups],
            axis=-1)
    if freq_clustering == 'single_band':
        if band in _ERSP_BAND_SLICES:
            return stim[:, :, _ERSP_BAND_SLICES[band]]
        # 'all' and unrecognised band names leave the window untouched.
        return stim
    if freq_clustering == 'pairwise':
        return stim[:, :, pair_bin]
    # Unrecognised clustering modes leave the window untouched.
    return stim


def CCA_across_patients(data_files, alg='cca', freq_clustering='cannonical',
                        bin_size=10, window_size=500, post_shift=0,
                        pre_shift=0, band='alpha', pair=(1, 1)):
    """Cross-validate CCA or PLS from pre- to post-stimulation ERSP features.

    Every ERSP array referenced by ``f['cfg_PAINT_cond']['ChanERSP']`` in each
    HDF5 file contributes one sample: a window ending before the stimulation
    point (pre) and a window starting after it (post). Each window's frequency
    axis is reduced according to ``freq_clustering`` and flattened to a row.

    Args:
        data_files: Iterable of paths opened with ``h5py.File``.
        alg: ``'cca'`` (``cross_validate`` with ``CCA(n_components=1)``) or
            ``'pls'`` (manual 5-fold loop over ``PLSRegression``).
        freq_clustering: ``'cannonical'``, ``'equal'``, ``'random'``,
            ``'single_band'`` or ``'pairwise'``. Other values leave the
            frequency axis unchanged.
        bin_size: Bins per group for ``'equal'`` and ``'random'``.
        window_size: Window length in ms (divided by 10 to get samples).
        post_shift: Offset in ms of the post window after the stimulation
            point.
        pre_shift: Offset in ms of the pre window before the stimulation
            point.
        band: Band name used by ``'single_band'``.
        pair: ``(pre_bin, post_bin)`` frequency-bin indices for
            ``'pairwise'``.

    Returns:
        ``(mean test score, mean train score)`` for ``'cca'`` and ``'pls'``;
        ``None`` for any other ``alg``.
    """
    # Arguments arrive in ms; one sample corresponds to samp_factor ms.
    samp_factor = 10
    window_size = int(window_size / samp_factor)
    pre_shift = pre_shift / samp_factor
    post_shift = post_shift / samp_factor

    # Rows are collected in lists and joined once at the end, instead of
    # growing an array on every iteration.
    pre_stim_rows = []
    post_stim_rows = []

    for data_file in data_files:
        with h5py.File(data_file, 'r') as f:
            # ERSP time series references
            ERSP_refs = f['cfg_PAINT_cond']['ChanERSP']
            for i in range(ERSP_refs.size):
                # 64-bit buffer for the read; features are cast to 32-bit
                # only after all rows are assembled.
                ERSP = np.zeros((250, 51, 95), dtype=np.float64)
                f[ERSP_refs[i][0]].read_direct(ERSP)

                # Find the longest NaN padding at either end of the time axis.
                leading_nan_count = np.zeros((51, 95))
                trailing_nan_count = np.zeros((51, 95))
                for j in range(51):
                    for k in range(95):
                        x1, x2 = count_leading_trailing_true(
                            np.isnan(ERSP[:, j, k]))
                        leading_nan_count[j, k] = x1
                        trailing_nan_count[j, k] = x2
                leading_max = int(np.amax(leading_nan_count))
                trailing_max = int(np.amax(trailing_nan_count))

                # Select pre and post stimulation
                pre_window_end = int(1000 / samp_factor - pre_shift)
                post_window_start = int(1000 / samp_factor + post_shift)

                # Ensure that we don't encroach on the nan-padding
                window_size1 = min(window_size, pre_window_end - leading_max)
                window_size2 = min(
                    window_size,
                    int(2500 / samp_factor - trailing_max - post_window_start))
                # NOTE(review): the shrunken size persists into later
                # iterations, so rows can differ in length and the final
                # concatenation then fails. Kept as in the original; a
                # correct fix needs the global minimum before slicing.
                window_size = int(min(window_size1, window_size2))

                pre_stim = ERSP[pre_window_end - window_size:pre_window_end, :, :]
                post_stim = ERSP[post_window_start:post_window_start + window_size, :, :]

                # Re-arrange axes so that frequency bins are last
                pre_stim = np.swapaxes(pre_stim, 1, 2)
                post_stim = np.swapaxes(post_stim, 1, 2)

                # The same random grouping is applied to both windows.
                random_groups = None
                if freq_clustering == 'random':
                    n_bins = pre_stim[..., 1::].shape[-1]
                    idxs = np.arange(n_bins)
                    np.random.shuffle(idxs)
                    random_groups = np.split(idxs, int(n_bins / bin_size))

                pre_stim = _cluster_frequency_bins(
                    pre_stim, freq_clustering, bin_size, band, pair[0],
                    random_groups)
                post_stim = _cluster_frequency_bins(
                    post_stim, freq_clustering, bin_size, band, pair[1],
                    random_groups)

                # Collapse each window to one feature row.
                pre_stim_rows.append(pre_stim.reshape((1, -1)))
                post_stim_rows.append(post_stim.reshape((1, -1)))

    if pre_stim_rows:
        pre_stim_feature_vector = np.concatenate(pre_stim_rows)
        post_stim_feature_vector = np.concatenate(post_stim_rows)
    else:
        pre_stim_feature_vector = np.array([])
        post_stim_feature_vector = np.array([])

    # Convert to 32 bit floating precision
    pre_stim_feature_vector = pre_stim_feature_vector.astype(np.float32)
    post_stim_feature_vector = post_stim_feature_vector.astype(np.float32)

    # Cross-validated canonical correlation analysis across all the features.
    if alg == 'cca':
        corrmodel = CCA(n_components=1)
        crsval = cross_validate(corrmodel, pre_stim_feature_vector,
                                post_stim_feature_vector, cv=5,
                                return_train_score=True)
        return np.mean(crsval['test_score']), np.mean(crsval['train_score'])
    elif alg == 'pls':
        corrmodel = PLSRegression()
        # Manually cross-validate
        folds = KFold(n_splits=5)
        test_scores = []
        train_scores = []
        for train_index, test_index in folds.split(pre_stim_feature_vector,
                                                   post_stim_feature_vector):
            corrmodel.fit(pre_stim_feature_vector[train_index],
                          post_stim_feature_vector[train_index])
            test_scores.append(
                corrmodel.score(pre_stim_feature_vector[test_index],
                                post_stim_feature_vector[test_index]))
            train_scores.append(
                corrmodel.score(pre_stim_feature_vector[train_index],
                                post_stim_feature_vector[train_index]))
        return np.mean(test_scores), np.mean(train_scores)
# Rank the predictors of pm25 with a random forest and with CCA weights.
# Each predictor is one row here; the transpose below makes them columns.
X = np.array([
    hour,
    o3,
    # pm10,
    so2,
    no2,
    co,
    temperature,
    wind,
    # weather,
    moisture,
    pressure,
    precipitation,
])
X = np.transpose(X)
print(X.shape)
Y = np.array(pm25)
print(Y.shape)

regr = RandomForestRegressor().fit(X, Y)
print("RandomForestRegressor.feature_importances_:\n", regr.feature_importances_)

# CCA() defaults to n_components=2, which recent scikit-learn rejects when Y
# has a single target. Clamp to what the data supports; this stays 2 whenever
# 2 is valid.
n_targets = 1 if Y.ndim == 1 else Y.shape[1]
cca = CCA(n_components=min(2, X.shape[0], X.shape[1], n_targets)).fit(X, Y)
print("cca.x_weights_:\n", cca.x_weights_)
# print("cca.x_loadings_:\n", cca.x_loadings_)
# print("cca.x_scores_:\n", cca.x_scores_)
print("cca.score:\n", cca.score(X, Y))
# print("cca.predict:\n", cca.predict(X))