def find_correlation_cca_method1(signal, reference_signals, n_components=2):
    r"""
    Perform canonical correlation analysis (CCA)
    Reference: https://github.com/aaravindravi/Brain-computer-interfaces/blob/master/notebook_12_class_cca.ipynb

    Args:
        signal : ndarray, shape (channel,time)
            Input signal in time domain
        reference_signals : ndarray, shape (len(flick_freq),2*num_harmonics,time)
            Required sinusoidal reference templates corresponding to the flicker frequency for SSVEP classification
        n_components : int, default: 2
            number of components to keep (for sklearn.cross_decomposition.CCA)
    Returns:
        result : array, size: len(flick_freq)
            Maximum canonical correlation against each set of reference signals
    Dependencies:
        CCA : sklearn.cross_decomposition.CCA
        np : numpy package
    """
    cca = CCA(n_components=n_components)
    corr = np.zeros(n_components)
    result = np.zeros(reference_signals.shape[0])
    for freq_idx in range(0, reference_signals.shape[0]):
        cca_x = signal.T
        cca_y = np.squeeze(reference_signals[freq_idx, :, :]).T
        cca.fit(cca_x, cca_y)
        a, b = cca.transform(cca_x, cca_y)
        for ind_val in range(0, n_components):
            corr[ind_val] = np.corrcoef(a[:, ind_val], b[:, ind_val])[0, 1]
        result[freq_idx] = np.max(corr)
    return result
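A minimal usage sketch for the function above; it assumes numpy as np and sklearn.cross_decomposition.CCA are imported as listed under Dependencies, and the 250 Hz rate, 10/12 Hz flicker frequencies, and 2 harmonics are illustrative choices, not values from the source notebook.

import numpy as np
from sklearn.cross_decomposition import CCA

fs = 250
t = np.arange(fs) / float(fs)  # 1 second of samples
# Build reference templates: (len(flick_freq), 2*num_harmonics, time)
refs = np.stack([
    np.vstack([np.sin(2 * np.pi * f * t), np.cos(2 * np.pi * f * t),
               np.sin(4 * np.pi * f * t), np.cos(4 * np.pi * f * t)])
    for f in (10.0, 12.0)])
eeg = np.random.randn(8, fs)  # stand-in (channel, time) recording
print(find_correlation_cca_method1(eeg, refs))  # one score per frequency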
def map_spaces(self, algo, src_mapped_embed=None, trg_mapped_embed=None):
    # (There may be duplicates in self.shared_vocab_src and/or self.shared_vocab_trg,
    # swap_vocab can be used to only inspect one-to-one translations)
    src_embed = self.model_src[self.shared_vocab_src]
    trg_embed = self.model_trg[self.shared_vocab_trg]
    os.makedirs(algo, exist_ok=True)
    if algo == "procrustes":
        logging.info(
            "Calculating Rotation Matrix (Procrustes Problem) and applying it to first embedding"
        )
        #ortho, _ = orthogonal_procrustes(src_embed, trg_embed)  # does the same as:
        u, _, vt = np.linalg.svd(trg_embed.T.dot(src_embed))
        w = vt.T.dot(u.T)
        # assign instead of dot(..., out=...): the BLAS output buffer must not alias an input
        self.model_src.vectors = self.model_src.vectors.dot(w)
    elif algo == "noise":
        logging.info(
            "Calculating Rotation Matrix with noise aware algorithm and applying it to first embedding"
        )
        transform_matrix, alpha, clean_indices, noisy_indices = noise_aware(
            src_embed, trg_embed)
        # write cleaned vocab to file
        with open("vocab.clean.txt", 'w') as v:
            for src, trg in np.asarray(self.shared_vocab)[clean_indices]:
                v.write("{}\t{}\n".format(src, trg))
        self.model_src.vectors = self.model_src.vectors.dot(transform_matrix)
        logging.info("Percentage of clean indices: {}".format(alpha))
    elif algo == "cca":
        logging.info(
            "Calculating Mapping based on CCA and applying it to both embeddings"
        )
        cca = CCA(n_components=100, max_iter=5000)
        cca.fit(src_embed, trg_embed)
        self.model_src.vectors, self.model_trg.vectors = cca.transform(
            self.model_src.vectors, self.model_trg.vectors)
    elif algo == "gcca":
        logging.info(
            "Calculating Mapping based on GCCA and applying it to both embeddings"
        )
        gcca = GCCA()
        gcca.fit([src_embed, trg_embed])
        transform_l = gcca.transform_as_list(
            (self.model_src.vectors, self.model_trg.vectors))
        # gcca computes positive and negative correlations (eigenvalues), sorted in
        # ascending order; we are only interested in the positive portion
        self.model_src.vectors = transform_l[0][:, 100:]
        self.model_trg.vectors = transform_l[1][:, 100:]
    # save transformed model(s)
    if src_mapped_embed:
        self.model_src.save(os.path.join(algo, src_mapped_embed))
    if trg_mapped_embed:
        self.model_trg.save(os.path.join(algo, trg_mapped_embed))
def CCA_project_vectors(args, src_dico, tgt_dico, src_full, tgt_full,
                        src_train, tgt_train, NUM_dim=100):
    print('Preparing output directory...')
    OutputDir = "output/{}-{}/".format(args.src_lang, args.tgt_lang)
    if not os.path.exists(OutputDir):
        os.makedirs(OutputDir)
    cca = CCA(n_components=NUM_dim)
    print("Fitting...")
    cca.fit(src_train, tgt_train)
    print(cca.get_params())
    X_c, Y_c = cca.transform(src_full, tgt_full)
    src_out, tgt_out = utils.norm_embeddings(X_c), utils.norm_embeddings(Y_c)
    print("Exporting embeddings...")
    utils.export_embeddings(src_dico[0], src_out,
                            OutputDir + 'projected.{}'.format(args.src_lang))
    utils.export_embeddings(tgt_dico[0], tgt_out,
                            OutputDir + 'projected.{}'.format(args.tgt_lang))
    print("Done!")
class CCAFusion(TransformerMixin, BaseEstimator):
    def __init__(self, c1, c2):
        self.pipes = [c1, c2]
        self.max_iter = 500
        self.cca = None

    def fit(self, X, y=None, **fit_params):
        C = []
        n_components = None
        for pipe in self.pipes:
            c = pipe.fit_transform(X, y)
            if hasattr(c, 'toarray'):
                c = c.toarray()
            if n_components is None:
                n_components = c.shape[1]
            else:
                n_components = min(c.shape[1], n_components)
            C += [c]
        self.cca = CCA(n_components=n_components, max_iter=self.max_iter)
        self.cca.fit(*C)
        return self

    def transform(self, X, y=None):
        C = []
        for pipe in self.pipes:
            # sklearn transformers take only X at transform time
            c = pipe.transform(X)
            if hasattr(c, 'toarray'):
                c = c.toarray()
            C += [c]
        return self.cca.transform(*C)[0]

    def fit_transform(self, X, y=None, **fit_params):
        return self.fit(X, y, **fit_params).transform(X, y)
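A hedged usage sketch for CCAFusion: PCA and StandardScaler stand in for the two feature pipelines c1 and c2, purely for illustration; they are not the pipelines this class was written around.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = np.random.randn(200, 20)
fusion = CCAFusion(PCA(n_components=5), StandardScaler())
Z = fusion.fit_transform(X)   # X-side canonical scores of the fused views
print(Z.shape)                # (200, 5): min of the two view widths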
def train_eval(self, train_index, test_index, ignore_eval=False): normalized_train, normalized_test = normalize_by_train(self.source[train_index], self.source[test_index]) if self.comp is not None: if self.use_scikit is not None: if self.use_scikit == 'cca': dim_reduction = CCA(n_components=self.comp) else: dim_reduction = PCA(n_components=self.comp) # fit cca according to train data only dim_reduction.fit(normalized_train, self.target[train_index]) # convert source into lower dimensional representation normalized_train = dim_reduction.transform(normalized_train) normalized_test = dim_reduction.transform(normalized_test) else: _, wa, _ = tutorial_on_cca(normalized_train, self.target[train_index]) normalized_train = normalized_train @ wa[:, :self.comp] normalized_test = normalized_test @ wa[:, :self.comp] model = self.build_model() model.fit(normalized_train, self.target[train_index]) prediction = model.predict(normalized_test) # res_df.to_csv(f"{self.out_name}/res1.csv") if not ignore_eval: return self.evaluate_regression(prediction, test_index) else: return prediction
def train_eval(self, train_index, test_index): train_source, test_source = self.source[train_index], self.source[ test_index] train_target, test_target = self.target[train_index], self.target[ test_index] train_source, test_source = scale_train_test(train_source, test_source) train_target, _ = scale_train_test(train_target, test_target) # rho, w_t, w_s, _ = evaluate_cca_wa_wb(train_target, train_source) cca = CCA(n_components=min(train_source.shape[1], train_target.shape[1]), max_iter=1000) cca.fit(train_source, train_target) w_s = cca.x_rotations_ w_t = cca.y_rotations_ predicted_target = test_source @ w_s @ np.linalg.pinv(w_t) predicted_target = unscale_prediction(train_target, predicted_target) if self.target_encoder is not None: test_target = self.original_target[test_index] predicted_target = self.target_encoder.decode( torch.as_tensor(predicted_target)).detach().numpy() scores = np.zeros(self.original_target.shape[1]) for i in range(self.original_target.shape[1]): predicted = predicted_target[:, i] actual = test_target[:, i] r, pval = pearsonr(predicted, actual) scores[i] = r return scores
def fit_cca(self, outfile=''):
    # fits linear CCA constraint and replaces pretrained name embeddings
    # with CCA transformed embeddings
    self.load_embeddings()
    self.extract_pretrained_prototype_embeddings()
    items, vectors = zip(
        *[(k, v) for k, v in self.pretrained_prototype_embeddings.items()
          if k in self.exemplar_to_concept])
    concept_embs = Reach(vectors, items)

    train_vectors = []
    for x in items:
        train_vectors.append(self.train_embeddings[x])
    train_vectors = Reach.normalize(train_vectors)

    cca = CCA(n_components=self.train_embeddings.size, max_iter=10000)
    cca.fit(train_vectors, concept_embs.norm_vectors)

    # transform all name embeddings using the CCA mapping
    all_name_embeddings = deepcopy(self.pretrained_name_embeddings)
    items = [x for _, x in sorted(all_name_embeddings.indices.items())]
    projected_name_embeddings = cca.transform(
        all_name_embeddings.norm_vectors)
    new_name_embeddings = Reach(projected_name_embeddings, items)

    self.pretrained_name_embeddings = new_name_embeddings
    self.load_embeddings()

    if outfile:
        # interpolate the output name into the pickle filename
        with open('{}_cca.p'.format(outfile), 'wb') as f:
            pickle.dump(cca, f)
class CCA_method(): def __init__(self, n_latents): self._n_latents = n_latents self._cca = CCA(n_components=n_latents, scale=False, max_iter=10000, tol=1e-8) self._Q = np.eye(self._n_latents) def fit(self, X, Y): # projections U'X, V'Y such that U'X and V'Y are maximally correlated self._cca.fit(X, Y) # get time-course of projected data UX, VY = self._cca.transform(X, Y) # learn linear regression VY = UX * Q # (Q will be optimal in least-squares sense) self._Q = np.linalg.pinv(UX).dot(VY) def predict(self, X): # transform source data into latent space UX = self._cca.transform(X) # predict latent activity in target space QUX = UX.dot(self._Q) # predict observed activity in target space Ypred = QUX.dot(self._cca.y_loadings_.T) return Ypred
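A small synthetic round-trip for CCA_method, under the assumption that numpy and sklearn's CCA are imported as the class requires; the latent dimension, view widths, and noise level are arbitrary stand-ins.

import numpy as np

rng = np.random.RandomState(0)
Z = rng.randn(500, 3)                                # shared latents
X = Z @ rng.randn(3, 10) + 0.1 * rng.randn(500, 10)  # source view
Y = Z @ rng.randn(3, 8) + 0.1 * rng.randn(500, 8)    # target view

model = CCA_method(n_latents=3)
model.fit(X, Y)
print(model.predict(X).shape)   # (500, 8) prediction of the target view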
def visualize_with_cca(X, y, title):
    cca = CCA(n_components=2)
    cca.fit(X, y)
    X_cca = cca.transform(X)
    Xax = X_cca[:, 0]
    Yax = X_cca[:, 1]
    labels = (y > 0).astype(int)
    cdict = {0: 'red', 1: 'green'}
    labl = {0: 'home_loss', 1: 'home_win'}
    marker = {0: '*', 1: 'o'}
    alpha = {0: .3, 1: .5}
    fig, ax = plt.subplots(figsize=(7, 5))
    fig.patch.set_facecolor('white')
    for l in np.unique(labels):
        ix = np.where(labels == l)
        ax.scatter(Xax[ix], Yax[ix], c=cdict[l], s=40,
                   label=labl[l], marker=marker[l], alpha=alpha[l])
    # these axes are canonical variates, not principal components
    plt.xlabel("First Canonical Component", fontsize=14)
    plt.ylabel("Second Canonical Component", fontsize=14)
    plt.legend()
    plt.title(title)
    plt.show()
def cca_classify(X_eeg_signals, Yi_frequency_signals):
    cca = CCA(1)
    corr_results = []
    for fr in range(0, Yi_frequency_signals.shape[0]):
        X = X_eeg_signals
        Yi = Yi_frequency_signals[fr, :, :]
        # compute the correlation between X and Yi
        cca.fit(X.T, np.squeeze(Yi).T)
        X_train_r, Yi_train_r = cca.transform(X.T, np.squeeze(Yi).T)
        corr = np.corrcoef(X_train_r[:, 0], Yi_train_r[:, 0])[0, 1]
        # collect the correlation of X with each Yi
        corr_results.append(corr)
    if corr_results[np.argmax(corr_results)] > 0.50:  # decision threshold
        global index
        global all_data
        classify_result = np.argmax(corr_results) + 1
        print(corr_results)
        index += 1
        # save the data (DataFrame.append was removed from pandas; use concat)
        TT = pd.DataFrame(X_eeg_signals)
        all_data = pd.concat([all_data, np.transpose(TT[1:9])])
        if index == 50:
            # save the data
            all_data = pd.DataFrame(all_data)
            all_data.to_csv('./j_8_all_data.csv', index=False)
        return classify_result
    else:
        return -1
def doCCA(metrics, color): inp = np.array([metrics[m] for m in metricsInput2]).T.astype(float) out = np.array([metrics[m] for m in metricsOutput2]).T.astype(float) inp0 = np.zeros(len(metricsInput2)) out0 = np.zeros(len(metricsOutput2)) inp = np.vstack((inp, inp0)) out = np.vstack((out, out0)) cca = CCA(n_components=1, scale=False) cca.fit(inp, out) inp_cca = inp.dot(cca.x_weights_) out_cca = out.dot(cca.y_weights_) # Create linear regression object regr = linear_model.LinearRegression() # Train the model using the training sets regr.fit(inp_cca, out_cca) cca_regr = regr.predict(inp_cca) # The coefficients print('Coefficients: \n', regr.coef_) plt.scatter(inp_cca, out_cca, c=color) plt.plot(inp_cca, cca_regr, color=color, linewidth=0.5) logging.info('cca') logging.info(cca.x_rotations_) logging.info(cca.y_rotations_)
def cca(m1, m2, preprocessing=None):
    """
    Use CCA to decompose two views and plot the result.

    Params:
        m1, m2: Every column is an example with every row as a feature.
        preprocessing: If None, no pre-processing is done; if 'orth',
            the rows are centered to zero mean (a PCA step could follow,
            but is not performed here).
    """
    # Adjust means to be 0.
    if preprocessing == "orth":
        # Zero means.
        m1 -= np.mean(m1, axis=1, keepdims=True)
        # print("m1=", np.sum(m1, axis=1))
        m2 -= np.mean(m2, axis=1, keepdims=True)

    cca = CCA(n_components=3, max_iter=100)
    cca.fit(m1.T, m2.T)
    X_c = cca.transform(m1.T)

    fig, ax = plt.subplots()
    ax.set_title('Fig.2.(c)')
    # ax.set_color_cycle(['blue', 'green', 'red'])
    ax.set_prop_cycle('color', ['blue', 'red', 'green'])
    ax.plot(X_c)
    # ax.plot(Y_c)
    plt.show()
def perform(arrs):
    blocks_cnt = sum([arr.shape[1] * arr.shape[2] for arr in arrs])
    X = np.zeros((blocks_cnt, 16))
    Y = np.zeros((blocks_cnt, 64))
    for c in range(3):
        offset = 0  # row offset so consecutive arrays don't overwrite each other
        for i, arr in enumerate(arrs):
            height = arr.shape[1]
            width = arr.shape[2]
            for y in range(height):
                for x in range(width):
                    # features: last column of the left block, last row of the top block
                    # (the original [:][-1] slice was a no-op copy followed by [-1])
                    X[offset + y * width + x] = np.hstack(
                        [arr[c][y][x - 1][:, -1], arr[c][y - 1][x][-1, :]])
                    Y[offset + y * width + x] = arr[c][y][x].ravel()
            offset += height * width
        X_mc = (X - X.mean()) / (X.std())
        Y_mc = (Y - Y.mean()) / (Y.std())
        ca = CCA(n_components=1)
        ca.fit(X_mc, Y_mc)
        print(f'\nColor {c}:')
        weights = ca.x_weights_.ravel()
        print(weights.shape)
        print(', '.join(map(lambda a: str(a), weights)))
        print(ca.n_iter_)
def project_vectors(origForeignVecFile, origEnVecFile, subsetEnVecFile,
                    subsetForeignVecFile, outputEnFile, outputForeignFile,
                    NUMCC=40):
    '''
    Feed the dictionary vectors into CCA to learn the projection,
    then produce the projected bilingual vectors.
    :param origForeignVecFile: foreign-language vector matrix
    :param origEnVecFile: English vector matrix
    :param subsetEnVecFile: English vectors appearing in the dictionary
    :param subsetForeignVecFile: foreign vectors appearing in the dictionary
    :param outputEnFile: re-projected English word vectors
    :param outputForeignFile: re-projected foreign word vectors
    :param NUMCC: number of canonical components to keep
    '''
    # Read the data, dropping the leading word token and keeping only the vectors
    tmp = np.loadtxt(origEnVecFile, dtype=str, delimiter=' ')
    origEnVecs = tmp[:, 1:].astype(float)
    tmp2 = np.loadtxt(origForeignVecFile, dtype=str, delimiter=' ')
    origForeignVecs = tmp2[:, 1:].astype(float)
    tmp3 = np.loadtxt(subsetEnVecFile, dtype=str, delimiter=' ')
    subsetEnVecs = tmp3[:, 1:].astype(float)
    tmp4 = np.loadtxt(subsetForeignVecFile, dtype=str, delimiter=' ')
    subsetForeignVecs = tmp4[:, 1:].astype(float)
    # Preprocessing: normalize each row
    #origEnVecs=preprocessing.normalize(origEnVecs)
    #origForeignVecs=preprocessing.normalize(origForeignVecs)
    subsetEnVecs = preprocessing.normalize(subsetEnVecs)
    subsetForeignVecs = preprocessing.normalize(subsetForeignVecs)
    # Train the CCA
    '''
    num = [NUMCC]
    regs = [1e-1]
    cca = rcca.CCACrossValidate(regs=regs,numCCs=num,kernelcca=False,cutoff=0.1)
    cca.train([subsetEnVecs, subsetForeignVecs])
    '''
    cca = CCA(n_components=NUMCC)
    cca.fit(subsetEnVecs, subsetForeignVecs)
    print(cca.get_params())
    X_c, Y_c = cca.transform(origEnVecs, origForeignVecs)
    # Produce the projected vectors
    #tmpOutput = rcca._listdot([d.T for d in [origEnVecs, origForeignVecs]], cca.ws)
    origEnVecsProjected = preprocessing.normalize(X_c)
    #origEnVecsProjected = preprocessing.scale(tmpOutput[0])
    origEnVecsProjected = np.column_stack(
        (tmp[:, :1], origEnVecsProjected.astype(str)))
    origForeignVecsProjected = preprocessing.normalize(Y_c)
    #origForeignVecsProjected = preprocessing.scale(tmpOutput[1])
    origForeignVecsProjected = np.column_stack(
        (tmp2[:, :1], origForeignVecsProjected.astype(str)))
    np.savetxt(outputEnFile, origEnVecsProjected, fmt="%s", delimiter=' ')
    np.savetxt(outputForeignFile, origForeignVecsProjected, fmt="%s",
               delimiter=' ')
    print("Done!")
def cca_fit(X, Y): cca = CCA(n_components=1) cca.fit(X, Y) X = list(itertools.islice(X, 10)) Y = list(itertools.islice(Y, 10)) return cca.score(X, Y)
def _CCA(data, graph, n): cca = CCA(n_components=n) adjacencyMatrix = createAffinityMatrix(graph) cca.fit(data, adjacencyMatrix) X_c, Y_c = cca.transform(data, adjacencyMatrix) writeCSV(X_c, 'CCA_X') writeCSV(Y_c, 'CCA_Y')
def cca_d_h(d_var, h_var, components_num):
    cca = CCA(n_components=components_num, scale=True, max_iter=2000)
    cca.fit(d_var, h_var)
    d_c, h_c = cca.transform(d_var, h_var)
    # normal-equation least-squares maps from the original variables to the canonical scores
    ah = np.linalg.inv((h_var.T).dot(h_var)).dot(h_var.T).dot(h_c)
    ad = np.linalg.inv((d_var.T).dot(d_var)).dot(d_var.T).dot(d_c)
    return d_c, h_c, ad, ah
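A quick sanity check of the projections returned above; this sketch assumes numpy as np and sklearn's CCA are in scope, and only verifies that the normal-equation solution ad agrees with numpy's least-squares solver.

import numpy as np

rng = np.random.RandomState(1)
d = rng.randn(300, 6)
h = rng.randn(300, 5)
d_c, h_c, ad, ah = cca_d_h(d, h, components_num=2)
# the normal-equation solution should agree with lstsq
ad_ref, *_ = np.linalg.lstsq(d, d_c, rcond=None)
print(np.allclose(ad, ad_ref))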
def mean_canonical_correlations(scaled_features, df):
    cca = CCA(1)
    cca.fit(scaled_features, df.iloc[:, -1])
    X_c, Y_c = cca.transform(scaled_features, df.iloc[:, -1])
    # correlate the paired canonical scores; the mean of X_c alone is
    # centered near zero and says nothing about the correlation
    mean_canonical_correlation = np.corrcoef(X_c[:, 0], Y_c.ravel())[0, 1]
    return mean_canonical_correlation
def canonical_correlation_analysis(occurences_a, occurences_b):
    occurences_a = pd.Series(occurences_a, dtype="category")
    occurences_a = pd.get_dummies(occurences_a)
    # DataFrame.from_items was removed from pandas; from_dict is the replacement
    occurences_b = pd.DataFrame.from_dict(dict(occurences_b))
    occurences_b = pd.get_dummies(occurences_b)
    cca = CCA(n_components=1)
    cca.fit(occurences_a, occurences_b)
    return cca.score(occurences_a, occurences_b)
def cca_analysis(X, Y, X_dev, Y_dev): cca = CCA(n_components=1, max_iter=2000) cca.fit(X, Y) X_dev_c, Y_dev_c = cca.transform(X_dev, Y_dev) corrcoef = np.corrcoef(X_dev_c.T, Y_dev_c.T)[0,1] return corrcoef
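A hedged demo of cca_analysis on synthetic views that share one latent signal; numpy as np and sklearn's CCA are assumed imported, and the 80/20 split is arbitrary.

import numpy as np

rng = np.random.RandomState(0)
z = rng.randn(1000, 1)                                  # shared latent
X = np.hstack([z + 0.3 * rng.randn(1000, 1) for _ in range(4)])
Y = np.hstack([z + 0.3 * rng.randn(1000, 1) for _ in range(3)])
r = cca_analysis(X[:800], Y[:800], X[800:], Y[800:])
print(r)   # held-out correlation, close to 1 for these views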
def cca_score(X, Y): # Calculate the CCA score of the first component pair ca = CCA(n_components=1) ca.fit(X, Y) Xc, Yc = ca.transform(X, Y) score = np.corrcoef(Xc[:, 0], Yc[:, 0]) return score[0][1]
def CCA_analysis(self, recordings_index, trial_index, brain_area):
    path = self.all_data_path + '/' + self.selected_recordings[
        recordings_index]
    #Prepare rates
    rates = self.convert_one_population_to_rates(recordings_index,
                                                 trial_index, brain_area).T
    #Prepare behavior
    trials = np.load(path + '/' + 'trials.intervals.npy')
    #Behavioral data
    mot_timestamps = np.load(path + '/' + 'face.timestamps.npy')
    mot_energy = np.load(path + '/' + 'face.motionEnergy.npy')
    beh_range = np.bitwise_and(
        mot_timestamps[:, 1] >= trials[trial_index][0],
        mot_timestamps[:, 1] <= trials[trial_index][1])
    #print(np.where(beh_range==True))
    #print(mot_timestamps[beh_range])
    beh_subset = mot_energy[beh_range]
    beh_subset_aligned = self.align_rate_and_behavior(
        beh_subset, rates[:, 0]).reshape(-1, 1)
    from sklearn.cross_decomposition import CCA
    # the behaviour matrix has a single column, so CCA can extract
    # at most one canonical component
    cca = CCA(n_components=1)
    cca.fit(rates, beh_subset_aligned)
    X_train_r, Y_train_r = cca.transform(rates, beh_subset_aligned)
    print(X_train_r.shape)
    print(Y_train_r.shape)
    plt.scatter(X_train_r[:, 0],
                Y_train_r[:],
                label="train",
                marker="*",
                c="b",
                s=50)
    plt.show()
    #rates_test=self.convert_one_population_to_rates(recordings_index,2,brain_area).T
    #X_test_r, Y_test_r = cca.transform(rates_test, beh_subset_aligned)
    #plt.scatter(X_test_r[:, 0], Y_test_r[:], label="test",
    #marker="^", c="b", s=50)
    #plt.show()
    print(beh_subset_aligned.shape)
    print(rates.shape)
def get_cca(chip_cors, rna_vec): Y_vec = np.array([[each_val / max(chip_cors) for each_val in chip_cors]]) X_vec = np.array([[each_val / max(rna_vec) for each_val in rna_vec]]) Y_vec = Y_vec.transpose() X_vec = X_vec.transpose() cca_obj = CCA(n_components=1) cca_obj.fit(X_vec, Y_vec) r_squared_canonical = cca_obj.score(X_vec, Y_vec) return r_squared_canonical
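A tiny usage sketch for get_cca; the two input lists are synthetic stand-ins (an assumption for illustration), and numpy as np plus sklearn's CCA must already be imported.

import numpy as np

rng = np.random.RandomState(0)
rna = list(rng.rand(40) + 0.1)                    # avoid a zero max
chip = list(0.8 * np.array(rna) + 0.1 * rng.rand(40))
print(get_cca(chip, rna))                         # R^2 of the 1-component fit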
def cca(src_dict, tgt_dict, bi_dict, dim=250): #with open('../data/seed_embedding.dat', 'wb') as f: # pickle.dump(x, f) # pickle.dump(y, f) cca_model = CCA(n_components=dim) src_mat, tgt_mat = make_training_matrices(src_dict, tgt_dict, bi_dict) cca_model.fit(src_mat, tgt_mat) return cca_model.transform(src_dict.embed, tgt_dict.embed)
def do_cca(X, y, X_orig, n_components=10, permutations=10):
    '''
    Performs a CCA using components.
    Projects scores back to edge space.
    '''
    cca = CCA(n_components=n_components)
    cca.fit(X, y)
    # save the latent component correlation
    cca.mode_r = []
    for component in range(n_components):
        cca.mode_r.append(
            np.corrcoef(cca.x_scores_[:, component],
                        cca.y_scores_[:, component])[0, 1])
    # correlate behaviour with LC score
    cca.y_score_correlation = np.zeros((np.shape(y)[1], n_components))
    for component in range(n_components):
        for beh in range(np.shape(y)[1]):
            cca.y_score_correlation[beh, component] = np.corrcoef(
                y[:, beh].T, cca.y_scores_[:, component])[0, 1]
    # correlate edges with LC score
    cca.x_score_correlation = np.zeros((np.shape(X_orig)[1], n_components))
    for component in range(n_components):
        cca.x_score_correlation[:, component] = np.corrcoef(
            cca.x_scores_[:, component], X_orig.T)[1::, 0]
    # non parametric max T tests for component significance
    max_r = []
    for perm in tqdm(range(permutations)):
        # shuffle the behaviour for each permutation
        y_shuffle = shuffle(y)
        # perform a new CCA with shuffled data
        cca_perm = CCA(n_components=n_components)
        cca_perm.fit(X, y_shuffle)
        # save the latent component correlation
        mode_r_perm = []
        for component in range(n_components):
            mode_r_perm.append(
                np.corrcoef(cca_perm.x_scores_[:, component],
                            cca_perm.y_scores_[:, component])[0, 1])
        # take the max r value
        max_r.append(np.max(mode_r_perm))
    # Compute adjusted p-values via percentile
    max_r = np.asarray(max_r)   # an array, so the >= comparison broadcasts
    p_adj = []
    for component in range(n_components):
        p_adj.append(np.mean(max_r >= cca.mode_r[component]))
    return cca, p_adj
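A usage sketch under assumptions: the components matrix X comes from a PCA of the original edges, and shuffle (sklearn.utils) plus tqdm are available at module level as do_cca requires; all data here is random, so the adjusted p-values should be near 1.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from tqdm import tqdm

rng = np.random.RandomState(0)
edges = rng.randn(100, 300)                     # subjects x edges
behaviour = rng.randn(100, 6)                   # subjects x measures
components = PCA(n_components=10).fit_transform(edges)
cca_model, p_adj = do_cca(components, behaviour, edges,
                          n_components=5, permutations=10)
print(cca_model.mode_r)
print(p_adj)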
def cca_feature(self, data, parameter_list):
    cca = CCA(1)
    result = []
    for i in range(parameter_list[-1]):
        reference_signals = self.reference_signals(parameter_list[1][i], parameter_list[2], parameter_list[3])
        cca.fit(data.T, reference_signals.T)
        x, y = cca.transform(data.T, np.squeeze(reference_signals).T)
        corr = np.corrcoef(x[:, 0], y[:, 0])[0, 1]
        result.append(corr)
    return result
def fit_CCA(tr_block, data_builder): '''We fit a CCA to some 100 odd points??? ''' # train on number of points num_points = 100 PixelPoints = data_builder.sample_random_pixels() points_array_ipw = [] points_array_refl = [] for yr in [14, 15]: doy_strings = data_builder.club_days(tr_block[tr_block[0][:, 1] == yr]) days_in_sorted = doy_strings.keys() days_in_sorted.sort() ipw_files, refl_files = data_builder.sort_IPW_refl_files_imgs(yr) for set_ in days_in_sorted: print 'Building data set for year: %d and string of days %s' % ( yr, set_) # Get the required files only temp_ipw_files = filter( lambda x: re.findall('\d+', x)[1] in doy_strings[set_], ipw_files) temp_refl_files = filter( lambda x: re.findall('\d+', x)[1] in doy_strings[set_], refl_files) temp_ipw_files = map( lambda x: '../data/dataset/20' + str(yr) + os.sep + x, temp_ipw_files) temp_refl_files = map( lambda x: '../data/dataset/20' + str(yr) + os.sep + x, temp_refl_files) for x_, y_ in zip(PixelPoints[:num_points, 0], PixelPoints[:num_points, 1]): temp_array = data_builder.build_features_and_truth_imgs( temp_ipw_files, temp_refl_files, x_, y_) points_array_ipw.append(temp_array[1]) points_array_refl.append(temp_array[2]) X_ = np.vstack(points_array_ipw) Y_ = np.vstack(points_array_refl) mdl = CCA(n_components=10) print 'Fitting a CCA...' mdl.fit(X_[:, :1089], Y_[:, :1089]) ipw_frames = X_[:, 2178:-1] refl_frames = Y_[:, 2178:] del X_ del Y_ ipw_frames = ipw_frames[~np.any(np.isnan(ipw_frames), axis=1), :] refl_frames = refl_frames[~np.any(np.isnan(refl_frames), axis=1), :] # indices = [(x*1089,(x+1)*1089)for x in range(4) ] # # the number of components times 4 # ipw_refl_fusion = np.zeros((ipw_frames.shape[0],80)) print 'Building the feature fusion..' return mdl
def load_mutation_data(): if os.path.isfile(mutation_pickle_path): pickle_load = pickle.load(open(mutation_pickle_path, 'rb')) return pickle_load[0], pickle_load[1] gene_effect_df = pd.read_csv( r"C:\Users\Nitay\Documents\courses\roded-seminar\Achilles_gene_dependency.csv" ) mutations_df = pd.read_csv( r"C:\Users\Nitay\Documents\courses\roded-seminar\CCLE_mutations.csv") mutations_df = mutations_df[mutations_df["isDeleterious"].fillna(False)] gene_effect_df = gene_effect_df.set_index("Unnamed: 0").T gene_effect_df.columns.names = ["cell_line"] gene_effect_df.index.names = ["gene"] def clean_gene_name(name): return name.split("(")[0].strip() clean_gene_effect_df = gene_effect_df.rename(index=clean_gene_name) common_genes = set(clean_gene_effect_df.index).intersection( set(mutations_df['Hugo_Symbol'])) mutations_cell_line = set(mutations_df['DepMap_ID']) new_mutations_df = pd.DataFrame(np.zeros( (len(common_genes), len(mutations_cell_line))), columns=mutations_cell_line, index=common_genes) for i, row in mutations_df.iterrows(): cell_line = row["DepMap_ID"] gene = row['Hugo_Symbol'] if gene in common_genes and cell_line in mutations_cell_line: new_mutations_df.loc[gene, cell_line] = 1 filtered_gene_effect_df = clean_gene_effect_df.filter(items=common_genes, axis=0) filtered_mutations_df = new_mutations_df.loc[new_mutations_df.sum(1) > 0, new_mutations_df.sum(0) > 0] from sklearn.cross_decomposition import CCA Y = filtered_gene_effect_df.values X = filtered_mutations_df.values cca = CCA(n_components=10) cca.fit(X, Y) X_c = cca.transform(X) filtered_mutations_df = pd.DataFrame(X_c) pickle.dump([filtered_gene_effect_df, filtered_mutations_df], open(mutation_pickle_path, "wb")) return filtered_gene_effect_df, filtered_mutations_df
def cca(vocab1, vocab2, cca_model=None, dim=300, max_iter=1000, thre=0.5):
    if not cca_model:
        cca_model = CCA(n_components=dim, max_iter=max_iter)
        try:
            cca_model.fit(vocab1, vocab2)
        except Exception:
            # falling through would leave cca_vec1/2 undefined, so re-raise
            print('svd cannot converge, try smaller dim')
            raise
    [cca_vec1, cca_vec2] = cca_model.transform(vocab1, vocab2)
    comb_cca = (thre * cca_vec1 + (1 - thre) * cca_vec2)
    return comb_cca, cca_vec1, cca_vec2, cca_model
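A minimal sketch of the combiner above on two synthetic, linearly related "vocabulary" matrices; the sizes and noise level are illustrative assumptions, and numpy as np plus sklearn's CCA are presumed imported.

import numpy as np

rng = np.random.RandomState(0)
base = rng.randn(400, 50)
v1 = base + 0.1 * rng.randn(400, 50)
v2 = base.dot(rng.randn(50, 50)) + 0.1 * rng.randn(400, 50)
comb, c1, c2, model = cca(v1, v2, dim=20, max_iter=500)
print(comb.shape)   # (400, 20) blended canonical representation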
def predict(self): if self.k_CCA is None: if self.verbose: print('Going to compute best components first') self.determine_CCA_components() # self.cca_predictions, _ = self.ccaCV.predict(self.features, self.ccaCV.ws) cca = CCA(n_components=self.k_CCA) cca.fit(self.features[:6000], self.graph[:6000]) self.cca_predictions = cca.transform(self.features) if self.verbose: print('Produced predictions') print('Size of predictions {}'.format(self.cca_predictions.shape))
def canonical_approach(): from sklearn.cross_decomposition import CCA (X, Y), cities = pull_xy_data() cca = CCA(n_components=2) cca.fit(X, Y) ccaX, ccaY = cca.transform(X, Y) plot(ccaX, cities, ["CC01", "CC02", "CC03"], 1) return "OK What Now?"
def __init__(self, dataset, n=None, tol=1e-4): if n is None: n = int(numpy.ceil(numpy.sqrt(len(dataset.attributes)))) self.dataset = dataset self.attributes = random.sample(dataset.attributes, n) cca = CCA(n_components=1, tol=tol) cca.fit( dataset.X.take([a.index for a in self.attributes], 1), dataset.y) self.linear_combination = LinearCombination( self.attributes, cca.x_weights_.transpose()[0])
def cca_for_ssvep(input_data, sampling_rate, compared_frequencies):

    # TODO: Strict input checks, exceptions and avoid crashing and processing errors

    # Pre-allocate SSVEP signals matrix to be compared with original EEG recordings using CCA
    number_time_points = input_data.shape[1]
    number_harmonics = 2
    cca_base_signal_matrix = [[] for loop_var in compared_frequencies]

    # Pre-allocate output: one correlation coefficient (Rho) for each target SSVEP frequency
    cca_rho_values = numpy.zeros([1, len(compared_frequencies)], dtype='float')

    # For each target frequency, fill Y matrix with sine and cosine signals for every harmonic
    for loop_frequencies in range(len(compared_frequencies)):

        # For this current SSVEP frequency, pre-allocate the harmonics matrix
        cca_base_signal_matrix[loop_frequencies] = numpy.zeros([number_harmonics * 2, number_time_points])
        time_points_count = numpy.arange(number_time_points, dtype='float')
        time_points_count = time_points_count / sampling_rate

        # Generate sine and cosine reference signals, for every harmonic
        for loop_harmonics in range(number_harmonics):

            # Compute the reference signals for current harmonic
            base_constant = 2 * numpy.pi * (loop_harmonics + 1) * compared_frequencies[loop_frequencies]
            base_sine_signal = numpy.sin((base_constant * time_points_count))
            base_cosine_signal = numpy.cos((base_constant * time_points_count))

            # Copy signals back to reference matrix
            base_position = loop_harmonics + 1
            sine_position = (2 * (base_position - 1) + 1)
            cosine_position = 2 * base_position
            cca_base_signal_matrix[loop_frequencies][sine_position - 1, :] = base_sine_signal
            cca_base_signal_matrix[loop_frequencies][cosine_position - 1, :] = base_cosine_signal

        # After the loop, extract the y_matrix from reference matrix for current SSVEP frequency
        y_matrix = cca_base_signal_matrix[loop_frequencies]

        # Create a CCA object, fit it, and take Rho as the correlation between the
        # first pair of canonical variates. Note: transform() must see the same
        # (time x features) orientation used for fitting, and score() would return
        # the regression R^2 rather than the Rho wanted here.
        cca_object = CCA(n_components=number_harmonics)
        cca_object.fit(numpy.transpose(input_data), numpy.transpose(y_matrix))
        values_x, values_y = cca_object.transform(
            numpy.transpose(input_data), numpy.transpose(y_matrix))
        cca_rho_values[0, loop_frequencies] = numpy.corrcoef(
            values_x[:, 0], values_y[:, 0])[0, 1]

    # After loop return and exit
    return cca_rho_values
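Usage sketch for cca_for_ssvep on random stand-in data; the channel count, duration, and frequency list are assumptions for illustration, and numpy plus sklearn's CCA must be imported as the function requires.

import numpy

eeg = numpy.random.randn(8, 500)      # (channels, time), 2 s at 250 Hz
rho = cca_for_ssvep(eeg, sampling_rate=250.0,
                    compared_frequencies=[10.0, 12.0, 15.0])
print(rho)                            # 1 x 3 row of Rho values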
def test_cca_implementation(): X = np.random.multivariate_normal(np.random.randint(50,100,(10)).astype('float'),np.identity(10),200) Y = np.random.multivariate_normal(np.random.randint(80,200,(6)).astype('float'),np.identity(6),200) X_test = np.random.multivariate_normal(np.random.randint(50,100,(10)).astype('float'),np.identity(10),20) Y_test = np.random.multivariate_normal(np.random.randint(50,100,(6)).astype('float'),np.identity(6),20) mdl_test = CCA(n_components = 6) mdl_test.fit(X,Y) Y_pred = mdl_test.predict(X) print Y_pred print '-'*50 # print Y_test from sklearn.cross_decomposition import CCA as CCA_sklearn mdl_actual = CCA_sklearn(n_components = 6) mdl_actual.fit(X,Y) print '-'*50 Y_actual = mdl_actual.predict(X) print Y_actual
[ 138, 33, 68, 2, 110, 43] ] print X.shape #X = N.array(Z)[:,0:3].tolist() #Y = N.array(Z)[:,3:6].tolist() print 'X=\n',X print 'Y=\n',Y Rx = N.corrcoef(X.T) Ry = N.corrcoef(Y.T) cca = CCA(n_components=1) cca.fit(X, Y) print "Rx:\n", Rx print "Ry:\n", Ry print "x_weights:\n", cca.x_weights_ print "y_weights:\n", cca.y_weights_ print "x_loadings:\n", cca.x_loadings_ print "y_loadings:\n", cca.y_loadings_ print "x_scores_:\n", cca.x_scores_ print "y_scores_:\n", cca.y_scores_ loadings_man_x = N.dot(Rx, cca.x_weights_) loadings_man_y = N.dot(Ry, cca.y_weights_) print "loadings_man_x:\n",loadings_man_x print "loadings_man_y:\n",loadings_man_y
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5 pls2 = PLSRegression(n_components=3) pls2.fit(X, Y) print("True B (such that: Y = XB + Err)") print(B) # compare pls2.coef_ with B print("Estimated B") print(np.round(pls2.coef_, 1)) pls2.predict(X) # PLS regression, with univariate response, a.k.a. PLS1 n = 1000 p = 10 X = np.random.normal(size=n * p).reshape((n, p)) y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5 pls1 = PLSRegression(n_components=3) pls1.fit(X, y) # note that the number of components exceeds 1 (the dimension of y) print("Estimated betas") print(np.round(pls1.coef_, 1)) # ############################################################################# # CCA (PLS mode B with symmetric deflation) cca = CCA(n_components=2) cca.fit(X_train, Y_train) X_train_r, Y_train_r = cca.transform(X_train, Y_train) X_test_r, Y_test_r = cca.transform(X_test, Y_test)
def mainExec(name_file, features):
    '''
    Based on a list of image names and image features, learn a CCA model based on
    Stacked Auxiliary Embedding and save this model to disk.
    :param name_file
    :param features
    :return:
    '''
    print "Creating vocabulary"
    voc = readVocabulary()
    print "Generating document vectors"
    occurrenceVectors, idf = createOccurrenceVectors(voc)
    print "Weighing vectors"
    weightedVectors = weight_tfidf(occurrenceVectors, idf)

    sentenceMatrix = []
    imagematrix = []
    print "Creating matrices"
    currentSentence = 0
    for i in weightedVectors.keys():
        if isLargeEnough(i):
            currentSentence += 1
            print "current Sentence: " + str(currentSentence)
            for j in range(len(weightedVectors[i])):
                weightedVectors[i][j] = float(weightedVectors[i][j])
            if currentSentence == 1:
                sentenceMatrix = weightedVectors[i]
                imagematrix = getImage(i, name_file, features)
            elif currentSentence == 2:
                sentenceMatrix = np.concatenate(([sentenceMatrix], [weightedVectors[i]]), axis=0)
                imagematrix = np.concatenate(([imagematrix], [getImage(i, name_file, features)]), axis=0)
            else:
                sentenceMatrix = np.concatenate((sentenceMatrix, [weightedVectors[i]]), axis=0)
                imagematrix = np.concatenate((imagematrix, [getImage(i, name_file, features)]), axis=0)

    print "Modelling cca"
    cca = CCA(n_components=128)
    cca.fit(sentenceMatrix, imagematrix)
    pickle.dump(cca, open("ccasnippetmodel.p", 'wb'))  # pickle needs binary mode

    idf = np.zeros(len(voc))
    trainingimages = []
    trainingsentences = []
    dp = getDataProvider('flickr30k')
    currentPair = 0
    for pair in dp.sampleImageSentencePair():
        currentPair += 1
        if currentPair % 100 == 0:
            print "Current pair: " + str(currentPair)
        img = pair['image']['feat']
        trainingimages.append(img)
        sentence = getFullSentence(pair)
        for i in range(len(sentence)):
            if sentence[i] > 0:
                idf[i] += 1
        trainingsentences.append(sentence)
    for i in range(len(trainingsentences)):
        trainingsentences[i] = trainingsentences[i] * idf

    # transform expects the same (sentence, image) argument order used in fit
    trans_sent, trans_img = cca.transform(trainingsentences, trainingimages)
    nn_img = nearest_neighbor(trainingimages)
    nn_sent = nearest_neighbor(trainingsentences)
    augmented_imgs = []
    augmented_sentences = []
    for i in range(len(trans_img)):
        # list.extend returns None, so build the augmented vector by concatenation
        augm_img = np.concatenate((trainingimages[i], phi(3000, nn_img, trans_img[i])))
        augmented_imgs.append(augm_img)
    for i in range(len(trans_sent)):
        augm_sent = np.concatenate((trainingsentences[i], phi(3000, nn_sent, trans_sent[i])))
        augmented_sentences.append(augm_sent)

    augmentedcca = CCA(n_components=96)
    augmentedcca.fit(augmented_sentences, augmented_imgs)
    # save the augmented model, not the first one
    pickle.dump(augmentedcca, open("augmentedcca.p", 'wb'))
def main(): sess = tf.InteractiveSession() X1_data, X2_data, Y_data, baseline_data, labels_data = read_inputs() # set up the DCCA network keep_input = tf.placeholder("float") keep_hidden = tf.placeholder("float") X1_in, X1_out = build_network(273, 1500, 1500, 1500, 50, keep_input, keep_hidden) X2_in, X2_out = build_network(112, 1500, 1500, 1500, 50, keep_input, keep_hidden) # define the DCCA cost function U = tf.placeholder("float", [50, 40]) V = tf.placeholder("float", [50, 40]) UtF = tf.matmul(tf.transpose(U), tf.transpose(X1_out)) GtV = tf.matmul(X2_out, V) canon_corr = tf.mul(1./BATCH, tf.reduce_sum(tf.mul(tf.matmul(UtF, GtV), tf.constant(np.eye(40), dtype = tf.float32)))) corr_step = tf.train.AdamOptimizer(1e-6).minimize(- canon_corr) sess.run(tf.initialize_all_variables()) # train the network print "Training DCCA" for i in range(0, EPOCHS): for j in range(0, len(X1_data.train), int(BATCH)): X1_in_batch = X1_data.train[j:(j + BATCH)] X2_in_batch = X2_data.train[j:(j + BATCH)] X1_out_batch = X1_out.eval(feed_dict = { X1_in : X1_in_batch, keep_input : 1.0, keep_hidden : 1.0}) X2_out_batch = X2_out.eval(feed_dict = { X2_in : X2_in_batch, keep_input : 1.0, keep_hidden : 1.0}) # compute CCA on the output layers cca = CCA(n_components = 40) cca.fit(X1_out_batch, X2_out_batch) U_batch = cca.x_weights_ V_batch = cca.y_weights_ # perform gradient step corr_step.run(feed_dict = { X1_in : X1_in_batch, X2_in : X2_in_batch, U : U_batch, V : V_batch, keep_input : 0.9, keep_hidden : 0.8}) # print useful info print "EPOCH", i, "/ COST", canon_corr.eval(feed_dict = { X1_in : X1_in_batch, X2_in : X2_in_batch, U : U_batch, V : V_batch, keep_input : 1.0, keep_hidden : 1.0}) # train the softmax classifier print "Training softmax" W_s = weight_variable([89, 39]) b_s = bias_variable([39]) baseline = tf.placeholder("float", [None, 39]) y_true = tf.placeholder("float", [None, 39]) # define the cost X1_baseline_combo = tf.concat(1, [X1_out, baseline]) y_pred = tf.nn.softmax(tf.matmul(X1_baseline_combo, W_s) + b_s) lr_cost = - tf.reduce_sum(y_true * tf.log(tf.clip_by_value(y_pred, 1e-10, 1.0))) lr_step = tf.train.AdamOptimizer(1e-4).minimize(lr_cost) # set up accuracy checking correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_true, 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) sess.run(tf.initialize_all_variables()) for i in range(0, EPOCHS): for j in range(0, len(X1_data.train), int(BATCH)): lr_step.run(feed_dict = { X1_in : X1_data.train[j:(j + BATCH)], y_true : Y_data.train[j:(j + BATCH)], baseline : baseline_data.train[j:(j + BATCH)], keep_input : 1.0, keep_hidden : 1.0}) print i, accuracy.eval(feed_dict = { X1_in : X1_data.dev, y_true : Y_data.dev, baseline : baseline_data.dev, keep_input : 1.0, keep_hidden : 1.0}) print "Test accuracy:", accuracy.eval(feed_dict = { X1_in : X1_data.test, y_true : Y_data.test, baseline : baseline_data.test, keep_input : 1.0, keep_hidden : 1.0}) # project the data and print it to file X1_train_proj = X1_baseline_combo.eval(feed_dict = { X1_in : X1_data.train, baseline : baseline_data.train, keep_input : 1.0, keep_hidden : 1.0}) X1_dev_proj = X1_baseline_combo.eval(feed_dict = { X1_in : X1_data.dev, baseline : baseline_data.dev, keep_input : 1.0, keep_hidden : 1.0}) X1_test_proj = X1_baseline_combo.eval(feed_dict = { X1_in : X1_data.test, baseline : baseline_data.test, keep_input : 1.0, keep_hidden : 1.0}) scipy.io.savemat("dcca_projected_data.mat", {'dataTr' : X1_train_proj, "PhonesTr" : labels_data.train, "dataDev" : X1_dev_proj, 
"PhonesDev" : labels_data.dev, "dataTest" : X1_test_proj, "PhonesTest" : labels_data.test})
session.execute("USE TweetsXiaohu") # session.execute("DROP TABLE IF EXISTS Tweet") rows = session.execute("SELECT text, hashtags FROM Tweet limit 1000") X, Y = [], [] for row in rows: X.append(row.text) Y.append([x.lower() for x in row.hashtags]) vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', decode_error='ignore') # print(vectorizer) X = vectorizer.fit_transform(X).toarray() # print '40', X # print type(X) Y_indicator = LabelBinarizer().fit(Y).transform(Y) cca = CCA(n_components = 100, max_iter=10) cca.fit(X, Y_indicator) X = cca.transform(X) # print '45', X # print type(X) classif = OneVsRestClassifier(SVC(kernel='linear')) classif.fit(X, Y) for row in rows: # row = rows[0] # print vectorizer.transform([row.text]).toarray() # print cca.predict(vectorizer.transform([row.text]).toarray()) transformed = vectorizer.transform([row.text]).toarray() # print '55', transformed ccad = cca.transform(transformed) # print '57', ccad predicts = classif.predict(ccad)
# check the array dtype
#print(data_selection.dtype)

# force dtype = float32
data_selection = data_selection.astype(np.float32, copy=False)

# complete cases: drop rows with NaNs or any non-finite value
data_selection = data_selection[~np.isnan(data_selection).any(axis=1)]
data_selection = data_selection[np.isfinite(data_selection).all(axis=1)]

# target variable / covariates
y = data_selection[:, 0:3]
x = data_selection[:, 4:]

# split test-train (model_selection replaces the removed cross_validation module)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.2, random_state=0)

cca = CCA(n_components=1, scale=True)
cca.fit(x_train, y_train)
#CCA(copy=True, max_iter=500, n_components=1, scale=True, tol=1e-06)

X_train_r, Y_train_r = cca.transform(x_train, y_train)
X_test_r, Y_test_r = cca.transform(x_test, y_test)

print(type(X_train_r))
print(np.shape(X_train_r))
print(np.shape(Y_train_r))
print(np.shape(x))

print(np.corrcoef(X_train_r[:, 0], Y_train_r[:, 0]))
print(np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0]))
plt.figure()
for i in range(5):
    plt.plot(nComponents, plsRegScores[i, :], lw=3)
plt.xlim(1, np.amax(nComponents))
plt.title('PLS Regression accuracy')
plt.xlabel('Number of components')
plt.ylabel('accuracy')
plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right')
plt.grid(True)

if (0):
    #%% Canonical Correlation Analysis
    nComponents = np.arange(1, nClasses + 1)
    cca = CCA(n_components=nClasses)
    cca.fit(Xtrain, Ytrain)
    XtrainT = cca.transform(Xtrain)
    XtestT = cca.transform(Xtest)
    ccaScores = np.zeros((5, len(nComponents)))  # np.alen was removed from numpy
    for i, n in enumerate(nComponents):
        ccaScores[:, i] = util.classify(XtrainT[:, 0:n], XtestT[:, 0:n], labelsTrain, labelsTest)

    cca = CCA(n_components=3)
    cca.fit(Xtrain, Ytrain)
    xt = cca.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig, xt, labelsTrain, classColors)
    plt.title('First 3 components of projected data')

#%% Plot accuracies for CCA
class CCA_Model: def __init__(self,n_components): self.n_components = n_components self.cca = CCA(n_components=n_components) self.ntop = 10 def learn_model(self,X_chanel, Y_chanel,Y_Distinct=None): """ :param X_chanel: array-like for X chanel :param Y_chanel: array-line for Y chanel :return: """ print "Start learning..." self.x_dim = len(X_chanel[0]) self.y_dim = len(Y_chanel[0]) self.cca.fit(X_chanel,Y_chanel) if Y_Distinct == None: self.X_transform ,self.Y_transform = self.cca.transform(X_chanel,Y_chanel) else: self.X_transform ,self.Y_transform = self.cca.transform(X_chanel,Y_Distinct) print "Learning completed" def get_bet_match_index_transform_x2y(self,x_transform): shape = self.Y_transform.shape scores = np.ndarray(shape[0],dtype=float) for i in xrange(shape[0]): scores[i] = np.dot(self.Y_transform[i],x_transform) #scores[i] = entropy(x_transform,self.Y_transform[i]) indices = (-scores).argsort()[:self.ntop] return [indices, scores[indices]] def get_bet_match_index_transform_y2x(self,y_transform): shape = self.X_transform.shape scores = np.ndarray(shape[0], dtype=float) for i in xrange(shape[0]): scores[i] = np.dot(self.X_transform[i], y_transform) #scores[i] = entropy(y_transform,self.X_transform[i]) indices = (-scores).argsort()[:self.ntop] return [indices, scores[indices]] def get_best_match_cross_indices_x2y(self,x_inputs): x_transformes = self.cca.transform(x_inputs) results = [] for x_transform in x_transformes: results.append(self.get_bet_match_index_transform_x2y(x_transform)) return results def get_best_match_cross_indices_y2x(self,y_inputs): _, y_transformes = self.cca.transform([[0 for i in xrange(self.x_dim)]],y_inputs) results = [] for y_transform in y_transformes: results.append(self.get_bet_match_index_transform_y2x(y_transform)) return results
plt.xlabel("Y comp. 1") plt.ylabel("Y comp. 2") plt.title('Y comp. 1 vs Y comp. 2 , (test corr = %.2f)'% numpy.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1]) plt.legend(loc="best") plt.xticks(()) plt.yticks(()) plt.savefig(output_file) plt.close() # PLSCA plsca = PLSCanonical(n_components=2) plsca.fit(Xtrain, Ytrain) # PLSCanonical(algorithm='nipals', copy=True, max_iter=500, n_components=2, # scale=True, tol=1e-06) X_train_r, Y_train_r = plsca.transform(Xtrain, Ytrain) X_test_r, Y_test_r = plsca.transform(Xtest, Ytest) do_plot(X_train_r,Y_train_r,X_test_r,Y_test_r,'%s/PLSCA_2comp_norm.pdf' %output_folder) # CCA # probably not necessary, but just in case the data was modified in some way Ytrain = norm.loc[train,:] Ytest = norm.loc[holdout,:] Xtrain = numpy.array(X.loc[train,:]) Xtest = X.loc[holdout,:] cca = CCA(n_components=2) cca.fit(Xtrain, Ytrain) # CCA(copy=True, max_iter=500, n_components=2, scale=True, tol=1e-06) X_train_r, Y_train_r = cca.transform(Xtrain, Ytrain) X_test_r, Y_test_r = cca.transform(Xtest, Ytest) do_plot(X_train_r,Y_train_r,X_test_r,Y_test_r,'%s/CCA_2comp_norm.pdf' %output_folder)