def test_stats_vs_matlab():
    """Compare every statistic from ``CCA.stats`` with stored MATLAB values.

    Both views are built from stacked, scaled 3x3 identity matrices. The
    test asserts that the first canonical correlation is 1 (degenerate),
    that components 1 and 2 are the non-degenerate ones, and that each
    returned statistic matches the reference within tolerance.
    """
    eye = np.eye(3)
    X = np.vstack((eye, 2 * eye))
    Y = np.vstack((np.fliplr(eye), 0.1 * eye))

    # Reference values; keys mirror the keys returned by ``CCA.stats``.
    expected = {
        'r': np.array([1.000000000000000, 0.533992991387982,
                       0.355995327591988]),
        'Wilks': np.array([0, 0.624256445446525, 0.873267326732673]),
        'df1': np.array([9, 4, 1]),
        'df2': np.array([0.150605850666856, 2, 2]),
        'F': np.array([np.inf, 0.132832080200501, 0.290249433106576]),
        'pF': np.array([0, 0.955941574355455, 0.644004672408012]),
        'chisq': np.array([np.inf, 0.706791037156489, 0.542995281660087]),
        'pChisq': np.array([0, 0.950488814632803, 0.461194028737338]),
    }

    model = CCA(n_components=3)
    stats = model.stats(model.fit_transform([X, Y]))

    # First component: perfectly correlated.
    assert np.allclose(stats['r'][0], 1)

    # Remaining components must sit strictly below 1 (up to 2 * eps).
    threshold = 1 - 2 * np.finfo(float).eps
    below_one = stats['r'] < threshold
    nondegen = np.argwhere(below_one).squeeze()
    assert np.array_equal(nondegen, np.array([1, 2]))

    for name, observed in stats.items():
        assert np.allclose(observed, expected[name], rtol=1e-3, atol=1e-4)
def test_with_mapping():
    """Check ``BPtTransformerMV`` wrapping ``CCA`` when a mapping is passed.

    Returns early (no checks run) when ``BPtTransformerMV`` or
    ``mvlearn`` cannot be imported.
    """
    try:
        from ..transformer import BPtTransformerMV
        from mvlearn.embed import CCA
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so this one
        # clause covers both.
        return

    X = get_X()
    mv = BPtTransformerMV(estimator=CCA(multiview_output=False),
                          inds=[[0, 1, 2], [3, 4]])

    # This mapping should ignore feat 1, adding it on after
    mapping = {0: 0, 1: 0, 2: 2, 3: 3, 4: 4}
    X_trans = mv.fit_transform(X, mapping=mapping)

    # One transformed column, plus the untouched feature 1 appended after.
    assert X_trans.shape == (4, 2)
    assert np.array_equal(X_trans[:, 1], X[:, 1])

    # Two views, each fitted on two features.
    assert len(mv.estimator_.loadings_) == 2
    assert len(mv.estimator_.loadings_[0]) == 2
    assert len(mv.estimator_.loadings_[1]) == 2

    # The length must be compared *outside* len(); ``len(a == 2)`` only
    # measures a boolean array and is truthy for any non-empty input.
    assert len(mv.estimator_.means_) == 2
    assert len(mv.estimator_.means_[0]) == 2
    assert len(mv.estimator_.means_[1]) == 2
def test_with_cache():
    """Check that ``BPtTransformerMV`` works with ``cache_loc`` set.

    Fits twice against the same cache directory: the first fit must create
    a ``joblib`` entry there, the second must still produce the expected
    output shape. Returns early (no checks run) when ``BPtTransformerMV``
    or ``mvlearn`` cannot be imported.
    """
    try:
        from ..transformer import BPtTransformerMV
        from mvlearn.embed import CCA
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so this one
        # clause covers both.
        return

    temp_dr = os.path.join(tempfile.gettempdir(), 'temp_dr')

    # Start from a clean slate in case an earlier run left the dir behind.
    if os.path.exists(temp_dr):
        shutil.rmtree(temp_dr)

    try:
        X = get_X()
        mv = BPtTransformerMV(estimator=CCA(multiview_output=False),
                              inds=[[0, 1, 2], [3, 4]],
                              cache_loc=temp_dr)

        # Fit once w/ caching
        mv.fit_transform(X)
        assert os.listdir(temp_dr) == ['joblib']

        # Fit again, should load from cached
        X_trans = mv.fit_transform(X)
        assert X_trans.shape == (4, 1)
    finally:
        # Always remove the cache directory, even when an assertion above
        # fails, so a failing run does not leak files into the temp dir.
        if os.path.exists(temp_dr):
            shutil.rmtree(temp_dr)

    assert not os.path.exists(temp_dr)
def basic_test():
    """Check fitted attributes of ``BPtTransformerMV`` wrapping ``CCA``.

    Returns early (no checks run) when ``BPtTransformerMV`` or
    ``mvlearn`` cannot be imported.

    NOTE(review): the name does not start with ``test``, so pytest's
    default discovery rules will not collect this function unless the
    project configures ``python_functions`` or calls it explicitly.
    Confirm this is intended.
    """
    try:
        from ..transformer import BPtTransformerMV
        from mvlearn.embed import CCA
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so this one
        # clause covers both.
        return

    X = get_X()
    mv = BPtTransformerMV(estimator=CCA(multiview_output=False),
                          inds=[[0, 1, 2], [3, 4]])
    X_trans = mv.fit_transform(X)

    # Basic Checks
    assert X_trans.shape == (4, 1)
    assert mv.inds_ == [0, 1, 2, 3, 4]
    assert mv.view_inds_ == [[0, 1, 2], [3, 4]]
    assert mv.out_mapping_[0] == [0]
    assert mv.out_mapping_[4] == [0]
    assert len(mv.out_mapping_) == 5
    assert isinstance(mv.estimator_, CCA)
    assert mv.n_trans_feats_ == 1
    assert mv.n_features_in_ == 5

    # The length must be compared *outside* len(); ``len(a == 3)`` only
    # measures a boolean array and is truthy for any non-empty input.
    assert len(mv.estimator_.means_) == 2
    assert len(mv.estimator_.means_[0]) == 3
    assert len(mv.estimator_.means_[1]) == 2

    assert len(mv.estimator_.loadings_) == 2
    assert len(mv.estimator_.loadings_[0]) == 3
    assert len(mv.estimator_.loadings_[1]) == 2
def test_stats_1_feature_vs_matlab():
    """Compare ``CCA.stats`` with stored MATLAB values for 1-feature views.

    Each view is a single column; ``Y`` holds the even numbers 2..20 and
    ``X`` the integers 1..10.
    """
    X = np.arange(1, 11)[:, np.newaxis]
    Y = np.arange(2, 21, 2)[:, np.newaxis]

    # Reference values; keys mirror the keys returned by ``CCA.stats``.
    expected = {
        'r': np.array([1]),
        'Wilks': np.array([0]),
        'df1': np.array([1]),
        'df2': np.array([8]),
        'F': np.array([np.inf]),
        'pF': np.array([0]),
        'chisq': np.array([np.inf]),
        'pChisq': np.array([0]),
    }

    model = CCA(n_components=1)
    stats = model.stats(model.fit_transform([X, Y]))

    for name, observed in stats.items():
        assert np.allclose(observed, expected[name], rtol=1e-3, atol=1e-4)
def test_stats_2_components():
    """Regression check of ``CCA.stats`` with two components.

    Uses seeded random views (seed 12) and compares every statistic with
    previously recorded values; both components must be non-degenerate.
    """
    np.random.seed(12)
    # Draw order matters: X first, then Y, from the same seeded stream.
    X = np.random.rand(100, 3)
    Y = np.random.rand(100, 4)

    # Previously recorded values; keys mirror those from ``CCA.stats``.
    expected = {
        'r': np.array([0.22441608, 0.19056307]),
        'Wilks': np.array([0.91515202, 0.96368572]),
        'df1': np.array([12, 6]),
        'df2': np.array([246.34637455, 188]),
        'F': np.array([0.69962605, 0.58490315]),
        'pF': np.array([0.75134965, 0.74212361]),
        'chisq': np.array([8.42318331, 4.2115406]),
        'pChisq': np.array([0.75124771, 0.64807349]),
    }

    model = CCA(n_components=2)
    stats = model.stats(model.fit_transform([X, Y]))

    # Every correlation must sit strictly below 1 (up to 2 * eps).
    threshold = 1 - 2 * np.finfo(float).eps
    below_one = stats['r'] < threshold
    nondegen = np.argwhere(below_one).squeeze()
    assert np.array_equal(nondegen, np.array([0, 1]))

    for name, observed in stats.items():
        assert np.allclose(observed, expected[name], rtol=1e-3, atol=1e-4)
def test_stats_1_component():
    """Regression check of ``CCA.stats`` with a single component.

    Uses seeded random views (seed 12), asserts the one canonical
    correlation is non-degenerate, and compares every statistic with
    previously recorded values.
    """
    np.random.seed(12)
    # Draw order matters: X first, then Y, from the same seeded stream.
    X = np.random.rand(100, 3)
    Y = np.random.rand(100, 4)

    # Previously recorded values; keys mirror those from ``CCA.stats``.
    expected = {
        'r': np.array([0.22441608326082138]),
        'Wilks': np.array([0.94963742]),
        'df1': np.array([12]),
        'df2': np.array([246.34637455]),
        'F': np.array([0.40489714]),
        'pF': np.array([0.96096493]),
        'chisq': np.array([4.90912773]),
        'pChisq': np.array([0.9609454]),
    }

    model = CCA(n_components=1)
    stats = model.stats(model.fit_transform([X, Y]))

    # The correlation is neither exactly 1 nor within 2 * eps of it.
    corr = stats['r']
    slack = 2 * np.finfo(float).eps
    assert not (corr == 1)
    assert not (corr + slack >= 1)

    for name, observed in stats.items():
        assert np.allclose(observed, expected[name], rtol=1e-3, atol=1e-4)
random_state=23, return_decomp=True) ############################################################################### # CCA # ^^^ # # CCA, equivalent to 2 view MCCA, learns transformations of the views, # projecting a linear combination of the features to a component such that the # sum of correlations between the ith components of each view is maximized. We # see the top three components of the first two views plotted against each # other, pairwise. The strong linear shape on the diagonals shows that the # found components correlate well. # the default is no regularization meaning this is SUMCORR-AVGVAR MCCA cca = CCA(n_components=joint_rank) # the fit-transform method outputs the scores for each view cca_scores = cca.fit_transform(Xs[:2]) crossviews_plot(cca_scores, title='CCA scores (first two views fitted)', equal_axes=True, scatter_kwargs={ 'alpha': 0.4, 's': 2.0 }) # In the 2 view setting, a variety of interpretable statistics can be # calculated. We assess the canonical correlations achieved and # their significance using the p-values from a Wilks' Lambda test
cmap = matplotlib.colors.ListedColormap( sns.diverging_palette(240, 10, n=len(labels), center='light').as_hex()) cmap = 'coolwarm' method_labels = \ ['Raw Views', 'CCA', 'Polynomial KCCA', 'Gaussian KCCA', 'DCCA'] transform_labels = \ ['Linear Transform', 'Polynomial Transform', 'Sinusoidal Transform'] input_size1 = Xs_train_sets[0][0].shape[1] input_size2 = Xs_train_sets[0][1].shape[1] outdim_size = min(Xs_train_sets[0][0].shape[1], 2) layer_sizes1 = [256, 256, outdim_size] layer_sizes2 = [256, 256, outdim_size] methods = [ CCA(regs=0.1, n_components=2), KMCCA(kernel='poly', regs=0.1, kernel_params={'degree': 2, 'coef0': 0.1}, n_components=2), KMCCA(kernel='rbf', regs=0.1, kernel_params={'gamma': 1/4}, n_components=2), DCCA(input_size1, input_size2, outdim_size, layer_sizes1, layer_sizes2, epoch_num=400) ] fig, axes = plt.subplots(3 * 2, 5 * 2, figsize=(22, 12)) sns.set_context('notebook') for r, transform in enumerate(transforms): axs = axes[2 * r:2 * r + 2, :2] for i, ax in enumerate(axs.flatten()): dim2 = int(i / 2)
# Inference using regularized CCA # ------------------------------- # # Canonical Correlation Analysis (:class:`mvlearn.embed.CCA`) # finds separate linear projections of views which are maximally # correlated. We can thus embed the data jointly and observe that the first two # embeddings are highly correlated and capture the differences between # genetic types. One can use this to construct a single view # for subsequent inference, or to examine the loading weights across views. # Because the genetic expression data has more features than samples, we need # to use regularization so as not to trivially overfit. from mvlearn.plotting import crossviews_plot # noqa: E402 from mvlearn.embed import CCA # noqa: E402 cca = CCA(n_components=2, regs=[0.9, 0.1]) Xs_cca = cca.fit_transform(Xs) y_labels = [diet_names[j] + f' ({genotype_names[i]})' for (i, j) in y] f, axes = crossviews_plot(Xs_cca, labels=np.asarray(['Red', 'Blue'])[y[:, 0]], ax_ticks=False, figsize=(5, 5), equal_axes=True, title='CCA view embeddings', scatter_kwargs=sca_kwargs, show=False) corr1, corr2 = cca.canon_corrs(Xs_cca) axes[0, 0].annotate(f'1st Canonical\nCorrelation = {corr1:.2f}', xy=(0.95, 0.05), xycoords='axes fraction',
def test_errors():
    """Check the exceptions raised by ``CCA.fit`` and ``CCA.stats``.

    Expected failures:
      * ``fit`` with three views -> ``ValueError``
      * ``stats`` with three score arrays -> ``ValueError``
      * ``stats`` with ``[train1, train2]`` -> ``AssertionError``
      * ``stats`` with the statistic name ``'FAIL'`` -> ``KeyError``
    """
    cca = CCA()

    with pytest.raises(ValueError):
        cca.fit([train1, train2, train2])

    # Fit once, outside any ``pytest.raises`` context: an exception from
    # ``fit`` must surface as a failure instead of silently satisfying an
    # expectation that is meant for ``stats``.
    cca.fit([train1, train2])

    with pytest.raises(ValueError):
        cca.stats([train1, train1, train1])

    with pytest.raises(AssertionError):
        cca.stats([train1, train2])

    with pytest.raises(KeyError):
        cca.stats([train1, train1], 'FAIL')
# Module-level fixtures shared by the tests in this file.
# Each view is 25% independent signal plus 75% stacked latent variables.
data1 = 0.25 * indep1 + 0.75 * np.vstack(
    (latvar1, latvar2, latvar1, latvar2)
).T
data2 = 0.25 * indep2 + 0.75 * np.vstack(
    (latvar1, latvar2, latvar1, latvar2, latvar1)
).T

# Train/test split: the first int(nSamples / 10) rows form the training
# set, the remaining rows the test set.
_n_train = int(nSamples / 10)
train1, test1 = data1[:_n_train], data1[_n_train:]
train2, test2 = data2[:_n_train], data2[_n_train:]

n_components = 4

# Regularized CCA model reused across the tests.
cca = CCA(regs=0.001, n_components=n_components)

# Results of the three entry points on the training views, in this order:
# fit_transform, then fit, then transform.
cca_ft = cca.fit_transform([train1, train2])
cca_f = cca.fit([train1, train2])
cca_t = cca.transform([train1, train2])

# Gaussian-related data. The random draws stay in the order t, e1, e2 so
# the values taken from the global random stream are unchanged.
N = 100
t = np.random.uniform(-np.pi, np.pi, N)
e1 = np.random.normal(0, 0.05, (N, 2))
e2 = np.random.normal(0, 0.05, (N, 2))

# Column 0 is t, column 1 is sin(3t).
x = np.column_stack((t, np.sin(3 * t)))
def test_n_components(n_components):
    """Fitting ``CCA`` with the supplied ``n_components`` must raise.

    ``n_components`` is provided by the caller (presumably a pytest
    parametrization that is not visible here -- TODO confirm). The fit is
    expected to raise ``ValueError`` whose message matches
    "n_components must be an integer".
    """
    model = CCA(n_components=n_components)
    expected_msg = "n_components must be an integer"
    with pytest.raises(ValueError, match=expected_msg):
        model.fit([train1, train2])