Example #1
def test_stats_vs_matlab():
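    # Reference values below presumably come from MATLAB's canoncorr, whose
    # stats output uses the same field names (Wilks, df1, df2, F, pF, chisq,
    # pChisq).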
    X = np.vstack((np.eye(3, 3), 2 * np.eye(3, 3)))
    Y1 = np.fliplr(np.eye(3, 3))
    Y = np.vstack((Y1, 0.1 * np.eye(3, 3)))
    matlab_stats = {
        'r':
        np.array([1.000000000000000, 0.533992991387982, 0.355995327591988]),
        'Wilks': np.array([0, 0.624256445446525, 0.873267326732673]),
        'df1': np.array([9, 4, 1]),
        'df2': np.array([0.150605850666856, 2, 2]),
        'F': np.array([np.inf, 0.132832080200501, 0.290249433106576]),
        'pF': np.array([0, 0.955941574355455, 0.644004672408012]),
        'chisq': np.array([np.inf, 0.706791037156489, 0.542995281660087]),
        'pChisq': np.array([0, 0.950488814632803, 0.461194028737338])
    }

    cca = CCA(n_components=3)
    scores = cca.fit_transform([X, Y])
    stats = cca.stats(scores)

    assert np.allclose(stats['r'][0], 1)
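    # the first correlation is exactly 1 (degenerate), with Wilks' lambda 0
    # and infinite F/chi-squared; the remaining components are non-degenerate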
    nondegen = np.argwhere(stats['r'] < 1 - 2 * np.finfo(float).eps).squeeze()
    assert np.array_equal(nondegen, np.array([1, 2]))

    for key in stats:
        assert np.allclose(stats[key], matlab_stats[key], rtol=1e-3, atol=1e-4)
Example #2
def test_with_mapping():

    try:
        from ..transformer import BPtTransformerMV
        from mvlearn.embed import CCA
    except (ImportError, ModuleNotFoundError):
        return

    X = get_X()
    mv = BPtTransformerMV(estimator=CCA(multiview_output=False),
                          inds=[[0, 1, 2], [3, 4]])

    # This mapping should ignore feature 1, adding it back on afterwards
    mapping = {0: 0, 1: 0, 2: 2, 3: 3, 4: 4}
    X_trans = mv.fit_transform(X, mapping=mapping)

    assert X_trans.shape == (4, 2)
    assert np.array_equal(X_trans[:, 1], X[:, 1])

    assert len(mv.estimator_.loadings_) == 2
    assert len(mv.estimator_.loadings_[0]) == 2
    assert len(mv.estimator_.loadings_[1]) == 2
    assert len(mv.estimator_.means_) == 2
    assert len(mv.estimator_.means_[0]) == 2
    assert len(mv.estimator_.means_[1]) == 2
Example #3
def test_with_cache():

    try:
        from ..transformer import BPtTransformerMV
        from mvlearn.embed import CCA
    except (ImportError, ModuleNotFoundError):
        return

    temp_dr = os.path.join(tempfile.gettempdir(), 'temp_dr')
    if os.path.exists(temp_dr):
        shutil.rmtree(temp_dr)

    X = get_X()
    mv = BPtTransformerMV(estimator=CCA(multiview_output=False),
                          inds=[[0, 1, 2], [3, 4]],
                          cache_loc=temp_dr)

    # Fit once w/ caching
    mv.fit_transform(X)
    assert os.listdir(temp_dr) == ['joblib']

    # Fit again; the result should load from the cache
    X_trans = mv.fit_transform(X)
    assert X_trans.shape == (4, 1)

    if os.path.exists(temp_dr):
        shutil.rmtree(temp_dr)

    assert not os.path.exists(temp_dr)
Example #4
def basic_test():

    try:
        from ..transformer import BPtTransformerMV
        from mvlearn.embed import CCA
    except (ImportError, ModuleNotFoundError):
        return

    X = get_X()
    mv = BPtTransformerMV(estimator=CCA(multiview_output=False),
                          inds=[[0, 1, 2], [3, 4]])

    X_trans = mv.fit_transform(X)

    # Basic Checks
    assert X_trans.shape == (4, 1)
    assert mv.inds_ == [0, 1, 2, 3, 4]
    assert mv.view_inds_ == [[0, 1, 2], [3, 4]]
    assert mv.out_mapping_[0] == [0]
    assert mv.out_mapping_[4] == [0]
    assert len(mv.out_mapping_) == 5
    assert isinstance(mv.estimator_, CCA)
    assert mv.n_trans_feats_ == 1
    assert mv.n_features_in_ == 5

    assert len(mv.estimator_.means_) == 2
    assert len(mv.estimator_.means_[0]) == 3
    assert len(mv.estimator_.means_[1]) == 2

    assert len(mv.estimator_.loadings_) == 2
    assert len(mv.estimator_.loadings_[0]) == 3
    assert len(mv.estimator_.loadings_[1]) == 2
Example #5
def test_stats_1_feature_vs_matlab():
    X = np.arange(1, 11).reshape(-1, 1)
    Y = np.arange(2, 21, 2).reshape(-1, 1)
    matlab_stats = {
        'r': np.array([1]),
        'Wilks': np.array([0]),
        'df1': np.array([1]),
        'df2': np.array([8]),
        'F': np.array([np.inf]),
        'pF': np.array([0]),
        'chisq': np.array([np.inf]),
        'pChisq': np.array([0])
    }

    cca = CCA(n_components=1)
    scores = cca.fit_transform([X, Y])
    stats = cca.stats(scores)

    for key in stats:
        assert np.allclose(stats[key], matlab_stats[key], rtol=1e-3, atol=1e-4)
Example #6
def test_stats_2_components():
    np.random.seed(12)
    X = np.random.rand(100, 3)
    Y = np.random.rand(100, 4)
    past_stats = {
        'r': np.array([0.22441608, 0.19056307]),
        'Wilks': np.array([0.91515202, 0.96368572]),
        'df1': np.array([12, 6]),
        'df2': np.array([246.34637455, 188]),
        'F': np.array([0.69962605, 0.58490315]),
        'pF': np.array([0.75134965, 0.74212361]),
        'chisq': np.array([8.42318331, 4.2115406]),
        'pChisq': np.array([0.75124771, 0.64807349])
    }

    cca = CCA(n_components=2)
    scores = cca.fit_transform([X, Y])
    stats = cca.stats(scores)

    nondegen = np.argwhere(stats['r'] < 1 - 2 * np.finfo(float).eps).squeeze()
    assert np.array_equal(nondegen, np.array([0, 1]))

    for key in stats:
        assert np.allclose(stats[key], past_stats[key], rtol=1e-3, atol=1e-4)
Example #7
def test_stats_1_component():
    np.random.seed(12)
    X = np.random.rand(100, 3)
    Y = np.random.rand(100, 4)
    past_stats = {
        'r': np.array([0.22441608326082138]),
        'Wilks': np.array([0.94963742]),
        'df1': np.array([12]),
        'df2': np.array([246.34637455]),
        'F': np.array([0.40489714]),
        'pF': np.array([0.96096493]),
        'chisq': np.array([4.90912773]),
        'pChisq': np.array([0.9609454])
    }

    cca = CCA(n_components=1)
    scores = cca.fit_transform([X, Y])
    stats = cca.stats(scores)

    assert stats['r'] != 1
    assert stats['r'] + 2 * np.finfo(float).eps < 1

    for key in stats:
        assert np.allclose(stats[key], past_stats[key], rtol=1e-3, atol=1e-4)
Example #8
                                                random_state=23,
                                                return_decomp=True)

###############################################################################
# CCA
# ^^^
#
# CCA, equivalent to 2-view MCCA, learns transformations of the views,
# projecting linear combinations of the features to components such that the
# sum of correlations between the ith components of each view is maximized.
# We see the top three components of the first two views plotted against each
# other, pairwise. The strong linear shape along the diagonal panels shows
# that the components found correlate well.

# the default is no regularization, meaning this is SUMCORR-AVGVAR MCCA
cca = CCA(n_components=joint_rank)

# the fit-transform method outputs the scores for each view
cca_scores = cca.fit_transform(Xs[:2])
crossviews_plot(cca_scores,
                title='CCA scores (first two views fitted)',
                equal_axes=True,
                scatter_kwargs={
                    'alpha': 0.4,
                    's': 2.0
                })

# In the 2-view setting, a variety of interpretable statistics can be
# calculated. We assess the canonical correlations achieved and their
# significance using the p-values from a Wilks' lambda test, as sketched
# below.
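
# A minimal sketch of that step, since the original excerpt ends here. Per
# the test examples above, cca.stats takes the fitted scores and returns a
# dict with keys including 'r', 'Wilks', 'F', and 'pF'; the printing below
# is illustrative only.
stats = cca.stats(cca_scores)
print('canonical correlations:', stats['r'])
print("Wilks' lambda p-values:", stats['pF'])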
Example #9
cmap = matplotlib.colors.ListedColormap(
    sns.diverging_palette(240, 10, n=len(labels), center='light').as_hex())
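# note: the custom palette above is immediately overridden by the built-in
# 'coolwarm' colormap on the next line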
cmap = 'coolwarm'

method_labels = \
    ['Raw Views', 'CCA', 'Polynomial KCCA', 'Gaussian KCCA', 'DCCA']
transform_labels = \
    ['Linear Transform', 'Polynomial Transform', 'Sinusoidal Transform']

input_size1 = Xs_train_sets[0][0].shape[1]
input_size2 = Xs_train_sets[0][1].shape[1]
outdim_size = min(Xs_train_sets[0][0].shape[1], 2)
layer_sizes1 = [256, 256, outdim_size]
layer_sizes2 = [256, 256, outdim_size]
methods = [
    CCA(regs=0.1, n_components=2),
    KMCCA(kernel='poly', regs=0.1, kernel_params={'degree': 2, 'coef0': 0.1},
          n_components=2),
    KMCCA(kernel='rbf', regs=0.1, kernel_params={'gamma': 1/4},
          n_components=2),
    DCCA(input_size1, input_size2, outdim_size, layer_sizes1, layer_sizes2,
         epoch_num=400)
]

fig, axes = plt.subplots(3 * 2, 5 * 2, figsize=(22, 12))
sns.set_context('notebook')

for r, transform in enumerate(transforms):
    axs = axes[2 * r:2 * r + 2, :2]
    for i, ax in enumerate(axs.flatten()):
        dim2 = int(i / 2)
Example #10
# Inference using regularized CCA
# -------------------------------
#
# Canonical Correlation Analysis (:class:`mvlearn.embed.CCA`)
# finds separate linear projections of views which are maximally
# correlated. We can thus embed the data jointly and observe that the first two
# embeddings are highly correlated and capture the differences between
# genetic types. One can use this to construct a single view
# for subsequent inference, or to examine the loading weights across views.
# Because the genetic expression data has more features than samples, we need
# to use regularization so as not to trivially overfit.

from mvlearn.plotting import crossviews_plot  # noqa: E402
from mvlearn.embed import CCA  # noqa: E402

cca = CCA(n_components=2, regs=[0.9, 0.1])
Xs_cca = cca.fit_transform(Xs)

y_labels = [diet_names[j] + f' ({genotype_names[i]})' for (i, j) in y]
f, axes = crossviews_plot(Xs_cca,
                          labels=np.asarray(['Red', 'Blue'])[y[:, 0]],
                          ax_ticks=False,
                          figsize=(5, 5),
                          equal_axes=True,
                          title='CCA view embeddings',
                          scatter_kwargs=sca_kwargs,
                          show=False)
corr1, corr2 = cca.canon_corrs(Xs_cca)
axes[0, 0].annotate(f'1st Canonical\nCorrelation = {corr1:.2f}',
                    xy=(0.95, 0.05),
                    xycoords='axes fraction',
Example #11
def test_errors():
    cca = CCA()
    with pytest.raises(ValueError):
        cca.fit([train1, train2, train2])

    with pytest.raises(ValueError):
        cca.fit([train1, train2])
        cca.stats([train1, train1, train1])

    with pytest.raises(AssertionError):
        cca.fit([train1, train2])
        _ = cca.stats([train1, train2])

    with pytest.raises(KeyError):
        cca.fit([train1, train2])
        _ = cca.stats([train1, train1], 'FAIL')
Example #12
data1 = 0.25 * indep1 + 0.75 * np.vstack(
    (latvar1, latvar2, latvar1, latvar2)).T
data2 = 0.25 * indep2 + 0.75 * np.vstack(
    (latvar1, latvar2, latvar1, latvar2, latvar1)).T

# Split each dataset into a training set and test set
# (10% of dataset is training data)
train1 = data1[:int(nSamples / 10)]
train2 = data2[:int(nSamples / 10)]
test1 = data1[int(nSamples / 10):]
test2 = data2[int(nSamples / 10):]

n_components = 4

# Initialize CCA
cca = CCA(regs=0.001, n_components=n_components)

# Use the methods to find a CCA mapping and transform the views of data
cca_ft = cca.fit_transform([train1, train2])
cca_f = cca.fit([train1, train2])
cca_t = cca.transform([train1, train2])

# Gaussian-noise data: samples along a sinusoidal curve
N = 100
t = np.random.uniform(-np.pi, np.pi, N)
e1 = np.random.normal(0, 0.05, (N, 2))
e2 = np.random.normal(0, 0.05, (N, 2))

x = np.zeros((N, 2))
x[:, 0] = t
x[:, 1] = np.sin(3 * t)
Example #13
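# (presumably parametrized upstream, e.g. via pytest.mark.parametrize with
# non-integer n_components values; the decorator is not shown in this excerpt)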
def test_n_components(n_components):
    cca = CCA(n_components=n_components)
    with pytest.raises(ValueError, match="n_components must be an integer"):
        cca = cca.fit([train1, train2])