def test_UCImultifeature_dataloader_select_views():
    """Selecting a subset of views returns those views with matching shapes."""
    chosen = [4, 5, 1]
    full_views, _ = load_UCImultifeature()
    subset_views, subset_labels = load_UCImultifeature(views=chosen)

    assert len(subset_views) == len(chosen)
    assert subset_labels.shape[0] == 2000

    # each returned view must match the shape of the corresponding full view
    for got, idx in zip(subset_views, chosen):
        assert got.shape == full_views[idx].shape
def test_UCImultifeature_dataloader():
    """Default load yields 6 views of 2000 samples, and is deterministic."""
    data, labels = load_UCImultifeature()

    assert len(data) == 6
    assert labels.shape[0] == 2000

    # every view holds one row per sample
    for view in data:
        assert view.shape[0] == 2000

    # a second unshuffled load must reproduce the same data and labels
    data_again, labels_again = load_UCImultifeature()
    assert np.allclose(data[0], data_again[0])
    assert np.allclose(labels, labels_again)
# --- Example #3 (scraped-snippet separator; "0" was a vote-count artifact) ---
 def __init__(self, train=True):
     """Load all six UCI multi-feature views, each min-max scaled to [0, 1]."""
     raw_views, self.labels = load_UCImultifeature()
     self.train = train
     # Scale every view independently and expose them as view_1 ... view_6.
     scaled = [MinMaxScaler().fit_transform(v) for v in raw_views]
     (self.view_1, self.view_2, self.view_3,
      self.view_4, self.view_5, self.view_6) = scaled
def test_UCImultifeature_randomstate_sameordifferent():
    """Equal random_state values reproduce the shuffle; others diverge."""
    data, labels = load_UCImultifeature(shuffle=True, random_state=2)
    data_other_seed, _ = load_UCImultifeature(shuffle=True, random_state=5)
    data_same_seed, _ = load_UCImultifeature(shuffle=True, random_state=2)
    data_unshuffled, _ = load_UCImultifeature(shuffle=False)

    assert len(data) == 6
    assert labels.shape[0] == 2000

    # every view holds one row per sample
    for view in data:
        assert view.shape[0] == 2000

    # identical seeds must match exactly; every other pairing must differ
    zipped = zip(data, data_same_seed, data_other_seed, data_unshuffled)
    for same_a, same_b, other, plain in zipped:
        assert np.allclose(same_a, same_b)
        assert not np.allclose(same_a, other)
        assert not np.allclose(same_a, plain)
        assert not np.allclose(other, plain)
def test_UCImultifeature_dataloader_select_labels():
    """Selecting a subset of classes returns exactly those classes.

    Fix: the original compared ``list(set(labels))`` element-wise against the
    requested label list, which relies on set iteration order -- an
    implementation detail that is not guaranteed.  Sorting both sides makes
    the comparison deterministic and order-independent.
    """
    # load data restricted to three digit classes
    lab = [0, 1, 2]
    data, labels = load_UCImultifeature(select_labeled=lab)

    assert len(data) == 6

    # 200 samples per selected class
    assert labels.shape[0] == 600

    # exactly the requested classes are present (order-independent check)
    assert sorted(set(labels)) == sorted(lab)

    # check size of data
    for i in range(6):
        assert data[i].shape[0] == 600
# License: MIT

from mvlearn.datasets import load_UCImultifeature
from mvlearn.cluster import MultiviewSphericalKMeans
from sklearn.manifold import TSNE
from sklearn.metrics import normalized_mutual_info_score as nmi_score
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')  # Ignore warnings

# Load in UCI digits multiple feature dataset as an example

RANDOM_SEED = 5  # fixed seed for reproducibility of the clustering below
# Load labels for digits 0 through 4, restricted to views 0 and 1
# (per the view list elsewhere in this file: Fourier coefficients and
# profile correlations -- TODO confirm against load_UCImultifeature docs).
n_class = 5
Xs, labels = load_UCImultifeature(select_labeled=list(range(n_class)),
                                  views=[0, 1])


# Helper to display the data and the results of clustering side by side
def display_plots(pre_title, data, labels):
    """Scatter-plot the first two columns of two views, colored by label.

    Parameters
    ----------
    pre_title : str
        Prefix for each subplot title (e.g. the method being shown).
    data : sequence of 2 arrays
        Two views; only columns 0 and 1 of each are plotted.
    labels : array-like
        Per-sample values used to color the points.

    NOTE(review): this definition appears truncated in this chunk -- the
    view-2 y-axis handling and any final plt.show() are not visible here.
    """
    # plot the two views as side-by-side scatter plots
    fig, ax = plt.subplots(1, 2, figsize=(14, 5))
    dot_size = 10
    ax[0].scatter(data[0][:, 0], data[0][:, 1], c=labels, s=dot_size)
    ax[0].set_title(pre_title + ' View 1')
    ax[0].axes.get_xaxis().set_visible(False)
    ax[0].axes.get_yaxis().set_visible(False)

    ax[1].scatter(data[1][:, 0], data[1][:, 1], c=labels, s=dot_size)
    ax[1].set_title(pre_title + ' View 2')
    ax[1].axes.get_xaxis().set_visible(False)
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from mvlearn.semi_supervised import CTClassifier
from mvlearn.datasets import load_UCImultifeature

###############################################################################
# Load the UCI Multiple Digit Features Dataset
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# To simulate a semi-supervised learning scenario, randomly remove 98% of the
# labels.

# Load only digit classes 0 and 1 (binary setting)
data, labels = load_UCImultifeature(select_labeled=[0, 1])

# Use only the first 2 views as an example
View0, View1 = data[0], data[1]

# Split both views into testing and training.  Using the same test_size and
# random_state keeps the two splits aligned sample-for-sample, so the
# labels_train/labels_test from the two calls are identical.
View0_train, View0_test, labels_train, labels_test = train_test_split(
    View0, labels, test_size=0.33, random_state=42)
View1_train, View1_test, labels_train, labels_test = train_test_split(
    View1, labels, test_size=0.33, random_state=42)

# Mark ~98% of the training labels as unknown (NaN) to simulate the
# semi-supervised setting described above; keep the indices that survived.
np.random.seed(6)
remove_idx = np.random.rand(len(labels_train), ) < 0.98
labels_train[remove_idx] = np.nan
not_removed = np.where(~remove_idx)
from mpl_toolkits.axes_grid1 import make_axes_locatable

###############################################################################
# Load and visualize the multiple handwritten digit views
# -------------------------------------------------------
#
# We load a 6-view, 4-class dataset from the Multiple Features Dataset. Each
# of the six views are as follows:
#     1. 76 Fourier coefficients of the character shapes
#     2. 216 profile correlations
#     3. 64 Karhunen-Loeve coefficients
#     4. 240 pixel averages of the images from 2x3 windows
#     5. 47 Zernike moments
#     6. 6 morphological features

# Load the four-class subset (digits 0-3) across all six views
Xs, y = load_UCImultifeature(select_labeled=[0, 1, 2, 3])
view_names = [
    'Fourier\nCoefficients', 'Profile\nCorrelations', 'Karhunen-\nLoeve',
    'Pixel\nAverages', 'Zernike\nMoments', 'Morphological\nFeatures'
]

# Sort samples by class so same-digit rows are contiguous, then keep every
# third sample to thin the figure.
order = np.argsort(y)
sub_samp = np.arange(0, Xs[0].shape[0], step=3)

fig, axes = plt.subplots(1, 6, figsize=(8, 4))
for i, view in enumerate(Xs):
    sorted_view = view[order, :].copy()
    sorted_view = sorted_view[sub_samp, :]

    # Scale features in each view to [0, 1]
    # NOTE(review): this loop is truncated in this chunk -- only the
    # per-feature minimum is computed before the fragment ends.
    minim = np.min(sorted_view, axis=0)
Loading and Viewing the UCI Multiple Features Dataset
=====================================================

"""

from mvlearn.datasets import load_UCImultifeature
from mvlearn.plotting import quick_visualize

###############################################################################
# Load the data and labels
# ^^^^^^^^^^^^^^^^^^^^^^^^
# We can either load the entire dataset (all 10 digits) or select certain
# digits. Then, visualize in 2D.

# Load entire dataset (all 10 digit classes, every view)
full_data, full_labels = load_UCImultifeature()

# Summarize what was loaded before plotting
print("Full Dataset\n")
print("Views = " + str(len(full_data)))
print("First view shape = " + str(full_data[0].shape))
print("Labels shape = " + str(full_labels.shape))

# Visualize the multi-view data in 2D, colored by digit label
quick_visualize(full_data, labels=full_labels, title="10-class data")

###############################################################################
# Load only 2 classes of the data
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Also, shuffle the data and set the seed for reproducibility. Then, visualize
# in 2D.
# --- Example #10 (scraped-snippet separator; "0" was a vote-count artifact) ---
            alpha=0.15)
plt.xlabel("Component 1", fontsize=20)
plt.ylabel("Component 2", fontsize=20)
plt.tight_layout()
ax.set_title('Latent Positions from Omnibus Embedding', fontsize=20)
plt.show()

###############################################################################
# UCI Digits Dataset
# ------------------
#
# Finally, we run Omnibus on the UCI Multiple Features Digits
# Dataset. We use the Fourier coefficient and profile correlation
# views (View 1 and 2 respectively) as a 2-view dataset.

# Load all six views; keep only the first two as a 2-view dataset
full_data, full_labels = load_UCImultifeature()
view_1 = full_data[0]
view_2 = full_data[1]

Xs = [view_1, view_2]

# Fit the Omnibus embedding jointly on both views
embedder = omnibus.Omnibus()
embeddings = embedder.fit_transform(Xs)

###############################################################################
# Visualizing the Results
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# This time, the points in the plot are colored by digit (0-9). The marker
# symbols denote which view each sample is from. We randomly plot 500 samples
# License: MIT

from mvlearn.datasets import load_UCImultifeature
from mvlearn.embed import GCCA
from mvlearn.plotting import crossviews_plot

###############################################################################
# Load Data
# ---------
#
# We load three views from the UCI handwritten digits multi-view data set.
# Specifically, the Profile correlations, Karhunen-Loeve coefficients, and pixel
# averages from 2x3 windows.

# Load the data (views 1-3 of the six available)
Xs, y = load_UCImultifeature(views=[1, 2, 3])

# Inspect the dataset
print(f'There are {len(Xs)} views.')
print(f'There are {Xs[0].shape[0]} observations')
print(f'The feature sizes are: {[X.shape[1] for X in Xs]}')

###############################################################################
# Embed Views
# ^^^^^^^^^^^

# Create a GCCA object and embed all views into a shared latent space
gcca = GCCA()
Xs_latents = gcca.fit_transform(Xs)

print(f'The feature sizes are: {[X.shape[1] for X in Xs_latents]}')
def test_dataloader_badviews1():
    """Requesting an empty list of views must raise ValueError."""
    empty_views = []
    with pytest.raises(ValueError):
        load_UCImultifeature(views=empty_views)
def test_UCImultifeature_dataloader_badselect3():
    """A label list containing a negative value must raise ValueError."""
    labels_with_negative = [0, 2, 4, -2]
    with pytest.raises(ValueError):
        load_UCImultifeature(select_labeled=labels_with_negative)
def test_UCImultifeature_dataloader_badselect2():
    """A label list with duplicates and out-of-range values must raise."""
    overlong = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    with pytest.raises(ValueError):
        load_UCImultifeature(select_labeled=overlong)
def test_UCImultifeature_dataloader_badselect():
    """An empty label selection must raise ValueError."""
    with pytest.raises(ValueError):
        load_UCImultifeature(select_labeled=[])