def test_UCImultifeature_dataloader_select_views():
    """Selecting a subset of views returns those views with unchanged shapes."""
    chosen = [4, 5, 1]
    all_views, _ = load_UCImultifeature()
    subset, targets = load_UCImultifeature(views=chosen)
    assert len(subset) == len(chosen)
    assert targets.shape[0] == 2000
    # Each returned view must match the shape of the corresponding full view.
    for returned_view, original_idx in zip(subset, chosen):
        assert returned_view.shape == all_views[original_idx].shape
def test_UCImultifeature_dataloader():
    """Default load yields 6 views of 2000 samples and is reproducible."""
    views, targets = load_UCImultifeature()
    assert len(views) == 6
    assert targets.shape[0] == 2000
    # Every view carries one row per sample.
    assert all(view.shape[0] == 2000 for view in views)
    # A second call must return identical data and labels.
    views_again, targets_again = load_UCImultifeature()
    assert np.allclose(views[0], views_again[0])
    assert np.allclose(targets, targets_again)
def __init__(self, train=True): full_data, self.labels = load_UCImultifeature() self.train = train self.view_1, self.view_2, self.view_3, self.view_4, self.view_5, self.view_6 = full_data self.view_1 = MinMaxScaler().fit_transform(self.view_1) self.view_2 = MinMaxScaler().fit_transform(self.view_2) self.view_3 = MinMaxScaler().fit_transform(self.view_3) self.view_4 = MinMaxScaler().fit_transform(self.view_4) self.view_5 = MinMaxScaler().fit_transform(self.view_5) self.view_6 = MinMaxScaler().fit_transform(self.view_6)
def test_UCImultifeature_randomstate_sameordifferent():
    """Equal seeds reproduce the shuffle; other seeds (or no shuffle) differ."""
    seed2_first, seed2_labels = load_UCImultifeature(
        shuffle=True, random_state=2)
    seed5_views, _ = load_UCImultifeature(shuffle=True, random_state=5)
    seed2_second, _ = load_UCImultifeature(shuffle=True, random_state=2)
    unshuffled_views, _ = load_UCImultifeature(shuffle=False)
    assert len(seed2_first) == 6
    assert seed2_labels.shape[0] == 2000
    # Every view carries one row per sample.
    assert all(view.shape[0] == 2000 for view in seed2_first)
    # Same seed -> identical ordering; different seed or no shuffle -> not.
    for a, b, a_again, plain in zip(
            seed2_first, seed5_views, seed2_second, unshuffled_views):
        assert np.allclose(a, a_again)
        assert not np.allclose(a, b)
        assert not np.allclose(a, plain)
        assert not np.allclose(b, plain)
def test_UCImultifeature_dataloader_select_labels():
    """Loading a subset of classes returns exactly those labels.

    Fix: iterate the distinct labels in sorted order.  The original used raw
    ``set`` iteration order, which is a CPython implementation detail and not
    guaranteed to be ascending, before asserting positional equality against
    the (sorted) requested list.
    """
    # load data
    lab = [0, 1, 2]
    data, labels = load_UCImultifeature(select_labeled=lab)
    assert len(data) == 6
    # 200 samples per class -> 600 samples for three classes.
    assert labels.shape[0] == 600
    labels_set = sorted(set(labels))
    assert len(labels_set) == len(lab)
    for requested, found in zip(lab, labels_set):
        assert found == requested
    # check size of data
    for i in range(6):
        assert data[i].shape[0] == 600
# License: MIT from mvlearn.datasets import load_UCImultifeature from mvlearn.cluster import MultiviewSphericalKMeans from sklearn.manifold import TSNE from sklearn.metrics import normalized_mutual_info_score as nmi_score import matplotlib.pyplot as plt import warnings warnings.simplefilter('ignore') # Ignore warnings # Load in UCI digits multiple feature dataset as an example RANDOM_SEED = 5 # Load dataset along with labels for digits 0 through 4 n_class = 5 Xs, labels = load_UCImultifeature(select_labeled=list(range(n_class)), views=[0, 1]) # Creating a function to display data and the results of clustering def display_plots(pre_title, data, labels): # plot the views fig, ax = plt.subplots(1, 2, figsize=(14, 5)) dot_size = 10 ax[0].scatter(data[0][:, 0], data[0][:, 1], c=labels, s=dot_size) ax[0].set_title(pre_title + ' View 1') ax[0].axes.get_xaxis().set_visible(False) ax[0].axes.get_yaxis().set_visible(False) ax[1].scatter(data[1][:, 0], data[1][:, 1], c=labels, s=dot_size) ax[1].set_title(pre_title + ' View 2') ax[1].axes.get_xaxis().set_visible(False)
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from mvlearn.semi_supervised import CTClassifier
from mvlearn.datasets import load_UCImultifeature

###############################################################################
# Load the UCI Multiple Digit Features Dataset
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# To simulate a semi-supervised learning scenario, randomly remove 98% of the
# labels.

data, labels = load_UCImultifeature(select_labeled=[0, 1])

# Use only the first 2 views as an example
View0, View1 = data[0], data[1]

# Split both views into testing and training.  The same test_size and
# random_state are used for both calls so the two views remain row-aligned
# sample-for-sample after the split.
View0_train, View0_test, labels_train, labels_test = train_test_split(
    View0, labels, test_size=0.33, random_state=42)
View1_train, View1_test, labels_train, labels_test = train_test_split(
    View1, labels, test_size=0.33, random_state=42)

# Randomly remove all but 4 of the labels: with seed 6, masking ~98% of the
# training labels as NaN leaves 4 labeled samples (NaN marks "unlabeled"
# for CTClassifier).
np.random.seed(6)
remove_idx = np.random.rand(len(labels_train), ) < 0.98
labels_train[remove_idx] = np.nan
not_removed = np.where(~remove_idx)
from mpl_toolkits.axes_grid1 import make_axes_locatable ############################################################################### # Load and visualize the multiple handwritten digit views # ------------------------------------------------------- # # We load a 6-view, 4-class dataset from the Multiple Features Dataset. Each # of the six views are as follows: # 1. 76 Fourier coefficients of the character shapes # 2. 216 profile correlations # 3. 64 Karhunen-Love coefficients # 4. 240 pixel averages of the images from 2x3 windows # 5. 47 Zernike moments # 6. 6 morphological features Xs, y = load_UCImultifeature(select_labeled=[0, 1, 2, 3]) view_names = [ 'Fourier\nCoefficients', 'Profile\nCorrelations', 'Karhunen-\nLoeve', 'Pixel\nAverages', 'Zernike\nMoments', 'Morphological\nFeatures' ] order = np.argsort(y) sub_samp = np.arange(0, Xs[0].shape[0], step=3) fig, axes = plt.subplots(1, 6, figsize=(8, 4)) for i, view in enumerate(Xs): sorted_view = view[order, :].copy() sorted_view = sorted_view[sub_samp, :] # Scale features in each view to [0, 1] minim = np.min(sorted_view, axis=0)
Loading and Viewing the UCI Multiple Features Dataset
=====================================================
"""

from mvlearn.datasets import load_UCImultifeature
from mvlearn.plotting import quick_visualize

###############################################################################
# Load the data and labels
# ^^^^^^^^^^^^^^^^^^^^^^^^
#
# We can either load the entire dataset (all 10 digits) or select certain
# digits. Then, visualize in 2D.

# Load entire dataset
full_data, full_labels = load_UCImultifeature()

print("Full Dataset\n")
print("Views = " + str(len(full_data)))
print("First view shape = " + str(full_data[0].shape))
print("Labels shape = " + str(full_labels.shape))

# Presumably projects the multi-view data to 2D for plotting, coloring
# points by digit label -- see mvlearn.plotting.quick_visualize docs.
quick_visualize(full_data, labels=full_labels, title="10-class data")

###############################################################################
# Load only 2 classes of the data
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Also, shuffle the data and set the seed for reproducibility. Then, visualize
# in 2D.
    alpha=0.15)
# NOTE(review): the line above closes a plotting call that begins before
# this chunk -- confirm against the full file.
plt.xlabel("Component 1", fontsize=20)
plt.ylabel("Component 2", fontsize=20)
plt.tight_layout()
ax.set_title('Latent Positions from Omnibus Embedding', fontsize=20)
plt.show()

###############################################################################
# UCI Digits Dataset
# ------------------
#
# Finally, we run Omnibus on the UCI Multiple Features Digits
# Dataset. We use the Fourier coefficient and profile correlation
# views (View 1 and 2 respectively) as a 2-view dataset.

full_data, full_labels = load_UCImultifeature()
view_1 = full_data[0]
view_2 = full_data[1]

Xs = [view_1, view_2]

# Running omnibus
embedder = omnibus.Omnibus()
embeddings = embedder.fit_transform(Xs)

###############################################################################
# Visualizing the Results
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# This time, the points in the plot are colored by digit (0-9). The marker
# symbols denote which view each sample is from. We randomly plot 500 samples
# License: MIT from mvlearn.datasets import load_UCImultifeature from mvlearn.embed import GCCA from mvlearn.plotting import crossviews_plot ############################################################################### # Load Data # --------- # # We load three views from the UCI handwritten digits multi-view data set. # Specificallym the Profile correlations, Karhunen-Love coefficients, and pixel # averages from 2x3 windows. # Load the data Xs, y = load_UCImultifeature(views=[1, 2, 3]) # Inspect the dataset print(f'There are {len(Xs)} views.') print(f'There are {Xs[0].shape[0]} observations') print(f'The feature sizes are: {[X.shape[1] for X in Xs]}') ############################################################################### # Embed Views # ^^^^^^^^^^^ # Create GCCA object and embed the gcca = GCCA() Xs_latents = gcca.fit_transform(Xs) print(f'The feature sizes are: {[X.shape[1] for X in Xs_latents]}')
def test_dataloader_badviews1():
    """An empty ``views`` list is rejected with a ValueError."""
    with pytest.raises(ValueError):
        load_UCImultifeature(views=[])
def test_UCImultifeature_dataloader_badselect3():
    """A negative digit in ``select_labeled`` is rejected with a ValueError."""
    with pytest.raises(ValueError):
        load_UCImultifeature(select_labeled=[0, 2, 4, -2])
def test_UCImultifeature_dataloader_badselect2():
    """A ``select_labeled`` list longer than the 10 digit classes raises."""
    too_many = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    with pytest.raises(ValueError):
        load_UCImultifeature(select_labeled=too_many)
def test_UCImultifeature_dataloader_badselect():
    """An empty ``select_labeled`` list is rejected with a ValueError."""
    empty_selection = []
    with pytest.raises(ValueError):
        load_UCImultifeature(select_labeled=empty_selection)