def tsne():
    """Plot 2-D t-SNE embeddings of the fruits and breast cancer datasets.

    For each dataset the features are standardized with ``StandardScaler``,
    embedded with ``TSNE(random_state=0)`` and drawn with
    ``plot_labelled_scatter``.  The fruits table is read from the relative
    path ``fruit_data_with_colors.txt``.  Nothing is returned.
    """
    # --- Fruits dataset ---
    fruits = pd.read_table('fruit_data_with_colors.txt')
    feature_names_fruits = ['height', 'width', 'mass', 'color_score']
    target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']
    X_fruits = fruits[feature_names_fruits]
    y_fruits = fruits['fruit_label']

    X_fruits_normalized = StandardScaler().fit(X_fruits).transform(X_fruits)
    # Called `embedder` (not `tsne`) so the local does not shadow this function.
    embedder = TSNE(random_state=0)
    X_tsne = embedder.fit_transform(X_fruits_normalized)

    plot_labelled_scatter(X_tsne, y_fruits, target_names_fruits)
    plt.xlabel('First t-SNE feature')
    plt.ylabel('Second t-SNE feature')
    plt.title('Fruits dataset t-SNE')

    # --- Breast cancer dataset ---
    # A single load is enough: only the (X, y) pair is used below.
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)
    embedder = TSNE(random_state=0)
    X_tsne = embedder.fit_transform(X_normalized)

    plot_labelled_scatter(X_tsne, y_cancer, ['malignant', 'benign'])
    plt.xlabel('First t-SNE feature')
    plt.ylabel('Second t-SNE feature')
    plt.title('Breast cancer dataset t-SNE')
def dbscan():
    """Cluster 25 synthetic blob points with DBSCAN, print and plot the labels.

    The points come from ``make_blobs(random_state=9, n_samples=25)`` and are
    clustered with ``DBSCAN(eps=2, min_samples=2)``.  Nothing is returned.
    """
    points, _ = make_blobs(n_samples=25, random_state=9)

    clusterer = DBSCAN(min_samples=2, eps=2)
    membership = clusterer.fit_predict(points)
    print("Cluster membership values:\n{}".format(membership))

    # Labels are shifted up by one so they line up with the legend, whose
    # first entry is 'Noise' (scikit-learn's DBSCAN reports noise as -1).
    shifted_labels = membership + 1
    legend_names = ['Noise', 'Cluster 0', 'Cluster 1', 'Cluster 2']
    plot_labelled_scatter(points, shifted_labels, legend_names)
def mds():
    """Plot 2-D MDS embeddings of the fruits and breast cancer datasets.

    For each dataset the features are standardized with ``StandardScaler``,
    embedded with ``MDS(n_components=2, random_state=0)`` and drawn with
    ``plot_labelled_scatter``.  The fruits table is read from the relative
    path ``fruit_data_with_colors.txt``.  Nothing is returned.
    """
    # --- Fruits dataset ---
    fruits = pd.read_table('fruit_data_with_colors.txt')
    feature_names_fruits = ['height', 'width', 'mass', 'color_score']
    target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']
    X_fruits = fruits[feature_names_fruits]
    y_fruits = fruits['fruit_label']

    X_fruits_normalized = StandardScaler().fit(X_fruits).transform(X_fruits)
    # random_state is fixed so repeated runs draw the same embedding, as the
    # other dimensionality-reduction demos in this file already do.
    # Called `embedder` (not `mds`) so the local does not shadow this function.
    embedder = MDS(n_components=2, random_state=0)
    X_fruits_mds = embedder.fit_transform(X_fruits_normalized)

    plot_labelled_scatter(X_fruits_mds, y_fruits, target_names_fruits)
    plt.xlabel('First MDS feature')
    plt.ylabel('Second MDS feature')
    plt.title('Fruit sample dataset MDS')

    # --- Breast cancer dataset ---
    # A single load is enough: only the (X, y) pair is used below.
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)
    embedder = MDS(n_components=2, random_state=0)
    X_mds = embedder.fit_transform(X_normalized)

    plot_labelled_scatter(X_mds, y_cancer, ['malignant', 'benign'])
    plt.xlabel('First MDS dimension')
    plt.ylabel('Second MDS dimension')
    plt.title('Breast Cancer Dataset MDS (n_components = 2)')
def dendogram():
    """Scatter-plot 10 synthetic blob points, print them, and show a dendrogram.

    The points come from ``make_blobs(random_state=10, n_samples=10)``; the
    dendrogram is built from ``ward`` linkage over those points.  Nothing is
    returned.
    """
    points, blob_ids = make_blobs(n_samples=10, random_state=10)
    plot_labelled_scatter(points, blob_ids, ['Cluster 1', 'Cluster 2', 'Cluster 3'])
    print(points)

    # Draw the hierarchy on a fresh figure so it does not overwrite the scatter.
    plt.figure()
    linkage_matrix = ward(points)
    dendrogram(linkage_matrix)
    plt.show()
def kmeans_clustering():
    """Run k-means on synthetic blobs (k=3) and on the fruits dataset (k=4).

    Both results are drawn with ``plot_labelled_scatter`` using the fitted
    ``labels_``.  The fruits table is read from the relative path
    ``fruit_data_with_colors.txt`` and min-max scaled before clustering.
    Nothing is returned.
    """
    # --- Synthetic blobs, 3 clusters ---
    X, _ = make_blobs(random_state=10)
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(X)
    plot_labelled_scatter(X, kmeans.labels_,
                          ['Cluster 1', 'Cluster 2', 'Cluster 3'])

    # --- Fruits dataset, 4 clusters ---
    fruits = pd.read_table('fruit_data_with_colors.txt')
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
    # `.values` is available in both old and current pandas releases.
    X_fruits = fruits[['mass', 'width', 'height', 'color_score']].values
    X_fruits_normalized = MinMaxScaler().fit(X_fruits).transform(X_fruits)

    kmeans = KMeans(n_clusters=4, random_state=0)
    kmeans.fit(X_fruits_normalized)
    plot_labelled_scatter(X_fruits_normalized, kmeans.labels_,
                          ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4'])
def pca():
    """Plot 2-component PCA projections of the breast cancer and fruits datasets.

    For the breast cancer data this also draws a heat map of
    ``components_`` (one row per principal component, one column per
    feature).  Each dataset is standardized with ``StandardScaler`` before
    PCA.  The fruits table is read from the relative path
    ``fruit_data_with_colors.txt``.  Nothing is returned.
    """
    # --- Breast cancer dataset ---
    # Load once; the Bunch gives the feature names as well as data/target.
    cancer = load_breast_cancer()
    X_cancer, y_cancer = cancer.data, cancer.target

    X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)
    # Called `pca_model` (not `pca`) so the local does not shadow this function.
    pca_model = PCA(n_components=2).fit(X_normalized)
    X_pca = pca_model.transform(X_normalized)
    print(X_cancer.shape, X_pca.shape)

    plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign'])
    plt.xlabel('First principal component')
    plt.ylabel('Second principal component')
    plt.title('Breast Cancer Dataset PCA (n_components = 2)')

    # --- Heat map of the component loadings ---
    plt.figure(figsize=(8, 4))
    plt.imshow(pca_model.components_, interpolation='none', cmap='plasma')
    feature_names = list(cancer.feature_names)
    ax = plt.gca()
    # One tick per feature, on the left edge of each cell (labels use
    # ha='left').  The tick count must equal the label count: recent
    # matplotlib raises ValueError otherwise, and the previous
    # np.arange(-.5, n) produced n + 1 ticks for n labels.
    ax.set_xticks(np.arange(len(feature_names)) - 0.5)
    ax.set_yticks(np.arange(0.5, 2))
    ax.set_xticklabels(feature_names, rotation=90, ha='left', fontsize=12)
    ax.set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12)
    plt.colorbar(orientation='horizontal',
                 ticks=[pca_model.components_.min(), 0,
                        pca_model.components_.max()],
                 pad=0.65)

    # --- Fruits dataset ---
    fruits = pd.read_table('fruit_data_with_colors.txt')
    feature_names_fruits = ['height', 'width', 'mass', 'color_score']
    X_fruits = fruits[feature_names_fruits]
    y_fruits = fruits['fruit_label']

    X_normalized = StandardScaler().fit(X_fruits).transform(X_fruits)
    pca_model = PCA(n_components=2).fit(X_normalized)
    X_pca = pca_model.transform(X_normalized)

    plot_labelled_scatter(X_pca, y_fruits,
                          ['apple', 'mandarin', 'orange', 'lemon'])
    plt.xlabel('First principal component')
    plt.ylabel('Second principal component')
    plt.title('Fruits Dataset PCA (n_components = 2)')
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs

from adspy_shared_utilities import plot_labelled_scatter

# Agglomerative clustering of the default make_blobs sample into 3 clusters.
X, y = make_blobs(random_state=10)
cls = AgglomerativeClustering(n_clusters=3)
cls_assignment = cls.fit_predict(X)
plot_labelled_scatter(X, cls_assignment,
                      ['Cluster 1', 'Cluster 2', 'Cluster 3'])

# Dendrogram demo on a much smaller sample (10 points) so the tree is readable.
X, y = make_blobs(random_state=10, n_samples=10)
plot_labelled_scatter(X, y, ['Cluster 1', 'Cluster 2', 'Cluster 3'])
print(X)

# Dendrogram for agglomerative clustering of the 10 points above using
# Ward's method.  Leaf indices 0..9 are the row indices of X as printed above.
# For example, point 0 (5.69, -9.47) and point 9 (5.43, -9.76) are the two
# closest points and are merged first.
plt.figure()
linkage_matrix = ward(X)
dendrogram(linkage_matrix)
plt.show()
# PCA on the breast cancer dataset: standardize, project onto the first two
# principal components, and plot the projection coloured by class.
from matplotlib import pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from adspy_shared_utilities import plot_labelled_scatter

# `cancer` (the full Bunch) is kept as a module-level name alongside the
# (X, y) pair.
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y=True)

# Standardize each feature before PCA.
X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)

pca = PCA(n_components=2).fit(X_normalized)
# The projection must be bound to `X_pca`: that is the name the plot call
# below reads (it was previously assigned to `x_pca`, giving a NameError).
X_pca = pca.transform(X_normalized)
print(X_cancer.shape, X_pca.shape)

plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign'])
# pyplot's functions are xlabel/ylabel; `xlable`/`ylable` do not exist.
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.title('Breast cancer dataset')
from adspy_shared_utilities import plot_labelled_scatter

# PCA expects centred, unit-variance features, so standardize first.
# NOTE: X_cancer, y_cancer, cancer, plt and np are bound outside this excerpt.
scaler = StandardScaler().fit(X_cancer)
X_normalized = scaler.transform(X_cancer)

pca = PCA(n_components=2).fit(X_normalized)
X_pca = pca.transform(X_normalized)
print(X_cancer.shape, X_pca.shape)

# Scatter plot of the PCA-transformed breast cancer dataset.
plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign'])
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.title('Breast Cancer Dataset PCA (n_components = 2)')

# Heat map of each feature's weight in the first two principal components.
fig = plt.figure(figsize=(8, 4))
plt.imshow(pca.components_, interpolation='none', cmap='plasma')
feature_names = list(cancer.feature_names)

heatmap_axes = plt.gca()
# NOTE(review): this arange yields len(feature_names) + 1 tick positions; if
# tick labels are later set from feature_names the counts will differ, which
# recent matplotlib rejects -- TODO confirm against the code that follows.
heatmap_axes.set_xticks(np.arange(-.5, len(feature_names)))
heatmap_axes.set_yticks(np.arange(0.5, 2))
X_fruits_mds = mds.fit_transform(X_fruits_normalized) # plot_labelled_scatter(X_fruits_mds, y_fruits, ['apple', 'mandarin', 'orange', 'lemon']) # plt.xlabel('First MDS feature') # plt.ylabel('Second MDS feature') # plt.title('Fruit sample dataset MDS') # Multidimensional scaling (MDS) on the breast cancer dataset (compare it to the results from PCA) # each feature should be centered (zero mean) and with unit variance X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer) mds = MDS(n_components=2, random_state=0) X_mds = mds.fit_transform(X_normalized) plot_labelled_scatter(X_mds, y_cancer, ['malignant', 'benign']) plt.xlabel('First MDS dimension') plt.ylabel('Second MDS dimension') plt.title('Breast Cancer Dataset MDS (n_components = 2)') # t-SNE on the fruit dataset (you can see how some dimensionality reduction methods may be less successful on some datasets. # Here, it doesn't work as well at finding structure in the small fruits dataset, compared to other methods like MDS) from sklearn.manifold import TSNE # tsne = TSNE(random_state = 0) # # X_tsne = tsne.fit_transform(X_fruits_normalized) # plot_labelled_scatter(X_tsne, y_fruits, # ['apple', 'mandarin', 'orange', 'lemon'])
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.preprocessing import MinMaxScaler

from adspy_shared_utilities import plot_labelled_scatter

# Artificial dataset from make_blobs, then k-means to find 3 clusters.
X, y = make_blobs(random_state=10)
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
plot_labelled_scatter(X, kmeans.labels_,
                      ['Cluster 1', 'Cluster 2', 'Cluster 3'])

# Find 4 clusters in the fruits dataset.
fruits = pd.read_table('fruit_data_with_colors.txt')
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# `.values` is available in both old and current pandas releases.
X_fruits = fruits[['mass', 'width', 'height', 'color_score']].values
# Subtract 1 from every fruit_label value; kept as a one-column DataFrame.
y_fruits = fruits[['fruit_label']] - 1

# Min-max scale the features before clustering.
X_fruits_normalized = MinMaxScaler().fit(X_fruits).transform(X_fruits)

kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X_fruits_normalized)
plot_labelled_scatter(X_fruits_normalized, kmeans.labels_,
                      ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4'])
X, y = make_blobs(random_state = 10)
cls = AgglomerativeClustering(n_clusters = 3)
cls_assignment = cls.fit_predict(X)
plot_labelled_scatter(X, cls_assignment, ['Cluster 1', 'Cluster 2', 'Cluster 3'])
# Creating a dendrogram (using scipy)
X, y = make_blobs(random_state = 10, n_samples = 10)
plot_labelled_scatter(X, y, ['Cluster 1', 'Cluster 2', 'Cluster 3'])
print(X)
plt.figure()
dendrogram(ward(X)) # Uses ward's method
plt.show()
'''
# NOTE(review): the triple-quote delimiter above has no partner inside this
# excerpt; presumably it closes a string opened earlier that disables the
# agglomerative section -- confirm before editing either side of it.

# DBSCAN on 25 synthetic blob points.
X, y = make_blobs(random_state=9, n_samples=25)
dbscan = DBSCAN(eps=2, min_samples=2)
cls = dbscan.fit_predict(X)
print("Cluster membership values:\n{}".format(cls))
# Labels are shifted up by one so they line up with the legend, whose first
# entry is 'Noise'.
plot_labelled_scatter(X, cls + 1, ['Noise', 'Cluster 0', 'Cluster 1', 'Cluster 2'])
print('\n')
def agglomerative_clustering():
    """Cluster the default make_blobs sample into 3 groups and plot them.

    Uses ``AgglomerativeClustering(n_clusters=3)`` on the points from
    ``make_blobs(random_state=10)`` and draws the assignment with
    ``plot_labelled_scatter``.  Nothing is returned.
    """
    points, _ = make_blobs(random_state=10)

    clusterer = AgglomerativeClustering(n_clusters=3)
    assignment = clusterer.fit_predict(points)

    legend_names = ['Cluster 1', 'Cluster 2', 'Cluster 3']
    plot_labelled_scatter(points, assignment, legend_names)