Esempio n. 1
0
def tsne():
    """Plot t-SNE embeddings of the fruits and breast cancer datasets.

    For each dataset the features are passed through ``StandardScaler``,
    embedded with ``TSNE(random_state=0)`` and drawn with
    ``plot_labelled_scatter``; axis labels and a title are then set through
    pyplot.

    Reads ``fruit_data_with_colors.txt`` relative to the current working
    directory.
    """
    # --- Fruits dataset ---
    fruits = pd.read_table('fruit_data_with_colors.txt')
    feature_names_fruits = ['height', 'width', 'mass', 'color_score']
    target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']
    X_fruits = fruits[feature_names_fruits]
    y_fruits = fruits['fruit_label']
    X_fruits_normalized = StandardScaler().fit_transform(X_fruits)

    # The estimator gets its own name so it does not shadow this function.
    fruit_embedder = TSNE(random_state=0)
    X_tsne = fruit_embedder.fit_transform(X_fruits_normalized)
    plot_labelled_scatter(X_tsne, y_fruits, target_names_fruits)
    plt.xlabel('First t-SNE feature')
    plt.ylabel('Second t-SNE feature')
    plt.title('Fruits dataset t-SNE')

    # --- Breast cancer dataset ---
    # Only the (X, y) pair is needed, so the dataset is loaded once.
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_normalized = StandardScaler().fit_transform(X_cancer)
    cancer_embedder = TSNE(random_state=0)
    X_tsne = cancer_embedder.fit_transform(X_normalized)

    plot_labelled_scatter(X_tsne, y_cancer, ['malignant', 'benign'])
    plt.xlabel('First t-SNE feature')
    plt.ylabel('Second t-SNE feature')
    plt.title('Breast cancer dataset t-SNE')
Esempio n. 2
0
def dbscan():
    """Cluster a 25-sample blob dataset with DBSCAN, print and plot the labels.

    The label array returned by ``fit_predict`` is printed as-is and then
    shifted by one for plotting, so that it lines up with a legend whose
    first entry is 'Noise'.
    """
    points, _ = make_blobs(n_samples=25, random_state=9)
    memberships = DBSCAN(min_samples=2, eps=2).fit_predict(points)
    print("Cluster membership values:\n{}".format(memberships))

    legend = ['Noise', 'Cluster 0', 'Cluster 1', 'Cluster 2']
    plot_labelled_scatter(points, memberships + 1, legend)
Esempio n. 3
0
def mds():
    """Plot two-component MDS embeddings of the fruits and breast cancer datasets.

    For each dataset the features are passed through ``StandardScaler``,
    embedded with ``MDS(n_components=2)`` and drawn with
    ``plot_labelled_scatter``; axis labels and a title are then set through
    pyplot.

    Reads ``fruit_data_with_colors.txt`` relative to the current working
    directory. No ``random_state`` is passed to ``MDS``, matching the
    original behavior.
    """
    # --- Fruits dataset ---
    fruits = pd.read_table('fruit_data_with_colors.txt')
    feature_names_fruits = ['height', 'width', 'mass', 'color_score']
    target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']
    X_fruits = fruits[feature_names_fruits]
    y_fruits = fruits['fruit_label']

    X_fruits_normalized = StandardScaler().fit_transform(X_fruits)
    # The estimator gets its own name so it does not shadow this function.
    fruit_embedder = MDS(n_components=2)
    X_fruits_mds = fruit_embedder.fit_transform(X_fruits_normalized)

    plot_labelled_scatter(X_fruits_mds, y_fruits, target_names_fruits)
    plt.xlabel('First MDS feature')
    plt.ylabel('Second MDS feature')
    plt.title('Fruit sample dataset MDS')

    # --- Breast cancer dataset ---
    # Only the (X, y) pair is needed, so the dataset is loaded once.
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_normalized = StandardScaler().fit_transform(X_cancer)
    cancer_embedder = MDS(n_components=2)
    X_mds = cancer_embedder.fit_transform(X_normalized)

    plot_labelled_scatter(X_mds, y_cancer, ['malignant', 'benign'])
    plt.xlabel('First MDS dimension')
    plt.ylabel('Second MDS dimension')
    plt.title('Breast Cancer Dataset MDS (n_components = 2)')
Esempio n. 4
0
def dendogram():
    """Scatter-plot a 10-sample blob dataset, print it, and draw its Ward dendrogram.

    The samples come from ``make_blobs(random_state=10, n_samples=10)``. The
    dendrogram is drawn on a new figure from the result of ``ward`` applied
    to those samples.
    """
    samples, blob_ids = make_blobs(n_samples=10, random_state=10)
    legend = ['Cluster 1', 'Cluster 2', 'Cluster 3']
    plot_labelled_scatter(samples, blob_ids, legend)
    print(samples)

    plt.figure()
    linkage_matrix = ward(samples)
    dendrogram(linkage_matrix)
    plt.show()
Esempio n. 5
0
def kmeans_clustering():
    """Run k-means on a synthetic blob dataset and on the fruits dataset.

    First fits ``KMeans(n_clusters=3)`` on ``make_blobs(random_state=10)`` and
    plots the resulting labels. Then reads ``fruit_data_with_colors.txt``
    (relative to the current working directory), rescales four feature
    columns with ``MinMaxScaler``, fits ``KMeans(n_clusters=4,
    random_state=0)`` and plots those labels.
    """
    # --- Synthetic blobs ---
    X, _ = make_blobs(random_state=10)
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(X)
    plot_labelled_scatter(X, kmeans.labels_, ['Cluster 1', 'Cluster 2', 'Cluster 3'])

    # --- Fruits dataset ---
    fruits = pd.read_table('fruit_data_with_colors.txt')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and is available in both old and new pandas releases.
    X_fruits = fruits[['mass', 'width', 'height', 'color_score']].values
    X_fruits_normalized = MinMaxScaler().fit(X_fruits).transform(X_fruits)
    kmeans = KMeans(n_clusters=4, random_state=0)
    kmeans.fit(X_fruits_normalized)
    plot_labelled_scatter(X_fruits_normalized, kmeans.labels_,
                          ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4'])
Esempio n. 6
0
def pca():
    """Plot two-component PCA projections of the breast cancer and fruits datasets.

    For the breast cancer data this also draws the fitted ``components_``
    matrix as an image, with one x tick label per feature name and one y
    tick label per component.

    Reads ``fruit_data_with_colors.txt`` relative to the current working
    directory.
    """
    # --- Breast cancer dataset ---
    # One load provides the features, the targets and the feature names.
    cancer = load_breast_cancer()
    X_cancer, y_cancer = cancer.data, cancer.target
    X_normalized = StandardScaler().fit_transform(X_cancer)
    cancer_pca = PCA(n_components=2).fit(X_normalized)
    X_pca = cancer_pca.transform(X_normalized)
    print(X_cancer.shape, X_pca.shape)

    plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign'])
    plt.xlabel('First principal component')
    plt.ylabel('Second principal component')
    plt.title('Breast Cancer Dataset PCA (n_components = 2)')

    # --- Component matrix shown as an image ---
    plt.figure(figsize=(8, 4))
    plt.imshow(cancer_pca.components_, interpolation='none', cmap='plasma')
    feature_names = list(cancer.feature_names)
    ax = plt.gca()
    # Exactly one tick per feature, each shifted half a cell to the left.
    # The original np.arange(-.5, len(feature_names)) produced one tick more
    # than there are labels, which set_xticklabels rejects with ValueError
    # on current Matplotlib releases.
    ax.set_xticks(np.arange(len(feature_names)) - 0.5)
    ax.set_yticks(np.arange(0.5, 2))
    ax.set_xticklabels(feature_names, rotation=90, ha='left', fontsize=12)
    ax.set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12)
    plt.colorbar(orientation='horizontal',
                 ticks=[cancer_pca.components_.min(), 0, cancer_pca.components_.max()],
                 pad=0.65)

    # --- Fruits dataset ---
    fruits = pd.read_table('fruit_data_with_colors.txt')
    feature_names_fruits = ['height', 'width', 'mass', 'color_score']
    target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']
    X_fruits = fruits[feature_names_fruits]
    y_fruits = fruits['fruit_label']

    X_normalized = StandardScaler().fit_transform(X_fruits)
    fruits_pca = PCA(n_components=2).fit(X_normalized)
    X_pca = fruits_pca.transform(X_normalized)
    plot_labelled_scatter(X_pca, y_fruits, target_names_fruits)
    plt.xlabel('First principal component')
    plt.ylabel('Second principal component')
    plt.title('Fruits Dataset PCA (n_components = 2)')
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from adspy_shared_utilities import plot_labelled_scatter
from matplotlib import pyplot as plt

# Agglomerative clustering demo: generate a blob dataset, assign every sample
# to one of three clusters, and plot the assignment.
X, y = make_blobs(random_state=10)

cls = AgglomerativeClustering(n_clusters=3)
cls_assignment = cls.fit_predict(X)

plot_labelled_scatter(X, cls_assignment,
        ['Cluster 1', 'Cluster 2', 'Cluster 3'])

# Creating a dendrogram
# X and y are rebound here to a smaller, 10-sample dataset; the generator's
# own labels (y) are plotted rather than a clustering result.
X, y = make_blobs(random_state=10, n_samples=10)
plot_labelled_scatter(X, y,
        ['Cluster 1', 'Cluster 2', 'Cluster 3'])
print(X)
# Below is the dendrogram corresponding to agglomerative clustering of the
# 10 points above using Ward's method.
# The indices 0..9 in the dendrogram correspond to the row indices of the
# points in the X array printed above.
# For example, point 0 (5.69, -9.47) and point 9 (5.43, -9.76) are the closest two points and are clustered first.
from scipy.cluster.hierarchy import ward, dendrogram
plt.figure()
dendrogram(ward(X))
plt.show()
Esempio n. 8
0
# PCA on the breast cancer dataset: standardize the features, project onto
# two principal components, and plot the projection coloured by class.
from matplotlib import pyplot as plt  # plt is used below but was never imported here
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_breast_cancer
from adspy_shared_utilities import plot_labelled_scatter

cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y=True)
X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)

pca = PCA(n_components=2).fit(X_normalized)
# The original assigned `x_pca` but then read `X_pca`, which raised NameError.
X_pca = pca.transform(X_normalized)
x_pca = X_pca  # lowercase alias kept for any code still using the original name
print(X_cancer.shape, X_pca.shape)
plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign'])

# pyplot has no `xlable` / `ylable`; the functions are `xlabel` / `ylabel`.
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.title('Breast cancer dataset')
Esempio n. 9
0
# NOTE(review): this excerpt uses StandardScaler, PCA, X_cancer, y_cancer,
# cancer, plt and np without defining them — presumably earlier notebook
# cells provide them; confirm before running it standalone.

# Before applying PCA, each feature should be centered (zero mean) and scaled to unit variance
X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)

# Fit a two-component PCA on the standardized features.
pca = PCA(n_components=2).fit(X_normalized)

# Project the standardized data and print the shapes before and after.
X_pca = pca.transform(X_normalized)
print(X_cancer.shape, X_pca.shape)

# #### Plotting the PCA-transformed version of the breast cancer dataset

# In[ ]:

from adspy_shared_utilities import plot_labelled_scatter

plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign'])

plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.title('Breast Cancer Dataset PCA (n_components = 2)')

# #### Plotting the magnitude of each feature value for the first two principal components

# In[ ]:

# Draw the fitted components_ matrix as an image on a new 8x4 figure.
fig = plt.figure(figsize=(8, 4))
plt.imshow(pca.components_, interpolation='none', cmap='plasma')
feature_names = list(cancer.feature_names)

# Ticks sit at half-integer positions starting at -0.5.
# NOTE(review): np.arange(-.5, len(feature_names)) yields
# len(feature_names) + 1 positions — confirm that any set_xticklabels call
# following this excerpt supplies a matching number of labels.
plt.gca().set_xticks(np.arange(-.5, len(feature_names)))
plt.gca().set_yticks(np.arange(0.5, 2))
Esempio n. 10
0
# NOTE(review): `mds`, `X_fruits_normalized`, `X_cancer`, `y_cancer`,
# `StandardScaler`, `MDS`, `plot_labelled_scatter` and `plt` are used below
# without being defined in this excerpt — presumably earlier cells provide
# them; confirm before running it standalone.

# Embed the normalized fruits features with the previously constructed MDS estimator.
X_fruits_mds = mds.fit_transform(X_fruits_normalized)

# Plotting of the fruits MDS embedding is currently disabled:
# plot_labelled_scatter(X_fruits_mds, y_fruits, ['apple', 'mandarin', 'orange', 'lemon'])
# plt.xlabel('First MDS feature')
# plt.ylabel('Second MDS feature')
# plt.title('Fruit sample dataset MDS')

# Multidimensional scaling (MDS) on the breast cancer dataset (compare it to the results from PCA).
# Each feature should be centered (zero mean) and scaled to unit variance.
X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)

# `mds` is rebound to a new two-component estimator with a fixed random_state.
mds = MDS(n_components=2, random_state=0)

X_mds = mds.fit_transform(X_normalized)

plot_labelled_scatter(X_mds, y_cancer, ['malignant', 'benign'])

plt.xlabel('First MDS dimension')
plt.ylabel('Second MDS dimension')
plt.title('Breast Cancer Dataset MDS (n_components = 2)')

# t-SNE on the fruits dataset. This shows how some dimensionality reduction methods may be less successful on some datasets:
# here, it doesn't work as well at finding structure in the small fruits dataset, compared to other methods like MDS.
from sklearn.manifold import TSNE

# The t-SNE fit and plot are currently disabled:
# tsne = TSNE(random_state = 0)
#
# X_tsne = tsne.fit_transform(X_fruits_normalized)

# plot_labelled_scatter(X_tsne, y_fruits,
#     ['apple', 'mandarin', 'orange', 'lemon'])
Esempio n. 11
0
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from adspy_shared_utilities import plot_labelled_scatter

# Create an artificial dataset with make_blobs, then apply k-means to find 3 clusters
X, y = make_blobs(random_state=10)

kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

plot_labelled_scatter(X, kmeans.labels_,
                      ['Cluster 1', 'Cluster 2', 'Cluster 3'])

# Find 4 clusters in the fruits dataset
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

fruits = pd.read_table('fruit_data_with_colors.txt')
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and is available in both old and new pandas releases.
X_fruits = fruits[['mass', 'width', 'height', 'color_score']].values
# Shift the labels down by one; y_fruits is not used by the clustering below.
y_fruits = fruits[['fruit_label']] - 1

# Rescale the features with MinMaxScaler before clustering.
X_fruits_normalized = MinMaxScaler().fit(X_fruits).transform(X_fruits)

kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X_fruits_normalized)

plot_labelled_scatter(X_fruits_normalized, kmeans.labels_,
                      ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4'])
Esempio n. 12
0
# NOTE(review): make_blobs, AgglomerativeClustering, plot_labelled_scatter,
# plt, ward and dendrogram are used below without being imported in this
# excerpt — presumably imported earlier in the file; confirm.

# Generate a blob dataset and assign every sample to one of three clusters.
X, y = make_blobs(random_state = 10)

cls = AgglomerativeClustering(n_clusters = 3)
cls_assignment = cls.fit_predict(X)

plot_labelled_scatter(X, cls_assignment,
        ['Cluster 1', 'Cluster 2', 'Cluster 3'])

# Creating a dendrogram (using scipy)
# X and y are rebound to a smaller, 10-sample dataset; the generator's own
# labels (y) are plotted here rather than a clustering result.
X, y = make_blobs(random_state = 10, n_samples = 10)
plot_labelled_scatter(X, y,
        ['Cluster 1', 'Cluster 2', 'Cluster 3'])
print(X)

# Draw the dendrogram on its own figure.
plt.figure()
dendrogram(ward(X)) # Uses ward's method
plt.show()
# DBSCAN
# NOTE(review): an unmatched ''' delimiter stood here and has been removed.
# With no closing delimiter in view it opened an unterminated string that
# swallowed the code below. Confirm it was not the end of a block that was
# meant to stay disabled.
# NOTE(review): DBSCAN, make_blobs and plot_labelled_scatter are not imported
# in this excerpt — presumably imported earlier in the file; confirm.
X, y = make_blobs(random_state=9, n_samples=25)

dbscan = DBSCAN(eps=2, min_samples=2)

cls = dbscan.fit_predict(X)
print("Cluster membership values:\n{}".format(cls))

# The labels are shifted by one so they line up with a legend whose first
# entry is 'Noise'.
plot_labelled_scatter(X, cls + 1,
                      ['Noise', 'Cluster 0', 'Cluster 1', 'Cluster 2'])

print('\n')
Esempio n. 13
0
def agglomerative_clustering():
    """Cluster a blob dataset into three groups and plot the assignment.

    The data comes from ``make_blobs(random_state=10)``; the labels produced
    by ``AgglomerativeClustering(n_clusters=3).fit_predict`` are passed to
    ``plot_labelled_scatter`` together with three legend names.
    """
    points, _ = make_blobs(random_state=10)
    model = AgglomerativeClustering(n_clusters=3)
    assignments = model.fit_predict(points)

    legend = ['Cluster 1', 'Cluster 2', 'Cluster 3']
    plot_labelled_scatter(points, assignments, legend)