Example #1
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import mglearn


def load_data():
    # y = f(X)
    X, y = mglearn.datasets.make_forge()
    print("X.shape: {}".format(X.shape))
    # plot the dataset as a scatter plot
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
    plt.xlabel("First feature")
    plt.ylabel("Second feature")
    plt.legend(["Class 0", "Class 1"], loc='lower right')
    plt.show()


X, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    clf = model.fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=False, eps=0.5,
                                    ax=ax, alpha=.7)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{}".format(clf.__class__.__name__))
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
axes[0].legend()

plt.show()
%cd C:\Users\bama6012\Desktop\desk\Python My study\Py Codes-Introduction to Machine Learning Book

data='C:/Users/bama6012/Desktop/desk/Python My study/data/'

# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mglearn

# Some Sample Datasets-----------------------------------------------------------------------------------------
# generate dataset
X, y = mglearn.datasets.make_forge()

# plot dataset
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.legend(["Class 0", "Class 1"], loc=4)
plt.xlabel("First feature")
plt.ylabel("Second feature")
print("X.shape: {}".format(X.shape))

X, y = mglearn.datasets.make_wave(n_samples=40)
plt.plot(X, y, 'o')
plt.ylim(-3, 3)
plt.xlabel("Feature")
plt.ylabel("Target")

from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
print('cancer.keys() : \n{}'.format(cancer.keys()))
print('shape of cancer data : {}'.format(cancer['data'].shape))
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import mglearn

X, y = mglearn.tools.make_handcrafted_dataset()
svm = SVC(kernel='rbf', C=10, gamma=0.1).fit(X, y)
mglearn.plots.plot_2d_separator(svm, X, eps=.5)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
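# highlight the support vectors; the sign of each dual coefficient encodes its class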
sv = svm.support_vectors_
sv_labels = svm.dual_coef_.ravel() > 0
mglearn.discrete_scatter(sv[:, 0],
                         sv[:, 1],
                         sv_labels,
                         s=15,
                         markeredgewidth=3)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.show()
# the scaled cancer data is assumed as input; a minimal setup:
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

cancer = load_breast_cancer()
# PCA is sensitive to feature scales, so standardize first
X_scaled = StandardScaler().fit_transform(cancer.data)

# keep the first two principal components of the data
pca = PCA(n_components=2)

# fit PCA model to breast cancer data
pca.fit(X_scaled)

# transform data onto the first two principal components
X_pca = pca.transform(X_scaled)

print("Original shape: {}".format(str(X_scaled.shape)))
print("Reduced shape: {}".format(str(X_pca.shape)))

# plot first vs. second principal component, colored by class
plt.figure(figsize=(8, 8))
mglearn.discrete_scatter(X_pca[:, 0], X_pca[:, 1], cancer.target)
plt.legend(cancer.target_names, loc="best")
plt.gca().set_aspect("equal")
plt.xlabel("First principal component")
plt.ylabel("Second principal component")

"""The principal components themselves are stored in the
components_ attribute of the PCA object during fitting"""

print("PCA component shape: {}".format(pca.components_.shape))

"""
Each row in components_ corresponds to one principal component, and they are sorted
by their importance (the first principal component comes first, etc.). The columns
correspond to the original features attribute of the PCA in this example, “mean
radius,” “mean texture,” and so on. Let’s have a look at the content of components_"""
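# the promised look at the components; each row is one principal component
print("PCA components:\n{}".format(pca.components_))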
# K-Means for temp anomalies
# -----------------------------------------------------------------------------
from sklearn.cluster import KMeans
# Initialize the algorithm and fit it with the data
kmeans = KMeans(n_clusters=5)
# Var_frommean (the temperature-anomaly series) comes from earlier in the
# original script and is not defined in this excerpt
X = Var_frommean.to_numpy().reshape(-1, 1)
kmeans.fit(X)
print("Cluster memberships:\n{}".format(kmeans.labels_))
# Assign classes to each data point based on the model
classes = kmeans.predict(X)
# Inspect the centroids of the clusters
print(kmeans.cluster_centers_)
kmeans_clusters = kmeans.cluster_centers_
# Shortcut to visualize the data points and the range of each cluster
mglearn.discrete_scatter(X, X, kmeans.labels_, markers='o')
# Volcanic activity is expected to have the maximum impact of all forcings,
# so look for the time points in the cluster with the lowest centroid
dip = np.argwhere(classes == np.argmin(kmeans_clusters))
# look for the years which have the biggest dips
dipinyear = [int(timelist[i][0] / 10000) for i in dip]
len(dipinyear)

# -----------------------------------------------------------------------------
# K-Means for filtered temp anomalies
# -----------------------------------------------------------------------------
# Initialize the algorithm and fit it with the data
kmeans = KMeans(n_clusters=5)
X = Var_frommean.to_numpy().reshape(-1, 1)
# apply the filter
# define window to filter
lowcut = 1 / (365 * 86400 * 10)
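# The filtering code itself is missing from this excerpt. A minimal sketch of
# one plausible version, assuming daily samples and a Butterworth high-pass
# that removes periods longer than ten years (names here are illustrative,
# not from the original script):
from scipy.signal import butter, filtfilt

fs = 1 / 86400                                   # one sample per day, in Hz
b, a = butter(4, lowcut / (fs / 2), btype='highpass')
X_filtered = filtfilt(b, a, X.ravel()).reshape(-1, 1)
kmeans.fit(X_filtered)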
# -----------------------------------------------------------------------------
# k-NN classification on the forge dataset
# -----------------------------------------------------------------------------
# `classifier` and the train/test split are missing from this excerpt; the
# standard book setup:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

X, y = mglearn.datasets.make_forge()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Evaluate
print("Accuracy: {:.2f}".format(classifier.score(X_test, y_test)))

# Decision boundary
fig, axes = plt.subplots(1, 3, figsize=(10, 3))

for n_neighbors, ax in zip([1, 3, 9], axes):
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)

    mglearn.plots.plot_2d_separator(classifier,
                                    X,
                                    fill=True,
                                    eps=0.5,
                                    ax=ax,
                                    alpha=.4)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{} neighbor".format(n_neighbors))
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")

axes[0].legend(loc=3)
plt.show()

# Complexity/Generalization of Model with Cancer dataset
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data,
                                                    cancer.target,
                                                    stratify=cancer.target,
                                                    random_state=66)
Example #8
"""
Created on Thu Mar 15 10:18:39 2018

@author: Yuan-Ray Chang
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn

from sklearn.datasets import make_blobs
X, y = make_blobs(centers=4, random_state=8)
y = y % 2

mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

from sklearn.svm import LinearSVC
linear_svm = LinearSVC().fit(X, y)

mglearn.plots.plot_2d_separator(linear_svm, X)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

X_new = np.hstack([X, X[:, 1:]**2])
from mpl_toolkits.mplot3d import Axes3D, axes3d
figure = plt.figure()
ax = Axes3D(figure, elev=-152, azim=-26)
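# The excerpt ends here; a minimal sketch of how the 3D view could continue,
# plotting the expanded third feature (feature1 ** 2) per class:
mask = y == 0
ax.scatter(X_new[mask, 0], X_new[mask, 1], X_new[mask, 2], c='b', s=60)
ax.scatter(X_new[~mask, 0], X_new[~mask, 1], X_new[~mask, 2],
           c='r', marker='^', s=60)
ax.set_xlabel("feature0")
ax.set_ylabel("feature1")
ax.set_zlabel("feature1 ** 2")
plt.show()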
Example #9
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# import the mglearn module
import sys
sys.path.append("../")
import mglearn

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

fig, axes = plt.subplots(2, 4, figsize=(20, 8))

for i, ax in enumerate(axes.ravel()):
    mlp = MLPClassifier(solver='lbfgs',
                        random_state=i,
                        hidden_layer_sizes=[100, 100])
    mlp.fit(X_train, y_train)
    mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=.3, ax=ax)
    mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=ax)

plt.show()
Example #10
import matplotlib.pyplot as plt
import mglearn
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier

# `forest`, `x`, and `y` are not defined in this excerpt; a minimal setup in
# the spirit of the book's two_moons random-forest figure:
x, y = make_moons(n_samples=100, noise=0.25, random_state=3)
forest = RandomForestClassifier(n_estimators=5, random_state=2).fit(x, y)

fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):
    ax.set_title("tree {}".format(i))
    mglearn.plots.plot_tree_partition(x, y, tree, ax=ax)

mglearn.plots.plot_2d_separator(forest,
                                x,
                                fill=True,
                                ax=axes[-1, -1],
                                alpha=.4)
axes[-1, -1].set_title("Random Forest")
mglearn.discrete_scatter(x[:, 0], x[:, 1], y)


import matplotlib.pyplot as plt
import mglearn
import numpy as np

# assumes `forest` and the feature matrix X / labels y from the surrounding context
forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)
plt.figure()
plt.title("Feature importances")
plt.barh(range(X.shape[1]),
         importances[indices],
         color="r",
         xerr=std[indices],
         align="center")
plt.show()

# (from the course notes) pca.components_ for the iris data;
# rows are components, columns are the four features:
# array([[ 0.37649644, -0.06637905,  0.85134571,  0.35924188],
#        [ 0.6240207 ,  0.75538031, -0.18479376, -0.07648543]])

C1 = 0.37649644 * X1 + -0.06637905 * X2 + 0.85134571 * X3 + 0.35924188 * X4
C2 = 0.6240207 * X1 + 0.75538031 * X2 + -0.18479376 * X3 + -0.07648543 * X4
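# A self-contained check of what C1/C2 above compute: PCA projection is
# centering followed by a dot product with the component vectors. (This sketch
# fits a fresh PCA on the raw iris features; the course code may have scaled
# the data first, so the exact numbers can differ.)
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

iris = load_iris()
pca = PCA(n_components=2).fit(iris.data)
manual = (iris.data - pca.mean_) @ pca.components_.T
print(np.allclose(manual, pca.transform(iris.data)))  # True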

# PCA + knn (ensemble) for the iris data

# 4) apply the derived artificial variables (principal components) to a knn model
m_knn = knn_c(5)    # knn_c: assumed alias for KNeighborsClassifier
m_knn.fit(train_x_pca, train_y)
m_knn.score(test_x_pca, test_y)    # 0.973

# 5) check the distribution of the data points (scatter plot)
import mglearn
mglearn.discrete_scatter(train_x_pca[:, 0], train_x_pca[:, 1], train_y)

# -------------------------------- Exercises -------------------------------- #
# Exercise 7: apply PCA + SVM to the cancer data
# 1) data loading
df1 = pd.read_csv('cancer.csv')
Y = df1.iloc[:, 1]
X = df1.iloc[:, 2:]

train_x, test_x, train_y, test_y = train_test_split(X, Y, random_state=0)

# 2) scaling
from sklearn.preprocessing import MinMaxScaler as minmax
m_sc = minmax()
m_sc.fit(train_x)
train_x_sc = m_sc.transform(train_x)
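# the test set must be transformed with the scaler fitted on the training
# data (never refit on the test set); the name test_x_sc is assumed here
test_x_sc = m_sc.transform(test_x)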
# -----------------------------------------------------------------------------
# Decision function vs. predicted probabilities (gradient boosting)
# -----------------------------------------------------------------------------
# `gbrt`, the data, and `axes` are not defined in this excerpt; the setup below
# follows the book's make_circles example:
from sklearn.datasets import make_circles
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

X, y = make_circles(noise=0.25, factor=0.5, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=42)
gbrt = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))
mglearn.tools.plot_2d_separator(gbrt,
                                X,
                                ax=axes[0],
                                alpha=0.4,
                                fill=True,
                                cm=mglearn.cm2)
scores_image = mglearn.tools.plot_2d_scores(gbrt,
                                            X,
                                            ax=axes[1],
                                            alpha=0.5,
                                            cm=mglearn.ReBl,
                                            function='predict_proba')
for ax in axes:
    mglearn.discrete_scatter(X_test[:, 0],
                             X_test[:, 1],
                             y_test,
                             markers='^',
                             ax=ax)
    mglearn.discrete_scatter(X_train[:, 0],
                             X_train[:, 1],
                             y_train,
                             markers='o',
                             ax=ax)
    ax.set_xlabel('Feature 0')
    ax.set_ylabel('Feature 1')
cbar = plt.colorbar(scores_image, ax=axes.tolist())  # colorbar: darker colors indicate higher confidence
axes[0].legend(
    ['Test class 0', 'Test class 1', 'Train class 0', 'Train class 1'],
    ncol=4,
    loc=(0.1, 1.1))
plt.show()