def knnWaveEG():
    """Fit KNeighborsRegressor on the wave dataset and visualize how
    n_neighbors changes the fitted regression curve.

    Prints the test-set predictions and R^2 for a 3-neighbor model, then
    plots the prediction curve for 1, 3, and 8 neighbors alongside the
    training (^) and test (v) points.
    """
    X, y = mglearn.datasets.make_wave(n_samples=40)
    XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=0)

    reg = KNeighborsRegressor(n_neighbors=3)
    reg.fit(XTrain, yTrain)
    print("Test set predictions:\n{}".format(reg.predict(XTest)))
    print("Test set R^2: {:.2f}".format(reg.score(XTest, yTest)))

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    # reshape(-1, 1): -1 lets numpy infer the row count -> a 1000x1 column.
    line = np.linspace(-3, 3, 1000).reshape(-1, 1)
    for nNeighbors, ax in zip([1, 3, 8], axes):
        reg = KNeighborsRegressor(n_neighbors=nNeighbors)
        reg.fit(XTrain, yTrain)
        ax.plot(line, reg.predict(line))
        ax.plot(XTrain, yTrain, '^', c=mglearn.cm2(0), markersize=8)
        ax.plot(XTest, yTest, 'v', c=mglearn.cm2(1), markersize=8)
        # BUG FIX: the two adjacent string literals previously concatenated
        # with no separator, rendering "...0.84test score..."; a space is
        # added between the train score and the "test score" label.
        ax.set_title(("{} neighbor(s)\n train score: {:.2f} "
                      "test score: {:.2f}").format(
                          nNeighbors,
                          reg.score(XTrain, yTrain),
                          reg.score(XTest, yTest)))
        ax.set_xlabel("Feature")
        ax.set_ylabel("Target")
    axes[0].legend(["Model predictions",
                    "Training data/target",
                    "Test data/target"],
                   loc="best")
    plt.show()
def kmeans_failure_3():  # moon
    """Demonstrate that k-means fails on clusters with complex shapes,
    using the two_moons data: the two crescents get split incorrectly."""
    X, y = make_moons(n_samples=200, noise=0.05, random_state=0)

    model = KMeans(n_clusters=2)
    model.fit(X)
    assignments = model.predict(X)

    # Draw the cluster assignments together with the two learned centers.
    plt.scatter(X[:, 0], X[:, 1], c=assignments, s=60)
    centers = model.cluster_centers_
    plt.scatter(
        centers[:, 0],
        centers[:, 1],
        c=[mglearn.cm2(0), mglearn.cm2(1)],
        marker='^', s=100, linewidth=2
    )
    plt.xlabel("Feature 0")
    plt.ylabel("Feature 1")
    plt.show()
def analyze(X_train, X_test, y_train, y_test):
    """Plot KNeighborsRegressor fits for 1, 3, and 9 neighbors.

    Each panel shows the prediction curve over [-3, 3] plus the training
    (^) and test (v) points, with train/test R^2 in the title.
    """
    fig, panels = plt.subplots(1, 3, figsize=(15, 4))
    # 1,000 evenly spaced query points on [-3, 3], as a column vector.
    grid = np.linspace(-3, 3, 1000).reshape(-1, 1)
    for k, panel in zip([1, 3, 9], panels):
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(X_train, y_train)
        panel.plot(grid, model.predict(grid))
        panel.plot(X_train, y_train, '^', c=mglearn.cm2(0), markersize=8)
        panel.plot(X_test, y_test, 'v', c=mglearn.cm2(1), markersize=8)
        score_on_train = model.score(X_train, y_train)
        score_on_test = model.score(X_test, y_test)
        panel.set_title(
            "{} neighbor(s)\n train score: {:.2f} test score: {:.2f}".format(
                k, score_on_train, score_on_test))
        panel.set_xlabel("Feature")
        panel.set_ylabel("Target")
    panels[0].legend(
        ["Model Predictions", "Training data/target", "Test data/target"],
        loc="best")
    plt.show()
def compare_n_neighbors(neighbor_settings=(1, 3, 9)):
    """Plot KNeighborsRegressor fits for several values of n_neighbors.

    Parameters
    ----------
    neighbor_settings : iterable of int, optional
        The n_neighbors values to compare, one subplot per value.
        Defaults to the previously hard-coded (1, 3, 9), so existing
        callers are unaffected.

    NOTE(review): this function reads module-level X_train, X_test,
    y_train, y_test -- confirm they are defined before it is called.
    """
    fig, axes = plt.subplots(1, len(neighbor_settings), figsize=(15, 4))
    # create 1000 data points, evenly spaced between -3 and 3
    line = np.linspace(-3, 3, 1000).reshape(-1, 1)
    for n_neighbors, ax in zip(neighbor_settings, axes):
        reg = KNeighborsRegressor(n_neighbors=n_neighbors)
        reg.fit(X_train, y_train)
        ax.plot(line, reg.predict(line))
        ax.plot(X_train, y_train, '^', c=mglearn.cm2(0), markersize=8)
        ax.plot(X_test, y_test, 'v', c=mglearn.cm2(1), markersize=8)
        ax.set_title(
            '{} neighbor(s)\ntrain score: {:.2f} test score: {:.2f}'.format(
                n_neighbors, reg.score(X_train, y_train),
                reg.score(X_test, y_test)))
        ax.set_xlabel('Feature')
        ax.set_ylabel('Target')
    axes[0].legend(
        ['Model predictions', 'Training data/target', 'Test data/target'],
        loc='best')
    plt.show()
markers='^', markeredgewidth=2) plt.xlabel("특성 0") plt.ylabel("특성 1") # two_moons 데이터를 생성합니다(이번에는 노이즈를 조금만 넣습니다) from sklearn.datasets import make_moons X, y = make_moons(n_samples=200, noise=0.05, random_state=0) # 두 개의 클러스터로 데이터에 KMeans 알고리즘을 적용합니다 kmeans = KMeans(n_clusters=2) kmeans.fit(X) y_pred = kmeans.predict(X) # 클러스터 할당과 클러스터 중심을 표시합니다 plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap=mglearn.cm2, s=60, edgecolors='k') plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='^', c=[mglearn.cm2(0), mglearn.cm2(1)], s=100, linewidth=2, edgecolors='k') plt.xlabel("특성 0") plt.ylabel("특성 1") X_train, X_test, y_train, y_test = train_test_split( X_people, y_people, stratify=y_people, random_state=42) nmf = NMF(n_components=100, random_state=0) nmf.fit(X_train) pca = PCA(n_components=100, random_state=0) pca.fit(X_train) kmeans = KMeans(n_clusters=100, random_state=0) kmeans.fit(X_train) X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test)) X_reconstructed_kmeans = kmeans.cluster_centers_[kmeans.predict(X_test)] X_reconstructed_nmf = np.dot(nmf.transform(X_test), nmf.components_)
random_state=1) scaler = MinMaxScaler() scaler.fit(X_train) X_train_scaled = scaler.transform(X_train) # print(X_train.min(), X_train.max()) # print(X_train_scaled.min(), X_train_scaled.max()) X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2) X_train, X_test = train_test_split(X, random_state=5, test_size=.1) #作图对比 fig, axes = plt.subplots(1, 3, figsize=(13, 4)) axes[0].scatter(X_train[:, 0], X_train[:, 1], c=mglearn.cm2(0), label='Training set', s=60) axes[0].scatter(X_test[:, 0], X_test[:, 1], c=mglearn.cm2(1), marker='^', label='Testing set', s=60) axes[0].legend(loc='upper left') scaler = MinMaxScaler() scaler.fit(X_train) X_train_scaled = scaler.transform(X_train) X_test_scaled = scaler.transform(X_test)
# Per-feature maximum of the scaled training set (expected to be all 1s).
print("max значение признака после масштабирования:\n {}".format(
    X_train_scaled.max(axis=0)))

"In[7]:"  # notebook-cell marker left over from conversion (a no-op string)

# Scale the test set with the *training* min/range so both splits share
# one basis; the test min/max therefore need not land exactly on 0/1.
# NOTE(review): min_on_training / range_on_training are defined in an
# earlier cell not visible here -- confirm.
X_test_scaled = (X_test - min_on_training) / range_on_training
print("min значение признака после масштабирования:\n{}".format(X_test_scaled.min(axis=0)))
print("max значение признака после масштабирования:\n{}".format(X_test_scaled.max(axis=0)))

"In[8]:"  # notebook-cell marker (no-op)

from sklearn.datasets import make_blobs

# Synthetic blobs: 50 points around 5 centers; hold out 10% as a test split.
X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)
X_train, X_test = train_test_split(X, random_state=5, test_size=.1)

fig, axes = plt.subplots(1, 3, figsize=(13, 4))
# Panel 0: the raw, unscaled split.
axes[0].scatter(X_train[:, 0], X_train[:, 1],
                c=mglearn.cm2(0), label="Обучающий набор", s=60)
axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^',
                c=mglearn.cm2(1), label="Тестовый набор", s=60)
axes[0].legend(loc='upper left')
axes[0].set_title("Исходные данные")

# Fit MinMaxScaler on the training split only, then transform both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Panel 1: the correctly scaled data (one shared transform).
axes[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
                c=mglearn.cm2(0), label="Обучающий набор", s=60)
axes[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^',
                c=mglearn.cm2(1), label="Тестовый набор", s=60)
axes[1].set_title("Масштабированные данные")
# Synthetic data: 50 samples drawn around 5 blob centers.
X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)
# BUG FIX: "make_blobs?" is IPython help syntax and a SyntaxError in a
# plain .py file; preserved as a comment instead of live code.
# make_blobs?
X
_
X.shape
# test_size: float between 0 and 1 -> the proportion of the dataset used
# for the test set; int -> the absolute number of test samples.
X_train, X_test = train_test_split(X, random_state=5, test_size=.1)
X_test.shape

# plot the training and test data
fig, axes = plt.subplots(1, 3, figsize=(13, 4))

# subplot 1: the raw, unscaled split
axes[0].scatter(X_train[:, 0], X_train[:, 1], c=mglearn.cm2(0),
                label='training set', s=60)
axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^', c=mglearn.cm2(1),
                label='test set', s=60)
axes[0].legend(loc='upper left')
axes[0].set_title('original data')

# subplot 2: scale both splits with a MinMaxScaler fitted on training only
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
axes[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
                c=mglearn.cm2(0), label='training set', s=60)
axes[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^',
                c=mglearn.cm2(1), label='test set', s=60)
# BUG FIX: title typo "scaled date" -> "scaled data"
axes[1].set_title('scaled data')

# subplot 3
stu_x_sc44['num'] = d1.iloc[:, 0]
# NOTE(review): .drop() without assignment or inplace=True does not modify
# stu_x_sc44 -- as a bare expression this only displayed the result in a
# notebook. Confirm whether the drop was meant to persist.
stu_x_sc44.drop('num', axis=1)

# Visualize as points instead of a line.
df3
df4 = np.array(df3)
df4[:, 0]

# Visualization
# 1) create the figure and subplots (1 x 2)
fig, ax = plt.subplots(1, 2)
import mglearn
ax[0].scatter(df4[:, 2], df4[:, 0], c=mglearn.cm2(0))  # absences
ax[1].scatter(df4[:, 2], df4[:, 1], c=mglearn.cm2(1))  # grades

# Visualization
# 1) create the figure and a single subplot (1 x 1)
fig, ax = plt.subplots(1, 1)
import mglearn
# BUG FIX: plt.subplots(1, 1) returns a single Axes (not an array), so
# ax[0] raised a TypeError; the Axes is used directly.
ax.scatter(df4[:, 2], df4[:, 0], c=mglearn.cm2(0))  # absences
ax.scatter(df4[:, 2], df4[:, 1], c=mglearn.cm2(1))  # grades

# Visualization
# 1) create the figure and subplots (1 x 2)
fig, ax = plt.subplots(1, 2)
import mglearn
# plt.subplots(nrows, ncols, figsize=(w, h))
fig, axis = plt.subplots(5, 1, figsize=(8, 16))

# Also track how the scores evolve as n grows.
training_accuracy = []
test_accuracy = []

# np.reshape(newshape): gives a new shape to an array without changing its
# data. reshape(-1, 1) lets numpy infer the row count -> 1000 rows, 1 col.
line = np.linspace(-3, 3, 1000).reshape(-1, 1)

for n, ax in zip([1, 3, 9, 15, 30], axis):  # ax is the panel to draw on
    reg = KNeighborsRegressor(n_neighbors=n).fit(X_train, y_train)
    score_train = reg.score(X_train, y_train)
    score_test = reg.score(X_test, y_test)
    training_accuracy.append(score_train)
    test_accuracy.append(score_test)
    ax.plot(line, reg.predict(line))  # evaluate across [-3, 3]
    ax.plot(X_train, y_train, '^', c=mg.cm2(0), markersize=2)
    ax.plot(X_test, y_test, 'v', c=mg.cm2(1), markersize=2)
    ax.set_title(
        "n={}, train_score={:.2f}, test_score={:.2f}".format(
            n, score_train, score_test))
    ax.set_xlabel("Feature")
    ax.set_ylabel("Target")
axis[0].legend(["Model predictions", "Training data/target",
                "Test data/target"], loc="best")

# Increasing n smooths the prediction curve and lowers training accuracy.
# Here the test score drops as well (with n_samples=40). The data carries a
# linear bias, so intuitively the score might be expected to rise -- why not?
# The small sample size appears to be the cause.
# test_x_sc1 = m_sc2.transform(test_x)
#
# # - wrong scaling => different bases (train -> train fit, test -> test fit)
# train_x_sc2 = m_sc2.transform(train_x)
# test_x_sc2 = m_sc3.transform(test_x)
# =============================================================================
# 4) Visualization
# 4-1) create the figure and subplots (1 x 3)
fig, ax = plt.subplots(1, 3)

# 4-2) scatter of the original distribution (x1, x2)
import mglearn
plt.rc('font', family='Malgun Gothic')
ax[0].scatter(train_x[:, 0], train_x[:, 1], c=mglearn.cm2(0), label='train')
ax[0].scatter(test_x[:, 0], test_x[:, 1], c=mglearn.cm2(1), label='test')
ax[0].legend()
ax[0].set_title('원본 산점도')
ax[0].set_xlabel('sepal ')
ax[0].set_ylabel('sepal ')

# 4-3) scatter after correct scaling (x1, x2)
ax[1].scatter(train_x_sc1[:, 0], train_x_sc1[:, 1], c=mglearn.cm2(0))
ax[1].scatter(test_x_sc1[:, 0], test_x_sc1[:, 1], c=mglearn.cm2(1))

# 4-4) scatter after incorrect scaling (x1, x2)
ax[2].scatter(train_x_sc2[:, 0], train_x_sc2[:, 1], c=mglearn.cm2(0))
ax[2].scatter(test_x_sc2[:, 0], test_x_sc2[:, 1], c=mglearn.cm2(1))

# => when the train and test sets are separated, scaling each with its own
#    basis distorts their relative geometry
import numpy as np
import matplotlib.pyplot as plt
import mglearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
from sklearn.preprocessing import MinMaxScaler

# Synthetic data: 50 points drawn around 5 blob centers.
X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)

# Hold out 10% of the points as the test split.
X_train, X_test = train_test_split(X, random_state=5, test_size=.1)

# Lift each mglearn color into a 2-D (1, n) array for the `c` argument.
cm20 = np.array(mglearn.cm2(0)).reshape(1, -1)
cm21 = np.array(mglearn.cm2(1)).reshape(1, -1)

# Left panel: the unscaled training and test points.
fig, axes = plt.subplots(1, 3, figsize=(13, 4))
axes[0].scatter(X_train[:, 0], X_train[:, 1], c=cm20,
                label="Training set", s=60)
axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^', c=cm21,
                label="Test set", s=60)
axes[0].legend(loc='upper left')
axes[0].set_title("Original Data")

# Fit MinMaxScaler on the training data only, then apply it to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
from sklearn.datasets import make_moons
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# make the local mglearn package importable
import sys
sys.path.append("../")
import mglearn

# Synthetic two_moons data (with less noise this time).
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)

# Partition the points into two clusters with k-means.
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
y_pred = kmeans.predict(X)

# Draw the cluster assignments together with the two cluster centers.
plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap=mglearn.cm2, s=60)
plt.scatter(kmeans.cluster_centers_[:, 0],
            kmeans.cluster_centers_[:, 1],
            marker='^',
            c=[mglearn.cm2(0), mglearn.cm2(1)],
            s=100,
            linewidth=2)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()
# ----------------------------------------------------------------------------------------------
# Analyzing KNeighborsRegressor:
# ----------------------------------------------------------------------------------------------
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# 1,000 evenly spaced query points on [-3, 3], as a column vector.
line = np.linspace(-3, 3, 1000).reshape(-1, 1)
for n_neighbors, ax in zip([1, 3, 9], axes):
    # Refit with 1, 3, or 9 neighbors.
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train, y_train)
    # Prediction curve for every point in "line".
    curve = reg.predict(line)
    ax.plot(line, curve)
    # Overlay the training (^) and test (v) points.
    ax.plot(X_train, y_train, '^', c=mglearn.cm2(0), markersize=8)
    ax.plot(X_test, y_test, 'v', c=mglearn.cm2(1), markersize=8)
    r2_train = reg.score(X_train, y_train)
    r2_test = reg.score(X_test, y_test)
    ax.set_title(
        "{} neighbor(s)\n train score: {:.2f} test score: {:.2f}".format(
            n_neighbors, r2_train, r2_test))
    ax.set_xlabel("Feature")
    ax.set_ylabel("Target")
axes[0].legend(
    ["Model predictions", "Training data/target", "Test data/target"],
    loc="best")
plt.show()
print("per-feature minimum after scaling:\n{}".format(X_test_scaled.min(axis=0)))
print("per-feature maximum after scaling:\n{}".format(X_test_scaled.max(axis=0)))
########################################################################################################################
########################################################################################################################
import mglearn
from sklearn.datasets import make_blobs
import pylab as plt

# Synthetic blobs, followed by a 90/10 train/test split.
X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)
X_train, X_test = train_test_split(X, random_state=5, test_size=.1)

# Left panel: the raw split.
fig, axes = plt.subplots(1, 3, figsize=(13, 4))
axes[0].scatter(X_train[:, 0], X_train[:, 1], c=mglearn.cm2(0),
                label="Training set", s=60)
axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^', c=mglearn.cm2(1),
                label="Test set", s=60)
axes[0].legend(loc='upper left')
axes[0].set_title("Original Data")

# Middle panel: both splits scaled by a MinMaxScaler fitted on training data.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
axes[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
                c=mglearn.cm2(0), label="Training set", s=60)
axes[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^',
                c=mglearn.cm2(1), label="Test set", s=60)
axes[1].set_title("Scaled Data")

# Next (wrong way, for the third panel): rescale the test set separately
# so the test-set min is 0 and the test-set max is 1.
# 对数据的训练集进行标准化 scaler = MinMaxScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) #单独对测试集进行缩放的错误做法 test_scaler=MinMaxScaler() X_test_scaled_badly = test_scaler.fit_transform(X_test) #先标准化再切分,与先切分再标准化之间有细微差别 print(X_test_scaled[:,0]) print(X_testx[:,0]) #画图 fig=plt.figure(figsize=(13,4)) ax0=fig.add_subplot(1,4,1) ax0.scatter(X_train[:,0],X_train[:,1],c=mglearn.cm2(0),label='Training set',s=60) ax0.scatter(X_test[:,0],X_test[:,1],marker='^',c=mglearn.cm2(1),label='Test set',s=60) ax0.legend(loc='upper left') ax0.set_title('Original Data') ax0.set_xlabel('Feature 0') ax0.set_ylabel('Feature 1') ax1=fig.add_subplot(1,4,2) ax1.scatter(X_train_scaled[:,0],X_train_scaled[:,1],c=mglearn.cm2(0),label='Training set',s=60) ax1.scatter(X_test_scaled[:,0],X_test_scaled[:,1],marker='^',c=mglearn.cm2(1),label='Test set',s=60) ax1.legend(loc='upper left') ax1.set_title('Scaled Data') ax1.set_xlabel('Feature 0') ax1.set_ylabel('Feature 1') ax2=fig.add_subplot(1,4,3)
def main():
    """Contrast correct (train-fitted) scaling with incorrect
    (test-fitted) scaling of synthetic blob data in three panels."""
    X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)
    X_train, X_test = train_test_split(X, random_state=5, test_size=.1)

    fig, panels = plt.subplots(1, 3, figsize=(13, 4))
    train_color = mglearn.cm2(0)
    test_color = mglearn.cm2(1)

    # Panel 1: the untouched data.
    panels[0].scatter(X_train[:, 0], X_train[:, 1], c=train_color,
                      label="Training set", s=60)
    panels[0].scatter(X_test[:, 0], X_test[:, 1], marker='^', c=test_color,
                      label="Test set", s=60)
    panels[0].legend(loc='upper left')
    panels[0].set_title("Original Data")

    # Panel 2: scaler fitted on the training data, applied to both splits.
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    panels[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
                      c=train_color, label="Training set", s=60)
    panels[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^',
                      c=test_color, label="Test set", s=60)
    panels[1].set_title("Scaled Data")

    # Panel 3: the test set rescaled on its OWN min/max, so test min is 0
    # and test max is 1. DO NOT DO THIS! For illustration purposes only.
    test_scaler = MinMaxScaler()
    test_scaler.fit(X_test)
    X_test_scaled_badly = test_scaler.transform(X_test)
    panels[2].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
                      c=train_color, label="training set", s=60)
    panels[2].scatter(X_test_scaled_badly[:, 0], X_test_scaled_badly[:, 1],
                      marker='^', c=test_color, label="test set", s=60)
    panels[2].set_title("Improperly Scaled Data")

    for panel in panels:
        panel.set_xlabel("Feature 0")
        panel.set_ylabel("Feature 1")
    fig.tight_layout()
    plt.show()
# Cluster the data points into three clusters.
kmeans.fit(X)
y_pred = kmeans.predict(X)

# Draw the cluster assignments and the cluster centers.
mglearn.discrete_scatter(X[:, 0], X[:, 1], kmeans.labels_, markers='o')
mglearn.discrete_scatter(
    kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
    [0, 1, 2], markers='^', markeredgewidth=2)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

# Build the synthetic two_moons dataset (with little noise this time).
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)

# Separate the moons into two k-means clusters.
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
y_pred = kmeans.predict(X)

# Plot the assignments plus the two centers.
plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap=mglearn.cm2, s=60,
            edgecolor='k')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            marker='^', c=[mglearn.cm2(0), mglearn.cm2(1)], s=100,
            linewidth=2, edgecolor='k')
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
# Plot cluster assignments and the three cluster centers.
mglearn.discrete_scatter(X[:,0],X[:,1],kmeans.labels_,markers="o")
mglearn.discrete_scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],
                         [0,1,2],markers="^",markeredgewidth=2)
plt.xlabel("feature 0")
plt.ylabel("feature 1")

# two moon data: k-means with two clusters on non-convex shapes
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
kmeans = KMeans(n_clusters=2).fit(X)
y_pred = kmeans.predict(X)
plt.scatter(X[:,0],X[:,1],c=y_pred, cmap=mglearn.cm2, s=60, edgecolors="k")
plt.scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],marker="^",
            c=[mglearn.cm2(0),mglearn.cm2(1)],s=100, linewidth=2, edgecolors="k")
plt.xlabel("feature 0")
plt.ylabel("feature 1")

# Vector quantization (the idea that k-means cluster centers can be viewed
# as a decomposition of the data).
# face data
from sklearn.datasets import fetch_lfw_people
people = fetch_lfw_people(min_faces_per_person=20,resize=0.7)
image_shape = people.images[0].shape

# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
# use the builtin bool dtype instead.
mask = np.zeros(people.target.shape, dtype=bool)
# Per-target frequencies differ, so take a uniform 50 images per person.
for target in np.unique(people.target):
    mask[np.where(people.target == target)[0][:50]] = 1
X_people = people.data[mask]
y_people = people.target[mask]
# Scaling Training and Test Data the Same Way----------------------------------
"""
It is important to apply exactly the same transformation to the training set
and the test set for the supervised model to work on the test set. The
following example illustrates what would happen if we were to use the minimum
and range of the test set instead.
"""
from sklearn.datasets import make_blobs

# Synthetic blobs, then a 90/10 train/test split.
X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)
X_train, X_test = train_test_split(X, random_state=5, test_size=.1)

# First panel: the unscaled split.
fig, axes = plt.subplots(1, 3, figsize=(13, 4))
axes[0].scatter(X_train[:, 0], X_train[:, 1], c=mglearn.cm2(0),
                label="Training set", s=60)
axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^', c=mglearn.cm2(1),
                label="Test set", s=60)
axes[0].legend(loc='upper left')
axes[0].set_title("Original Data")

# Second panel: both splits transformed by a MinMaxScaler fitted on the
# training data only.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
axes[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
                c=mglearn.cm2(0), label="Training set", s=60)
axes[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^',
                c=mglearn.cm2(1), label="Test set", s=60)
axes[1].set_title("Scaled Data")
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()

# In[54]: k-means with two clusters on the two-moons data
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
y_pred = kmeans.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap=mglearn.cm2, s=60,
            edgecolor='k')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], marker='^',
            c=[mglearn.cm2(0), mglearn.cm2(1)], s=100, linewidth=2,
            edgecolor='k')
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

# In[55]
# In[56]
# In[59]: illustrate the agglomerative clustering steps
mglearn.plots.plot_agglomerative_algorithm()
plt.show()

# In[60]
from sklearn.cluster import AgglomerativeClustering
of goodness of a prediction for a regression model, and yields a score between 0 and 1. A value of 1 corresponds to a perfect prediction, and a value of 0 corresponds to a constant model that just predicts the mean of the training set responses, y_train """ print('Test set R^2 : {:.2f}'.format(reg.score(X_test,y_test))) # Analyzing KNeighborsRegressor------------------------------------------------ fig, axes = plt.subplots(1, 3, figsize=(15, 4)) # create 1,000 data points, evenly spaced between -3 and 3 line = np.linspace(-3, 3, 1000).reshape(-1, 1) for n_neighbors, ax in zip([1, 3, 9], axes): # make predictions using 1, 3, or 9 neighbors reg = KNeighborsRegressor(n_neighbors=n_neighbors) reg.fit(X_train, y_train) ax.plot(line, reg.predict(line)) ax.plot(X_train, y_train, '^', c=mglearn.cm2(0), markersize=8) ax.plot(X_test, y_test, 'v', c=mglearn.cm2(1), markersize=8) ax.set_title("{} neighbor(s)\n train score: {:.2f} test score: {:.2f}".format( n_neighbors, reg.score(X_train, y_train), reg.score(X_test, y_test))) ax.set_xlabel("Feature") ax.set_ylabel("Target") axes[0].legend(["Model predictions", "Training data/target", "Test data/target"], loc="best") #------------------------------------------------------------------------------Linear Models # Linear models for regression------------------------------------------------- mglearn.plots.plot_linear_regression_wave() # Linear regression (aka ordinary least squares)------------------------------- from sklearn.linear_model import LinearRegression
# %%
# Fit a 3-neighbor regressor on the training data.
reg = KNeighborsRegressor(n_neighbors=3)
reg.fit(X_train, y_train)

# %%
print('Test set predictions: \n{}'.format(reg.predict(X_test)))

# %%
print('Test set R^2: {:.2f}'.format(reg.score(X_test, y_test)))

# %%
# Compare the prediction curve for 1, 3, and 9 neighbors.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
line = np.linspace(-3, 3, 1000).reshape(-1, 1)
for n_neighbors, ax in zip([1, 3, 9], axes):
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train, y_train)
    ax.plot(line, reg.predict(line))
    ax.plot(X_train, y_train, '^', c=mglearn.cm2(0), markersize=8)
    ax.plot(X_test, y_test, 'v', c=mglearn.cm2(1), markersize=8)
    ax.set_title(
        '{} neighbor(s)\n train score: {:.2f} test score: {:.2f}'.format(
            n_neighbors, reg.score(X_train, y_train),
            reg.score(X_test, y_test)))
    ax.set_xlabel('Feature')
    ax.set_ylabel('Target')
axes[0].legend(['Model predictions', 'Training data/target',
                'Test data/target'], loc='best')

# %%
@author: Yuan-Ray Chang """ import numpy as np import matplotlib.pyplot as plt import pandas as pd import mglearn from sklearn.datasets import make_blobs from sklearn.model_selection import train_test_split X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2) X_train, X_test = train_test_split(X, random_state=5, test_size=0.1) fig, axes = plt.subplots(1, 3, figsize=(13, 4)) axes[0].scatter(X_train[:,0], X_train[:,1], c=mglearn.cm2(0), \ label="Training set", s=60) axes[0].scatter(X_test[:,0], X_test[:,1], marker='^', \ c=mglearn.cm2(1), label="Test set", s=60) axes[0].legend(loc='upper left') axes[0].set_title("Original data") scaler = MinMaxScaler() scaler.fit(X_train) X_train_scaled = scaler.transform(X_train) X_test_scaled = scaler.transform(X_test) axes[1].scatter(X_train_scaled[:,0], X_train_scaled[:,1], c=mglearn.cm2(0), \ label="Training set", s=60) axes[1].scatter(X_test_scaled[:,0], X_test_scaled[:,1], marker='^', \ c=mglearn.cm2(1), label="Test set", s=60)