import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

from Utils.PlotDecisionBoundary import plot_decision_boundary

# Load the iris dataset and keep only the feature columns from index 2 onward.
iris = datasets.load_iris()
X = iris.data[:, 2:]
y = iris.target

# Author's note: computing information entropy is slightly slower than
# computing the Gini index, which is why sklearn defaults to criterion="gini";
# most of the time the two criteria make no difference.
dt_clf = DecisionTreeClassifier(criterion="entropy", max_depth=2)
dt_clf.fit(X, y)

# Call the project's plot_decision_boundary helper for the fitted tree, then
# overlay the samples one class at a time so each class gets a legend entry.
plot_decision_boundary(dt_clf, axis=[0.5, 7.5, 0, 3])
for cls, cls_label in ((0, 'y==0'), (1, 'y==1'), (2, 'y==2')):
    members = y == cls
    plt.scatter(X[members, 0], X[members, 1], label=cls_label)
plt.legend()
plt.show()

# Do the split with the hand-written decision-tree functions.
from DecisionTree.DecisionTreeClassfierFunction import try_split, split, entropy

# First pass of the split search.
best_entropy1, best_dim1, best_value1 = try_split(X, y)
print("best_entropy1:", best_entropy1)  # best_entropy1: 0.6931471805599453
print("best_dim1:", best_dim1)  # best_dim1: 0
def x2(x1):
    """Return the second coordinate of the linear decision boundary at x1.

    Solves ``interception_ + coefficient_[0] * x1 + coefficient_[1] * x2 = 0``
    for x2, reading the parameters from the module-level ``logistic_reg``.
    ``x1`` may be a scalar or a NumPy array (it is called with the output of
    ``np.linspace`` below).
    """
    return (- logistic_reg.interception_ - logistic_reg.coefficient_[0] * x1) / logistic_reg.coefficient_[1]


# Evaluate the boundary line on a dense grid of x1 values.
x1_plot = np.linspace(4, 8, 1000)
x2_plot = x2(x1_plot)

# Samples of the two classes together with the analytic boundary line.
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='g', label='y==0')
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='b', label='y==1')
plt.plot(x1_plot, x2_plot, color='r', label='Decision Boundary')
plt.legend()
plt.show()

# Same model, drawn through the project's plot_decision_boundary helper.
from Utils.PlotDecisionBoundary import plot_decision_boundary

plot_decision_boundary(logistic_reg, axis=[4, 7.5, 1.5, 4.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='g', label='y==0')
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='b', label='y==1')
plt.legend()
plt.show()

# Decision boundary of a KNN classifier.
from sklearn.neighbors import KNeighborsClassifier

# The KNN model is fit on the first two columns of the full iris data with all
# of its targets, so the overlay below scatters exactly these arrays. X / y are
# defined outside this section and are only plotted as classes 0 and 1 above;
# scattering them here would not show the data the KNN model was fit on.
knn_X = iris.data[:, :2]
knn_y = iris.target

knn_clf = KNeighborsClassifier(n_neighbors=30)
knn_clf.fit(knn_X, knn_y)

plot_decision_boundary(knn_clf, axis=[4, 8, 1.5, 4.5])
plt.scatter(knn_X[knn_y == 0, 0], knn_X[knn_y == 0, 1], color='g', label='y==0')
plt.scatter(knn_X[knn_y == 1, 0], knn_X[knn_y == 1, 1], color='b', label='y==1')
plt.scatter(knn_X[knn_y == 2, 0], knn_X[knn_y == 2, 1], color='r', label='y==2')
plt.legend()
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

from Utils.PlotDecisionBoundary import plot_decision_boundary

X, y = datasets.make_moons(noise=0.25, random_state=666)


def _show_tree_boundary(clf, title):
    """Plot clf via plot_decision_boundary, overlay both classes of the
    module-level X / y, set the figure title and show the figure."""
    plot_decision_boundary(clf, axis=[-1.5, 2.5, -1.0, 1.5])
    for cls in (0, 1):
        members = y == cls
        plt.scatter(X[members, 0], X[members, 1])
    plt.title(title)
    plt.show()


# Hyperparameter 1: max_depth, the depth of the decision tree.
# The smaller it is, the less prone the tree is to overfitting.
dt_clf1 = DecisionTreeClassifier(max_depth=2)
dt_clf1.fit(X, y)
_show_tree_boundary(dt_clf1, "max_depth=2")

# Hyperparameter 2: min_samples_split, a stopping condition for splitting.
# A node holding fewer than min_samples_split samples is not split further.
# The larger it is, the less prone the tree is to overfitting.
dt_clf2 = DecisionTreeClassifier(min_samples_split=10)
dt_clf2.fit(X, y)
_show_tree_boundary(dt_clf2, "min_samples_split=10")

# Hyperparameter 3: min_samples_leaf, a stopping condition for splitting.
# Every leaf node must keep at least min_samples_leaf samples.
# The larger it is, the less prone the tree is to overfitting.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

from SVM.SVCFunction import PolynomialSVC
from Utils.PlotDecisionBoundary import plot_decision_boundary

# Defaults to 100 samples with 2 features; both are configurable.
X, y = datasets.make_moons(noise=0.15, random_state=666)

# Polynomial features + linear SVM.
poly_svc = PolynomialSVC(degree=3)
poly_svc.fit(X, y)

plot_decision_boundary(poly_svc, axis=[-1.5, 2.5, -1.0, 1.5])
for cls, colour, cls_label in ((0, 'g', 'y==0'), (1, 'b', 'y==1')):
    members = y == cls
    plt.scatter(X[members, 0], X[members, 1], color=colour, label=cls_label)
plt.legend()
plt.show()

# SVM using a polynomial kernel function.
from SVM.SVCFunction import PolynomialKernalSVC

poly_kernal_svc = PolynomialKernalSVC(degree=3)
poly_kernal_svc.fit(X, y)

plot_decision_boundary(poly_kernal_svc, axis=[-1.5, 2.5, -1.0, 1.5])
class0 = y == 0
plt.scatter(X[class0, 0], X[class0, 1], color='g', label='y==0')
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set, with a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

#######################################################################
# Train and score the hand-written polynomial logistic regression.
poly_logistic_reg = PolynomialLogisticRegression(degree=20)
poly_logistic_reg.fit(X_train, y_train)
poly_score = poly_logistic_reg.score(X_test, y_test)
print("poly_logistic_reg.score:", poly_score)  # poly_logistic_reg.score: 0.9

# Draw the decision boundary of the hand-written polynomial logistic regression.
from Utils.PlotDecisionBoundary import plot_decision_boundary

plot_decision_boundary(poly_logistic_reg, axis=[-4, 4, -4, 4])
for cls in (0, 1):
    members = y == cls
    plt.scatter(X[members, 0], X[members, 1])
plt.show()

#####################################################################################
# scikit-learn logistic regression.
from sklearn.linear_model import LogisticRegression

skl_logistic_reg = LogisticRegression()
skl_logistic_reg.fit(X_train, y_train)
skl_score = skl_logistic_reg.score(X_test, y_test)
print("skl_logistic_reg.score:", skl_score)  # skl_logistic_reg.score: 0.875

# Draw the decision boundary of the scikit-learn logistic regression.
plot_decision_boundary(skl_logistic_reg, axis=[-4, 4, -4, 4])