import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
iris = datasets.load_iris()

X = iris.data[:, 2:]  # petal length and petal width only
y = iris.target

#Entropy is slightly slower to compute than the Gini impurity, so sklearn defaults to criterion="gini"; in most cases the two criteria give the same result
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_depth=2, criterion="entropy")
dt_clf.fit(X, y)
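# As a quick illustration of the entropy-vs-Gini comparison above (a sketch,
# not part of the original source): both impurity measures rank class
# distributions very similarly.
p = np.array([0.1, 0.2, 0.7])       # hypothetical class proportions
print(-np.sum(p * np.log(p)))       # entropy, about 0.802
print(1 - np.sum(p ** 2))           # Gini impurity, about 0.46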

from Utils.PlotDecisionBoundary import plot_decision_boundary
plot_decision_boundary(dt_clf, axis=[0.5, 7.5, 0, 3])
plt.scatter(X[y == 0, 0], X[y == 0, 1], label='y==0')
plt.scatter(X[y == 1, 0], X[y == 1, 1], label='y==1')
plt.scatter(X[y == 2, 0], X[y == 2, 1], label='y==2')
plt.legend()
plt.show()
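
# Utils.PlotDecisionBoundary is not included in this excerpt; a minimal
# sketch consistent with how it is called above (a fitted model with
# .predict and axis=[xmin, xmax, ymin, ymax]) might look like this:
from matplotlib.colors import ListedColormap

def plot_decision_boundary(model, axis):
    # Predict over a dense grid spanning the axis range and shade by class.
    x0, x1 = np.meshgrid(np.linspace(axis[0], axis[1], 200),
                         np.linspace(axis[2], axis[3], 200))
    X_new = np.c_[x0.ravel(), x1.ravel()]
    zz = model.predict(X_new).reshape(x0.shape)
    plt.contourf(x0, x1, zz, cmap=ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9']))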

#Split the data with our hand-written decision tree functions
from DecisionTree.DecisionTreeClassfierFunction import try_split, split, entropy

#First split: scan every feature and candidate threshold
best_entropy1, best_dim1, best_value1 = try_split(X, y)
print("best_entropy1:", best_entropy1)
#best_entropy1: 0.6931471805599453
print("best_dim1:", best_dim1)
#best_dim1: 0
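
# DecisionTree.DecisionTreeClassfierFunction is not included in this excerpt.
# A sketch consistent with the printed result (best_entropy1 equals ln 2,
# which suggests the two children's entropies are summed, not weighted):
from collections import Counter
from math import log

def entropy(y):
    # Empirical entropy of a label array: -sum(p * log(p)).
    counter = Counter(y)
    return sum(-(n / len(y)) * log(n / len(y)) for n in counter.values())

def split(X, y, dim, value):
    # Partition the samples by thresholding feature `dim` at `value`.
    index_l, index_r = (X[:, dim] <= value), (X[:, dim] > value)
    return X[index_l], X[index_r], y[index_l], y[index_r]

def try_split(X, y):
    # Scan every feature and every midpoint between consecutive sorted
    # values; keep the split whose children have the lowest summed entropy.
    best_entropy, best_dim, best_value = float('inf'), -1, -1
    for dim in range(X.shape[1]):
        sorted_index = np.argsort(X[:, dim])
        for i in range(1, len(X)):
            a, b = X[sorted_index[i - 1], dim], X[sorted_index[i], dim]
            if a != b:
                value = (a + b) / 2
                X_l, X_r, y_l, y_r = split(X, y, dim, value)
                e = entropy(y_l) + entropy(y_r)
                if e < best_entropy:
                    best_entropy, best_dim, best_value = e, dim, value
    return best_entropy, best_dim, best_value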
#######################################################################
# NOTE: the block below comes from a separate binary logistic regression
# example. It assumes X and y restricted to two iris classes and the first
# two (sepal) features, plus a fitted hand-written model `logistic_reg`
# exposing interception_ and coefficient_. The decision boundary
# theta0 + theta1*x1 + theta2*x2 = 0, solved for x2, gives the line below.
def x2(x1):
    return (-logistic_reg.interception_ - logistic_reg.coefficient_[0] * x1) / logistic_reg.coefficient_[1]

x1_plot = np.linspace(4, 8, 1000)
x2_plot = x2(x1_plot)


plt.scatter(X[y==0, 0], X[y==0, 1], color='g', label='y==0')
plt.scatter(X[y==1, 0], X[y==1, 1], color='b', label='y==1')
plt.plot(x1_plot, x2_plot, color='r', label='Decision Boundary')
plt.legend()
plt.show()


from Utils.PlotDecisionBoundary import plot_decision_boundary
plot_decision_boundary(logistic_reg, axis=[4, 7.5, 1.5, 4.5])
plt.scatter(X[y==0, 0], X[y==0, 1], color='g', label='y==0')
plt.scatter(X[y==1, 0], X[y==1, 1], color='b', label='y==1')
plt.legend()
plt.show()

# Decision boundary of a KNN classifier; a large n_neighbors (30) yields a smoother boundary
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=30)
X_knn = iris.data[:, :2]  # sepal length and width, matching the plot axes below
knn_clf.fit(X_knn, iris.target)
plot_decision_boundary(knn_clf, axis=[4, 8, 1.5, 4.5])
plt.scatter(X_knn[iris.target == 0, 0], X_knn[iris.target == 0, 1], color='g', label='y==0')
plt.scatter(X_knn[iris.target == 1, 0], X_knn[iris.target == 1, 1], color='b', label='y==1')
plt.scatter(X_knn[iris.target == 2, 0], X_knn[iris.target == 2, 1], color='r', label='y==2')
plt.legend()
plt.show()
Example #3
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets

X, y = datasets.make_moons(noise=0.25, random_state=666)

from sklearn.tree import DecisionTreeClassifier
#Hyperparameter 1: max_depth, the maximum depth of the tree; smaller values make overfitting less likely
dt_clf1 = DecisionTreeClassifier(max_depth=2)
dt_clf1.fit(X, y)

from Utils.PlotDecisionBoundary import plot_decision_boundary
plot_decision_boundary(dt_clf1, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.title("max_depth=2")
plt.show()

#Hyperparameter 2: min_samples_split, a stopping condition: an internal node must contain at least min_samples_split samples to be split further. Larger values make overfitting less likely
dt_clf2 = DecisionTreeClassifier(min_samples_split=10)
dt_clf2.fit(X, y)

from Utils.PlotDecisionBoundary import plot_decision_boundary
plot_decision_boundary(dt_clf2, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.title("min_samples_split=10")
plt.show()

#Hyperparameter 3: min_samples_leaf, another stopping condition: every leaf node must retain at least min_samples_leaf samples. Larger values make overfitting less likely; see the sketch below
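# Completing the pattern of the two examples above (a sketch;
# min_samples_leaf=6 is an arbitrary illustrative value):
dt_clf3 = DecisionTreeClassifier(min_samples_leaf=6)
dt_clf3.fit(X, y)

plot_decision_boundary(dt_clf3, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.title("min_samples_leaf=6")
plt.show()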
Example #4
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
#make_moons generates 100 samples with 2 features by default; both are configurable
X, y = datasets.make_moons(noise=0.15, random_state=666)

#SVM with polynomial features + a linear SVM
from SVM.SVCFunction import PolynomialSVC

poly_svc = PolynomialSVC(degree=3)
poly_svc.fit(X, y)
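
# SVM.SVCFunction is not included in this excerpt. Given its name and usage,
# PolynomialSVC is plausibly a pipeline of polynomial feature expansion,
# scaling, and a linear SVM (an assumption, not the confirmed implementation):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import LinearSVC

def PolynomialSVC(degree, C=1.0):
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),  # expand to polynomial terms
        ("std_scaler", StandardScaler()),             # SVMs are scale-sensitive
        ("linearSVC", LinearSVC(C=C))                 # linear SVM on expanded features
    ])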

from Utils.PlotDecisionBoundary import plot_decision_boundary

plot_decision_boundary(poly_svc, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='g', label='y==0')
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='b', label='y==1')
plt.legend()
plt.show()

#SVM with a polynomial kernel
from SVM.SVCFunction import PolynomialKernalSVC

poly_kernal_svc = PolynomialKernalSVC(degree=3)
poly_kernal_svc.fit(X, y)
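
# Likewise, PolynomialKernalSVC plausibly wraps sklearn's SVC with
# kernel="poly", which applies the polynomial expansion implicitly via the
# kernel trick (again an assumption about the unshown module):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

def PolynomialKernalSVC(degree, C=1.0):
    return Pipeline([
        ("std_scaler", StandardScaler()),
        ("kernelSVC", SVC(kernel="poly", degree=degree, C=C))
    ])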

from Utils.PlotDecisionBoundary import plot_decision_boundary

plot_decision_boundary(poly_kernal_svc, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='g', label='y==0')
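# (Assumed continuation of the plotting block, mirroring the earlier one.)
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='b', label='y==1')
plt.legend()
plt.show()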
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=666,
                                                    test_size=0.2)
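
# PolynomialLogisticRegression is used below without an import in this
# excerpt; a plausible definition (an assumption) is a pipeline of polynomial
# features, scaling, and plain logistic regression:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression

def PolynomialLogisticRegression(degree):
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("log_reg", LogisticRegression())
    ])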

#######################################################################
#Train and score the hand-written polynomial logistic regression
poly_logistic_reg = PolynomialLogisticRegression(degree=20)
poly_logistic_reg.fit(X_train, y_train)
print("poly_logistic_reg.score:", poly_logistic_reg.score(X_test, y_test))
#poly_logistic_reg.score: 0.9

#Plot the decision boundary of the hand-written polynomial logistic regression
from Utils.PlotDecisionBoundary import plot_decision_boundary
plot_decision_boundary(poly_logistic_reg, axis=[-4, 4, -4, 4])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.show()

#####################################################################################
#scikit-learn's logistic regression
from sklearn.linear_model import LogisticRegression
skl_logistic_reg = LogisticRegression()
skl_logistic_reg.fit(X_train, y_train)
print("skl_logistic_reg.score:", skl_logistic_reg.score(X_test, y_test))
#skl_logistic_reg.score: 0.875

#Plot the decision boundary of scikit-learn's logistic regression
from Utils.PlotDecisionBoundary import plot_decision_boundary
plot_decision_boundary(skl_logistic_reg, axis=[-4, 4, -4, 4])
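# (Assumed continuation, completing the plotting block in the same pattern as above.)
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.show()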