Example #1
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# X_train, y_train, X_test, y_test are prepared earlier in the source file (not shown in this excerpt)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print(clf.predict(X_test))                    # class predictions
print(clf.predict_proba(np.array([[5, 3]])))  # per-class probabilities for a single sample
print(clf.score(X_test, y_test))              # mean accuracy on the test set

# Parameter tuning
# n_estimators = 10   number of weak learners (trees); the default was 10 in older scikit-learn versions
# bootstrap = True    whether to sample with replacement; default True
# oob_score = False   whether to evaluate on out-of-bag samples, i.e. rows a tree never drew (see the sketch below)
# criterion = 'gini'  split criterion of the CART trees: Gini impurity ('gini') or information gain ('entropy')
# plus the other usual decision-tree parameters
clf = RandomForestClassifier(n_estimators=10, bootstrap=True, max_depth=8)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
classfy_plt_3d(clf, X_test, y_test)
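# The oob_score option mentioned in the parameter notes above gives a free validation estimate:
# with bootstrap=True each tree skips roughly a third of the rows, and those out-of-bag rows can
# score the forest without a separate hold-out set. A minimal sketch; the toy dataset is
# hypothetical, not from the original file:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 2)                               # hypothetical 2-feature data
y_demo = (X_demo[:, 0] + X_demo[:, 1] > 1).astype(int)  # hypothetical labels

clf_oob = RandomForestClassifier(n_estimators=50, bootstrap=True, oob_score=True)
clf_oob.fit(X_demo, y_demo)
print(clf_oob.oob_score_)  # accuracy estimated on the out-of-bag samples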
''' Searching for the best parameters (parallel tuning) '''
# from sklearn.grid_search import GridSearchCV  # old import path, removed in scikit-learn >= 0.20
from sklearn.model_selection import GridSearchCV
param_test = {'n_estimators': range(5, 50, 5), 'max_depth': range(5, 30, 5)}
gsearch = GridSearchCV(estimator=RandomForestClassifier(bootstrap=True),
                       param_grid=param_test,
                       cv=5)
gsearch.fit(X_train, y_train)
print('Best parameters:', gsearch.best_params_, gsearch.best_score_)
''' Random forest regression '''

import Tdata  # the repo's own sample-data module
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
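# The excerpt breaks off right after these imports; a minimal sketch of how the regression part
# plausibly continues. Tdata is the repo's own data module, so synthetic data stands in here, and
# every name below apart from RandomForestRegressor is an assumption:
import numpy as np

rng = np.random.RandomState(0)
x_r = np.sort(5 * rng.rand(80, 1), axis=0)       # hypothetical 1-D inputs
y_r = np.sin(x_r).ravel() + 0.1 * rng.randn(80)  # noisy targets

reg = RandomForestRegressor(n_estimators=50)
reg.fit(x_r, y_r)

plt.scatter(x_r, y_r, s=10, label='data')
plt.plot(x_r, reg.predict(x_r), c='red', label='forest fit')
plt.legend()
plt.show()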
Example #2
# class_weight -> dict, list of dicts, 'balanced', or None; optional (default=None); per-class weights as {class_label: weight}
# max_leaf_nodes: maximum number of leaf nodes; capping it helps prevent overfitting. The default is None, i.e. unlimited.
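# A minimal sketch of the two parameters just described, on a hypothetical imbalanced toy set
# (the data and the weight values are illustrative only):
import numpy as np
from sklearn import tree

rng = np.random.RandomState(0)
X_demo = rng.rand(100, 2)
y_demo = (rng.rand(100) < 0.2).astype(int)  # roughly 80/20 class imbalance

demo_tree = tree.DecisionTreeClassifier(class_weight={0: 1, 1: 4},  # upweight the rare class
                                        max_leaf_nodes=8)           # cap the tree size
demo_tree.fit(X_demo, y_demo)
print(demo_tree.get_n_leaves())  # never exceeds 8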

from sklearn import tree, datasets

# X, y are defined earlier in the source file (not shown in this excerpt)
tree_model = tree.DecisionTreeClassifier(criterion='entropy')
tree_model = tree_model.fit(X, y)
result_proba = tree_model.predict_proba([[3, 1]], check_input=True)
print('Class probabilities:', result_proba.tolist())
print('Predicted class:', tree_model.predict([[3, 2]]))

# Visualize the classification in a 3D plot
iris = datasets.load_iris()  # the bundled iris dataset
X = iris.data[:, [0, 2]]
y = iris.target
clf = tree.DecisionTreeClassifier(max_depth=4)  # limit the maximum tree depth to 4
clf.fit(X, y)  # fit the model
classfy_plt_3d(clf, X, y)

# Decision-tree regression
# fit(features, targets)
train, test = sin_data()  # sin_data is the repo's own sample-data helper
x_train, y_train = train[:, :2], train[:, 2]  # the first two columns are x1, x2; the third is y, with random noise
x_test, y_test = test[:, :2], test[:, 2]      # same layout, but this y is noise-free

# Format of train:
# [[  0.         -10.           2.69876376]
#  [  0.1002004   -9.95991984   2.36347624]
#  ...,
#  [ 50.          10.           7.29325787]]
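# The excerpt stops before the actual regression fit; a minimal sketch of the step that plausibly
# follows, reusing the arrays prepared above (the depth value is an assumption):
from sklearn.tree import DecisionTreeRegressor

reg = DecisionTreeRegressor(max_depth=5)  # depth chosen for illustration
reg.fit(x_train, y_train)
print('R^2 on the noise-free test targets:', reg.score(x_test, y_test))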


# method dispatch function
Example #3
# estimator and data are defined earlier in the source file (not shown)
print(estimator.fit_predict(data))  # fit and return the cluster labels in one call


# Visualize the clusters: 2 features + cluster label
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

data = np.random.rand(100, 2)
estimator = KMeans(n_clusters=3)
estimator.fit(data)
plt.figure(figsize=(8, 10))  # figure size
colors = ['blue', 'yellow', 'red']
markers = ['o', 's', 'D']  # marker shapes
for i, l in enumerate(estimator.labels_):
    plt.plot(data[i][0], data[i][1], color=colors[l], marker=markers[l], ls='None')
plt.show()
# 3D visualization of the cluster assignments
classfy_plt_3d(estimator, data, estimator.labels_)

# Text application: text clustering

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer


def jieba_tokenize(text):
    print(jieba.lcut(text))
    return jieba.lcut(text)


# scikit-learn's built-in TF-IDF; it relies on an external tokenizer (jieba here)
tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenize, lowercase=False)
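# The snippet truncates after building the vectorizer; a minimal sketch of how the clustering step
# plausibly continues (the sample sentences are made up for illustration):
from sklearn.cluster import KMeans

texts = ['我喜欢机器学习', '机器学习很有趣', '今天天气不错', '明天天气如何']  # made-up documents
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

km = KMeans(n_clusters=2)
km.fit(tfidf_matrix)  # KMeans accepts the sparse TF-IDF matrix directly
print(km.labels_)     # cluster label per document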
Example #4
#   tol=0.001, verbose=False)  # tail of a printed SVC(...) repr, truncated in this excerpt
# SVC parameter notes
# C: penalty coefficient of the objective, balancing the classification margin against
#           misclassified samples; default C = 1.0
# kernel: kernel choice: 'rbf' (Gaussian kernel, a common choice for linearly inseparable data),
#           'linear', 'poly' (polynomial), 'sigmoid'; the default is 'rbf'
# degree: highest power of the polynomial when kernel='poly'
# gamma: kernel coefficient for 'poly', 'rbf' and 'sigmoid'; the old default was gamma = 1 / n_features
# coef0: independent term of the kernel function, effective for 'poly' and 'sigmoid' (not 'rbf')
# class_weight: per-class weights, mainly to keep over-represented classes in the training set from
#           dominating the learned decision. You can pass explicit weights, or 'balanced' to let the
#           algorithm compute them, giving higher weight to under-represented classes. If the class
#           distribution shows no obvious skew, the default None is fine.
# probability: whether to enable probability estimates (True or False), i.e. whether predict_proba
#           is available; default False
# max_iter: maximum number of iterations; default = -1, meaning no limit
# decision_function_shape: 'ovo' one-vs-one, 'ovr' one-vs-rest, or None; default=None in old
#           scikit-learn versions. 'ovo' tends to be slightly more accurate.
# random_state: seed of the pseudo-random number generator used when shuffling data for probability estimation
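# The code that built clf, X, Y and test is cut off above this excerpt; a minimal stand-in so the
# calls below have context (the toy data is hypothetical; probability=True is what makes
# predict_proba available):
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.rand(60, 2)                  # hypothetical 2-feature samples
Y = (X[:, 0] > X[:, 1]).astype(int)  # hypothetical binary labels
test = rng.rand(5, 2)                # a few query points

clf = SVC(kernel='rbf', C=1.0, probability=True)
clf.fit(X, Y)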
print(clf.predict(test))
print(clf.predict_proba(test))
classfy_plt_3d(clf, X, Y)
''' LinearSVC '''

from sklearn.svm import LinearSVC

clf = LinearSVC()
clf.fit(X, Y)

dec = clf.decision_function(test)  # signed distance of each sample to the separating hyperplane
print(dec)

# predict
print(clf.predict(test))
''' SVR regression '''

from sklearn.svm import SVR, LinearSVR
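# The excerpt ends at this import; a minimal sketch of SVR usage consistent with the section title
# (the sine data and the hyperparameters are illustrative):
import numpy as np

rng = np.random.RandomState(0)
x_s = np.sort(5 * rng.rand(60, 1), axis=0)
y_s = np.sin(x_s).ravel() + 0.1 * rng.randn(60)

svr = SVR(kernel='rbf', C=10, gamma=0.5)
svr.fit(x_s, y_s)
print(svr.predict(x_s[:5]))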
Example #5
#
# @method   : scikit-learn LogisticRegression
# @Time     : 2018/4/2
# @Author   : wooght
# @File     : w_LogisticRegression.py
# Logistic regression is used for classification, most often binary; the features need not be
# linearly related, and the dependent variable is binary.

from Tdata import gender_sample
from sklearn.linear_model import LogisticRegression
import numpy as np
from common.classfy_plt_3d import classfy_plt_3d

# Gender-classification samples; only weight and height are kept as features so the result can be shown in 3D; the feature values are discrete
x, y = gender_sample()
# the labels must line up with the stacked feature rows (rows 0-49 and 150-end)
x_train, y_train = np.vstack([x[:50, :2], x[150:, :2]]), np.concatenate([y[:50], y[150:]])
x_test, y_test = x[50:150, :2], y[50:150]

logisticR = LogisticRegression()
logisticR.fit(x_train, y_train)  # y_train must be class labels
result = logisticR.predict(x_test)
print(result)
classfy_plt_3d(logisticR, x_train, y_train)
print(logisticR.score(x_train, y_train))

# class_weight assigns per-class weights (class weights, not feature weights): {class_label: weight}
logisticR = LogisticRegression(class_weight={0: 0.6, 1: 0.4})
logisticR.fit(x_train, y_train)
result = logisticR.predict(x_test)
print(logisticR.coef_, logisticR.intercept_)
classfy_plt_3d(logisticR, x_train, y_train)
print(logisticR.score(x_train, y_train))
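# Since coef_ and intercept_ are printed above, a short check of what they mean: the predicted
# probability of class 1 is the sigmoid of the linear score. A sketch, reusing the arrays already
# defined (expected to print True for a binary model):
z = x_test @ logisticR.coef_.ravel() + logisticR.intercept_  # linear score w·x + b
p1 = 1.0 / (1.0 + np.exp(-z))                                # sigmoid
print(np.allclose(p1, logisticR.predict_proba(x_test)[:, 1]))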