# 3. Train the model
mdl = LinearDiscriminantAnalysis(solver='svd')
mdl.fit(X, y)

# For binary classification mdl.classes_ holds two values, but there is
# only one discriminant equation.
# Coefficients of the projection line
sr = pd.Series(data=[mdl.intercept_[0]] + mdl.coef_[0].tolist(),
               index=['intercept'] + cols)
print(sr)
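
# Optional sanity check (an added sketch, not in the original source): in the
# binary case the single equation above is exactly the model's decision
# function, i.e. decision_function(X) == intercept_ + X @ coef_; positive
# scores map to classes_[1], negative ones to classes_[0].
import numpy as np
z = mdl.decision_function(X)
assert np.allclose(z, np.asarray(X) @ mdl.coef_[0] + mdl.intercept_[0])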

# 4. Evaluate the model
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, pos_label='是')

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_)

# 6. Apply the model
# 1) Predicted labels
srPred = pd.Series(data=y_pred, index=df.index, name='predicted')

# 2) Predicted probabilities
dfProb = pd.DataFrame(data=y_prob, index=df.index, columns=mdl.classes_)

# 3) Projected (transformed) values
X_ = mdl.transform(X)
facts = ['f{}'.format(i + 1) for i in range(X_.shape[1])]
dfFacts = pd.DataFrame(data=X_, index=df.index, columns=facts)

# 4) Merge back with the original data
dfNew = pd.concat([y, dfFacts, srPred, dfProb], axis=1)
######################################################################
########  Example 2: RandomForestClassifier
######################################################################
from sklearn.ensemble import RandomForestClassifier

mdl = RandomForestClassifier(max_features=0.8,
                             n_estimators=51,
                             min_samples_split=10,
                             min_samples_leaf=5,
                             oob_score=True,
                             random_state=10)
mdl.fit(X, y)

# 4. Evaluate the model
print('OOB score =', mdl.oob_score_)  # roughly an estimate of generalization accuracy
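
# Cross-check (an added sketch, not in the original): the OOB score should be
# close to k-fold cross-validation accuracy, since both measure performance on
# samples the trees did not see during fitting.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(mdl, X, y, cv=5, scoring='accuracy')
print('5-fold CV accuracy =', cv_scores.mean())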

y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, poslabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, 'Random Forest')

# Show feature importances
sr = pd.Series(mdl.feature_importances_, index=cols, name='feature importance')
sr.sort_values(ascending=False, inplace=True)
sr.plot(kind='bar', title=sr.name)

# 5. Hyperparameter tuning (omitted)
# 6. Apply the model (omitted)

######################################################################
########  Part 2: Hyperparameter tuning for RF classification
######################################################################
# The hyperparameters fall into two groups (a tuning sketch follows below):
# 1) RF framework parameters:
# n_estimators
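
# A minimal, self-contained tuning sketch (illustrative values only; the demo
# data below is an assumption, not the dataset used above): searching a
# framework parameter such as n_estimators with GridSearchCV.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_classification(n_samples=300, n_features=8, random_state=10)
grid = GridSearchCV(RandomForestClassifier(random_state=10),
                    param_grid={'n_estimators': range(30, 91, 10)},
                    cv=5, scoring='accuracy')
grid.fit(X_demo, y_demo)
print(grid.best_params_, grid.best_score_)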
######################################################################
########  Example 3: XGBClassifier
######################################################################
from xgboost import XGBClassifier

model = XGBClassifier(          # leading arguments are truncated in the source
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    objective='binary:logistic',
    nthread=8,
    scale_pos_weight=1,
    seed=27)
model.fit(X, y)

# Evaluate the model
y_pred = model.predict(X)
displayClassifierMetrics(y, y_pred, model.classes_)

y_prob = model.predict_proba(X)
displayROCurve(y, y_prob, model.classes_, 'XGBoost')

######################################################################
########  Part 2: Hyperparameter tuning
######################################################################
# Tuning order (a staged-search sketch follows below):
# 1) learning rate: learning_rate in [0.05, 0.3]
# 2) tree hyperparameters: max_depth, min_child_weight
# 3) node-split parameter: gamma
# 4) sampling parameters: subsample, colsample_bytree
# 5) regularization parameters: reg_alpha, reg_lambda
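
# Sketch of one stage (illustrative grid; the demo data is an assumption): hold
# the other parameters at their current values and search only the tree
# hyperparameters, then fold the winners in before tuning the next group.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

X_demo, y_demo = make_classification(n_samples=300, n_features=8, random_state=27)
stage = GridSearchCV(XGBClassifier(learning_rate=0.1, n_estimators=100),
                     param_grid={'max_depth': range(3, 8),
                                 'min_child_weight': range(1, 4)},
                     cv=5, scoring='roc_auc')
stage.fit(X_demo, y_demo)
print(stage.best_params_, stage.best_score_)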

# Sensible default values to start from
xgb = XGBClassifier(booster='gbtree',
                    learning_rate=0.1,
                    n_estimators=100,
                    )           # remaining arguments are truncated in the source

######################################################################
########  Example 4: AdaBoostClassifier
######################################################################
from sklearn.ensemble import AdaBoostClassifier

# (mdl, the base learner, is defined in the truncated part of this example)
clf = AdaBoostClassifier(       # leading arguments are truncated in the source
    base_estimator=mdl,
    algorithm='SAMME',
    n_estimators=200,
    # learning_rate=0.7,
    # random_state=10
)
clf.fit(X, y)

# 3. Evaluate the model
print('score=', clf.score(X, y))

y_pred = clf.predict(X)
displayClassifierMetrics(y, y_pred, clf.classes_)

y_prob = clf.predict_proba(X)
displayROCurve(y, y_prob, clf.classes_, 'AdaBoost')

# 1) Show feature importances
sr = pd.Series(clf.feature_importances_, index=cols, name='feature importance')
sr.sort_values(ascending=False, inplace=True)
sr.plot(kind='bar', title=sr.name)

# 2) Other ensemble information
print('Number of classes:', clf.n_classes_)
print('Class labels:', clf.classes_)
for i, est in enumerate(clf.estimators_):
    print('Base estimator {}:'.format(i))
    # mdl = est   # can be saved for later use
    print('     weight:', np.round(clf.estimator_weights_[i], 4))
    print('     error rate:', np.round(clf.estimator_errors_[i], 4))

# (A BaggingClassifier example starts here; its fitting code is missing from
# the source.) Each base estimator is trained on a subset of the features:
for i in range(clf.n_estimators):
    # mdl = clf.estimators_[i]
    idxs = clf.estimators_features_[i]  # feature indices
    idxs.sort()

    estFeatures = []
    for idx in idxs:
        estFeatures.append(cols[idx])
    print('Features used by base estimator {0}: {1}'.format(i, estFeatures))

# 4. Evaluate the model
y_pred = clf.predict(X)
displayClassifierMetrics(y, y_pred, clf.classes_, poslabel)

y_prob = clf.predict_proba(X)
displayROCurve(y, y_prob, clf.classes_, 'BaggingClassifier')

# 5. Hyperparameter tuning (omitted)
# 6. Use the model (omitted)

# Related classes
#   1. BaggingClassifier(base_estimator=None, bootstrap=True,
#               bootstrap_features=False, max_features=1.0,
#               max_samples=1.0, n_estimators=10,
#               n_jobs=None, oob_score=False, random_state=None,
#               verbose=0, warm_start=False)
#   2. BaggingRegressor() -- identical parameter list to the classifier

# Key parameters (a construction sketch follows below)
# base_estimator : the base learner (default=None, which means a decision tree)
# n_estimators : int, number of base learners (default=10)
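
# A self-contained construction sketch (illustrative values; the demo data is
# an assumption) combining the key parameters listed above.
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=300, n_features=8, random_state=1)
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                        n_estimators=10, max_samples=0.8, max_features=0.8,
                        oob_score=True, random_state=1)  # base_estimator= is named estimator= on scikit-learn >= 1.2
bag.fit(X_demo, y_demo)
print('OOB score =', bag.oob_score_)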

# (An SVC example starts here; its fitting code is missing from the source.)
# The support vectors
vts = mdl.support_vectors_
# Indices of the support vectors
idxs = mdl.support_

# The following attributes exist only when kernel='linear'
if mdl.kernel == 'linear':
    print(mdl.intercept_)
    print(mdl.coef_)

# 4. Evaluate the model
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, posLabel)

y_prob = mdl.predict_proba(X)  # requires probability=True when constructing the SVC
displayROCurve(y, y_prob, mdl.classes_, 'SVM')

# 6. Apply the model (omitted)

######################################################################
########  Part 2: Multiclass SVC
######################################################################
# SVC and NuSVC implement multiclass classification via 'one-vs-one',
# training n*(n-1)/2 binary models (see the sketch below).
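
# A small self-contained illustration (iris used as demo data, an assumption):
# with decision_function_shape='ovo' the decision function exposes one score
# per class pair, i.e. 3*(3-1)/2 = 3 columns for the 3 iris classes.
from sklearn import datasets
from sklearn.svm import SVC

iris_demo = datasets.load_iris()
svc = SVC(decision_function_shape='ovo').fit(iris_demo.data, iris_demo.target)
print(svc.decision_function(iris_demo.data).shape)  # (150, 3) pairwise scores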

# 1. Load the data
from sklearn import datasets

iris = datasets.load_iris()
cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
labels = ['setosa', 'versicolor', 'virginica']
# cols = iris['feature_names']
######################################################################
########  Example 7: LGBMClassifier
######################################################################
# 3. Train the model
from lightgbm import LGBMClassifier, LGBMRegressor

gbm = LGBMClassifier(objective='multiclass',
                     num_leaves=31,
                     learning_rate=0.05,
                     n_estimators=20)
gbm.fit(X, y)

# Evaluate the model
y_pred = gbm.predict(X)
displayClassifierMetrics(y, y_pred, gbm.classes_)

y_prob = gbm.predict_proba(X)
displayROCurve(y, y_prob, gbm.classes_, 'LightGBM')

# Show feature importances (see the sketch below)
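# A minimal sketch (assuming cols holds the feature names, as in the other
# examples here), mirroring the importance display used elsewhere in this file.
# Note LightGBM reports split counts rather than normalized importances by default.
sr = pd.Series(gbm.feature_importances_, index=cols, name='feature importance')
sr.sort_values(ascending=False, inplace=True)
sr.plot(kind='bar', title=sr.name)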

######################################################################
########  Part 2: Hyperparameter tuning
######################################################################

bestParams = {}
lt_params = [        # staged parameter grids (truncated below; a loop sketch follows)
    {
        'n_estimators': range(50, 150, 10)
    },
    {
        'max_depth': range(3, 14),
        'min_child_weight': range(1, 6)
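    },
]   # (the remaining stage grids are truncated in the source)

# How such a staged list is typically consumed (a sketch; it assumes the list
# above is completed and that X, y and LGBMClassifier are available as earlier
# in this example): search one stage at a time, folding each stage's best
# parameters into the next search.
from sklearn.model_selection import GridSearchCV

for params in lt_params:
    gs = GridSearchCV(LGBMClassifier(objective='multiclass', **bestParams),
                      param_grid=params, cv=5)
    gs.fit(X, y)
    bestParams.update(gs.best_params_)
    print(gs.best_params_, gs.best_score_)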
######################################################################
########  Example 8: LogisticRegression
######################################################################
from sklearn.linear_model import LogisticRegression

mdl = LogisticRegression(penalty='none')    # spelled penalty=None on scikit-learn >= 1.2
mdl.fit(X, y)

# Note: unlike in linear regression, intercept_ and coef_ are arrays here
# (one row per fitted equation), hence the [0] indexing below.
sr = pd.Series(
        data=[mdl.intercept_[0]] + mdl.coef_[0].tolist(),
        index=['intercept'] + cols )
print(sr)

# 5. Evaluate the model metrics
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, poslabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, 'binary logistic regression')

# 6. Apply the model

# 1) Optional: merge the original data, predictions, and probabilities into one DataFrame
srPred = pd.Series(y_pred, index=df.index, name='predicted')    # predicted labels

# The column order of y_prob matches the class order in mdl.classes_
ProbCols = [f"{val}-prob" for val in mdl.classes_]
dfProb = pd.DataFrame(y_prob, index=df.index, columns=ProbCols)

dfNew = pd.concat([df, srPred, dfProb], axis=1)
print(dfNew.head())

# 2) Save the model (omitted)
# 3) Load the model (omitted)
######################################################################
########  Example 9: MLPClassifier attributes (the fitting code is truncated)
######################################################################
print('Number of layers:', mdl.n_layers_)
print('Number of output nodes:', mdl.n_outputs_)
for i, coefs in enumerate(mdl.coefs_):
    nodes = len(mdl.intercepts_[i])
    print('Layer {}, nodes: {}'.format(i + 1, nodes))
    for j in range(nodes):
        wt = [mdl.intercepts_[i][j]] + coefs[:, j].tolist()
        print('  node {}: {}'.format(j + 1, np.round(wt, 2)))

# 4. Evaluate the model
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, 'neural network')

# 5. Hyperparameter tuning

# 6. Apply the model
# 1) Save the model
# 2) Load the model
# 3) Predict

######################################################################
########  Part 2: MLPClassifier (categorical predictors)
######################################################################

# 1. Load the data
filename = '分类预测.xls'
sheet = '贷款违约'
######################################################################
########  Example 10: GaussianNB with ordinal-encoded categoricals
######################################################################
# 2) Encode the categorical variables
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(dtype='int')
X_ = enc.fit_transform(df[catCols])

# The learned mapping
for i, col in enumerate(catCols):
    print('\nVariable:', col)
    print('Category order:', enc.categories_[i])

dfCats = pd.DataFrame(X_, index=df.index, columns=catCols)

# 3) Merge
X = pd.concat([df[intCols], dfCats], axis=1)
cols = X.columns.tolist()

# 3. Train the model
from sklearn.naive_bayes import GaussianNB

mdl = GaussianNB()
mdl.fit(X, y)

# 4. Evaluate the model
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, poslabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, 'Naive Bayes')

# Remaining steps omitted
######################################################################
########  Example 11: DecisionTreeClassifier
######################################################################
# 3) Merge
X = pd.concat([dfCats, df[intCols]], axis=1)
cols = X.columns.tolist()

# 3. Train the model
from sklearn.tree import DecisionTreeClassifier

mdl = DecisionTreeClassifier(criterion='entropy')
mdl.fit(X, y)

# 4. Evaluate the model
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, poslabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, 'decision tree')

# 5. Select important features manually
# Show feature importances
sr = pd.Series(mdl.feature_importances_, index=cols, name='decision tree')
sr.sort_values(ascending=False, inplace=True)
sr.plot(kind='bar', title='feature importance')

# Keep the predictors whose cumulative importance reaches 85%
sr = sr.cumsum()
cond = sr < 0.85
k = len(sr[cond]) + 1
cols = sr.index[:k].tolist()

X = X[cols]
######################################################################
########  Example 12: GradientBoostingClassifier (GBDT)
######################################################################
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(   # leading arguments are truncated in the source
            loss='deviance',        # renamed to 'log_loss' on scikit-learn >= 1.1
            n_estimators=100, 
            learning_rate=1.0,
            subsample = 0.8,
            max_depth=1, 
            random_state=0)
clf.fit(X, y)

# 4. Evaluate the model
print('score=', clf.score(X, y))

y_pred = clf.predict(X)
displayClassifierMetrics(y, y_pred, clf.classes_, poslabel)

y_prob = clf.predict_proba(X)
displayROCurve(y, y_prob, clf.classes_, 'GBDT')

# Show feature importances
sr = pd.Series(clf.feature_importances_, index=cols, name='feature importance')
sr.sort_values(ascending=False, inplace=True)
sr.plot(kind='bar', title=sr.name)

# 5. Hyperparameter tuning (omitted)
# 6. Apply the model (omitted)


# Save the model (omitted)
# Full signature for reference:
# sklearn.ensemble.GradientBoostingRegressor
    # (loss='ls', learning_rate=0.1, n_estimators=100, 
    # subsample=1.0, criterion='friedman_mse', 
    # min_samples_split=2, min_samples_leaf=1,