y = df[target]
posLabel = '是'  # positive-class label ('yes')

# 3. Train the model (LDA with the default SVD solver)
mdl = LinearDiscriminantAnalysis(solver='svd')
mdl.fit(X, y)

# For binary classification mdl.classes_ holds two values, but LDA produces
# only one discriminant equation (a single projection direction).
# Projection line: intercept followed by one coefficient per feature column.
sr = pd.Series(data=[mdl.intercept_[0]] + mdl.coef_[0].tolist(),
               index=['常数'] + cols)
print(sr)

# 4. Evaluate the model (on the training data)
y_pred = mdl.predict(X)
# FIX: use the posLabel constant defined above instead of repeating the
# literal '是' — one place to change if the positive label ever differs.
displayClassifierMetrics(y, y_pred, mdl.classes_, pos_label=posLabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_)

# 6. Apply the model
# 1) Predicted labels, aligned to the original DataFrame index
srPred = pd.Series(data=y_pred, index=df.index, name='预测值')

# 2) Predicted probabilities, one column per class
dfProb = pd.DataFrame(data=y_prob, index=df.index, columns=mdl.classes_)

# 3) Values after projection onto the discriminant axes (f1, f2, ...)
X_ = mdl.transform(X)
facts = ['f{}'.format(i + 1) for i in range(X_.shape[1])]
dfFacts = pd.DataFrame(data=X_, index=df.index, columns=facts)
# ===== 예제 #2 (Example #2) =====
# 3. Build a random forest model
from sklearn.ensemble import RandomForestClassifier

mdl = RandomForestClassifier(max_features=0.8,
                             n_estimators=51,
                             min_samples_split=10,
                             min_samples_leaf=5,
                             oob_score=True,   # enable out-of-bag scoring
                             random_state=10)
mdl.fit(X, y)

# 4. Evaluate the model
# The out-of-bag score is a proxy for generalization accuracy.
print('袋外得分=', mdl.oob_score_)

y_pred = mdl.predict(X)
# FIX: the original passed the undefined name `poslabel` (NameError at
# runtime); pass the positive-class label by keyword, consistent with the
# displayClassifierMetrics(..., pos_label='是') call used elsewhere here.
displayClassifierMetrics(y, y_pred, mdl.classes_, pos_label='是')

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, '随机森林')

# Show feature importances, largest first
sr = pd.Series(mdl.feature_importances_, index=cols, name='特征重要性')
sr.sort_values(ascending=False, inplace=True)
sr.plot(kind='bar', title=sr.name)

# 5. Hyper-parameter tuning (omitted)
# 6. Apply the model (omitted)

######################################################################
########  Part2、RF分类的超参优化
######################################################################
# ===== 예제 #3 (Example #3) =====
    # n_estimators=3000,
    max_depth=4,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    objective='binary:logistic',
    nthread=8,
    scale_pos_weight=1,
    seed=27)
# Fit on the full training set.
# NOTE(review): `model` is constructed above this chunk (the XGBClassifier
# call whose opening line is not visible here) — confirm against full file.
model.fit(X, y)

# Model evaluation (on the training data)
y_pred = model.predict(X)
displayClassifierMetrics(y, y_pred, model.classes_)

y_prob = model.predict_proba(X)
displayROCurve(y, y_prob, model.classes_, 'XGBoost')

######################################################################
########  Part2、超参优化
######################################################################
# 调优步骤:
# 1)学习率learning_rate [0.05, 0.3]
# 2)决策树超参:max_depth, min_child_weight,
# 3)节点分裂参数:gamma
# 4)抽样参数:subsample, colsample_bytree
# 5)正则化参数:reg_alpha, reg_lambda

# 默认的经验值
# ===== 예제 #4 (Example #4) =====
# min_samples_split=20,
# min_samples_leaf=5)
# 2. Build the AdaBoost ensemble on top of the base estimator `mdl`.
# SAMME (discrete) boosting; 200 boosting rounds.
clf = AdaBoostClassifier(base_estimator=mdl,
                         algorithm='SAMME',
                         n_estimators=200)
clf.fit(X, y)

# 3. Evaluate the model (training-set accuracy)
print('score=', clf.score(X, y))

y_pred = clf.predict(X)
displayClassifierMetrics(y, y_pred, clf.classes_)

y_prob = clf.predict_proba(X)
displayROCurve(y, y_prob, clf.classes_, 'AdaBoost')

# 1) Feature importances, plotted largest first
sr = pd.Series(clf.feature_importances_, index=cols, name='特征重要性')
sr = sr.sort_values(ascending=False)
sr.plot(kind='bar', title=sr.name)

# 2) Other ensemble bookkeeping
print('类别取值个数:', clf.n_classes_)
print('类别标签取值:', clf.classes_)
for idx, base in enumerate(clf.estimators_):
    print(f'第{idx}个基学习器:')
    # `base` is the fitted base learner for round idx; could be persisted
    # for later use.