# ---- Linear Discriminant Analysis (LDA) classification ----
y = df[target]
posLabel = '是'  # positive-class label ('yes')

# 3. Train the model
mdl = LinearDiscriminantAnalysis(solver='svd')
mdl.fit(X, y)

# For binary classification mdl.classes_ holds two labels but there is
# only one discriminant equation (one intercept + one coefficient row).
# Projection line: constant term followed by per-feature coefficients.
sr = pd.Series(data=[mdl.intercept_[0]] + mdl.coef_[0].tolist(),
               index=['常数'] + cols)
print(sr)

# 4. Evaluate the model (on the training data)
y_pred = mdl.predict(X)
# FIX: use the posLabel variable instead of repeating the literal '是',
# so the positive label is defined in exactly one place.
displayClassifierMetrics(y, y_pred, mdl.classes_, pos_label=posLabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_)

# 6. Apply the model
# 1) Predicted labels
srPred = pd.Series(data=y_pred, index=df.index, name='预测值')
# 2) Predicted probabilities, one column per class
dfProb = pd.DataFrame(data=y_prob, index=df.index, columns=mdl.classes_)
# 3) Values projected onto the discriminant axes
X_ = mdl.transform(X)
facts = ['f{}'.format(i + 1) for i in range(X_.shape[1])]
dfFacts = pd.DataFrame(data=X_, index=df.index, columns=facts)
# 3. Build a random-forest model
from sklearn.ensemble import RandomForestClassifier

mdl = RandomForestClassifier(
    max_features=0.8,
    n_estimators=51,
    min_samples_split=10,
    min_samples_leaf=5,
    oob_score=True,
    random_state=10,
)
mdl.fit(X, y)

# 4. Evaluate the model
# Out-of-bag score, roughly an estimate of generalization accuracy.
print('袋外得分=', mdl.oob_score_)

y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, poslabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, '随机森林')

# Show feature importances, most important first.
imp = pd.Series(mdl.feature_importances_, index=cols, name='特征重要性')
imp = imp.sort_values(ascending=False)
imp.plot(kind='bar', title=imp.name)

# 5. Hyper-parameter tuning (omitted)
# 6. Apply the model (omitted)

######################################################################
########  Part2: hyper-parameter tuning for the RF classifier
######################################################################
# n_estimators=3000, max_depth=4, min_child_weight=5, gamma=0.1, subsample=0.8, colsample_bytree=0.8, reg_alpha=1, objective='binary:logistic', nthread=8, scale_pos_weight=1, seed=27) model.fit(X, y) # 模型评估 y_pred = model.predict(X) displayClassifierMetrics(y, y_pred, model.classes_) y_prob = model.predict_proba(X) displayROCurve(y, y_prob, model.classes_, 'XGBoost') ###################################################################### ######## Part2、超参优化 ###################################################################### # 调优步骤: # 1)学习率learning_rate [0.05, 0.3] # 2)决策树超参:max_depth, min_child_weight, # 3)节点分裂参数:gamma # 4)抽样参数:subsample, colsample_bytree # 5)正则化参数:reg_alpha, reg_lambda # 默认的经验值
# min_samples_split=20, # min_samples_leaf=5) clf = AdaBoostClassifier( base_estimator=mdl, algorithm='SAMME', n_estimators=200, # learning_rate=0.7, # random_state=10 ) clf.fit(X, y) # 3、评估模型 print('score=', clf.score(X, y)) y_pred = clf.predict(X) displayClassifierMetrics(y, y_pred, clf.classes_) y_prob = clf.predict_proba(X) displayROCurve(y, y_prob, clf.classes_, 'AdaBoost') # 1)显示特征重要性 sr = pd.Series(clf.feature_importances_, index=cols, name='特征重要性') sr.sort_values(ascending=False, inplace=True) sr.plot(kind='bar', title=sr.name) # 2)其余基类信息 print('类别取值个数:', clf.n_classes_) print('类别标签取值:', clf.classes_) for i, est in enumerate(clf.estimators_): print('第{}个基学习器:'.format(i)) # mdl = est #可以保存起来,后续使用