# 3. Fit the model.
mdl = LinearDiscriminantAnalysis(solver='svd')
mdl.fit(X, y)

# For binary classification mdl.classes_ holds two labels, but there is
# only a single discriminant equation (one projection line).
sr = pd.Series(
    data=[mdl.intercept_[0]] + mdl.coef_[0].tolist(),
    index=['常数'] + cols)
print(sr)

# 4. Evaluate the model.
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, pos_label='是')

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_)

# 6. Apply the model.
# 1) Predicted labels.
srPred = pd.Series(data=y_pred, index=df.index, name='预测值')

# 2) Predicted probabilities (columns follow the order of mdl.classes_).
dfProb = pd.DataFrame(data=y_prob, index=df.index, columns=mdl.classes_)

# 3) Samples projected onto the discriminant axes.
X_ = mdl.transform(X)
facts = ['f{}'.format(i + 1) for i in range(X_.shape[1])]
dfFacts = pd.DataFrame(data=X_, index=df.index, columns=facts)

# 4) Merge everything back with the original target column.
dfNew = pd.concat([y, dfFacts, srPred, dfProb], axis=1)
# 3. Fit a random forest.
mdl = RandomForestClassifier(
    max_features=0.8,
    n_estimators=51,
    min_samples_split=10,
    min_samples_leaf=5,
    oob_score=True,
    random_state=10)
mdl.fit(X, y)

# 4. Evaluate the model.
# The out-of-bag score approximates generalization accuracy.
print('袋外得分=', mdl.oob_score_)

y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, poslabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, '随机森林')

# Plot feature importances, most important first.
sr = pd.Series(mdl.feature_importances_, index=cols, name='特征重要性')
sr.sort_values(ascending=False, inplace=True)
sr.plot(kind='bar', title=sr.name)

# 5. Hyper-parameter tuning (omitted).
# 6. Applying the model (omitted).

######################################################################
######## Part 2: hyper-parameter tuning for RF classification
######################################################################
# Hyper-parameters fall into two groups:
# 1) Random-forest framework parameters:
#    n_estimators
# NOTE(review): this chunk's newlines were lost in extraction and BOTH edges
# are cut mid-statement: the leading keyword arguments (gamma=..., seed=27)
# belong to an XGBClassifier(...) constructor whose head is outside this view,
# and the trailing `xgb = XGBClassifier(...` assignment is truncated.
# Left byte-identical; restore the original line breaks before editing logic.
gamma=0.1, subsample=0.8, colsample_bytree=0.8, reg_alpha=1, objective='binary:logistic', nthread=8, scale_pos_weight=1, seed=27) model.fit(X, y) # 模型评估 y_pred = model.predict(X) displayClassifierMetrics(y, y_pred, model.classes_) y_prob = model.predict_proba(X) displayROCurve(y, y_prob, model.classes_, 'XGBoost') ###################################################################### ######## Part2、超参优化 ###################################################################### # 调优步骤: # 1)学习率learning_rate [0.05, 0.3] # 2)决策树超参:max_depth, min_child_weight, # 3)节点分裂参数:gamma # 4)抽样参数:subsample, colsample_bytree # 5)正则化参数:reg_alpha, reg_lambda # 默认的经验值 xgb = XGBClassifier(booster='gbtree', learning_rate=0.1, n_estimators=100,
# NOTE(review): this chunk's newlines were lost in extraction and its start is
# cut mid-statement: the leading kwargs (base_estimator=mdl, algorithm='SAMME',
# n_estimators=200) belong to a constructor — presumably AdaBoostClassifier(
# given the later 'AdaBoost' label — whose head is outside this view.
# Left byte-identical; restore the original line breaks before editing logic.
base_estimator=mdl, algorithm='SAMME', n_estimators=200, # learning_rate=0.7, # random_state=10 ) clf.fit(X, y) # 3、评估模型 print('score=', clf.score(X, y)) y_pred = clf.predict(X) displayClassifierMetrics(y, y_pred, clf.classes_) y_prob = clf.predict_proba(X) displayROCurve(y, y_prob, clf.classes_, 'AdaBoost') # 1)显示特征重要性 sr = pd.Series(clf.feature_importances_, index=cols, name='特征重要性') sr.sort_values(ascending=False, inplace=True) sr.plot(kind='bar', title=sr.name) # 2)其余基类信息 print('类别取值个数:', clf.n_classes_) print('类别标签取值:', clf.classes_) for i, est in enumerate(clf.estimators_): print('第{}个基学习器:'.format(i)) # mdl = est #可以保存起来,后续使用 print(' 权重:', np.round(clf.estimator_weights_[i], 4)) print(' 分类错误率:', np.round(clf.estimator_errors_[i], 4))
# Show which feature subset each base estimator of the bagging ensemble
# was trained on.
for i in range(clf.n_estimators):
    # mdl = clf.estimators_[i]  # the i-th base learner; could be saved for reuse
    idxs = clf.estimators_features_[i]  # column indices used by estimator i
    idxs.sort()
    # FIX (idiom): build the feature-name list with a comprehension instead
    # of the original manual empty-list-plus-append loop.
    estFeatures = [cols[idx] for idx in idxs]
    print('第{0}个基类使用的特征:{1}'.format(i, estFeatures))

# 4. Evaluate the model.
y_pred = clf.predict(X)
displayClassifierMetrics(y, y_pred, clf.classes_, poslabel)

y_prob = clf.predict_proba(X)
displayROCurve(y, y_prob, clf.classes_, 'BaggingClassifier')

# 5. Hyper-parameter tuning (omitted).
# 6. Applying the model (omitted).

# Related classes
# 1. BaggingClassifier(base_estimator=None, bootstrap=True,
#        bootstrap_features=False, max_features=1.0,
#        max_samples=1.0, n_estimators=10,
#        n_jobs=None, oob_score=False, random_state=None,
#        verbose=0, warm_start=False)
# 2. BaggingRegressor() -- same parameter list as the classifier.
# Key parameters:
#   base_estimator: the base learner (default None => decision tree).
#   n_estimators: int, number of base learners (default 10).
# The support vectors themselves.
vts = mdl.support_vectors_

# Indices of the support vectors within the training set.
idxs = mdl.support_

# Hyperplane coefficients only exist for a linear kernel.
if mdl.kernel == 'linear':
    # FIX: use the public attribute `intercept_` instead of the private
    # `_intercept_` (internal storage; not part of the stable API).
    print(mdl.intercept_)
    print(mdl.coef_)

# 4. Evaluate the model.
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, posLabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, '支持向量机')

# 6. Applying the model (omitted).

######################################################################
######## Part 2: multi-class SVC
######################################################################
# SVC and NuSVC implement multi-class classification via 'one-vs-one',
# training n*(n-1)/2 models.

# 1. Load the data.
from sklearn import datasets
iris = datasets.load_iris()

cols = ['花萼长度', '花萼宽度', '花瓣长度', '花瓣宽度']
labels = ['山鸢尾', '杂色鸢尾', '维吉尼亚鸢尾']
# cols = iris['feature_names']
# NOTE(review): this chunk's newlines were lost in extraction and its end is
# cut mid-statement: the `lt_params = [...]` hyper-parameter grid list is
# truncated after 'min_child_weight': range(1, 6).
# Left byte-identical; restore the original line breaks before editing logic.
# 3、模型训练 from lightgbm import LGBMClassifier, LGBMRegressor gbm = LGBMClassifier(objective='multiclass', num_leaves=31, learning_rate=0.05, n_estimators=20) gbm.fit(X, y) # 模型评估 y_pred = gbm.predict(X) displayClassifierMetrics(y, y_pred, gbm.classes_) y_prob = gbm.predict_proba(X) displayROCurve(y, y_prob, gbm.classes_, 'LightGBM') # 显示特征重要性 ###################################################################### ######## Part2、超参优化 ###################################################################### bestParams = {} lt_params = [ { 'n_estimators': range(50, 150, 10) }, { 'max_depth': range(3, 14), 'min_child_weight': range(1, 6)
# 3. Fit an unpenalized logistic regression.
# NOTE(review): penalty='none' is deprecated in newer scikit-learn
# (1.2+ uses penalty=None) — confirm the installed version before changing.
mdl = LogisticRegression(penalty='none')
mdl.fit(X, y)

# In logistic regression both intercept_ and coef_ are arrays (one row per
# equation), unlike the scalar attributes of plain regression models.
sr = pd.Series(
    data=[mdl.intercept_[0]] + mdl.coef_[0].tolist(),
    index=['常数'] + cols)
print(sr)

####### 5. Evaluate the model.
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, poslabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, '二分类逻辑回归')

# 6. Apply the model.
# 1) Optional: merge raw data, predictions and probabilities into one frame.
srPred = pd.Series(y_pred, index=df.index, name='预测值')  # predicted labels

# The column order of y_prob matches the class order in mdl.classes_.
ProbCols = [f"{val}-概率" for val in mdl.classes_]
dfProb = pd.DataFrame(y_prob, index=df.index, columns=ProbCols)

dfNew = pd.concat([df, srPred, dfProb], axis=1)
print(dfNew.head())

# 2) Saving the model (omitted).
# 3) Loading the model (omitted).
# Dump the fitted MLP's architecture and per-node weights.
print('网络层数:', mdl.n_layers_)
print('输出节点数:', mdl.n_outputs_)

for i, coefs in enumerate(mdl.coefs_):
    # One intercept per node of the layer this weight matrix feeds into.
    nodes = len(mdl.intercepts_[i])
    print('中间层{},节点数:{}'.format(i + 1, nodes))
    for j in range(nodes):
        # Bias first, then the incoming weights of node j.
        wt = [mdl.intercepts_[i][j]] + coefs[:, j].tolist()
        print(' 节点{}:{}'.format(j + 1, np.round(wt, 2)))

# 4. Evaluate the model.
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, '神经网络')

# 5. Hyper-parameter tuning.
# 6. Apply the model.
#    1) Save the model.
#    2) Load the model.
#    3) Predict.

######################################################################
######## Part 2: MLPClassifier (categorical predictors)
######################################################################

# 1. Load the data.
filename = '分类预测.xls'
sheet = '贷款违约'
# 2) Ordinal-encode the categorical columns.
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(dtype='int')
X_ = enc.fit_transform(df[catCols])

# Print the category -> integer-code mapping of each encoded column.
for i, col in enumerate(catCols):
    print('\n变量名称:', col)
    print('数值顺序', enc.categories_[i])

dfCats = pd.DataFrame(X_, df.index, catCols)

# 3) Merge numeric and encoded categorical features.
X = pd.concat([df[intCols], dfCats], axis=1)
cols = X.columns.tolist()

# 3. Fit a Gaussian naive-Bayes model.
from sklearn.naive_bayes import GaussianNB
mdl = GaussianNB()
mdl.fit(X, y)

# 4. Evaluate the model.
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, poslabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, '朴素贝叶斯')

# Remaining steps omitted.
# 3) Merge encoded categorical and numeric features.
X = pd.concat([dfCats, df[intCols]], axis=1)
cols = X.columns.tolist()

# 3. Fit a decision tree using entropy (information-gain) splits.
from sklearn.tree import DecisionTreeClassifier
mdl = DecisionTreeClassifier(criterion='entropy')
mdl.fit(X, y)

# 4. Evaluate the model.
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_, poslabel)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_, '决策树')

# 5. Manual feature selection by importance.
# Plot feature importances, most important first.
sr = pd.Series(mdl.feature_importances_, index=cols, name='决策树')
sr.sort_values(ascending=False, inplace=True)
sr.plot(kind='bar', title='特征重要性')

# Keep the variables whose cumulative importance reaches 85%.
sr = sr.cumsum()
cond = sr < 0.85
k = len(sr[cond]) + 1
cols = sr.index[:k].tolist()
X = X[cols]
# NOTE(review): this chunk's newlines were lost in extraction and its start is
# cut mid-statement: the leading kwargs (loss='deviance', ...) belong to a
# constructor — presumably GradientBoostingClassifier given the 'GBDT' label —
# whose head is outside this view. Two likely bugs to confirm once restored:
#   1) `poslable` looks like a typo for `poslabel` (the spelling used by the
#      other sections in this file) — TODO confirm which name is defined.
#   2) `sr = pd.Series(mdl.feature_importances_, ...)` reads `mdl` although
#      this section's fitted model is `clf` — presumably should be
#      `clf.feature_importances_`; verify against surrounding code.
# Left byte-identical; restore the original line breaks before editing logic.
loss='deviance', n_estimators=100, learning_rate=1.0, subsample = 0.8, max_depth=1, random_state=0) clf.fit(X, y) # 4、评估模型 print('score=', clf.score(X, y)) y_pred = clf.predict(X) displayClassifierMetrics(y, y_pred, clf.classes_, poslable) y_prob = clf.predict_proba(X) displayROCurve(y, y_prob, clf.classes_, 'GBDT') # 显示特征重要性 sr = pd.Series(mdl.feature_importances_, index = cols, name='特征重要性') sr.sort_values(ascending=False, inplace=True) sr.plot(kind='bar', title=sr.name) # 5.超参优化(略) # 6.应用模型(略) # 保存模型(略) # sklearn.ensemble.GradientBoostingRegressor # (loss='ls', learning_rate=0.1, n_estimators=100, # subsample=1.0, criterion='friedman_mse', # min_samples_split=2, min_samples_leaf=1,