def criteo_gdbtlr(X_idx, X_value, y):
    """Train a GBDT(LightGBM) + LR stacked model on Criteo-style data.

    A LightGBM classifier is fitted on the index features; each sample's
    per-tree leaf assignments are then one-hot encoded and used as input
    features for a logistic-regression second stage (the classic GBDT+LR
    facebook-CTR recipe).

    Args:
        X_idx: pandas DataFrame of (categorical-index) features; converted
            to a list-of-lists for LightGBM.
        X_value: unused here (kept for interface compatibility with callers).
        y: pandas Series of binary labels.

    Side effects:
        Prints train accuracy / AUC for both stages and pickles the fitted
        GBDT model to ``pwd_path/gbdtlr_model1.pt``.
    """
    import os
    import pickle

    import numpy as np
    from lightgbm.sklearn import LGBMClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, roc_auc_score

    X_idx = X_idx.values.tolist()
    y = y.values.tolist()

    num_leaves = 31
    model = LGBMClassifier(num_leaves=num_leaves)
    model.fit(X_idx, y)

    # NOTE(review): `pwd_path` must be defined at module level elsewhere in
    # this file — confirm it exists before calling.
    model_path = os.path.join(pwd_path, 'gbdtlr_model1.pt')

    # Leaf assignments: shape (n_samples, n_trees), 0-based leaf ids per tree.
    y_pred = model.predict(X_idx, pred_leaf=True)
    y_pred_gbdt = model.predict(X_idx, pred_leaf=False)
    print("gbdt train acc:", model.score(X_idx, y))
    # BUG FIX: AUC was computed on hard 0/1 labels, which degenerates the
    # metric; use the positive-class probability instead.
    y_proba_gbdt = model.predict_proba(X_idx)[:, 1]
    print('gbdt auc:', roc_auc_score(y, y_proba_gbdt))
    print('gbdt train acc:', accuracy_score(y, y_pred_gbdt))

    # Persist the first-stage model (directory must already exist).
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    # One-hot encode leaf assignments: sample i gets a 1 in column
    # tree_j * num_leaves + leaf_id(i, j) for every tree j.
    leaf_idx = np.asarray(y_pred)
    n_samples, n_trees = leaf_idx.shape
    transformed_matrix = np.zeros((n_samples, n_trees * num_leaves),
                                  dtype=np.int64)
    # BUG FIX: the original offset was `arange(n_trees) * num_leaves - 1 +
    # leaf`, which sends (tree 0, leaf 0) to column -1 — numpy wraps that to
    # the LAST column, corrupting the encoding. Leaf ids are 0-based, so the
    # correct offset has no `- 1`.
    base = np.arange(n_trees) * num_leaves
    rows = np.repeat(np.arange(n_samples), n_trees)
    cols = (base[np.newaxis, :] + leaf_idx).ravel()
    transformed_matrix[rows, cols] = 1

    # Second stage: logistic regression over the one-hot leaf features.
    lr_model = LogisticRegression()
    lr_model.fit(transformed_matrix, y)
    y_pred_lr = lr_model.predict(transformed_matrix)
    print("truth_y:", y[:100], 'y_pred_lr:', y_pred_lr[:100])
    y_proba_lr = lr_model.predict_proba(transformed_matrix)[:, 1]
    print('auc:', roc_auc_score(y, y_proba_lr))
def multi_machine_learing_models(data_train, data_cv):
    """Train six classifiers on URL features and combine them by weighted vote.

    Each model is fitted, scored (accuracy / precision / recall printed),
    persisted with joblib, and contributes a weighted vote; a sample is
    finally labelled 1 (bad) when its accumulated vote weight reaches 3.

    Args:
        data_train: pandas DataFrame with a 'label' column ('good'/other)
            and a 'URL' column plus numeric feature columns.
        data_cv: hold-out DataFrame with the same schema, used for scoring.

    Side effects:
        Prints per-model and ensemble metrics; writes six model files under
        ``classifier_model\\``.
    """
    print('正在训练模型!')
    # NOTE(review): the CV split is concatenated into the training set, so
    # every score below is computed on data the models have already seen
    # (leakage). Kept as-is to preserve behavior — confirm this is intended.
    data_train = pd.concat([data_train, data_cv], axis=0)
    y_train = data_train['label'].apply(lambda x: 0 if x == 'good' else 1)
    y_test = data_cv['label'].apply(lambda x: 0 if x == 'good' else 1)
    X_train = data_train.drop(['URL', 'label'], axis=1)
    X_test = data_cv.drop(['URL', 'label'], axis=1)

    # BUG FIX: the original paths used plain strings like
    # 'classifier_model\c_bayes.model', relying on Python passing the invalid
    # `\c` escape through unchanged (a SyntaxWarning on 3.12+ and scheduled to
    # become an error). Raw strings keep the path bytes identical.
    # (classifier, vote weight, model file, banner string)
    configs = [
        (BernoulliNB(), 1,
         r'classifier_model\c_bayes.model', '\nbayes模型的准确度:'),
        (LGBMClassifier(n_estimators=200, objective='binary'), 3,
         r'classifier_model\c_LGB.model', 'LGBMClassifier模型的准确度:'),
        (AdaBoostClassifier(n_estimators=100), 2,  # 100 boosting rounds
         r'classifier_model\c_ada.model', 'ada模型的准确度:'),
        (RandomForestClassifier(n_estimators=100, oob_score=True), 3,
         r'classifier_model\c_rf.model', '\nrf模型的准确度:'),
        (tree.DecisionTreeClassifier(), 2,
         r'classifier_model\c_decision_tree.model', '\ndecision_tree模型的准确度:'),
        (LogisticRegression(), 2,
         r'classifier_model\c_lgs.model', '\nLogisticRegression模型的准确度:'),
    ]

    vote = [0] * len(y_test)
    for clf, weight, model_path, banner in configs:
        clf.fit(X_train, y_train)
        print(banner, clf.score(X_test, y_test))
        predict = clf.predict(X_test)
        # Accumulate this model's weighted prediction into the running vote.
        vote = [weight * p + v for p, v in zip(predict, vote)]
        print("precison:", metrics.precision_score(y_test, predict))
        print("recall:", metrics.recall_score(y_test, predict))
        joblib.dump(clf, model_path)

    print('\n投票结果:')
    # Ensemble decision: positive when accumulated weight reaches 3
    # (total available weight is 13).
    vote_r = [1 if v >= 3 else 0 for v in vote]
    print('准确度:', metrics.accuracy_score(y_test, vote_r))
    print("precison:", metrics.precision_score(y_test, vote_r))
    print("recall:", metrics.recall_score(y_test, vote_r))