Example #1
def criteo_gdbtlr(X_idx, X_value, y):
    import os
    import numpy as np
    from sklearn.metrics import roc_auc_score, accuracy_score
    from sklearn.linear_model import LogisticRegression
    from lightgbm.sklearn import LGBMClassifier

    X_idx = X_idx.values.tolist()
    y = y.values.tolist()
    num_leaves = 31
    model = LGBMClassifier(num_leaves=num_leaves)
    model.fit(X_idx, y)
    # pwd_path (output directory) is assumed to be defined elsewhere in the module
    model_path = os.path.join(pwd_path, 'gbdtlr_model1.pt')
    # pred_leaf=True returns the leaf index of every tree for each sample
    y_pred = model.predict(X_idx, pred_leaf=True)
    # plain class predictions from the GBDT alone
    y_pred_gbdt = model.predict(X_idx, pred_leaf=False)
    acc = model.score(X_idx, y)
    print("gbdt train acc:", acc)
    s = roc_auc_score(y, y_pred_gbdt)
    print('gbdt auc:', s)
    a = accuracy_score(y, y_pred_gbdt)
    print('gbdt train acc:', a)
    import pickle  # pickle module for model persistence

    # Save the model (note: the target directory must already exist, otherwise an error is raised)
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    # # Load the model
    # with open('save/clf.pickle', 'rb') as f:
    #     clf2 = pickle.load(f)

    # one-hot encode the leaf assignments: column index = tree_index * num_leaves + leaf_index
    # (leaf indices from pred_leaf are 0-based, so no -1 offset is needed)
    transformed_matrix = np.zeros(
        [len(y_pred), len(y_pred[0]) * num_leaves], dtype=np.int64)
    for i in range(0, len(y_pred)):
        temp = np.arange(len(y_pred[0])) * num_leaves + np.array(y_pred[i])
        transformed_matrix[i][temp] += 1

    lr_model = LogisticRegression()
    lr_model.fit(transformed_matrix, y)
    y_pred_lr = lr_model.predict(transformed_matrix)
    print("truth_y:", y[:100], 'y_pred_lr:', y_pred_lr[:100])

    s = roc_auc_score(y, y_pred_lr)
    print('auc:', s)
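
The function above expects pandas objects for X_idx and y (it calls .values.tolist() on them) and relies on a module-level pwd_path. A minimal driver sketch, assuming a small synthetic binary-classification frame (the column names, sizes, and pwd_path definition here are illustrative assumptions, not from the original):

import os
import numpy as np
import pandas as pd

# assumption: the original defines pwd_path elsewhere in the same module as criteo_gdbtlr
pwd_path = os.getcwd()

# synthetic stand-in for the Criteo index/value features and binary labels
rng = np.random.default_rng(0)
X_idx = pd.DataFrame(rng.integers(0, 100, size=(1000, 10)),
                     columns=[f'I{i}' for i in range(10)])
X_value = pd.DataFrame(rng.random((1000, 10)))
y = pd.Series(rng.integers(0, 2, size=1000))

criteo_gdbtlr(X_idx, X_value, y)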
Example #2
def multi_machine_learing_models(data_train, data_cv):
    import os
    import pandas as pd
    import joblib
    from sklearn import metrics, tree
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from lightgbm import LGBMClassifier

    print('Training models!')
    # merge the CV set into the training data; the CV set is still used for evaluation below
    data_train = pd.concat([data_train, data_cv], axis=0)
    y_train = data_train['label'].apply(lambda x: 0 if x == 'good' else 1)
    y_test = data_cv['label'].apply(lambda x: 0 if x == 'good' else 1)

    X_train = data_train.drop(['URL', 'label'], axis=1)
    X_test = data_cv.drop(['URL', 'label'], axis=1)

    # use os.path.join instead of backslash literals so the paths work cross-platform
    filename_bayes = os.path.join('classifier_model', 'c_bayes.model')
    filename_LGB = os.path.join('classifier_model', 'c_LGB.model')
    filename_ada = os.path.join('classifier_model', 'c_ada.model')
    filename_rf = os.path.join('classifier_model', 'c_rf.model')
    filename_decision_tree = os.path.join('classifier_model', 'c_decision_tree.model')
    filename_lgs = os.path.join('classifier_model', 'c_lgs.model')

    # weighted votes accumulated across the classifiers below
    vote = [0] * len(y_test)

    bayes = BernoulliNB()
    bayes.fit(X_train, y_train)
    print('\nbayes model accuracy:', bayes.score(X_test, y_test))
    predict = bayes.predict(X_test)
    vote = list(map(lambda x: x[0] + x[1], zip(predict, vote)))  # vote weight 1
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(bayes, filename_bayes)

    gbc = LGBMClassifier(n_estimators=200, objective='binary')
    gbc.fit(X_train, y_train)
    print('LGBMClassifier model accuracy:', gbc.score(X_test, y_test))
    predict = gbc.predict(X_test)
    vote = list(map(lambda x: 3 * x[0] + x[1], zip(predict, vote)))  # vote weight 3
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(gbc, filename_LGB)

    ada = AdaBoostClassifier(n_estimators=100)  # 100 boosting rounds
    ada.fit(X_train, y_train)
    print('ada model accuracy:', ada.score(X_test, y_test))
    predict = ada.predict(X_test)
    vote = list(map(lambda x: 2 * x[0] + x[1], zip(predict, vote)))  # vote weight 2
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(ada, filename_ada)

    rf = RandomForestClassifier(n_estimators=100, oob_score=True)
    rf.fit(X_train, y_train)
    print('\nrf model accuracy:', rf.score(X_test, y_test))
    predict = rf.predict(X_test)
    vote = list(map(lambda x: x[0] * 3 + x[1], zip(predict, vote)))  # vote weight 3
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(rf, filename_rf)

    decision_tree = tree.DecisionTreeClassifier()
    decision_tree.fit(X_train, y_train)
    print('\ndecision_tree model accuracy:', decision_tree.score(X_test, y_test))
    predict = decision_tree.predict(X_test)
    vote = list(map(lambda x: x[0] * 2 + x[1], zip(predict, vote)))  # vote weight 2
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(decision_tree, filename_decision_tree)

    lgs = LogisticRegression()
    lgs.fit(X_train, y_train)
    print('\nLogisticRegression model accuracy:', lgs.score(X_test, y_test))
    predict = lgs.predict(X_test)
    vote = list(map(lambda x: x[0] * 2 + x[1], zip(predict, vote)))  # vote weight 2
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(lgs, filename_lgs)

    print('\nVoting results:')
    # a sample is flagged as positive once its weighted vote total reaches 3
    vote_r = []
    for i in range(len(vote)):
        if vote[i] >= 3:
            vote_r.append(1)
        else:
            vote_r.append(0)
    precision = metrics.precision_score(y_test, vote_r)
    recall = metrics.recall_score(y_test, vote_r)
    acc = metrics.accuracy_score(y_test, vote_r)
    print('accuracy:', acc)
    print("precision:", precision)
    print("recall:", recall)