Code Example #1
def gbdt_lr(X_train, X_test, y_train, y_test):
    """
    GBDT + LR

    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :return: (fpr, tpr) of the stacked model on the test set
    """
    # Hold out half of the training data for the LR stage, so the leaf
    # features are not trained and consumed on the same samples
    # (the original snippet referenced X_train_lr without defining it).
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        X_train, y_train, test_size=0.5)
    # Supervised transformation based on GBDT
    gbdt = GradientBoostingClassifier(n_estimators=n_estimator)  # n_estimator: module-level global
    gbdt.fit(X_train, y_train)
    # One-hot encode the leaf indices
    gbdt_enc = OneHotEncoder(categories='auto')
    gbdt_enc.fit(gbdt.apply(X_train)[:, :, 0])
    # Train LR on the one-hot leaf features
    gbdt_lr = LogisticRegression(solver='lbfgs', max_iter=1000)
    gbdt_lr.fit(gbdt_enc.transform(gbdt.apply(X_train_lr)[:, :, 0]),
                y_train_lr)
    y_pred_gbdt_lr = gbdt_lr.predict_proba(
        gbdt_enc.transform(gbdt.apply(X_test)[:, :, 0]))[:, 1]
    fpr_gbdt_lr, tpr_gbdt_lr, _ = roc_curve(y_test, y_pred_gbdt_lr)
    return fpr_gbdt_lr, tpr_gbdt_lr
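A minimal, hypothetical driver for gbdt_lr() above; the imports and the module-level n_estimator are assumptions, since the original snippet relies on globals defined elsewhere in its project.

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

n_estimator = 100  # assumed module-level global used by gbdt_lr()

X, y = make_classification(n_samples=10000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
fpr, tpr = gbdt_lr(X_train, X_test, y_train, y_test)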
Code Example #2
def transform_with_gbm_to_categorical(header, tr_x, tr_y, ts_x, n_est=100, learning_rate=0.1, max_depth=5):

    clf = GradientBoostingClassifier(n_estimators=n_est, learning_rate=learning_rate, max_depth=max_depth)
    clf = clf.fit(tr_x, tr_y)

    """ #Node count
    estimators = clf.estimators_
    for row in estimators:
        for e in row:
            print(e.tree_.node_count)"""
    leaf_indices = clf.apply(tr_x)
    leaf_indices = leaf_indices.reshape(leaf_indices.shape[0], -1)

    ts_leaf_indices = clf.apply(ts_x)
    ts_leaf_indices = ts_leaf_indices.reshape(ts_leaf_indices.shape[0], -1)

    enc = OneHotEncoder()
    enc.fit(np.append(leaf_indices, ts_leaf_indices, axis=0))

    tr_cat_features = enc.transform(leaf_indices).toarray()
    ts_cat_features = enc.transform(ts_leaf_indices).toarray()

    # The incoming `header` is discarded and rebuilt from the encoded width.
    header = ["cat_" + str(i) for i in range(ts_cat_features.shape[1])]
    print("[gbm_cat] Features size: ", len(header))
    return header, tr_cat_features, ts_cat_features
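A hedged usage sketch of the helper above; the data is synthetic and all names are assumptions (np, GradientBoostingClassifier, and OneHotEncoder are assumed imported at module level). One design caveat worth noting: the encoder is fit on the train and test leaf indices jointly, so the encoding vocabulary leaks test-set structure; fitting on the training leaves alone with handle_unknown='ignore' avoids this.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, random_state=0)
tr_x, ts_x, tr_y, ts_y = train_test_split(X, y, test_size=0.3)
header, tr_cat, ts_cat = transform_with_gbm_to_categorical(
    None, tr_x, tr_y, ts_x, n_est=20, max_depth=3)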
Code Example #3
def gbdt_lr_train_test(libsvmFileName):
    # split_ratio and total are module-level globals in the original project
    split_dataset(libsvmFileName, './model_train/label_feature_data_train', './model_train/label_feature_data_test', split_ratio, total)
    X_train, y_train = load_svmlight_file('./model_train/label_feature_data_train')
    X_test, y_test = load_svmlight_file('./model_train/label_feature_data_test')
    gbclf = GradientBoostingClassifier(n_estimators=30, max_depth=4, verbose=0)
    tuned_parameter = [{'n_estimators': [30, 40, 50, 60], 'max_depth': [3, 4, 5, 6, 7, 8, 9], 'max_features': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
    gs_clf = GridSearchCV(gbclf, tuned_parameter, cv=5, scoring='roc_auc')
    gs_clf.fit(X_train.toarray(), y_train)
    logging.info('best parameters set found: ')
    logging.info(gs_clf.best_params_)
    y_pred_gbdt = gs_clf.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    logging.info('gbdt auc: %.5f' % gbdt_auc)
    # Use the refitted best estimator; the bare `gbclf` above was never fit,
    # so calling apply() on it would raise NotFittedError.
    best_gbdt = gs_clf.best_estimator_
    X_train_leaves = best_gbdt.apply(X_train.toarray())[:, :, 0]
    (train_rows, cols) = X_train_leaves.shape
    X_test_leaves = best_gbdt.apply(X_test.toarray())[:, :, 0]
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdtlr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    logging.info('gbdt+lr auc 1: %.5f' % gbdtlr_auc1)
    lr = LogisticRegression(n_jobs=-1)
    # scipy.sparse.hstack: stack the encoded leaves with the raw features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    lr.fit(X_train_ext, y_train)
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdtlr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    logging.info('gbdt+lr auc 2: %.5f' % gbdtlr_auc2)
Code Example #4
File: gbdt_lr.py  Project: qianrenjian/TextMatch-1
class GBDTLR:
    def __init__(self):
        #self.other_params = {'learning_rate': cfg.gbdt.learning_rate,
        #                     'n_estimators':cfg.gbdt.n_estimators,
        #                     }
        self.clf_gbdt = GradientBoostingClassifier(n_estimators=50)
        self.clf_lr = LogisticRegression()
        self.enc = OneHotEncoder()

    def fit(self, train_x, train_y):
        self.clf_gbdt.fit(train_x, train_y)
        train_new_feature = self.clf_gbdt.apply(train_x)
        # apply() returns (n_samples, n_estimators, 1); the hard-coded 50
        # below must match n_estimators in the constructor above.
        train_new_feature = train_new_feature.reshape(-1, 50)
        self.enc.fit(train_new_feature)
        train_new_feature2 = np.array(
            self.enc.transform(train_new_feature).toarray())
        self.clf_lr.fit(train_new_feature2, train_y)

        return self

    def predict(self, X_test):
        test_new_feature = self.clf_gbdt.apply(X_test)
        test_new_feature = test_new_feature.reshape(-1, 50)
        test_new_feature2 = np.array(
            self.enc.transform(test_new_feature).toarray())
        predict = self.clf_lr.predict_proba(test_new_feature2)[:, 1]
        return predict

    def save_model(self):
        pass

    def load_model(self):
        pass
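A hypothetical usage sketch for this GBDTLR class; the data names are assumptions, and the hard-coded reshape(-1, 50) ties the class to n_estimators=50.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)
model = GBDTLR().fit(X_tr, y_tr)
proba = model.predict(X_te)  # positive-class probabilities, per the class above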
Code Example #5
    def train_model(self):
        label = "Churn"
        ID = "customerID"
        x_columns = [x for x in self.train.columns if x not in [label, ID]]
        x_train = self.train[x_columns]
        y_train = self.train[label]

        # Create and train the GBDT model
        gbdt = GradientBoostingClassifier()
        gbdt.fit(x_train, y_train)

        # Create a standalone LR model
        lr = LogisticRegression()
        lr.fit(x_train, y_train)

        # Model fusion: LR on one-hot-encoded GBDT leaves
        gbdt_lr = LogisticRegression()
        enc = OneHotEncoder()
        print(gbdt.apply(x_train).shape)
        # apply() returns a 3-D array; reshape to the 2-D form the encoder expects
        print(gbdt.apply(x_train).reshape(-1, 100).shape)

        # 100 is n_estimators, the number of boosting iterations;
        # apply() returns the leaf index each sample reaches in every tree
        enc.fit(gbdt.apply(x_train).reshape(-1, 100))
        gbdt_lr.fit(enc.transform(gbdt.apply(x_train).reshape(-1, 100)),
                    y_train)

        return enc, gbdt, lr, gbdt_lr
Code Example #6
def model_fit(train_X, train_y, test_X, sample_fraction):
    """fit lr, gbt, gbt + lr"""
    def rescale_prediction(x):
        # Undo the negative downsampling applied to the training set
        return x / (x + (1 - x) / sample_fraction)

    train_X, train_X_lr, train_y, train_y_lr = train_test_split(train_X, train_y, test_size=0.5)

    # logistic regression
    l1_ratio = 1
    model = SGDClassifier(loss='log', l1_ratio=l1_ratio, penalty='l1')
    model.fit(train_X, train_y)
    y_pred_lr = rescale_prediction(model.predict_proba(test_X)[:, 1])

    # gradient boosted tree
    grd = GradientBoostingClassifier(n_estimators=100, verbose=2)
    grd.fit(train_X, train_y)
    y_pred_grd = rescale_prediction(grd.predict_proba(test_X)[:, 1])

    # GBDT + LR (distinct name so the plain GBT prediction is not overwritten)
    grd_enc = OneHotEncoder(categories='auto', sparse=False)
    grd_enc.fit(grd.apply(train_X)[:, :, 0])
    grd_lm = SGDClassifier(loss='log', l1_ratio=1, penalty='l1', max_iter=1000, verbose=True)
    grd_lm.fit(grd_enc.transform(grd.apply(train_X_lr)[:, :, 0]), train_y_lr)
    y_pred_grd_lr = rescale_prediction(grd_lm.predict_proba(grd_enc.transform(grd.apply(test_X)[:, :, 0]))[:, 1])

    res = {'lr': y_pred_lr, 'gdt': y_pred_grd, 'gdt_lr': y_pred_grd_lr}
    pickle.dump(res, open(".\\interim\\pred.pkl", 'wb'))
Code Example #7
File: tree.py  Project: zhangzhenhu/prophet
    def fit(self, **kwargs) -> Model:
        feature_list = kwargs.get('feature_list', None)
        if not feature_list:
            self.name = self.name + '(-irt)'
        self.train_x = self.select_features(self.feature.features_train,
                                            feature_list)
        self.train_y = self.feature.label_train.values
        self.feature_names = self.train_x.columns

        self.train_x, self.train_y = self.tf_sample(self.train_x, self.train_y)

        grd = GradientBoostingClassifier(**self.param)
        grd_enc = OneHotEncoder()
        grd_lm = LogisticRegression(penalty='l2', C=1, solver='lbfgs')
        grd.fit(self.train_x, self.train_y)
        grd_enc.fit(grd.apply(self.train_x)[:, :, 0])
        grd_lm.fit(grd_enc.transform(grd.apply(self.train_x)[:, :, 0]),
                   self.train_y)
        self.grd = grd
        self.grd_enc = grd_enc
        self.model = grd_lm

        # Evaluate performance on the training set
        self.train_y_pred = self.predict(self.train_x)
        self.train_y = np.array(self.train_y)
        self.train_y_pred = np.array(self.train_y_pred)
        self.train_ev = self.evaluation.evaluate(y_true=self.train_y,
                                                 y_pred=self.train_y_pred,
                                                 threshold=0.5)

        return self
Code Example #8
class GBDTLR(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 n_estimators=100,
                 max_depth=3,
                 min_samples_leaf=1,
                 max_leaf_nodes=None,
                 subsample=1.0,
                 learning_rate=0.1,
                 max_iter=100,
                 C=1.0,
                 random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.subsample = subsample
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.C = C
        self.random_state = random_state

        self.gbdt_params = {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'min_samples_leaf': self.min_samples_leaf,
            'max_leaf_nodes': self.max_leaf_nodes,
            'subsample': self.subsample,
            'learning_rate': self.learning_rate
        }

        self.lr_params = {'C': self.C, 'max_iter': self.max_iter}

        self.GBDT = GradientBoostingClassifier(**self.gbdt_params,
                                               random_state=random_state)
        self.LR = LogisticRegression(**self.lr_params,
                                     random_state=random_state)
        self.ENC = OneHotEncoder(categories='auto')

    def fit(self, X, y):
        X_gbdt, X_lr, Y_gbdt, Y_lr = train_test_split(X, y, test_size=0.5)
        self.GBDT.fit(X_gbdt, Y_gbdt)
        tree_feature = self.GBDT.apply(X_gbdt)[:, :, 0]
        self.ENC.fit(tree_feature)

        X = self.ENC.transform(self.GBDT.apply(X_lr)[:, :, 0])
        y = Y_lr
        return self.LR.fit(X, y)

    def predict(self, X):
        X = self.ENC.transform(self.GBDT.apply(X)[:, :, 0])
        return self.LR.predict(X)

    def predict_proba(self, X):
        X = self.ENC.transform(self.GBDT.apply(X)[:, :, 0])
        return self.LR.predict_proba(X)

    def predict_log_proba(self, X):
        X = self.ENC.transform(self.GBDT.apply(X)[:, :, 0])
        return self.LR.predict_log_proba(X)
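Since the class above subclasses BaseEstimator and ClassifierMixin, it plugs into scikit-learn utilities directly; a minimal sketch under the assumption of synthetic data and the usual module-level imports (train_test_split is already required by fit). Note that the encoder is fit only on the GBDT half's leaves, so leaf indices never seen there would raise at predict time under the default handle_unknown='error'.

from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=5000, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
clf = GBDTLR(n_estimators=50, max_depth=3, random_state=0)
clf.fit(X_tr, y_tr)  # internally splits X_tr into GBDT and LR halves
print(roc_auc_score(y_te, clf.predict_proba(X_te)[:, 1]))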
Code Example #9
def compare_models():
    X, y = make_classification(n_samples=10000)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # Hold out a separate training set for the LR stage
    X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
        X_train, y_train, test_size=0.5)

    # Build the models
    n_estimators = 100
    gbc = GradientBoostingClassifier(n_estimators=n_estimators)
    encoder = OneHotEncoder()
    lr = LogisticRegression()

    # Train the gradient-boosted trees
    gbc.fit(X_train, y_train)

    # Fit the encoder:
    # apply() reports, for each sample, which leaf it lands in within each tree;
    # note the dimensionality it returns: (n_samples, n_estimators, n_classes)
    encoder.fit(gbc.apply(X_train)[:, :, 0])

    # Train the logistic regression on the held-out samples
    lr.fit(encoder.transform(gbc.apply(X_train_lr)[:, :, 0]), y_train_lr)

    # predict
    # .predict_proba returns per-class probabilities; keep only the positive class
    y_test_pred = lr.predict_proba(
        encoder.transform(gbc.apply(X_test)[:, :, 0]))[:, 1]

    # plot roc
    # ROC applies to binary classification and needs scores that can be rank-ordered
    fpr_gbc_lr, tpr_gbc_lr, _ = roc_curve(y_test, y_test_pred)
    auc = roc_auc_score(y_test, y_test_pred)
    print(auc)

    # make roc graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot([0, 1], [0, 1], 'k-')
    ax.plot(fpr_gbc_lr, tpr_gbc_lr, label='gbc-lr')

    # For comparison: LR alone
    lr.fit(X_train_lr, y_train_lr)
    y_test_pred_lr = lr.predict_proba(X_test)[:, 1]
    fpr_lr, tpr_lr, _ = roc_curve(y_test, y_test_pred_lr)
    ax.plot(fpr_lr, tpr_lr, label='lr')

    # GBC alone
    y_test_pred_gbc = gbc.predict_proba(X_test)[:, 1]
    fpr_gbc, tpr_gbc, _ = roc_curve(y_test, y_test_pred_gbc)
    ax.plot(fpr_gbc, tpr_gbc, label='gbc')

    # Display
    ax.legend(loc='best')
    plt.show()
Code Example #10
def GBDTLR_Fit(X_train, y_train, pars):
    #from sklearn.ensemble import GradientBoostingClassifier
    #from sklearn.preprocessing import OneHotEncoder
    gbdt = GradientBoostingClassifier(**pars)
    gbdt.fit(X_train, y_train)
    model_onehot = OneHotEncoder()
    model_onehot.fit(gbdt.apply(X_train)[:, :, 0])
    gbdt_lr = LogisticRegression()
    gbdt_lr.fit(model_onehot.transform(gbdt.apply(X_train)[:, :, 0]), y_train)
    return gbdt_lr, model_onehot, gbdt
Code Example #11
class GRDTransformer:
    def __init__(self, n_estimators):
        self.model = GradientBoostingClassifier(n_estimators=n_estimators)
        self.enc = OneHotEncoder(sparse=False)

    def fit(self, X, y):
        self.model.fit(X, y)
        self.enc.fit(self.model.apply(X)[:, :, 0])

    def transform(self, X):
        return self.enc.transform(self.model.apply(X)[:, :, 0])
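A sketch of using GRDTransformer as a feature generator in front of a linear model, with the GBDT and LR stages trained on disjoint halves; the names below are assumptions. Note the class uses OneHotEncoder(sparse=False), which was renamed sparse_output= in scikit-learn 1.2+.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, random_state=0)
X_gbdt, X_lr, y_gbdt, y_lr = train_test_split(X, y, test_size=0.5)
trans = GRDTransformer(n_estimators=50)
trans.fit(X_gbdt, y_gbdt)            # fit trees + encoder on one half
lr = LogisticRegression(max_iter=1000)
lr.fit(trans.transform(X_lr), y_lr)  # train LR on the other half's encoded leaves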
Code Example #12
def GBDTLR():
    GBDT = GradientBoostingClassifier(n_estimators=10)
    GBDT.fit(X_train, Y_train)
    OHE = OneHotEncoder()
    OHE.fit(GBDT.apply(X_train)[:, :, 0])
    LR = LogisticRegression()
    LR.fit(OHE.transform(GBDT.apply(X_train_lr)[:, :, 0]), Y_train_lr)
    Y_pred = LR.predict_proba(OHE.transform(GBDT.apply(X_test)[:, :, 0]))[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, Y_pred)
    auc = roc_auc_score(Y_test, Y_pred)
    print('GradientBoosting+LogisticRegression:', auc)
    return fpr, tpr
Code Example #13
File: tree_lr.py  Project: daiwei9501/tree_lr
def GdbtLR(X_train, y_train, X_test, y_test, X_train_lr, y_train_lr):
    grd = GradientBoostingClassifier(n_estimators=50)
    grd_enc = OneHotEncoder()
    grd_lr = LogisticRegression()
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_lr.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
    y_pred_grd_lr = grd_lr.predict_proba(
        grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
    fpr_grd_lr, tpr_grd_lr, _ = roc_curve(y_test, y_pred_grd_lr)
    auc = roc_auc_score(y_test, y_pred_grd_lr)
    print("GDBT+LR:", auc)
    return fpr_grd_lr, tpr_grd_lr
Code Example #14
File: gbdt_lr.py  Project: cqw5/MachineLearning
def gbdt_lr_model():
    """
    GBDT + LR
    """
    gbdt = GradientBoostingClassifier(n_estimators=n_estimator, max_depth=max_depth)
    gbdt_enc = OneHotEncoder()
    gbdt_lm = LogisticRegression()
    gbdt.fit(X_train, y_train)
    gbdt_enc.fit(gbdt.apply(X_train)[:, :, 0])
    gbdt_lm.fit(gbdt_enc.transform(gbdt.apply(X_train_lr)[:, :, 0]), y_train_lr)
    y_pred = gbdt_lm.predict_proba(gbdt_enc.transform(gbdt.apply(X_test)[:, :, 0]))[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    print('GBDT+LR AUC: {0}'.format(auc(fpr, tpr)))
Code Example #15
File: models.py  Project: liyouzhang/Churn_Prediction
def GBC_Logistic(X_train,y_train,X_test):
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)
    grd = GradientBoostingClassifier(n_estimators=200,learning_rate=0.5)
    grd_enc = OneHotEncoder()
    grd_lm = LogisticRegression()
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
    y_pred_grd_lm = grd_lm.predict_proba(
        grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
    return y_pred_grd_lm
Code Example #16
File: main.py  Project: majuncai/data-mining-template
    def GBDT_LReval(self, feature, target):
        feature_train = feature.iloc[0:self.lenth_eval, :]
        feature_eval = feature.iloc[self.lenth_eval:, :]
        label_train = target.iloc[0:self.lenth_eval]
        label_eval = target.iloc[self.lenth_eval:]
        GBDT = GradientBoostingClassifier(n_estimators=10)
        GBDT.fit(feature_train.values, label_train.values)

        y_pred_gbdt = GBDT.predict_proba(feature_eval.values)[:, 1]
        gbdt_auc = roc_auc_score(label_eval.values, y_pred_gbdt)
        print('gbdt auc: %.5f' % gbdt_auc)

        X_train_leaves = GBDT.apply(feature_train)[:, :, 0]
        X_test_leaves = GBDT.apply(feature_eval)[:, :, 0]

        (train_rows, cols) = X_train_leaves.shape

        gbdtenc = OneHotEncoder()
        X_trans = gbdtenc.fit_transform(
            np.concatenate((X_train_leaves, X_test_leaves), axis=0))

        # Define the LR model
        lr = LogisticRegression()
        # Train LR on the GBDT-encoded samples
        lr.fit(X_trans[:train_rows, :], label_train)
        # Predict and evaluate AUC
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_auc1 = roc_auc_score(label_eval, y_pred_gbdtlr1)

        # Define the LR model
        lr = LogisticRegression(n_jobs=-1)
        # Combine the encoded leaves with the raw features
        X_train_ext = hstack([X_trans[:train_rows, :], feature_train])
        X_test_ext = hstack([X_trans[train_rows:, :], feature_eval])

        print(X_train_ext.shape)
        # Train LR on the combined features
        lr.fit(X_train_ext, label_train)

        # Predict and evaluate AUC
        y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
        gbdt_lr_auc2 = roc_auc_score(label_eval, y_pred_gbdtlr2)
        print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)

        print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

        print(X_train_leaves.shape)
        print(X_test_leaves.shape)
        print(X_train_leaves)
        print(X_test_leaves)
Code Example #17
def gbdtLR():
    GBDT = GradientBoostingClassifier(learning_rate=0.005, n_estimators=2400, max_depth=3,
                                      min_samples_split=800, min_samples_leaf=600,
                                      max_features=9, subsample=0.7, random_state=20)
    GBDT.fit(train_x, train_y)
    OHE = OneHotEncoder()
    OHE.fit(GBDT.apply(train_x)[:, :, 0])
    # liblinear is required for an L1 penalty in recent scikit-learn versions
    LR = LogisticRegression(n_jobs=4, C=0.1, penalty='l1', solver='liblinear')

    LR.fit(OHE.transform(GBDT.apply(train_x_lr)[:, :, 0]), train_y_lr)

    Y_pred = LR.predict_proba(OHE.transform(GBDT.apply(test_x)[:, :, 0]))[:, 1]
    fpr, tpr, _ = roc_curve(test_y, Y_pred)
    auc = roc_auc_score(test_y, Y_pred)
    print('GBDT + LogisticRegression: ', auc)
    return fpr, tpr
Code Example #18
def gbdt_lr_mix():
    gbdtModel = GradientBoostingClassifier(n_estimators=10)
    gbdtModel.fit(X_train, Y_train)
    oneHot = OneHotEncoder()
    train_leafs_inds = gbdtModel.apply(X_train)[:, :, 0]
    print(np.shape(train_leafs_inds))
    print(gbdtModel.estimators_)
    oneHot.fit(train_leafs_inds)
    lrModel = LogisticRegression(n_jobs=4, C=0.1, penalty='l2')
    lrModel.fit(oneHot.transform(train_leafs_inds), Y_train)
    Y_pred = lrModel.predict_proba(oneHot.transform(gbdtModel.apply(X_test)[:, :, 0]))[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, Y_pred)
    auc = roc_auc_score(Y_test, Y_pred)
    print('gbdt + lr: ', auc)
    return fpr, tpr
Code Example #19
File: transform.py  Project: amphibian-dev/toad
    def fit_(self, X, y, **kwargs):
        """fit GBDT transformer

        Args:
            X (DataFrame|array-like)
            y (str|array-like)
            select_dtypes (str|numpy.dtypes): `'object'`, `'number'` etc. only selected dtypes will be transform,
        """

        if isinstance(y, str):
            X = X.copy()
            y = X.pop(y)

        gbdt = GradientBoostingClassifier(**kwargs)
        gbdt.fit(X, y)

        X = gbdt.apply(X)
        X = X.reshape(-1, X.shape[1])

        onehot = OneHotEncoder().fit(X)

        return {
            'gbdt': gbdt,
            'onehot': onehot,
        }
Code Example #20
class GBDTTransformer(TransformerMixin):
    """GBDT transformer
    """
    def __init__(self):
        self.gbdt = None
        self.onehot = None

    @support_exclude
    @support_select_dtypes
    def fit(self, X, y, **kwargs):
        """fit GBDT transformer

        Args:
            X (DataFrame|array-like)
            y (str|array-like)
            select_dtypes (str|numpy.dtypes): `'object'`, `'number'` etc. only selected dtypes will be transform,
        """

        if isinstance(y, str):
            X = X.copy()
            y = X.pop(y)

        self.gbdt = GradientBoostingClassifier(**kwargs)
        self.gbdt.fit(X, y)

        X = self.gbdt.apply(X)
        X = X.reshape(-1, X.shape[1])

        self.onehot = OneHotEncoder().fit(X)

        return self

    def transform(self, X):
        """transform woe

        Args:
            X (DataFrame|array-like)
            default (str): 'min'(default), 'max' - the strategy to be used for unknown group

        Returns:
            array-like
        """
        X = self.gbdt.apply(X)
        X = X.reshape(-1, X.shape[1])
        res = self.onehot.transform(X).toarray()
        return res
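A hypothetical usage sketch for the transformer above. The decorators pass select_dtypes/exclude options through, and the remaining kwargs reach GradientBoostingClassifier; treat the exact call signature as an assumption read off the shown code.

import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, random_state=0)
df = pd.DataFrame(X, columns=['f%d' % i for i in range(X.shape[1])])
t = GBDTTransformer()
t.fit(df, y, n_estimators=20, max_depth=2)
leaf_onehot = t.transform(df)  # dense one-hot matrix of leaf indices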
Code Example #21
File: tree.py  Project: zhangzhenhu/zzh
    def _fit(self, dataset, **options):
        # self.param = param
        # print('model GBDT_LR fit begin:')
        # GBDT model
        grd = GradientBoostingClassifier(**self.model_params)
        grd.fit(dataset.x, dataset.y)
        #
        enc = OneHotEncoder()
        enc.fit(grd.apply(dataset.x)[:, :, 0])

        lm = LogisticRegression(penalty='l2', C=1, solver='lbfgs')
        x = enc.transform(grd.apply(dataset.x)[:, :, 0])
        lm.fit(x, dataset.y)

        self.tree = grd
        self.enc = enc
        self.m = lm
        return self
Code Example #22
    def model(self):  # x: all columns except customerID and Churn; y: Churn
        x_train = self.train[[
            x for x in self.train.columns if x not in ['customerID', 'Churn']
        ]]
        y_train = self.train['Churn']
        lr = LogisticRegression(penalty='l2',
                                tol=0.0001,
                                fit_intercept=True,
                                max_iter=20)
        gbdt = GradientBoostingClassifier(
            learning_rate=0.1, n_estimators=100,
            max_depth=7)  # learning rate 0.1, 100 boosting rounds, depth 7
        gbdt.fit(x_train, y_train)
        # gbdt.apply() returns a 3-D array, hence the reshape below
        enc = OneHotEncoder()
        enc.fit(gbdt.apply(x_train).reshape(-1, 100))
        lr.fit(enc.transform(gbdt.apply(x_train).reshape(-1, 100)), y_train)
        return lr, gbdt, enc
Code Example #23
    def train_model(self):
        print("training")
        label = "Churn"
        ID = "customerID"
        x_columns = [x for x in self.train.columns if x not in [ID, label]]
        x_train = self.train[x_columns]
        y_train = self.train[label]

        gbdt = GradientBoostingClassifier()
        gbdt.fit(x_train, y_train)

        gbdt_lr = LogisticRegression(max_iter=3000)
        enc = OneHotEncoder()
        enc.fit(gbdt.apply(x_train).reshape(-1, 100))

        gbdt_lr.fit(enc.transform(gbdt.apply(x_train).reshape(-1, 100)),
                    y_train)
        return enc, gbdt, gbdt_lr
Code Example #24
    def train_model(self):
        label = 'Churn'
        ID = 'customerID'
        x_columns = [x for x in self.train.columns if x not in [label, ID]]
        x_train = self.train[x_columns]
        y_train = self.train[label]

        gbdt = GradientBoostingClassifier()
        gbdt.fit(x_train, y_train)

        gbdt_lr = LogisticRegression()
        enc = OneHotEncoder()

        enc.fit(gbdt.apply(x_train).reshape(-1, 100))
        gbdt_lr.fit(enc.transform(gbdt.apply(x_train).reshape(-1, 100)),
                    y_train)

        return enc, gbdt, gbdt_lr
Code Example #25
    def train(X_train, y_train, params):
        # It is important to train the ensemble of trees on a different subset
        # of the training data than the linear regression model to avoid
        # overfitting, in particular if the total number of leaves is
        # similar to the number of training samples
        X_train, X_train_lr, y_train, y_train_lr = train_test_split(
            X_train, y_train, test_size=0.5)

        gbt = GradientBoostingClassifier(**params)
        gbt.fit(X_train, y_train)
        gbt_enc = OneHotEncoder()
        gbt_enc.fit(gbt.apply(X_train)[:, :, 0])

        grd_lm = LogisticRegression(max_iter=300)
        grd_lm.fit(gbt_enc.transform(gbt.apply(X_train_lr)[:, :, 0]),
                   y_train_lr)

        return grd_lm, gbt, gbt_enc
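A sketch of invoking the train() helper above on synthetic data and scoring a held-out set; the params dict and data names are assumptions, and the helper is assumed callable as a plain function despite its indentation in the source.

from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=5000, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)
lm, gbt, enc = train(X_tr, y_tr, {'n_estimators': 50, 'max_depth': 3})
proba = lm.predict_proba(enc.transform(gbt.apply(X_te)[:, :, 0]))[:, 1]
print(roc_auc_score(y_te, proba))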
Code Example #26
File: GBDT_LR.py  Project: YuliNet/CTR_personal_test
def select_feature(x_train, y_train):
    gbm = GradientBoostingClassifier(n_estimators=50, random_state=10, max_depth=8)
    gbm.fit(x_train, y_train)
    train_new_feature = gbm.apply(x_train)
    print(train_new_feature.shape)
    train_new_feature = train_new_feature.reshape(-1, 50)
    enc = OneHotEncoder()
    enc.fit(train_new_feature)
    new_feature = np.array(enc.transform(train_new_feature).toarray())
    return new_feature
Code Example #27
File: gbdt_lr_model.py  Project: jmsking/Work
	def buildModel(self, X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test):
		'''
		Build the combined model.
		Args:
			X_train_d: discrete-feature training data
			X_train_c: continuous-feature training data
			X_test_d: discrete-feature test data
			X_test_c: continuous-feature test data
			y_train: training labels in {-1, 1}
			y_test: test labels in {-1, 1}
		Returns:
			gbc_enc: OneHotEncoder fit on the GBDT leaf indices
			gbc: the GBDT model
			comb_model: the trained combined model
			threshold: decision threshold; Pred_Prob >= threshold is positive, Pred_Prob < threshold is negative
			comb_model_auc: model AUC
			precision: model precision
			recall: model recall
		'''
		if self._random_state is not None:
			gbc = GradientBoostingClassifier(n_estimators=self._n_estimators, learning_rate=self._gbdt_learning_rate, max_depth=self._max_depth, random_state=self._random_state).fit(X_train_c, y_train)
		else:
			gbc = GradientBoostingClassifier(n_estimators=self._n_estimators, learning_rate=self._gbdt_learning_rate, max_depth=self._max_depth).fit(X_train_c, y_train)
		X_train_leaves = gbc.apply(X_train_c)[:,:,0]
		X_test_leaves = gbc.apply(X_test_c)[:,:,0]
		(X_train_rows, cols) = X_train_leaves.shape
		gbc_enc = OneHotEncoder().fit(np.concatenate([X_train_leaves,X_test_leaves], axis = 0))
		X_trans = gbc_enc.transform(np.concatenate([X_train_leaves,X_test_leaves], axis = 0))
		X_train_ext = hstack([X_trans[:X_train_rows,:], X_train_d])
		X_test_ext = hstack([X_trans[X_train_rows:,:], X_test_d])
		log.debug("Combine features done.")
		comb_model = LogisticRegression().fit(X_train_ext, y_train)
		log.debug("Training done.")
		comb_model_pred = comb_model.predict_proba(X_test_ext)[:,1]
		precision, recall, thresholds = precision_recall_curve(y_test, comb_model_pred)
		ap = average_precision_score(y_test, comb_model_pred)
		recall_meet = recall >= self._recall_rate
		recall_meet_min = len([item for item in recall_meet if item == True])
		threshold = thresholds[recall_meet_min-1]
		log.debug("threshold: %f - precision: %f - recall: %f", threshold, precision[recall_meet_min-1], recall[recall_meet_min-1])
		comb_model_auc = roc_auc_score(y_test, comb_model_pred)
		log.debug("AUC score is: %f", comb_model_auc)
		return gbc_enc, gbc, comb_model, threshold, comb_model_auc, precision[recall_meet_min-1], recall[recall_meet_min-1]
Code Example #28
def train(x_train, y_train, x_train_lr, y_train_lr, solver, class_weight,
          n_estimator):
    grd = GradientBoostingClassifier(n_estimators=n_estimator)
    grd_enc = OneHotEncoder()
    # Initialize the logistic regression model.
    # class_weight sets the weight of each class. It can be omitted (all
    # classes weighted equally), set to 'balanced' to let the library compute
    # weights, or given explicitly: for a binary 0/1 model,
    # class_weight={0: 0.9, 1: 0.1} weights class 0 at 90% and class 1 at 10%.
    # With 'balanced', weights are computed from the training sample counts:
    # the more samples a class has, the lower its weight, and vice versa.
    grd_lm = LogisticRegression(solver=solver, class_weight=class_weight)

    # Train the GBDT model
    grd.fit(x_train, y_train)
    grd_enc.fit(grd.apply(x_train)[:, :, 0])
    # Train the logistic regression classifier
    grd_lm.fit(grd_enc.transform(grd.apply(x_train_lr)[:, :, 0]), y_train_lr)
    # Output the learned feature weights
    weights = grd_lm.coef_
    return weights, grd, grd_enc, grd_lm
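A hedged invocation of train() above, assuming synthetic data split into GBDT and LR halves in the convention used throughout these examples.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=4000, random_state=0)
x_tr, x_lr, y_tr, y_lr = train_test_split(X, y, test_size=0.5)
weights, grd, grd_enc, grd_lm = train(x_tr, y_tr, x_lr, y_lr,
                                      solver='lbfgs', class_weight='balanced',
                                      n_estimator=50)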
Code Example #29
def test_gbm_classifier_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Run the h2o4gpu version of the gradient boosting classifier
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run the scikit-learn version for comparison
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all() == True
        assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)
                ).all() == True
        assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all() == True
        assert (gbm.score(X, y) == gbm_sk.score(X, y)).all() == True
        assert (gbm.decision_function(X)[1] == gbm_sk.decision_function(X)[1]
                ).all() == True
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)),
                           list(gbm_sk.staged_predict_proba(X)))
        assert (gbm.apply(X) == gbm_sk.apply(X)).all() == True

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_
                ).all() == True

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all() == True
Code Example #30
def GBDT_clf(X_train, y_train, X_valid, X_test):
    from sklearn.preprocessing import OneHotEncoder
    # e.g. 10 trees of depth 3 give at most 2**3 * 10 = 80 leaf nodes in total
    gbdt = GradientBoostingClassifier(learning_rate=0.1,
                                      n_estimators=300,
                                      max_depth=6)  # previously 10 trees, depth 3
    gbdt_enc = OneHotEncoder()

    gbdt.fit(X_train, y_train)
    gbdt_enc.fit(gbdt.apply(X_train)[:, :, 0])
    # X_train = pd.DataFrame.as_matrix(X_train)   # DataFrame -> np.ndarray
    # X_valid = pd.DataFrame.as_matrix(X_valid)
    # X_test = pd.DataFrame.as_matrix(X_test)
    gbdt_train_feature = gbdt_enc.transform(gbdt.apply(X_train)[:, :, 0]).toarray()
    gbdt_valid_feature = gbdt_enc.transform(gbdt.apply(X_valid)[:, :, 0]).toarray()
    gbdt_test_feature = gbdt_enc.transform(gbdt.apply(X_test)[:, :, 0]).toarray()
    return gbdt_train_feature, gbdt_valid_feature, gbdt_test_feature
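The helper above only produces the encoded features; a hedged follow-up showing the intended LR stage (all names below are assumptions).

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(gbdt_train_feature, y_train)                       # train on encoded leaves
valid_proba = lr.predict_proba(gbdt_valid_feature)[:, 1]  # positive-class scores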
Code Example #31
def check_iris(presort, subsample, sample_weight):
    # Check consistency on dataset iris.
    clf = GradientBoostingClassifier(n_estimators=100,
                                     loss='deviance',
                                     random_state=1,
                                     subsample=subsample,
                                     presort=presort)
    clf.fit(iris.data, iris.target, sample_weight=sample_weight)
    score = clf.score(iris.data, iris.target)
    assert_greater(score, 0.9)

    leaves = clf.apply(iris.data)
    assert_equal(leaves.shape, (150, 100, 3))
Code Example #32
def test_iris():
    # Check consistency on dataset iris.
    for subsample in (1.0, 0.5):
        for sample_weight in (None, np.ones(len(iris.target))):
            clf = GradientBoostingClassifier(n_estimators=100, loss='deviance',
                                             random_state=1, subsample=subsample)
            clf.fit(iris.data, iris.target, sample_weight=sample_weight)
            score = clf.score(iris.data, iris.target)
            assert score > 0.9, "Failed with subsample %.1f " \
                "and score = %f" % (subsample, score)

            leaves = clf.apply(iris.data)
            assert_equal(leaves.shape, (150, 100, 3))
Code Example #33
def check_classification_toy(presort, loss):
    # Check classification on a toy dataset.
    clf = GradientBoostingClassifier(loss=loss, n_estimators=10,
                                     random_state=1, presort=presort)

    assert_raises(ValueError, clf.predict, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf.estimators_))

    deviance_decrease = (clf.train_score_[:-1] - clf.train_score_[1:])
    assert np.any(deviance_decrease >= 0.0)

    leaves = clf.apply(X)
    assert_equal(leaves.shape, (6, 10, 1))
Code Example #34
def test_classification_toy():
    # Check classification on a toy dataset.

    for loss in ('deviance', 'exponential'):
        clf = GradientBoostingClassifier(loss=loss, n_estimators=10,
                                         random_state=1)

        assert_raises(ValueError, clf.predict, T)

        clf.fit(X, y)
        assert_array_equal(clf.predict(T), true_result)
        assert_equal(10, len(clf.estimators_))

        deviance_decrease = (clf.train_score_[:-1] - clf.train_score_[1:])
        assert np.any(deviance_decrease >= 0.0), \
            "Train deviance does not monotonically decrease."

        leaves = clf.apply(X)
        assert_equal(leaves.shape, (6, 10, 1))
Code Example #35
# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)


# The gradient boosted model by itself
y_pred_grd = grd.predict_proba(X_test)[:, 1]
fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)


# The random forest model by itself
y_pred_rf = rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
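This block mirrors scikit-learn's "feature transformations with ensembles of trees" example; in that example the collected (fpr, tpr) pairs are then plotted along these lines (a sketch, assuming matplotlib is available).

import matplotlib.pyplot as plt

plt.figure()
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
plt.plot(fpr_grd, tpr_grd, label='GBT')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best')
plt.show()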
Code Example #36
del X_train_gbdt
del y_train_gbdt
gc.collect()

gbdt_model = pickle.load(open(fp_gbdt_model, 'rb'))
#----- data for LR (one-hot encoding of the GBDT output) -----#
id_cols = []
for i in range(1, gbdt_model.get_params()['n_estimators']+1):
    id_cols.append('tree'+str(i))
# Note: this OneHotEncoder takes column names and is fit incrementally on
# chunks below, so it is a project-specific encoder, not scikit-learn's.
oh_enc = OneHotEncoder(id_cols)

def chunker(seq, size):
    return (seq[pos: pos + size] for pos in range(0, len(seq), size))

## oh_enc fit the train_set
df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0], columns=id_cols, dtype=np.int8)

for chunk in chunker(df_train_id, 50000):
    oh_enc.fit(chunk)
    
del df_train_id

del X_train_org
del y_train_org
gc.collect()

## oh_enc fit the test_set
df_test_f = pd.read_csv(fp_test_f, 
                        index_col=None,  dtype={'id':str}, 
                        chunksize=50000, iterator=True)
Code Example #37
# Split the data set into roughly two halves;
# this keeps the stacking from overfitting
train_x1 = train_X[:30000]
train_x2 = train_X[30000:]


train_y1 = train_y[:30000]
train_y2 = train_y[30000:]

# We first use gradient boosting to transform the data, then one-hot
# encoding; after that, we fit a logistic regression on the encoded leaves
grd = GradientBoostingClassifier()
grd_enc = OneHotEncoder()
grd.fit(train_x1, train_y1)
grd_enc.fit(grd.apply(train_x1)[:,:, 0])

grd_lm = LogisticRegression(penalty = 'l2', C = .0115)


grd_lm.fit(grd_enc.transform(grd.apply(train_x2)[:,:, 0]), train_y2)



#Import test data
test_x = []
with open('test_2012.csv', 'r') as f:
	first_row = f.readline()
	headers = first_row.split(',')
	for row in f:
		ints = [int(elem) for elem in row.split(',')]
Code Example #38
# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator, verbose=1)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)


# The gradient boosted model by itself
y_pred_grd = grd.predict_proba(X_test)[:, 1]
fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)


# The random forest model by itself
y_pred_rf = rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
Code Example #39
File: HQC2.py  Project: Jilinl66/Data-Analysis-Matlab
# xt = xt[selected_feature[0:23]]

# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5)

# x_train, x_train_lr, y_train, y_train_lr = train_test_split(x_train, y_train, test_size=0.5)
x_train, x_train_lr, y_train, y_train_lr = train_test_split(x, y, test_size=0.5)

params = {'n_estimators': 1800, 'max_leaf_nodes': 4, 'max_depth': 6, 'random_state': 2,  # None
          'min_samples_split': 5, 'learning_rate': 0.1, 'subsample': 0.83}
gb = GradientBoostingClassifier(**params)
gb_encoder = preprocessing.OneHotEncoder()
lr = LogisticRegression()

gb.fit(x_train, y_train)

gb_encoder.fit(gb.apply(x_train)[:, :, 0])

lr.fit(gb_encoder.transform(gb.apply(x_train_lr)[:, :, 0]), y_train_lr)

# yhat = lr.predict_proba(gb_encoder.transform(gb.apply(x_test)[:, :, 0]))[:, 1]
yhat = lr.predict_proba(gb_encoder.transform(gb.apply(xt)[:, :, 0]))[:, 1]
yhat2 = gb.predict_proba(xt)[:, 1]
yhat3 = (np.array(yhat)+np.array(yhat2))/2
# fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, yhat)

# plt.figure()
# plt.xlim(0, 1)
# plt.ylim(0, 1)
# plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
# plt.show()
result_data = {'QuoteNumber': xt['QuoteNumber'], 'QuoteConversion_Flag': yhat3}
Code Example #40
ファイル: GBDT_LR.py プロジェクト: RamonYeung/ctrip-game6
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.3)
# The split below was commented out in the original, but X_train_lr and
# y_train_lr are used further down, so it must run.
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5)
del train


"""
GBDT+LR
"""
print("Performing GBDT+LR")
random_state = np.random.RandomState(520)
lr = LogisticRegression(random_state=random_state)
grd = GradientBoostingClassifier(n_estimators=10, random_state=random_state)

grd_enc = OneHotEncoder()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
lr.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
#probas = lr.predict_proba(grd_enc.transform(grd.apply(X_test)[:, :, 0]))
print("Predicting...")
probas_train = lr.predict_proba(grd_enc.transform(grd.apply(X_train)[:, :, 0]))
#probas_validation = lr.predict_proba(grd_enc.transform(grd.apply(X_validation)[:, :, 0]))
#fpr, tpr, thresholds = roc_curve(y_test, probas[:, 1])
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, probas_train[:, 1])
#fpr_validation, tpr_validation, thresholds_validation = roc_curve(y_validation, probas_validation[:, 1])
#roc_auc = auc(fpr, tpr)
roc_auc_train = auc(fpr_train, tpr_train)
#roc_auc_vldt = auc(fpr_validation, tpr_validation)
#plt.plot(fpr, tpr, lw=1, label='AUC = %0.4f' % (roc_auc))
print("Plotting!...")
plt.plot(fpr_train, tpr_train, lw=1, label='AUC = %0.4f' % (roc_auc_train))
#plt.plot(fpr_validation, tpr_validation, lw=1, label='AUC = %0.4f' % (roc_auc_vldt))
Code Example #41
X_test = test_data
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)
# rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

# y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]

#sixthforest
#Encoder and Logistic Regression combined with Gradient Boosting Classifier
n_estimator = 10
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)


# output = grd_lm.predict(test_data).astype(int)
#output = rf_lm.predict(rf_enc.transform(rf.apply(X_test))).astype(int)
output = grd_lm.predict(grd_enc.transform(grd.apply(X_test)[:, :, 0])).astype(int)


'''
#secondforest (in git)
#Cross Validation 
train_size = int(0.7*(train_data.shape[0]))
validation_size = train_data.shape[0] - train_size

X_train = train_data[0:train_size, 1::]