Example No. 1
    def get_leaf(self):
        self.get_data("oneHot")
        n_estimators = 300
        clf_xgb = XGBClassifier(max_depth=4,
                                learning_rate=0.0125,
                                n_estimators=300,
                                subsample=0.6,
                                colsample_bytree=0.7,
                                seed=4)
        #clf_xgb = XGBClassifier(max_depth=4, n_estimators=300)
        clf_xgb.fit(self.x_train, self.y_train)

        leafes_train = list(clf_xgb.apply(self.x_train))
        leafes_test = list(clf_xgb.apply(self.x_test))

        # Pad with every leaf index from min to max so the one-hot encoding is identical for train and test
        max_train = np.array(leafes_train).max()
        min_train = np.array(leafes_train).min()

        max_test = np.array(leafes_test).max()
        min_test = np.array(leafes_test).min()
        max_value = max(max_train, max_test)
        min_value = min(min_train, min_test)
        for i in range(min_value, max_value + 1):
            leafes_train.append([i] * n_estimators)

        enc = OneHotEncoder()
        enc.fit(leafes_train)
        # Drop the padded rows
        leafes_train_feature = enc.transform(
            leafes_train).toarray()[:-(max_value - min_value + 1), :]
        print(leafes_train_feature.shape, len(leafes_train))
        return leafes_train_feature, self.y_train, enc.transform(
            leafes_test).toarray(), self.y_test
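The min/max padding above exists only to force the train and test one-hot vocabularies to match. A minimal sketch of the same encoding without the padding, assuming clf_xgb is already fitted and x_train/x_test are the raw feature matrices: OneHotEncoder(handle_unknown='ignore') simply zero-encodes any leaf index seen only at transform time.

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
leaves_train = clf_xgb.apply(x_train)  # shape (n_samples, n_estimators)
leaves_test = clf_xgb.apply(x_test)
train_feature = enc.fit_transform(leaves_train).toarray()
test_feature = enc.transform(leaves_test).toarray()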
 def fit_model_split(self, X_train, y_train, X_test, y_test):
     # X_train_1 trains the model; X_train_2 is merged with the new features to form the new training set
     X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
         X_train, y_train, test_size=0.6, random_state=0)
     clf = XGBClassifier(learning_rate=self.learning_rate,
                         n_estimators=self.n_estimators,
                         max_depth=self.max_depth,
                         min_child_weight=self.min_child_weight,
                         gamma=self.gamma,
                         subsample=self.subsample,
                         colsample_bytree=self.colsample_bytree,
                         objective=self.objective,
                         nthread=self.nthread,
                         scale_pos_weight=self.scale_pos_weight,
                         reg_alpha=self.reg_alpha,
                         reg_lambda=self.reg_lambda,
                         seed=self.seed)
     clf.fit(X_train_1, y_train_1)
     y_pre = clf.predict(X_train_2)
     y_pro = clf.predict_proba(X_train_2)[:, 1]
     print("pred_leaf=T AUC Score : %f" %
           metrics.roc_auc_score(y_train_2, y_pro))
     print("pred_leaf=T  Accuracy : %.4g" %
           metrics.accuracy_score(y_train_2, y_pre))
     new_feature = clf.apply(X_train_2)
     X_train_new2 = self.mergeToOne(X_train_2, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     print("Training set of sample size 0.4 fewer than before")
     return X_train_new2, y_train_2, X_test_new, y_test
 def fit_model(self, X_train, y_train, X_test):
     clf = XGBClassifier(learning_rate=self.learning_rate,
                         n_estimators=self.n_estimators,
                         max_depth=self.max_depth,
                         min_child_weight=self.min_child_weight,
                         gamma=self.gamma,
                         subsample=self.subsample,
                         colsample_bytree=self.colsample_bytree,
                         objective=self.objective,
                         nthread=self.nthread,
                         scale_pos_weight=self.scale_pos_weight,
                         reg_alpha=self.reg_alpha,
                         reg_lambda=self.reg_lambda,
                         seed=self.seed)
     clf.fit(X_train, y_train)
     # y_pre= clf.predict(X_test)
     # y_pro= clf.predict_proba(X_test)[:,1]
     # print("pred_leaf=T  AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))
     # print("pred_leaf=T  Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))
     new_feature = clf.apply(X_train)
     X_train_new = self.mergeToOne(X_train, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     print("Training set sample number remains the same")
     return X_train_new, y_train, X_test_new
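mergeToOne is called throughout this example but never defined here. Judging from how it is used, a plausible minimal implementation (an assumption, not the original code) simply stacks the raw features and the leaf-index features column-wise:

import numpy as np

def mergeToOne(X, X2):
    # Horizontally concatenate the original features with the new leaf features.
    return np.hstack((np.asarray(X), np.asarray(X2)))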
Example No. 4
    def feature_transformer(x_train, y_train, x_test, y_test, params):
        xgbt = XGBClassifier(**params).fit(x_train, y_train)
        gbt_enc = OneHotEncoder()
        gbt_enc.fit(xgbt.apply(x_train))
        train_x_transform = gbt_enc.transform(xgbt.apply(x_train))
        test_x_transform = gbt_enc.transform(xgbt.apply(x_test))

        # pipelineModel = PMMLPipeline([("xgboost", XGBClassifier(**params))])
        # pipelineModel.fit(x_train, y_train)
        # sklearn2pmml(pipelineModel, "/data/kongyy/ctr_online/model/xgboost_feature.pmml", with_repr=True)

        return train_x_transform, test_x_transform
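A hypothetical follow-up, not part of the original snippet: the transformed leaf features are typically fed to a linear model, the classic GBDT + LR stacking. The sketch reuses the names from the function above.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

train_x_t, test_x_t = feature_transformer(x_train, y_train, x_test, y_test, params)
lr = LogisticRegression(max_iter=1000)
lr.fit(train_x_t, y_train)  # sparse input is fine for LogisticRegression
print("stacked AUC:", roc_auc_score(y_test, lr.predict_proba(test_x_t)[:, 1]))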
Example No. 5
def gbdt_new_features(result):
    clf = XGBClassifier(
        learning_rate=0.2,  # default 0.3
        n_estimators=30,  # number of trees
        max_depth=7,
        min_child_weight=10,
        gamma=0.5,
        subsample=0.75,
        colsample_bytree=0.75,
        objective='binary:logistic',  # logistic-regression loss
        nthread=8,  # number of CPU threads
        scale_pos_weight=1,
        reg_alpha=1e-05,
        reg_lambda=10,
        seed=1024)  # random seed
    predictors = [
        i for i in result.columns if i not in [
            'orderid', 'geohashed_start_loc', 'geohashed_end_loc', 'userid',
            'bikeid', 'starttime', 'label', 'biketype', 'start_lon',
            'start_lat', 'end_lon', 'end_lat'
        ]
    ]
    old_features = result[predictors].values
    result1, result2 = train_test_split(result, test_size=0.6, random_state=0)
    del result1
    clf.fit(result2[predictors].values, result2['label'].values)
    new_features = clf.apply(old_features)
    features = mergeToOne(old_features, new_features)
    return features  # an ndarray
Example No. 6
 def fit_model(self, X_train, y_train, X_test):
     clf = XGBClassifier(learning_rate=self.learning_rate,
                         n_estimators=self.n_estimators,
                         max_depth=self.max_depth,
                         min_child_weight=self.min_child_weight,
                         gamma=self.gamma,
                         subsample=self.subsample,
                         colsample_bytree=self.colsample_bytree,
                         objective=self.objective,
                         nthread=self.nthread,
                         scale_pos_weight=self.scale_pos_weight,
                         reg_alpha=self.reg_alpha,
                         reg_lambda=self.reg_lambda,
                         seed=self.seed)
     clf.fit(X_train, y_train)
     new_feature = clf.apply(X_train)
     X_train_new = self.mergeToOne(X_train, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     print("Training set sample number remains the same")
     return X_train_new, X_test_new
Example No. 7
def runXgbStack(inputfile, outputfile):
    '''
    Input and output file paths: inputfile and outputfile.
    '''

    df_all = pd.read_csv(inputfile)
    df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:]))

    # NaNs are filled with 0 by default; a negative sentinel could be tried instead
    df_all.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_all = df_all.fillna(0)

    # Build the feature list, excluding the ID column
    features = df_all.columns[0:]
    features = list(features)
    features.remove('EID')
    label = 'TARGET'

    df_train, df_test = xtrain_and_test(df_all)

    clf = XGBClassifier(
        n_estimators=50,  # 50 trees
        learning_rate=0.05,
        max_depth=7,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        seed=91)

    X_train = df_train[features]
    Y_label = df_train[label]
    X_test = df_test[features]

    clf.fit(X_train, Y_label, eval_metric='auc', verbose=5)
    column = ['STACKFEATURE' + str(i) for i in range(50)]
    df_new_feature = pd.DataFrame(clf.apply(df_all[features]), columns=column)
    df_all[column] = df_new_feature

    # One-hot encode the fused leaf-index features
    for feature in column:
        df_all[feature] = df_all[feature].astype(np.int32)
        df_tmp = pd.get_dummies(df_all[feature], prefix=feature)
        df_all[df_tmp.columns] = df_tmp
        df_all.drop(feature, axis=1, inplace=True)

    df_all.to_csv(outputfile, index=False, index_label=False)
    del df_train, df_test, df_all
    return outputfile
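A hypothetical invocation (the file paths are placeholders): the function writes the augmented feature file and returns its path.

out = runXgbStack('features_all.csv', 'features_with_stack.csv')
df_stacked = pd.read_csv(out)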
Example No. 8
 def xgb_fit(self, data_x, data_y, all_data):
     new_xgb = XGBClassifier(
         n_estimators=self.n_estimators,  # the lgb package could be used instead (boosting='gbdt')
         random_state=10,
         gamma=self.gamma,
         max_depth=self.max_depth,
         min_child_weight=self.min_child_weight,
         base_score=self.base_score,
         colsample_bytree=0.5)
     new_xgb.fit(data_x, data_y)
     # alldata_x = pd.concat([data_x, test_x])
     train_new_fea = new_xgb.apply(all_data)  # leaf index of every sample in each tree
     train_new_fea = train_new_fea.reshape(-1, self.n_estimators)
     enc = OneHotEncoder()
     enc.fit(train_new_fea)  # each tree contributes its own one-hot block
     train_new_embeddings = np.array(enc.transform(train_new_fea).toarray())
     with open('xgblr_xgb.pkl', 'wb') as f:
         pickle.dump(new_xgb, f)
     return train_new_embeddings  # the new input features
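Only the booster is pickled above; to reproduce the embedding at inference time, the fitted OneHotEncoder has to be persisted as well. A minimal sketch, where the file name xgblr_enc.pkl and the matrix new_data are illustrative:

with open('xgblr_enc.pkl', 'wb') as f:
    pickle.dump(enc, f)

# Later, at inference time:
with open('xgblr_xgb.pkl', 'rb') as f:
    xgb_model = pickle.load(f)
with open('xgblr_enc.pkl', 'rb') as f:
    enc = pickle.load(f)
leaves = xgb_model.apply(new_data).reshape(-1, xgb_model.n_estimators)
new_embeddings = enc.transform(leaves).toarray()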
Example No. 9
clf = XGBClassifier(  # opening reconstructed; this snippet starts mid-call
    learning_rate=0.2,  # default 0.3
    n_estimators=200,  # number of trees
    max_depth=8,
    min_child_weight=10,
    gamma=0.5,
    subsample=0.75,
    colsample_bytree=0.75,
    objective='binary:logistic',  # logistic-regression loss
    nthread=8,  # number of CPU threads
    scale_pos_weight=1,
    reg_alpha=1e-05,
    reg_lambda=10,
    seed=1024)  # random seed

clf.fit(X_train_1, y_train_1)
new_feature = clf.apply(X_train_2)

X_train_new2 = mergeToOne(X_train_2, new_feature)
new_feature_test = clf.apply(X_test)
X_test_new = mergeToOne(X_test, new_feature_test)

model = XGBClassifier(
    learning_rate=0.05,  # default 0.3
    n_estimators=300,  # number of trees
    max_depth=7,
    min_child_weight=1,
    gamma=0.5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',  # logistic-regression loss
    nthread=8)  # number of CPU threads; the remaining arguments are truncated in the source
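The snippet cuts off here. A plausible continuation, assuming y_train_2 and y_test hold the labels matching X_train_new2 and X_test_new, retrains on the augmented features and scores the held-out test set:

model.fit(X_train_new2, y_train_2)
y_pro = model.predict_proba(X_test_new)[:, 1]
print("stacked AUC Score: %f" % metrics.roc_auc_score(y_test, y_pro))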
Example No. 10
print("sklearn AUC Score: %f" % metrics.roc_auc_score(y_test, y_sklearn))  # opening of this call truncated in the source; reconstructed

print "原始train大小:", X_train.shape
print "原始test大小:", X_test.shape

# New features generated by XGBoost's native (Booster) interface
train_new_feature = model_bst.predict(d_train, pred_leaf=True)
test_new_feature = model_bst.predict(d_test, pred_leaf=True)
train_new_feature1 = DataFrame(train_new_feature)
test_new_feature1 = DataFrame(test_new_feature)

print "新的特征集(自带接口):", train_new_feature1.shape
print "新的测试集(自带接口):", test_new_feature1.shape

# New features generated by the sklearn interface
train_sklearn_new_feature = clf.apply(X_train)  # leaf index of each sample in every tree
test_sklearn_new_feature = clf.apply(X_test)

train_new_feature2 = DataFrame(train_sklearn_new_feature)
test_new_feature2 = DataFrame(test_sklearn_new_feature)

print "新的特征集(sklearn接口):", train_new_feature2.shape
print "新的测试集(sklearn自带接口):", test_new_feature2.shape

# Train on the new features from the native interface
new_feature1 = clf.fit(train_new_feature1, y_train)
y_new_feature1 = clf.predict_proba(test_new_feature1)[:, 1]
# Train on the new features from the sklearn interface
new_feature2 = clf.fit(train_new_feature2, y_train)
y_new_feature2 = clf.predict_proba(test_new_feature2)[:, 1]
clf = XGBClassifier(  # opening and earlier arguments truncated in the source
                    colsample_bytree=0.7,
                    objective='binary:logistic',
                    scale_pos_weight=1.002252816020025,
                    reg_alpha=0.3,
                    reg_lambda=0.1,
                    seed=27)

# train the values
model_sklearn = clf.fit(X_train, Y_train)
y_bst = model_sklearn.predict_proba(X_test)[:, 1]
metrics_spec(Y_train, model_sklearn.predict_proba(X_train)[:, 1])
metrics_spec(Y_test, y_bst)

# make new features
# the leaf indices, one-hot encoded, give a sparse input for stacking
train_new_feature = clf.apply(X_train)
test_new_feature = clf.apply(X_test)
enc = OneHotEncoder()
enc.fit(train_new_feature)
train_new_feature2 = np.array(enc.transform(train_new_feature).toarray())
test_new_feature2 = np.array(enc.transform(test_new_feature).toarray())
res_data = pd.DataFrame(np.c_[Y_train, train_new_feature2])
res_data.columns = ['f' + str(x) for x in range(res_data.shape[1])]
res_test = pd.DataFrame(np.c_[Y_test, test_new_feature2])
res_test.columns = ['f' + str(x) for x in range(res_test.shape[1])]

# Stack a model on top: logistic regression, an FM, or a neural network can all work surprisingly well here.
# Caveats for the stacking model are covered in the article mentioned at the top of the code.
lr = LogisticRegression(C=1,
                        penalty='l2',
                        max_iter=100)  # the remaining arguments are truncated in the source
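res_data keeps the label in its first column (f0) and the one-hot leaf features in the rest, so a plausible way to finish the stacking step that the source truncates is:

feat_cols = res_data.columns[1:]  # f1, f2, ... are the leaf features
lr.fit(res_data[feat_cols], res_data['f0'])
y_stack = lr.predict_proba(res_test[feat_cols])[:, 1]
metrics_spec(res_test['f0'], y_stack)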
clf = XGBClassifier(  # opening reconstructed; this snippet starts mid-call
    learning_rate=0.2,  # default 0.3
    n_estimators=10,  # number of trees
    max_depth=8,
    min_child_weight=10,
    gamma=0.5,
    subsample=0.75,
    colsample_bytree=0.75,
    objective='binary:logistic',  # logistic-regression loss
    nthread=8,  # number of CPU threads
    scale_pos_weight=1,
    reg_alpha=1e-05,
    reg_lambda=10,
    seed=1024)  # random seed

clf.fit(X, y)
new_feature_X = clf.apply(X)
new_feature_Y = clf.apply(Y)
print(new_feature_X)
print(new_feature_Y)
shape_x = np.shape(new_feature_X)
shape_y = np.shape(new_feature_Y)
print('shape of train: ', shape_x)
print('shape of test: ', shape_y)

new_feature_X = pd.DataFrame(new_feature_X)
new_feature_Y = pd.DataFrame(new_feature_Y)
new_feature_X.to_csv(data_path + 'train_Drop_Delete_Log_Ratio_Label_GBDT1.csv')
new_feature_Y.to_csv(data_path + 'test_Drop_Delete_Log_Ratio_Label_GBDT1.csv')

#submission = pd.DataFrame({'pred': new_feature_X.mean(axis=1)})
#submission.to_csv(r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')), header=None, index=False, float_format='%.3f')
Example No. 13
def runXgbStack(inputfile, outputfile):
    '''
    Input and output file paths: inputfile and outputfile.
    '''

    df_all = pd.read_csv(inputfile)
    df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:]))

    # NaNs are filled with 0 by default; a negative sentinel could be tried instead
    df_all.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_all = df_all.fillna(0)

    # Build the feature list, excluding the ID column
    features = df_all.columns[0:]
    features = list(features)
    features.remove('EID')
    label = 'TARGET'

    clf = XGBClassifier(
        n_estimators=50,  # 50 trees
        learning_rate=0.05,
        max_depth=7,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        seed=91)

    df_all_prov11, df_all_prov12 = split_data_with_prov(df_all)

    ###################### prov == 11
    df_train11, df_test11 = xtrain_and_test(df_all_prov11)

    X_train11 = df_train11[features]
    Y_label11 = df_train11[label]
    X_test11 = df_test11[features]

    clf.fit(X_train11, Y_label11, eval_metric='auc', verbose=5)
    column = ['STACKFEATURE' + str(i) for i in range(50)]
    df_new_feature11 = pd.DataFrame(clf.apply(df_all_prov11[features]),
                                    columns=column)
    df_all_prov11[column] = df_new_feature11

    ####################### prov == 12
    df_train12, df_test12 = xtrain_and_test(df_all_prov12)
    X_train12 = df_train12[features]
    Y_label12 = df_train12[label]
    X_test12 = df_test12[features]

    clf.fit(X_train12, Y_label12, eval_metric='auc', verbose=5)
    column = ['STACKFEATURE' + str(i) for i in range(50)]
    df_new_feature12 = pd.DataFrame(clf.apply(df_all_prov12[features]),
                                    columns=column)
    df_all_prov12[column] = df_new_feature12

    df_all = pd.concat([df_all_prov11, df_all_prov12])  # DataFrame.append is removed in pandas 2.x

    df_all.to_csv(outputfile, index=False, index_label=False)
    del df_all_prov11, df_all_prov12, df_all
    return outputfile
Example No. 14
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import OneHotEncoder

tree_model = XGBClassifier(
    silent=0,  # 1 suppresses runtime messages; best left at 0 (deprecated in newer xgboost, use verbosity instead)
    # nthread=4,  # number of CPU threads; defaults to the maximum available
    learning_rate=0.3,  # step-size shrinkage, i.e. the learning rate
    min_child_weight=1,
    # Defaults to 1: the minimum sum of instance Hessians (h) required in a leaf.
    # For imbalanced 0-1 classification, if h is around 0.01, min_child_weight=1
    # means a leaf needs at least roughly 100 samples. This parameter strongly
    # affects the result: the smaller it is, the easier it is to overfit.
    max_depth=5,  # tree depth; deeper trees overfit more easily
    gamma=0,  # minimum loss reduction to split a leaf further; larger is more conservative, typically 0.1 or 0.2
    subsample=1,  # subsample ratio of the training instances
    max_delta_step=0,  # maximum delta step allowed for each tree's weight estimate
    colsample_bytree=1,  # column subsampling ratio when building each tree
    reg_lambda=1,  # L2 regularization on weights; larger values make the model less prone to overfitting
    # reg_alpha=0,  # L1 regularization term
    # scale_pos_weight=1,  # values > 0 help convergence on imbalanced data by balancing positive/negative weights
    # objective='multi:softmax',  # multi-class objective
    # num_class=10,  # number of classes, used together with multi:softmax
    n_estimators=20,  # number of trees
    seed=1000  # random seed
    # eval_metric='auc'
)

tree_model.fit(train, train_label)
code_tree = tree_model.apply(test)
tm_enc = OneHotEncoder(categories='auto')
ss = tm_enc.fit_transform(code_tree)
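One caveat with the last two lines: the encoder is fit on the test leaves only, so leaf indices that occur only in training would be unknown to it. A safer sketch fits the encoder on the training leaves and keeps the matrices sparse for a downstream linear model:

from sklearn.linear_model import LogisticRegression

tm_enc = OneHotEncoder(handle_unknown='ignore')
train_codes = tm_enc.fit_transform(tree_model.apply(train))
test_codes = tm_enc.transform(tree_model.apply(test))
lr = LogisticRegression(max_iter=1000)
lr.fit(train_codes, train_label)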