Beispiel #1
0
def xgb_offline(train_data, cv_data):
    """Train an XGBoost classifier with 5-fold CV and evaluate on a hold-out set.

    Relies on module-level globals defined elsewhere in this file:
    ``build_train_dataset``, ``rate`` (negative-downsampling rate), ``params``,
    ``n_round``, ``cal_log_loss``, ``ceate_feature_map``, plus the ``xgb``,
    ``KFold``, ``np``, ``pd`` and ``operator`` imports.

    Parameters
    ----------
    train_data : pandas.DataFrame containing an 'is_trade' label column; it is
        resampled via ``build_train_dataset`` and mutated in place (label dropped).
    cv_data : pandas.DataFrame hold-out set with an 'is_trade' column; also
        mutated in place (label dropped).

    Returns
    -------
    (df, clf) : feature-importance DataFrame (normalized fscore) and the
        Booster trained on the LAST fold only.
    """
    train_data = build_train_dataset(train_data, rate)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade'].values
    cv_Y = cv_data['is_trade'].values

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    cv_data.drop(drop_cols, axis=1, inplace=True)
    print('train shape:', train_data.shape)  # fixed typo: was 'train shap:'
    print('cv shape', cv_data.shape)

    kf = KFold(len(train_data), n_folds=5, shuffle=True, random_state=520)

    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], 5))

    # The hold-out DMatrix is identical for every fold — build it once
    # instead of once per iteration.
    test_feat = xgb.DMatrix(cv_data.values)

    for i, (train_index, cv_index) in enumerate(kf):
        print('第{}次训练...'.format(i))
        train_feat = train_data.iloc[train_index]
        cv_feat = train_data.iloc[cv_index]

        train_feat = xgb.DMatrix(train_feat.values, label=train_Y[train_index])
        cv_feat = xgb.DMatrix(cv_feat.values, label=train_Y[cv_index])
        watchlist = [(train_feat, 'train'), (cv_feat, 'val')]

        clf = xgb.train(params=params, dtrain=train_feat, num_boost_round=n_round,
                        evals=watchlist, early_stopping_rounds=7, verbose_eval=False)

        predict_train = clf.predict(train_feat)
        predict_cv = clf.predict(cv_feat)
        predict_test = clf.predict(test_feat)

        # Each sample lands in the train split of 4 of the 5 folds, hence the
        # final `train_preds / 4` average below.
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test

        print(clf.best_iteration)
        print(clf.best_score)
        print('  训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print('  测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))

    # Feature importance from the last fold's model only.
    features = train_data.columns
    ceate_feature_map(features)  # NOTE(review): helper name has a typo upstream ('ceate')
    importance = clf.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()

    # Median over the per-fold hold-out predictions, then correct the
    # probability for the negative-downsampling rate.
    predict_test = np.median(test_preds, axis=1)
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print('训练损失:', cal_log_loss(train_preds / 4, train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('验证损失:', cal_log_loss(predict_test, cv_Y))
    return df, clf
Beispiel #2
0
def lgb_offline(train_data, cv_data):
    """Train a LightGBM classifier with 5-fold CV and evaluate on a hold-out set.

    Relies on module-level globals defined elsewhere in this file:
    ``build_train_dataset``, ``rate`` (negative-downsampling rate), ``params``,
    ``cal_log_loss``, plus the ``lgb``, ``KFold``, ``np`` and ``pd`` imports.

    Parameters
    ----------
    train_data : pandas.DataFrame with an 'is_trade' label column; resampled
        via ``build_train_dataset`` and mutated in place (label dropped).
    cv_data : pandas.DataFrame hold-out set with an 'is_trade' column; also
        mutated in place (label dropped).

    Returns
    -------
    (gbm, feat_imp) : the Booster trained on the LAST fold only, and its
        feature-importance Series sorted descending.
    """
    train_data = build_train_dataset(train_data, rate)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']
    cv_Y = cv_data['is_trade']

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    cv_data.drop(drop_cols, axis=1, inplace=True)

    kf = KFold(len(train_data), n_folds=5, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], 5))
    for i, (train_index, cv_index) in enumerate(kf):

        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]

        print('第{}次训练...'.format(i))
        lgb_train = lgb.Dataset(train_feat.values, train_Y.loc[train_index])
        lgb_cv = lgb.Dataset(cv_feat.values, train_Y.loc[cv_index])
        gbm = lgb.train(params=params,
                        train_set=lgb_train,
                        num_boost_round=6000,
                        valid_sets=lgb_cv,
                        verbose_eval=False,
                        early_stopping_rounds=100)

        predict_train = gbm.predict(train_feat.values)
        predict_cv = gbm.predict(cv_feat.values)
        test_preds[:, i] = gbm.predict(cv_data.values)

        # Each sample lands in the train split of 4 of the 5 folds, hence the
        # final `train_preds / 4` average below.
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv

        # Feature importance of this fold's model (was computed twice per
        # fold in the original; the redundant first copy is removed).
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)
        print(gbm.best_iteration)
        print(gbm.best_score)
        print('   训练损失:', cal_log_loss(predict_train,
                                       train_Y.loc[train_index]))
        print('   测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))

    # Median over the per-fold hold-out predictions, then correct the
    # probability for the negative-downsampling rate.
    predict_test = np.median(test_preds, axis=1)
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print(params)
    print('训练损失:', cal_log_loss(train_preds / 4, train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('验证损失:', cal_log_loss(predict_test, cv_Y))

    return gbm, feat_imp
Beispiel #3
0
def LR_online(train_data, cv_data, test_data):
    """Train a logistic-regression model with k-fold CV and submit test predictions.

    Relies on module-level globals defined elsewhere in this file:
    ``build_train_dataset``, ``rate`` (negative-downsampling rate),
    ``cal_log_loss``, ``submmit_result``, plus the ``LogisticRegression``,
    ``KFold``, ``np`` and ``pd`` imports.

    Parameters
    ----------
    train_data, cv_data : pandas.DataFrames with an 'is_trade' label column;
        concatenated, resampled via ``build_train_dataset``, label dropped.
    test_data : pandas.DataFrame to predict on; mutated in place (label dropped).

    Returns
    -------
    None. Side effect: writes a submission via ``submmit_result``.
    """
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data = build_train_dataset(train_data, rate)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)

    fold = 5
    kf = KFold(len(train_data), n_folds=fold, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], fold))
    for i, (train_index, cv_index) in enumerate(kf):

        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]

        clf = LogisticRegression(C=1.2, fit_intercept=True, max_iter=3000,
                                 class_weight={0: 0.5, 1: 0.5})
        clf.fit(X=train_feat.values, y=train_Y[train_index])

        predict_train = clf.predict_proba(train_feat.values)[:, 1]
        predict_cv = clf.predict_proba(cv_feat.values)[:, 1]
        predict_test = clf.predict_proba(test_data.values)[:, 1]
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test

        print('  训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print('  测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))

    # Median over the per-fold test predictions, then correct the
    # probability for the negative-downsampling rate.
    predict_test = np.median(test_preds, axis=1)
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    # Each sample lands in the train split of (fold - 1) folds, so average by
    # that count (was a hard-coded `/ 4`, tied silently to fold == 5).
    print('训练损失:', cal_log_loss(train_preds / (fold - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    submmit_result(predict_test, 'LR')