Example #1
File: GBDT.py Project: hadoop73/sklearn
def gbdt(train, target, test, n):

    from sklearn.ensemble import GradientBoostingRegressor

    # 'ls' is the least-squares loss (renamed 'squared_error' in newer sklearn)
    clf = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, subsample=0.4,
                                    max_depth=5, random_state=0, loss='ls')

    from ROC import ROC, ROC2  # project-local helpers (sketched below)
    print("delete {} feature".format(-1 * n))
    (model, ks) = ROC(clf, train, target)

    # alternative: (model, ks) = ROC2(clf, train, target)

    result = model.predict(test)
    writeDatas(result, test, "bn{}".format(ks))  # project-local helper (sketched below)
Example #2
File: GBDT.py Project: hadoop73/sklearn
def gbdt_a(n_estimators=300, rate=0.1, max_depth=5, rand_state=0, name='train_data_5'):

    train, target, test = getDatas(name)  # project-local data loader
    print("data : {}".format(name))
    print(train.shape)
    from sklearn.ensemble import GradientBoostingRegressor

    clf = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=rate,
                                    max_depth=max_depth, random_state=rand_state, loss='ls')

    from ROC import ROC
    logger.info("Datas name: %s", name)
    logger.info("n_estimators= %s rate= %s max_depth= %s rand_state= %s",
                n_estimators, rate, max_depth, rand_state)

    (model, ks) = ROC(clf, train, target)
    result = model.predict(test)
    writeDatas(result, test, "{}".format(ks))
Example #3
def XGBoost_part(dtrain=None,
                 test=None,
                 dtest_X=None,
                 test_y=None,
                 k=0,
                 gamma=0.02,
                 min_child_weight=1.1,
                 max_depth=5,
                 lamda=100,
                 subsamp=0.7,
                 col_bytree=0.7,
                 col_bylevel=0.7,
                 eta=0.01):

    param = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'max_depth': max_depth,
        'lambda': lamda,
        'subsample': subsamp,
        'colsample_bytree': col_bytree,
        'colsample_bylevel': col_bylevel,
        'eta': eta,
        'tree_method': 'exact',
        'seed': 0,
        'nthread': 12
    }
    cv_log = xgb.cv(param,
                    dtrain,
                    num_boost_round=3500,
                    nfold=5,
                    early_stopping_rounds=50,
                    seed=0)
    num_round = cv_log.shape[0]
    cf = './featurescore/cvg{}.csv'.format(str(num_round))
    cv_log.to_csv(cf)
    watchlist = [(dtrain, 'train')]
    #auc = cv_log['test-auc-mean'].max()
    bst = xgb.train(param,
                    dtrain,
                    num_round,
                    evals=watchlist,
                    early_stopping_rounds=50)
    # make prediction
    dtest = xgb.DMatrix(test, missing=-9999)
    preds = bst.predict(dtest)

    # score the held-out evaluation matrix (dtest_X) that test_y labels
    scores = bst.predict(dtest_X, ntree_limit=bst.best_ntree_limit)
    fp, tp, thresholds = metrics.roc_curve(test_y, scores, pos_label=1)
    ks = KS(y=test_y, score=scores)
    kk = int(ks * 10000000000) % 1000  # short numeric tag derived from K-S, used in file names
    print("K-S:{}".format(ks))
    print("AUC:{}".format(metrics.auc(fp, tp)))

    with open('./featurescore/a.txt', 'a') as f:
        S = "gamma= "+str(gamma)+\
         "  min_child_weight= "+str(min_child_weight)+\
         "  max_depth= "+str(max_depth)+\
         "  lamda= "+str(lamda)+\
         "\n" + \
         "subsamp= "+str(subsamp)+\
         "  col_bytree= "+str(col_bytree)+\
         "  col_bylevel= "+str(col_bylevel)+\
         "  eta= "+str(eta) + \
         "  ntree= "+str(bst.best_ntree_limit)+ \
         "\nfeatures scores: " + str(kk)
        f.writelines("{}\n".format(S))
        f.writelines("K-S:{}\n".format(ks))
        f.writelines("AUC:{}\n\n".format(metrics.auc(fp, tp)))
        #f.writelines("AUC:{}\n\n".format(metrics.auc(fp, tp)))
    # write predictions to file
    writeDatas(preds, test, "xgk{}".format(str(ks)))

    # get feature score
    feature_score = bst.get_fscore()
    feature_score = sorted(feature_score.items(),
                           key=lambda x: x[1],
                           reverse=True)
    fs = []

    for (key, value) in feature_score:
        fs.append("{0},{1}\n".format(key, value))

    print "features scores:", kk
    ff = './featurescore/feature_score_{0}.csv'.format(kk)
    with open(ff, 'w') as f:
        f.writelines("feature,score\n")
        f.writelines(fs)
    return kk
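KS is another helper that is not shown. Example #4 below computes the same statistic inline as np.max(tp - fp), so a minimal sketch consistent with the call sites here (the body is an assumption) would be:

def KS(y=None, score=None):
    # Kolmogorov-Smirnov statistic: the largest gap between the cumulative
    # true-positive and false-positive rates over all score thresholds.
    fp, tp, _ = metrics.roc_curve(y, score, pos_label=1)
    return np.max(tp - fp)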
Example #4
def XGBoost_(dtrain=None,
             test=None,
             dtest_X=None,
             test_y=None,
             k=0,
             gamma=0.1,
             min_child_weight=1.1,
             max_depth=5,
             lamda=10,
             subsamp=0.7,
             col_bytree=0.7,
             col_bylevel=0.7,
             eta=0.01):

    param = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'max_depth': max_depth,
        'lambda': lamda,
        'subsample': subsamp,
        'colsample_bytree': col_bytree,
        'colsample_bylevel': col_bylevel,
        'eta': eta,
        'tree_method': 'exact',
        'seed': 0,
        'nthread': 12
    }
    num_round = 1500
    watchlist = [(dtrain, 'train')]
    bst = xgb.train(param, dtrain, num_round, evals=watchlist)
    # make prediction
    dtest = xgb.DMatrix(test)
    preds = bst.predict(dtest)

    scores = bst.predict(dtest_X)

    fp, tp, thresholds = metrics.roc_curve(test_y, scores, pos_label=1)
    ks = np.max(tp - fp)  # K-S statistic: largest gap between TPR and FPR
    print("K-S:{}".format(ks))
    print("AUC:{}".format(metrics.auc(fp, tp)))

    with open('./featurescore/a.txt', 'a') as f:
        S = "gamma= "+str(gamma)+\
         "  min_child_weight= "+str(min_child_weight)+\
         "  max_depth= "+str(max_depth)+\
         "  lamda= "+str(lamda)+\
         "  subsamp= "+str(subsamp)+\
         "  col_bytree= "+str(col_bytree)+\
         "  col_bylevel= "+str(col_bylevel)+\
         "  eta= "+str(eta)
        f.writelines("{}\n".format(S))
        f.writelines("K-S:{}\n".format(ks))
        f.writelines("AUC:{}\n\n".format(metrics.auc(fp, tp)))

    # write predictions to file
    writeDatas(preds, test, "xg{}".format(str(min_child_weight) + str(ks)))

    # get feature score
    feature_score = bst.get_fscore()
    feature_score = sorted(feature_score.items(),
                           key=lambda x: x[1],
                           reverse=True)
    fs = []

    ft = []

    for (key, value) in feature_score:
        fs.append("{0},{1}\n".format(key, value))
        if value >= 10:  # keep only features with an importance score of at least 10
            ft.append(key)

    with open('./featurescore/feature_score5_{0}.csv'.format(k), 'w') as f:
        f.writelines("feature,score\n")
        f.writelines(fs)
    return ft
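Because XGBoost_ returns the names of features whose importance score reached 10, it can double as a crude feature-selection pass. Below is a sketch of a two-stage run, assuming train, train_y, and test_X are the pandas objects the DMatrix inputs were built from (those variable names are illustrative, not from the source):

# First pass on all features; keep the columns that scored >= 10.
ft = XGBoost_(dtrain=dtrain, test=test, dtest_X=dtest_X, test_y=test_y, k=0)

# Rebuild every matrix from the surviving columns before retraining.
dtrain2 = xgb.DMatrix(train[ft], label=train_y, missing=-9999)
dtest_X2 = xgb.DMatrix(test_X[ft], label=test_y, missing=-9999)
XGBoost_(dtrain=dtrain2, test=test[ft], dtest_X=dtest_X2, test_y=test_y, k=1)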
Example #5
File: XGBoost.py Project: hadoop73/sklearn
def XGBoost_(train=None,
             y=None,
             test=None,
             dtest_X=None,
             test_y=None,
             k=0,
             num_round=3500,
             gamma=0.02,
             min_child_weight=1.1,
             max_depth=5,
             lamda=10,
             scale_pos_weight=3,
             subsamp=0.7,
             col_bytree=0.7,
             col_bylevel=0.7,
             eta=0.01,
             file="aac"):

    param = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        #'eval_metric':'auc',
        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'max_depth': max_depth,
        'lambda': lamda,
        'subsample': subsamp,
        'colsample_bytree': col_bytree,
        'colsample_bylevel': col_bylevel,
        'eta': eta,
        'tree_method': 'exact',
        'seed': 0,
        'nthread': 12
    }
    with open('./test/a{}.txt'.format(file), 'a') as f:
        S = "gamma= " + str(gamma) + \
         " scale_pos_weight= " + str(scale_pos_weight) + \
         "  min_child_weight= " + str(min_child_weight) + \
         "  max_depth= " + str(max_depth) + \
         "  lamda= " + str(lamda) + \
         "\n" + \
         "subsamp= " + str(subsamp) + \
         "  col_bytree= " + str(col_bytree) + \
         "  col_bylevel= " + str(col_bylevel) + \
         "  eta= " + str(eta)
        f.writelines("{}\n".format(S))
    dtrain = xgb.DMatrix(train, label=y, missing=-9999)
    #cv_log = xgb.cv(param, dtrain,show_stdv=True,verbose_eval=1,feval=evalerror,num_boost_round=3500, nfold=5,early_stopping_rounds=10, seed=0)
    #num_round = 21#cv_log.shape[0]
    #cf = './featurescore/acvg{}.csv'.format(str(num_round))
    #cv_log.to_csv(cf)

    watchlist = [(dtrain, 'train'), (dtest_X, 'eval')]
    #auc = cv_log['test-auc-mean'].max()
    bst = xgb.train(param,
                    dtrain,
                    num_round,
                    watchlist,
                    maximize=True,
                    feval=evalerror,
                    early_stopping_rounds=50)
    # make prediction
    dtest = xgb.DMatrix(test, missing=-9999)
    preds = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
    p = bst.predict(dtrain, ntree_limit=bst.best_ntree_limit)

    scores = bst.predict(dtest_X, ntree_limit=bst.best_ntree_limit)
    fp, tp, thresholds = metrics.roc_curve(test_y, scores, pos_label=1)
    auc = metrics.auc(fp, tp)
    ks = KS(y=test_y.label, pred=scores)
    kk = int(ks * 10000000000) % 10000  # short numeric tag derived from K-S, used in file names
    print("K-S:{}".format(ks))
    print("AUC:{}".format(auc))

    with open('./test/a{}.txt'.format(file), 'a') as f:
        S =  "  best_ntree_limit:" + str(bst.best_ntree_limit) + \
          "   best_iteration= "+str(bst.best_iteration)+ \
         "\nfeatures scores: " + str(kk)
        f.writelines("{}\n".format(S))
        f.writelines("K-S:{}\n".format(ks))
        f.writelines("AUC:{}\n\n".format(metrics.auc(fp, tp)))

    res = writeDatas(preds, test, "xgk_{}".format(str(kk)))

    res.columns = ['label' + str(kk)]
    y['label' + str(kk)] = p
    y = pd.concat([y, res])
    y.drop('label', axis=1, inplace=True)
    y = y.reset_index()
    try:
        # merge with predictions accumulated from earlier runs, if any
        ypred = pd.read_csv("./test/y/a{}.csv".format(file))
        y = pd.merge(y, ypred, on='userid')
    except IOError:  # first run: no accumulated file yet
        pass
    finally:
        y.to_csv("./test/y/a{}.csv".format(file), index=None)

    # get feature score
    feature_score = bst.get_fscore()
    feature_score = sorted(feature_score.items(),
                           key=lambda x: x[1],
                           reverse=True)
    fs = []
    for (key, value) in feature_score:
        fs.append("{0},{1}\n".format(key, value))
    print "features scores:", kk
    ff = './test/feature_score_{0}.csv'.format(kk)
    with open(ff, 'w') as f:
        f.writelines("feature,score\n")
        f.writelines(fs)
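Example #5 disables the built-in eval_metric and instead passes feval=evalerror with maximize=True, so evalerror is a custom metric that xgb.train drives upward. Its body is not shown; given the K-S focus of the surrounding code, a plausible sketch (an assumption, not the project's actual code) is:

def evalerror(preds, dtrain):
    # Custom xgboost evaluation function: report the K-S statistic on the
    # current evaluation set; maximize=True in xgb.train means early
    # stopping tracks its highest value.
    labels = dtrain.get_label()
    fp, tp, _ = metrics.roc_curve(labels, preds, pos_label=1)
    return 'ks', float(np.max(tp - fp))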
Example #6
                                            random_state=0)

                rf.fit(train_X, train_y)

                score = rf.predict_proba(test_X)[:, 1]

                fp, tp, thresholds = metrics.roc_curve(test_y.values,
                                                       score,
                                                       pos_label=1)
                ks = KS(y=test_y, score=score)
                print "K-S:{}".format(ks)
                print "AUC:{}".format(metrics.auc(fp, tp))

                ans = rf.predict_proba(test)[:, 1]

                with open('./featurescore/a.txt', 'a') as f:
                    S = "criterion= " + str(c) + \
                        "  n_estimators= " + str(n) + \
                        "  max_depth= " + str(md)
                    f.writelines("{}\n".format(S))
                    f.writelines("K-S:{}\n".format(ks))
                    f.writelines("AUC:{}\n\n".format(metrics.auc(fp, tp)))

                writeDatas(ans, test, "rf{}".format(str(ks)))
            except Exception:  # log the failing parameter combination and continue
                S = "criterion= " + str(c) + \
                    "  n_estimators= " + str(n) + \
                    "  max_depth= " + str(md)
                print "Eorr", S
                pass
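Example #6 is cut off above its opening lines. From the names it uses (rf, the parameters c, n, md, train_X/train_y, and the try/except), the missing context was presumably a random-forest grid search along these lines; the classifier choice and grid values below are assumptions:

from sklearn.ensemble import RandomForestClassifier

for c in ('gini', 'entropy'):
    for n in (100, 300, 500):
        for md in (5, 8, 12):
            try:
                rf = RandomForestClassifier(criterion=c,
                                            n_estimators=n,
                                            max_depth=md,
                                            random_state=0)
                # ... evaluation body as in Example #6 above ...
            except Exception:
                print("Error",
                      "criterion= {}  n_estimators= {}  max_depth= {}".format(c, n, md))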