Code Example #1
    def fit_all(self, X, y, num_trees, feature_names):
        print(self.params)
        print('XGBoost: training ... ')
        d_train = xgb.DMatrix(X, label=y)
        watchlist = [(d_train, 'train')]
        self.gbm = xgb.train(self.params,
                             d_train,
                             evals=watchlist,
                             num_boost_round=num_trees,
                             verbose_eval=self.params['verbose_eval'])

        try:
            xgbfir.saveXgbFI(self.gbm)
        except:
            pass

        modelname_new = self.modelname.split('.pkl')[0] + 'alldata' + '.pkl'
        print('Saving to', modelname_new, '...')
        with open(MODELS_FOLDER + '/' + modelname_new, 'wb') as fout:
            pickle.dump(self.gbm, fout)

        if self.show_importance:
            xgb.plot_importance(self.gbm, importance_type='weight')
            plt.show()

            xgb.plot_importance(self.gbm, importance_type='gain')
            plt.show()

            xgb.plot_importance(self.gbm, importance_type='cover')
            plt.show()

        return self.gbm, modelname_new
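A note on the bare xgbfir.saveXgbFI(self.gbm) call above: with no arguments it falls back to xgbfir's defaults (output file name and the booster's generic f0, f1, ... feature names). Below is a minimal sketch of reloading the pickled booster later and exporting with explicit names; the file names are illustrative, not from the original project.

import pickle

import xgbfir

# Illustrative file names only; MODELS_FOLDER and feature_names are assumed to be
# the same objects used in fit_all() above.
with open(MODELS_FOLDER + '/' + 'model_alldata.pkl', 'rb') as fin:
    gbm = pickle.load(fin)

xgbfir.saveXgbFI(gbm,
                 feature_names=feature_names,
                 OutputXlsxFile='model_alldata_FI.xlsx')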
Code Example #2
File: genxgboost_feateng.py  Project: hoihui/allstate
 def score(**params):
     global featimp
     for k in params.keys():
       if k in discreteP:
         params[k]=int(params[k])
     featimpmean=gen_featimpmean()
     chosen_feat=[]
     while len(chosen_feat)<min(params['ncols'],featimp.shape[1]):
       candfeat=weighted_featimp(chosen_feat).fillna(featimpmean.fillna(1.))
       candfeat=candfeat.fillna(1./featimp.shape[1])
       candfeat=candfeat.replace(0,candfeat[candfeat>0].min())
       theone = np.random.choice(candfeat.index,p=candfeat.values/np.sum(candfeat.values))
       chosen_feat.append( theone )
     chosen_feat=list(set(col_keep(chosen_feat)+musthave))
     p=xgbparams.copy()
     p.update(params)
     skip = sorted(random.sample(xrange(1,Nrows+1),Nrows-nrows))
     if args.preload: train = train_exp.ix[:,chosen_feat+['loss']]
     else: train = pd.read_csv(path+"train_exp.csv",index_col=0,usecols=['id','loss']+chosen_feat,skiprows=skip )
     print train.shape
     train_y = yforw(train['loss'],params)
     train_x = train.drop('loss',1)
     y_pred = 0*train_y
     fscores=dict((el,0) for el in chosen_feat) 
     for train_idx, val_idx in kftune.split(train_x):
       X_train, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
       y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
       d_train = xgboost.DMatrix(X_train, label=y_train)
       d_valid = xgboost.DMatrix(X_val, label=y_val)
       model = xgboost.train(p,
                             d_train,
                             num_boost_round=100000,
                             evals=[(d_valid, 'eval')],
                             early_stopping_rounds=patience,
                             feval=es_eval,
                             # obj=fair_obj,
                             verbose_eval=False)
       y_pred.iloc[val_idx]=model.predict(d_valid,ntree_limit=model.best_ntree_limit)
       xgbfir.saveXgbFI(model,OutputXlsxFile=tmpfile,TopK=p_range['ncols'][1])
       time.sleep(5)
       fi=pd.read_excel(tmpfile,index_col=0)
       fscore = fi['Expected Gain'].to_dict() #Gain, FScore, wFScore, Average wFScore, Average Gain, Expected Gain
       meanscore = np.average(fscore.values())
       for k in fscore.keys(): fscore[k]/=meanscore*len(fscore)
       featimpmean=featimpmean.fillna(1./featimp.shape[1])
       normalization = featimpmean[chosen_feat].sum()/featimpmean.sum()/np.sum(fscore.values())/kftune.get_n_splits()
       for k,v in fscore.iteritems():
         fscores[k]+=normalization*v
     curscore = -mean_absolute_error(yback(y_pred,params),yback(train_y,params))
     featimp = featimp.append(pd.Series(fscores,name=round(curscore,4)))
     return curscore
Code Example #3
File: genxgb2.py  Project: hoihui/numer.ai
def score(**params):
      global featimp
      for k in params.keys():
        params[k]=p_range[k][0]*(1-params[k])+p_range[k][1]*params[k]
        if k in discreteP: params[k]=int(round(params[k]))
      featimpmean=gen_featimpmean(featimp)
      if random.random()<.3 or featimp.shape[0]<5:
        chosen=[]
        while len(chosen)<min(params['ncols'],featimp.shape[1]):
          candfeat=weighted_featimp(featimp.iloc[:-resetrows],chosen).fillna(featimpmean.fillna(1.))
          candfeat=candfeat.fillna(1./featimp.shape[1])
          candfeat=candfeat.replace(0,candfeat[candfeat>0].min())
          theone = np.random.choice(candfeat.index,p=candfeat.values/np.sum(candfeat.values))
          chosen.append( theone )
        chosen=list(set(chosen+musthave))
      else:
        cols={k:[vk for vk,imp in v.iteritems() if imp>0] for k,v in featimp.T.to_dict().iteritems()}
        estimatedfeatimp=featimp_from_cols(cols)
        chosen=list(np.random.choice(estimatedfeatimp.keys(),min(params['ncols'],len(filter(None,estimatedfeatimp.values()))),replace=False,p=estimatedfeatimp.values()))
        
      params['colsample_bytree']=max(2./len(chosen),params['colsample_bytree'])
      
      xgbp=xgbparams.copy()
      xgbp.update(params)
      fscores=dict((el,0) for el in chosen)
      if args.verbose: print 'generate_train_x',len(chosen)
      train_x = generate_train_x(chosen,train,extra_y[target],test)
      train_y = extra_y[target]
      if args.verbose: print train_x.shape
      
      d_tr = xgb.DMatrix(train_x, label=train_y)
      cv = xgb.cv(xgbp,d_tr,nfold=8,
                  num_boost_round=100000,early_stopping_rounds=patience,
                  verbose_eval=args.verbose and 50, show_stdv=False)
      s = cv.iloc[-1,0] #cv columns: ['test-rmse-mean', 'test-rmse-std', 'train-rmse-mean','train-rmse-std']
      
      model = xgb.train(xgbp,d_tr,num_boost_round=cv.shape[0],verbose_eval=args.verbose and 50) #solely for feature importances
      try:
        xgbfir.saveXgbFI(model,OutputXlsxFile=tmpfile,TopK=len(chosen))
        fi=pd.read_excel(tmpfile,index_col=0)
        fscore = fi['Expected Gain'].to_dict() #Gain, FScore, wFScore, Average wFScore, Average Gain, Expected Gain
      except: fscore = model.get_score(importance_type='gain')
      featimpmean=featimpmean.fillna(1./featimp.shape[1])
      normalization = featimpmean[chosen].sum()/featimpmean.sum()/np.sum(fscore.values())
      for k,v in fscore.iteritems():fscores[k]+=normalization*v
      
      idx=round(1000*(np.log(2)-s),scoredp)
      featimp = featimp.append(pd.Series(fscores,name=idx))
      return idx
Code Example #4
    def fit(self, X_train, X_eval, y_train, y_eval, feature_names):
        print(self.params)
        print('XGBoost: training ... ')
        eval_result = {}
        d_train = xgb.DMatrix(X_train, label=y_train)
        d_valid = xgb.DMatrix(X_eval, label=y_eval)
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        self.gbm = xgb.train(
            self.params,
            d_train,
            evals=watchlist,
            num_boost_round=self.params['n_estimators'],
            early_stopping_rounds=self.params['early_stopping_rounds'],
            verbose_eval=self.params['verbose_eval'],
            evals_result=eval_result)

        try:
            xgbfir.saveXgbFI(self.gbm)
        except:
            pass

        valloss_best = str(eval_result['valid']['logloss'][-1])
        modelname_new = self.modelname.split('.pkl')[0] + valloss_best + '.pkl'
        print('Saving to', modelname_new, '...')
        with open(MODELS_FOLDER + '/' + modelname_new, 'wb') as fout:
            pickle.dump(self.gbm, fout)

        if self.show_importance:
            xgb.plot_importance(self.gbm, importance_type='weight')
            plt.show()

            xgb.plot_importance(self.gbm, importance_type='gain')
            plt.show()

            xgb.plot_importance(self.gbm, importance_type='cover')
            plt.show()

        return self.gbm, eval_result, modelname_new
Code Example #5
# No negative times
# Use a mean of a job_duration subset to fill
pred.ix[pred.ix[:, 'job_duration'] < 0,
        'job_duration'] = data.ix[train_pred.job_duration < 0,
                                  'job_duration'].mean()

# Set name of model output
model_name = 'xgboost_final_submission'
# Set to dir of submission/output folder
submit = r'/Submissions/'
# Output test prediction to location
pred.to_csv(submit + model_name + '.csv', index=False)

## Output XGBFIR statistics spreadsheet
# this is the xgbfir package and is used to find feature importance
xgbfir.saveXgbFI(xgbreg, OutputXlsxFile=submit + model_name + '_FI.xlsx')

# Append CV Scores to XGBFIR
book = load_workbook(submit + model_name + '_FI.xlsx')
writer = pd.ExcelWriter(submit + model_name + '_FI.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
cvscores.to_excel(writer, sheet_name='cvscore', index=False, engine='openpyxl')
writer.save()

# Dump XGBoost model to a pickle file
save_model_path = r'/home/josh/Documents/Python Scripts/Data Science Challenges/ENGIE DSC/IT Operations/Models/'
pickle.dump(xgbreg, open(save_model_path + model_name + '.dat', "wb"))

# In[ ]:
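The "Append CV Scores to XGBFIR" block above assigns writer.book and writer.sheets directly, a pattern that newer pandas/openpyxl releases no longer allow. A minimal alternative sketch using pandas' append mode (assumes pandas >= 1.3 with openpyxl installed; file, sheet, and variable names are the same as above):

import pandas as pd

# Append the cvscore sheet to the existing xgbfir workbook without touching
# writer.book/writer.sheets; the sheets xgbfir wrote are left intact.
with pd.ExcelWriter(submit + model_name + '_FI.xlsx',
                    engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    cvscores.to_excel(writer, sheet_name='cvscore', index=False)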
Code Example #6
File: module.py  Project: michaelyryi/KaggleBiatch
 def _additional_task(self):
     save_path = os.path.join(self.config["out_folder"],
                              f'xgbfir_{self.data["fold_num"]}.xlsx')
     xgbfir.saveXgbFI(self.clf, OutputXlsxFile=save_path)
Code Example #7
def main():
    np.random.seed(42)
    logger = config.config_logger(__name__, 10)
    t0 = time.time()

    train_client_path = './data/raw/csv/train_clientes.csv'
    train_reque_path = './data/raw/csv/train_requerimientos.csv'
    test_client_path = './data/raw/csv/test_clientes.csv'
    test_reque_path = './data/raw/csv/test_requerimientos.csv'
    output_path = './output/'
    do_merge = False
    write_impute_test = False
    write_output = False
    add_variables = False
    version = 6

    logger.info('Beginning execution')
    logger.info('Load dataframes')
    test_client = pd.read_csv(test_client_path, header=0)
    test_reque = pd.read_csv(test_reque_path, header=0)
    main_client = pd.read_csv(train_client_path, header=0)
    main_reque = pd.read_csv(train_reque_path, header=0)

    work_data.basic_descriptive(main_client)
    work_data.basic_descriptive(main_reque)

    id_variables = work_data.id_variables()
    index_client = test_client['ID_CORRELATIVO']

    if write_impute_test:
        logger.info('Creating new test database')
        logger.info('Cleaning test reque database')
        test_reque = work_data.preprocess_reque(test_reque)
        print(test_reque.head().to_string())

        logger.info('Cleaning test client database - Imputing missing values')
        test_client = work_data.count_missings_column(test_client)
        test_client = work_data.preprocess_client(test_client)
        print(test_client.head().to_string())

        logger.info('Merging test databases')
        temp = pd.concat([test_client, test_reque], axis=1, join_axes=[test_client.index])
        temp.fillna(0, inplace=True)
        test_df = temp
        print(test_df.head().to_string())
        print(test_df.describe().transpose().to_string())

        logger.info('Saving test database')
        test_df.to_csv('./data/mod/test_imputed.csv', index=False)
    else:
        logger.info('Opening test database')
        test_df = pd.read_csv('./data/mod/test_imputed.csv', header=0)
        print(test_df.head().to_string())

    if do_merge:
        logger.info('Creating new merge')
        logger.info('Cleaning reque database')
        main_reque = work_data.preprocess_reque(main_reque)
        print(main_reque.head().to_string())

        #main_reque = pd.pivot_table(main_reque, index=['ID_CORRELATIVO'], columns=['CODMES'], aggfunc=np.sum)
        #main_reque.columns = main_reque.columns.map('{0[0]}|{0[1]}'.format)
        #main_reque.fillna(0, inplace=True)

        logger.info('Cleaning client database - Imputing missing values')
        main_client = work_data.count_missings_column(main_client)
        target = main_client.pop('ATTRITION')
        target.index = main_client['ID_CORRELATIVO']
        main_client = work_data.preprocess_client(main_client)
        main_client['ATTRITION'] = target
        print(main_client.head().to_string())

        logger.info('Merging databases')
        temp = pd.concat([main_client, main_reque], axis=1, join_axes=[main_client.index])
        temp.fillna(0, inplace=True)
        main_df = temp

        print(main_df.shape)
        print(main_df.head().to_string())
        print(main_df.describe().transpose().to_string())
        work_data.basic_descriptive(main_df)

        logger.info('Saving merged database')
        main_df.to_csv('./data/mod/merge1.csv', index=False)
    else:
        logger.info('Opening merged database')
        main_df = pd.read_csv('./data/mod/merge1.csv', header=0)
        print(main_df.head().to_string())
        print(main_df.shape)

    y = main_df.pop('ATTRITION')
    main_df = main_df.append(test_df).reset_index(drop=True)

    if False:
        logger.info('Creating T-SNE database')
        temp_tsne = pd.DataFrame(models.tnse(main_df))
        temp_tsne.to_csv('./data/mod/merge1_tsne.csv', index=False)
    else:
        logger.info('Loading T-SNE database')
        temp_tsne = pd.read_csv('./data/mod/merge1_tsne.csv')

    if add_variables:
        logger.info('Beginning feature engineering')
        logger.info('Interactions')
        main_df_feat = models.create_interactions(main_df, models.inter_vars())

        logger.info('Row sums 1-3')
        main_df_feat['ext1'] = main_df.apply(lambda row: (row == 0).sum(), axis=1)
        temp = models.standard_scale_df(main_df)
        main_df_feat['ext2'] = temp.apply(lambda row: (row > 0.5).sum(), axis=1)
        main_df_feat['ext3'] = temp.apply(lambda row: (row < -0.5).sum(), axis=1)

        logger.info('K-means 4-7')
        main_df_feat['ext4'] = pd.Series(models.kmeans(main_df, 5)).apply(str)
        main_df_feat['ext5'] = pd.Series(models.kmeans(main_df, 10)).apply(str)
        main_df_feat['ext6'] = pd.Series(models.kmeans(main_df, 15)).apply(str)
        main_df_feat['ext7'] = pd.Series(models.kmeans(main_df, 20)).apply(str)

        logger.info('KNN 8-11')
        main_df_feat['ext8'] = models.knn_distance(main_df, 2)
        main_df_feat['ext9'] = models.knn_distance(main_df, 3)
        main_df_feat['ext10'] = models.knn_distance(main_df, 5)
        main_df_feat['ext11'] = models.knn_distance(temp_tsne, 2)

        main_df_feat = pd.get_dummies(main_df_feat, drop_first=True)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)
        config.time_taken_display(t0)
        logger.info('Saving features database')
        main_df_feat.to_csv('./data/mod/merge1_features.csv', index=False)
    else:
        logger.info('Opening feature engineered database')
        main_df_feat = pd.read_csv('./data/mod/merge1_features.csv', header=0)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)

    logger.info('Split data into train and test')
    x, test_df = main_df_feat.iloc[:70000, :], main_df_feat.iloc[70000:, :]
    print(main_df_feat.shape)
    print(x.shape)
    print(test_df.shape)
    x_train, x_test, y_train, y_test = models.split_data(x, y)
    work_data.basic_descriptive(x_train)

    logger.info('Level 1 - Create metafeatures')

    if False:
        logger.info('1. Ridge logit')
        ridge_model = models.logit_grid(x, y, 'l2', StandardScaler())
        models.write_prediction(ridge_model, main_df_feat, index_client, 'ridge_standard')
        print(ridge_model.score(x_test, y_test))

        logger.info('2. Lasso logit')
        lasso_model = models.logit_grid(x, y, 'l1',StandardScaler())
        models.write_prediction(lasso_model, main_df_feat, index_client, 'lasso_standard')
        print(lasso_model.score(x_test, y_test))

        logger.info('3. Random Forrest')
        RF_model = models.random_forrest_grid(x, y, StandardScaler())
        models.write_prediction(RF_model, main_df_feat, index_client, 'RF_standard')
        print(RF_model.score(x_test, y_test))

        logger.info('4. Extra Trees')
        ET_model = models.extra_trees_grid(x, y, StandardScaler())
        models.write_prediction(ET_model, main_df_feat, index_client, 'ET_standard')
        print(ET_model.score(x_test, y_test))

        logger.info('5. 2-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 2)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN2_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('6. 4-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 4)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN4_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('7. 8-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 8)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN8_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('8. 16-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 16)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN16_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('9. 32-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 32)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN32_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('10. 64-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 64)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN64_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('11. 128-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 128)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN128_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('12. 256-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 256)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN256_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('13. 512-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 512)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN512_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('14. 1024-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 1024)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN1024_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('15. Naive Bayes')
        NB_model = models.naive_bayes_grid(x, y, StandardScaler())
        models.write_prediction(NB_model, main_df_feat, index_client, 'NB_standard')
        print(NB_model.score(x_test, y_test))

        logger.info('16. MLP')
        MLP_model = models.MLP_grid(x, y, StandardScaler())
        models.write_prediction(MLP_model, main_df_feat, index_client, 'MLP_standard')
        print(MLP_model.score(x_test, y_test))

        logger.info('17. AdaBoost')
        adaboost_model = models.adaboost_grid(x, y, StandardScaler())
        models.write_prediction(adaboost_model, main_df_feat, index_client, 'adaboost_standard')
        print(adaboost_model.score(x_test, y_test))

        logger.info('18. GBM')
        gbm_model = models.gbm_grid(x, y, StandardScaler())
        models.write_prediction(gbm_model, main_df_feat, index_client, 'gbm_standard')
        print(gbm_model.score(x_test, y_test))

        logger.info('18. LightGBM')
        lgbm_model = models.lgbm_grid(x, y, None)
        models.write_prediction(lgbm_model, main_df_feat, index_client, 'lgbm_none')
        print(lgbm_model.score(x_test, y_test))

    logger.info('19. XgBoost')
    test_final = main_df_feat.iloc[70000:, :]
    id_test = test_client['ID_CORRELATIVO']
    xgboost_model = models.xgboost_grid(x, y, StandardScaler())
    models.write_prediction(xgboost_model, main_df_feat, index_client, 'xgboost_standard')
    print(xgboost_model.score(x_test, y_test))
    models.write_prediction(xgboost_model, test_final, id_test, 'ATTRITION')
    hi  # intentional NameError: halts execution here

    # Stage 2:
    logger.info('Level 2')
    logger.info('Creating meta-features database')
    meta_features_list = os.listdir('./data/mod/meta_features')
    temp = {}
    for feature in meta_features_list:
        temp_df = pd.read_csv('./data/mod/meta_features/{0}'.format(feature), header=0)
        temp[feature] = temp_df.iloc[:, 1]
    meta_features = pd.DataFrame(temp)
    meta_features = pd.concat([meta_features, main_df_feat], axis=1, ignore_index=True)
    x = meta_features.iloc[:70000, :]
    test_final = meta_features.iloc[70000:, :]
    x_train, x_test, y_train, y_test = models.split_data(x, y)

    print(x_train.shape)
    print(test_final.shape)
    print(x.shape)

    logger.info('Estimating second level model with XgBoost')
    xgboost_final = models.xgboost_full_mod(x_train, y_train)
    print(xgboost_final.score(x_test, y_test))
    print(models.get_logloss(y_test, xgboost_final.predict_proba(x_test)[:, 1]))
    models.write_final_prediction(xgboost_final, test_final, test_client['ID_CORRELATIVO'], 'results8')
    models.write_final_prediction(xgboost_final, x, main_client['ATTRITION'], 'train')


    config.time_taken_display(t0)
    hi  # intentional NameError: halts execution here

    logger.info('XgBoost')
    xgboost_result = models.xgboost_grid(x_train, y_train, x_test, y_test)
    print('Test grid: {0}'.format(xgboost_result))
    #Test: -0.322

    xgboost_full = models.xgboost_full_mod(x_train, y_train, x_test, y_test)
    print(xgboost_full)
    xgbfir.saveXgbFI(xgboost_full, feature_names=main_df.columns, OutputXlsxFile='./data/mod/bbva.xlsx')
Code Example #8
booster = xg.get_booster()
print(booster.get_dump()[0])

booster = xg.get_booster()
print(booster.get_dump()[1])

booster = xg.get_booster()
print(booster.get_dump()[-1])

# Residuals plot
regressor.residuals_plot(xg, auto_X_train, auto_y_train, auto_X_test,
                         auto_y_test)

# viewing interactions
xgbfir.saveXgbFI(xg,
                 feature_names=auto_X.columns,
                 OutputXlsxFile='fir-auto.xlsx')

# column importance
# Gain - total gain of each feature
# Fscore - number of splits
# wFscore - weighted number of splits (by probability of split taking place)
pd.read_excel('fir-auto.xlsx').head(3).T

# column importance
# Gain - total gain of each feature
# Fscore - number of splits
# wFscore - weighted number of splits (by probability of split taking place)
pd.read_excel('fir-auto.xlsx', sheet_name='Interaction Depth 1').head(3).T

# column importance
Code Example #9
# In[1]:

from sklearn.datasets import load_iris, load_boston
import xgboost as xgb
import xgbfir

# loading database
boston = load_boston()

# doing all the XGBoost magic
xgb_rmodel = xgb.XGBRegressor().fit(boston['data'], boston['target'])

# saving to file with proper feature names
xgbfir.saveXgbFI(xgb_rmodel,
                 feature_names=boston.feature_names,
                 OutputXlsxFile='bostonFI.xlsx')

# loading database
iris = load_iris()

# doing all the XGBoost magic
xgb_cmodel = xgb.XGBClassifier().fit(iris['data'], iris['target'])

# saving to file with proper feature names
xgbfir.saveXgbFI(xgb_cmodel,
                 feature_names=iris.feature_names,
                 OutputXlsxFile='irisFI.xlsx')

# Check working directory. There will be two new files: **bostonFI.xlsx** and **irisFI.xlsx**.
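The generated workbooks can be inspected directly with pandas; a short sketch, assuming only the sheet and column names that xgbfir writes ('Interaction Depth 0/1/2', 'Interaction', 'Gain', ...):

import pandas as pd

# Depth-1 sheet: pairwise feature interactions ranked by their importance metrics.
boston_fi = pd.read_excel('bostonFI.xlsx', sheet_name='Interaction Depth 1')
print(boston_fi[['Interaction', 'Gain']].head())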
Code Example #10
                    model.get_score(fmap=fmap_filename,
                                    importance_type='cover')).to_frame())
            feat_imp.columns = ['Weight', 'Gain', 'Cover']
            feat_imp['FeatureName'] = feat_imp.index
            feat_imp['Model'] = model_name
            feat_imp['fold'] = ind

            FI_df = pd.concat([FI_df, feat_imp])

            if XGBFirFlg:
                print('Feature Interaction')
                interactions_data_path = '/opt/ml/processing/output_importance/interactions_%s_%s.xlsx' % (
                    model_name, ind)
                xgbfir.saveXgbFI(model,
                                 feature_names=featureset,
                                 TopK=500,
                                 MaxTrees=500,
                                 MaxInteractionDepth=2,
                                 OutputXlsxFile=interactions_data_path)

    print('Averaging results')
    #FI
    num_folds = FI_df['fold'].max() + 1
    # the number of per-fold score columns depends on num_folds; we do not know in advance how many of them exist in the results
    folds_train_columns = []
    folds_test_columns = []
    folds_gain_columns = []
    folds_weight_columns = []
    folds_cover_columns = []
    for i in range(0, int(num_folds), 1):
        folds_train_columns.append('train-%s-fold' % i)
        folds_test_columns.append('test-%s-fold' % i)
Code Example #11
File: module.py  Project: michaelyryi/KaggleBiatch
 def _save_feature_importance(self):
     xgbfir.saveXgbFI(self.clf,
                      OutputXlsxFile=self.output_folder +
                      'xgbfir_%d.xlsx' % self.fold_num)
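Code Examples #6 and #11 write one xgbfir workbook per cross-validation fold. A small sketch of averaging the depth-0 Gain across those per-fold files; the xgbfir_<fold>.xlsx naming is taken from the examples above, while n_folds is an assumed variable:

import pandas as pd

# Collect the depth-0 Gain column from each fold's workbook and average it.
gains = []
for fold in range(n_folds):  # n_folds assumed to match the number of saved workbooks
    fi = pd.read_excel('xgbfir_%d.xlsx' % fold, sheet_name='Interaction Depth 0')
    gains.append(fi.set_index('Interaction')['Gain'])
mean_gain = pd.concat(gains, axis=1).mean(axis=1).sort_values(ascending=False)
print(mean_gain.head(10))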
Code Example #12
    def train_models(self,
                     workflow,
                     datasource,
                     dataset,
                     y=None,
                     test_dataset=None):

        print('start training models')

        trained_models = dict()
        model_processing_type = workflow.model_processing.get('type')
        processing_models = workflow.model_processing.get('models')
        validation_type = workflow.validation.get('type')
        validation_value = workflow.validation.get('value')

        print(model_processing_type, processing_models, validation_type,
              validation_value)

        if model_processing_type == 'supervised':

            y_predictor = None

            for p in datasource.predictor_details:
                if p.get('name') == datasource.predictor_target_name:
                    y_predictor = p
                    break

            if y_predictor.get('predictor_type').get('description',
                                                     None) == 'continuous':
                model_processing_detail = 'regression'
            else:
                y_value_counts = y.value_counts()
                if len(y_value_counts) > 2:
                    model_processing_detail = 'classification_multi'
                else:
                    model_processing_detail = 'classification_binary'

            print(model_processing_detail)

            from sklearn.preprocessing import LabelEncoder
            le = LabelEncoder()
            y_encoded = le.fit_transform(y)

            if validation_type == 'fold':
                from sklearn.model_selection import cross_val_predict
                from sklearn.model_selection import StratifiedKFold
                skf = StratifiedKFold(n_splits=validation_value)

            if 'rlist' in processing_models and 'classification' in model_processing_detail:
                pass

            if 'xgb' in processing_models:

                objective = 'binary:logistic' \
                if 'binary' in model_processing_detail else 'multi:softprob'
                n_estimators = 20
                silent = 1
                subsample = .7
                colsample_bytree = .7
                learning_rate = .1
                max_depth = 7
                min_child_weight = 2

                if 'classification' in model_processing_detail:
                    from xgboost import XGBClassifier
                    xgb = XGBClassifier(n_estimators=n_estimators,
                                        objective=objective,
                                        silent=silent,
                                        subsample=subsample,
                                        colsample_bytree=colsample_bytree,
                                        learning_rate=learning_rate,
                                        max_depth=max_depth,
                                        min_child_weight=min_child_weight)
                else:
                    from xgboost import XGBRegressor
                    xgb = XGBRegressor(n_estimators=n_estimators,
                                       objective=objective,
                                       silent=silent,
                                       subsample=subsample,
                                       colsample_bytree=colsample_bytree,
                                       learning_rate=learning_rate,
                                       max_depth=max_depth,
                                       min_child_weight=min_child_weight)

                if validation_type == 'fold':
                    y_pred = cross_val_predict(xgb,
                                               dataset.values,
                                               y_encoded,
                                               cv=skf,
                                               n_jobs=-1,
                                               verbose=9)
                    xgb.fit(dataset.values, y_encoded)
                    from settings import location
                    import xgbfir
                    workflow.fi_booster = location(
                        'workflow_data') + '/' + str(workflow._id) + '_fi.xlsx'
                    print('save xgbfi', xgb._Booster, workflow.fi_booster)
                    xgbfir.saveXgbFI(xgb._Booster,
                                     OutputXlsxFile=workflow.fi_booster)
                else:
                    pass
                    # knn.fit(X, y)
                    # y_pred = knn.predict(test_dataset.values) \
                    #     if 'binary' in model_processing_detail \
                    #     else knn.predict_proba(test_dataset.values)

                trained_models['xgb'] = y_pred

            if 'frlp' in processing_models:
                pass

            if 'knn' in processing_models:
                if 'classification' in model_processing_detail:
                    from sklearn.neighbors import KNeighborsClassifier
                    knn = KNeighborsClassifier(n_neighbors=5,
                                               algorithm='auto',
                                               n_jobs=-1)
                else:
                    from sklearn.neighbors import KNeighborsRegressor
                    knn = KNeighborsRegressor(n_neighbors=5,
                                              algorithm='auto',
                                              n_jobs=-1)

                from sklearn import preprocessing
                X = preprocessing.scale(dataset)

                if validation_type == 'fold':
                    y_pred = cross_val_predict(knn,
                                               X,
                                               y_encoded,
                                               cv=skf,
                                               n_jobs=-1,
                                               verbose=9)
                else:
                    pass
                    # knn.fit(X, y)
                    # y_pred = knn.predict(test_dataset.values) \
                    #     if 'binary' in model_processing_detail \
                    #     else knn.predict_proba(test_dataset.values)

                trained_models['knn'] = y_pred
                del X

            if 'lr' in processing_models:
                '''
                solver : {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’}

                Algorithm to use in the optimization problem.
                For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ is
                faster for large ones.

                For multiclass problems, only ‘newton-cg’, ‘sag’ and ‘lbfgs’ handle
                multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.
                ‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty.

                ‘liblinear’ might be slower in LogisticRegressionCV because it does
                not handle warm-starting.

                Note that ‘sag’ fast convergence is only guaranteed on features with approximately the same scale.
                You can preprocess the data with a scaler from sklearn.preprocessing.
                New in version 0.17: Stochastic Average Gradient descent solver.
                '''

                if 'classification' in model_processing_detail:

                    multi_class = 'ovr' if 'binary' in model_processing_detail else 'multinomial'
                    if dataset.shape[0] <= 1000:
                        if multi_class == 'ovr':
                            solver = 'liblinear'
                        else:
                            solver = 'lbfgs'
                    elif dataset.shape[0] >= 10000:
                        solver = 'sag'
                    else:
                        solver = 'lbfgs'

                    class_weight = 'balanced'
                    n_jobs = -1

                    from sklearn.linear_model import LogisticRegression
                    lr = LogisticRegression(solver=solver,
                                            class_weight=class_weight,
                                            n_jobs=n_jobs,
                                            multi_class=multi_class)

                    from sklearn import preprocessing
                    X = preprocessing.scale(dataset)

                    if validation_type == 'fold':
                        y_pred = cross_val_predict(lr,
                                                   X,
                                                   y_encoded,
                                                   cv=skf,
                                                   n_jobs=-1,
                                                   verbose=9)
                    else:
                        pass
                        # lr.fit(X, y)
                        # y_pred = lr.predict(test_dataset.values) \
                        # if multi_class == 'ovr' else lr.predict_proba(test_dataset.values)

                    trained_models['lr'] = y_pred
                    del X
                else:
                    pass

            if 'nn' in processing_models:
                '''
                solver : {‘lbfgs’, ‘sgd’, ‘adam’}, default ‘adam’
                The solver for weight optimization.
                ‘lbfgs’ is an optimizer in the family of quasi-Newton methods.
                ‘sgd’ refers to stochastic gradient descent.
                ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba
                Note: The default solver ‘adam’ works pretty well on relatively large datasets
                (with thousands of training samples or more) in terms of both training time and validation score.
                For small datasets, however, ‘lbfgs’ can converge faster and perform better.
                '''
                solver = None
                if dataset.shape[0] >= 1000.:
                    solver = 'adam'
                else:
                    solver = 'lbfgs'

                if 'classification' in model_processing_detail:
                    from sklearn.neural_network import MLPClassifier
                    nn = MLPClassifier(solver=solver,
                                       hidden_layer_sizes=(50, 3))
                else:
                    from sklearn.neural_network import MLPRegressor
                    nn = MLPRegressor(solver=solver,
                                      hidden_layer_sizes=(50, 3))

                from sklearn import preprocessing
                X = preprocessing.scale(dataset)

                if validation_type == 'fold':
                    y_pred = cross_val_predict(nn,
                                               X,
                                               y_encoded,
                                               cv=skf,
                                               n_jobs=-1,
                                               verbose=9)
                else:
                    pass
                    # nn.fit(X, y)
                    # y_pred = nn.predict(test_dataset.values) \
                    #     if 'binary' in model_processing_detail \
                    #     else nn.predict_proba(test_dataset.values)

                trained_models['nn'] = y_pred
                del X

            if 'rf' in processing_models:

                n_estimators = 50
                n_jobs = -1
                max_depth = 7

                if 'classification' in model_processing_detail:
                    from sklearn.ensemble import RandomForestClassifier
                    rf = RandomForestClassifier(n_estimators=n_estimators,
                                                max_depth=max_depth,
                                                class_weight='balanced',
                                                n_jobs=n_jobs)
                else:
                    from sklearn.ensemble import RandomForestRegressor
                    rf = RandomForestRegressor(n_estimators=n_estimators,
                                               max_depth=max_depth,
                                               n_jobs=n_jobs)

                if validation_type == 'fold':
                    y_pred = cross_val_predict(rf,
                                               dataset.values,
                                               y_encoded,
                                               cv=skf,
                                               n_jobs=-1,
                                               verbose=9)
                else:
                    pass
                    # rf.fit(X, y)
                    # y_pred = rf.predict(test_dataset.values) \
                    #     if 'binary' in model_processing_detail \
                    #     else rf.predict_proba(test_dataset.values)

                trained_models['rf'] = y_pred
        else:
            if 'gm' in processing_models:
                from sklearn.mixture import GaussianMixture
                gm = GaussianMixture()
                # gm.fit(X)

            if 'kmean' in processing_models:
                from sklearn.cluster import KMeans
                kmean = KMeans()
                # kmean.fit(X)

            if 'dbscan' in processing_models:
                from sklearn.cluster import DBSCAN
                dbscan = DBSCAN()
                # dbscan.fit(X)

            if 'pca' in processing_models:
                from sklearn.decomposition import PCA
                pca = PCA()
                # pca.fit(X)

            if 'rbm' in processing_models:
                from sklearn.neural_network import BernoulliRBM
                rbm = BernoulliRBM()
                # rbm.fit(X)

        return trained_models, model_processing_type, model_processing_detail
Code Example #13
# Z_pred = model.predict_proba(Z_test)[:,1]

Z_test = xgb.DMatrix(dfTest[feats].values)
Z_pred = bst.predict(Z_test, ntree_limit=bst.best_ntree_limit)

# submission
df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": Z_pred})
df.sort_values("instanceID", inplace=True)
df.to_csv("submission.csv", index=False)
with zipfile.ZipFile("submission.zip", "w") as fout:
    fout.write("submission.csv", compress_type=zipfile.ZIP_DEFLATED)

# 71 feats + feats2 validation_0-logloss:0.106431	validation_1-logloss:0.110743
# 72 feats validation_0-logloss:0.106431	validation_1-logloss:0.110744
# 59 feats2 validation_0-logloss:0.116213	validation_1-logloss:0.119058
# 66 feats	validation_0-logloss:0.106338	validation_1-logloss:0.110628
# 82 feats	validation_0-logloss:0.105197	validation_1-logloss:0.109576
# 166 feats	validation_0-logloss:0.101096	validation_1-logloss:0.105124
# [133]	validation_0-logloss:0.100174	validation_1-logloss:0.10421  (median imputation)
# [124]	validation_0-logloss:0.100159	validation_1-logloss:0.104205
# [139]	validation_0-logloss:0.100071	validation_1-logloss:0.104137
# [461]	validation_0-logloss:0.096659	validation_1-logloss:0.102084
# 100 0.1027
# 100 0.102788645829
# [99]	eval-logloss:0.102789	train-logloss:0.09784

print('save xgbfir')
# The line below modifies the contents of feats, so it must be run last
featmp = feats
xgbfir.saveXgbFI(bst, feature_names=featmp, OutputXlsxFile='xgboost.xlsx')
Code Example #14
import os
import sys

import pandas as pd
import xgboost as xgb
import xgbfir

sys.path.append("../src")

from base import Utilities, Config
from common import CustomTransformation

config = Config()

train_module = CustomTransformation(config, 'train')
watchlist = [(train_module.ddata, 'train')]

print(train_module.final_columns)

params = Utilities.load_json(config.params_file)
history = xgb.cv(params, train_module.ddata, 300, early_stopping_rounds=30, metrics=["auc", "error"], verbose_eval=True)

model = xgb.train(params, train_module.ddata, 200, verbose_eval=True)

class_mapping = Utilities.load_json(config.class_mapping_file)
test_module = CustomTransformation("test", class_mapping, train_module.final_columns)
y_pred = model.predict(test_module.ddata)
submission_df = pd.DataFrame({config.notable_columns["ID"]: list(test_module.main_column.values),
                              config.notable_columns["Target"]: list(y_pred)})
submission_df.to_csv(os.path.join(config.home, 'submission', 'one.csv'), float_format='%0.6f', index=False)

xgbfir.saveXgbFI(model, feature_names=train_module.final_columns, TopK=500, SortBy='Gain', \
                 MaxTrees=500, MaxInteractionDepth=2, OutputXlsxFile='XGBoost-FI.xlsx')
Code Example #15
def xgboost_train(train=None,
                  train_target=None,
                  test=None,
                  id=None,
                  load=False):
    print("Start training")
    start = time.time()
    if load:
        train = pd.read_hdf("train.h5", "train")
        test = pd.read_hdf("test.h5", "test")
        id = pd.read_hdf("id.h5", "id")
        train_target = pd.read_hdf("train_target.h5", "train_target")

    # optimized hyperparameters
    param = {}
    param["objective"] = "reg:linear"
    param["booster"] = "gbtree"
    param["eta"] = 0.04
    param["max_depth"] = 8
    param["min_child_weight"] = 5
    param["subsample"] = 1
    param["colsample_bytree"] = 0.5
    param["colsample_bylevel"] = 1
    param["gamma"] = 10
    param["lambda"] = 1
    param["alpha"] = 1
    param["silent"] = 1
    param["nthread"] = 24
    param["seed"] = 1991
    # 'hist' is a newer, much faster tree-construction algorithm; it requires a recent XGBoost version
    param["tree_method"] = "hist"
    param["eval_metric"] = "rmse"
    num_round = 3000

    dtrain = xgb.DMatrix(train, train_target)
    watchlist = [(dtrain, "train")]
    gbm = xgb.train(param,
                    dtrain,
                    num_round,
                    evals=watchlist,
                    verbose_eval=True)
    os.makedirs("../output", exist_ok=True)
    xgbfir.saveXgbFI(gbm,
                     TopK=300,
                     OutputXlsxFile="../output/XgbFeatureInteractions.xlsx")

    gain = pd.Series(gbm.get_score(importance_type="gain")) * pd.Series(
        gbm.get_score(importance_type="weight"))
    gain = gain.reset_index()
    gain.columns = ["features", "gain"]
    gain.sort_values(by="gain", inplace=True)
    gain.plot(kind="barh",
              x="features",
              y="gain",
              legend=False,
              figsize=(10, 20))
    plt.title("XGBoost Total Gain")
    plt.xlabel("Total Gain")
    plt.savefig("../output/XGBOOST_GAIN_" +
                time.strftime("%Y_%m_%d_%H_%M_%S") + ".png",
                bbox_inches="tight",
                pad_inches=1)
    gain.sort_values(
        by="gain",
        ascending=False).to_csv("../output/Gain_" +
                                time.strftime("%Y_%m_%d_%H_%M_%S") + ".csv")

    dtest = xgb.DMatrix(test)
    y_pred = gbm.predict(dtest)
    submission = pd.DataFrame({
        "id": id,
        "Demanda_uni_equil": np.expm1(y_pred)
    })
    cols = submission.columns.tolist()
    cols = cols[1:] + cols[0:1]
    submission = submission[cols]
    os.makedirs("../subm", exist_ok=True)
    submission.to_csv("../subm/submission_xgboost_" +
                      time.strftime("%Y_%m_%d_%H_%M_%S") + ".csv.gz",
                      compression="gzip",
                      index=False)
    print("Training and submitting took {:.1f}min".format(
        (time.time() - start) / 60))
Code Example #16
File: simple.py  Project: limexp/xgbfir
# # Xgbfir simple example
# This is a small working example of Xgbfir usage from Python code.

# In[1]:

from sklearn.datasets import load_iris, load_boston
import xgboost as xgb
import xgbfir

# loading database
boston = load_boston()

# doing all the XGBoost magic
xgb_rmodel = xgb.XGBRegressor().fit(boston['data'], boston['target'])

# saving to file with proper feature names
xgbfir.saveXgbFI(xgb_rmodel, feature_names=boston.feature_names, OutputXlsxFile='bostonFI.xlsx')


# loading database
iris = load_iris()

# doing all the XGBoost magic
xgb_cmodel = xgb.XGBClassifier().fit(iris['data'], iris['target'])

# saving to file with proper feature names
xgbfir.saveXgbFI(xgb_cmodel, feature_names=iris.feature_names, OutputXlsxFile='irisFI.xlsx')


# Check working directory. There will be two new files: **bostonFI.xlsx** and **irisFI.xlsx**.
Code Example #17
model.fit(X_train,
          Y_train,
          eval_set=[(X_train, Y_train), (X_test, Y_test)],
          eval_metric="logloss",
          early_stopping_rounds=3)
Y_pred = model.predict_proba(X_test)[:, 1]

print(X_train.shape)
print(logloss(Y_test, Y_pred))

# mysubmission
df = pd.DataFrame({"instanceID": Y_test, "proba": Y_pred})
df.sort_values("instanceID", inplace=True)
df.to_csv("submissionx.csv", index=False)
with zipfile.ZipFile("submissionx.zip", "w") as fout:
    fout.write("submissionx.csv", compress_type=zipfile.ZIP_DEFLATED)

Z_test = dfTest[feats].values
Z_pred = model.predict_proba(Z_test)[:, 1]

# submission
df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": Z_pred})
df.sort_values("instanceID", inplace=True)
df.to_csv("submission.csv", index=False)
with zipfile.ZipFile("submission.zip", "w") as fout:
    fout.write("submission.csv", compress_type=zipfile.ZIP_DEFLATED)

print('save xgbfir')
# The line below modifies the contents of feats, so it must be run last
xgbfir.saveXgbFI(model, feature_names=feats, OutputXlsxFile='xgboost.xlsx')
Code Example #18
import time
import xgboost as xgb

import xgbfir

t0org0 = pd.read_csv("train.csv")
#h0org = pd.read_csv("test.csv")
print(t0org0.columns)
features = t0org0
label = features['label']
features.drop(['label'], axis=1, inplace=True)
features.userID = features.userID.astype('int64')
features.cnt_advertiserID = features.cnt_advertiserID.astype('int64')

# features.drop(['conversionTime'], axis=1,inplace=True)

# dtrain = xgb.DMatrix(feature, label=lable, missing=-1)
# dvalid = xgb.DMatrix(xvalid, label=yvalid, missing=-1)
xgb_cmodel = xgb.XGBClassifier().fit(features, label)

# saving to file with proper feature names

xgbfir.saveXgbFI(xgb_cmodel,
                 feature_names=features.columns,
                 OutputXlsxFile='irisFI1.xlsx')

# irisFI = [pd.read_excel("irisFI.xlsx", sheetname = "Interaction Depth %d" % i) for i in range(3)]

# one_feature_list=irisFI[0].Interaction
# for column in one_feature_list:
# 	print column, t0org0[column].unique().shape, t0org0[column].min(), t0org0[column].max()
Code Example #19
    print("Fitting fold %d" % fold_num)
    model.fit(X[train_idx], y[train_idx], eval_metric="rmse")
    score = r2_score(y[val_idx], model.predict(X[val_idx]))
    cv_scores.append(score)
    print("Eval. score (R2-score) for fold {} = {}\n".format(fold_num, score))
    fold_num += 1

print("Mean CV score = {}; Std. dev. CV score = {}\n".format(
    np.mean(cv_scores), np.std(cv_scores)))
feat_imp = pd.DataFrame(data=model.feature_importances_, index=top_10_features)

## Using xgbfir to learn more about feature interactions and create new useful features
import xgbfir
# saving to file with proper feature names
xgbfir.saveXgbFI(model,
                 feature_names=all_features,
                 OutputXlsxFile='predict_returns_FI.xlsx')

# Creating new features based on XGBFI file
train['country_desk_id'] = train['country_code'] * 10000 + train['desk_id']
train['pr_loss_maxibor'] = train[['euribor_rate', 'libor_rate']].apply(
    max, axis=1) * train['profit_loss']
train['pr_loss_euribor'] = train['profit_loss'] * train['euribor_rate']
train['pr_loss_libor'] = train['profit_loss'] * train['libor_rate']
train[
    'currency_euribor_pr_loss'] = train['currency'] * train['pr_loss_euribor']

test['country_desk_id'] = test['country_code'] * 10000 + test['desk_id']
test['pr_loss_maxibor'] = test[['euribor_rate', 'libor_rate']].apply(
    max, axis=1) * test['profit_loss']
test['pr_loss_euribor'] = test['profit_loss'] * test['euribor_rate']
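The "Creating new features based on XGBFI file" step above picks interaction pairs by hand; below is a sketch of pulling the strongest depth-1 interactions programmatically from predict_returns_FI.xlsx instead (it assumes xgbfir's sheet layout and its '|'-joined interaction names):

import pandas as pd

# Top pairwise interactions by Gain, e.g. 'euribor_rate|profit_loss',
# to decide which product features are worth constructing.
fi_pairs = pd.read_excel('predict_returns_FI.xlsx', sheet_name='Interaction Depth 1')
top_pairs = fi_pairs.sort_values('Gain', ascending=False)['Interaction'].head(5)
for pair in top_pairs:
    print(pair.split('|'))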
Code Example #20
clf = xgb.train(
    param,
    X_train,
    300,
)

im = clf.get_score(importance_type='gain')

xgb.plot_importance(clf, height=0.5)

pred = clf.predict(X_test)

test['conv_prob'] = pred

test[['policy_id', 'conv_prob']].to_csv('test_result_tao.csv', index=False)

roc_auc_score(y, pred)

xgbfir.saveXgbFI(clf,
                 feature_names=list(X.columns),
                 OutputXlsxFile='interaction.xlsx')

#param_list  = {
#        'max_depth': range(1, 5),
##        'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001]
#        }
#
#search = GridSearchCV(model, param_list, 'roc_auc', cv = 3, iid = False)
#
#search.fit(X, y)
Code Example #21
         label='>300kb var-gene dist',
         color=(0.55, 0.63, 0.80))
plt.legend(fontsize=12)
plt.ylabel("Predicted eQTL Prob.", fontsize=16)
plt.xlabel("HiCNormed_100kb_p", fontsize=16)
plt.tight_layout()
plt.savefig("HiCNormed_100kb_p_change.png", dpi=300)

### Feature importance plot ###
import xgbfir
import pickle

model = pickle.load(
    open('./random_assembled_balanced_dataset_123_Xy_models.pkl',
         'r'))['FULL'][0]
xgbfir.saveXgbFI(model, feat_name, OutputXlsxFile='random_model.xlsx')

dfs = pd.read_excel('random_model.xlsx', sheetname=None)
order_0 = dfs[u'Interaction Depth 0']
order_0_map = [(k, v)
               for k, v in zip(order_0['Interaction'], order_0['Gain'])][:40]
color_mapping = {
    'p': (0.4, 0.7607843137254902, 0.6470588235294118),
    'g': (0.9882352941176471, 0.5529411764705883, 0.3843137254901961),
    'v': (0.5, 0.5, 0.796078431372549)
}

names = [p[0] for p in order_0_map]
gains = [p[1] for p in order_0_map]
colors = [color_mapping[s[-1]] for s in names]
fig, ax = plt.subplots()
Code Example #22
X = train
Y = train['is_female']
    
tempX = X.copy()
del tempX['is_female']
del tempX['train_id']
Y_train = pd.DataFrame.as_matrix(Y)

xgdmat = xgb.DMatrix(tempX, Y_train)

our_params = {'eta': 0.01, 'seed':27, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 5, 'eval_metric':'auc', 
             'objective': 'binary:logistic', 'max_depth':7, 'min_child_weight':1, 'lambda': 0.1, 'scale_pos_weight':0.862} 
booster = xgb.train(our_params, xgdmat, num_boost_round = 2700)
######################################################################################
######################################################################################
#topK = 150
xgbfir.saveXgbFI(booster, TopK = 150, MaxTrees = 1000, MaxInteractionDepth = 5, OutputXlsxFile='xgb.xlsx')
#########################################################################################
# manually copy and paste features from xgb.xlsx into features.csv, then read them back
feature_list = get_feature('features.csv')
feature_list.sort()
final_list = ['is_female', 'train_id']
final_list.extend(feature_list)

train_with_selected_features = X[final_list].copy()
train_with_selected_features.to_csv('new_train_150.csv',index=False)
final_test_list = ['test_id']
final_test_list.extend(feature_list)
test[final_test_list].to_csv('new_test_150.csv',index=False)
###########################################################################################
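Instead of manually copying feature names out of xgb.xlsx into features.csv, the top features can be read back with pandas; a minimal sketch assuming only xgbfir's own sheet and column names:

import pandas as pd

# Take the 150 highest-Gain single features straight from the xgbfir output.
fi = pd.read_excel('xgb.xlsx', sheet_name='Interaction Depth 0')
feature_list = fi.sort_values('Gain', ascending=False)['Interaction'].head(150).tolist()
feature_list.sort()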
Code Example #23
    fig, ax = plt.subplots(figsize=(12, 18))
    xgb.plot_importance(bst_model, height=0.8, ax=ax)
    #plt.show()
    fig.savefig('feature_importance.png')

    # show feature importance table
    #fmap = bst_model.get_score(importance_type='cover')
    #print(fmap)
    fmap = bst_model.get_score(importance_type='gain')
    print(fmap)
    #fmap = bst_model.get_score(importance_type='weight')
    #print(fmap)


    # saving to file with proper feature names
    xgbfir.saveXgbFI(bst_model, OutputXlsxFile='future_interaction.xlsx')

    # predict on test data
    preds = bst_model.predict(dtest)
    print(preds)



    bst_model.dump_model('model.txt') 



    


Code Example #24
plt.xlim([-1, X_train_ALL.shape[1]])
plt.tight_layout()
#plt.savefig('rysunki/04_09.png', dpi=300)
plt.show()

plt.bar(range(len(xgb8.feature_importances_)), xgb8.feature_importances_)
plt.show()

y_test_pred = xgb8.predict_proba(X_test_ALL)[:, 1]
y_train_pred = xgb8.predict_proba(X_train_ALL)[:, 1]
print('ROC AUC TRAIN: %f' % sklearn.metrics.roc_auc_score(
    y_train_ALL, y_train_pred))  #ROC AUC TRAIN: 0.803086
print('ROC AUC TEST: %f' % sklearn.metrics.roc_auc_score(
    y_test_ALL, y_test_pred))  #ROC AUC TEST: 0.764920
xgbfir.saveXgbFI(xgb8,
                 feature_names=X_train_ALL.columns,
                 OutputXlsxFile='C:/Users/...')

columny_100 = [
    'Per2', 'Veh24', 'Hist_VehPer47', 'Veh3', 'Hist_Per6', 'Hist_Veh7',
    'Hist_VehPer7', 'Per7', 'Reg78', 'Reg41', 'Per8', 'Hist_VehPer24',
    'Hist_Veh3', 'Hist_Per52', 'Hist_Per100', 'Hist_VehPer41', 'Veh20',
    'Hist_Veh29', 'Hist_Veh22', 'Hist_VehPer81', 'Hist_Per44', 'Reg58',
    'Hist_VehPer46', 'Hist_VehPer52', 'Hist_Veh4', 'Hist_VehPer82',
    'Hist_VehPer74', 'Dif3', 'Hist_Per63', 'Hist_Per109', 'Per12',
    'Hist_Per111', 'Reg81', 'Hist_Veh8', 'Dif1', 'Hist_Per118',
    'Hist_VehPer71', 'Hist_VehPer54', 'Reg47', 'Hist_Per103', 'Reg83', 'Dif2',
    'Hist_VehPer60', 'Reg77', 'Veh5', 'Reg61', 'Hist_VehPer43', 'Hist_Per51',
    'Hist_Per48', 'Reg39', 'Hist_Per69', 'Reg48', 'Reg15', 'Veh18', 'Veh23',
    'Hist_Per35', 'Hist_VehPer25', 'Veh17', 'Reg34', 'Reg6', 'Reg82', 'Veh25',
    'Hist_Per97', 'Hist_Per28', 'Reg38', 'Reg7', 'Veh22', 'Hist_Per106',
Code Example #25
# save model
bestXgb.save_model(model_file)

# dump model
features = list(X_train.columns.values)
bestXgb.feature_names = features  # set names for XGBoost booster

outfile = open(model_file + '.fmap', 'w')
for i, feat in enumerate(features):
    outfile.write('{0}\t{1}\tq\n'.format(i, feat))
outfile.close()

bestXgb.dump_model(model_file + '.dump', with_stats=True)

xgbfir.saveXgbFI(bestXgb,
                 feature_names=features,
                 OutputXlsxFile=model_file + '.xlsx')

xgboost_predict_proba = bestXgb.predict(dtest)
y_test_preds = (xgboost_predict_proba > 0.5).astype('int')
report = classification_report(y_test, y_test_preds)
print(report)

infofile = open(model_file + '.info', 'w')
infofile.write('X= ' + X_file + '\n')
infofile.write('Y= ' + Y_file + '\n')
infofile.write('params= ' + args.P + '\n')
infofile.write('ratio= ' + str(ratio) + '\n')
infofile.write('num_round=' + str(best_num_round) + '\n')
infofile.write(report)
infofile.close()
Code Example #26
File: run.py  Project: hotheat/Consumer-credit-score
def feature_importance(train_test):
    # feature importance via xgbfir
    x_train = train_test[:train_df.shape[0]]
    xgb_cmodel = xgb.XGBRegressor().fit(x_train.astype('float'), y_train)
    xgbfir.saveXgbFI(xgb_cmodel, feature_names=x_train.columns, OutputXlsxFile='特征重要性.xlsx')  # output file name means "feature importance"