def fit_all(self, X, y, num_trees, feature_names):
    """Train a booster on the full data set, pickle it and return it.

    Args:
        X: Feature matrix handed to ``xgb.DMatrix``.
        y: Labels handed to ``xgb.DMatrix``.
        num_trees: Number of boosting rounds.
        feature_names: Not used by this method; kept so the signature stays
            compatible with existing callers.

    Returns:
        A ``(booster, model_filename)`` tuple; ``model_filename`` is the name
        of the pickle written under ``MODELS_FOLDER``.
    """
    print(self.params)
    print('XGBoost: training ... ')
    d_train = xgb.DMatrix(X, label=y)
    watchlist = [(d_train, 'train')]
    self.gbm = xgb.train(self.params, d_train, evals=watchlist,
                         num_boost_round=num_trees,
                         verbose_eval=self.params['verbose_eval'])

    # The feature-interaction export is best effort: a failure must not abort
    # training, but it should be visible and must not swallow
    # KeyboardInterrupt/SystemExit the way a bare ``except`` does.
    try:
        xgbfir.saveXgbFI(self.gbm)
    except Exception as exc:
        print('xgbfir.saveXgbFI failed: %s' % exc)

    modelname_new = self.modelname.split('.pkl')[0] + 'alldata' + '.pkl'
    print('Saving to %s ...' % modelname_new)
    with open(MODELS_FOLDER + '/' + modelname_new, 'wb') as fout:
        pickle.dump(self.gbm, fout)

    if self.show_importance:
        # One plot per importance definition, shown in the original order.
        for importance_type in ('weight', 'gain', 'cover'):
            xgb.plot_importance(self.gbm, importance_type=importance_type)
            plt.show()
    return self.gbm, modelname_new
def score(**params):
    # Objective for a hyper-parameter search: draws a weighted random subset
    # of features, runs k-fold xgboost on it and returns the negative mean
    # absolute error. As a side effect one row of accumulated feature scores,
    # named after the rounded score, is appended to the global ``featimp``.
    # NOTE(review): indentation was reconstructed from a whitespace-collapsed
    # source -- confirm loop/branch membership against the original file.
    global featimp
    # Parameters listed in discreteP are truncated to int.
    for k in params.keys():
        if k in discreteP:
            params[k]=int(params[k])
    featimpmean=gen_featimpmean()
    chosen_feat=[]
    # Draw features one at a time until params['ncols'] (capped at the number
    # of featimp columns) have been chosen.
    while len(chosen_feat)<min(params['ncols'],featimp.shape[1]):
        candfeat=weighted_featimp(chosen_feat).fillna(featimpmean.fillna(1.))
        candfeat=candfeat.fillna(1./featimp.shape[1])
        # Zero weights are replaced by the smallest positive weight so every
        # candidate keeps a non-zero sampling probability.
        candfeat=candfeat.replace(0,candfeat[candfeat>0].min())
        theone = np.random.choice(candfeat.index,p=candfeat.values/np.sum(candfeat.values))
        chosen_feat.append( theone )
    # De-duplicate and force the must-have columns in.
    chosen_feat=list(set(col_keep(chosen_feat)+musthave))
    p=xgbparams.copy()
    p.update(params)
    # Row numbers to skip so that only nrows of the Nrows data rows are read.
    skip = sorted(random.sample(xrange(1,Nrows+1),Nrows-nrows))
    if args.preload:
        train = train_exp.ix[:,chosen_feat+['loss']]
    else:
        train = pd.read_csv(path+"train_exp.csv",index_col=0,usecols=['id','loss']+chosen_feat,skiprows=skip )
    print train.shape
    train_y = yforw(train['loss'],params)
    train_x = train.drop('loss',1)
    # Out-of-fold predictions, same index as train_y, initialised to zero.
    y_pred = 0*train_y
    fscores=dict((el,0) for el in chosen_feat)
    for train_idx, val_idx in kftune.split(train_x):
        X_train, X_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
        d_train = xgboost.DMatrix(X_train, label=y_train)
        d_valid = xgboost.DMatrix(X_val, label=y_val)
        model = xgboost.train(p, d_train, num_boost_round=100000, evals=[(d_valid, 'eval')], early_stopping_rounds=patience, feval=es_eval,
                              # obj=fair_obj,
                              verbose_eval=False)
        y_pred.iloc[val_idx]=model.predict(d_valid,ntree_limit=model.best_ntree_limit)
        # Feature importances are exchanged with xgbfir through a temporary
        # workbook that is read straight back in.
        xgbfir.saveXgbFI(model,OutputXlsxFile=tmpfile,TopK=p_range['ncols'][1])
        # NOTE(review): presumably waits for the workbook to be fully written
        # before it is read -- confirm whether this sleep is still needed.
        time.sleep(5)
        fi=pd.read_excel(tmpfile,index_col=0)
        fscore = fi['Expected Gain'].to_dict() #Gain, FScore, wFScore, Average wFScore, Average Gain, Expected Gain
        meanscore = np.average(fscore.values())
        for k in fscore.keys():
            fscore[k]/=meanscore*len(fscore)
        featimpmean=featimpmean.fillna(1./featimp.shape[1])
        # Scale this fold's scores by the chosen features' share of the mean
        # importance and average over the number of folds.
        normalization = featimpmean[chosen_feat].sum()/featimpmean.sum()/np.sum(fscore.values())/kftune.get_n_splits()
        for k,v in fscore.iteritems():
            fscores[k]+=normalization*v
    # Negated MAE on the back-transformed target.
    curscore = -mean_absolute_error(yback(y_pred,params),yback(train_y,params))
    featimp = featimp.append(pd.Series(fscores,name=round(curscore,4)))
    return curscore
def score(**params):
    # Objective for a hyper-parameter search: rescales the incoming
    # parameters into their p_range intervals, picks a feature subset, runs
    # xgb.cv on it and returns ``round(1000*(log(2) - s), scoredp)`` where
    # ``s`` is the first column of the last cv row. As a side effect one row
    # of feature scores, named after that value, is appended to the global
    # ``featimp``.
    # NOTE(review): indentation was reconstructed from a whitespace-collapsed
    # source -- confirm loop/branch membership against the original file.
    global featimp
    for k in params.keys():
        # Linear interpolation between the lower and upper bound in p_range.
        params[k]=p_range[k][0]*(1-params[k])+p_range[k][1]*params[k]
        if k in discreteP:
            params[k]=int(round(params[k]))
    featimpmean=gen_featimpmean(featimp)
    # With probability 0.3, or while fewer than 5 rows of history exist,
    # sample features one by one from the weighted importances; otherwise
    # sample them in a single draw from the estimated importances.
    if random.random()<.3 or featimp.shape[0]<5:
        chosen=[]
        while len(chosen)<min(params['ncols'],featimp.shape[1]):
            candfeat=weighted_featimp(featimp.iloc[:-resetrows],chosen).fillna(featimpmean.fillna(1.))
            candfeat=candfeat.fillna(1./featimp.shape[1])
            # Zero weights are replaced by the smallest positive weight so
            # every candidate keeps a non-zero sampling probability.
            candfeat=candfeat.replace(0,candfeat[candfeat>0].min())
            theone = np.random.choice(candfeat.index,p=candfeat.values/np.sum(candfeat.values))
            chosen.append( theone )
        chosen=list(set(chosen+musthave))
    else:
        # For every featimp row, the features with a positive importance.
        cols={k:[vk for vk,imp in v.iteritems() if imp>0] for k,v in featimp.T.to_dict().iteritems()}
        estimatedfeatimp=featimp_from_cols(cols)
        chosen=list(np.random.choice(estimatedfeatimp.keys(),min(params['ncols'],len(filter(None,estimatedfeatimp.values()))),replace=False,p=estimatedfeatimp.values()))
    # Lower bound of 2/len(chosen) on the per-tree column sampling ratio.
    params['colsample_bytree']=max(2./len(chosen),params['colsample_bytree'])
    xgbp=xgbparams.copy()
    xgbp.update(params)
    fscores=dict((el,0) for el in chosen)
    if args.verbose:
        print 'generate_train_x',len(chosen)
    train_x = generate_train_x(chosen,train,extra_y[target],test)
    train_y = extra_y[target]
    if args.verbose:
        print train_x.shape
    d_tr = xgb.DMatrix(train_x, label=train_y)
    cv = xgb.cv(xgbp,d_tr,nfold=8, num_boost_round=100000,early_stopping_rounds=patience, verbose_eval=args.verbose and 50, show_stdv=False)
    s = cv.iloc[-1,0] #cv columns: ['test-rmse-mean', 'test-rmse-std', 'train-rmse-mean','train-rmse-std']
    model = xgb.train(xgbp,d_tr,num_boost_round=cv.shape[0],verbose_eval=args.verbose and 50) #solely for feature importances
    # Prefer xgbfir's 'Expected Gain'; fall back to the booster's own gain
    # scores when the export or the read-back fails.
    # NOTE(review): the bare ``except`` also catches KeyboardInterrupt and
    # SystemExit -- consider narrowing it to ``Exception``.
    try:
        xgbfir.saveXgbFI(model,OutputXlsxFile=tmpfile,TopK=len(chosen))
        fi=pd.read_excel(tmpfile,index_col=0)
        fscore = fi['Expected Gain'].to_dict() #Gain, FScore, wFScore, Average wFScore, Average Gain, Expected Gain
    except:
        fscore = model.get_score(importance_type='gain')
    featimpmean=featimpmean.fillna(1./featimp.shape[1])
    # Scale the scores by the chosen features' share of the mean importance.
    normalization = featimpmean[chosen].sum()/featimpmean.sum()/np.sum(fscore.values())
    for k,v in fscore.iteritems():fscores[k]+=normalization*v
    idx=round(1000*(np.log(2)-s),scoredp)
    featimp = featimp.append(pd.Series(fscores,name=idx))
    return idx
def fit(self, X_train, X_eval, y_train, y_eval, feature_names):
    """Train a booster with a validation set, pickle it and return it.

    Args:
        X_train: Training feature matrix handed to ``xgb.DMatrix``.
        X_eval: Validation feature matrix handed to ``xgb.DMatrix``.
        y_train: Training labels.
        y_eval: Validation labels.
        feature_names: Not used by this method; kept so the signature stays
            compatible with existing callers.

    Returns:
        A ``(booster, eval_result, model_filename)`` tuple. ``eval_result``
        is the metric history filled in by ``xgb.train``;
        ``model_filename`` is the name of the pickle written under
        ``MODELS_FOLDER``.
    """
    print(self.params)
    print('XGBoost: training ... ')
    eval_result = {}
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_eval, label=y_eval)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    self.gbm = xgb.train(
        self.params, d_train,
        evals=watchlist,
        num_boost_round=self.params['n_estimators'],
        early_stopping_rounds=self.params['early_stopping_rounds'],
        verbose_eval=self.params['verbose_eval'],
        evals_result=eval_result)

    # The feature-interaction export is best effort: a failure must not abort
    # training, but it should be visible and must not swallow
    # KeyboardInterrupt/SystemExit the way a bare ``except`` does.
    try:
        xgbfir.saveXgbFI(self.gbm)
    except Exception as exc:
        print('xgbfir.saveXgbFI failed: %s' % exc)

    # The file name carries the validation logloss of the LAST recorded round.
    # NOTE(review): with early stopping the last round is presumably not the
    # best one -- confirm whether the best score was intended here.
    valloss_best = str(eval_result['valid']['logloss'][-1])
    modelname_new = self.modelname.split('.pkl')[0] + valloss_best + '.pkl'
    print('Saving to %s ...' % modelname_new)
    with open(MODELS_FOLDER + '/' + modelname_new, 'wb') as fout:
        pickle.dump(self.gbm, fout)

    if self.show_importance:
        # One plot per importance definition, shown in the original order.
        for importance_type in ('weight', 'gain', 'cover'):
            xgb.plot_importance(self.gbm, importance_type=importance_type)
            plt.show()
    return self.gbm, eval_result, modelname_new
# No negative times
# Use a mean of a job_duration subset to fill
# NOTE(review): the replacement value is taken from `data`, masked by
# `train_pred.job_duration < 0` -- confirm that this is the intended subset.
pred.ix[pred.ix[:, 'job_duration'] < 0, 'job_duration'] = data.ix[train_pred.job_duration < 0, 'job_duration'].mean()

# Set name of model output
model_name = 'xgboost_final_submission'
# Set to dir of submission/output folder
submit = r'/Submissions/'
# Output test prediction to location
pred.to_csv(submit + model_name + '.csv', index=False)

## Output XGBFIR statistics spreadsheet
# this is the xgbfir package and is used to find feature importance
xgbfir.saveXgbFI(xgbreg, OutputXlsxFile=submit + model_name + '_FI.xlsx')

# Append CV Scores to XGBFIR
# The existing workbook is loaded and attached to the writer so the new
# 'cvscore' sheet is added next to the sheets xgbfir wrote.
book = load_workbook(submit + model_name + '_FI.xlsx')
writer = pd.ExcelWriter(submit + model_name + '_FI.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
cvscores.to_excel(writer, sheet_name='cvscore', index=False, engine='openpyxl')
writer.save()

# Dump XGBoost model to a pickle file
save_model_path = r'/home/josh/Documents/Python Scripts/Data Science Challenges/ENGIE DSC/IT Operations/Models/'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails.
with open(save_model_path + model_name + '.dat', "wb") as model_file:
    pickle.dump(xgbreg, model_file)


# In[ ]:
def _additional_task(self):
    """Write the xgbfir workbook for ``self.clf`` into the output folder.

    The file is named ``xgbfir_<fold_num>.xlsx`` using
    ``self.data["fold_num"]`` and placed under ``self.config["out_folder"]``.
    """
    workbook_name = 'xgbfir_{}.xlsx'.format(self.data["fold_num"])
    target_path = os.path.join(self.config["out_folder"], workbook_name)
    xgbfir.saveXgbFI(self.clf, OutputXlsxFile=target_path)
def main():
    """Run the attrition pipeline: load data, build features, stack models.

    Each expensive stage (test imputation, train merge, t-SNE, feature
    engineering, level-1 models) is guarded by a local flag or ``if False``
    and otherwise reads a CSV written by an earlier run from ``./data/mod``.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source; in particular, which branch each bare ``hi`` statement belongs
    to is a guess. ``hi`` is not defined in this function, so reaching it
    would raise NameError unless it is a module-level name -- presumably a
    deliberate "stop here" marker; confirm against the original file.
    """
    np.random.seed(42)
    logger = config.config_logger(__name__, 10)
    t0 = time.time()

    train_client_path = './data/raw/csv/train_clientes.csv'
    train_reque_path = './data/raw/csv/train_requerimientos.csv'
    test_client_path = './data/raw/csv/test_clientes.csv'
    test_reque_path = './data/raw/csv/test_requerimientos.csv'
    output_path = './output/'
    # Stage switches: True rebuilds the artefact, False loads the cached CSV.
    do_merge = False
    write_impute_test = False
    write_output = False
    add_variables = False
    version = 6

    logger.info('Beginning execution')
    logger.info('Load dataframes')
    test_client = pd.read_csv(test_client_path, header=0)
    test_reque = pd.read_csv(test_reque_path, header=0)
    main_client = pd.read_csv(train_client_path, header=0)
    main_reque = pd.read_csv(train_reque_path, header=0)
    work_data.basic_descriptive(main_client)
    work_data.basic_descriptive(main_reque)
    id_variables = work_data.id_variables()
    # Test-set client ids, kept for writing predictions later.
    index_client = test_client['ID_CORRELATIVO']

    # --- Test set: clean + merge, or load the cached version ---------------
    if write_impute_test:
        logger.info('Creating new test database')
        logger.info('Cleaning test reque database')
        test_reque = work_data.preprocess_reque(test_reque)
        print(test_reque.head().to_string())

        logger.info('Cleaning test client database - Imputing missing values')
        test_client = work_data.count_missings_column(test_client)
        test_client = work_data.preprocess_client(test_client)
        print(test_client.head().to_string())

        logger.info('Merging test databases')
        temp = pd.concat([test_client, test_reque], axis=1, join_axes=[test_client.index])
        temp.fillna(0, inplace=True)
        test_df = temp
        print(test_df.head().to_string())
        print(test_df.describe().transpose().to_string())
        logger.info('Saving test database')
        test_df.to_csv('./data/mod/test_imputed.csv', index=False)
    else:
        logger.info('Opening test database')
        test_df = pd.read_csv('./data/mod/test_imputed.csv', header=0)
        print(test_df.head().to_string())

    # --- Train set: clean + merge, or load the cached version --------------
    if do_merge:
        logger.info('Creating new merge')
        logger.info('Cleaning reque database')
        main_reque = work_data.preprocess_reque(main_reque)
        print(main_reque.head().to_string())
        #main_reque = pd.pivot_table(main_reque, index=['ID_CORRELATIVO'], columns=['CODMES'], aggfunc=np.sum)
        #main_reque.columns = main_reque.columns.map('{0[0]}|{0[1]}'.format)
        #main_reque.fillna(0, inplace=True)

        logger.info('Cleaning client database - Imputing missing values')
        main_client = work_data.count_missings_column(main_client)
        # The target is removed before preprocessing and re-attached after.
        target = main_client.pop('ATTRITION')
        target.index = main_client['ID_CORRELATIVO']
        main_client = work_data.preprocess_client(main_client)
        main_client['ATTRITION'] = target
        print(main_client.head().to_string())

        logger.info('Merging databases')
        temp = pd.concat([main_client, main_reque], axis=1, join_axes=[main_client.index])
        temp.fillna(0, inplace=True)
        main_df = temp
        print(main_df.shape)
        print(main_df.head().to_string())
        print(main_df.describe().transpose().to_string())
        work_data.basic_descriptive(main_df)
        logger.info('Saving marges database')
        main_df.to_csv('./data/mod/merge1.csv', index=False)
    else:
        logger.info('Opening merged database')
        main_df = pd.read_csv('./data/mod/merge1.csv', header=0)
        print(main_df.head().to_string())
        print(main_df.shape)

    y = main_df.pop('ATTRITION')
    # Train rows first, then test rows, so features are built on both at once.
    main_df = main_df.append(test_df).reset_index(drop=True)

    # --- t-SNE embedding (recomputation is disabled) -----------------------
    if False:
        logger.info('Creating T-SNE database')
        temp_tsne = pd.DataFrame(models.tnse(main_df))
        temp_tsne.to_csv('./data/mod/merge1_tsne.csv', index=False)
    else:
        logger.info('Loading T-SNE database')
        temp_tsne = pd.read_csv('./data/mod/merge1_tsne.csv')

    # --- Feature engineering, or load the cached feature table -------------
    if add_variables:
        logger.info('Beginning feature engineering')
        logger.info('Interactions')
        main_df_feat = models.create_interactions(main_df, models.inter_vars())

        logger.info('Row sums 1-3')
        # ext1: zeros per row; ext2/ext3: scaled values above 0.5 / below -0.5.
        main_df_feat['ext1'] = main_df.apply(lambda row: (row == 0).sum(), axis=1)
        temp = models.standard_scale_df(main_df)
        main_df_feat['ext2'] = temp.apply(lambda row: (row > 0.5).sum(), axis=1)
        main_df_feat['ext3'] = temp.apply(lambda row: (row < -0.5).sum(), axis=1)

        logger.info('K-means 4-7')
        # Cluster labels are stored as strings so get_dummies one-hot encodes them.
        main_df_feat['ext4'] = pd.Series(models.kmeans(main_df, 5)).apply(str)
        main_df_feat['ext5'] = pd.Series(models.kmeans(main_df, 10)).apply(str)
        main_df_feat['ext6'] = pd.Series(models.kmeans(main_df, 15)).apply(str)
        main_df_feat['ext7'] = pd.Series(models.kmeans(main_df, 20)).apply(str)

        logger.info('KNN 8-11')
        main_df_feat['ext8'] = models.knn_distance(main_df, 2)
        main_df_feat['ext9'] = models.knn_distance(main_df, 3)
        main_df_feat['ext10'] = models.knn_distance(main_df, 5)
        main_df_feat['ext11'] = models.knn_distance(temp_tsne, 2)

        main_df_feat = pd.get_dummies(main_df_feat, drop_first=True)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)
        config.time_taken_display(t0)
        logger.info('Saving features database')
        main_df_feat.to_csv('./data/mod/merge1_features.csv', index=False)
    else:
        logger.info('Opening feature engineered database')
        main_df_feat = pd.read_csv('./data/mod/merge1_features.csv', header=0)
        print(main_df_feat.head().to_string())
        print(main_df_feat.shape)

    logger.info('Split data into train and test')
    # The first 70000 rows are treated as train, the remainder as test.
    x, test_df = main_df_feat.iloc[:70000, :], main_df_feat.iloc[70000:, :]
    print(main_df_feat.shape)
    print(x.shape)
    print(test_df.shape)

    x_train, x_test, y_train, y_test = models.split_data(x, y)
    work_data.basic_descriptive(x_train)

    # --- Level 1: one prediction file per base model (disabled) ------------
    logger.info('Level 1 - Create metafeatures')
    if False:
        logger.info('1. Ridge logit')
        ridge_model = models.logit_grid(x, y, 'l2', StandardScaler())
        models.write_prediction(ridge_model, main_df_feat, index_client, 'ridge_standard')
        print(ridge_model.score(x_test, y_test))

        logger.info('2. Lasso logit')
        lasso_model = models.logit_grid(x, y, 'l1',StandardScaler())
        models.write_prediction(lasso_model, main_df_feat, index_client, 'lasso_standard')
        print(lasso_model.score(x_test, y_test))

        logger.info('3. Random Forrest')
        RF_model = models.random_forrest_grid(x, y, StandardScaler())
        models.write_prediction(RF_model, main_df_feat, index_client, 'RF_standard')
        print(RF_model.score(x_test, y_test))

        logger.info('4. Extra Trees')
        ET_model = models.extra_trees_grid(x, y, StandardScaler())
        models.write_prediction(ET_model, main_df_feat, index_client, 'ET_standard')
        print(ET_model.score(x_test, y_test))

        logger.info('5. 2-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 2)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN2_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('6. 4-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 4)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN4_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('7. 8-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 8)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN8_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('8. 16-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 16)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN16_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('9. 32-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 32)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN32_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('10. 64-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 64)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN64_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('11. 128-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 128)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN128_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('12. 256-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 256)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN256_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('13. 512-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 512)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN512_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('14. 1024-KNN')
        KNN_model = models.knn_grid(x, y, StandardScaler(), 1024)
        models.write_prediction(KNN_model, main_df_feat, index_client, 'KNN1024_standard')
        print(KNN_model.score(x_test, y_test))

        logger.info('15. Naive Bayes')
        NB_model = models.naive_bayes_grid(x, y, StandardScaler())
        models.write_prediction(NB_model, main_df_feat, index_client, 'NB_standard')
        print(NB_model.score(x_test, y_test))

        logger.info('16. MPL')
        MLP_model = models.MLP_grid(x, y, StandardScaler())
        models.write_prediction(MLP_model, main_df_feat, index_client, 'MLP_standard')
        print(MLP_model.score(x_test, y_test))

        logger.info('17. AdaBoost')
        adaboost_model = models.adaboost_grid(x, y, StandardScaler())
        models.write_prediction(adaboost_model, main_df_feat, index_client, 'adaboost_standard')
        print(adaboost_model.score(x_test, y_test))

        logger.info('18. GBM')
        gbm_model = models.gbm_grid(x, y, StandardScaler())
        models.write_prediction(gbm_model, main_df_feat, index_client, 'gbm_standard')
        print(gbm_model.score(x_test, y_test))

        logger.info('18. LightGBM')
        lgbm_model = models.lgbm_grid(x, y, None)
        models.write_prediction(lgbm_model, main_df_feat, index_client, 'lgbm_none')
        print(lgbm_model.score(x_test, y_test))

        logger.info('19. XgBoost')
        test_final = main_df_feat.iloc[70000:, :]
        id_test = test_client['ID_CORRELATIVO']
        xgboost_model = models.xgboost_grid(x, y, StandardScaler())
        models.write_prediction(xgboost_model, main_df_feat, index_client, 'xgboost_standard')
        print(xgboost_model.score(x_test, y_test))
        models.write_prediction(xgboost_model, test_final, id_test, 'ATTRITION')
        # NOTE(review): see the docstring -- placement of this marker inside
        # the disabled branch is a reconstruction guess.
        hi

    # Stage 2:
    logger.info('Level 2')
    logger.info('Creating meta-features database')
    meta_features_list = os.listdir('./data/mod/meta_features')
    temp = {}
    # The second column of every level-1 prediction file becomes one
    # meta-feature, keyed by the file name.
    for feature in meta_features_list:
        temp_df = pd.read_csv('./data/mod/meta_features/{0}'.format(feature), header=0)
        temp[feature] = temp_df.iloc[:, 1]
    meta_features = pd.DataFrame(temp)
    meta_features = pd.concat([meta_features, main_df_feat], axis=1, ignore_index=True)
    x = meta_features.iloc[:70000, :]
    test_final = meta_features.iloc[70000:, :]
    x_train, x_test, y_train, y_test = models.split_data(x, y)
    print(x_train.shape)
    print(test_final.shape)
    print(x.shape)

    logger.info('Estimating second level model with XgBoost')
    xgboost_final = models.xgboost_full_mod(x_train, y_train)
    print(xgboost_final.score(x_test, y_test))
    print(models.get_logloss(y_test, xgboost_final.predict_proba(x_test)[:, 1]))
    models.write_final_prediction(xgboost_final, test_final, test_client['ID_CORRELATIVO'], 'results8')
    models.write_final_prediction(xgboost_final, x, main_client['ATTRITION'], 'train')

    config.time_taken_display(t0)
    # NOTE(review): see the docstring -- if `hi` is undefined, execution
    # stops here and the statements below never run.
    hi

    logger.info('XgBoost')
    xgboost_result = models.xgboost_grid(x_train, y_train, x_test, y_test)
    print('Test grid: {0}'.format(xgboost_result))  #Test: -0.322
    xgboost_full = models.xgboost_full_mod(x_train, y_train, x_test, y_test)
    print(xgboost_full)
    xgbfir.saveXgbFI(xgboost_full, feature_names=main_df.columns, OutputXlsxFile='./data/mod/bbva.xlsx')
# Print the text dump of the first, second and last tree of the model.
booster = xg.get_booster()
for tree_position in (0, 1, -1):
    print(booster.get_dump()[tree_position])


# Residuals plot
regressor.residuals_plot(xg, auto_X_train, auto_y_train, auto_X_test, auto_y_test)

# viewing interactions
interactions_workbook = 'fir-auto.xlsx'
xgbfir.saveXgbFI(xg, feature_names=auto_X.columns, OutputXlsxFile=interactions_workbook)

# column importance
# Gain - total gain of each feature
# Fscore - number of splits
# wFscore - weighted number of splits (by probability of split taking place)
pd.read_excel(interactions_workbook).head(3).T

# column importance
# Gain - total gain of each feature
# Fscore - number of splits
# wFscore - weighted number of splits (by probability of split taking place)
pd.read_excel(interactions_workbook, sheet_name='Interaction Depth 1').head(3).T

# column importance
# In[1]:

from sklearn.datasets import load_iris, load_boston
import xgboost as xgb
import xgbfir

# Regression example: fit a default XGBRegressor on the Boston data and
# export its feature interactions with readable feature names.
boston = load_boston()
xgb_rmodel = xgb.XGBRegressor()
xgb_rmodel.fit(boston['data'], boston['target'])
xgbfir.saveXgbFI(
    xgb_rmodel,
    feature_names=boston.feature_names,
    OutputXlsxFile='bostonFI.xlsx',
)

# Classification example: same steps with a default XGBClassifier on iris.
iris = load_iris()
xgb_cmodel = xgb.XGBClassifier()
xgb_cmodel.fit(iris['data'], iris['target'])
xgbfir.saveXgbFI(
    xgb_cmodel,
    feature_names=iris.feature_names,
    OutputXlsxFile='irisFI.xlsx',
)

# Check working directory. There will be two new files: **bostonFI.xlsx** and **irisFI.xlsx**.
model.get_score(fmap=fmap_filename, importance_type='cover')).to_frame()) feat_imp.columns = ['Weight', 'Gain', 'Cover'] feat_imp['FeatureName'] = feat_imp.index feat_imp['Model'] = model_name feat_imp['fold'] = ind FI_df = pd.concat([FI_df, feat_imp]) if XGBFirFlg: print('Feature Interaction') interactions_data_path = '/opt/ml/processing/output_importance/interactions_%s_%s.xlsx' % ( model_name, ind) xgbfir.saveXgbFI(model, feature_names=featureset, TopK=500, MaxTrees=500, MaxInteractionDepth=2, OutputXlsxFile=interactions_data_path) print('Averaging results') #FI num_folds = FI_df['fold'].max() + 1 #number of columns with folds scores depends on the number of folds (num_folds) We do not know in advance how many of them exist in the results folds_train_columns = [] folds_test_columns = [] folds_gain_columns = [] folds_weight_columns = [] folds_cover_columns = [] for i in range(0, int(num_folds), 1): folds_train_columns.append('train-%s-fold' % i) folds_test_columns.append('test-%s-fold' % i)
def _save_feature_importance(self):
    """Write the xgbfir workbook for ``self.clf``.

    The file is ``xgbfir_<fold_num>.xlsx``, appended directly to
    ``self.output_folder`` (no path separator is inserted).
    """
    workbook_name = 'xgbfir_%d.xlsx' % self.fold_num
    workbook_path = self.output_folder + workbook_name
    xgbfir.saveXgbFI(self.clf, OutputXlsxFile=workbook_path)
def train_models(self, workflow, datasource, dataset, y=None, test_dataset=None):
    """Train the models requested by ``workflow`` and collect predictions.

    For supervised workflows the target is label-encoded and, when the
    workflow asks for ``'fold'`` validation, every requested model is
    evaluated with ``cross_val_predict`` on a ``StratifiedKFold`` split.
    Other validation types are not implemented: the estimators are built but
    no predictions are stored for them. Non-supervised workflows only
    instantiate the requested estimators; nothing is fitted.

    Args:
        workflow: Object exposing the ``model_processing`` mapping (keys
            ``type`` and ``models``), the ``validation`` mapping (keys
            ``type`` and ``value``) and ``_id``. ``fi_booster`` is set on it
            when an XGBoost model is fitted.
        datasource: Object exposing ``predictor_details`` and
            ``predictor_target_name``, used to look up the target predictor.
        dataset: Feature table, read through ``.values``, ``.shape`` and
            ``preprocessing.scale``.
        y: Target values; required for supervised workflows.
        test_dataset: Currently unused.

    Returns:
        ``(trained_models, model_processing_type, model_processing_detail)``.
        ``trained_models`` maps a model key (``'xgb'``, ``'knn'``, ``'lr'``,
        ``'nn'``, ``'rf'``) to its cross-validated predictions.
        ``model_processing_detail`` is ``'regression'``,
        ``'classification_binary'`` or ``'classification_multi'``, and
        ``None`` for non-supervised workflows.
    """
    print('start training models')
    trained_models = dict()
    # Only supervised workflows have a target to classify; keeping a default
    # here lets the final ``return`` work for every workflow type instead of
    # raising UnboundLocalError.
    model_processing_detail = None

    model_processing_type = workflow.model_processing.get('type')
    processing_models = workflow.model_processing.get('models')
    validation_type = workflow.validation.get('type')
    validation_value = workflow.validation.get('value')
    print(model_processing_type, processing_models, validation_type, validation_value)

    if model_processing_type == 'supervised':
        # Locate the predictor description of the target column.
        y_predictor = None
        for p in datasource.predictor_details:
            if p.get('name') == datasource.predictor_target_name:
                y_predictor = p
                break

        if y_predictor.get('predictor_type').get('description', None) == 'continuous':
            model_processing_detail = 'regression'
        else:
            y_value_counts = y.value_counts()
            if len(y_value_counts) > 2:
                model_processing_detail = 'classification_multi'
            else:
                model_processing_detail = 'classification_binary'
        print(model_processing_detail)

        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        # Predictions are only produced for fold validation. They are stored
        # inside the ``if use_folds`` branches below so that an unset or
        # stale ``y_pred`` can never be recorded for another validation type.
        use_folds = validation_type == 'fold'
        if use_folds:
            from sklearn.model_selection import cross_val_predict
            from sklearn.model_selection import StratifiedKFold
            skf = StratifiedKFold(n_splits=validation_value)

        if 'rlist' in processing_models and 'classification' in model_processing_detail:
            pass  # placeholder: not implemented

        if 'xgb' in processing_models:
            # NOTE(review): for 'regression' this still selects
            # 'multi:softprob', which looks wrong for XGBRegressor -- confirm
            # the intended regression objective.
            objective = 'binary:logistic' \
                if 'binary' in model_processing_detail else 'multi:softprob'
            n_estimators = 20
            silent = 1
            subsample = .7
            colsample_bytree = .7
            learning_rate = .1
            max_depth = 7
            min_child_weight = 2
            if 'classification' in model_processing_detail:
                from xgboost import XGBClassifier
                xgb = XGBClassifier(n_estimators=n_estimators, objective=objective,
                                    silent=silent, subsample=subsample,
                                    colsample_bytree=colsample_bytree,
                                    learning_rate=learning_rate,
                                    max_depth=max_depth,
                                    min_child_weight=min_child_weight)
            else:
                from xgboost import XGBRegressor
                xgb = XGBRegressor(n_estimators=n_estimators, objective=objective,
                                   silent=silent, subsample=subsample,
                                   colsample_bytree=colsample_bytree,
                                   learning_rate=learning_rate,
                                   max_depth=max_depth,
                                   min_child_weight=min_child_weight)
            if use_folds:
                y_pred = cross_val_predict(xgb, dataset.values, y_encoded,
                                           cv=skf, n_jobs=-1, verbose=9)
                # Refit on all rows so a single booster can be exported for
                # the feature-interaction report.
                xgb.fit(dataset.values, y_encoded)
                from settings import location
                import xgbfir
                workflow.fi_booster = location(
                    'workflow_data') + '/' + str(workflow._id) + '_fi.xlsx'
                print('save xgbfi', xgb._Booster, workflow.fi_booster)
                xgbfir.saveXgbFI(xgb._Booster, OutputXlsxFile=workflow.fi_booster)
                trained_models['xgb'] = y_pred

        if 'frlp' in processing_models:
            pass  # placeholder: not implemented

        if 'knn' in processing_models:
            if 'classification' in model_processing_detail:
                from sklearn.neighbors import KNeighborsClassifier
                knn = KNeighborsClassifier(n_neighbors=5, algorithm='auto', n_jobs=-1)
            else:
                from sklearn.neighbors import KNeighborsRegressor
                knn = KNeighborsRegressor(n_neighbors=5, algorithm='auto', n_jobs=-1)
            from sklearn import preprocessing
            X = preprocessing.scale(dataset)
            if use_folds:
                trained_models['knn'] = cross_val_predict(
                    knn, X, y_encoded, cv=skf, n_jobs=-1, verbose=9)
            del X

        if 'lr' in processing_models:
            # Solver choice: 'liblinear' for small binary problems (it is
            # limited to one-vs-rest), 'lbfgs' where a multinomial loss is
            # needed, and 'sag' for large data sets. 'sag' expects features
            # on a similar scale, hence the scaling below.
            if 'classification' in model_processing_detail:
                multi_class = 'ovr' if 'binary' in model_processing_detail else 'multinomial'
                if dataset.shape[0] <= 1000:
                    solver = 'liblinear' if multi_class == 'ovr' else 'lbfgs'
                elif dataset.shape[0] >= 10000:
                    solver = 'sag'
                else:
                    solver = 'lbfgs'
                class_weight = 'balanced'
                n_jobs = -1
                from sklearn.linear_model import LogisticRegression
                lr = LogisticRegression(solver=solver, class_weight=class_weight,
                                        n_jobs=n_jobs, multi_class=multi_class)
                from sklearn import preprocessing
                X = preprocessing.scale(dataset)
                if use_folds:
                    trained_models['lr'] = cross_val_predict(
                        lr, X, y_encoded, cv=skf, n_jobs=-1, verbose=9)
                del X
            # No logistic regression is run for regression targets.

        if 'nn' in processing_models:
            # 'adam' suits data sets with a thousand samples or more; 'lbfgs'
            # tends to converge faster on smaller ones.
            solver = 'adam' if dataset.shape[0] >= 1000. else 'lbfgs'
            if 'classification' in model_processing_detail:
                from sklearn.neural_network import MLPClassifier
                nn = MLPClassifier(solver=solver, hidden_layer_sizes=(50, 3))
            else:
                from sklearn.neural_network import MLPRegressor
                nn = MLPRegressor(solver=solver, hidden_layer_sizes=(50, 3))
            from sklearn import preprocessing
            X = preprocessing.scale(dataset)
            if use_folds:
                trained_models['nn'] = cross_val_predict(
                    nn, X, y_encoded, cv=skf, n_jobs=-1, verbose=9)
            del X

        if 'rf' in processing_models:
            n_estimators = 50
            n_jobs = -1
            max_depth = 7
            if 'classification' in model_processing_detail:
                from sklearn.ensemble import RandomForestClassifier
                rf = RandomForestClassifier(n_estimators=n_estimators,
                                            max_depth=max_depth,
                                            class_weight='balanced',
                                            n_jobs=n_jobs)
            else:
                from sklearn.ensemble import RandomForestRegressor
                rf = RandomForestRegressor(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           n_jobs=n_jobs)
            if use_folds:
                trained_models['rf'] = cross_val_predict(
                    rf, dataset.values, y_encoded, cv=skf, n_jobs=-1, verbose=9)
    else:
        # Non-supervised estimators are only instantiated for now; fitting
        # them is not implemented yet.
        if 'gm' in processing_models:
            from sklearn.mixture import GaussianMixture
            gm = GaussianMixture()
        if 'kmean' in processing_models:
            from sklearn.cluster import KMeans
            kmean = KMeans()
        if 'dbscan' in processing_models:
            from sklearn.cluster import DBSCAN
            dbscan = DBSCAN()
        if 'pca' in processing_models:
            from sklearn.decomposition import PCA
            pca = PCA()
        if 'rbm' in processing_models:
            from sklearn.neural_network import BernoulliRBM
            rbm = BernoulliRBM()

    return trained_models, model_processing_type, model_processing_detail
# Z_pred = model.predict_proba(Z_test)[:,1]
Z_test = xgb.DMatrix(dfTest[feats].values)
Z_pred = bst.predict(Z_test, ntree_limit=bst.best_ntree_limit)

# submission
df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": Z_pred})
df.sort_values("instanceID", inplace=True)
df.to_csv("submission.csv", index=False)
with zipfile.ZipFile("submission.zip", "w") as fout:
    fout.write("submission.csv", compress_type=zipfile.ZIP_DEFLATED)

# Experiment log (feature set / boosting round -> logloss):
# 71 feats + feats2 validation_0-logloss:0.106431 validation_1-logloss:0.110743
# 72 feats validation_0-logloss:0.106431 validation_1-logloss:0.110744
# 59 feats2 validation_0-logloss:0.116213 validation_1-logloss:0.119058
# 66 feats validation_0-logloss:0.106338 validation_1-logloss:0.110628
# 82 feats validation_0-logloss:0.105197 validation_1-logloss:0.109576
# 166 feats validation_0-logloss:0.101096 validation_1-logloss:0.105124
# [133] validation_0-logloss:0.100174 validation_1-logloss:0.10421 (median imputation)
# [124] validation_0-logloss:0.100159 validation_1-logloss:0.104205
# [139] validation_0-logloss:0.100071 validation_1-logloss:0.104137
# [461] validation_0-logloss:0.096659 validation_1-logloss:0.102084
# 100 0.1027
# 100 0.102788645829
# [99] eval-logloss:0.102789 train-logloss:0.09784

print('save xgbfir')
# The export below is said to modify the list passed as feature_names, which
# is why it runs last. Hand it a copy: a plain assignment would only alias
# `feats`, so `feats` itself would still be changed.
featmp = list(feats)
xgbfir.saveXgbFI(bst, feature_names=featmp, OutputXlsxFile='xgboost.xlsx')
import pandas as pd
import xgboost as xgb

sys.path.append("../src")
from base import Utilities, Config
from common import CustomTransformation

# Training data and booster parameters.
config = Config()
train_module = CustomTransformation(config, 'train')
watchlist = [(train_module.ddata, 'train')]
print(train_module.final_columns)
params = Utilities.load_json(config.params_file)

# Cross-validation history first, then the booster used for predictions.
history = xgb.cv(
    params,
    train_module.ddata,
    num_boost_round=300,
    early_stopping_rounds=30,
    metrics=["auc", "error"],
    verbose_eval=True,
)
model = xgb.train(params, train_module.ddata, num_boost_round=200, verbose_eval=True)

# Predict on the test set, built with the columns selected during training.
class_mapping = Utilities.load_json(config.class_mapping_file)
test_module = CustomTransformation("test", class_mapping, train_module.final_columns)
y_pred = model.predict(test_module.ddata)

# Submission file: one id column and one target column, named by the config.
submission_df = pd.DataFrame({
    config.notable_columns["ID"]: list(test_module.main_column.values),
    config.notable_columns["Target"]: list(y_pred),
})
submission_df.to_csv(
    os.path.join(config.home, 'submission', 'one.csv'),
    float_format='%0.6f',
    index=False,
)

# Feature-interaction report for the trained booster.
xgbfir.saveXgbFI(
    model,
    feature_names=train_module.final_columns,
    TopK=500,
    SortBy='Gain',
    MaxTrees=500,
    MaxInteractionDepth=2,
    OutputXlsxFile='XGBoost-FI.xlsx',
)
def xgboost_train(train=None, train_target=None, test=None, id=None, load=False):
    """Train an XGBoost regressor, export diagnostics and write a submission.

    The function fits a booster on ``train``/``train_target``, saves the
    xgbfir interaction workbook plus a gain plot and CSV under ``../output``,
    predicts on ``test`` and writes a gzipped submission under ``../subm``.

    Args:
        train: Training features handed to ``xgb.DMatrix``.
        train_target: Training labels handed to ``xgb.DMatrix``.
        test: Features to predict on.
        id: Values for the ``id`` column of the submission. The name shadows
            the builtin but is kept so existing callers keep working.
        load: When true, all four inputs above are replaced by the contents of
            ``train.h5``, ``test.h5``, ``id.h5`` and ``train_target.h5``.

    Returns:
        None. All results are written to disk.
    """
    print("Start training")
    start = time.time()

    if load:
        train = pd.read_hdf("train.h5", "train")
        test = pd.read_hdf("test.h5", "test")
        id = pd.read_hdf("id.h5", "id")
        train_target = pd.read_hdf("train_target.h5", "train_target")

    # optimized hyperparameters
    param = {
        "objective": "reg:linear",
        "booster": "gbtree",
        "eta": 0.04,
        "max_depth": 8,
        "min_child_weight": 5,
        "subsample": 1,
        "colsample_bytree": 0.5,
        "colsample_bylevel": 1,
        "gamma": 10,
        "lambda": 1,
        "alpha": 1,
        "silent": 1,
        "nthread": 24,
        "seed": 1991,
        # we are using a new Xgboost tree creation algorithm that is much
        # faster, you need the newest version for that
        "tree_method": "hist",
        "eval_metric": "rmse",
    }
    num_round = 3000

    dtrain = xgb.DMatrix(train, train_target)
    watchlist = [(dtrain, "train")]
    gbm = xgb.train(param, dtrain, num_round, evals=watchlist, verbose_eval=True)

    os.makedirs("../output", exist_ok=True)
    xgbfir.saveXgbFI(gbm, TopK=300,
                     OutputXlsxFile="../output/XgbFeatureInteractions.xlsx")

    # Per-feature "gain" score multiplied by the "weight" score, used as the
    # total gain of each feature.
    gain = pd.Series(gbm.get_score(importance_type="gain")) * pd.Series(
        gbm.get_score(importance_type="weight"))
    gain = gain.reset_index()
    gain.columns = ["features", "gain"]
    gain.sort_values(by="gain", inplace=True)

    # One timestamp for both gain artifacts so the PNG and CSV names match.
    stamp = time.strftime("%Y_%m_%d_%H_%M_%S")
    gain.plot(kind="barh", x="features", y="gain", legend=False,
              figsize=(10, 20))
    plt.title("XGBoost Total Gain")
    plt.xlabel("Total Gain")
    plt.savefig("../output/XGBOOST_GAIN_" + stamp + ".png",
                bbox_inches="tight", pad_inches=1)
    gain.sort_values(by="gain", ascending=False).to_csv(
        "../output/Gain_" + stamp + ".csv")

    dtest = xgb.DMatrix(test)
    y_pred = gbm.predict(dtest)

    submission = pd.DataFrame({
        "id": id,
        "Demanda_uni_equil": np.expm1(y_pred)
    })
    # Select the columns by name. The previous rotation of the column list
    # only produced "id" first when pandas sorted dict keys alphabetically;
    # with insertion-ordered frames it moved "id" to the end.
    submission = submission[["id", "Demanda_uni_equil"]]

    os.makedirs("../subm", exist_ok=True)
    submission.to_csv("../subm/submission_xgboost_" +
                      time.strftime("%Y_%m_%d_%H_%M_%S") + ".csv.gz",
                      compression="gzip", index=False)
    print("Training and submitting took {:.1f}min".format(
        (time.time() - start) / 60))
# # Xgbfir simple example
# A minimal, runnable demonstration of calling Xgbfir from Python code.

# In[1]:

from sklearn.datasets import load_iris, load_boston
import xgboost as xgb
import xgbfir

# --- Regression example ----------------------------------------------------
# load the Boston dataset
boston = load_boston()

# fit an XGBoost regressor with default settings
xgb_rmodel = xgb.XGBRegressor()
xgb_rmodel.fit(boston['data'], boston['target'])

# write the interaction workbook, labelled with the dataset's feature names
xgbfir.saveXgbFI(
    xgb_rmodel,
    feature_names=boston.feature_names,
    OutputXlsxFile='bostonFI.xlsx',
)

# --- Classification example ------------------------------------------------
# load the iris dataset
iris = load_iris()

# fit an XGBoost classifier with default settings
xgb_cmodel = xgb.XGBClassifier()
xgb_cmodel.fit(iris['data'], iris['target'])

# write the interaction workbook, labelled with the dataset's feature names
xgbfir.saveXgbFI(
    xgb_cmodel,
    feature_names=iris.feature_names,
    OutputXlsxFile='irisFI.xlsx',
)

# Check working directory. There will be two new files: **bostonFI.xlsx** and **irisFI.xlsx**.
# Fit with early stopping, tracking logloss on both evaluation sets.
model.fit(X_train, Y_train,
          eval_set=([X_train, Y_train], [X_test, Y_test]),
          eval_metric="logloss",
          early_stopping_rounds=3)
Y_pred = model.predict_proba(X_test)[:, 1]
print(X_train.shape)
# Parenthesised so the line is valid on Python 3 as well; with a single
# argument the output is unchanged on Python 2.
print(logloss(Y_test, Y_pred))

# mysubmission
# NOTE(review): the instanceID column is filled with Y_test here, which looks
# like labels rather than ids - confirm this is intended.
df = pd.DataFrame({"instanceID": Y_test, "proba": Y_pred})
df.sort_values("instanceID", inplace=True)
df.to_csv("submissionx.csv", index=False)
with zipfile.ZipFile("submissionx.zip", "w") as fout:
    fout.write("submissionx.csv", compress_type=zipfile.ZIP_DEFLATED)

Z_test = dfTest[feats].values
Z_pred = model.predict_proba(Z_test)[:, 1]

# submission
df = pd.DataFrame({"instanceID": dfTest["instanceID"].values, "proba": Z_pred})
df.sort_values("instanceID", inplace=True)
df.to_csv("submission.csv", index=False)
with zipfile.ZipFile("submission.zip", "w") as fout:
    fout.write("submission.csv", compress_type=zipfile.ZIP_DEFLATED)

print('save xgbfir')
# The line below changes the contents of feats, so it has to come last.
xgbfir.saveXgbFI(model, feature_names=feats, OutputXlsxFile='xgboost.xlsx')
import time
import xgboost as xgb
import xgbfir

t0org0 = pd.read_csv("train.csv")
#h0org = pd.read_csv("test.csv")
# Parenthesised so the line is valid on Python 3 as well; with a single
# argument the output is unchanged on Python 2.
print(t0org0.columns)

# NOTE: features is the same object as t0org0, so the in-place drop below
# also removes the 'label' column from t0org0.
features = t0org0
lable = features['label']
features.drop(['label'], axis=1, inplace=True)
features.userID = features.userID.astype('int64')
features.cnt_advertiserID = features.cnt_advertiserID.astype('int64')
# features.drop(['conversionTime'], axis=1,inplace=True)
# dtrain = xgb.DMatrix(feature, label=lable, missing=-1)
# dvalid = xgb.DMatrix(xvalid, label=yvalid, missing=-1)

# Fit a default classifier on all remaining columns.
xgb_cmodel = xgb.XGBClassifier().fit(features, lable)

# saving to file with proper feature names
xgbfir.saveXgbFI(xgb_cmodel, feature_names=features.columns, OutputXlsxFile='irisFI1.xlsx')

# irisFI = [pd.read_excel("irisFI.xlsx", sheetname = "Interaction Depth %d" % i) for i in range(3)]
# one_feature_list=irisFI[0].Interaction
# for column in one_feature_list:
#     print column, t0org0[column].unique().shape, t0org0[column].min(), t0org0[column].max()
print("Fitting fold %d" % fold_num) model.fit(X[train_idx], y[train_idx], eval_metric="rmse") score = r2_score(y[val_idx], model.predict(X[val_idx])) cv_scores.append(score) print("Eval. score (R2-score) for fold {} = {}\n".format(fold_num, score)) fold_num += 1 print("Mean CV score = {}; Std. dev. CV score = {}\n".format( np.mean(cv_scores), np.std(cv_scores))) feat_imp = pd.DataFrame(data=model.feature_importances_, index=top_10_features) ## Using xgbfir to learn more about feature interactions and create new useful features import xgbfir # saving to file with proper feature names xgbfir.saveXgbFI(model, feature_names=all_features, OutputXlsxFile='predict_returns_FI.xlsx') # Creating new features based on XGBFI file train['country_desk_id'] = train['country_code'] * 10000 + train['desk_id'] train['pr_loss_maxibor'] = train[['euribor_rate', 'libor_rate']].apply( max, axis=1) * train['profit_loss'] train['pr_loss_euribor'] = train['profit_loss'] * train['euribor_rate'] train['pr_loss_libor'] = train['profit_loss'] * train['libor_rate'] train[ 'currency_euribor_pr_loss'] = train['currency'] * train['pr_loss_euribor'] test['country_desk_id'] = test['country_code'] * 10000 + test['desk_id'] test['pr_loss_maxibor'] = test[['euribor_rate', 'libor_rate']].apply( max, axis=1) * test['profit_loss'] test['pr_loss_euribor'] = test['profit_loss'] * test['euribor_rate']
# Train a booster for 300 rounds and look at gain-based importances.
clf = xgb.train(param, X_train, num_boost_round=300)
im = clf.get_score(importance_type='gain')
xgb.plot_importance(clf, height=0.5)

# Predict on the test matrix and export the probabilities per policy.
pred = clf.predict(X_test)
test['conv_prob'] = pred
result_columns = ['policy_id', 'conv_prob']
test[result_columns].to_csv('test_result_tao.csv', index=False)
roc_auc_score(y, pred)

# Export the xgbfir feature-interaction workbook.
xgbfir.saveXgbFI(
    clf,
    feature_names=list(X.columns),
    OutputXlsxFile='interaction.xlsx',
)

#param_list = {
#    'max_depth': range(1, 5),
##   'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001]
#    }
#
#search = GridSearchCV(model, param_list, 'roc_auc', cv = 3, iid = False)
#
#search.fit(X, y)
label='>300kb var-gene dist', color=(0.55, 0.63, 0.80)) plt.legend(fontsize=12) plt.ylabel("Predicted eQTL Prob.", fontsize=16) plt.xlabel("HiCNormed_100kb_p", fontsize=16) plt.tight_layout() plt.savefig("HiCNormed_100kb_p_change.png", dpi=300) ### Feature importance plot ### import xgbfir import pickle model = pickle.load( open('./random_assembled_balanced_dataset_123_Xy_models.pkl', 'r'))['FULL'][0] xgbfir.saveXgbFI(model, feat_name, OutputXlsxFile='random_model.xlsx') dfs = pd.read_excel('random_model.xlsx', sheetname=None) order_0 = dfs[u'Interaction Depth 0'] order_0_map = [(k, v) for k, v in zip(order_0['Interaction'], order_0['Gain'])][:40] color_mapping = { 'p': (0.4, 0.7607843137254902, 0.6470588235294118), 'g': (0.9882352941176471, 0.5529411764705883, 0.3843137254901961), 'v': (0.5, 0.5, 0.796078431372549) } names = [p[0] for p in order_0_map] gains = [p[1] for p in order_0_map] colors = [color_mapping[s[-1]] for s in names] fig, ax = plt.subplots()
# Split the training frame into the target column and the feature matrix.
X = train
Y = train['is_female']
tempX = X.copy()
del tempX['is_female']
del tempX['train_id']
# DataFrame.as_matrix is deprecated and absent from pandas >= 1.0; .values
# yields the same ndarray on old and new versions alike.
Y_train = Y.values

xgdmat = xgb.DMatrix(tempX, Y_train)
our_params = {'eta': 0.01, 'seed': 27, 'subsample': 0.8,
              'colsample_bytree': 0.8, 'gamma': 5, 'eval_metric': 'auc',
              'objective': 'binary:logistic', 'max_depth': 7,
              'min_child_weight': 1, 'lambda': 0.1,
              'scale_pos_weight': 0.862}
booster = xgb.train(our_params, xgdmat, num_boost_round=2700)

######################################################################################
######################################################################################
#topK = 150
xgbfir.saveXgbFI(booster, TopK=150, MaxTrees=1000, MaxInteractionDepth=5,
                 OutputXlsxFile='xgb.xlsx')

#########################################################################################
#manually copy and paste features from xgb.xlsx and then read it
feature_list = get_feature('features.csv')
feature_list.sort()

# Reduced training set: target, id and the selected features.
final_list = ['is_female', 'train_id']
final_list.extend(feature_list)
train_with_selected_features = X[final_list].copy()
train_with_selected_features.to_csv('new_train_150.csv', index=False)

# Reduced test set: id and the same selected features.
final_test_list = ['test_id']
final_test_list.extend(feature_list)
test[final_test_list].to_csv('new_test_150.csv', index=False)
###########################################################################################
# Draw the importance chart on a tall figure and save it to disk.
fig, ax = plt.subplots(figsize=(12, 18))
xgb.plot_importance(bst_model, ax=ax, height=0.8)
#plt.show()
fig.savefig('feature_importance.png')

# show feature importance table
#fmap = bst_model.get_score(importance_type='cover')
#print(fmap)
fmap = bst_model.get_score(importance_type='gain')
print(fmap)
#fmap = bst_model.get_score(importance_type='weight')
#print(fmap)

# saving to file with proper feature names
xgbfir.saveXgbFI(
    bst_model,
    OutputXlsxFile='future_interaction.xlsx',
)

# predict on test data
preds = bst_model.predict(dtest)
print(preds)

# Write a text dump of the model.
bst_model.dump_model('model.txt')
plt.xlim([-1, X_train_ALL.shape[1]]) plt.tight_layout() #plt.savefig('rysunki/04_09.png', dpi=300) plt.show() plt.bar(range(len(xgb8.feature_importances_)), xgb8.feature_importances_) plt.show() y_test_pred = xgb8.predict_proba(X_test_ALL)[:, 1] y_train_pred = xgb8.predict_proba(X_train_ALL)[:, 1] print('ROC AUC TRAIN: %f' % sklearn.metrics.roc_auc_score( y_train_ALL, y_train_pred)) #ROC AUC TRAIN: 0.803086 print('ROC AUC TEST: %f' % sklearn.metrics.roc_auc_score( y_test_ALL, y_test_pred)) #ROC AUC TEST: 0.764920 xgbfir.saveXgbFI(xgb8, feature_names=X_train_ALL.columns, OutputXlsxFile='C:/Users/...') columny_100 = [ 'Per2', 'Veh24', 'Hist_VehPer47', 'Veh3', 'Hist_Per6', 'Hist_Veh7', 'Hist_VehPer7', 'Per7', 'Reg78', 'Reg41', 'Per8', 'Hist_VehPer24', 'Hist_Veh3', 'Hist_Per52', 'Hist_Per100', 'Hist_VehPer41', 'Veh20', 'Hist_Veh29', 'Hist_Veh22', 'Hist_VehPer81', 'Hist_Per44', 'Reg58', 'Hist_VehPer46', 'Hist_VehPer52', 'Hist_Veh4', 'Hist_VehPer82', 'Hist_VehPer74', 'Dif3', 'Hist_Per63', 'Hist_Per109', 'Per12', 'Hist_Per111', 'Reg81', 'Hist_Veh8', 'Dif1', 'Hist_Per118', 'Hist_VehPer71', 'Hist_VehPer54', 'Reg47', 'Hist_Per103', 'Reg83', 'Dif2', 'Hist_VehPer60', 'Reg77', 'Veh5', 'Reg61', 'Hist_VehPer43', 'Hist_Per51', 'Hist_Per48', 'Reg39', 'Hist_Per69', 'Reg48', 'Reg15', 'Veh18', 'Veh23', 'Hist_Per35', 'Hist_VehPer25', 'Veh17', 'Reg34', 'Reg6', 'Reg82', 'Veh25', 'Hist_Per97', 'Hist_Per28', 'Reg38', 'Reg7', 'Veh22', 'Hist_Per106',
# save model
bestXgb.save_model(model_file)

# dump model
features = list(X_train.columns.values)
bestXgb.feature_names = features  # set names for XGBoost booster

# Write the feature map: one "<index>\t<name>\tq" line per feature. The
# context manager closes the file even if a write fails.
with open(model_file + '.fmap', 'w') as outfile:
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))

bestXgb.dump_model(model_file + '.dump', with_stats=True)
xgbfir.saveXgbFI(bestXgb, feature_names=features,
                 OutputXlsxFile=model_file + '.xlsx')

# Threshold the predictions at 0.5 and report classification quality.
xgboost_predict_proba = bestXgb.predict(dtest)
y_test_preds = (xgboost_predict_proba > 0.5).astype('int')
report = classification_report(y_test, y_test_preds)
print(report)

# Record the run settings and the report next to the model.
with open(model_file + '.info', 'w') as infofile:
    infofile.write('X= ' + X_file + '\n')
    infofile.write('Y= ' + Y_file + '\n')
    infofile.write('params= ' + args.P + '\n')
    infofile.write('ratio= ' + str(ratio) + '\n')
    infofile.write('num_round=' + str(best_num_round) + '\n')
    infofile.write(report)
def feature_importance(train_test):
    """Export xgbfir feature importances for the training part of the data.

    The first ``train_df.shape[0]`` rows of ``train_test`` are cast to float
    and used to fit a default ``XGBRegressor`` against ``y_train``; the
    resulting interaction workbook is written to an .xlsx file. ``train_df``
    and ``y_train`` are read from the enclosing module scope.
    """
    n_train_rows = train_df.shape[0]
    train_part = train_test[:n_train_rows]

    regressor = xgb.XGBRegressor()
    regressor = regressor.fit(train_part.astype('float'), y_train)

    xgbfir.saveXgbFI(
        regressor,
        feature_names=train_part.columns,
        OutputXlsxFile='特征重要性.xlsx',
    )