def predict(train, test):
    train_X = train.drop(['next_step'], axis=1)
    train_Y = train.pop('next_step')
    bst = XGBClassifier()
    bst.fit(train_X, train_Y, eval_metric='auc')
    # Note: loading the saved booster here replaces the booster fitted just above.
    bst.get_booster().load_model('xgboost.model')
    pred = bst.predict(test)
    pred_prob = bst.predict_proba(test)
    print(pred)
    print(pred_prob)
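# A minimal sketch of the reuse pattern the snippet above seems to intend
# (assumption: 'xgboost.model' was saved by an earlier training run). Because
# load_model overwrites the booster that fit() just produced, a function that
# only reuses a saved model does not need to retrain at all:
from xgboost import XGBClassifier


def predict_from_saved(test, model_path='xgboost.model'):
    bst = XGBClassifier()
    bst.load_model(model_path)  # supported on the sklearn wrapper in recent xgboost versions
    return bst.predict(test), bst.predict_proba(test)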
def modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    # data, params, X and y are assumed to be defined at module level.
    alg = XGBClassifier(**params)
    df = data.sample(frac=0.3)
    pX = df.drop('LABEL', axis=1)
    py = df['LABEL']
    if useTrainCV:
        print("start use cv")
        xgb_param = alg.get_xgb_params()
        # xgtrain was not defined in the original snippet; build it here from the
        # same X/y that the model is fitted on below.
        xgtrain = xgb.DMatrix(X, label=y)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=xgb_param['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        print(cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])
        params['n_estimators'] = cvresult.shape[0]
        print("best tree size is {}".format(cvresult.shape[0]))

    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')

    # Evaluate on the 30% sample drawn above
    y_pred = alg.predict(pX)
    accuracy = metrics.accuracy_score(py, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(py, y_pred))
    train_report = metrics.classification_report(py, y_pred)
    print(train_report)
    feat_imp = pd.Series(
        alg.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    return alg
def get_xgb_features_and_values(
        clf: XGBClassifier) -> Tuple[List[Features], List[LeafValues]]:
    """Use regex to find (features, thresholds) and (left, right) splits"""
    fd, fout = mkstemp(text=True)
    clf.get_booster().dump_model(fout, with_stats=True)
    with open(fout, "r") as fin:
        txt = fin.read()
    os.close(fd)
    pat = r"\[f([0-9]+)<([0-9]+.*[0-9-e]*)\]"
    features_thresholds = list(
        map(lambda x: (int(x[0]), float(x[1])), re.findall(pat, txt)))
    _ = list(map(float, re.findall(r"leaf=(-{,1}[0-9]+.[0-9-e]+),", txt)))
    left_right = cast(List[Tuple[float, float]], list(zip(*(iter(_), ) * 2)))
    return features_thresholds, left_right
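# Hedged usage sketch for the parser above: fit a tiny model on synthetic data
# (X_demo / y_demo / clf_demo are illustrative names, not from the original) and
# inspect the extracted (feature index, threshold) pairs and leaf-value pairs.
import numpy as np

X_demo = np.random.rand(200, 4)
y_demo = (X_demo[:, 0] > 0.5).astype(int)
clf_demo = XGBClassifier(n_estimators=5, max_depth=2).fit(X_demo, y_demo)
splits_demo, leaves_demo = get_xgb_features_and_values(clf_demo)
print(splits_demo[:3])   # e.g. [(0, 0.5...), ...] -> (feature index, threshold)
print(leaves_demo[:3])   # leaf values paired up as returned by the regex parser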
def evaluate_model(model_params):
    model = XGBClassifier(**model_params)
    data, X_train, y_train = get_transformed_data(frac=1)
    # eval_metric expects a built-in metric name (or an xgboost-style callable),
    # not a sklearn scorer such as metrics.f1_score.
    model.fit(X_train, y_train, eval_metric='auc')
    joblib.dump(
        model,
        'danCdmaModel_{}.pkl'.format(datetime.now().strftime('%d%H%M')))
    del data
    del X_train
    del y_train
    data, X_test, y_test = get_transformed_data(fname='cdma_train2.csv', frac=1)
    y_pred = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(y_test, y_pred))
    train_report = metrics.classification_report(y_test, y_pred)
    print(train_report)
    feat_imp = pd.Series(
        model.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    return model
def evaluate_model(model_params):
    model = XGBClassifier(**model_params)
    AX = data.drop('LABEL', axis=1)
    ay = data['LABEL']
    X_train, X_test, y_train, y_test = train_test_split(AX,
                                                        ay,
                                                        test_size=0.33,
                                                        random_state=7)
    # As above, eval_metric should be a built-in metric name, not a sklearn scorer.
    model.fit(X_train, y_train, eval_metric='auc')
    y_pred = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(y_test, y_pred))
    train_report = metrics.classification_report(y_test, y_pred)
    print(train_report)
    feat_imp = pd.Series(
        model.get_booster().get_fscore()).sort_values(ascending=False)
    joblib.dump(
        model,
        'lossWarnBroadBandModel_{}.pkl'.format(
            datetime.now().strftime('%d%H%M')))
    return model
###
# FEATURE IMPORTANCE
###
feat_df = get_feature_importance(
    best_xgb_rf, annotated_df, cols_to_drop=['GRID', 'label', 'partition'])
barplot_feat_importance(feat_df,
                        top_n=25,
                        plt_prefix=output_suffix,
                        fig_file=FEATURE_FIG_FILE)

# shap values
train_df, _ = extract_train_df(annotated_df)
test_df, _ = extract_test_df(annotated_df)
xgb_booster = best_xgb_rf.get_booster()
calc_write_shap(X_train,
                y_train,
                xgb_booster,
                train_df,
                shap_pickle_file=TRAIN_SHAP_FILE,
                top_feat_file=TRAIN_TOP_FEAT_SHAP_FILE)
calc_write_shap(X_test,
                y_test,
                xgb_booster,
                test_df,
                shap_pickle_file=TEST_SHAP_FILE,
                top_feat_file=TEST_TOP_FEAT_SHAP_FILE)

###
# WRITE
def train_model_xgb_cv(X_train, X_test, y_train, y_test):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    xgb_sklearn = XGBClassifier(learning_rate=0.1,
                                n_estimators=300,
                                max_depth=3,
                                min_child_weight=1,
                                gamma=0.3,
                                subsample=0.6,
                                colsample_bytree=0.7,
                                objective='binary:logistic',
                                nthread=4,
                                seed=27,
                                reg_lambda=0.01)
    xgb_params = xgb_sklearn.get_params()
    cvresult = xgb.cv(xgb_params,
                      dtrain,
                      num_boost_round=xgb_params['n_estimators'],
                      nfold=5,
                      metrics='auc',
                      early_stopping_rounds=5)
    n_estimators = cvresult.shape[0]
    print("n_estimators: ", n_estimators)
    xgb_sklearn.set_params(n_estimators=n_estimators)
    xgb_sklearn.fit(np.array(X_train), np.array(y_train), eval_metric='auc')

    pred_y = xgb_sklearn.predict(X_test)
    pred_y_prob = xgb_sklearn.predict_proba(X_test)[:, 1]

    # auc
    auc = roc_auc_score(y_test, pred_y_prob)
    print('AUC: ', auc)

    # error
    score = xgb_sklearn.score(X_test, y_test)
    print('error: ', 1 - score)

    # grid search
    params = {'max_depth': [2, 3, 4, 5, 6, 7, 8]}
    model = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.1,
            n_estimators=300,
            # max_depth=3,
            min_child_weight=1,
            gamma=0.3,
            subsample=0.6,
            colsample_bytree=0.7,
            objective='binary:logistic',
            nthread=4,
            seed=27,
            reg_lambda=0.01),
        param_grid=params,
        cv=2)
    model.fit(np.array(X_train), np.array(y_train), eval_metric='auc')
    print(model.cv_results_, model.best_params_, model.best_score_)

    feat_imp = pd.Series(xgb_sklearn.get_booster().get_fscore(
        fmap='xgb.fmap')).sort_values(ascending=True)
    feat_imp.plot(kind='barh', color='black', legend=False, figsize=(10, 6))
    plt.ylabel('Feature name')
    plt.xlabel('Feature score')
    plt.savefig(
        'C:/Users/Administrator.USER-20161227PQ/Desktop/paper figure/figure5.png',
        dpi=300)
    plt.show()
def set_parameters(set_name, golden_set, input_file):
    golden = str_to_bool(golden_set)
    #-------------------------------------------------------------------------
    # read in the directory that is being run
    data_dir = set_name
    # read in the parameters file and load it
    full_path = os.path.join(working_dir, "{0}".format(data_dir), 'params.yaml')
    stream = open(full_path, 'r')
    parameters = yaml.load(stream, Loader=yaml.FullLoader)
    # read in Hypatia data as pandas dataframe (2D structure), drop HIP numbers
    df = pd.read_csv(input_file)
    set_number = set_name
    #-------------------------------------------------------------------------
    if golden:
        df2 = df.copy()
        df2.loc[df2[(df2['Exo'] == 1)
                    & (df2['MaxPMass'] > parameters['gas_giant_mass'])].sample(
                        10, random_state=np.random.RandomState()).index,
                'Exo'] = 0
        yy = df2.loc[df2['Exo'] == 0].index
        zz = df.loc[df['Exo'] == 0].index
        changed = [ind for ind in yy if not ind in zz]
        changedhips = [df['HIP'][ind] for ind in changed]
        df = df2.copy()
        yy2 = df2.loc[df2['Exo'] == 0].index
        zz2 = df.loc[df['Exo'] == 0].index
        changed2 = [ind for ind in yy2 if not ind in zz2]
    #-------------------------------------------------------------------------
    df.index = df['HIP']
    df['Exo'] = df['Exo'].astype('category')  # category = limited possibilities
    df['Multi'] = df['Multi'].astype('category')
    df['MaxPMass'] = df['MaxPMass'].astype(np.number)
    df['Sampled'] = np.zeros((df.shape[0]))
    df['Predicted'] = np.zeros((df.shape[0]))
    df = df.drop(['HIP'], axis=1)

    # Print a bunch of stuff in terminal
    print('Parameters used in simulation:')
    print('------------------------------')
    print('')
    for key in parameters.keys():
        print('{0} = {1}'.format(key, parameters[key]))

    cv_folds = parameters['cv_folds']
    early_stopping_rounds = parameters['early_stopping_rounds']
    N_iterations = parameters['N_iterations']
    N_samples = parameters['N_samples']
    gas_giant_mass = parameters['gas_giant_mass']
    features = parameters['features']
    relevant_columns = features + ['Exo', 'MaxPMass', 'Sampled', 'Predicted']

    # Redefine dataframe with the "relevant columns" and remove nans if dropnans==True in yaml
    if (parameters['dropnans']):
        df = df[relevant_columns].dropna()

    print('Number of samples used in simulation: {0}'.format(df.shape[0]))
    print('')

    # Define the confusion matrix and other arrays
    cfm = np.zeros((2, 2))
    auc_score_train = []
    precision_score_train = []
    feat_imp_train = pd.DataFrame(columns=features)
    probabilities_total = pd.DataFrame(index=df.index)

    print('iteration \t estimators')
    print('---------------------------')

    #---------------------------XGBOOST LOOP----------------------------------
    # Loop for all of the iterations (defined in yaml)
    for iteration in range(0, N_iterations):
        # dataframe of 200 random hosts with giant planets
        df_iter_with_exo = df[(df['Exo'] == 1)
                              & (df['MaxPMass'] > gas_giant_mass)].sample(
                                  N_samples,
                                  random_state=np.random.RandomState())
        # dataframe of 200 random non hosts
        df_iter_none_exo = df[df['Exo'] == 0].sample(
            N_samples, random_state=np.random.RandomState())
        # make a new dataframe of the 400 star subset
        df_train = pd.concat([df_iter_with_exo, df_iter_none_exo], axis=0)
        # make a dataframe of those stars NOT in the training set (to predict on)
        df_predict = df[~df.index.isin(df_train.index)]
        # The train dataframe with everything but the Exo column
        X = df_train.drop(['Exo'], axis=1)
        # The Exo column (and hips)
        Y = df_train.Exo

        # Note: Using gbtree booster
        alg = XGBClassifier(
            learning_rate=0.1,  # def=0.3, prevents overfitting and makes feature weight conservative
            n_estimators=1000,  # number of boosted trees to fit
            max_depth=6,  # def=6, max depth of tree/complexity
            min_child_weight=1,  # def=1, min weight needed to continue leaf partitioning
            gamma=0,  # def=0, minimum loss reduction required to make partition on a leaf
            subsample=0.8,  # def=1, subsample ratio of the training set
            colsample_bytree=0.8,  # def=1, subsample ratio of columns when making each tree
            objective='binary:logistic',  # def=linear, logistic regression for binary classification, output probability
            nthread=1,  # originally = 8, but issue on laptop... def=max, number of parallel threads used to run xgboost
            scale_pos_weight=1,  # def=1, balance positive and neg weights
            seed=27)  # def=0, random number seed

        # get input parameters of algorithm
        xgb_param = alg.get_xgb_params()
        # construct training set matrix
        xgtrain = xgb.DMatrix(X[features].values, label=Y)
        # cross validation (CV) of xgboost to avoid overfitting
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
        print(iteration, '\t \t', cvresult.shape[0])

        alg.fit(X[features], Y, eval_metric='auc')
        dtrain_predictions = alg.predict(X[features])
        dtrain_predprob = alg.predict_proba(X[features])[:, 1]
        feat_imp = alg.get_booster().get_fscore()

        # See how the algorithm performs on the Exo data
        auc_score = metrics.roc_auc_score(Y, dtrain_predprob)
        precision_score = metrics.precision_score(Y, dtrain_predictions)
        metric_score = metrics.confusion_matrix(Y, dtrain_predictions)

        # Weighting function to ignore the null values
        normalized_features = pd.DataFrame(
            (1 - df_train[features].isnull().sum() /
             df_train[features].count()) *
            pd.Series(alg.get_booster().get_fscore()),
            columns=[iteration]).T

        # calculate the confusion matrix
        feat_imp_train = pd.concat([
            feat_imp_train,
            pd.DataFrame(feat_imp, columns=features, index=[iteration])
        ])
        feat_imp_train_normal = pd.concat(
            [feat_imp_train, normalized_features])
        auc_score_train.append(auc_score)
        precision_score_train.append(precision_score)
        cfm += metric_score

        df.loc[df_predict.index, 'Sampled'] += np.ones(len(df_predict.index))
        df.loc[df_predict.index, 'Predicted'] += alg.predict(df_predict[features])
        df.loc[df_predict.index, 'Prob'] = alg.predict(df_predict[features])
        values = df['Prob']
        probabilities_total = pd.concat(
            [probabilities_total, pd.Series(values, name=str(iteration))],
            axis=1)
        if (not iteration % 10):
            probabilities_total.to_pickle(
                '{0}/probabilities_total.pkl'.format(data_dir))

    #-------------------------------------------------------------------------
    # Calculate the confusion matrix
    cfm /= N_iterations
    cfm[0] /= cfm[0].sum()
    cfm[1] /= cfm[1].sum()
    # Print confusion matrix
    print(np.round(cfm, 3))

    df['Prob'] = df['Predicted'] / df['Sampled']

    ###########-------------------Output List of Planets------------------------#########
    # Find the stars with >90% probability of hosting a planet, with the Sampled, Predicted, and Prob columns
    planets = df[(df.Prob > .90)
                 & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    print('Number of most probable planet hosts: {0}'.format(planets.shape[0]))

    # Sort the stars with predicted planets and save that file
    planetprobs = planets.sort_values(by='Prob', ascending=False)
    name = data_dir + '/figures/planet_probabilities' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    #name = data_dir+'/figures/planet_probabilities.csv'
    outfile = open(name, 'w')
    planetprobs.to_csv(outfile)
    outfile.close()

    # Create a second list with all stars in Hypatia and the probabilities
    planets2 = df[(df.Prob > .0)
                  & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]

    if golden:  # if 10 stars were randomly taken out
        changeddf = pd.DataFrame([])  # make empty dataframe
        for star in changedhips:  # loop over the 10 known planet hosts (defined at top)
            changeddf = changeddf.append(planets2.loc[planets2.index == star])
            if planets2.loc[planets2.index == star].empty:
                # catch for when a known planet host was cut (bc of abunds)
                temp = pd.Series([np.nan, np.nan, np.nan],
                                 index=['Sampled', 'Predicted', 'Prob'])
                temp.name = star
                changeddf = changeddf.append(temp)  # append blank row (with star name as index)
        # Save golden set as a separate file with the date and time as a tag
        filename = '{0}/figures/goldenSetProbabilities' + str(
            datetime.today().strftime('-%h%d-%H%M')) + '.csv'
        changeddf.to_csv(filename.format(set_number), na_rep=" ")

    # Save the file with all of the probabilities
    planetprobs2 = planets2.sort_values(by='Prob', ascending=False)
    name2 = data_dir + '/figures/planet_probabilitiesAll' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    #name2 = data_dir+'/figures/planet_probabilitiesAll.csv'
    outfile2 = open(name2, 'w')
    planetprobs2.to_csv(outfile2)
    outfile2.close()

    ###########------------------------Save Files------------------------##########
    print('Saving data files')
    # Save files
    feat_imp_train.to_pickle('{0}/features_train.pkl'.format(data_dir))
    feat_imp_train_normal.to_pickle(
        '{0}/features_train_normal.pkl'.format(data_dir))
    probabilities_total.to_pickle(
        '{0}/probabilities_total.pkl'.format(data_dir))
    df.to_pickle('{0}/df_info_all.pkl'.format(data_dir))
    np.save('{0}/auc_score_train.npy'.format(data_dir),
            np.array(auc_score_train))
    np.save('{0}/precision_score_train.npy'.format(data_dir),
            np.array(precision_score_train))
    np.save('{0}/cfm.npy'.format(data_dir), cfm)
    print('Simulation completed successfully.')

    if golden:
        print("Changed indices and HIP numbers:")
        print(changed)
        print(changedhips)
    'seed': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective': 'binary:logistic',
    'max_depth': 3
}

# log model params
for key in params:
    mlflow.log_param(key, params[key])

# train XGBoost model
gbtree = XGBClassifier(**params)
gbtree.fit(train_features, train_labels)

importances = gbtree.get_booster().get_fscore()
print(importances)

# get predictions
y_pred = gbtree.predict(test_features)
accuracy = accuracy_score(test_labels, y_pred)
print("Accuracy: %.1f%%" % (accuracy * 100.0))

# log accuracy metric
mlflow.log_metric("accuracy", accuracy)

sns.set(font_scale=1.5)
xgb.plot_importance(gbtree)
plt.savefig("importance.png", dpi=200, bbox_inches="tight")
# Fit the algorithm on the data
model.fit(X_train, y_train, eval_metric='merror')

# Predict on the test set:
predictions = model.predict(X_test)
predprob = model.predict_proba(X_test)[:, 1]

# Print model report:
print("\nModel Report")
print("Training Accuracy : %.4g" %
      metrics.accuracy_score(y_train, model.predict(X_train)))
print("Testing Accuracy : %.4g" %
      metrics.accuracy_score(y_test, model.predict(X_test)))

feat_imp = pd.Series(
    model.get_booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')

''' PARAMETER TUNING '''
''' Tune max_depth and min_child_weight '''
# phase1 with large subset
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}
gsearch1 = GridSearchCV(estimator=model,
                        param_grid=param_test1,
                        scoring='accuracy',
                        n_jobs=2,
                        iid=False,
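# Hedged sketch of how a search like gsearch1 is typically completed and run;
# the cv=5 value, the fit call and the printout are illustrative assumptions,
# not part of the original snippet (which is truncated above). Note that the
# iid argument was removed from recent scikit-learn versions.
gsearch1_demo = GridSearchCV(estimator=model,
                             param_grid=param_test1,
                             scoring='accuracy',
                             n_jobs=2,
                             cv=5)
gsearch1_demo.fit(X_train, y_train)
print(gsearch1_demo.best_params_, gsearch1_demo.best_score_)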
def train_model(self):
    # KFold for cross-validation
    folds = KFold(n_splits=self.n_folds)
    self.submission[self.target] = 0
    training_start_time = time()
    for fold, (train_index, valid_index) in enumerate(folds.split(self.X_train)):
        start_time = time()
        print('Training on Fold {}'.format(fold + 1))
        model = XGBClassifier(**self.params)
        # make train and valid set
        X_train, X_valid = self.X_train.iloc[train_index], self.X_train.iloc[valid_index]
        y_train, y_valid = self.y_train.iloc[train_index], self.y_train.iloc[valid_index]
        # train the model
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  eval_metric=self.metric,
                  verbose=self.verbose)
        train_pred = model.predict_proba(X_train)[:, 1]
        del X_train
        valid_pred = model.predict_proba(X_valid)[:, 1]
        del X_valid
        # train and valid roc_auc
        self.train_aucs.append(roc_auc_score(y_train, train_pred))
        self.valid_aucs.append(roc_auc_score(y_valid, valid_pred))
        del y_train, train_pred
        del y_valid, valid_pred
        print('ROC AUC on Train: {}'.format(self.train_aucs[fold]))
        print('ROC AUC on Validation: {}'.format(self.valid_aucs[fold]))
        # test set predictions for KFold
        test_pred = model.predict_proba(self.X_test)[:, 1]
        self.submission[self.target] = self.submission[self.target] + test_pred / self.n_folds
        gc.collect()
        print('Fold {} finished in {}'.format(
            fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
        print("=" * 30)
        print()
        self.feature_importances['fold_{}'.format(fold + 1)] = pd.Series(
            model.get_booster().get_fscore())
    print('-' * 30)
    print('Training has finished!')
    print('Total training time is {}'.format(
        str(datetime.timedelta(seconds=time() - training_start_time))))
    print('Mean AUC on Train: ', np.mean(self.train_aucs))
    print('Mean AUC on Validation: ', np.mean(self.valid_aucs))
    print('-' * 30)
    return model
# 10.1 Print feature importance
#      https://stackoverflow.com/a/52777909
#      https://towardsdatascience.com/be-careful-when-interpreting-your-features-importance-in-xgboost-6e16132588e7
"""
importance_type
    'weight'      - the number of times a feature is used to split the data across all trees.
    'gain'        - the average gain across all splits the feature is used in.
    'cover'       - the average coverage across all splits the feature is used in.
    'total_gain'  - the total gain across all splits the feature is used in.
    'total_cover' - the total coverage across all splits the feature is used in.
"""

# 11.0 Get results in a sorted DataFrame
feature_important = model_gs.get_booster().get_score(importance_type='weight')
feature_important
keys = list(feature_important.keys())
values = list(feature_important.values())
data = pd.DataFrame(data=values, index=keys,
                    columns=["score"]).sort_values(by="score",
                                                   ascending=False)

# 11.1 Compare the results in the following DataFrame
#      with that obtained using PermutationImportance
#      of eli5 below.
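# Hedged aside: the same booster can also be scored with one of the other
# importance_type values listed above (here 'gain'); model_gs is assumed to be
# the fitted estimator used in 11.0, and the DataFrame layout simply mirrors
# the 'weight' version for comparison.
gain_important = model_gs.get_booster().get_score(importance_type='gain')
gain_data = pd.DataFrame(data=list(gain_important.values()),
                         index=list(gain_important.keys()),
                         columns=["gain"]).sort_values(by="gain",
                                                       ascending=False)
print(gain_data.head(10))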