def test_one_doc_feature_importance():
    """Canonical test: per-document ('Doc') feature importance for object 0."""
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5, random_seed=0)
    clf.fit(data)
    doc_importance = clf.get_feature_importance(
        np.ones(data.num_col(), dtype=int),
        0,
        cat_features=data.get_cat_feature_indices(),
        fstr_type='Doc')
    np.save(FIMP_PATH, np.array(doc_importance))
    return local_canonical_file(FIMP_PATH)
def fun_catboost(X, y, X_train, X_validation, y_train, y_validation, target):
    """Fit a small multiclass CatBoost model, print diagnostics, and build a
    submission DataFrame.

    Returns (categorical_features_indices, model, submission, predictions).

    NOTE(review): predictions and the 'ID' column come from the module-level
    ``df_test``, not from the arguments — confirm this is intentional.
    """
    #Creating a training set for modeling and validation set to check model performance
    #X = df_train.drop(['Segmentation', 'Gender','Ever_Married', 'Work_Experience','Family_Size','Var_1'], axis=1)
    #categorical_features_indices = np.where(df_train.dtypes != np.float)[0]
    # Every column of X_train is treated as categorical.
    categorical_features_indices = list(range(len(X_train.columns)))
    categorical_features_indices  # no-op expression (notebook leftover)
    #importing library and building model
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(iterations=5,
                               depth=3,
                               learning_rate=0.1,
                               loss_function='MultiClass',
                               eval_metric='Accuracy')
    model.fit(X_train, y_train, eval_set=(X_validation, y_validation), plot=True)
    predictions = model.predict(df_test)
    # Return value discarded — presumably a notebook leftover; verify.
    model.get_feature_importance(type="FeatureImportance")
    from catboost import Pool, CatBoostClassifier
    from catboost.utils import get_confusion_matrix
    train_label = ["A", "B", "C", "D"]  # unused — TODO confirm it can be removed
    cm = get_confusion_matrix(model, Pool(X_validation, y_validation))
    print(cm)
    print(model.get_best_score())
    # Build the submission frame from the module-level test set.
    submission = pd.DataFrame()
    submission['ID'] = df_test['ID']
    submission[target] = predictions
    return categorical_features_indices, model, submission, predictions
class CatBoost(BaseModel):
    '''
    Wrapper class of CatBoost.

    self.core contains the fitted CatBoostClassifier.
    '''

    @timer
    def __init__(self, config):
        self.config = config

    @timer
    def train(self, X_train, y_train, X_val=None, y_val=None, params=None,
              num_boost_round=100, early_stopping_rounds=None, fold=0):
        """Fit a CatBoostClassifier and return self for chaining.

        Args:
            X_train, y_train: training data.
            X_val, y_val: evaluation set used for early stopping.
            params: dict of CatBoost hyper-parameters. Defaults to {} so the
                method no longer raises TypeError when params is omitted
                (previously ``**None`` crashed).
            num_boost_round: number of boosting iterations.
            early_stopping_rounds: stop when the eval metric stops improving.
            fold: unused here; kept for interface compatibility.
        """
        self.core = CatBoostClassifier(
            # **self.config.params,
            **(params or {}),
            num_boost_round=num_boost_round)
        self.core.fit(
            X=X_train,
            y=y_train,
            eval_set=(X_val, y_val),
            # verbose=True,
            early_stopping_rounds=early_stopping_rounds)
        return self

    @timer
    def predict(self, X_test):
        """Return positive-class probabilities for X_test."""
        y_test = self.core.predict_proba(X_test)[:, 1]
        return y_test

    @property
    def feature_importance(self):
        # Importances from the fitted booster.
        return self.core.get_feature_importance()

    @property
    def best_iteration(self):
        return self.core.get_best_iteration()

    @property
    def evals_result(self):
        return self.core.get_evals_result()
class CatBoost:
    """CatBoostClassifier wrapper with disk persistence and GPU autodetection."""

    _verbose = 200
    _train_dir = DATA_CACHE_DIR
    _is_gpu_available = get_gpu_device_count()
    _task_type = "GPU" if _is_gpu_available > 0 else None
    _devices = "GPU" if _is_gpu_available > 0 else None

    def __init__(self, model_id, num_input_features, num_output_classes,
                 model_save_path, **aux_params):
        """Build the underlying classifier and prepare its save directory."""
        self.model = CatBoostClassifier(loss_function="MultiClass",
                                        task_type=self._task_type,
                                        devices=self._devices,
                                        train_dir=self._train_dir,
                                        random_seed=SEED)
        self.model.set_params(**aux_params)
        self.model_id = model_id
        model_dir = f"{model_save_path}/{model_id}"
        os.makedirs(model_dir, exist_ok=True)
        self.model_path = model_dir
        self.modelfile_save_path = os.path.join(model_dir, STANDARD_MODEL_NAME)

    def load(self):
        """Restore model weights from disk."""
        self.model.load_model(self.modelfile_save_path)

    def save(self):
        """Persist model weights to disk."""
        self.model.save_model(self.modelfile_save_path)

    def fit(self, X_train, y_train, X_valid, y_valid):
        """Train, keep the best iteration against the validation set, save."""
        train_pool = Pool(X_train, y_train)
        self.model.fit(train_pool,
                       eval_set=(X_valid, y_valid),
                       use_best_model=True,
                       verbose=self._verbose)
        self.save()

    def predict(self, X, load=False):
        """Return class probabilities; optionally reload weights first."""
        if load:
            self.load()
        return self.model.predict_proba(X)

    def explain(self, X_train, y_train, features, classes):
        """Plot feature importances computed on the training data."""
        importances = self.model.get_feature_importance(
            data=Pool(X_train, y_train))
        plot_importance(importances, features, self.model_path, self.model_id)
def train_all_save_catboost(self, X, y, categorical_features_indices):
    """Train on the whole dataset, report CV score and feature importance,
    then dump the model so it can be reused for new predictions."""
    clf = CatBoostClassifier(loss_function='MultiClass',
                             eval_metric='TotalF1',
                             random_seed=42,
                             leaf_estimation_method='Newton')
    # Cross-validate with exactly the same hyper-parameters.
    cv_results = cv(Pool(X, y, cat_features=categorical_features_indices),
                    clf.get_params())
    print("precise validation accuracy score:{}".format(np.max(cv_results)))
    clf.fit(X, y, cat_features=categorical_features_indices)
    # Feature importance as a readable table.
    print(clf.get_feature_importance(prettified=True))
    clf.save_model('catboost_model.dump')
    print("Catboost model has been saved!")
class ModelCatboost(Model):
    """Catboost implementation of the generic ``Model`` base class."""

    def __init__(self, **params):
        super().__init__(**params)
        self.early_stopping = True  # Catboost supports early stopping
        self.feature_importance = []
        self.set_model()

    def set_model(self):
        # set loss function depending of binary / multi class problem:
        # regressor for regression targets, classifier otherwise.
        if self.problem_type == 'regression':
            self.model = CatBoostRegressor(**self.model_params)
        else:
            self.model = CatBoostClassifier(**self.model_params)

    def fit(self, X_train, y_train):
        # train with num_rounds (no validation set, no best-model selection)
        train_pool = Pool(X_train, label=y_train.astype(float))
        self.set_model()
        self.model.fit(train_pool, use_best_model=False)
        self.feature_importances_ = self.model.get_feature_importance(
            train_pool)

    def fit_early_stopping(self, X_train, y_train, X_eval, y_eval):
        # specific early stopping for Catboost
        train_pool = Pool(X_train, label=y_train.astype(float))
        eval_pool = Pool(X_eval, label=y_eval.astype(float))
        # set specific parameters for early stopping (overfitting detector with iter)
        # NOTE(review): these write to self.params while set_model() reads
        # self.model_params — confirm both names refer to the same dict.
        self.params['iterations'] = MAX_ROUNDS
        self.params['od_type'] = 'iter'
        self.params['od_wait'] = PATIENCE
        self.model.fit(train_pool, eval_set=eval_pool, use_best_model=True)
        # Remember the selected round count and drop the detector settings.
        self.num_rounds = self.model.tree_count_
        self.params['iterations'] = self.num_rounds
        self.params.pop('od_type')
        self.params.pop('od_wait')
def multi_classification():
    """Train a multiclass CatBoost model on the module-level train/test split
    and report a classification report, F1 scores, and feature importances."""
    from catboost import CatBoostClassifier
    from sklearn.metrics import accuracy_score

    cat_idx = [0, 1]
    clf = CatBoostClassifier(loss_function='MultiClass',
                             iterations=1000,
                             random_seed=42,
                             logging_level='Silent')
    clf.fit(X_train, y_train, cat_features=cat_idx)

    # Prediction
    y_pred = clf.predict(X_test)

    from sklearn.metrics import classification_report, accuracy_score
    print ('\n classification report:\n', classification_report(y_test, y_pred))

    from sklearn.metrics import f1_score
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print ('f1-score (micro): ', f1_micro)
    print ('f1-score (macro): ', f1_macro)

    # Feature importance, paired with the training columns.
    feature_importances = clf.get_feature_importance(
        X=X_train, y=y_train, cat_features=cat_idx)
    feature_names = train_data.columns
    print ('\n Feature Importance: ')
    for score, name in zip(feature_importances, feature_names):
        print('{}: {}'.format(name, score))
def cb_model(self, category_cols=None):
    """Train a CatBoost binary classifier with AUC evaluation, print the
    feature importance table, optionally save the model, and return it."""
    if category_cols is None:
        category_cols = []
    # Map categorical column names onto positional indices of self.X_t.
    category_id = [idx for idx, col in enumerate(self.X_t.columns)
                   if col in category_cols]
    model = CatBoostClassifier(iterations=self.rounds,
                               learning_rate=0.1,
                               cat_features=category_id,
                               loss_function='Logloss',
                               logging_level='Verbose',
                               eval_metric='AUC')
    model.fit(self.X_t, self.y_t,
              eval_set=(self.X_v, self.y_v),
              early_stopping_rounds=self.early_stop)
    # res = model.predict_proba(self.test)[:, 1]
    # Show how much each feature contributed.
    importance = model.get_feature_importance(prettified=True)
    print(importance)
    if self.modelname is not None:
        model.save_model(self.modelname + '_cb.model')
    return model
def get_important_features(train_x, train_y):
    """Fit a quick CatBoost model on a holdout split and return a DataFrame
    of feature importances, sorted descending; also prints it."""
    fit_x, hold_x, fit_y, hold_y = train_test_split(train_x, train_y,
                                                    random_state=1)
    clf = CatBoostClassifier(iterations=1000,
                             learning_rate=0.1,
                             use_best_model=True,
                             eval_metric="Accuracy")
    clf.fit(fit_x, fit_y, eval_set=(hold_x, hold_y), plot=True)
    importance = pd.DataFrame(clf.get_feature_importance(),
                              index=train_x.columns,
                              columns=["importance"])
    importance = importance.sort_values("importance", ascending=False)
    print(importance)
    return importance
def catboost_train_evel(x_train, y_train, x_vali, y_vali, x_test, params, cate_feat_idx):
    """Train CatBoost on the given data, report validation accuracy and the
    training parameters, predict the test set, and print a sorted feature
    importance table."""
    sep = '============================================='
    print(sep)
    print('catboost model training...')
    pool_tr = Pool(x_train, y_train, cat_features=cate_feat_idx)
    pool_va = Pool(x_vali, y_vali, cat_features=cate_feat_idx)
    clf = CatBoostClassifier(**params,
                             task_type='CPU')  # sometimes GPU slower then CPU
    clf.fit(pool_tr, eval_set=pool_va)
    print(sep)
    print('catboost vali acc: {:06.4f}'.format(
        accuracy_score(y_vali, clf.predict(x_vali))))
    print(sep)
    print('catboost model training parameters:')
    for key, val in params.items():
        print('{:15}: {}'.format(key, val))
    print(sep)
    print('catboost model predicting...')
    test_pred_result = clf.predict(x_test)
    test_pred_prob = clf.predict_proba(x_test)
    print(test_pred_result[:10])
    print(test_pred_prob[:10])
    print(sep)
    print('catboost feature importances evaluate...')
    rank = pd.DataFrame()
    rank['feat'] = x_train.columns
    rank['score'] = clf.get_feature_importance(pool_tr)
    rank.sort_values(['score'], ascending=False, inplace=True)
    rank = rank.reset_index(drop=True)
    print(rank)
"Breed1", "Breed2", "Breed3", "Breed4", "Breed5", "Breed6", "Breed7", "Breed8", "Breed9", "Breed10", "Color-light", "Color-medium", "Color-dark", "Color-warm", "Color-medium", "Color-cold", "Color_feature1", "Color_feature2", ] print(model.score(x_test, y_test)) plt.figure(num=None, figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k') plt.bar(range(len(model.get_feature_importance(prettified=False))), model.get_feature_importance(prettified=False)) plt.title("Cat Feature Importance") plt.xticks(range(len(model.get_feature_importance(prettified=False))), features, rotation='vertical') plt.gcf().savefig('feature_importance_catboost.png') plt.show()
class Classifier(object):
    """CatBoost-based player classifier over a fixed set of JSON attributes.

    Loads train/validate/test datasets from JSON files, trains a multiclass
    CatBoost model, and can export accuracy and feature importances to a log.
    """

    # Fixed attribute order used to vectorize each JSON sample.
    attr_key_list = [
        "AGE", "AVE_ASSISTANCE", "AVE_FOUL", "AVE_SCORE", "AVE_STEALING",
        "AVE_TACKLING", "HEIGHT", "NATION", "SAVING_TIME", "SPEED", "WEIGHT",
        "YELLOW_RED_CARD_NUMBER"
    ]

    def __init__(self,
                 dataset_file_path,
                 test_dataset_file_path,
                 dataset_split_ratio=0.7,
                 train_iter=10,
                 depth=10,
                 learing_rate=0.1,
                 loss='MultiClass',
                 logging_level='Verbose'):
        # Split the training JSON into train/validate by ratio.
        self.dataset = load_json(file_path=dataset_file_path)
        dataset_num = len(self.dataset)
        self.train_set = self.dataset[:int(dataset_num * dataset_split_ratio)]
        self.validate_set = self.dataset[int(dataset_num *
                                             dataset_split_ratio):]
        self.test_set = load_json(file_path=test_dataset_file_path)
        self.train_attr_set, self.train_label_set = self._process_dataset(
            dataset=self.train_set)
        self.validate_attr_set, self.validate_label_set = self._process_dataset(
            dataset=self.validate_set)
        self.test_attr_set, self.test_label_set = self._process_dataset(
            dataset=self.test_set)
        # cat_features=[7]: index 7 is "NATION" in attr_key_list.
        self.model = CatBoostClassifier(iterations=train_iter,
                                        depth=depth,
                                        cat_features=[7],
                                        loss_function=loss,
                                        learning_rate=learing_rate,
                                        logging_level=logging_level)
        self.config = {}
        self.config['LEARNING_RATE'] = learing_rate
        self.config['LOSS'] = loss
        self.config['DEPTH'] = depth
        self.config['TRAIN_DATASET_COUNT'] = len(self.train_label_set)
        self.config['VALIDATE_DATASET_COUNT'] = len(self.validate_label_set)
        self.config['TEST_DATASET_COUNT'] = len(self.test_label_set)
        self.final_test_acc = 0.0
        self.feature_res = {}

    def train(self):
        """Fit the model with the validation split as eval set."""
        self.model.fit(X=self.train_attr_set,
                       y=self.train_label_set,
                       eval_set=(self.validate_attr_set,
                                 self.validate_label_set))

    def test(self):
        """Compute and store test-set accuracy."""
        res = self.model.predict(self.test_attr_set)
        # print(res)
        acc = np.sum([
            1 if res[i] == self.test_label_set[i] else 0
            for i in range(len(self.test_label_set))
        ]) / len(self.test_attr_set)
        print("accuracy is %f" % acc)
        self.final_test_acc = acc

    def feature_importance(self):
        """Print and store the importance of every attribute."""
        res = self.model.get_feature_importance()
        for score, key in zip(res, Classifier.attr_key_list):
            print("%s %f" % (key, score))
            self.feature_res[key] = score

    @staticmethod
    def _process_dataset(dataset):
        """Vectorize JSON samples into (attributes, labels) numpy arrays."""
        attr_set = []
        label_set = []
        for sample in dataset:
            attr = []
            for key in Classifier.attr_key_list:
                attr.append(sample[key])
            label = sample['LABEL']
            attr_set.append(attr)
            label_set.append(label)
        return np.array(attr_set), np.array(label_set)

    def export_res(self, path='', file_name='exp_log.json'):
        """Export config, feature importances and final accuracy as JSON."""
        res = {}
        res['EXP_CONFIG'] = self.config
        res['FEATURE_IMPORTANCE'] = self.feature_res
        res['FINAL_TEST_ACCURACY'] = self.final_test_acc
        export_json(file_path=os.path.join(LOG_PATH, path, file_name),
                    dict=res)
        pass
for e in ax.get_yticklabels()+ax.get_xticklabels(): e.set_fontsize(6) e.set_color(GRAY1) ax.tick_params(left=False) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) offset = transforms.ScaledTranslation(0, -0.07, fig.dpi_scale_trans) for e in ax.get_xticklabels() + ax.xaxis.get_ticklines() + \ [ax.spines['bottom']]: e.set_transform(e.get_transform() + offset) ax.spines['bottom'].set_bounds(0, 100) _ = ax.set_xlabel('Relative Importance', color=GRAY4, fontsize=7) # PAGE 354. FIGURE 10.6. Predictor variable importance spectrum for the spam # data. The variable names are written on the vertical axis. plot_relative_feature_importance(np.array(cb_clf.get_feature_importance())) plt.tight_layout() plt.savefig('../figures/spam_feature_importance.pdf', dpi=300) plt.show() # Partial dependence def plot_partial_dependence(ax, feature): n = features.index(feature) X_tmp = X.copy() vals = np.unique(np.percentile(X_tmp[:, n], np.linspace(5, 95, 100))) result = [] for i in range(vals.shape[0]): X_tmp[:, n] = vals[i] pr = np.mean(cb_clf.predict_proba(X_tmp), axis=0) result.append(np.log(pr[1]/pr[0]))
def train(self, feature_names):
    """Train one CatBoost model per CV fold and collect OOF predictions.

    Input:
        feature_names: dictionary of features' names
    Output:
        validity: DataFrame(["MachineIdentifier", "HasDetections", "Predict"]),
            or None when every fold has already been trained.
    """
    # Initialize parameters
    validity = None
    model_path = Path(__file__).absolute().parents[2] / "data" / "model" / str(get_version())
    Path.mkdir(model_path, exist_ok=True, parents=True)
    feature_importance = pd.DataFrame()
    START_FOLD = 0
    if get_back_training():
        # Resume: skip folds whose model file already exists on disk.
        START_FOLD = len(list(model_path.glob('**/*.model')))
    END_FOLD = 5
    if train_one_round():
        START_FOLD = 0
        END_FOLD = 1
    if START_FOLD == END_FOLD:
        return None
    # Process for each fold
    for fold in range(START_FOLD, END_FOLD):
        log_path = Path(__file__).absolute().parents[2] / "log" / "train" / str(get_version()) / str("fold{}".format(fold))
        Path.mkdir(log_path, exist_ok=True, parents=True)
        # Measure start time of the classification of this fold
        start = time.time()
        getLogger(get_version()).info("\t >> {} folds start".format(fold))
        send_message("\t :cat: {} folds start".format(fold))
        # Generate dataset
        getLogger(get_version()).info("\t \t Generating datasets...")
        send_message("\t \t Generating datasets...")
        valid = "valid{}".format(str(fold))
        trn_x = super().get_feature_df(feature_names, valid, "train")
        val_x = super().get_feature_df(feature_names, valid, "validate")
        trn_x.set_index("MachineIdentifier", inplace=True)
        val_x.set_index("MachineIdentifier", inplace=True)
        trn_y = trn_x["HasDetections"].astype(np.int8)
        val_y = val_x["HasDetections"].astype(np.int8)
        getLogger(get_version()).info("\t \t Datasets were generated.")
        send_message("\t \t Datasets were generated.")
        # Initialize variables for scoring (only once, on the first fold run)
        if validity is None:
            validity = pd.DataFrame()
            validity["HasDetections"] = pd.concat([trn_y, val_y])
            validity["Predict"] = 0
        # Delete needless features
        del trn_x["HasDetections"], val_x["HasDetections"]
        # Classify
        clf = CatBoostClassifier(iterations=self.params["iterations"],
                                 verbose=self.params["verbose"],
                                 early_stopping_rounds=self.params["early_stopping_rounds"],
                                 random_seed=self.params["random_seed"],
                                 max_depth=self.params["max_depth"],
                                 loss_function=self.params["loss_function"],
                                 custom_metric=self.params["custom_metric"],
                                 eval_metric=self.params["eval_metric"],
                                 rsm=self.params["rsm"],
                                 train_dir=str(log_path))
        clf.fit(trn_x.values, trn_y.values, eval_set=(val_x.values, val_y.values))
        for train_or_valid, metrics in clf.best_score_.items():
            for metric, score in metrics.items():
                getLogger(get_version()).info("\t\t >> Best {} {}: {}".format(train_or_valid, metric, score))
                send_message("\t\t :star-struck: Best {} {}: {}".format(train_or_valid, metric, score))
        # Out-of-fold predictions for this fold's validation rows.
        validity.loc[validity.index.isin(val_x.index), "Predict"] = clf.predict_proba(val_x.values)[:, 1]
        # Calculate feature importance per fold
        # (feature names are recorded once, on fold 0)
        if fold == 0:
            feature_importance["feature"] = trn_x.columns
        feature_importance["fold{}".format(fold)] = clf.get_feature_importance()
        # Measure finish time of the classification of this fold
        elapsed_time = int(time.time() - start)
        minutes, sec = divmod(elapsed_time, 60)
        hour, minutes = divmod(minutes, 60)
        getLogger(get_version()).info(
            "\t >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))
        send_message("\t :cat: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}".format(fold, hour, minutes, sec))
        # Post-process this fold
        clf.save_model(str(model_path / "valid{}.model".format(fold)))
    # Output CV score
    validity = output_cv(validity, ":cat:")
    # Save importance
    directory_path = Path(__file__).absolute().parents[2] / "importance"
    save_feature_importance(feature_importance, directory_path)
    # Post-process the training
    del feature_importance
    gc.collect()
    return validity
'early_stopping_rounds': 100, 'iterations': 1000, 'verbose': 20, 'random_seed': 1031 } # 上記のパラメータでモデルを学習する clf = CatBoostClassifier(**params) # パラメータをハッシュ化してファイル名に投げる hs = hashlib.md5(str(params).encode()).hexdigest() clf.fit(trains, eval_set=valids, use_best_model=True, plot=True) # feature importance feature_importances = clf.get_feature_importance(trains) feature_names = X_train.columns with open(f"./output/importance_catb_{hs}.csv", "w", newline="", encoding="utf-8") as f: f.write("feature,importance\n") for score, name in sorted(zip(feature_importances, feature_names), reverse=True): f.write(f'{name},{score}\n') print(f'{name}: {score}') # テストデータを予測する y_pred_proba = clf.predict_proba(X_test) y_pred = clf.predict(X_test)
def kfold_catboost(df, num_folds, stratified = False, debug= False):
    """K-fold (optionally stratified) CatBoost training with out-of-fold and
    averaged test predictions; prints per-fold AUC and writes OOF/submission/
    feature-importance CSV files."""
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting CatBoost. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Replace infinities and missing values so CatBoost gets clean input.
    train_df = train_df.replace(np.inf, 0)
    test_df = test_df.replace(np.inf, 0)
    train_df = train_df.fillna(0)
    test_df = test_df.fillna(0)
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
        catboost_params = {
            'iterations': 10000,
            'verbose': 1000,
            'learning_rate': 0.05,
            'depth': 8,
            'l2_leaf_reg': 40,
            'bootstrap_type': 'Bernoulli',
            'subsample': 0.7,
            'scale_pos_weight': 5,
            'eval_metric': 'AUC',
            'od_type': 'Iter',
            'od_wait': 200,
            'allow_writing_files': False
        }
        clf = CatBoostClassifier(**catboost_params)
        clf.fit(train_x, train_y,eval_set=[(train_x,train_y),(valid_x, valid_y)], use_best_model=True, verbose=1000, early_stopping_rounds=200)
        # OOF predictions for validation rows; test predictions averaged over folds.
        oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
        sub_pred = clf.predict_proba(test_df[feats])[:, 1]
        sub_preds += sub_pred / folds.n_splits
        auc_score = roc_auc_score(valid_y, oof_preds[valid_idx])
        fold_importance_df = pd.DataFrame(clf.get_feature_importance(prettified=True),columns=["feature","importance"])
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_score))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    write_to_csv(train_df,oof_preds,"oof_catboost.csv")
    write_to_csv(test_df,sub_preds,"submission_catboost.csv")
    # Write submission file and plot feature importance
    # if not debug:
    #     test_df['TARGET'] = sub_preds
    #     test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    # Mean importance across folds, used only for the contributing-feature count.
    temp = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)
    print("no. of contributing features: %d" % (len(temp[temp["importance"]>0])))
    display_importances(feature_importance_df)
    feature_importance_df.groupby("feature").mean().sort_values("importance",ascending=False)["importance"].to_csv("feature_importance.csv")
def test_interaction_feature_importance():
    """Canonical test: pairwise 'Interaction' feature importances."""
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5, random_seed=0)
    clf.fit(data)
    interactions = clf.get_feature_importance(data, fstr_type='Interaction')
    np.save(FIMP_PATH, np.array(interactions))
    return local_canonical_file(FIMP_PATH)
cat_features = categorical_features_indices # In[129]: categorical_features_indices # In[131]: feature_score = pd.DataFrame(list(zip(one_hot.dtypes.index, model.get_feature_importance(Pool(one_hot, label=Y, cat_features=categorical_features_indices)))), columns=['Feature','Score']) feature_score = feature_score.sort_values(by='Score', ascending=False, inplace=False, kind='quicksort', na_position='last') # In[152]: plt.rcParams["figure.figsize"] = (502,7) ax = feature_score.plot('Feature', 'Score', kind='bar', color='r') ax.set_title("Catboost Feature Importance Ranking", fontsize = 6) ax.set_xlabel('') rects = ax.patches labels = feature_score['Score'].round(2) for rect, label in zip(rects, labels): height = rect.get_height()
y_val = y.iloc[valid_index] model = CatBoostClassifier(**params) model.fit(X_tr, y_tr, eval_set=(X_val, y_val), plot=True) y_pred_valid = model.predict(X_val) y_oof[valid_index] = y_pred_valid print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_val, y_pred_valid)}") score += roc_auc_score(y_val, y_pred_valid) / NFOLDS y_preds += model.predict_proba(X_test)[:, 1] / NFOLDS fold_importance_df = pd.DataFrame() fold_importance_df['feature'] = columns fold_importance_df['importance'] = model.get_feature_importance() feature_importance_df = pd.concat( [feature_importance_df, fold_importance_df], axis=0) del model, X_val, X_tr, y_val, y_tr gc.collect() print(f"\nMean AUC = {score}") print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}") # In[17]: sub = pd.read_csv('sample_submission.csv') # In[18]:
# Convert raw model scores (log-odds) into probabilities.
sigmoid = lambda x: 1 / (1 + exp(-x))
probabilities = sigmoid(raw_pred)
print(probabilities)

# In[40]:

# Yield predictions after each of the first 5 trees, one tree at a time.
predictions_gen = model.staged_predict_proba(
    data=X_validation,
    ntree_start=0,
    ntree_end=5,
    eval_period=1
)
try:
    for iteration, predictions in enumerate(predictions_gen):
        print('Iteration ' + str(iteration) + ', predictions:')
        print(predictions)
except Exception:
    # NOTE(review): all errors from the staged-prediction loop are silently
    # swallowed — confirm this best-effort behaviour is intentional.
    pass

# In[41]:

# Feature importances as a readable table.
model.get_feature_importance(prettified=True)
def model_catboost(self, X, y, X_train, y_train, X_test, y_test,
                   categorical_features_indices, target, file):
    """Train a multiclass CatBoost model with early stopping, cross-validate
    it, print an extensive evaluation report (classification report, confusion
    matrices, scores, feature importances), and write predictions to ``file``.

    Returns (model, cv_data).
    """
    print("Processing CATBOOST....")
    # Added: begin
    train_pool = Pool(X_train, y_train,
                      cat_features=categorical_features_indices)
    validate_pool = Pool(X_test, y_test,
                         cat_features=categorical_features_indices)
    # end
    # model=CatBoostClassifier(loss_function='MultiClass',use_best_model=True, random_seed=42)#, class_weights=[1,2,3,4,5,6,7,8,9,10,11])
    model = CatBoostClassifier(loss_function='MultiClass',
                               eval_metric='TotalF1',
                               use_best_model=True,
                               random_seed=42,
                               leaf_estimation_method='Newton')
    model.fit(train_pool,
              eval_set=validate_pool,
              use_best_model=True,
              verbose=50,
              plot=False,
              early_stopping_rounds=100)
    # cross-validation with the same hyper-parameters
    cv_params = model.get_params()
    cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                 cv_params,
                 fold_count=10,
                 plot=False)
    print('Precise validation accuracy score: {}'.format(
        np.max(cv_data)))  # ['TotalF1']
    # end
    print("PRIMER prediccion")
    print()
    print(model)
    # make predictions
    expected_y = y_test
    predicted_y = model.predict(X_test)
    # summarize the fit of the model
    print()
    print(metrics.classification_report(expected_y, predicted_y))
    print()
    print(metrics.confusion_matrix(expected_y, predicted_y))
    print("SEGUNDO prediccion")
    print(model.best_iteration_, model.best_score_)
    print(model.evals_result_['validation']['MultiClass'][-10:])
    # prediction
    pred = model.predict(X_test)
    print("PREDICT")
    print(pred)
    print("print dataframe predictions:")
    cm = pd.DataFrame()
    # cm['DAMAGE'] = y_test
    cm[target] = y_test
    cm['Predict'] = model.predict(X_test)
    print(cm)
    print("SCORES")
    print(model.score(X_test, y_test))
    cm.to_csv(file)  # , index=False)
    # cm.to_csv("catboost_prediction.csv")#, index=False)
    # confusion matrix
    print("confusion matrix:")
    # conf_mat = get_confusion_matrix(model, Pool(X_train, y_train, cat_features=categorical_features_indices))
    conf_mat = get_confusion_matrix(
        model, Pool(X_test, y_test,
                    cat_features=categorical_features_indices))
    print(conf_mat)
    # feature selection
    print(model.get_feature_importance(prettified=True))
    # feature_importances = model.get_feature_importance(train_pool)
    # feature_names = X_train.columns
    # for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    #     print('{}: {}'.format(name, score))
    ##
    return model, cv_data
# To Pool Class (for catboost only) pool_train=Pool(X_train, Y_train,cat_features=Pos) # Fit the model print('\nCatboost Optimal Fit with %d rounds...\n' % nrounds) model_catboost.fit(X=pool_train) # 3) Shap Importance for the features of the final model ################################################################################ # Shap methodology: # "https://medium.com/@gabrieltseng/interpreting-complex-models-with-shap-values-1c187db6ec83" # Catboost has already SHAP integrated ShapImportance=model_catboost.get_feature_importance(data=pool_train, type='ShapValues', prettified=True, verbose=False) ShapValues=pd.DataFrame(ShapImportance[:,:-1], columns=list(X_train)) # Shap come as variation with respect to LOG Odds!!! # We make an adaptataion to express the probability variation ShapValues['SUMX_LO'] = ShapValues.sum(axis=1) ShapValues['EXP_VAL_LO'] = ShapImportance[0,-1] ShapValues['Pred_LO'] = ShapValues['EXP_VAL_LO'] + ShapValues['SUMX_LO'] ShapValues['EXP_VAL_p']=1/(1+np.exp(-ShapValues['EXP_VAL_LO'])) ShapValues['pred_p']=1/(1+np.exp(-ShapValues['Pred_LO'])) ShapValues['SUMX_p']=ShapValues['pred_p']-ShapValues['EXP_VAL_p'] cols=list(ShapValues)[0:-6]
# In[77]: #Check the ROC AUC score on validation dataset roc_auc_score(Y_test, y_pred) # In[78]: #Plot roc curve plot_roc_curve(model, X_test, Y_test) # In[79]: #Check the feature importance of our model model.get_feature_importance(data=catboost.Pool(X_test, label=Y_test, cat_features=cat_cols), type='FeatureImportance', prettified=True, verbose=True) # In[80]: #Check the interaction between features in the model model.get_feature_importance(data=catboost.Pool(X_test, label=Y_test, cat_features=cat_cols), type='Interaction', prettified=True, verbose=True) # In[58]:
# ROC curve / AUC for the CatBoost predictions.
fpr_rf_cat, tpr_rf_cat, _ = roc_curve(y_test, y_pred_cat)
auc(fpr_rf_cat, tpr_rf_cat)
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.plot(fpr_rf_cat, tpr_rf_cat, label='RF')
#plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
#plt.plot(fpr_grd, tpr_grd, label='GBT')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

# Importance computed on the test matrix flattened to 30000 rows.
feature_importance = model.get_feature_importance(np.reshape(
    x_test, (30000, -1)), y=y_test)

## Logistic Regression
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
logistic.fit(x_train, y_train)
y_pred_logistic = logistic.predict(x_test)
logistic.score(x_test, y_test.values.reshape(-1, 1))
logistic.predict_proba(x_test)

from sklearn.metrics import roc_curve, auc

# ROC curve / AUC for the logistic-regression baseline.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_logistic)
auc(fpr, tpr)
def test_one_doc_feature_importance():
    """Canonical test for per-document ('Doc') importances of object 0."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5, random_seed=0)
    clf.fit(train_pool)
    fstr = clf.get_feature_importance(
        np.ones(train_pool.num_col(), dtype=int),
        0,
        cat_features=train_pool.get_cat_feature_indices(),
        fstr_type='Doc')
    np.save(FIMP_PATH, np.array(fstr))
    return local_canonical_file(FIMP_PATH)
verbose=VERBOSE, random_state=RANDOM_STATE, thread_count=N_THREADS, task_type="GPU") model.fit( train_dataset, eval_set=valid_dataset, early_stopping_rounds=EARLY_STOPPING_ROUNDS, ) y_pred_valid = model.predict_proba(valid_dataset)[:, 1] y_pred = model.predict_proba(test_dataset)[:, 1] fold_importance = pd.DataFrame() fold_importance["feature"] = model.feature_names_ fold_importance["importance"] = model.get_feature_importance() fold_importance["fold"] = fold_n + 1 feature_importance = pd.concat([feature_importance, fold_importance], axis=0) best_iteration = model.best_iteration_ best_iterations.append(best_iteration) fold_score = roc_auc_score(y_valid, y_pred_valid) scores.append(fold_score) update_tracking( run_id, "AUC_f{}".format(fold_n + 1), fold_score, integer=False, )
# Keep only the second column of the label arrays — presumably the
# positive class of one-hot encoded labels; TODO confirm with the caller.
y_train = y_train[:,1]
y_test = y_test[:,1]
train_pool = Pool(X_train, y_train)
eval_pool = Pool(X_test, y_test)

# load model
model = CatBoostClassifier()
model.load_model('models/catboost_model_4.dump')

# Feature Importance: Know which feature contributed the most
feature_importances = model.get_feature_importance(train_pool)
feature_names = pd.DataFrame(X_train).columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))
print('\n\n\n')
print(model.get_best_score())
print(model.get_params())

# Validation Prediction
probabilities = model.predict(eval_pool)
# print(probabilities)
pd.DataFrame(probabilities).to_csv('validation-scores/val-scores-3.csv')
# Fit a default CatBoost classifier with the test split as eval set.
model2 = CatBoostClassifier()
model2.fit(X_train2,
           y_train2,
           cat_features=categorical_features_indices2,
           eval_set=(X_test2, y_test2))

# In[156]:

print('Accuracy of CatBoost classifier on training set: {:.2f}'.format(
    model2.score(X_train2, y_train2)))
print('Accuracy of CatBoost classifier on test set: {:.2f}'.format(
    model2.score(X_test2, y_test2)))

# In[157]:

# Feature importances of the fitted model.
model2.get_feature_importance()

# In[158]:

X2.columns

# In[183]:

X_test2.shape

# In[184]:

y_test2.shape

# In[185]:
pd.read_csv( '/Users/jacobtryba/DSI/assignments/supervised-learning-case-study/data/churn_train.csv' )).drop('months_as_user', axis=1) df_test = clean( pd.read_csv( '/Users/jacobtryba/DSI/assignments/supervised-learning-case-study/data/churn_test.csv' )).drop('months_as_user', axis=1) X_train, X_test, y_train, y_test = X_y(df_train) model = CatBoostClassifier(iterations=2, depth=2, learning_rate=1, loss_function='Logloss', verbose=True) # train the model model.fit(X_train, y_train) # make the prediction using the resulting model preds_class = model.predict(X_test) preds_proba = model.predict_proba(X_test) print(preds_class) print(preds_proba) print( sklearn.metrics.accuracy_score(y_test, preds_class, normalize=True, sample_weight=None)) print(model.get_feature_importance())
cbc.fit(X_train, y_train, cat_features=cat_features, logging_level='Verbose', eval_set=(X_val, y_val), # early_stopping_rounds=100, use_best_model=True, plot=True) print("Count of trees in model = {}".format(cbc.tree_count_)) # Print Feature Importance train_pool = Pool(X_train, y_train, cat_features=cat_features) feature_importances = cbc.get_feature_importance(train_pool) feature_names = X_train.columns for score, name in sorted(zip(feature_importances, feature_names), reverse=True): print('{}: {}'.format(name, score)) # Plotting sns.set(font_scale=2) def func_plot_importance(df_imp): sns.set(font_scale=1) fig = plt.figure(figsize=(3, 3), dpi=100) ax = sns.barplot( x="Importance", y="Features", data=df_imp, label="Total", color="b") ax.tick_params(labelcolor='k', labelsize='10', width=3)
def model_rpt(df, config):
    """Train a CatBoost model on *df* and write a text evaluation report.

    Partitions the data into train/eval/test splits, fits a
    CatBoostClassifier, reports AUC per split, tabulates score thresholds
    for low false-positive rates, and ranks features by mean |SHAP| value.
    The report goes to ``config['output_path'] + config['output_file'] +
    ".txt"`` and is optionally uploaded to S3.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input containing the configured feature columns and target.
    config : dict
        Expects keys: 'numeric_features', 'categorical_features',
        'target_feature', 'output_path', 'output_file', 'n_samples',
        'pct_missing', 's3_upload'.

    Returns
    -------
    CatBoostClassifier
        The fitted model.
    """
    print("training model...")
    numeric_features = list(config['numeric_features'].keys())
    categorical_features = list(config['categorical_features'].keys())
    target = config['target_feature']

    # FIX(review): open the report inside a context manager so the handle is
    # closed even when training/scoring raises (the original only closed it
    # on the happy path, leaking the file on any exception).
    with open(config['output_path'] + config['output_file'] + ".txt",
              'w') as rpt:
        rpt.write("--- dataset summary --- \n")
        rpt.write("output_file = " + config['output_file'] + ".csv\n")
        rpt.write("n_samples = " + str(config['n_samples']) + "\n")
        rpt.write("n_features = " +
                  str(len(numeric_features) + len(categorical_features)) +
                  "\n")
        rpt.write("pct_missing = " +
                  "{:.2%}".format(config['pct_missing']) + "\n")
        rpt.write("\n")
        rpt.write("--- model features --- \n")
        rpt.write("numeric_features = " + str(numeric_features) + "\n")
        rpt.write("categorical_features = " + str(categorical_features) +
                  "\n")
        rpt.write("\n")

        # Partition: 80/20 train/holdout, then split the holdout evenly into
        # eval and test (prep_df/part_df/column_index are project helpers).
        df = prep_df(df, numeric_features, categorical_features)
        X_train, X_eval = part_df(
            df[numeric_features + categorical_features + [target]], 0.2)
        X_eval, X_test = part_df(X_eval, 0.5)
        categorical_features_pos = column_index(
            X_train[numeric_features + categorical_features],
            categorical_features)

        # Initialize CatBoostClassifier (100 iterations).
        model = CatBoostClassifier(100)
        # Fit model
        model.fit(X_train[numeric_features + categorical_features],
                  X_train[target].values,
                  plot=False,
                  verbose=False,
                  cat_features=categorical_features_pos,
                  eval_set=(X_eval[numeric_features + categorical_features],
                            X_eval[target].values))

        # --- model predict ---
        # Positive-class probability on each split.
        prob_cat_train = model.predict_proba(
            X_train[numeric_features + categorical_features])[:, 1]
        prob_cat_eval = model.predict_proba(
            X_eval[numeric_features + categorical_features])[:, 1]
        prob_cat_test = model.predict_proba(
            X_test[numeric_features + categorical_features])[:, 1]

        print("(Train)")
        print("AUC Score : %f" %
              roc_auc_score(X_train[target].values, prob_cat_train))
        print("\n")
        print("(eval)")
        print("AUC Score : %f" %
              roc_auc_score(X_eval[target].values, prob_cat_eval))
        print("\n")
        print("(Test)")
        print("AUC Score : %f" %
              roc_auc_score(X_test[target], prob_cat_test))
        print("\n")

        rpt.write("--- dataset performance ---\n")
        rpt.write("Train AUC Score : %f" %
                  roc_auc_score(X_train[target].values, prob_cat_train))
        rpt.write("\n")
        rpt.write("Eval AUC Score : %f" %
                  roc_auc_score(X_eval[target].values, prob_cat_eval))
        rpt.write("\n")
        rpt.write("Test AUC Score : %f" %
                  roc_auc_score(X_test[target], prob_cat_test))
        rpt.write("\n\n")

        # Build a 2-decimal ROC table; for each distinct fpr keep the row
        # with the highest threshold, then report thresholds for
        # 0 < fpr <= 0.1.
        fpr, tpr, thr = roc_curve(X_test[target], prob_cat_test)
        model_stat = pd.concat([
            pd.DataFrame(fpr).rename(columns={0: 'fpr'}),
            pd.DataFrame(tpr).rename(columns={0: 'tpr'}),
            pd.DataFrame(thr).rename(columns={0: 'threshold'})
        ], axis=1).round(decimals=2)
        # m = model_stat.loc[model_stat['fpr'] <= 0.1]
        m = model_stat.loc[model_stat.groupby(["fpr"])["threshold"].idxmax()]
        # m1 = m.loc[model_stat['threshold'].idxmax()]
        print("--- score thresholds ---")
        print(m.loc[(m['fpr'] > 0.0)
                    & (m['fpr'] <= 0.1)].reset_index(drop=True))
        print("\n")
        fprtbl = m.loc[(m['fpr'] > 0.0)
                       & (m['fpr'] <= 0.1)].reset_index(drop=True)
        rpt.write("--- score thresholds ---\n")
        rpt.write("THR \t FPR \t TPR\t \n")
        for index, row in fprtbl.iterrows():
            rpt.write("%1.2f\t %1.2f\t %1.2f\n" %
                      (row['threshold'], row['fpr'], row['tpr']))
        rpt.write("\n")

        # Feature importance = mean |SHAP| over the test pool; CatBoost
        # appends a bias column, stripped here via [:, :-1].
        shap_values = model.get_feature_importance(
            Pool(X_test[numeric_features + categorical_features],
                 label=X_test[target],
                 cat_features=categorical_features_pos),
            type="ShapValues")
        shap_values = shap_values[:, :-1]
        vals = np.abs(shap_values).mean(0)
        feature_importance = pd.DataFrame(
            list(zip(X_train.columns, vals)),
            columns=['feature_name', 'feature_importance'])
        feature_importance.sort_values(by=['feature_importance'],
                                       ascending=False,
                                       inplace=True)
        print("--- feature importance ---")
        print(feature_importance.reset_index(drop=True))
        rpt.write("--- feature importance ---\n")
        for index, row in feature_importance.iterrows():
            rpt.write("%-30s\t %1.4f\n" %
                      (row['feature_name'], row['feature_importance']))
        rpt.write("\n")

    if config["s3_upload"] == "True":
        ret = upload_file_s3('txt', config)

    return model
def test_shap_feature_importance():
    """Canonize SHAP feature-importance values for a small model.

    Trains a 5-iteration classifier on the canonical train pool, saves the
    ShapValues matrix to FIMP_PATH with np.save, and returns the
    canonical-file fixture for comparison against the stored baseline.
    """
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5,
                             random_seed=0,
                             max_ctr_complexity=1)
    clf.fit(data)
    shap_matrix = np.array(
        clf.get_feature_importance(data, fstr_type='ShapValues'))
    np.save(FIMP_PATH, shap_matrix)
    return local_canonical_file(FIMP_PATH)