encoder = joblib.load('models/encoder.pkl')

# Load data
X_train = encoder.transform(pd.read_csv('data/X_train.csv'))
y_train = pd.read_csv('data/y_train.csv')['Survived']

# Determine categorical features
cat_features = np.where(X_train.dtypes == int)[0]

# Create a validation set with 20% of the training set
X_fit, X_val, y_fit, y_val = model_selection.train_test_split(X_train, y_train,
                                                              test_size=0.2)

# Initialize stage 0 models
models = {
    'rf': ensemble.RandomForestClassifier(random_state=1),
    'catboost': catboost.CatBoostClassifier()
}

# Initialize stack
stack = xam.stacking.StackingClassifier(
    models=models,
    meta_model=linear_model.LogisticRegression(),
    cv=model_selection.StratifiedKFold(n_splits=10),
    use_base_features=True,
    use_proba=True
)

stack.fit(
    X=X_train,
    y=y_train,
    fit_params={
    return X, y


def read_yaml(path):
    with open(path, "r") as f:
        return yaml.safe_load(f)


MODEL_PARAMS = {"allow_writing_files": False, "iterations": 10}


@pytest.fixture(
    scope="module",
    params=[
        cb.CatBoost(MODEL_PARAMS),
        cb.CatBoostClassifier(**MODEL_PARAMS),
        cb.CatBoostRegressor(**MODEL_PARAMS),
    ],
    ids=["CatBoost", "CatBoostClassifier", "CatBoostRegressor"],
)
def cb_model(request):
    model = request.param
    X, y = get_iris()
    return ModelWithData(model=model.fit(X, y), inference_dataframe=X)


@pytest.fixture
def reg_model():
    model = cb.CatBoostRegressor(**MODEL_PARAMS)
    X, y = get_iris()
    return ModelWithData(model=model.fit(X, y), inference_dataframe=X)
from sklearn.metrics import roc_auc_score

tr_index = ~all_data['label'].isnull()
X_train = all_data[tr_index][list(set(feature_name))].reset_index(drop=True)
y = all_data[tr_index]['label'].reset_index(drop=True).astype(int)
X_test = all_data[~tr_index][list(set(feature_name))].reset_index(drop=True)
print(X_train.shape, X_test.shape)

random_seed = 2019
final_pred = []
cv_score = []
cv_model = []
skf = StratifiedKFold(n_splits=5, random_state=random_seed, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
    print(index)
    train_x, test_x = X_train[feature_name].iloc[train_index], X_train[feature_name].iloc[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[test_index]
    # GPU training is selected via task_type; 'devices' takes device IDs, not 'GPU'
    cbt_model = cbt.CatBoostClassifier(iterations=3000,
                                       learning_rate=0.1,
                                       max_depth=7,
                                       l2_leaf_reg=10,
                                       verbose=10,
                                       early_stopping_rounds=100,
                                       task_type='GPU',
                                       eval_metric='F1')
    cbt_model.fit(train_x[feature_name], train_y, eval_set=(test_x[feature_name], test_y))
    cv_model.append(cbt_model)
    y_test = cbt_model.predict(X_test[feature_name])
    y_val = cbt_model.predict_proba(test_x[feature_name])
    print(Counter(np.argmax(y_val, axis=1)))
    cv_score.append(f1_score(test_y, np.round(y_val[:, 1])))

# CatBoost is well suited to data with many categorical features
# 5-fold GPU results
# Fold 1
# bestTest = 0.9358583857
# bestIteration = 1437
# Fold 2
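# The folds above rely on pre-encoded features. A minimal, self-contained sketch
# (hypothetical column names) of how raw categorical columns can instead be
# passed to CatBoost via cat_features, so it applies its own target statistics
# without manual encoding:
import pandas as pd
import catboost as cbt

toy = pd.DataFrame({'city': ['a', 'b', 'a', 'b'],   # categorical feature
                    'clicks': [1, 2, 3, 4],         # numeric feature
                    'label': [0, 1, 0, 1]})
toy_model = cbt.CatBoostClassifier(iterations=10, verbose=0)
toy_model.fit(toy[['city', 'clicks']], toy['label'], cat_features=['city'])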
          model1.predict_proba(X.iloc[val_idx, :])[:, 1]))
    preds[:, i] = model1.predict_proba(public)[:, 1]

# ================= CatBoost ===================== #
cat_feature_inds = []
for i, c in enumerate(X.columns.values):
    num_uniques = len(X[c].unique())
    if num_uniques < 3:
        cat_feature_inds.append(i)

preds = np.zeros((len(public), 5))
for i, (train_idx, val_idx) in enumerate(kf.split(X)):
    cat_model = catboost.CatBoostClassifier(iterations=600,
                                            learning_rate=0.03,
                                            depth=6,
                                            l2_leaf_reg=1,
                                            eval_metric='Logloss',
                                            random_seed=4 * 100 + 6)
    cat_model.fit(X.iloc[train_idx, :], y[train_idx], cat_features=cat_feature_inds)
    print("Train Log loss is %.4f" %
          log_loss(y[train_idx], cat_model.predict_proba(X.iloc[train_idx, :])[:, 1]))
    print("Validation Log loss is %.4f" %
          log_loss(y[val_idx], cat_model.predict_proba(X.iloc[val_idx, :])[:, 1]))
    preds[:, i] = cat_model.predict_proba(public)[:, 1]

# lightgbm
def main():
    # Load CLI params
    parser = argparse.ArgumentParser(prog="grid")
    parser.add_argument("-d", "--data", dest="data", required=True)
    parser.add_argument("-m", "--model", dest="model", required=True,
                        choices=["xgboost", "lightgbm", "catboost"])
    parser.add_argument("-t", "--tag", dest="tag", required=True)
    parser.add_argument("-n", "--number", dest="number", type=int, default=-1)
    args = parser.parse_args()

    # Load config
    cf = c.Config(args.data, args.model, args.tag)
    grid_params = cf.load_config_file("grid")
    hyper_params = cf.load_config_file("hyper")
    fit_params = {}

    # Load data
    features_to_load = list(cf.features.keys())
    source_file = os.path.join(c.DATA_FOLDER, args.data)
    data = pd.read_csv(source_file, usecols=features_to_load, sep=";",
                       decimal=".", encoding="latin1",
                       keep_default_na=False, na_values=[""])

    # Preprocessing
    group_categoricals_tail(data, cf.classes["categorical"])
    if args.model == "xgboost":
        import xgboost as xgb
        estimator = xgb.XGBClassifier()
        data = pd.get_dummies(data, columns=cf.classes["categorical"]).copy()
    elif args.model == "lightgbm":
        import lightgbm as lgb
        from sklearn.preprocessing import LabelEncoder
        estimator = lgb.LGBMClassifier()
        label_encoding = {}
        for col in cf.classes["categorical"]:
            unique_values = data[col].unique().tolist()
            label_encoding[col] = LabelEncoder()
            label_encoding[col].fit(sorted(unique_values))
            data[col] = label_encoding[col].transform(data[col].values)
        fit_params = {"categorical_feature": cf.classes["categorical"]}
    elif args.model == "catboost":
        import catboost as cb
        estimator = cb.CatBoostClassifier()
    else:
        # Execution should never reach this branch
        raise ValueError(
            "Something went wrong: {} is not a feasible model.".format(args.model))

    features = [
        x for x in sorted(data.columns.tolist())
        if x not in cf.classes["label"] + cf.classes["index"]
    ]
    if args.model == "lightgbm":
        fit_params["feature_name"] = features
    elif args.model == "catboost":
        fit_params["cat_features"] = [
            i for i, f in enumerate(features)
            if f in cf.classes["categorical"]
        ]

    # Sampling: only subsample when a sample size was given and it fits the data
    if args.number != -1 and args.number <= data.shape[0]:
        data = data.sample(args.number)
    d = data.loc[:, features].values

    # Prepare to save
    output = "_".join([args.model, args.tag, "grid"]) + ".xlsx"
    excel_writer = pd.ExcelWriter(os.path.join(c.RESULTS_FOLDER, output),
                                  engine="xlsxwriter")

    # Grid
    results = {}
    for label in cf.classes["label"]:
        print("----------------------------", end="\n")
        print(label, end="\n")
        print("----------------------------", end="\n\n")
        y = data.loc[:, label].values
        rand_grid_cv = RandomizedSearchCV(estimator,
                                          param_distributions=hyper_params,
                                          **grid_params)
        rand_grid_cv.fit(d, y, **fit_params)
        results[label] = rand_grid_cv.best_params_
        results_df = pd.DataFrame(rand_grid_cv.cv_results_)
        results_df.to_excel(excel_writer, sheet_name=label, index=False)

    # Save
    excel_writer.save()
    file_name = "best"
    cf.save_config_file(file_name, results)
# Define hyperparameters
params = {
    "depth": [1, 3, 7],
    "iterations": [100],
    "learning_rate": [0.01, 0.1, 0.2],
    "l2_leaf_reg": [1, 5, 10],
}

# Step 1: set up target metrics for evaluating training
# Define target loss metric to aim for
target_f1 = 0.7

# Step 2: instantiate the classifier and run a grid search to find the best parameters
clf = catboost.CatBoostClassifier()
grid_clf = GridSearchCV(clf, params, scoring="neg_log_loss")
grid_clf.fit(X_train, y_train)

# Set clf to the best combination of parameters
clf = grid_clf.best_estimator_
clf.fit(X_train, y_train)

# Step 3: evaluate the quality of the trained model
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

# Print the classification report for the classifier
print(classification_report(y_test, y_pred, target_names=class_names))

# Evaluate the quality of the trained model using the weighted F1 score
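# As an alternative to sklearn's GridSearchCV, CatBoost ships its own
# grid_search method that evaluates the same kind of parameter grid with
# built-in cross-validation. A minimal sketch reusing the params dict above;
# X_train and y_train are assumed to be the same arrays used in this section:
clf_gs = catboost.CatBoostClassifier(verbose=0)
search_result = clf_gs.grid_search(params, X=X_train, y=y_train, cv=3)
print(search_result['params'])  # best parameter combination found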
from sklearn import metrics
import catboost
import multiprocessing

d_train = pd.read_csv("train-1m.csv")
d_test = pd.read_csv("test.csv")

X_train = d_train.drop(['dep_delayed_15min'], axis=1)
X_test = d_test.drop(['dep_delayed_15min'], axis=1)
y_train = np.where(d_train["dep_delayed_15min"] == "Y", 1, 0)
y_test = np.where(d_test["dep_delayed_15min"] == "Y", 1, 0)

# np.object was removed from NumPy; the builtin object matches pandas object dtype
cat_cols = np.where(X_train.dtypes == object)[0]

md = catboost.CatBoostClassifier(iterations=100,
                                 depth=10,
                                 learning_rate=0.1,
                                 task_type="GPU")
## thread_count = multiprocessing.cpu_count())

%time md.fit(X_train, y_train, cat_features=cat_cols)

y_pred = md.predict_proba(X_test)[:, 1]
metrics.roc_auc_score(y_test, y_pred)
X_test = data[~tr_index][list(set(feature_name))].reset_index(drop=True)
print(X_train.shape, X_test.shape)

oof = np.zeros(X_train.shape[0])
prediction = np.zeros(X_test.shape[0])
seeds = [19970412, 2019 * 2 + 1024, 4096, 2048, 1024]
num_model_seed = 1
for model_seed in range(num_model_seed):
    oof_cat = np.zeros(X_train.shape[0])
    prediction_cat = np.zeros(X_test.shape[0])
    skf = StratifiedKFold(n_splits=5, random_state=seeds[model_seed], shuffle=True)
    for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
        print(index)
        train_x, test_x = X_train[feature_name].iloc[train_index], X_train[feature_name].iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]
        cbt_model = cbt.CatBoostClassifier(iterations=7000,
                                           learning_rate=0.1,
                                           max_depth=7,
                                           verbose=100,
                                           early_stopping_rounds=500,
                                           eval_metric='F1',
                                           task_type='GPU',
                                           cat_features=cat_list)
        cbt_model.fit(train_x[feature_name], train_y, eval_set=(test_x[feature_name], test_y))
        gc.collect()
        oof_cat[test_index] += cbt_model.predict_proba(test_x)[:, 1]
        prediction_cat += cbt_model.predict_proba(X_test[feature_name])[:, 1] / 5
    print('F1', f1_score(y, np.round(oof_cat)))
    oof += oof_cat / num_model_seed
    prediction += prediction_cat / num_model_seed

print('score', f1_score(y, np.round(oof)))

# Write the submission to csv
submit = test[['sid']]
submit['label'] = (prediction >= 0.499).astype(int)
print(submit['label'].value_counts())
submit.to_csv("round2_A_submission.csv", index=False)
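# The 0.499 cutoff above is hand-picked. Since the competition metric is F1,
# a threshold can instead be chosen by scanning candidate cutoffs on the
# out-of-fold probabilities; a minimal sketch assuming oof and y from above:
import numpy as np
from sklearn.metrics import f1_score

thresholds = np.arange(0.3, 0.7, 0.01)
scores = [f1_score(y, (oof >= t).astype(int)) for t in thresholds]
best_t = thresholds[int(np.argmax(scores))]
print('best threshold', best_t, 'F1', max(scores))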
num_model_seed = 5
for model_seed in range(num_model_seed):
    print(model_seed + 1)
    oof_cat = np.zeros((X_train.shape[0], 4))
    prediction_cat = np.zeros((X_test.shape[0], 4))
    skf = StratifiedKFold(n_splits=5, random_state=seeds[model_seed], shuffle=True)
    for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
        print(index)
        train_x, test_x = X_train.iloc[train_index], X_train.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]
        gc.collect()
        cbt_model = cbt.CatBoostClassifier(iterations=800,
                                           learning_rate=0.01,
                                           verbose=300,
                                           early_stopping_rounds=200,
                                           loss_function='MultiClass')
        # Evaluate on the held-out fold so early stopping is meaningful
        cbt_model.fit(train_x, train_y, eval_set=(test_x, test_y))
        oof_cat[test_index] += cbt_model.predict_proba(test_x)
        prediction_cat += cbt_model.predict_proba(X_test) / 5
        gc.collect()
    oof += oof_cat / num_model_seed
    prediction += prediction_cat / num_model_seed
    print('logloss', log_loss(pd.get_dummies(y).values, oof_cat))
    print('ac', accuracy_score(y, np.argmax(oof_cat, axis=1)))
    print('mae', 1 / (1 + np.sum(np.absolute(np.eye(4)[y] - oof_cat)) / 480))

print('logloss', log_loss(pd.get_dummies(y).values, oof))
print('ac', accuracy_score(y, np.argmax(oof, axis=1)))
print('mae', 1 / (1 + np.sum(np.absolute(np.eye(4)[y] - oof)) / 480))

sub = test[['Group']]
def test_check_mask_params(self, check_consistency_model_label,
                           check_consistency_model_features,
                           check_preprocessing_options, check_explainer,
                           check_model):
    """
    Unit test check mask params
    """
    train = pd.DataFrame({
        'Onehot1': ['A', 'B', 'A', 'B'],
        'Onehot2': ['C', 'D', 'C', 'D'],
        'Binary1': ['E', 'F', 'E', 'F'],
        'Binary2': ['G', 'H', 'G', 'H'],
        'Ordinal1': ['I', 'J', 'I', 'J'],
        'Ordinal2': ['K', 'L', 'K', 'L'],
        'BaseN1': ['M', 'N', 'M', 'N'],
        'BaseN2': ['O', 'P', 'O', 'P'],
        'Target1': ['Q', 'R', 'Q', 'R'],
        'Target2': ['S', 'T', 'S', 'T'],
        'other': ['other', np.nan, 'other', 'other']
    })
    features_dict = None
    columns_dict = {i: features for i, features in enumerate(train.columns)}
    features_types = {
        features: str(train[features].dtypes) for features in train.columns
    }
    label_dict = None

    enc_ordinal = ce.OrdinalEncoder(cols=[
        'Onehot1', 'Onehot2', 'Binary1', 'Binary2', 'Ordinal1', 'Ordinal2',
        'BaseN1', 'BaseN2', 'Target1', 'Target2', 'other'
    ]).fit(train)
    train_ordinal = enc_ordinal.transform(train)

    y = pd.DataFrame({'y_class': [0, 0, 0, 1]})
    model = cb.CatBoostClassifier(n_estimators=1).fit(train_ordinal, y)
    clf_explainer = shap.TreeExplainer(model)

    check_preprocessing_options.return_value = True
    check_consistency_model_features.return_value = True
    check_consistency_model_label.return_value = True
    check_explainer.return_value = clf_explainer
    check_model.return_value = "classification", [0, 1]

    wrong_mask_params_1 = list()
    wrong_mask_params_2 = None
    wrong_mask_params_3 = {
        "features_to_hide": None,
        "threshold": None,
        "positive": None
    }
    right_mask_params = {
        "features_to_hide": None,
        "threshold": None,
        "positive": True,
        "max_contrib": 5
    }

    with self.assertRaises(ValueError):
        predictor_1 = SmartPredictor(features_dict, model, columns_dict,
                                     clf_explainer, features_types, label_dict,
                                     mask_params=wrong_mask_params_1)
        predictor_1 = SmartPredictor(features_dict, model, columns_dict,
                                     clf_explainer, features_types, label_dict,
                                     mask_params=wrong_mask_params_2)
        predictor_1 = SmartPredictor(features_dict, model, columns_dict,
                                     clf_explainer, features_types, label_dict,
                                     mask_params=wrong_mask_params_3)

    predictor_1 = SmartPredictor(features_dict, model, columns_dict,
                                 clf_explainer, features_types, label_dict,
                                 mask_params=right_mask_params)
def test_summarize_2(self):
    """
    Unit test 2 summarize method
    """
    predictor_1 = self.predictor_3
    predictor_1._case = "classification"
    predictor_1._classes = [0, 1]
    clf = cb.CatBoostClassifier(n_estimators=1).fit(self.df_3[['x1', 'x2']],
                                                    self.df_3['y'])
    clf_explainer = shap.TreeExplainer(clf)
    predictor_1.model = clf
    predictor_1.explainer = clf_explainer

    with self.assertRaises(ValueError):
        predictor_1.summarize()

    predictor_1.data = {
        "x": None,
        "x_preprocessed": None,
        "x_postprocessed": None,
        "ypred": None,
        "contributions": None
    }
    predictor_1.data["x"] = self.df_3[["x1", "x2"]]
    predictor_1.data["x_preprocessed"] = self.df_3[["x1", "x2"]]
    predictor_1.data["x_postprocessed"] = self.df_3[["x1", "x2"]]
    predictor_1.data["ypred"] = pd.DataFrame({
        "y": ["Yes", "Yes", "No", "No", "No"],
        "proba": [0.519221, 0.468791, 0.531209, 0.531209, 0.531209]
    })
    predictor_1.data["contributions"] = pd.DataFrame({
        "x1": [0, 0, -0, -0, -0],
        "x2": [0.161538, -0.0403846, 0.0403846, 0.0403846, 0.0403846]
    })
    output = predictor_1.summarize()

    expected_output = pd.DataFrame(
        {
            "y": ["Yes", "Yes", "No", "No", "No"],
            "proba": [0.519221, 0.468791, 0.531209, 0.531209, 0.531209],
            "feature_1": ["weight", "weight", "weight", "weight", "weight"],
            "value_1": ["90", "78", "84", "85", "53"],
            "contribution_1": ["0.161538", "-0.0403846", "0.0403846",
                               "0.0403846", "0.0403846"],
            "feature_2": ["age", "age", "age", "age", "age"],
            "value_2": ["25", "39", "50", "43", "67"],
            "contribution_2": ["0", "0", "0", "0", "0"]
        },
        dtype=object)
    expected_output["proba"] = expected_output["proba"].astype(float)

    feature_expected = [column for column in expected_output.columns
                        if column.startswith("feature_")]
    feature_output = [column for column in output.columns
                      if column.startswith("feature_")]
    value_expected = [column for column in expected_output.columns
                      if column.startswith("value_")]
    value_output = [column for column in output.columns
                    if column.startswith("value_")]
    contribution_expected = [column for column in expected_output.columns
                             if column.startswith("contribution_")]
    contribution_output = [column for column in output.columns
                           if column.startswith("contribution_")]

    assert expected_output.shape == output.shape
    assert len(feature_expected) == len(feature_output)
    assert len(value_expected) == len(value_output)
    assert len(contribution_expected) == len(contribution_output)
    assert all(output.columns == expected_output.columns)
def setUp(self):
    df = pd.DataFrame(range(0, 5), columns=['id'])
    df['y'] = df['id'].apply(lambda x: 1 if x < 2 else 0)
    df['x1'] = np.random.randint(1, 123, df.shape[0])
    df['x2'] = ["S", "M", "S", "D", "M"]
    df = df.set_index('id')

    encoder = ce.OrdinalEncoder(cols=["x2"], handle_unknown="None")
    encoder_fitted = encoder.fit(df)
    df_encoded = encoder_fitted.transform(df)
    clf = cb.CatBoostClassifier(n_estimators=1).fit(df_encoded[['x1', 'x2']],
                                                    df_encoded['y'])
    clf_explainer = shap.TreeExplainer(clf)

    columns_dict = {0: "x1", 1: "x2"}
    label_dict = {0: "Yes", 1: "No"}
    postprocessing = {
        "x2": {
            "type": "transcoding",
            "rule": {
                "S": "single",
                "M": "married",
                "D": "divorced"
            }
        }
    }
    features_dict = {"x1": "age", "x2": "family_situation"}
    features_types = {
        features: str(df[features].dtypes) for features in df[['x1', 'x2']]
    }

    self.df_1 = df
    self.preprocessing_1 = encoder_fitted
    self.df_encoded_1 = df_encoded
    self.clf_1 = clf
    self.clf_explainer_1 = clf_explainer
    self.columns_dict_1 = columns_dict
    self.label_dict_1 = label_dict
    self.postprocessing_1 = postprocessing
    self.features_dict_1 = features_dict
    self.features_types_1 = features_types
    self.predictor_1 = SmartPredictor(features_dict, clf, columns_dict,
                                      clf_explainer, features_types,
                                      label_dict, encoder_fitted,
                                      postprocessing)

    df['x2'] = np.random.randint(1, 100, df.shape[0])
    encoder = ce.OrdinalEncoder(cols=["x2"], handle_unknown="None")
    encoder_fitted = encoder.fit(df[["x1", "x2"]])
    df_encoded = encoder_fitted.transform(df[["x1", "x2"]])
    clf = cb.CatBoostClassifier(n_estimators=1).fit(df[['x1', 'x2']], df['y'])
    clf_explainer = shap.TreeExplainer(clf)
    features_dict = {"x1": "age", "x2": "weight"}
    features_types = {
        features: str(df[features].dtypes)
        for features in df[["x1", "x2"]].columns
    }

    self.df_2 = df
    self.preprocessing_2 = encoder_fitted
    self.df_encoded_2 = df_encoded
    self.clf_2 = clf
    self.clf_explainer_2 = clf_explainer
    self.columns_dict_2 = columns_dict
    self.label_dict_2 = label_dict
    self.postprocessing_2 = postprocessing
    self.features_dict_2 = features_dict
    self.features_types_2 = features_types
    self.predictor_2 = SmartPredictor(features_dict, clf, columns_dict,
                                      clf_explainer, features_types,
                                      label_dict, encoder_fitted,
                                      postprocessing)

    df['x1'] = [25, 39, 50, 43, 67]
    df['x2'] = [90, 78, 84, 85, 53]
    columns_dict = {0: "x1", 1: "x2"}
    label_dict = {0: "No", 1: "Yes"}
    features_dict = {"x1": "age", "x2": "weight"}
    features_types = {
        features: str(df[features].dtypes)
        for features in df[['x1', 'x2']].columns
    }
    clf = cb.CatBoostRegressor(n_estimators=1).fit(df[['x1', 'x2']], df['y'])
    clf_explainer = shap.TreeExplainer(clf)

    self.df_3 = df
    self.preprocessing_3 = None
    self.df_encoded_3 = df
    self.clf_3 = clf
    self.clf_explainer_3 = clf_explainer
    self.columns_dict_3 = columns_dict
    self.label_dict_3 = label_dict
    self.postprocessing_3 = None
    self.features_dict_3 = features_dict
    self.features_types_3 = features_types
    self.predictor_3 = SmartPredictor(features_dict, clf, columns_dict,
                                      clf_explainer, features_types,
                                      label_dict)
def test_check_preprocessing_1(self, check_consistency_model_label,
                               check_consistency_model_features,
                               check_preprocessing_options, check_explainer,
                               check_model):
    """
    Test check preprocessing on multiple preprocessing
    """
    train = pd.DataFrame({
        'Onehot1': ['A', 'B', 'A', 'B'],
        'Onehot2': ['C', 'D', 'C', 'D'],
        'Binary1': ['E', 'F', 'E', 'F'],
        'Binary2': ['G', 'H', 'G', 'H'],
        'Ordinal1': ['I', 'J', 'I', 'J'],
        'Ordinal2': ['K', 'L', 'K', 'L'],
        'BaseN1': ['M', 'N', 'M', 'N'],
        'BaseN2': ['O', 'P', 'O', 'P'],
        'Target1': ['Q', 'R', 'Q', 'R'],
        'Target2': ['S', 'T', 'S', 'T'],
        'other': ['other', np.nan, 'other', 'other']
    })
    features_dict = None
    columns_dict = {i: features for i, features in enumerate(train.columns)}
    features_types = {
        features: str(train[features].dtypes) for features in train.columns
    }
    label_dict = None

    enc_ordinal_all = ce.OrdinalEncoder(cols=[
        'Onehot1', 'Onehot2', 'Binary1', 'Binary2', 'Ordinal1', 'Ordinal2',
        'BaseN1', 'BaseN2', 'Target1', 'Target2', 'other'
    ]).fit(train)
    train_ordinal_all = enc_ordinal_all.transform(train)

    y = pd.DataFrame({'y_class': [0, 0, 0, 1]})
    model = cb.CatBoostClassifier(n_estimators=1).fit(train_ordinal_all, y)
    clf_explainer = shap.TreeExplainer(model)

    check_preprocessing_options.return_value = True
    check_consistency_model_features.return_value = True
    check_consistency_model_label.return_value = True
    check_explainer.return_value = clf_explainer
    check_model.return_value = "classification", [0, 1]

    predictor_1 = SmartPredictor(features_dict, model, columns_dict,
                                 clf_explainer, features_types, label_dict)

    y = pd.DataFrame(data=[0, 1, 0, 0], columns=['y'])

    enc_onehot = ce.OneHotEncoder(cols=['Onehot1', 'Onehot2']).fit(train)
    train_onehot = enc_onehot.transform(train)
    enc_binary = ce.BinaryEncoder(cols=['Binary1', 'Binary2']).fit(train_onehot)
    train_binary = enc_binary.transform(train_onehot)
    enc_ordinal = ce.OrdinalEncoder(cols=['Ordinal1', 'Ordinal2']).fit(train_binary)
    train_ordinal = enc_ordinal.transform(train_binary)
    enc_basen = ce.BaseNEncoder(cols=['BaseN1', 'BaseN2']).fit(train_ordinal)
    train_basen = enc_basen.transform(train_ordinal)
    enc_target = ce.TargetEncoder(cols=['Target1', 'Target2']).fit(train_basen, y)

    input_dict1 = dict()
    input_dict1['col'] = 'Onehot2'
    input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan],
                                       index=['C', 'D', 'missing'])
    input_dict1['data_type'] = 'object'

    input_dict2 = dict()
    input_dict2['col'] = 'Binary2'
    input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan],
                                       index=['G', 'H', 'missing'])
    input_dict2['data_type'] = 'object'

    input_dict = dict()
    input_dict['col'] = 'state'
    input_dict['mapping'] = pd.Series(data=['US', 'FR-1', 'FR-2'],
                                      index=['US', 'FR', 'FR'])
    input_dict['data_type'] = 'object'

    input_dict3 = dict()
    input_dict3['col'] = 'Ordinal2'
    input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan],
                                       index=['K', 'L', 'missing'])
    input_dict3['data_type'] = 'object'
    list_dict = [input_dict2, input_dict3]

    y = pd.DataFrame(data=[0, 1], columns=['y'])
    train = pd.DataFrame({
        'city': ['chicago', 'paris'],
        'state': ['US', 'FR'],
        'other': ['A', 'B']
    })
    enc = ColumnTransformer(transformers=[('onehot', skp.OneHotEncoder(),
                                           ['city', 'state'])],
                            remainder='drop')
    enc.fit(train, y)
    wrong_prepro = skp.OneHotEncoder().fit(train, y)

    predictor_1.preprocessing = [
        enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target,
        input_dict1, list_dict
    ]
    predictor_1.check_preprocessing()

    for preprocessing in [enc_onehot, enc_binary, enc_ordinal, enc_basen,
                          enc_target]:
        predictor_1.preprocessing = preprocessing
        predictor_1.check_preprocessing()

    predictor_1.preprocessing = input_dict2
    predictor_1.check_preprocessing()

    predictor_1.preprocessing = enc
    predictor_1.check_preprocessing()

    predictor_1.preprocessing = None
    predictor_1.check_preprocessing()

    with self.assertRaises(Exception):
        predictor_1.preprocessing = wrong_prepro
        predictor_1.check_preprocessing()
from scoring import calculate_scores

VAL_SPLIT = 0.2

data = get_data(val_split=VAL_SPLIT, apply_label_encoding=True, fillna=True)
X_train, X_val, X_test, y_train, y_val, categorical_features = (
    data["X_train"],
    data["X_val"],
    data["X_test"],
    data["y_train"],
    data["y_val"],
    data["categorical_features"],
)

clf = cb.CatBoostClassifier(
    n_estimators=200,
    learning_rate=0.05,
    metric_period=500,
    od_wait=500,
    task_type="CPU",
    depth=8,
)

print("Fitting a catboost model...")
clf.fit(X_train, y_train, cat_features=categorical_features)
_ = calculate_scores(clf, X_val, y_val)

for scoring in SCORING_LIST:
    print("Optimizing catboost params for", scoring, "with random search...")
    best_estimator = perform_random_search(
        estimator=clf,
        X_train=X_train,
        X_val=X_val,
def objective(trial):
    # --------------------------------------------
    # Hyperparameters tuned with Bayesian optimization
    # --------------------------------------------
    if args.classifier == "logistic":
        params = {
            'penalty': trial.suggest_categorical('penalty', ['l2']),
            'solver': trial.suggest_categorical('solver', ['sag']),
            'C': trial.suggest_discrete_uniform('C', 0.01, 100.0, 0.1),  # uniform distribution
            'random_state': trial.suggest_int('random_state', 71, 71),
            'n_jobs': trial.suggest_int('n_jobs', -1, -1),
        }
    elif args.classifier == "knn":
        params = {
            'metric': trial.suggest_categorical('metric', ['minkowski']),
            'p': trial.suggest_int('p', 1, 2),
            'n_neighbors': trial.suggest_int('n_neighbors', 1, 50),
            'n_jobs': trial.suggest_int('n_jobs', -1, -1),
        }
    elif args.classifier == "svm":
        params = {
            'kernel': trial.suggest_categorical('kernel', ['rbf']),
            'C': trial.suggest_loguniform('C', 0.1, 1000.0),
            'gamma': trial.suggest_loguniform('gamma', 1e-8, 10.0),
            'random_state': trial.suggest_int('random_state', 71, 71),
        }
    elif args.classifier == "random_forest":
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 1000, 1000),  # fixed during tuning
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),  # impurity criterion
            'max_features': trial.suggest_categorical('max_features', ['auto', 0.2, 0.4, 0.6, 0.8]),
            # min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            # whether to build the trees on bootstrap samples (default: True)
            'bootstrap': trial.suggest_int('bootstrap', True, True),
            # whether to use out-of-bag samples to estimate the generalization accuracy (default: False)
            'oob_score': trial.suggest_int('oob_score', False, True),
            'random_state': trial.suggest_int('random_state', 71, 71),
            'n_jobs': trial.suggest_int('n_jobs', -1, -1),
        }
    elif args.classifier == "bagging":
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 1000, 1000),  # fixed during tuning
            # fraction of samples used by each base estimator
            'max_samples': trial.suggest_float('max_samples', 0.0, 1.0),
            # number of features drawn from X to train each base estimator
            'max_features': trial.suggest_float('max_features', 0.0, 1.0),
            # whether to build the trees on bootstrap samples (default: True)
            'bootstrap': trial.suggest_int('bootstrap', True, True),
            'bootstrap_features': trial.suggest_int('bootstrap_features', False, True),
            'random_state': trial.suggest_int('random_state', 71, 71),
            'n_jobs': -1,
            # base-estimator parameters (accessible via the "base_estimator__" prefix)
            'base_estimator__max_depth': trial.suggest_int('base_estimator__max_depth', 1, 8),
            'base_estimator__max_features': trial.suggest_float('base_estimator__max_features', 0.0, 1.0),
            'base_estimator__min_samples_leaf': trial.suggest_int('base_estimator__min_samples_leaf', 1, 10),
            'base_estimator__min_samples_split': trial.suggest_int('base_estimator__min_samples_split', 2, 10),
            'base_estimator__random_state': trial.suggest_int('base_estimator__random_state', 71, 71),
        }
    elif args.classifier == "adaboost":
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 1000, 1000),  # fixed during tuning
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.01),  # fixed while tuning
            'random_state': 71,
            # base-estimator parameters (accessible via the "base_estimator__" prefix)
            'base_estimator__max_depth': trial.suggest_int('base_estimator__max_depth', 1, 10),
            'base_estimator__max_features': trial.suggest_float('base_estimator__max_features', 0.0, 1.0),
            'base_estimator__min_samples_leaf': trial.suggest_int('base_estimator__min_samples_leaf', 1, 10),
            'base_estimator__min_samples_split': trial.suggest_int('base_estimator__min_samples_split', 2, 10),
            'base_estimator__random_state': trial.suggest_int('base_estimator__random_state', 71, 71),
        }
    elif args.classifier == "xgboost":
        params = {
            'booster': trial.suggest_categorical('booster', ['gbtree']),
            'objective': trial.suggest_categorical('objective', ['binary:logistic']),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.01),  # fixed while tuning
            'n_estimators': trial.suggest_int('n_estimators', 1000, 1000),  # fixed while tuning
            'max_depth': trial.suggest_int('max_depth', 3, 9),  # 3-9: uniform, step 1
            'min_child_weight': trial.suggest_loguniform('min_child_weight', 0.1, 10.0),  # 0.1-10.0: log-uniform
            'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 0.95, 0.05),  # 0.6-0.95: uniform, step 0.05
            'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 0.95, 0.05),  # 0.6-0.95: uniform, step 0.05
            'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),  # 1e-8 to 1.0: log-uniform
            'alpha': trial.suggest_float('alpha', 0.0, 0.0),  # keep the default; revisit if time allows
            'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 1.0),  # keep the default; revisit if time allows
            'random_state': trial.suggest_int('random_state', 71, 71),
        }
    elif args.classifier == "lightgbm":
        params = {
            'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_class': 1,
            'num_leaves': trial.suggest_int('num_leaves', 10, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.01),
            'max_depth': trial.suggest_int('max_depth', 1, 5),
            'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 100),
            'reg_lambda': trial.suggest_uniform('reg_lambda', 1, 5),
            'verbose': 0,
        }
    elif args.classifier == "catboost":
        params = {
            'eval_metric': trial.suggest_categorical('eval_metric', ['Accuracy']),
            'iterations': trial.suggest_int('iterations', 1000, 1000),  # start with a large value
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.01),
            'depth': trial.suggest_int('depth', 4, 10),
            'random_strength': trial.suggest_int('random_strength', 0, 100),
            'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
            'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
            # number of iterations to continue after the best metric value is reached
            'od_wait': trial.suggest_int('od_wait', 100, 100),
            'random_state': trial.suggest_int('random_state', 71, 71),
        }

    # --------------------------------------------
    # Evaluation with stratified k-fold CV
    # --------------------------------------------
    y_pred_train = np.zeros((len(y_train),))

    # split the training set into train/validation folds and evaluate with k-fold CV
    kf = StratifiedKFold(n_splits=args.n_splits_gs, shuffle=True, random_state=args.seed)
    k = 0
    for fold_id, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
        # fix the random seed
        np.random.seed(args.seed + k)
        random.seed(args.seed + k)

        # --------------------
        # Split the dataset
        # --------------------
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]

        # --------------------
        # Define the model
        # --------------------
        if args.classifier == "logistic":
            model = SklearnClassifier(
                LogisticRegression(penalty='l2', solver="sag", random_state=args.seed))
        elif args.classifier == "knn":
            model = SklearnClassifier(
                KNeighborsClassifier(n_neighbors=3, p=2, metric='minkowski'))
        elif args.classifier == "svm":
            model = SklearnClassifier(SVC(kernel='rbf', gamma=0.1, C=10.0))
        elif args.classifier == "random_forest":
            model = SklearnClassifier(
                RandomForestClassifier(criterion="gini", bootstrap=True,
                                       oob_score=True, n_estimators=1000,
                                       n_jobs=-1, random_state=args.seed))
        elif args.classifier == "bagging":
            model = SklearnClassifier(
                BaggingClassifier(
                    DecisionTreeClassifier(criterion='entropy', max_depth=None,
                                           random_state=args.seed)))
        elif args.classifier == "adaboost":
            model = SklearnClassifier(
                AdaBoostClassifier(
                    DecisionTreeClassifier(criterion='entropy', max_depth=None,
                                           random_state=args.seed)))
        elif args.classifier == "xgboost":
            model = XGBoostClassifier(model=xgb.XGBClassifier(
                                          booster='gbtree',
                                          objective='binary:logistic',
                                          eval_metric='logloss',
                                          learning_rate=0.01),
                                      train_type=args.train_type,
                                      use_valid=True,
                                      debug=args.debug)
        elif args.classifier == "lightgbm":
            model = LightGBMClassifier(model=lgb.LGBMClassifier(
                                           objective='binary',
                                           metric='binary_logloss'),
                                       train_type=args.train_type,
                                       use_valid=True,
                                       debug=args.debug)
        elif args.classifier == "catboost":
            model = CatBoostClassifier(
                model=catboost.CatBoostClassifier(loss_function="Logloss"),
                use_valid=True,
                debug=args.debug)

        # Set the tuning parameters on the model
        model.set_params(**params)

        # --------------------
        # Train the model
        # --------------------
        model.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold)

        # --------------------
        # Run inference
        # --------------------
        y_pred_train[valid_index] = model.predict(X_valid_fold)
        k += 1

    accuracy = (y_train == y_pred_train).sum() / len(y_pred_train)
    return accuracy
train3 = data[(data['date'] >= 20170929) & (data['date'] < 20171011)]
train4 = data[(data['date'] >= 20171011) & (data['date'] < 20171023)]
train5 = data[(data['date'] >= 20171023) & (data['date'] <= 20171105)]

gbm1 = lgb.LGBMClassifier(max_depth=7, n_estimators=110, min_child_samples=100)
gbm2 = lgb.LGBMClassifier(max_depth=7, n_estimators=110, min_child_samples=100)
gbm3 = lgb.LGBMClassifier(max_depth=7, n_estimators=110, min_child_samples=100)
gbm4 = lgb.LGBMClassifier(max_depth=7, n_estimators=110, min_child_samples=100)
gbm5 = lgb.LGBMClassifier(max_depth=7, n_estimators=110, min_child_samples=100)
gbm1.fit(train1[feat_name], train1['label'])
gbm2.fit(train2[feat_name], train2['label'])
gbm3.fit(train3[feat_name], train3['label'])
gbm4.fit(train4[feat_name], train4['label'])
gbm5.fit(train5[feat_name], train5['label'])

cb1 = cat.CatBoostClassifier(iterations=110, learning_rate=0.1, depth=7)
cb2 = cat.CatBoostClassifier(iterations=110, learning_rate=0.1, depth=7)
cb3 = cat.CatBoostClassifier(iterations=110, learning_rate=0.1, depth=7)
cb4 = cat.CatBoostClassifier(iterations=110, learning_rate=0.1, depth=7)
cb5 = cat.CatBoostClassifier(iterations=110, learning_rate=0.1, depth=7)
cb1.fit(train1[feat_name], train1['label'], verbose=20)
cb2.fit(train2[feat_name], train2['label'], verbose=20)
cb3.fit(train3[feat_name], train3['label'], verbose=20)
cb4.fit(train4[feat_name], train4['label'], verbose=20)
cb5.fit(train5[feat_name], train5['label'], verbose=20)

xg1 = xgb.XGBClassifier(max_depth=7, n_estimators=110, silent=False)
xg2 = xgb.XGBClassifier(max_depth=7, n_estimators=110, silent=False)
xg3 = xgb.XGBClassifier(max_depth=7, n_estimators=110, silent=False)
xg4 = xgb.XGBClassifier(max_depth=7, n_estimators=110, silent=False)
xg5 = xgb.XGBClassifier(max_depth=7, n_estimators=110, silent=False)
    return result


# In[ ]:


# In[26]:

df_res = pd.DataFrame()

# In[27]:

import catboost as cat

clf_cbt = cat.CatBoostClassifier(iterations=2500,
                                 learning_rate=0.01,
                                 depth=6,
                                 verbose=True,
                                 thread_count=12,
                                 colsample_bylevel=0.8,
                                 l2_leaf_reg=1,
                                 random_seed=1024)
df_res['result_1'] = cbt_model(clf_cbt, model_train_s_1, online_train, feature_list)
df_res['result_2'] = cbt_model(clf_cbt, model_train_s_2, online_train, feature_list)
df_res['result_3'] = cbt_model(clf_cbt, model_train_s_3, online_train, feature_list)
df_res['result_4'] = cbt_model(clf_cbt, model_train_s_4, online_train, feature_list)
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for fold, (train_idx, valid_idx) in enumerate(skf.split(all_df, all_df[TARGET])):
    print(f"===== FOLD {fold} =====")
    oof_idx = np.array([idx for idx in valid_idx if idx < train_df.shape[0]])
    preds_idx = np.array([idx for idx in valid_idx if idx >= train_df.shape[0]])

    X_train, y_train = (all_df.iloc[train_idx].drop(TARGET, axis=1),
                        all_df.iloc[train_idx][TARGET])
    X_valid, y_valid = (all_df.iloc[oof_idx].drop(TARGET, axis=1),
                        all_df.iloc[oof_idx][TARGET])
    X_test = all_df.iloc[preds_idx].drop(TARGET, axis=1)

    model = ctb.CatBoostClassifier(**params)
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              use_best_model=True,
              early_stopping_rounds=EARLY_STOPPING_ROUNDS,
              verbose=VERBOSE)

    fi_tmp = pd.DataFrame()
    fi_tmp["feature"] = X_test.columns.to_list()
    fi_tmp["importance"] = model.get_feature_importance()
    fi_tmp["fold"] = fold
    fi_tmp["seed"] = SEED
    feature_importances = feature_importances.append(fi_tmp)

    ctb_oof[oof_idx] = model.predict(X_valid)
if args.debug:
    print("X_train_fold.shape : ", X_train_fold.shape)
    print("X_valid_fold.shape : ", X_valid_fold.shape)
    print("y_train_fold.shape : ", y_train_fold.shape)
    print("y_valid_fold.shape : ", y_valid_fold.shape)

# --------------------
# Define the models
# --------------------
models = []
for c, classifier in enumerate(args.classifiers):
    if classifier == "svm":
        model = SklearnImageClassifier(SVC(kernel='rbf', gamma=0.1, C=10.0))
    elif classifier == "catboost":
        if args.device == "gpu":
            # iterations = trees / (epochs * batches)
            model = CatBoostImageClassifier(
                model=catboost.CatBoostClassifier(loss_function="MultiClass",
                                                  iterations=1000,
                                                  task_type="GPU",
                                                  devices='0:1'),
                use_valid=True,
                debug=args.debug)
        else:
            model = CatBoostImageClassifier(
                model=catboost.CatBoostClassifier(loss_function="MultiClass",
                                                  iterations=1000),
                use_valid=True,
                debug=args.debug)
    elif classifier == "mlp":
        model = KerasMLPImageClassifier(
            n_input_dim=X_train_fold.shape[1] * X_train_fold.shape[2] * X_train_fold.shape[3],
            n_classes=args.n_classes,
            n_epoches=args.n_epoches,
            batch_size=args.batch_size,
            lr=args.lr,
            beta1=args.beta1,
            beta2=args.beta2,
            use_valid=True,
            one_hot_encode=True,
            callbacks=None,
            use_datagen=False,
            datagen=datagen,
            debug=args.debug)
    elif classifier == "resnet50":
        model = KerasResNet50ImageClassifier(
            image_height=args.image_height,
            image_width=args.image_width,
            n_channles=3,
            n_classes=args.n_classes,
            n_epoches=args.n_epoches,
            batch_size=args.batch_size,
            lr=args.lr,
            beta1=args.beta1,
            beta2=args.beta2,
            pretrained=False,
            train_only_fc=False,
            use_valid=True,
            one_hot_encode=True,
            callbacks=None,
            use_datagen=True,
            datagen=datagen,
            debug=args.debug)
def Catboost_crossvalidated_model(max_depths, n_estimators, colsample_bytrees,
                                  Xtrain, Ytrain, nfold,
                                  feature_selection=0, nthread=8):
    '''Returns a cross-validated, hyperparameter-tuned model for the training data.

    Arguments:
        max_depths: options for maximum depth, e.g. [6, 10, 13]; the best
            max_depth among them is chosen
        n_estimators: options for the number of estimators, e.g. [200, 150, 100]
        colsample_bytrees: e.g. [0.4, 0.8]
        nfold: number of folds for cross-validation
        Xtrain, Ytrain: training features and labels
        feature_selection: 0 means feature selection is disabled, 1 otherwise.
            If 1, a second output is returned consisting of the selected features

    Output:
        model: trained model with good hyperparameters
        features: coordinates of the selected features, if feature_selection = 1
        bp: dictionary of tuned parameters

    This procedure is CPU intensive, so it is advisable not to provide too many
    choices of hyperparameters.
    '''
    classifiers = {}
    model = catboost.CatBoostClassifier(thread_count=nthread, learning_rate=0.02,
                                        iterations=100, depth=6, subsample=0.8,
                                        random_seed=11)
    # model = xgb.XGBClassifier(nthread=nthread, learning_rate=0.02, n_estimators=100,
    #                           max_depth=6, min_child_weight=1, gamma=0, subsample=0.8,
    #                           colsample_bytree=0.8, objective='binary:logistic',
    #                           scale_pos_weight=1, seed=11)
    model.fit(Xtrain, Ytrain)
    m, n = Xtrain.shape

    bp = {'max_depth': [0], 'n_estimator': [0]}
    classifiers['model'] = catboost.CatBoostClassifier(thread_count=nthread,
                                                       learning_rate=0.02,
                                                       iterations=100, depth=6,
                                                       subsample=0.8,
                                                       random_seed=11)
    classifiers['train_X'] = Xtrain
    classifiers['train_y'] = Ytrain

    # Tune the depth
    maxi = 0
    pos = 0
    for r in max_depths:
        classifiers['model'] = catboost.CatBoostClassifier(thread_count=nthread,
                                                           learning_rate=0.02,
                                                           iterations=100, depth=r,
                                                           subsample=0.8,
                                                           random_seed=11)
        score = cross_validate(classifiers, nfold)
        if maxi < score:
            maxi = score
            pos = r
    bp['max_depth'] = pos
    # print(pos)

    # Tune the number of iterations, using the best depth found above
    maxi = 0
    pos = 0
    for r in n_estimators:
        classifiers['model'] = catboost.CatBoostClassifier(thread_count=nthread,
                                                           learning_rate=0.02,
                                                           iterations=r,
                                                           depth=bp['max_depth'],
                                                           subsample=0.8,
                                                           random_seed=11)
        score = cross_validate(classifiers, nfold)
        if maxi < score:
            maxi = score
            pos = r
    bp['n_estimator'] = pos
    # print(pos)

    # Retrain on the full training data with the tuned parameters
    model = catboost.CatBoostClassifier(thread_count=nthread, learning_rate=0.02,
                                        iterations=bp['n_estimator'],
                                        depth=bp['max_depth'], subsample=0.8,
                                        random_seed=11).fit(Xtrain, Ytrain)
    return model, bp
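# A hypothetical usage sketch of the function above; Xtrain and Ytrain are
# assumed to exist already, and the candidate lists mirror the docstring
# examples:
model, bp = Catboost_crossvalidated_model(max_depths=[6, 10, 13],
                                          n_estimators=[100, 150, 200],
                                          colsample_bytrees=[0.4, 0.8],
                                          Xtrain=Xtrain, Ytrain=Ytrain,
                                          nfold=5)
print(bp)  # e.g. {'max_depth': 10, 'n_estimator': 150}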
verbose_eval = 30
num_rounds = 800
folds = 3

kf = KFold(n_splits=folds, shuffle=True, random_state=seed + 1)
y_preds3 = np.zeros(X_test.shape[0])
feature_importance_df = pd.DataFrame()
i = 0
for tr_idx, val_idx in kf.split(X_train, y_train):
    X_tr = X_train[features].iloc[tr_idx, :].fillna(-1)
    y_tr = y_train.iloc[tr_idx]
    model = cb.CatBoostClassifier(iterations=num_rounds,
                                  depth=14,
                                  learning_rate=0.04,
                                  loss_function='Logloss',
                                  eval_metric='Logloss',
                                  task_type="GPU")
    if debug:
        model.fit(X_tr, y_tr, cat_features=cate, verbose_eval=30)
    else:
        model.fit(X_tr, y_tr, cat_features=cate, verbose_eval=30)
    del X_tr
    y_preds3 += model.predict_proba(X_test[features].fillna(-1))[:, 1] / folds
    if debug:
        print(
            "debug:",
            roc_auc_score(
                y_test,
def CatBoost_CV():
    # Get csv data
    data = pd.read_csv(cf.base_dir + cf.prepared_data_real_comb)
    X = data.drop(['0'], axis=1)
    y = data[['0']].values.ravel()

    # Feature scaling
    StdScaler = StandardScaler()
    X_scaled = StdScaler.fit_transform(X)

    # Split the dataset into training and test sets
    X_Train, x_test, Y_Train, y_test = train_test_split(X_scaled, y,
                                                        test_size=0.5,
                                                        random_state=0)

    # Number of boosting iterations
    iterations = [int(x) for x in np.linspace(start=200, stop=2000, num=5)]
    # Learning rate
    learning_rate = [x for x in np.linspace(start=0.01, stop=0.1, num=5)]
    # Maximum tree depth
    depth = [int(x) for x in np.linspace(6, 10, num=4)]
    # L2 regularization coefficient
    l2_leaf_reg = [x for x in np.linspace(start=0.01, stop=0.1, num=5)]
    # per_float_feature_quantization = ['0:border_count=1024', '1:border_count=1024']
    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    random_grid = {
        'iterations': iterations,
        'learning_rate': learning_rate,
        'depth': depth,
        'l2_leaf_reg': l2_leaf_reg
    }
    # 'per_float_feature_quantization': per_float_feature_quantization}

    clf = cb.CatBoostClassifier(random_state=0, border_count=255, task_type='GPU')

    # import multiprocessing
    # cores = multiprocessing.cpu_count() - 1
    rf_random = RandomizedSearchCV(estimator=clf,
                                   param_distributions=random_grid,
                                   n_iter=5, cv=3, verbose=10,
                                   random_state=42, n_jobs=1)

    # Fit the random search model
    start_time = time.time()  # time counter
    print("Started at ",
          datetime.utcfromtimestamp(int(time.time())).strftime('%Y-%m-%d %H:%M:%S'))
    rf_random.fit(X_Train, Y_Train)

    best_p = rf_random.best_params_
    best_r = rf_random.best_score_
    print(best_p, best_r)
train_pred3 = (model.predict_proba(X))[:, 1]
train_pred3.shape

# In[356]:

params = {'depth': [2, 4, 7, 10],
          'learning_rate': [0.03, 0.1, 0.15],
          'l2_leaf_reg': [1, 4, 9],
          'iterations': [300],
          'loss_function': ['Logloss']}
# If the parameter grid is large, the search takes a very long time. Judging by
# the KS statistic of the model below, the overfitting is clearly severe, so
# manually tuning the parameters is more practical in real applications.
cb = ct.CatBoostClassifier(eval_metric="AUC", random_seed=741)
cb_model = GridSearchCV(cb, params, scoring="roc_auc", cv=3)
cb_model.fit(X, Y)
# clf = ct.CatBoostClassifier(eval_metric="AUC", depth=10, iterations=300,
#                             l2_leaf_reg=9, learning_rate=0.15)
# clf.fit(X, Y)

# In[382]:

cat_features_index = 2
# the larger the learning rate, the stronger the overfitting
clf = ct.CatBoostClassifier(eval_metric="AUC", depth=8, iterations=300,
                            l2_leaf_reg=10, learning_rate=0.008)
clf.fit(X, Y)
preds_class = clf.predict(P_test)
preds_probs = clf.predict_proba(P_test)
print('class = ', preds_class)
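# Given the overfitting noted above, a common alternative to a large grid is to
# hold out an eval set and let CatBoost stop itself. A minimal sketch; X and Y
# are the same frames used above, and the split itself is illustrative:
from sklearn.model_selection import train_test_split

X_tr, X_va, y_tr, y_va = train_test_split(X, Y, test_size=0.2, random_state=741)
clf_es = ct.CatBoostClassifier(eval_metric="AUC", learning_rate=0.03,
                               iterations=2000, random_seed=741)
clf_es.fit(X_tr, y_tr, eval_set=(X_va, y_va),
           early_stopping_rounds=100, use_best_model=True, verbose=100)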
def model_cbt(features, test_features, encoding='ohe', n_folds=5):
    # Extract the IDs
    train_ids = features['cust_no']
    test_ids = test_features['cust_no']

    # Extract the training labels
    labels = features['label']

    # Drop the ID and target columns
    features = features.drop(columns=['cust_no', 'label'])
    test_features = test_features.drop(columns=['cust_no'])

    # One-hot encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        features, test_features = features.align(test_features, join='inner', axis=1)
        cat_indices = 'auto'
    # Integer label encoding
    elif encoding == 'le':
        label_encoder = LabelEncoder()
        cat_indices = []
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                features[col] = label_encoder.fit_transform(
                    np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(
                    np.array(test_features[col].astype(str)).reshape((-1,)))
                cat_indices.append(i)
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    features = features[important_features]
    test_features = test_features[important_features]
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    feature_names = list(features.columns)
    features = np.array(features)
    test_features = np.array(test_features)
    features[np.isnan(features)] = -1000000
    features[np.where(features >= np.finfo(np.float64).max)] = -1000000
    test_features[np.isnan(test_features)] = -1000000
    test_features[np.where(test_features >= np.finfo(np.float64).max)] = -1000000

    k_fold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=50)
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros((test_features.shape[0], 3))
    out_of_fold = np.zeros((features.shape[0], 3))
    # print(out_of_fold)
    valid_scores = []
    train_scores = []

    for train_indices, valid_indices in k_fold.split(features, labels):
        train_features, train_labels = features[train_indices], labels[train_indices]
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # shift labels from {-1, 0, 1} to {0, 1, 2}
        train_labels += 1
        valid_labels += 1
        train_features, train_labels = SMOTE().fit_resample(train_features,
                                                            train_labels)

        # Build the model
        model = cbt.CatBoostClassifier(random_seed=2020,
                                       iterations=1500,
                                       learning_rate=0.1,
                                       max_depth=11,
                                       l2_leaf_reg=1,
                                       verbose=1,
                                       early_stopping_rounds=20,
                                       task_type='CPU',
                                       eval_metric='Kappa')

        # Train the model
        # eval_metric=lambda y_true, y_pred: [custom_kappa_eval(y_true, y_pred)]
        model.fit(train_features, train_labels,
                  eval_set=[(valid_features, valid_labels)],
                  early_stopping_rounds=20,
                  verbose=True)

        # Feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits

        # Make predictions
        test_predictions += model.predict_proba(test_features)[:, :] / k_fold.n_splits
        out_of_fold[valid_indices] = model.predict_proba(valid_features)[:, :]

        valid_score = accuracy_score(valid_labels,
                                     np.argmax(out_of_fold[valid_indices], axis=1))
        train_score = accuracy_score(
            train_labels,
            np.argmax(model.predict_proba(train_features)[:, :], axis=1))
        valid_scores.append(valid_score)
        train_scores.append(train_score)

        gc.enable()
        del model, train_features, valid_features
        gc.collect()

    y_pred = np.argmax(test_predictions, axis=1) - 1
    test_predictions = pd.DataFrame({
        'cust_no': test_ids,
        '-1': test_predictions[:, 0],
        '0': test_predictions[:, 1],
        '1': test_predictions[:, 2]
    })
    print(y_pred)
    submission = pd.DataFrame({'cust_no': test_ids, 'label': y_pred})
    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance_values
    })

    fold_names = list(range(n_folds))
    fold_names.append('overall')
    valid_auc = accuracy_score(labels + 1, np.argmax(out_of_fold, axis=1))
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    metrics = pd.DataFrame({
        'fold': fold_names,
        'train': train_scores,
        'valid': valid_scores
    })

    return submission, feature_importances, metrics, test_predictions
def Model_Search_Catboost_cv(X, y, cat_features=None, model='binary', folds=5,
                             sklearn_metric=None, catboost_metric=None,
                             step_wise_start_at=0, final_learning_rate=0.01,
                             use_optuna=False, direction='minimize',
                             n_trials=20, load_study_from=None,
                             save_study_as=None, n_jobs=4):
    # model
    if isinstance(model, str):
        if model == 'binary':
            model = catboost.CatBoostClassifier(loss_function='Logloss',
                                                thread_count=n_jobs)
        elif model == 'multiclass':
            model = catboost.CatBoostClassifier(loss_function='MultiClass',
                                                thread_count=n_jobs)
        elif model == 'regression':
            model = catboost.CatBoostRegressor(loss_function='RMSE',
                                               thread_count=n_jobs)
        else:
            sys.exit('Error: Unknown model type.')

    # sklearn_metric
    if sklearn_metric is None:
        # https://scikit-learn.org/stable/modules/model_evaluation.html
        if isinstance(model, catboost.CatBoostClassifier):
            sklearn_metric = 'neg_log_loss'
        elif isinstance(model, catboost.CatBoostRegressor):
            sklearn_metric = 'neg_root_mean_squared_error'
        else:
            sys.exit('Error: Sklearn score metric needs to be provided.')

    # catboost_metric
    if catboost_metric is not None:
        # https://catboost.ai/docs/concepts/loss-functions.html
        model.set_params(loss_function=catboost_metric)

    # folds: stratify only for classifiers
    if isinstance(folds, int):
        if isinstance(model, catboost.CatBoostClassifier):
            folds = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
        else:
            folds = KFold(n_splits=folds, shuffle=True, random_state=42)

    # ------------------------------------------------------------------------------------------------
    # Set fixed params
    fixed_params = {
        "verbose": False,
        "random_state": 42,
        "iterations": 10000,
    }
    model.set_params(**fixed_params)

    # Set dataset for .cv
    d_train = catboost.Pool(X, label=y, cat_features=cat_features)

    # ------------------------------------------------------------------------------------------------
    if use_optuna:
        print("Searching for a Catboost model with optuna \n")
        params = model.get_params()

        if load_study_from is not None:
            study = joblib.load(load_study_from)
        else:
            study = optuna.create_study(
                direction=direction,
                pruner=optuna.pruners.SuccessiveHalvingPruner())

        def objective(trial):
            params.update({
                "boosting_type": trial.suggest_categorical("boosting_type", ['Ordered', 'Plain']),
                "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.1),
                "max_depth": trial.suggest_int("max_depth", 4, 12),
                "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-4, 1e4),
                "border_count": trial.suggest_int('border_count', 1, 255),
                "random_strength": trial.suggest_loguniform("random_strength", 1e-4, 1e4),
                "bagging_temperature": trial.suggest_loguniform("bagging_temperature", 1e-4, 1e4),
            })
            cv_results = catboost.cv(params=params, pool=d_train,
                                     iterations=10000,
                                     early_stopping_rounds=50, folds=folds,
                                     verbose_eval=None, as_pandas=False)
            rmetric_name = list(cv_results.keys())[1]
            score = cv_results[rmetric_name][-1]  # np.min(cv_results[rmetric_name])
            print("Num_boost_round: " + str(len(cv_results[rmetric_name])))
            if save_study_as is not None:
                joblib.dump(study, save_study_as)
            return score

        study.optimize(objective, n_trials=n_trials, n_jobs=1)
        print("------------------------------------------------------------------------")
        print("Best parameters found: " + str(study.best_params))
        print("Best score achieved: " + str(study.best_value))
        print("------------------------------------------------------------------------")
        model.set_params(**study.best_params)

        # num_boost_round optimization
        cv_results = catboost.cv(params=model.get_params(), pool=d_train,
                                 iterations=10000, early_stopping_rounds=50,
                                 folds=folds, verbose_eval=None, as_pandas=False)
        rmetric_name = list(cv_results.keys())[1]
        best_boost_round = len(cv_results[rmetric_name])
        best_score_achieved = cv_results[rmetric_name][-1]
        print("Best num_boost_round: " + str(best_boost_round))
        print("Best score achieved: " + str(best_score_achieved))
        print("------------------------------------------------------------------------")
        model.set_params(iterations=best_boost_round)

    else:
        print("Searching for a Catboost model with the step wise method \n")

        if step_wise_start_at <= 0:
            # num_boost_round optimization
            cv_results = catboost.cv(params=model.get_params(), pool=d_train,
                                     iterations=10000, early_stopping_rounds=50,
                                     folds=folds, verbose_eval=None,
                                     as_pandas=False)
            rmetric_name = list(cv_results.keys())[1]
            best_boost_round = len(cv_results[rmetric_name])
            best_score_achieved = cv_results[rmetric_name][-1]
            print("------------------------------------------------------------------------")
            print("Best num_boost_round: " + str(best_boost_round))
            print("Best score achieved: " + str(best_score_achieved))
            print("------------------------------------------------------------------------")
            model.set_params(iterations=best_boost_round)

        # Param search; each step carries the best estimator forward so that
        # parameters found earlier are kept for the later searches
        if step_wise_start_at <= 1:
            param_test = {'max_depth': range(2, 11, 1)}
            search = GridSearchCV(estimator=model, param_grid=param_test,
                                  scoring=sklearn_metric, n_jobs=n_jobs,
                                  iid=False, cv=folds, verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print("------------------------------------------------------------------------")
            model = search.best_estimator_

        if step_wise_start_at <= 2:
            param_test = {
                'l2_leaf_reg': [0, 0.0001, 0.001, 0.004, 0.007, 0.01, 0.04,
                                0.07, 1, 2, 3, 4, 7, 10]
            }
            search = GridSearchCV(estimator=model, param_grid=param_test,
                                  scoring=sklearn_metric, n_jobs=n_jobs,
                                  iid=False, cv=folds, verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print("------------------------------------------------------------------------")
            model = search.best_estimator_

        if step_wise_start_at <= 3:
            param_test = {
                'random_strength': [0, 0.001, 0.01, 0.04, 0.07, 0.1, 0.4,
                                    0.7, 1, 4, 7, 10]
            }
            search = GridSearchCV(estimator=model, param_grid=param_test,
                                  scoring=sklearn_metric, n_jobs=n_jobs,
                                  iid=False, cv=folds, verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print("------------------------------------------------------------------------")
            model = search.best_estimator_

        if step_wise_start_at <= 4:
            param_test = {
                'bagging_temperature': [0, 0.001, 0.01, 0.04, 0.07, 0.1, 0.4,
                                        0.7, 1, 4, 7, 10, 30, 60, 90, 120]
            }
            search = GridSearchCV(estimator=model, param_grid=param_test,
                                  scoring=sklearn_metric, n_jobs=n_jobs,
                                  iid=False, cv=folds, verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print("------------------------------------------------------------------------")
            model = search.best_estimator_

        if step_wise_start_at <= 5:
            param_test = {
                'border_count': [32, 5, 10, 20, 50, 100, 150, 200, 255]
            }
            search = GridSearchCV(estimator=model, param_grid=param_test,
                                  scoring=sklearn_metric, n_jobs=n_jobs,
                                  iid=False, cv=folds, verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print("------------------------------------------------------------------------")
            model = search.best_estimator_

        if step_wise_start_at <= 6:
            param_test = {
                'ctr_border_count': [50, 5, 10, 20, 100, 150, 200, 255]
            }
            search = GridSearchCV(estimator=model, param_grid=param_test,
                                  scoring=sklearn_metric, n_jobs=n_jobs,
                                  iid=False, cv=folds, verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print("------------------------------------------------------------------------")
            model = search.best_estimator_

        # Set final learning rate
        model.set_params(learning_rate=final_learning_rate)

        # num_boost_round optimization
        cv_results = catboost.cv(params=model.get_params(), pool=d_train,
                                 iterations=10000, early_stopping_rounds=50,
                                 folds=folds, verbose_eval=None, as_pandas=False)
        rmetric_name = list(cv_results.keys())[1]
        best_boost_round = len(cv_results[rmetric_name])
        best_score_achieved = cv_results[rmetric_name][-1]
        print("Best num_boost_round: " + str(best_boost_round))
        print("Best score achieved: " + str(best_score_achieved))
        print("------------------------------------------------------------------------")
        model.set_params(iterations=best_boost_round)

    return model
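# A hypothetical call of the search function above, using the step-wise path;
# X, y, and cat_features are assumed to exist, and all other arguments are the
# defaults documented in the signature. The returned model is configured but
# not yet fitted, so it is refit on the full data afterwards:
model = Model_Search_Catboost_cv(X, y, cat_features=cat_features,
                                 model='binary', folds=5,
                                 step_wise_start_at=0,
                                 final_learning_rate=0.01)
model.fit(X, y, cat_features=cat_features)  # refit tuned model on all data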
                              n_estimators=999999,
                              learning_rate=0.02,
                              colsample_bytree=0.3,
                              num_leaves=2,
                              metric='auc',
                              objective='binary',
                              n_jobs=-1)
modelxgb = xgb.XGBClassifier(max_depth=2,
                             n_estimators=999999,
                             colsample_bytree=0.3,
                             learning_rate=0.02,
                             objective='binary:logistic',
                             n_jobs=-1)
modelcb = cb.CatBoostClassifier(iterations=999999,
                                max_depth=2,
                                learning_rate=0.02,
                                colsample_bylevel=0.03,
                                objective="Logloss")

train_path = '../input/train.csv'
test_path = '../input/test.csv'
lgb_path = './lgb_models_stack/'
xgb_path = './xgb_models_stack/'
cb_path = './cb_models_stack/'

# Create dirs for the models
for filename in [lgb_path, xgb_path, cb_path]:
    if not os.path.exists(filename):
        os.mkdir(filename)

print('Load Train Data.')
train_x = pd.read_csv(train_path)
import time
import os

(red_data_training, red_data_test, red_quality_training, red_quality_test,
 white_data_training, white_data_test, white_quality_training,
 white_quality_test) = wines_import.read_data(False)

if os.path.exists("./catboost modele i wyniki/CPU WHITE MultiClass"):
    os.remove("./catboost modele i wyniki/CPU WHITE MultiClass")
if os.path.exists("./catboost modele i wyniki/GPU WHITE MultiClass"):
    os.remove("./catboost modele i wyniki/GPU WHITE MultiClass")

messages_file = open(
    "./catboost modele i wyniki/models WHITE MultiClass verification",
    mode="w+")

model_white = catboost.CatBoostClassifier(task_type="CPU",
                                          random_seed=42,
                                          objective="MultiClass",
                                          iterations=2000)
time_before = time.time()
model_white.fit(white_data_training, white_quality_training,
                eval_set=catboost.Pool(white_data_test, white_quality_test,
                                       has_header=True))
time_after = time.time()
model_white.save_model(
    "./catboost modele i wyniki/CPU WHITE MultiClass verification")
messages_file.write("Training on CPU for the white wines took:\n")
messages_file.write(str(time_after - time_before))

model_white_GPU = catboost.CatBoostClassifier(task_type="GPU",
                                              random_seed=42,
                                              objective="MultiClass",
                                          booster='gbtree',
                                          objective='binary:logistic',
                                          eval_metric='logloss',
                                          learning_rate=0.01),
                                      train_type=args.train_type,
                                      use_valid=True,
                                      debug=args.debug)
        elif args.classifier == "lightgbm":
            model = LightGBMClassifier(model=lgb.LGBMClassifier(
                                           objective='binary',
                                           metric='binary_logloss'),
                                       train_type=args.train_type,
                                       use_valid=True,
                                       debug=args.debug)
        elif args.classifier == "catboost":
            model = CatBoostClassifier(
                model=catboost.CatBoostClassifier(loss_function="Logloss"),
                use_valid=True,
                debug=args.debug)

        # Set the best parameters on the model
        model.set_params(**study.best_params)

        # --------------------
        # Train the model
        # --------------------
        model.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold)

        # --------------------
        # Run inference
        # --------------------
        y_pred_train[valid_index] = model.predict(X_valid_fold)
def train_catboost(self):
    # self.ct = bcolz.open(datadir, mode='r')
    # ct = self.ct
    time_start = datetime.datetime.now()
    print("catboost training start", datetime.datetime.now(),
          datetime.datetime.now() - time_start)
    time_start = datetime.datetime.now()

    y_train = pickle.load(open("train-label-2-0.pickle", "rb"))
    y_test1 = pickle.load(open("train-label-4-1.pickle", "rb"))
    y_test2 = pickle.load(open("train-label-4-3.pickle", "rb"))
    gc.collect()
    print("labels loaded", datetime.datetime.now(),
          datetime.datetime.now() - time_start)
    time_start = datetime.datetime.now()

    X_train = pickle.load(open("train-features-2-0.pickle", "rb"))
    X_test1 = pickle.load(open("train-features-4-1.pickle", "rb"))
    X_test2 = pickle.load(open("train-features-4-3.pickle", "rb"))
    gc.collect()
    print("CSR loaded", datetime.datetime.now(),
          datetime.datetime.now() - time_start)
    time_start = datetime.datetime.now()

    print("start learning ", datetime.datetime.now(),
          datetime.datetime.now() - time_start)
    time_start = datetime.datetime.now()

    self.ml_params = {
        "loss_function": 'Logloss',  # RMSE, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq
        "eval_metric": "Logloss",
        "random_strength": 90,
        "boosting_type": "Plain",
        "bootstrap_type": "Bernoulli",
        "od_type": 'Iter',
        "od_wait": 800,
        "depth": 5,
        "learning_rate": 0.2,
        # "learning_rate": 0.1,
        # "iterations": 45,
        "iterations": 10,
    }

    time_start = datetime.datetime.now()
    print("before import catboost", datetime.datetime.now(),
          datetime.datetime.now() - time_start)
    time_start = datetime.datetime.now()
    import catboost
    print("after import catboost", datetime.datetime.now(),
          datetime.datetime.now() - time_start)
    time_start = datetime.datetime.now()

    def Pool(X, y):
        cols = self.num_feature_names + self.cat_feature_names
        cb = catboost.Pool(X, label=y, feature_names=cols, cat_features=[])
        return cb

    # cbtrain = get_pool(0, self.rows_num - 1000)
    # try:
    #     pickle.dump(cbtrain, open('cbtrain-pool.pickle', 'wb'), protocol=4)
    # except Exception as ex:
    #     print(ex)
    # cbtest = get_pool(self.rows_num - 1000, self.rows_num)

    # model = catboost.CatBoostRegressor(**self.ml_params)
    model = catboost.CatBoostClassifier(**self.ml_params)
    self.model = model
    # eval_set_ = [cbtest]
    model.fit(Pool(X_train, y_train),
              eval_set=[
                  Pool(X_train, y_train),
                  Pool(X_test1, y_test1),
                  Pool(X_test2, y_test2)
              ],
              use_best_model=True,
              verbose=True)
    print("end learning ", datetime.datetime.now(),
          datetime.datetime.now() - time_start)
    time_start = datetime.datetime.now()

    pickle.dump(model, open('model-catboost.pickle', 'wb'), protocol=4)
    gc.collect()
feature_name = ['Parameter{0}'.format(i) for i in range(5, 11)]
tr_index = ~data['label'].isnull()
X_train = data[tr_index][feature_name].reset_index(drop=True)
y = data[tr_index]['label'].reset_index(drop=True).astype(int)
X_test = data[~tr_index][feature_name].reset_index(drop=True)
print(X_train.shape, X_test.shape)

oof = np.zeros((X_train.shape[0], 4))
prediction = np.zeros((X_test.shape[0], 4))

# cbt_model = cbt.CatBoostClassifier(iterations=800, verbose=300, learning_rate=0.01,
#                                    task_type='GPU',
#                                    loss_function='MultiClass')
cbt_model = cbt.CatBoostClassifier(iterations=1000,
                                   verbose=300,
                                   task_type='GPU',
                                   loss_function='MultiClass',
                                   random_state=303)
cbt_model.fit(X_train, y, eval_set=(X_train, y))
# oof must cover the training rows so the metrics below line up with y
oof = cbt_model.predict_proba(X_train)
prediction = cbt_model.predict_proba(X_test)
gc.collect()  # garbage collection

print('logloss', log_loss(pd.get_dummies(y).values, oof))
print('ac', accuracy_score(y, np.argmax(oof, axis=1)))
print('mae', 1 / (1 + np.sum(np.absolute(np.eye(4)[y] - oof)) / 480))
print('-' * 80)

sub = test[['Group']]
prob_cols = [i for i in submit.columns if i not in ['Group']]
for i, f in enumerate(prob_cols):
    sub[f] = prediction[:, i]