Example #1
import catboost
import joblib
import numpy as np
import pandas as pd
import xam  # provides xam.stacking.StackingClassifier
from sklearn import ensemble, linear_model, model_selection

encoder = joblib.load('models/encoder.pkl')

# Load data
X_train = encoder.transform(pd.read_csv('data/X_train.csv'))
y_train = pd.read_csv('data/y_train.csv')['Survived']

# Determine categorical features (the integer columns produced by the encoder)
cat_features = np.where(X_train.dtypes == int)[0]

# Create a validation set with 20% of the training set
X_fit, X_val, y_fit, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.2)

# Initialize stage 0 models
models = {
    'rf': ensemble.RandomForestClassifier(random_state=1),
    'catboost': catboost.CatBoostClassifier()
}

# Initialize stack
stack = xam.stacking.StackingClassifier(
    models=models,
    meta_model=linear_model.LogisticRegression(),
    cv=model_selection.StratifiedKFold(n_splits=10),
    use_base_features=True,
    use_proba=True
)

stack.fit(
    X=X_train,
    y=y_train,
    fit_params={
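# The fit call above is cut off in the source. A hedged sketch of a complete
# call, assuming xam's StackingClassifier forwards fit_params entries (keyed
# by base-model name) to the corresponding model's fit method -- the exact
# keys below are illustrative, not taken from the original:
stack.fit(
    X=X_fit,
    y=y_fit,
    fit_params={'catboost': {'cat_features': cat_features, 'verbose': False}}
)
val_acc = (stack.predict(X_val) == y_val).mean()  # accuracy on the 20% holdout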
Example #2
    return X, y


def read_yaml(path):
    with open(path, "r") as f:
        return yaml.safe_load(f)


MODEL_PARAMS = {"allow_writing_files": False, "iterations": 10}


@pytest.fixture(
    scope="module",
    params=[
        cb.CatBoost(MODEL_PARAMS),
        cb.CatBoostClassifier(**MODEL_PARAMS),
        cb.CatBoostRegressor(**MODEL_PARAMS),
    ],
    ids=["CatBoost", "CatBoostClassifier", "CatBoostRegressor"],
)
def cb_model(request):
    model = request.param
    X, y = get_iris()
    return ModelWithData(model=model.fit(X, y), inference_dataframe=X)


@pytest.fixture
def reg_model():
    model = cb.CatBoostRegressor(**MODEL_PARAMS)
    X, y = get_iris()
    return ModelWithData(model=model.fit(X, y), inference_dataframe=X)
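# A minimal sketch (not in the original excerpt) of a test that consumes the
# parametrized cb_model fixture defined above:
def test_cb_model_predicts(cb_model):
    preds = cb_model.model.predict(cb_model.inference_dataframe)
    assert len(preds) == len(cb_model.inference_dataframe)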
Example #3
from collections import Counter

import numpy as np
import pandas as pd
import catboost as cbt
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

tr_index = ~all_data['label'].isnull()
X_train = all_data[tr_index][list(set(feature_name))].reset_index(drop=True)
y = all_data[tr_index]['label'].reset_index(drop=True).astype(int)
X_test = all_data[~tr_index][list(set(feature_name))].reset_index(drop=True)
print(X_train.shape,X_test.shape)
random_seed = 2019
final_pred = []
cv_score = []
cv_model = []
skf = StratifiedKFold(n_splits=5, random_state=random_seed, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
    print(index)
    train_x, test_x, train_y, test_y = X_train[feature_name].iloc[train_index], X_train[feature_name].iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    cbt_model = cbt.CatBoostClassifier(iterations=3000, learning_rate=0.1, max_depth=7,
                                       l2_leaf_reg=10, verbose=10, early_stopping_rounds=100,
                                       task_type='GPU', eval_metric='F1')  # task_type selects GPU training ('devices' expects device ids)
    cbt_model.fit(train_x[feature_name], train_y, eval_set=(test_x[feature_name], test_y))
    cv_model.append(cbt_model)
    y_test = cbt_model.predict(X_test[feature_name])
    y_val = cbt_model.predict_proba(test_x[feature_name])
    print(Counter(np.argmax(y_val,axis=1)))
    cv_score.append(f1_score(test_y,np.round(y_val[:,1])))

# CatBoost works well when there are many categorical features

# GPU results, 5-fold CV
# Fold 1
# bestTest = 0.9358583857
# bestIteration = 1437

# Fold 2
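# A typical way to summarize the folds (an assumption; the excerpt is cut off
# before any aggregation):
print('mean CV F1:', np.mean(cv_score))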
Example #4
                   model1.predict_proba(X.iloc[val_idx, :])[:, 1]))
    preds[:, i] = model1.predict_proba(public)[:, 1]

# ================= CatBoost ===================== #

cat_feature_inds = []
for i, c in enumerate(X.columns.values):
    num_uniques = len(X[c].unique())
    if num_uniques < 3:
        cat_feature_inds.append(i)

preds = np.zeros((len(public), 5))
for i, (train_idx, val_idx) in enumerate(kf.split(X)):
    cat_model = catboost.CatBoostClassifier(iterations=600,
                                            learning_rate=0.03,
                                            depth=6,
                                            l2_leaf_reg=1,
                                            eval_metric='Logloss',
                                            random_seed=4 * 100 + 6)

    cat_model.fit(X.iloc[train_idx, :],
                  y[train_idx],
                  cat_features=cat_feature_inds)
    print("Train Log loss is %.4f" %
          log_loss(y[train_idx],
                   cat_model.predict_proba(X.iloc[train_idx, :])[:, 1]))
    print("Validation Log loss is %.4f" %
          log_loss(y[val_idx],
                   cat_model.predict_proba(X.iloc[val_idx, :])[:, 1]))
    preds[:, i] = cat_model.predict_proba(public)[:, 1]
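# A common follow-up (an assumption, not shown in this excerpt): average the
# per-fold probabilities into a single prediction vector
final_pred = preds.mean(axis=1)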

# lightgbm
Example #5
def main():
    # Load cli params
    parser = argparse.ArgumentParser(prog="grid")
    parser.add_argument("-d", "--data", dest="data", required=True)
    parser.add_argument("-m",
                        "--model",
                        dest="model",
                        required=True,
                        choices=["xgboost", "lightgbm", "catboost"])
    parser.add_argument("-t", "--tag", dest="tag", required=True)
    parser.add_argument("-n", "--number", dest="number", type=int, default=-1)

    args = parser.parse_args()

    # Load config
    cf = c.Config(args.data, args.model, args.tag)
    grid_params = cf.load_config_file("grid")
    hyper_params = cf.load_config_file("hyper")
    fit_params = {}

    # Load data
    features_to_load = list(cf.features.keys())
    source_file = os.path.join(c.DATA_FOLDER, args.data)
    data = pd.read_csv(source_file,
                       usecols=features_to_load,
                       sep=";",
                       decimal=".",
                       encoding="latin1",
                       keep_default_na=False,
                       na_values=[""])

    # Preprocessing
    group_categoricals_tail(data, cf.classes["categorical"])

    if args.model == "xgboost":
        import xgboost as xgb
        estimator = xgb.XGBClassifier()
        data = pd.get_dummies(data, columns=cf.classes["categorical"]).copy()
    elif args.model == "lightgbm":
        import lightgbm as lgb
        from sklearn.preprocessing import LabelEncoder
        estimator = lgb.LGBMClassifier()
        label_encoding = {}
        for col in cf.classes["categorical"]:
            unique_values = data[col].unique().tolist()
            label_encoding[col] = LabelEncoder()
            label_encoding[col].fit(sorted(unique_values))
            data[col] = label_encoding[col].transform(data[col].values)
        fit_params = {"categorical_feature": cf.classes["categorical"]}
    elif args.model == "catboost":
        import catboost as cb
        estimator = cb.CatBoostClassifier()
    else:  # unreachable: argparse's choices already restrict --model
        raise ValueError(
            "Something went wrong: {} is not a feasible model.".format(
                args.model))

    features = [
        x for x in sorted(data.columns.tolist())
        if x not in cf.classes["label"] + cf.classes["index"]
    ]
    if args.model == "lightgbm":
        fit_params["feature_name"] = features
    elif args.model == "catboost":
        fit_params["cat_features"] = [
            i for i, f in enumerate(features) if f in cf.classes["categorical"]
        ]

    # Sampling (only when a sample size is given and it fits within the data)
    if args.number != -1 and args.number <= data.shape[0]:
        data = data.sample(args.number)
    d = data.loc[:, features].values

    # Prepare to save
    output = "_".join([args.model, args.tag, "grid"]) + ".xlsx"
    excel_writer = pd.ExcelWriter(os.path.join(c.RESULTS_FOLDER, output),
                                  engine="xlsxwriter")

    # Grid
    results = {}

    for label in cf.classes["label"]:
        print("----------------------------", end="\n")
        print(label, end="\n")
        print("----------------------------", end="\n\n")

        y = data.loc[:, label].values

        rand_grid_cv = RandomizedSearchCV(estimator,
                                          param_distributions=hyper_params,
                                          **grid_params)
        rand_grid_cv.fit(d, y, **fit_params)

        results[label] = rand_grid_cv.best_params_

        results_df = pd.DataFrame(rand_grid_cv.cv_results_)
        results_df.to_excel(excel_writer, sheet_name=label, index=False)

    # Save
    excel_writer.close()  # pd.ExcelWriter.save() was removed in recent pandas
    file_name = "best"
    cf.save_config_file(file_name, results)
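# Standard script entry point (an assumption; the excerpt ends at main()):
if __name__ == "__main__":
    main()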
Example #6
import catboost
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Define hyperparameters
params = {
    "depth": [1, 3, 7],
    "iterations": [100],
    "learning_rate": [0.01, 0.1, 0.2],
    "l2_leaf_reg": [1, 5, 10],
}

# Step 1: set up target metrics for evaluating training

# Define the target F1 score to aim for
target_f1 = 0.7

# Step 2: instantiate classifier and run grid search to find the best parameters
clf = catboost.CatBoostClassifier()
grid_clf = GridSearchCV(clf, params, scoring="neg_log_loss")
grid_clf.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_clf.best_estimator_
clf.fit(X_train, y_train)

# Step 3: Evaluate the quality of the trained model
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

# print classification report of classifier
print(classification_report(y_test, y_pred, target_names=class_names))

# evaluate the quality of the trained model using weighted f1 score
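# The excerpt stops mid-step; a hedged completion using sklearn's f1_score,
# checked against the target defined in step 1:
from sklearn.metrics import f1_score
weighted_f1 = f1_score(y_test, y_pred, average="weighted")
print("weighted F1: %.3f (target: %.2f)" % (weighted_f1, target_f1))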
Example #7
from sklearn import metrics
import catboost
import multiprocessing

import numpy as np
import pandas as pd


d_train = pd.read_csv("train-1m.csv")
d_test = pd.read_csv("test.csv")

X_train = d_train.drop(['dep_delayed_15min'], axis=1)
X_test = d_test.drop(['dep_delayed_15min'], axis=1)
y_train = np.where(d_train["dep_delayed_15min"]=="Y",1,0)           
y_test = np.where(d_test["dep_delayed_15min"]=="Y",1,0)           

cat_cols = np.where(X_train.dtypes == object)[0]  # np.object was removed in NumPy 1.24


md = catboost.CatBoostClassifier(iterations=100, depth=10, learning_rate=0.1,
                                 task_type="GPU")
                                 # thread_count=multiprocessing.cpu_count())
%time md.fit(X_train, y_train, cat_features=cat_cols)  # IPython %time magic


y_pred = md.predict_proba(X_test)[:,1]
metrics.roc_auc_score(y_test, y_pred)

Example #8
X_test = data[~tr_index][list(set(feature_name))].reset_index(drop=True)
print(X_train.shape, X_test.shape)
oof = np.zeros(X_train.shape[0])
prediction = np.zeros(X_test.shape[0])
seeds = [19970412, 2019 * 2 + 1024, 4096, 2048, 1024]
num_model_seed = 1
for model_seed in range(num_model_seed):
    oof_cat = np.zeros(X_train.shape[0])
    prediction_cat = np.zeros(X_test.shape[0])
    skf = StratifiedKFold(n_splits=5, random_state=seeds[model_seed], shuffle=True)
    for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
        print(index)
        train_x, test_x, train_y, test_y = X_train[feature_name].iloc[train_index], X_train[feature_name].iloc[
            test_index], y.iloc[train_index], y.iloc[test_index]
        cbt_model = cbt.CatBoostClassifier(iterations=7000, learning_rate=0.1, max_depth=7, verbose=100,
                                           early_stopping_rounds=500, eval_metric='F1', task_type='GPU',
                                           cat_features=cat_list)
        cbt_model.fit(train_x[feature_name], train_y, eval_set=(test_x[feature_name], test_y))
        gc.collect()
        oof_cat[test_index] += cbt_model.predict_proba(test_x)[:, 1]
        prediction_cat += cbt_model.predict_proba(X_test[feature_name])[:, 1] / 5
    print('F1', f1_score(y, np.round(oof_cat)))
    oof += oof_cat / num_model_seed
    prediction += prediction_cat / num_model_seed
print('score', f1_score(y, np.round(oof)))
# write to csv
submit = test[['sid']].copy()
submit['label'] = (prediction >= 0.499).astype(int)
print(submit['label'].value_counts())
submit.to_csv("round2_A_submission.csv", index=False)
Example #9
num_model_seed = 5
for model_seed in range(num_model_seed):
    print(model_seed + 1)
    oof_cat = np.zeros((X_train.shape[0], 4))
    prediction_cat = np.zeros((X_test.shape[0], 4))
    skf = StratifiedKFold(n_splits=5,
                          random_state=seeds[model_seed],
                          shuffle=True)
    for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
        print(index)
        train_x, test_x, train_y, test_y = X_train.iloc[train_index], X_train.iloc[
            test_index], y.iloc[train_index], y.iloc[test_index]
        gc.collect()
        cbt_model = cbt.CatBoostClassifier(iterations=800,
                                           learning_rate=0.01,
                                           verbose=300,
                                           early_stopping_rounds=200,
                                           loss_function='MultiClass')
        # early stopping monitors the held-out fold
        cbt_model.fit(train_x, train_y, eval_set=(test_x, test_y))
        oof_cat[test_index] += cbt_model.predict_proba(test_x)
        prediction_cat += cbt_model.predict_proba(X_test) / 5
        gc.collect()
    oof += oof_cat / num_model_seed
    prediction += prediction_cat / num_model_seed
    print('logloss', log_loss(pd.get_dummies(y).values, oof_cat))
    print('ac', accuracy_score(y, np.argmax(oof_cat, axis=1)))
    print('mae', 1 / (1 + np.sum(np.absolute(np.eye(4)[y] - oof_cat)) / 480))
print('logloss', log_loss(pd.get_dummies(y).values, oof))
print('ac', accuracy_score(y, np.argmax(oof, axis=1)))
print('mae', 1 / (1 + np.sum(np.absolute(np.eye(4)[y] - oof)) / 480))
sub = test[['Group']]
Example #10
    def test_check_mask_params(self, check_consistency_model_label,
                               check_consistency_model_features,
                               check_preprocessing_options, check_explainer,
                               check_model):
        """
        Unit test check mask params
        """
        train = pd.DataFrame({
            'Onehot1': ['A', 'B', 'A', 'B'],
            'Onehot2': ['C', 'D', 'C', 'D'],
            'Binary1': ['E', 'F', 'E', 'F'],
            'Binary2': ['G', 'H', 'G', 'H'],
            'Ordinal1': ['I', 'J', 'I', 'J'],
            'Ordinal2': ['K', 'L', 'K', 'L'],
            'BaseN1': ['M', 'N', 'M', 'N'],
            'BaseN2': ['O', 'P', 'O', 'P'],
            'Target1': ['Q', 'R', 'Q', 'R'],
            'Target2': ['S', 'T', 'S', 'T'],
            'other': ['other', np.nan, 'other', 'other']
        })

        features_dict = None
        columns_dict = {
            i: features
            for i, features in enumerate(train.columns)
        }
        features_types = {
            features: str(train[features].dtypes)
            for features in train.columns
        }
        label_dict = None

        enc_ordinal = ce.OrdinalEncoder(cols=[
            'Onehot1', 'Onehot2', 'Binary1', 'Binary2', 'Ordinal1', 'Ordinal2',
            'BaseN1', 'BaseN2', 'Target1', 'Target2', 'other'
        ]).fit(train)
        train_ordinal = enc_ordinal.transform(train)

        y = pd.DataFrame({'y_class': [0, 0, 0, 1]})

        model = cb.CatBoostClassifier(n_estimators=1).fit(train_ordinal, y)
        clf_explainer = shap.TreeExplainer(model)

        check_preprocessing_options.return_value = True
        check_consistency_model_features.return_value = True
        check_consistency_model_label.return_value = True
        check_explainer.return_value = clf_explainer
        check_model.return_value = "classification", [0, 1]

        wrong_mask_params_1 = list()
        wrong_mask_params_2 = None
        wrong_mask_params_3 = {
            "features_to_hide": None,
            "threshold": None,
            "positive": None
        }
        right_mask_params = {
            "features_to_hide": None,
            "threshold": None,
            "positive": True,
            "max_contrib": 5
        }
        # each invalid mask_params must raise a ValueError on its own
        for wrong_mask_params in [wrong_mask_params_1,
                                  wrong_mask_params_2,
                                  wrong_mask_params_3]:
            with self.assertRaises(ValueError):
                predictor_1 = SmartPredictor(features_dict,
                                             model,
                                             columns_dict,
                                             clf_explainer,
                                             features_types,
                                             label_dict,
                                             mask_params=wrong_mask_params)

        predictor_1 = SmartPredictor(features_dict,
                                     model,
                                     columns_dict,
                                     clf_explainer,
                                     features_types,
                                     label_dict,
                                     mask_params=right_mask_params)
Example #11
    def test_summarize_2(self):
        """
        Unit test 2 summarize method
        """
        predictor_1 = self.predictor_3
        predictor_1._case = "classification"
        predictor_1._classes = [0, 1]
        clf = cb.CatBoostClassifier(n_estimators=1).fit(
            self.df_3[['x1', 'x2']], self.df_3['y'])
        clf_explainer = shap.TreeExplainer(clf)
        predictor_1.model = clf
        predictor_1.explainer = clf_explainer

        with self.assertRaises(ValueError):
            predictor_1.summarize()

        predictor_1.data = {
            "x": None,
            "x_preprocessed": None,
            "x_postprocessed": None,
            "ypred": None,
            "contributions": None
        }

        predictor_1.data["x"] = self.df_3[["x1", "x2"]]
        predictor_1.data["x_preprocessed"] = self.df_3[["x1", "x2"]]
        predictor_1.data["x_postprocessed"] = self.df_3[["x1", "x2"]]
        predictor_1.data["ypred"] = pd.DataFrame({
            "y": ["Yes", "Yes", "No", "No", "No"],
            "proba": [0.519221, 0.468791, 0.531209, 0.531209, 0.531209]
        })

        predictor_1.data["contributions"] = pd.DataFrame({
            "x1": [0, 0, -0, -0, -0],
            "x2": [0.161538, -0.0403846, 0.0403846, 0.0403846, 0.0403846]
        })
        output = predictor_1.summarize()

        expected_output = pd.DataFrame(
            {
                "y": ["Yes", "Yes", "No", "No", "No"],
                "proba": [0.519221, 0.468791, 0.531209, 0.531209, 0.531209],
                "feature_1":
                ["weight", "weight", "weight", "weight", "weight"],
                "value_1": ["90", "78", "84", "85", "53"],
                "contribution_1": [
                    "0.161538", "-0.0403846", "0.0403846", "0.0403846",
                    "0.0403846"
                ],
                "feature_2": ["age", "age", "age", "age", "age"],
                "value_2": ["25", "39", "50", "43", "67"],
                "contribution_2": ["0", "0", "0", "0", "0"]
            },
            dtype=object)
        expected_output["proba"] = expected_output["proba"].astype(float)

        feature_expected = [
            column for column in expected_output.columns
            if column.startswith("feature_")
        ]
        feature_output = [
            column for column in output.columns
            if column.startswith("feature_")
        ]

        value_expected = [
            column for column in expected_output.columns
            if column.startswith("value_")
        ]
        value_output = [
            column for column in output.columns if column.startswith("value_")
        ]

        contribution_expected = [
            column for column in expected_output.columns
            if column.startswith("contribution_")
        ]
        contribution_output = [
            column for column in output.columns
            if column.startswith("contribution_")
        ]

        assert expected_output.shape == output.shape
        assert len(feature_expected) == len(feature_output)
        assert len(value_expected) == len(value_output)
        assert len(contribution_expected) == len(contribution_output)
        assert all(output.columns == expected_output.columns)
Example #12
    def setUp(self):
        df = pd.DataFrame(range(0, 5), columns=['id'])
        df['y'] = df['id'].apply(lambda x: 1 if x < 2 else 0)
        df['x1'] = np.random.randint(1, 123, df.shape[0])
        df['x2'] = ["S", "M", "S", "D", "M"]
        df = df.set_index('id')
        encoder = ce.OrdinalEncoder(cols=["x2"], handle_unknown="None")
        encoder_fitted = encoder.fit(df)
        df_encoded = encoder_fitted.transform(df)
        clf = cb.CatBoostClassifier(n_estimators=1).fit(
            df_encoded[['x1', 'x2']], df_encoded['y'])
        clf_explainer = shap.TreeExplainer(clf)

        columns_dict = {0: "x1", 1: "x2"}
        label_dict = {0: "Yes", 1: "No"}

        postprocessing = {
            "x2": {
                "type": "transcoding",
                "rule": {
                    "S": "single",
                    "M": "married",
                    "D": "divorced"
                }
            }
        }
        features_dict = {"x1": "age", "x2": "family_situation"}

        features_types = {
            features: str(df[features].dtypes)
            for features in df[['x1', 'x2']]
        }

        self.df_1 = df
        self.preprocessing_1 = encoder_fitted
        self.df_encoded_1 = df_encoded
        self.clf_1 = clf
        self.clf_explainer_1 = clf_explainer
        self.columns_dict_1 = columns_dict
        self.label_dict_1 = label_dict
        self.postprocessing_1 = postprocessing
        self.features_dict_1 = features_dict
        self.features_types_1 = features_types

        self.predictor_1 = SmartPredictor(features_dict, clf, columns_dict,
                                          clf_explainer, features_types,
                                          label_dict, encoder_fitted,
                                          postprocessing)

        df['x2'] = np.random.randint(1, 100, df.shape[0])
        encoder = ce.OrdinalEncoder(cols=["x2"], handle_unknown="None")
        encoder_fitted = encoder.fit(df[["x1", "x2"]])
        df_encoded = encoder_fitted.transform(df[["x1", "x2"]])

        clf = cb.CatBoostClassifier(n_estimators=1).fit(
            df[['x1', 'x2']], df['y'])
        clf_explainer = shap.TreeExplainer(clf)
        features_dict = {"x1": "age", "x2": "weight"}
        features_types = {
            features: str(df[features].dtypes)
            for features in df[["x1", "x2"]].columns
        }

        self.df_2 = df
        self.preprocessing_2 = encoder_fitted
        self.df_encoded_2 = df_encoded
        self.clf_2 = clf
        self.clf_explainer_2 = clf_explainer
        self.columns_dict_2 = columns_dict
        self.label_dict_2 = label_dict
        self.postprocessing_2 = postprocessing
        self.features_dict_2 = features_dict
        self.features_types_2 = features_types

        self.predictor_2 = SmartPredictor(features_dict, clf, columns_dict,
                                          clf_explainer, features_types,
                                          label_dict, encoder_fitted,
                                          postprocessing)

        df['x1'] = [25, 39, 50, 43, 67]
        df['x2'] = [90, 78, 84, 85, 53]

        columns_dict = {0: "x1", 1: "x2"}
        label_dict = {0: "No", 1: "Yes"}
        features_dict = {"x1": "age", "x2": "weight"}

        features_types = {
            features: str(df[features].dtypes)
            for features in df[['x1', 'x2']].columns
        }

        clf = cb.CatBoostRegressor(n_estimators=1).fit(df[['x1', 'x2']],
                                                       df['y'])
        clf_explainer = shap.TreeExplainer(clf)

        self.df_3 = df
        self.preprocessing_3 = None
        self.df_encoded_3 = df
        self.clf_3 = clf
        self.clf_explainer_3 = clf_explainer
        self.columns_dict_3 = columns_dict
        self.label_dict_3 = label_dict
        self.postprocessing_3 = None
        self.features_dict_3 = features_dict
        self.features_types_3 = features_types

        self.predictor_3 = SmartPredictor(features_dict, clf, columns_dict,
                                          clf_explainer, features_types,
                                          label_dict)
Example #13
    def test_check_preprocessing_1(self, check_consistency_model_label,
                                   check_consistency_model_features,
                                   check_preprocessing_options,
                                   check_explainer, check_model):
        """
        Test check preprocessing on multiple preprocessing
        """
        train = pd.DataFrame({
            'Onehot1': ['A', 'B', 'A', 'B'],
            'Onehot2': ['C', 'D', 'C', 'D'],
            'Binary1': ['E', 'F', 'E', 'F'],
            'Binary2': ['G', 'H', 'G', 'H'],
            'Ordinal1': ['I', 'J', 'I', 'J'],
            'Ordinal2': ['K', 'L', 'K', 'L'],
            'BaseN1': ['M', 'N', 'M', 'N'],
            'BaseN2': ['O', 'P', 'O', 'P'],
            'Target1': ['Q', 'R', 'Q', 'R'],
            'Target2': ['S', 'T', 'S', 'T'],
            'other': ['other', np.nan, 'other', 'other']
        })

        features_dict = None
        columns_dict = {
            i: features
            for i, features in enumerate(train.columns)
        }
        features_types = {
            features: str(train[features].dtypes)
            for features in train.columns
        }
        label_dict = None

        enc_ordinal_all = ce.OrdinalEncoder(cols=[
            'Onehot1', 'Onehot2', 'Binary1', 'Binary2', 'Ordinal1', 'Ordinal2',
            'BaseN1', 'BaseN2', 'Target1', 'Target2', 'other'
        ]).fit(train)
        train_ordinal_all = enc_ordinal_all.transform(train)

        y = pd.DataFrame({'y_class': [0, 0, 0, 1]})

        model = cb.CatBoostClassifier(n_estimators=1).fit(train_ordinal_all, y)
        clf_explainer = shap.TreeExplainer(model)

        check_preprocessing_options.return_value = True
        check_consistency_model_features.return_value = True
        check_consistency_model_label.return_value = True
        check_explainer.return_value = clf_explainer
        check_model.return_value = "classification", [0, 1]

        predictor_1 = SmartPredictor(features_dict, model, columns_dict,
                                     clf_explainer, features_types, label_dict)

        y = pd.DataFrame(data=[0, 1, 0, 0], columns=['y'])

        enc_onehot = ce.OneHotEncoder(cols=['Onehot1', 'Onehot2']).fit(train)
        train_onehot = enc_onehot.transform(train)
        enc_binary = ce.BinaryEncoder(
            cols=['Binary1', 'Binary2']).fit(train_onehot)
        train_binary = enc_binary.transform(train_onehot)
        enc_ordinal = ce.OrdinalEncoder(
            cols=['Ordinal1', 'Ordinal2']).fit(train_binary)
        train_ordinal = enc_ordinal.transform(train_binary)
        enc_basen = ce.BaseNEncoder(
            cols=['BaseN1', 'BaseN2']).fit(train_ordinal)
        train_basen = enc_basen.transform(train_ordinal)
        enc_target = ce.TargetEncoder(cols=['Target1', 'Target2']).fit(
            train_basen, y)

        input_dict1 = dict()
        input_dict1['col'] = 'Onehot2'
        input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan],
                                           index=['C', 'D', 'missing'])
        input_dict1['data_type'] = 'object'

        input_dict2 = dict()
        input_dict2['col'] = 'Binary2'
        input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan],
                                           index=['G', 'H', 'missing'])
        input_dict2['data_type'] = 'object'

        input_dict = dict()
        input_dict['col'] = 'state'
        input_dict['mapping'] = pd.Series(data=['US', 'FR-1', 'FR-2'],
                                          index=['US', 'FR', 'FR'])
        input_dict['data_type'] = 'object'

        input_dict3 = dict()
        input_dict3['col'] = 'Ordinal2'
        input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan],
                                           index=['K', 'L', 'missing'])
        input_dict3['data_type'] = 'object'
        list_dict = [input_dict2, input_dict3]

        y = pd.DataFrame(data=[0, 1], columns=['y'])

        train = pd.DataFrame({
            'city': ['chicago', 'paris'],
            'state': ['US', 'FR'],
            'other': ['A', 'B']
        })
        enc = ColumnTransformer(transformers=[('onehot', skp.OneHotEncoder(),
                                               ['city', 'state'])],
                                remainder='drop')
        enc.fit(train, y)

        wrong_prepro = skp.OneHotEncoder().fit(train, y)

        predictor_1.preprocessing = [
            enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target,
            input_dict1, list_dict
        ]
        predictor_1.check_preprocessing()

        for preprocessing in [
                enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target
        ]:
            predictor_1.preprocessing = preprocessing
            predictor_1.check_preprocessing()

        predictor_1.preprocessing = input_dict2
        predictor_1.check_preprocessing()

        predictor_1.preprocessing = enc
        predictor_1.check_preprocessing()

        predictor_1.preprocessing = None
        predictor_1.check_preprocessing()

        with self.assertRaises(Exception):
            predictor_1.preprocessing = wrong_prepro
            predictor_1.check_preprocessing()
Example #14
from scoring import calculate_scores

import catboost as cb  # assumed import; the excerpt uses `cb` below

VAL_SPLIT = 0.2
data = get_data(val_split=VAL_SPLIT, apply_label_encoding=True, fillna=True)
X_train, X_val, X_test, y_train, y_val, categorical_features = (
    data["X_train"],
    data["X_val"],
    data["X_test"],
    data["y_train"],
    data["y_val"],
    data["categorical_features"],
)
clf = cb.CatBoostClassifier(
    n_estimators=200,
    learning_rate=0.05,
    metric_period=500,
    od_wait=500,
    task_type="CPU",
    depth=8,
)

print("Fitting a catboost model...")
clf.fit(X_train, y_train, cat_features=categorical_features)

_ = calculate_scores(clf, X_val, y_val)

for scoring in SCORING_LIST:
    print("Optimizing catboost params for", scoring, "with random search...")
    best_estimator = perform_random_search(
        estimator=clf,
        X_train=X_train,
        X_val=X_val,
Example #15
    def objective(trial):
        #--------------------------------------------
        # Hyperparameters tuned via Bayesian optimization
        #--------------------------------------------
        if (args.classifier == "logistic"):
            params = {
                'penalty': trial.suggest_categorical('penalty', ['l2']),
                "solver": trial.suggest_categorical("solver", ['sag']),
                'C': trial.suggest_discrete_uniform('C', 0.01, 100.0,
                                                    0.1),  # uniform distribution
                'random_state': trial.suggest_int("random_state", 71, 71),
                'n_jobs': trial.suggest_int("n_jobs", -1, -1),
            }
        elif (args.classifier == "knn"):
            params = {
                "metric": trial.suggest_categorical("metric", ['minkowski']),
                "p": trial.suggest_int("p", 1, 2),
                'n_neighbors': trial.suggest_int("n_neighbors", 1, 50),
                'n_jobs': trial.suggest_int("n_jobs", -1, -1),
            }
        elif (args.classifier == "svm"):
            params = {
                "kernel": trial.suggest_categorical("kernel", ['rbf']),
                'C': trial.suggest_loguniform('C', 0.1, 1000.0),
                'gamma': trial.suggest_loguniform('gamma', 1e-8, 10.0),
                'random_state': trial.suggest_int("random_state", 71, 71),
            }
        elif (args.classifier == "random_forest"):
            params = {
                "n_estimators":
                trial.suggest_int("n_estimators", 1000, 1000),  # fixed during tuning
                "criterion":
                trial.suggest_categorical(
                    "criterion", ['gini', "entropy"]),  # impurity criterion
                'max_features':
                trial.suggest_categorical('max_features', [
                    'auto',
                    0.2,
                    0.4,
                    0.6,
                    0.8,
                ]),
                'min_samples_split':
                trial.suggest_int(
                    'min_samples_split', 2, 10
                ),  # must be an integer greater than 1 or a float in (0.0, 1.0]
                'min_samples_leaf':
                trial.suggest_int('min_samples_leaf', 1, 10),
                "bootstrap":
                trial.suggest_int(
                    "bootstrap", True,
                    True),  # whether to build the trees on bootstrap samples (default: True)
                "oob_score":
                trial.suggest_int(
                    "oob_score", False, True
                ),  # whether to use out-of-bag samples to estimate the generalization accuracy (default: False)
                'random_state':
                trial.suggest_int("random_state", 71, 71),
                'n_jobs':
                trial.suggest_int("n_jobs", -1, -1),
            }
        elif (args.classifier == "bagging"):
            params = {
                "n_estimators":
                trial.suggest_int("n_estimators", 1000, 1000),  # チューニングは固定
                'max_samples':
                trial.suggest_float(
                    'max_samples', 0.0,
                    1.0),  # base_estimator に設定した弱識別器の内, 使用するサンプルの割合
                'max_features':
                trial.suggest_float(
                    'max_features', 0.0, 1.0
                ),  # The number of features to draw from X to train each base estimator.
                "bootstrap":
                trial.suggest_int(
                    "bootstrap", True,
                    True),  # 決定木の構築に、ブートストラップサンプルを使用するか否か(default:True)
                "bootstrap_features":
                trial.suggest_int("bootstrap", False, True),
                'random_state':
                trial.suggest_int("random_state", 71, 71),
                'n_jobs':
                -1,
                # 弱識別器のパラメータ(先頭に "base_estimator__" をつけることでアクセス可能 )
                'base_estimator__max_depth':
                trial.suggest_int("base_estimator__random_state", 1, 8),
                'base_estimator__max_features':
                trial.suggest_float('base_estimator__max_features', 0.0, 1.0),
                'base_estimator__min_samples_leaf':
                trial.suggest_int('base_estimator__min_samples_leaf', 1, 10),
                'base_estimator__min_samples_split':
                trial.suggest_int('base_estimator__min_samples_split', 2, 10),
                'base_estimator__random_state':
                trial.suggest_int("base_estimator__random_state", 71, 71),
            }
        elif (args.classifier == "adaboost"):
            params = {
                "n_estimators":
                trial.suggest_int("n_estimators", 1000, 1000),  # チューニングは固定
                "learning_rate":
                trial.suggest_loguniform("learning_rate", 0.01,
                                         0.01),  # ハイパーパラメーターのチューニング時は固定  
                'random_state':
                71,
                # 弱識別器のパラメータ(先頭に "base_estimator__" をつけることでアクセス可能 )
                'base_estimator__max_depth':
                trial.suggest_int("base_estimator__random_state", 1, 10),
                'base_estimator__max_features':
                trial.suggest_float('base_estimator__max_features', 0.0, 1.0),
                'base_estimator__min_samples_leaf':
                trial.suggest_int('base_estimator__min_samples_leaf', 1, 10),
                'base_estimator__min_samples_split':
                trial.suggest_int('base_estimator__min_samples_split', 2, 10),
                'base_estimator__random_state':
                trial.suggest_int("base_estimator__random_state", 71, 71),
            }
        elif (args.classifier == "xgboost"):
            params = {
                'booster':
                trial.suggest_categorical('booster', ['gbtree']),
                'objective':
                trial.suggest_categorical('objective', ['binary:logistic']),
                "learning_rate":
                trial.suggest_loguniform("learning_rate", 0.01,
                                         0.01),  # ハイパーパラメーターのチューニング時は固定  
                "n_estimators":
                trial.suggest_int("n_estimators", 1000,
                                  1000),  # ハイパーパラメーターのチューニング時は固定
                'max_depth':
                trial.suggest_int("max_depth", 3, 9),  # 3 ~ 9 : 一様分布に従う。1刻み
                'min_child_weight':
                trial.suggest_loguniform('min_child_weight', 0.1,
                                         10.0),  # 0.1 ~ 10.0 : 対数が一様分布に従う
                'subsample':
                trial.suggest_discrete_uniform(
                    'subsample', 0.6, 0.95,
                    0.05),  # 0.6 ~ 0.95 : 一様分布に従う。0.05 刻み
                'colsample_bytree':
                trial.suggest_discrete_uniform(
                    'colsample_bytree', 0.6, 0.95,
                    0.05),  # 0.6 ~ 0.95 : 一様分布に従う。0.05 刻み
                'gamma':
                trial.suggest_loguniform("gamma", 1e-8,
                                         1.0),  # 1e-8 ~ 1.0 : 対数が一様分布に従う
                'alpha':
                trial.suggest_float("alpha", 0.0, 0.0),  # デフォルト値としておく。余裕があれば変更
                'reg_lambda':
                trial.suggest_float("reg_lambda", 1.0,
                                    1.0),  # デフォルト値としておく。余裕があれば変更
                'random_state':
                trial.suggest_int("random_state", 71, 71),
            }
        elif (args.classifier == "lightgbm"):
            params = {
                'boosting_type':
                trial.suggest_categorical('boosting',
                                          ['gbdt', 'dart', 'goss']),
                'objective':
                'binary',
                'metric':
                'binary_logloss',
                'num_class':
                1,
                'num_leaves':
                trial.suggest_int("num_leaves", 10, 500),
                'learning_rate':
                trial.suggest_loguniform("learning_rate", 0.01, 0.01),
                'max_depth':
                trial.suggest_int("max_depth", 1, 5),
                'reg_alpha':
                trial.suggest_uniform("reg_alpha", 0, 100),
                'reg_lambda':
                trial.suggest_uniform("reg_lambda", 1, 5),
                'num_leaves':
                trial.suggest_int("num_leaves", 10, 500),
                'verbose':
                0,
            }
        elif (args.classifier == "catboost"):
            params = {
                'eval_metric':
                trial.suggest_categorical('eval_metric', ['Accuracy']),
                'iterations':
                trial.suggest_int('iterations', 1000, 1000),  # start with a large number
                'learning_rate':
                trial.suggest_loguniform('learning_rate', 0.01, 0.01),
                'depth':
                trial.suggest_int('depth', 4, 10),
                'random_strength':
                trial.suggest_int('random_strength', 0, 100),
                'bagging_temperature':
                trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
                'od_type':
                trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
                'od_wait':
                trial.suggest_int('od_wait', 100,
                                  100),  # iterations to continue after the best metric value is reached
                'random_state':
                trial.suggest_int("random_state", 71, 71),
            }

        #--------------------------------------------
        # Evaluation with stratified k-fold CV
        #--------------------------------------------
        y_pred_train = np.zeros((len(y_train), ))

        # evaluate by splitting the training set into train/validation folds
        # with stratified k-fold cross-validation
        kf = StratifiedKFold(n_splits=args.n_splits_gs,
                             shuffle=True,
                             random_state=args.seed)
        k = 0
        for fold_id, (train_index,
                      valid_index) in enumerate(kf.split(X_train, y_train)):
            # fix the random seed
            np.random.seed(args.seed + k)
            random.seed(args.seed + k)

            #--------------------
            # Split the dataset
            #--------------------
            X_train_fold, X_valid_fold = X_train.iloc[
                train_index], X_train.iloc[valid_index]
            y_train_fold, y_valid_fold = y_train.iloc[
                train_index], y_train.iloc[valid_index]

            #--------------------
            # Define the model
            #--------------------
            if (args.classifier == "logistic"):
                model = SklearnClassifier(
                    LogisticRegression(penalty='l2',
                                       solver="sag",
                                       random_state=args.seed))
            elif (args.classifier == "knn"):
                model = SklearnClassifier(
                    KNeighborsClassifier(n_neighbors=3,
                                         p=2,
                                         metric='minkowski'))
            elif (args.classifier == "svm"):
                model = SklearnClassifier(SVC(kernel='rbf', gamma=0.1, C=10.0))
            elif (args.classifier == "random_forest"):
                model = SklearnClassifier(
                    RandomForestClassifier(criterion="gini",
                                           bootstrap=True,
                                           oob_score=True,
                                           n_estimators=1000,
                                           n_jobs=-1,
                                           random_state=args.seed))
            elif (args.classifier == "bagging"):
                model = SklearnClassifier(
                    BaggingClassifier(
                        DecisionTreeClassifier(criterion='entropy',
                                               max_depth=None,
                                               random_state=args.seed)))
            elif (args.classifier == "adaboost"):
                model = SklearnClassifier(
                    AdaBoostClassifier(
                        DecisionTreeClassifier(criterion='entropy',
                                               max_depth=None,
                                               random_state=args.seed)))
            elif (args.classifier == "xgboost"):
                model = XGBoostClassifier(model=xgb.XGBClassifier(
                    booster='gbtree',
                    objective='binary:logistic',
                    eval_metric='logloss',
                    learning_rate=0.01),
                                          train_type=args.train_type,
                                          use_valid=True,
                                          debug=args.debug)
            elif (args.classifier == "lightgbm"):
                model = LightGBMClassifier(model=lgb.LGBMClassifier(
                    objective='binary', metric='binary_logloss'),
                                           train_type=args.train_type,
                                           use_valid=True,
                                           debug=args.debug)
            elif (args.classifier == "catboost"):
                model = CatBoostClassifier(
                    model=catboost.CatBoostClassifier(loss_function="Logloss"),
                    use_valid=True,
                    debug=args.debug)

            # set the tuned hyperparameters on the model
            model.set_params(**params)

            #--------------------
            # Train the model
            #--------------------
            model.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold)

            #--------------------
            # Predict on the validation fold
            #--------------------
            y_pred_train[valid_index] = model.predict(X_valid_fold)
            k += 1

        accuracy = (y_train == y_pred_train).sum() / len(y_pred_train)
        return accuracy
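    # How such an objective is typically driven (an assumption; the enclosing
    # script is not shown). The trial API used above is Optuna's:
    import optuna
    study = optuna.create_study(direction="maximize")  # maximize CV accuracy
    study.optimize(objective, n_trials=50)
    print(study.best_params)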
Example #16
train3 = data[(data['date']>=20170929)&(data['date']<20171011)]
train4 = data[(data['date']>=20171011)&(data['date']<20171023)]
train5 = data[(data['date']>=20171023)&(data['date']<=20171105)]

gbm1 = lgb.LGBMClassifier(max_depth=7, n_estimators=110, min_child_samples=100)
gbm2 = lgb.LGBMClassifier(max_depth=7, n_estimators=110, min_child_samples=100)
gbm3 = lgb.LGBMClassifier(max_depth=7, n_estimators=110, min_child_samples=100)
gbm4 = lgb.LGBMClassifier(max_depth=7, n_estimators=110, min_child_samples=100)
gbm5 = lgb.LGBMClassifier(max_depth=7, n_estimators=110, min_child_samples=100)
gbm1.fit(train1[feat_name], train1['label'])
gbm2.fit(train2[feat_name], train2['label'])
gbm3.fit(train3[feat_name], train3['label'])
gbm4.fit(train4[feat_name], train4['label'])
gbm5.fit(train5[feat_name], train5['label'])

cb1 = cat.CatBoostClassifier(iterations=110, learning_rate=0.1, depth=7)
cb2 = cat.CatBoostClassifier(iterations=110, learning_rate=0.1, depth=7)
cb3 = cat.CatBoostClassifier(iterations=110, learning_rate=0.1, depth=7)
cb4 = cat.CatBoostClassifier(iterations=110, learning_rate=0.1, depth=7)
cb5 = cat.CatBoostClassifier(iterations=110, learning_rate=0.1, depth=7)
cb1.fit(train1[feat_name], train1['label'], verbose=20)
cb2.fit(train2[feat_name], train2['label'], verbose=20)
cb3.fit(train3[feat_name], train3['label'], verbose=20)
cb4.fit(train4[feat_name], train4['label'], verbose=20)
cb5.fit(train5[feat_name], train5['label'], verbose=20)

xg1 = xgb.XGBClassifier(max_depth=7, n_estimators=110, silent=False)
xg2 = xgb.XGBClassifier(max_depth=7, n_estimators=110, silent=False)
xg3 = xgb.XGBClassifier(max_depth=7, n_estimators=110, silent=False)
xg4 = xgb.XGBClassifier(max_depth=7, n_estimators=110, silent=False)
xg5 = xgb.XGBClassifier(max_depth=7, n_estimators=110, silent=False)
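# A hedged sketch of blending the five per-window CatBoost models (an
# assumption; `test`, `feat_name`, and `np` come from the truncated script):
pred_cat = np.mean([m.predict_proba(test[feat_name])[:, 1]
                    for m in (cb1, cb2, cb3, cb4, cb5)], axis=0)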
Example #17
    return result


df_res = pd.DataFrame()

import catboost as cat
clf_cbt = cat.CatBoostClassifier(iterations=2500,
                                 learning_rate=0.01,
                                 depth=6,
                                 verbose=True,
                                 thread_count=12,
                                 colsample_bylevel=0.8,
                                 l2_leaf_reg=1,
                                 random_seed=1024)

df_res['result_1'] = cbt_model(clf_cbt, model_train_s_1, online_train,
                               feature_list)

df_res['result_2'] = cbt_model(clf_cbt, model_train_s_2, online_train,
                               feature_list)

df_res['result_3'] = cbt_model(clf_cbt, model_train_s_3, online_train,
                               feature_list)

df_res['result_4'] = cbt_model(clf_cbt, model_train_s_4, online_train,
                               feature_list)
Example #18
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (train_idx,
           valid_idx) in enumerate(skf.split(all_df, all_df[TARGET])):
    print(f"===== FOLD {fold} =====")
    oof_idx = np.array([idx for idx in valid_idx if idx < train_df.shape[0]])
    preds_idx = np.array(
        [idx for idx in valid_idx if idx >= train_df.shape[0]])

    X_train, y_train = all_df.iloc[train_idx].drop(
        TARGET, axis=1), all_df.iloc[train_idx][TARGET]
    X_valid, y_valid = all_df.iloc[oof_idx].drop(
        TARGET, axis=1), all_df.iloc[oof_idx][TARGET]
    X_test = all_df.iloc[preds_idx].drop(TARGET, axis=1)

    model = ctb.CatBoostClassifier(**params)
    model.fit(X_train,
              y_train,
              eval_set=[(X_valid, y_valid)],
              use_best_model=True,
              early_stopping_rounds=EARLY_STOPPING_ROUNDS,
              verbose=VERBOSE)

    fi_tmp = pd.DataFrame()
    fi_tmp["feature"] = X_test.columns.to_list()
    fi_tmp["importance"] = model.get_feature_importance()
    fi_tmp["fold"] = fold
    fi_tmp["seed"] = SEED
    feature_importances = pd.concat([feature_importances, fi_tmp])  # DataFrame.append was removed in pandas 2.0

    ctb_oof[oof_idx] = model.predict(X_valid)
Example #19
        if( args.debug ):
            print( "X_train_fold.shape : ", X_train_fold.shape )
            print( "X_valid_fold.shape : ", X_valid_fold.shape )
            print( "y_train_fold.shape : ", y_train_fold.shape )
            print( "y_valid_fold.shape : ", y_valid_fold.shape )

        #--------------------
        # モデルの定義
        #--------------------
        models = []
        for c, classifier in enumerate(args.classifiers):
            if( classifier == "svm" ):
                model = SklearnImageClassifier( SVC( kernel = 'rbf', gamma = 0.1, C = 10.0 ) )
            elif( classifier == "catboost" ):
                if( args.device == "gpu" ): 
                    model = CatBoostImageClassifier( model = catboost.CatBoostClassifier( loss_function="MultiClass", iterations = 1000, task_type="GPU", devices='0:1' ), use_valid = True, debug = args.debug )  # iterations = trees / (epochs * batches)
                else:
                    model = CatBoostImageClassifier( model = catboost.CatBoostClassifier( loss_function="MultiClass", iterations = 1000 ), use_valid = True, debug = args.debug )
            elif( classifier == "mlp" ):
                model = KerasMLPImageClassifier( 
                    n_input_dim = X_train_fold.shape[1] * X_train_fold.shape[2] * X_train_fold.shape[3], n_classes = args.n_classes, 
                    n_epoches = args.n_epoches, batch_size = args.batch_size, lr = args.lr, beta1 = args.beta1, beta2 = args.beta2,
                    use_valid = True, one_hot_encode = True, callbacks = None, use_datagen = False, datagen = datagen, debug = args.debug
                )
            elif( classifier == "resnet50" ):
                model = KerasResNet50ImageClassifier( 
                    image_height = args.image_height, image_width = args.image_width, n_channles = 3, n_classes = args.n_classes, 
                    n_epoches = args.n_epoches, batch_size = args.batch_size, lr = args.lr, beta1 = args.beta1, beta2 = args.beta2,
                    pretrained = False, train_only_fc = False,
                    use_valid = True, one_hot_encode = True, callbacks = None, use_datagen = True, datagen = datagen, debug = args.debug
                )
Example #20
def Catboost_crossvalidated_model(max_depths,
                                  n_estimators,
                                  colsample_bytrees,
                                  Xtrain,
                                  Ytrain,
                                  nfold,
                                  feature_selection=0,
                                  nthread=8):
    '''Function returns a cross-validated, hyperparameter-tuned model for the training data.
    Arguments:
        max_depths: options for maximum depth, e.g. [6, 10, 13]; the best max_depth among these is chosen
        n_estimators: the best number of estimators is chosen from this list, e.g. [200, 150, 100]
        colsample_bytrees: e.g. [0.4, 0.8]
        nfold: number of folds for cross-validation
        Xtrain, Ytrain: training features and labels
        feature_selection: 0 means feature selection is disabled, 1 otherwise. If 1, a second output is returned consisting of the selected features

    Output:
        model: trained model with good hyperparameters
        features: coordinates of the selected features, if feature_selection = 1
        bp: dictionary of tuned parameters

    This procedure is CPU intensive, so it is advisable not to provide too many choices of hyperparameters.
    '''
    classifiers = {}
    model = catboost.CatBoostClassifier(thread_count=nthread,
                                        learning_rate=0.02,
                                        iterations=100,
                                        depth=6,
                                        subsample=0.8,
                                        random_seed=11)
    #model =  xgb.XGBClassifier( nthread=nthread, learning_rate =0.02, n_estimators=100, max_depth=6,min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,objective= 'binary:logistic',scale_pos_weight=1, seed=11)
    model.fit(Xtrain, Ytrain)
    m, n = Xtrain.shape

    bp = {'max_depth': [0], 'n_estimator': [0]}
    classifiers['model'] = catboost.CatBoostClassifier(thread_count=nthread,
                                                       learning_rate=0.02,
                                                       iterations=100,
                                                       depth=6,
                                                       subsample=0.8,
                                                       random_seed=11)
    #classifiers['model'] = xgb.XGBClassifier( nthread = nthread, learning_rate =0.02, n_estimators=100, max_depth=6,min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.9,objective= 'binary:logistic',scale_pos_weight=1, seed=11)
    classifiers['train_X'] = Xtrain
    classifiers['train_y'] = Ytrain
    maxi = 0
    pos = 0
    for r in max_depths:
        classifiers['model'] = catboost.CatBoostClassifier(
            thread_count=nthread,
            learning_rate=0.02,
            iterations=100,
            depth=r,
            subsample=0.8,
            random_seed=11)
        #classifiers['model'] = xgb.XGBClassifier( nthread=nthread,learning_rate =0.02, n_estimators=100, max_depth=r,min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,objective= 'binary:logistic',scale_pos_weight=1, seed=11)
        score = cross_validate(classifiers, nfold)
        if maxi < score:
            maxi = score
            pos = r
    bp['max_depth'] = pos
    #print pos

    maxi = 0
    pos = 0
    for r in n_estimators:
        classifiers['model'] = catboost.CatBoostClassifier(
            thread_count=nthread,
            learning_rate=0.02,
            iterations=r,
            depth=bp['max_depth'],
            subsample=0.8,
            random_seed=11)
        #classifiers['model'] = xgb.XGBClassifier( nthread=nthread,learning_rate =0.02, n_estimators=r, max_depth=bp['max_depth'],min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,objective= 'binary:logistic',scale_pos_weight=1, seed=11)
        score = cross_validate(classifiers, nfold)
        if maxi < score:
            maxi = score
            pos = r

    bp['n_estimator'] = pos
    #print pos
    # retrain the final model with the tuned parameters
    model = catboost.CatBoostClassifier(
        thread_count=nthread,
        learning_rate=0.02,
        iterations=bp['n_estimator'],
        depth=bp['max_depth'],
        subsample=0.8,
        random_seed=11).fit(Xtrain, Ytrain)
    #model = xgb.XGBClassifier( nthread=nthread,learning_rate =0.02, n_estimators=bp['n_estimator'], max_depth=bp['max_depth'],min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=bp['colsample_bytree'],objective= 'binary:logistic',scale_pos_weight=1, seed=11).fit(Xtrain,Ytrain)

    return model, bp
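# A hedged usage sketch with synthetic data (assumes the cross_validate
# helper referenced above is importable in this module):
if __name__ == "__main__":
    import numpy as np
    X_demo = np.random.rand(200, 10)
    y_demo = np.random.randint(0, 2, 200)
    model, bp = Catboost_crossvalidated_model(max_depths=[6, 10],
                                              n_estimators=[100, 150],
                                              colsample_bytrees=[0.8],
                                              Xtrain=X_demo,
                                              Ytrain=y_demo,
                                              nfold=5)
    print(bp)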
Example #21
verbose_eval = 30
num_rounds = 800

folds = 3
kf = KFold(n_splits=folds, shuffle=True, random_state=seed + 1)
y_preds3 = np.zeros(X_test.shape[0])
feature_importance_df = pd.DataFrame()
i = 0
for tr_idx, val_idx in kf.split(X_train, y_train):

    X_tr = X_train[features].iloc[tr_idx, :].fillna(-1)
    y_tr = y_train.iloc[tr_idx]

    model = cb.CatBoostClassifier(iterations=num_rounds,
                                  depth=14,
                                  learning_rate=0.04,
                                  loss_function='Logloss',
                                  eval_metric='Logloss',
                                  task_type="GPU")
    model.fit(X_tr, y_tr, cat_features=cate, verbose_eval=30)

    del X_tr
    y_preds3 += model.predict_proba(X_test[features].fillna(-1))[:, 1] / folds

    if debug:
        print(
            "debug:",
            roc_auc_score(
                y_test,
Example #22
def CatBoost_CV():
    # Get csv data
    data = pd.read_csv(cf.base_dir + cf.prepared_data_real_comb)

    X = data.drop(['0'], axis=1)
    y = data[['0']].values.ravel()

    # Feature Scaling
    StdScaler = StandardScaler()
    X_scaled = StdScaler.fit_transform(X)

    # Splitting the dataset into the Training set and Test set
    X_Train, x_test, Y_Train, y_test = train_test_split(X_scaled,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)

    # Number of boosting iterations (trees)
    iterations = [int(x) for x in np.linspace(start=200, stop=2000, num=5)]
    # Learning rates to try
    learning_rate = [x for x in np.linspace(start=0.01, stop=0.1, num=5)]
    # Maximum tree depth
    depth = [int(x) for x in np.linspace(6, 10, num=4)]
    # L2 regularization coefficients to try
    l2_leaf_reg = [x for x in np.linspace(start=0.01, stop=0.1, num=5)]
    # Optional per-feature quantization (disabled)
    # per_float_feature_quantization = ['0:border_count=1024', '1:border_count=1024']

    random_grid = {
        'iterations': iterations,
        'learning_rate': learning_rate,
        'depth': depth,
        'l2_leaf_reg': l2_leaf_reg
    }
    #'per_float_feature_quantization': per_float_feature_quantization}

    clf = cb.CatBoostClassifier(random_state=0,
                                border_count=255,
                                task_type='GPU')

    # import multiprocessing
    # cores = multiprocessing.cpu_count()-1
    rf_random = RandomizedSearchCV(estimator=clf,
                                   param_distributions=random_grid,
                                   n_iter=5,
                                   cv=3,
                                   verbose=10,
                                   random_state=42,
                                   n_jobs=1)

    # Fit the random search model
    start_time = time.time()  # Time counter
    print(
        "Started at ",
        datetime.utcfromtimestamp(int(
            time.time())).strftime('%Y-%m-%d %H:%M:%S'))
    rf_random.fit(X_Train, Y_Train)
    best_p = rf_random.best_params_
    best_r = rf_random.best_score_

    print(best_p, best_r)
train_pred3 = (model.predict_proba(X))[:,1]
train_pred3.shape


# In[356]:


params = {'depth': [2, 4, 7, 10],
          'learning_rate': [0.03, 0.1, 0.15],
          'l2_leaf_reg': [1, 4, 9],
          'iterations': [300],
          'loss_function': ['Logloss']
          }
# If the parameter grid is large, the search takes a very long time; judging by the KS value
# under this model, overfitting is clearly severe, so manual tuning is more practical in real use.
cb = ct.CatBoostClassifier(eval_metric="AUC", random_seed=741)
cb_model = GridSearchCV(cb, params, scoring="roc_auc", cv = 3)
cb_model.fit(X, Y)
#clf = ct.CatBoostClassifier(eval_metric="AUC", depth=10, iterations= 300, l2_leaf_reg= 9, learning_rate= 0.15)
#clf.fit(X,Y)
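
# A minimal sketch of the manual alternative recommended above: tune one
# parameter at a time against a held-out split with early stopping, instead of
# an exhaustive grid. Assumes the X, Y and `ct` (catboost) names from the cells
# above; the split and candidate depths are illustrative, not from the original.
from sklearn.model_selection import train_test_split

X_fit, X_val, Y_fit, Y_val = train_test_split(X, Y, test_size=0.2, random_state=741)
best_auc, best_depth = 0.0, None
for d in [4, 6, 8, 10]:
    m = ct.CatBoostClassifier(eval_metric="AUC", depth=d, iterations=300,
                              learning_rate=0.05, random_seed=741, verbose=False)
    m.fit(X_fit, Y_fit, eval_set=(X_val, Y_val), early_stopping_rounds=50)
    val_auc = m.get_best_score()['validation']['AUC']  # key may be 'validation_0' on older catboost versions
    if val_auc > best_auc:
        best_auc, best_depth = val_auc, d
print('best depth:', best_depth, 'val AUC:', best_auc)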


# In[382]:


cat_features_index = 2
clf = ct.CatBoostClassifier(eval_metric="AUC", depth=8, iterations=300, l2_leaf_reg=10, learning_rate=0.008)  # the larger the learning rate, the worse the overfitting
clf.fit(X, Y)
preds_class = clf.predict(P_test)
preds_probs = clf.predict_proba(P_test)
print('class = ',preds_class)
Beispiel #24
0
def model_cbt(features, test_features, encoding='ohe', n_folds=5):
    # Extract the IDs
    train_ids = features['cust_no']
    test_ids = test_features['cust_no']

    # Extract the training labels
    labels = features['label']
    # Drop the ID and target columns
    features = features.drop(columns=['cust_no', 'label'])
    test_features = test_features.drop(columns=['cust_no'])

    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)

        features, test_features = features.align(test_features,
                                                 join='inner',
                                                 axis=1)

        cat_indices = 'auto'

    # Integer label encoding
    elif encoding == 'le':

        label_encoder = LabelEncoder()

        cat_indices = []

        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                features[col] = label_encoder.fit_transform(
                    np.array(features[col].astype(str)).reshape((-1, )))
                test_features[col] = label_encoder.transform(
                    np.array(test_features[col].astype(str)).reshape((-1, )))

                cat_indices.append(i)

    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    features = features[important_features]
    test_features = test_features[important_features]
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    feature_names = list(features.columns)

    features = np.array(features)
    test_features = np.array(test_features)

    features[np.isnan(features)] = -1000000
    features[np.where(features >= np.finfo(np.float64).max)] = -1000000
    test_features[np.isnan(test_features)] = -1000000
    test_features[np.where(
        test_features >= np.finfo(np.float64).max)] = -1000000

    k_fold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=50)

    feature_importance_values = np.zeros(len(feature_names))

    test_predictions = np.zeros((test_features.shape[0], 3))

    out_of_fold = np.zeros((features.shape[0], 3))
    # print(out_of_fold)

    valid_scores = []
    train_scores = []

    for train_indices, valid_indices in k_fold.split(features, labels):
        train_features, train_labels = features[train_indices], labels[
            train_indices]
        valid_features, valid_labels = features[valid_indices], labels[
            valid_indices]
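        # Shift labels from {-1, 0, 1} to {0, 1, 2} so they can serve as class indices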
        train_labels += 1
        valid_labels += 1
        train_features, train_labels = SMOTE().fit_resample(
            train_features, train_labels)
        # Build the model
        model = cbt.CatBoostClassifier(random_seed=2020,
                                       iterations=1500,
                                       learning_rate=0.1,
                                       max_depth=11,
                                       l2_leaf_reg=1,
                                       verbose=1,
                                       early_stopping_rounds=20,
                                       task_type='CPU',
                                       eval_metric='Kappa')
        # Train the model (a custom metric could be passed as eval_metric=lambda y_true, y_pred: [custom_kappa_eval(y_true, y_pred)])
        model.fit(train_features,
                  train_labels,
                  eval_set=[(valid_features, valid_labels)],
                  early_stopping_rounds=20,
                  verbose=True)

        # Feature importances, averaged across folds
        feature_importance_values += model.feature_importances_ / k_fold.n_splits

        # Make predictions on the test set
        test_predictions += model.predict_proba(
            test_features)[:, :] / k_fold.n_splits

        out_of_fold[valid_indices] = model.predict_proba(valid_features)[:, :]

        valid_score = accuracy_score(
            valid_labels, np.argmax(out_of_fold[valid_indices], axis=1))
        train_score = accuracy_score(
            train_labels,
            np.argmax(model.predict_proba(train_features)[:, :], axis=1))

        valid_scores.append(valid_score)
        train_scores.append(train_score)

        gc.enable()
        del model, train_features, valid_features
        gc.collect()
    y_pred = np.argmax(test_predictions, axis=1) - 1
    test_predictions = pd.DataFrame({
        'cust_no': test_ids,
        '-1': test_predictions[:, 0],
        '0': test_predictions[:, 1],
        '1': test_predictions[:, 2]
    })
    print(y_pred)
    submission = pd.DataFrame({'cust_no': test_ids, 'label': y_pred})

    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance_values
    })

    fold_names = list(range(n_folds))
    fold_names.append('overall')

    valid_accuracy = accuracy_score(labels + 1, np.argmax(out_of_fold, axis=1))  # overall out-of-fold accuracy
    valid_scores.append(valid_accuracy)

    train_scores.append(np.mean(train_scores))

    metrics = pd.DataFrame({
        'fold': fold_names,
        'train': train_scores,
        'valid': valid_scores
    })

    return submission, feature_importances, metrics, test_predictions
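
# A hypothetical usage sketch (the `train`/`test` names are assumptions): the
# function expects a 'cust_no' ID column on both frames, a 'label' column in
# {-1, 0, 1} on the training side, and a global `important_features` list.
submission, feature_importances, metrics, test_predictions = model_cbt(
    train, test, encoding='le', n_folds=5)
submission.to_csv('submission.csv', index=False)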
Beispiel #25
0
def Model_Search_Catboost_cv(X,
                             y,
                             cat_features=None,
                             model='binary',
                             folds=5,
                             sklearn_metric=None,
                             catboost_metric=None,
                             step_wise_start_at=0,
                             final_learning_rate=0.01,
                             use_optuna=False,
                             direction='minimize',
                             n_trials=20,
                             load_study_from=None,
                             save_study_as=None,
                             n_jobs=4):

    # model
    if isinstance(model, str):
        if model == 'binary':
            model = catboost.CatBoostClassifier(loss_function='Logloss',
                                                thread_count=n_jobs)
        elif model == 'multiclass':
            model = catboost.CatBoostClassifier(loss_function='MultiClass',
                                                thread_count=n_jobs)
        elif model == 'regression':
            model = catboost.CatBoostRegressor(loss_function='RMSE',
                                               thread_count=n_jobs)
        else:
            sys.exit('Error: Unknown model type.')

    # sklearn_metric
    if sklearn_metric is None:  # https://scikit-learn.org/stable/modules/model_evaluation.html
        if isinstance(model, catboost.CatBoostClassifier):
            sklearn_metric = 'neg_log_loss'
        elif isinstance(model, catboost.CatBoostRegressor):
            sklearn_metric = 'neg_root_mean_squared_error'
        else:
            sys.exit('Error: Sklearn score metric needs to be provided.')

    # catboost_metric
    if catboost_metric is not None:  # https://catboost.ai/docs/concepts/loss-functions.html
        model.set_params(loss_function=catboost_metric)

    # folds
    if isinstance(folds, int):
        if isinstance(model, catboost.CatBoostClassifier):
            folds = StratifiedKFold(n_splits=folds,
                                    shuffle=True,
                                    random_state=42)
        else:
            folds = KFold(n_splits=folds, shuffle=True, random_state=42)

    # ------------------------------------------------------------------------------------------------

    # Set fixed params
    fixed_params = {
        "verbose": False,
        "random_state": 42,
        "iterations": 10000,
    }
    model.set_params(**fixed_params)

    # Set dataset for .cv
    d_train = catboost.Pool(X, label=y, cat_features=cat_features)

    # ------------------------------------------------------------------------------------------------

    if use_optuna:
        print("Searching for a Catboost model with optuna \n")

        params = model.get_params()

        if load_study_from is not None:
            study = joblib.load(load_study_from)
        else:
            study = optuna.create_study(
                direction=direction,
                pruner=optuna.pruners.SuccessiveHalvingPruner())

        def objective(trial):

            params.update({
                "boosting_type":
                trial.suggest_categorical("boosting_type",
                                          ['Ordered', 'Plain']),
                "learning_rate":
                trial.suggest_loguniform("learning_rate", 0.005, 0.1),
                "max_depth":
                trial.suggest_int("max_depth", 4, 12),
                "l2_leaf_reg":
                trial.suggest_loguniform("l2_leaf_reg", 1e-4, 1e4),
                "border_count":
                trial.suggest_int('border_count', 1, 255),
                "random_strength":
                trial.suggest_loguniform("random_strength", 1e-4, 1e4),
                "bagging_temperature":
                trial.suggest_loguniform("bagging_temperature", 1e-4, 1e4),
            })

            cv_results = catboost.cv(params=params,
                                     pool=d_train,
                                     iterations=10000,
                                     early_stopping_rounds=50,
                                     folds=folds,
                                     verbose_eval=None,
                                     as_pandas=False)

            rmetric_name = list(cv_results.keys())[1]
            score = cv_results[rmetric_name][-1]  # alternatively: np.min(cv_results[rmetric_name])

            print("Num_boost_round: " + str(len(cv_results[rmetric_name])))

            if save_study_as is not None:
                joblib.dump(study, save_study_as)

            return score

        study.optimize(objective, n_trials=n_trials, n_jobs=1)

        print(
            "------------------------------------------------------------------------"
        )
        print("Best parameters found: " + str(study.best_params))
        print("Best score achived: " + str(study.best_value))
        print(
            "------------------------------------------------------------------------"
        )

        model.set_params(**study.best_params)

        # num_boost_round optimization
        cv_results = catboost.cv(params=model.get_params(),
                                 pool=d_train,
                                 iterations=10000,
                                 early_stopping_rounds=50,
                                 folds=folds,
                                 verbose_eval=None,
                                 as_pandas=False)

        rmetric_name = list(cv_results.keys())[1]
        best_boost_round = len(cv_results[rmetric_name])
        best_score_achieved = cv_results[rmetric_name][-1]
        print("Best num_boost_round: " + str(best_boost_round))
        print("Best score achieved: " + str(best_score_achieved))
        print(
            "------------------------------------------------------------------------"
        )
        model.set_params(iterations=best_boost_round)

    else:
        print("Searching for a Catboost model with the step wise method \n")

        if step_wise_start_at <= 0:
            # num_boost_round optimization
            cv_results = catboost.cv(params=model.get_params(),
                                     pool=d_train,
                                     iterations=10000,
                                     early_stopping_rounds=50,
                                     folds=folds,
                                     verbose_eval=None,
                                     as_pandas=False)

            rmetric_name = list(cv_results.keys())[1]
            best_boost_round = len(cv_results[rmetric_name])
            best_score_achieved = cv_results[rmetric_name][-1]
            print(
                "------------------------------------------------------------------------"
            )
            print("Best num_boost_round: " + str(best_boost_round))
            print("Best score achived: " + str(best_score_achived))
            print(
                "------------------------------------------------------------------------"
            )
            model.set_params(iterations=best_boost_round)

        # Param search
        if step_wise_start_at <= 1:
            param_test = {'max_depth': range(2, 11, 1)}
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=n_jobs,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            model = search.best_estimator_

        if step_wise_start_at <= 2:
            param_test = {
                'l2_leaf_reg': [
                    0, 0.0001, 0.001, 0.004, 0.007, 0.01, 0.04, 0.07, 1, 2, 3,
                    4, 7, 10
                ]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=n_jobs,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            model = search.best_estimator_

        if step_wise_start_at <= 3:
            param_test = {
                'random_strength':
                [0, 0.001, 0.01, 0.04, 0.07, 0.1, 0.4, 0.7, 1, 4, 7, 10]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=n_jobs,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            model = search.best_estimator_

        if step_wise_start_at <= 4:
            param_test = {
                'bagging_temperature': [
                    0, 0.001, 0.01, 0.04, 0.07, 0.1, 0.4, 0.7, 1, 4, 7, 10, 30,
                    60, 90, 120
                ]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=n_jobs,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            model = search.best_estimator_

        if step_wise_start_at <= 5:
            param_test = {
                'border_count': [32, 5, 10, 20, 50, 100, 150, 200, 255]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=n_jobs,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            model = search.best_estimator_

        if step_wise_start_at <= 6:
            param_test = {
                'ctr_border_count': [50, 5, 10, 20, 100, 150, 200, 255]
            }
            search = GridSearchCV(estimator=model,
                                  param_grid=param_test,
                                  scoring=sklearn_metric,
                                  n_jobs=n_jobs,
                                  cv=folds,
                                  verbose=True)
            search.fit(X, y, cat_features=cat_features)
            print("Best params encountered: " + str(search.best_params_))
            print("Best score achieved: " + str(search.best_score_))
            print(
                "------------------------------------------------------------------------"
            )
            model = search.best_estimator_


        # Set final learning rate
        model.set_params(learning_rate=final_learning_rate)

        # num_boost_round optimization
        cv_results = catboost.cv(params=model.get_params(),
                                 pool=d_train,
                                 iterations=10000,
                                 early_stopping_rounds=50,
                                 folds=folds,
                                 verbose_eval=None,
                                 as_pandas=False)

        rmetric_name = list(cv_results.keys())[1]
        best_boost_round = len(cv_results[rmetric_name])
        best_score_achieved = cv_results[rmetric_name][-1]
        print("Best num_boost_round: " + str(best_boost_round))
        print("Best score achived: " + str(best_score_achived))
        print(
            "------------------------------------------------------------------------"
        )
        model.set_params(iterations=best_boost_round)

    return model
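
# A hypothetical usage sketch (the data names are assumptions): run the
# step-wise search for a binary target, then fit the returned, tuned model.
best_model = Model_Search_Catboost_cv(X_train, y_train,
                                      cat_features=cat_cols,
                                      model='binary',
                                      folds=5,
                                      final_learning_rate=0.01,
                                      n_jobs=4)
best_model.fit(X_train, y_train, cat_features=cat_cols)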
Beispiel #26
0
                                  n_estimators=999999,
                                  learning_rate=0.02,
                                  colsample_bytree=0.3,
                                  num_leaves=2,
                                  metric='auc',
                                  objective='binary',
                                  n_jobs=-1)
    modelxgb = xgb.XGBClassifier(max_depth=2,
                                 n_estimators=999999,
                                 colsample_bytree=0.3,
                                 learning_rate=0.02,
                                 objective='binary:logistic',
                                 n_jobs=-1)
    modelcb = cb.CatBoostClassifier(iterations=999999,
                                    max_depth=2,
                                    learning_rate=0.02,
                                    colsample_bylevel=0.03,
                                    objective="Logloss")
    train_path = '../input/train.csv'
    test_path = '../input/test.csv'

    lgb_path = './lgb_models_stack/'
    xgb_path = './xgb_models_stack/'
    cb_path = './cb_models_stack/'

    # Create a directory for each model family
    for filename in [lgb_path, xgb_path, cb_path]:
        if not os.path.exists(filename):
            os.mkdir(filename)
    print('Load Train Data.')
    train_x = pd.read_csv(train_path)
import time
import os

red_data_training, red_data_test, red_quality_training, red_quality_test, white_data_training, white_data_test, white_quality_training, white_quality_test = wines_import.read_data(
    False)

if os.path.exists("./catboost modele i wyniki/CPU WHITE MultiClass"):
    os.remove("./catboost modele i wyniki/CPU WHITE MultiClass")
if os.path.exists("./catboost modele i wyniki/GPU WHITE MultiClass"):
    os.remove("./catboost modele i wyniki/GPU WHITE MultiClass")

messages_file = open(
    "./catboost modele i wyniki/models WHITE MultiClass verification",
    mode="w+")
model_white = catboost.CatBoostClassifier(task_type="CPU",
                                          random_seed=42,
                                          objective="MultiClass",
                                          iterations=2000)
time_before = time.time()
model_white.fit(white_data_training,
                white_quality_training,
                eval_set=catboost.Pool(white_data_test,
                                       white_quality_test,
                                       has_header=True))
time_after = time.time()
model_white.save_model(
    "./catboost modele i wyniki/CPU WHITE MultiClass verification")
messages_file.write("Uczenie na CPU wina biale trwalo:\n")
messages_file.write(str(time_after - time_before))
model_white_GPU = catboost.CatBoostClassifier(task_type="GPU",
                                              random_seed=42,
                                              objective="MultiClass",
Beispiel #28
0
                booster='gbtree',
                objective='binary:logistic',
                eval_metric='logloss',
                learning_rate=0.01),
                                      train_type=args.train_type,
                                      use_valid=True,
                                      debug=args.debug)
        elif (args.classifier == "lightgbm"):
            model = LightGBMClassifier(model=lgb.LGBMClassifier(
                objective='binary', metric='binary_logloss'),
                                       train_type=args.train_type,
                                       use_valid=True,
                                       debug=args.debug)
        elif (args.classifier == "catboost"):
            model = CatBoostClassifier(
                model=catboost.CatBoostClassifier(loss_function="Logloss"),
                use_valid=True,
                debug=args.debug)

        # Set the model parameters
        model.set_params(**study.best_params)

        #--------------------
        # Train the model
        #--------------------
        model.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold)

        #--------------------
        # Run inference with the model
        #--------------------
        y_pred_train[valid_index] = model.predict(X_valid_fold)
Beispiel #29
0
    def train_catboost(self):
        # self.ct = bcolz.open(datadir, mode='r')
        # ct = self.ct
        time_start = datetime.datetime.now()
        print("catboost training start", datetime.datetime.now(),
              datetime.datetime.now() - time_start)
        time_start = datetime.datetime.now()

        y_train = pickle.load(open("train-label-2-0.pickle", "rb"))
        y_test1 = pickle.load(open("train-label-4-1.pickle", "rb"))
        y_test2 = pickle.load(open("train-label-4-3.pickle", "rb"))
        gc.collect()

        print("labels loaded", datetime.datetime.now(),
              datetime.datetime.now() - time_start)
        time_start = datetime.datetime.now()

        X_train = pickle.load(open("train-features-2-0.pickle", "rb"))
        X_test1 = pickle.load(open("train-features-4-1.pickle", "rb"))
        X_test2 = pickle.load(open("train-features-4-3.pickle", "rb"))
        gc.collect()

        print("CSR loaded", datetime.datetime.now(),
              datetime.datetime.now() - time_start)
        time_start = datetime.datetime.now()

        print("start learning ", datetime.datetime.now(),
              datetime.datetime.now() - time_start)
        time_start = datetime.datetime.now()

        self.ml_params = {
            "loss_function": 'Logloss',
            # alternative loss functions: RMSE, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq
            "eval_metric": "Logloss",
            "random_strength": 90,
            "boosting_type": "Plain",
            "bootstrap_type": "Bernoulli",
            "od_type": 'Iter',
            "od_wait": 800,
            "depth": 5,
            "learning_rate": 0.2,
            #"learning_rate": 0.1,
            #"iterations": 45,
            "iterations": 10,
        }

        time_start = datetime.datetime.now()
        print("before import catboost", datetime.datetime.now(),
              datetime.datetime.now() - time_start)
        time_start = datetime.datetime.now()
        import catboost
        print("after import catboost", datetime.datetime.now(),
              datetime.datetime.now() - time_start)
        time_start = datetime.datetime.now()

        def Pool(X, y):
            cols = self.num_feature_names + self.cat_feature_names
            cb = catboost.Pool(X, label=y, feature_names=cols, cat_features=[])
            return cb

        # cbtrain = get_pool(0, self.rows_num-1000)
        # try:
        #     pickle.dump(cbtrain, open('cbtrain-pool.pickle', 'wb'), protocol=4)
        # except Exception as ex:
        #     print(ex)

        # cbtest = get_pool(self.rows_num-1000, self.rows_num)

        # model =  catboost.CatBoostRegressor(**self.ml_params)
        model = catboost.CatBoostClassifier(**self.ml_params)
        self.model = model

        # eval_set_ = [cbtest]

        model.fit(Pool(X_train, y_train),
                  eval_set=[
                      Pool(X_train, y_train),
                      Pool(X_test1, y_test1),
                      Pool(X_test2, y_test2)
                  ],
                  use_best_model=True,
                  verbose=True)

        print("end learning ", datetime.datetime.now(),
              datetime.datetime.now() - time_start)
        time_start = datetime.datetime.now()

        pickle.dump(model, open('model-catboost.pickle', 'wb'), protocol=4)
        gc.collect()
Beispiel #30
0
feature_name = ['Parameter{0}'.format(i) for i in range(5, 11)]
tr_index = ~data['label'].isnull()
X_train = data[tr_index][feature_name].reset_index(drop=True)
y = data[tr_index]['label'].reset_index(drop=True).astype(int)
X_test = data[~tr_index][feature_name].reset_index(drop=True)

print(X_train.shape, X_test.shape)
oof = np.zeros((X_train.shape[0], 4))
prediction = np.zeros((X_test.shape[0], 4))

# cbt_model = cbt.CatBoostClassifier(iterations=800,verbose=300,learning_rate=0.01,
# task_type='GPU',
# loss_function='MultiClass')
cbt_model = cbt.CatBoostClassifier(iterations=1000,
                                   verbose=300,
                                   task_type='GPU',
                                   loss_function='MultiClass',
                                   random_state=303)
cbt_model.fit(X_train, y, eval_set=(X_train, y))  # NB: the eval set is the training data itself
oof = cbt_model.predict_proba(X_train)  # in-sample probabilities, to match y in the metrics below
prediction = cbt_model.predict_proba(X_test)
gc.collect()  # garbage collection

print('logloss', log_loss(pd.get_dummies(y).values, oof))
print('ac', accuracy_score(y, np.argmax(oof, axis=1)))
print('mae', 1 / (1 + np.sum(np.absolute(np.eye(4)[y] - oof)) / 480))
print('-' * 80)
sub = test[['Group']]
prob_cols = [i for i in submit.columns if i not in ['Group']]
for i, f in enumerate(prob_cols):
    sub[f] = prediction[:, i]