Example #1
def test_wrong_feature_count():
    with pytest.raises(CatboostError):
        data = np.random.rand(100, 10)
        label = np.random.randint(2, size=100)
        model = CatBoostClassifier()
        model.fit(data, label)
        model.predict(data[:, :-1])
Example #2
def test_no_cat_in_predict():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)
    pred1 = model.predict(map_cat_features(test_pool.get_features(), train_pool.get_cat_feature_indices()))
    pred2 = model.predict(Pool(map_cat_features(test_pool.get_features(), train_pool.get_cat_feature_indices()), cat_features=train_pool.get_cat_feature_indices()))
    assert _check_data(pred1, pred2)
Example #3
def test_ignored_features():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model1 = CatBoostClassifier(iterations=5, random_seed=0, ignored_features=[1, 2, 3])
    model2 = CatBoostClassifier(iterations=5, random_seed=0)
    model1.fit(train_pool)
    model2.fit(train_pool)
    predictions1 = model1.predict(test_pool)
    predictions2 = model2.predict(test_pool)
    assert not _check_data(predictions1, predictions2)
    model1.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #4
def test_fit_data():
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    base_model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    base_model.fit(pool)
    baseline = np.array(base_model.predict(pool, prediction_type='RawFormulaVal'))
    eval_baseline = np.array(base_model.predict(eval_pool, prediction_type='RawFormulaVal'))
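    # use the first model's raw scores as baselines: set on the eval pool here and passed to fit() below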
    eval_pool.set_baseline(eval_baseline)
    model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    data = map_cat_features(pool.get_features(), pool.get_cat_feature_indices())
    model.fit(data, pool.get_label(), pool.get_cat_feature_indices(), sample_weight=np.arange(1, pool.num_row()+1), baseline=baseline, use_best_model=True, eval_set=eval_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #5
def test_ignored_features():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model1 = CatBoostClassifier(iterations=5,
                                random_seed=0,
                                ignored_features=[1, 2, 3])
    model2 = CatBoostClassifier(iterations=5, random_seed=0)
    model1.fit(train_pool)
    model2.fit(train_pool)
    predictions1 = model1.predict(test_pool)
    predictions2 = model2.predict(test_pool)
    assert not _check_data(predictions1, predictions2)
    model1.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #6
def test_custom_objective():
    class LoglossObjective(object):
        def calc_ders_range(self, approxes, targets, weights):
            assert len(approxes) == len(targets)
            if weights is not None:
                assert len(weights) == len(approxes)

            exponents = []
            for index in range(len(approxes)):
                exponents.append(math.exp(approxes[index]))

            result = []
            for index in range(len(targets)):
                p = exponents[index] / (1 + exponents[index])
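                # p is the predicted probability; der1/der2 are the first and second derivatives of the log-likelihood w.r.t. the raw score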
                der1 = (1 - p) if targets[index] > 0.0 else -p
                der2 = -p * (1 - p)

                if weights is not None:
                    der1 *= weights[index]
                    der2 *= weights[index]

                result.append((der1, der2))

            return result

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(
        iterations=5,
        random_seed=0,
        use_best_model=True,
        loss_function=LoglossObjective(),
        eval_metric="Logloss",
        # Leaf estimation method and gradient iteration are set to match
        # defaults for Logloss.
        leaf_estimation_method="Newton",
        gradient_iterations=10)
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool, prediction_type='RawFormulaVal')

    model2 = CatBoostClassifier(iterations=5,
                                random_seed=0,
                                use_best_model=True,
                                loss_function="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool, prediction_type='RawFormulaVal')

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
Example #7
def predict(model_path,
            X_test,
            is_lgbm=False,
            is_catboost=False,
            is_cnn=False,
            maxlen=400,
            lgbm_threshold=0.5):
    """
    load the model and predict unseen data
    """

    print('\n === predict === \n')

    if is_lgbm:
        # lightgbm
        model = lgb.Booster(model_file=model_path)
    elif is_catboost:
        model = CatBoostClassifier()
        model = model.load_model(model_path)
    elif is_cnn:
        model = load_model(model_path)
    else:
        # sklearn
        # xgboost
        model = joblib.load(model_path)

    # y_pred = model.predict_prob(X_test)
    y_pred = model.predict(X_test)

    if is_lgbm:
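        # Booster.predict returns probabilities for a binary objective; threshold them into 0/1 labels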
        #print('==')
        #print(y_pred)
        y_output = []
        for y in y_pred:
            if y > lgbm_threshold:
                y_output.append(1)
            else:
                y_output.append(0)
        #print('==')
        #print(y_output)
        return (np.array(y_output))
        #return np.array([np.argmax(y) for y in y_pred])
    elif is_cnn:
        # X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
        y_pred = model.predict(X_test)
        y_pred = [np.argmax(y) for y in y_pred]
        return np.array(y_pred)
    else:
        return y_pred
Example #8
def test_raw_predict_equals_to_model_predict():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=10, random_seed=0)
    model.fit(train_pool, eval_set=test_pool)
    pred = model.predict(test_pool, prediction_type='RawFormulaVal')
    assert all(model.get_test_eval() == pred)
Example #9
def test_adult():
    train, test = adult()

    # CatBoost doesn't support pandas.DataFrame NaNs out of the box for categorical features, and
    # this dataset has NaNs only for categorical features, so we'll replace them manually with
    # string "nan"
    #
    # see issue #571 on GitHub or issue MLTOOLS-2785 in the internal tracker.
    #
    # oh, and don't forget to replace missing values with string "nan" when you are going to apply
    # the model!
    train.fillna(value='nan', inplace=True)
    test.fillna(value='nan', inplace=True)

    X_train, y_train = train.drop('income', axis=1), train.income
    X_test, y_test = test.drop('income', axis=1), test.income
    model = CatBoostClassifier(iterations=5,
                               loss_function='CrossEntropy',
                               class_names=['<=50K', '>50K'])
    model.fit(
        X_train,
        y_train,
        eval_set=(
            X_test,
            y_test,
        ),
        cat_features=np.where(X_train.dtypes != float)[0],
    )

    predictions = model.predict(X_test)
Example #10
def test_adult():
    train, test = adult()

    # CatBoost doesn't support pandas.DataFrame NaNs out of the box for categorical features,
    # so we'll replace them manually with some special string (we'll use "nan")
    #
    # see issue #571 on GitHub or issue MLTOOLS-2785 in the internal tracker.
    #
    # oh, and don't forget to replace missing values with string "nan" when you are going to apply
    # the model!
    for dataset in (
            train,
            test,
    ):
        for name in (name for name, dtype in dict(dataset.dtypes).items()
                     if dtype == object):
            dataset[name].fillna('nan', inplace=True)

    X_train, y_train = train.drop('income', axis=1), train.income
    X_test, y_test = test.drop('income', axis=1), test.income
    model = CatBoostClassifier(iterations=5,
                               loss_function='CrossEntropy',
                               class_names=['<=50K', '>50K'])
    model.fit(
        X_train,
        y_train,
        eval_set=(
            X_test,
            y_test,
        ),
        cat_features=np.where(X_train.dtypes != float)[0],
    )

    predictions = model.predict(X_test)
Example #11
def rfe_cat(train_x, train_y, valid_x, valid_y, min_):
    train_pool = Pool(train_x, train_y, cat_features=[0])
    valid_pool = Pool(valid_x, valid_y, cat_features=[0])
    f1_score_ = []
    num_feature = []
    feature_name = []
    print('Start Recursive Feature Elimination')
    for i in tqdm_notebook(range(min_, 36),
                           desc='Iterating Feature Elimination'):
        model = CatBoostClassifier(iterations=50,
                                   random_seed=1234,
                                   used_ram_limit='10gb')
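        # recursively eliminate features by SHAP values down to i features and retrain the final model on the selected subset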
        summary = model.select_features(
            train_pool,
            eval_set=valid_pool,
            features_for_select='0-34',
            num_features_to_select=i,
            steps=2,
            algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
            shap_calc_type=EShapCalcType.Regular,
            train_final_model=True,
            logging_level='Silent',
        )
        f1_ = f1_score(valid_y,
                       model.predict(valid_pool).tolist(),
                       average='micro')
        f1_score_.append(f1_)
        num_feature.append(i)
        feature_name.append(summary['selected_features_names'])
    print('Best F-1 score: ', max(f1_score_))
    indices = f1_score_.index(max(f1_score_))
    print('Best Number feature: ', num_feature[indices])
    print('Selected of Feature names: \n', feature_name[indices])
    return feature_name[indices]
Example #12
def train(train_x, train_y, kfold, best_params=None, algorithm_name=None):
    models = []
    acc_results = []
    for i, (tr_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x.iloc[val_idx].reset_index(drop=True)
        val_y = train_y.iloc[val_idx].reset_index(drop=True)

        model = CatBoostClassifier(
            iterations=1000,
            learning_rate=0.1,
            use_best_model=True,
            # one_hot_max_size=1000,
            eval_metric="Accuracy",
        )
        model.fit(
            tr_x,
            tr_y,
            # cat_features=categorical_columns,
            eval_set=(val_x, val_y),
            plot=True,
        )

        y_pred = model.predict(val_x)
        accuracy = accuracy_score(val_y, y_pred)

        models.append(model)
        acc_results.append(accuracy)

    return models, acc_results
Example #13
class CatBoostWrapper(mlflow.pyfunc.PythonModel):
    """
    MLflow wrapper for CatBoost estimators.
    """
    def load_context(self, context):
        # pylint: disable=attribute-defined-outside-init
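        # load the preprocessing pipeline and column configuration stored as MLflow artifacts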
        with open(context.artifacts['pipeline'], 'rb') as f:
            self.pipeline = pickle.load(f)

        with open(context.artifacts['col_config'], 'rb') as f:
            column_config = pickle.load(f)

        self.clf = CatBoostClassifier()
        self.clf.load_model(context.artifacts['cbm_model'])
        self.col_names = column_config['col_names']
        self.preserve_cols = column_config['preserve_neg_vals']

    def preprocess(self, data):
        """
        Applies the pre-processing pipeline to the features given in the input dataset.

        :param data: Input dataset.
        :return: Transformed dataset.
        """
        data = data[self.col_names]
        data = remove_inf_values(data)
        data = remove_negative_values(data, ignore_cols=self.preserve_cols)
        return self.pipeline.transform(data)

    def predict(self, context, model_input):
        X = self.preprocess(model_input)
        return self.clf.predict(X)
Example #14
def get_predict_2020():
    df_data = pd.read_csv("dvhb_data/test/test 2020/grouped_full.csv", index_col=0)

    # encode the words as vectors
    if os.path.isfile('cult_token.txtdic'):
        dictionary = corpora.Dictionary.load('cult_token.txtdic')
    else:
        df_train_full = my_full_cvs("dvhb_data/train", "train_full.csv")
        df_train_full_new_names = ['CODE_CULT', 'CODE_GROUP', 'CENTROID', 'YEAR']
        df_train_full.columns = df_train_full_new_names
        text = [df_train_full['CODE_CULT'].tolist()]

        dictionary = corpora.Dictionary(text)
        dictionary.save('cult_token.txtdic')

    # replace the values in the object_name_n column with data from the dictionary, taking the keys from the object_type_number column
    df_data['CODE_CULT_2019'] = df_data['CODE_CULT_2019'].map(dictionary.token2id)
    df_data['CODE_CULT_2018'] = df_data['CODE_CULT_2018'].map(dictionary.token2id)
    df_data['CODE_CULT_2017'] = df_data['CODE_CULT_2017'].map(dictionary.token2id)
    df_data['CODE_CULT_2016'] = df_data['CODE_CULT_2016'].map(dictionary.token2id)
    df_data['CODE_CULT_2015'] = df_data['CODE_CULT_2015'].map(dictionary.token2id)

    df_data.rename(columns={f'CODE_CULT_{2015 + i}': f'{i + 1}' for i in range(6)}, inplace=True)

    model = CatBoostClassifier()
    model.load_model("catboostmodel")
    predictions_valid = model.predict(
        df_data[['2', '3', '4', '5', 'LATITUDE', 'LONGTITUDE']].rename(columns={'2': '1', '3': '2', '4': '3', '5': '4'})
    )
    df_data = df_data.assign(CODE_CULT_2020=predictions_valid)

    df_data.rename(columns={f'{i + 1}': f'CODE_CULT_{2015 + i}' for i in range(6)}, inplace=True)
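    # rule-based overrides: fields with an identical crop code for 2015-2019 keep it in 2020;
    # fields planted in two-year blocks repeat their 2019 crop in 2020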
    df_permanent = df_data[
        (df_data['CODE_CULT_2015'] == df_data['CODE_CULT_2016'])
        & (df_data['CODE_CULT_2015'] == df_data['CODE_CULT_2017'])
        & (df_data['CODE_CULT_2015'] == df_data['CODE_CULT_2018'])
        & (df_data['CODE_CULT_2015'] == df_data['CODE_CULT_2019'])]
    df_two_year = df_data[
        (df_data['CODE_CULT_2015'] == df_data['CODE_CULT_2016'])
        & (df_data['CODE_CULT_2017'] == df_data['CODE_CULT_2018'])
        & (df_data['CODE_CULT_2015'] != df_data['CODE_CULT_2018'])
        & (df_data['CODE_CULT_2019'] != df_data['CODE_CULT_2018'])
        & ~df_data.index.isin(df_permanent.index)]

    for row in df_permanent.iterrows():
        df_data.loc[row[0], 'CODE_CULT_2020'] = row[1]['CODE_CULT_2015']

    for row in df_two_year.iterrows():
        df_data.loc[row[0], 'CODE_CULT_2020'] = row[1]['CODE_CULT_2019']

    df_data['CODE_CULT_2020'] = df_data['CODE_CULT_2020'].map(dictionary.get)
    df_data['CODE_CULT_2019'] = df_data['CODE_CULT_2019'].map(dictionary.get)
    df_data['CODE_CULT_2018'] = df_data['CODE_CULT_2018'].map(dictionary.get)
    df_data['CODE_CULT_2017'] = df_data['CODE_CULT_2017'].map(dictionary.get)
    df_data['CODE_CULT_2016'] = df_data['CODE_CULT_2016'].map(dictionary.get)
    df_data['CODE_CULT_2015'] = df_data['CODE_CULT_2015'].map(dictionary.get)

    df_data[['CODE_CULT_2015', 'CODE_CULT_2016', 'CODE_CULT_2017', 'CODE_CULT_2018', 'CODE_CULT_2019', 'CODE_CULT_2020',
             'LATITUDE', 'LONGTITUDE']].to_csv('predict_2020_full.csv', index=True)
    df_data['CODE_CULT_2020'].to_csv('predict_2020.csv', index=True)
Example #15
    def trainDecisionTree(self):
        x = self.dataset.drop(['id', 'radiantClass'], axis=1)
        y = self.dataset['radiantClass']

        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.20,
                                                            random_state=0,
                                                            stratify=y)

        model = CatBoostClassifier(iterations=1000,
                                   learning_rate=1,
                                   depth=2,
                                   loss_function='MultiClass',
                                   eval_metric='Accuracy')

        # smote = SMOTE('minority')
        # x_sm, y_sm = smote.fit_sample(x_train, y_train)

        model.fit(x_train, y_train)
        pred = model.predict(x_test)

        accuracy = accuracy_score(y_test, pred, normalize=True)
        f1score = f1_score(y_test, pred, average=None)
        kappa = cohen_kappa_score(y_test, pred)
        cm = confusion_matrix(y_test, pred)

        print('Accuracy: ', accuracy)
        print('F1 score: ', f1score)
        print(cm)

        return [accuracy, f1score, kappa, cm, y_test, pred]
Example #16
def catboost_model(X_train, X_test, y_train, y_test, catboost_params={}, verbose=100, plot=False):
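    # cat_features and text_features are assumed to come from the enclosing scope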
    learn_pool = Pool(    
        X_train, 
        y_train, 
        cat_features=cat_features,
        text_features=text_features,
        feature_names=list(X_train)
    )
    test_pool = Pool(
        X_test, 
        y_test, 
        cat_features=cat_features,
        text_features=text_features,
        feature_names=list(X_train)
    )
    
    catboost_default_params = {
        'iterations': 1000,
#         'learning_rate': 0.1,
        'eval_metric': 'Accuracy',
        'task_type': 'GPU'
    }
    
    catboost_default_params.update(catboost_params)
    
    model = CatBoostClassifier(**catboost_default_params)
    # train the model
    model.fit(learn_pool, eval_set=test_pool, verbose=verbose, plot=plot)
    prediction = model.predict(X_test)

    return model
Example #17
def train_gbm(train_data,
              train_labels,
              val_data,
              val_labels,
              test_data,
              test_labels,
              random_state=42):
    gbm = CatBoostClassifier(task_type="GPU",
                             logging_level='Silent',
                             loss_function='Logloss',
                             od_type='Iter',
                             od_wait=20,
                             random_state=random_state)
    eval_pool = Pool(val_data, val_labels)
    gbm.fit(train_data, train_labels, eval_set=eval_pool, use_best_model=True)
    gbm.save_model('catboost_1',
                   format="cbm",
                   export_parameters=None,
                   pool=None)
    pred_probs = gbm.predict_proba(test_data)[:, 1]
    pred_labels = gbm.predict(test_data)
    score = [
        roc_auc_score(test_labels, pred_probs),
        f1_score(test_labels, pred_labels)
    ]

    print('roc_auc: ', score[0])
    print('f1: ', score[1])

    average_precision = average_precision_score(pred_labels, test_labels)

    disp = plot_precision_recall_curve(gbm, test_data, test_labels)
    disp.ax_.set_title('2-class Precision-Recall curve: ')

    return gbm
Example #18
def example_gpu():
    from catboost import CatBoostClassifier

    train_data = [[0, 3],
                  [4, 1],
                  [8, 1],
                  [9, 1]]
    train_labels = [0, 0, 1, 1]

    eval_data = [[2, 4],
                 [1, 4],
                 [20, 5],
                 [10, 1]]

    model = CatBoostClassifier(iterations=1000,
                               task_type="GPU",
                               devices='0:1')
    model.fit(train_data,
              train_labels,
              verbose=False)

    # Get predictions
    preds = model.predict(eval_data)

    print(preds)
Example #19
def train_meta(train_x, train_y, kfold):
    models = []
    acc_results = []
    for i, (tr_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x.iloc[val_idx].reset_index(drop=True)
        val_y = train_y.iloc[val_idx].reset_index(drop=True)

        model = CatBoostClassifier(
            iterations=1000,
            # iterations=1,
            learning_rate=0.1,
            use_best_model=True,
            eval_metric="Accuracy",
            verbose=20,
        )
        model.fit(
            tr_x,
            tr_y,
            eval_set=(val_x, val_y),
        )
        y_pred = model.predict(val_x)
        accuracy = accuracy_score(val_y, y_pred)
        models.append(model)
        acc_results.append(accuracy)
    return models, acc_results
Example #20
def objective(X, y, trial):
    """Objective function to optimize."""
    n_components = (trial.suggest_int("n_components", 1,
                                      len(list(X.columns))), )
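    # the trial suggests how many principal components to keep; the features are projected onto them before cross-validation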
    pca = PCA(n_components=n_components[0]).fit(X)
    x_pca = pd.DataFrame(pca.transform(X))

    print(x_pca, y)

    acc_results = []
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for i, (tr_idx, val_idx) in enumerate(kfold.split(x_pca, y)):
        tr_x = x_pca.iloc[tr_idx].reset_index(drop=True)
        tr_y = y.iloc[tr_idx].reset_index(drop=True)
        val_x = x_pca.iloc[val_idx].reset_index(drop=True)
        val_y = y.iloc[val_idx].reset_index(drop=True)

        model = CatBoostClassifier(
            iterations=500,
            # iterations=1,
            learning_rate=0.1,
            use_best_model=True,
            eval_metric="Accuracy",
            verbose=20,
        )
        model.fit(tr_x, tr_y, eval_set=(val_x, val_y))
        y_pred = model.predict(val_x)
        acc = accuracy_score(val_y, y_pred)
        acc_results.append(acc)
    return sum(acc_results) / len(acc_results)  # mean accuracy
Example #21
def train(train_x, train_y, kfold, best_params=None, algorithm_name=None):
    models = []
    acc_results = []
    for i, (tr_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x.iloc[val_idx].reset_index(drop=True)
        val_y = train_y.iloc[val_idx].reset_index(drop=True)

        model = CatBoostClassifier(
            # iterations=1,
            iterations=1000,
            learning_rate=0.1,
            use_best_model=True,
            eval_metric="Accuracy",
            verbose=20,
        )
        model.fit(
            tr_x,
            tr_y,
            eval_set=(val_x, val_y),
            plot=True,
        )

        y_pred = model.predict(val_x)
        accuracy = accuracy_score(val_y, y_pred)

        if algorithm_name is not None:
            joblib.dump(model, f"{DATA_DIR}/{algorithm_name}_model_{i}.pkl")

        models.append(model)
        acc_results.append(accuracy)

    return models, acc_results
Example #22
def tdetect2(no, clf):
    customer_meter = c_no[no]
    X,y = ccnc2(no)
#    clf = XGBClassifier()
#    clf = SVC(kernel='rbf',probability=True)
#    clf = LGBMClassifier()
    clf = CatBoostClassifier(logging_level = "Silent")
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.14, random_state=0)
    sm = SMOTE(random_state=42)
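    # oversample the minority class with SMOTE; note that the test split is resampled as well before scoring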
    X_res_train, y_res_train = sm.fit_resample(X_train, y_train)
    X_res_test, y_res_test = sm.fit_resample(X_test, y_test)
    clf.fit(X_res_train, y_res_train)
    score = clf.score(X_res_test, y_res_test)
    #print(Counter(y),Counter(y_train),Counter(y_test),Counter(y_res_train),Counter(y_res_test))
    #print("The score for customer :", customer_input, " is ",  score)
    y_pred = clf.predict(X_res_test)
    probs = clf.predict_proba(X_res_test)
    preds = probs[:,1]
#    print(confusion_matrix(y_res_test, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_res_test, y_pred).ravel()
#    print("tn, fp, fn, tp",tn, fp, fn, tp)
    specificity = tn / (tn+fp)
    sensitivity =  tp/ (tp+fn)
    fpr =  1 - specificity
    print ("sensi = %.2f" %sensitivity, "fpr= %.2f" % fpr )
    total =sensitivity
    print("The score for customer :", customer_meter, " is %.2f" %  total)
#    plot_importance(clf,importance_type="weight", ax=plt.gca())
    return sensitivity,fpr
Example #23
def GradientBoost(X_train, X_test, y_train):

    model = CatBoostClassifier(iterations=10, depth=5)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return y_pred
Example #24
def gbm_predict(data):
    model = CatBoostClassifier()
    model.load_model('./models/gbm1.cbm')

    output = model.predict(data)

    return output
Example #25
def score_model(train, test, b_cases, drivers):
    #    train, test, b_cases, drivers = proc_data_train, proc_data_test, b_cases_, sig_feats
    pred_scores = {}

    for target_col in list(b_cases):
        if target_col in ['Authentication', 'None']:
            continue
        train_x = deepcopy(StandardScaler().fit_transform(
            train[drivers[target_col]]))
        train_y = deepcopy(train[target_col])
        test_x = deepcopy(StandardScaler().fit_transform(
            test[drivers[target_col]]))
        test_y = deepcopy(test[target_col])

        predictor = CatBoostClassifier()

        predictor.fit(train_x, train_y)
        predictions = predictor.predict(test_x)

        predictor.save_model(
            os.path.join(cur_path, 'modelling', 'models',
                         '%s_classifier.mod' % (target_col.replace('/', '_'))))
        #        pred2 = CatBoostClassifier().load_model(os.path.join(cur_path, 'modelling', 'models', '%s_classifier.mod' % (target_col)))

        predictions = [i for i, j in zip(predictions, test_y.values) if j == j]

        pred_scores[target_col] = accuracy_score(test_y.dropna(), predictions)

    return (pred_scores)
Example #26
def cross_val(X, y, X_test, param, cat_features, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    
    acc = []
    predict = None
    
    for tr_ind, val_ind in skf.split(X, y):
        X_train = X.iloc[tr_ind]
        y_train = y.iloc[tr_ind]
        
        X_valid = X.iloc[val_ind]
        y_valid = y.iloc[val_ind]
        
        clf = CatBoostClassifier(iterations=500,
                                loss_function = param['loss_function'],
                                depth=param['depth'],
                                l2_leaf_reg = param['l2_leaf_reg'],
                                eval_metric = 'Accuracy',
                                leaf_estimation_iterations = 10,
                                use_best_model=True,
                                logging_level='Silent'
        )
        
        clf.fit(X_train, 
                y_train,
                cat_features=cat_features,
                eval_set=(X_valid, y_valid)
        )
        
        y_pred = clf.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_pred)
        acc.append(accuracy)
    return sum(acc)/n_splits
Example #27
def train():
    env = my_env.MyEnv(0, realtime_mode=True)

    model = CatBoostClassifier()
    model.load_model("catboost_model.model")

    score = 0.0
    print_interval = 1

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            y_pred1 = model.predict(s, prediction_type="Probability")
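            # y_pred1 holds per-class probabilities: take the argmax or sample an action from the distribution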
            
            if deterministic:
                y_pred_max = int(np.argmax(y_pred1))
                a = action_mapping(y_pred_max)
            else:
                a = int(np.random.choice([0, 1, 3, 4, 5], p=y_pred1))            
            
            s_prime, r, done, info = env.step(a)

            s = s_prime

            score += r
            if done:
                break

        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {:.5f}".format(n_epi, score/print_interval))
            score = 0.0

    env.close()
Example #28
    def get_roc_auc_score(self, generated, real, weights=None):
        X = np.concatenate((generated, real))
        y = np.array([0] * generated.shape[0] + [1] * real.shape[0])
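        # label generated samples 0 and real samples 1; the classifier's ROC AUC measures how distinguishable they are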
        weights = np.concatenate((weights, weights))

        (
            X_train,
            X_test,
            y_train,
            y_test,
            w_train,
            w_test
        ) = train_test_split(
            X,
            y,
            weights,
            test_size=0.2,
            random_state=self.params["seed"],
            stratify=y,
            shuffle=True,
        )

        classifier = CatBoostClassifier(iterations=1000, thread_count=10, silent=True)
        classifier.fit(X_train, y_train)
        predicted = classifier.predict(X_test)
        roc_auc = calculate_roc_auc(y_test, predicted, w_test)
        return roc_auc
Example #29
def test_raw_predict_equals_to_model_predict():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=10, random_seed=0)
    model.fit(train_pool, eval_set=test_pool)
    pred = model.predict(test_pool, prediction_type='RawFormulaVal')
    assert all(model.get_test_eval() == pred)
Example #30
def cross_val(X, y, param, cat_features, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits,
                          shuffle=True,
                          random_state=RANDOM_STATE)
    #print('missing value in y_train : {}'.format(sum(y.isna())))
    acc = []
    predict = None

    for tr_ind, val_ind in skf.split(X, y):
        X_train = X[tr_ind]
        y_train = y[tr_ind]

        X_valid = X[val_ind]
        y_valid = y[val_ind]
        #print('missing value in y_valid : {}'.format(sum(y_valid.isna())))
        clf = CatBoostClassifier(iterations=500,
                                 loss_function=param['loss_function'],
                                 depth=param['depth'],
                                 l2_leaf_reg=param['l2_leaf_reg'],
                                 eval_metric='Logloss',
                                 leaf_estimation_iterations=10,
                                 use_best_model=True,
                                 logging_level='Silent',
                                 thread_count=5,
                                 n_estimators=500)

        clf.fit(X_train,
                y_train,
                cat_features=cat_features,
                eval_set=(X_valid, y_valid))

        y_pred = clf.predict(X_valid)
        accuracy = auc_score(y_valid, y_pred)
        acc.append(accuracy)
    return sum(acc) / n_splits
Example #31
def using_best_param(train, test, label):
    """
    Train the model with the best parameters.
    :param params:
    :return:
    """
    model = CatBoostClassifier(iterations=1000,
                               learning_rate=0.1,
                               max_depth=7,
                               cat_features=train.columns,
                               verbose=100,
                               custom_metric='F1',
                               random_seed=2019,
                               early_stopping_rounds=200,
                               task_type='CPU',
                               thread_count=11,
                               eval_metric='F1')

    model.fit(train, label)
    y_pred = model.predict(test).tolist()

    judge_df = pd.DataFrame()
    judge_df['sid'] = range(test.shape[0])
    judge_df['label'] = y_pred
    judge_df['label'] = judge_df['label'].apply(lambda x: 1
                                                if x >= 0.49 else 0)

    return judge_df[['sid', 'label']]
Example #32
    def param_model_training(self, learning_rate: float, depth: int,
                             trees: int) -> tuple:
        """
        Training a model for a given hyper params
        Returns: model, model predictions and probs
        """
        X = self.X_train
        y = self.y_train
        X_train, X_val, y_train, y_val = train_test_split(X,
                                                          y,
                                                          test_size=0.2,
                                                          random_state=0)

        clf = CatBoostClassifier(
            iterations=trees,
            learning_rate=learning_rate,
            depth=depth,
        )

        clf.fit(X_train,
                y_train,
                cat_features=self.category_features,
                eval_set=(X_val, y_val),
                verbose=False)

        return clf, clf.predict(data=X_val), clf.predict_proba(
            data=X_val), y_val
Example #33
def train(train_x, train_y, kfold, best_params=None):
    models = []
    acc_results = []
    for i, (tr_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x.iloc[val_idx].reset_index(drop=True)
        val_y = train_y.iloc[val_idx].reset_index(drop=True)

        model = CatBoostClassifier(
            iterations=1000,
            learning_rate=0.1,
            use_best_model=True,
            # one_hot_max_size=1000,
            eval_metric="Accuracy",
        )
        # categorical_columns = [x for x in train_x.columns if train_x[x].dtype == "object"]
        model.fit(
            tr_x,
            tr_y,
            # cat_features=categorical_columns,
            eval_set=(val_x, val_y),
            plot=True,
        )

        y_pred = model.predict(val_x)
        accuracy = accuracy_score(val_y, y_pred)
        # # plot the validation results
        # fig = lgb.plot_metric(evals_result)
        # plt.savefig(f"{DATA_DIR}/learning_curve_{i+1}.png")

        models.append(model)
        acc_results.append(accuracy)

    return models, acc_results
Example #34
def test_predict_class():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)
    pred = model.predict(test_pool, prediction_type="Class")
    np.save(PREDS_PATH, np.array(pred))
    return local_canonical_file(PREDS_PATH)
Example #35
def test_titanic():
    train_df = titanic()[0].fillna(-999)
    X, y = train_df.drop('Survived', axis=1), train_df.Survived
    categorical_features_indices = np.where(X.dtypes != float)[0]

    model = CatBoostClassifier(iterations=5)
    model.fit(X, y, cat_features=categorical_features_indices)
    preds = model.predict(X)
Example #36
def test_predict_class():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)
    pred = model.predict(test_pool, prediction_type="Class")
    np.save(PREDS_PATH, np.array(pred))
    return local_canonical_file(PREDS_PATH)
Example #37
def test_custom_objective():
    class LoglossObjective(object):
        def calc_ders_range(self, approxes, targets, weights):
            assert len(approxes) == len(targets)
            if weights is not None:
                assert len(weights) == len(approxes)

            exponents = []
            for index in range(len(approxes)):
                exponents.append(math.exp(approxes[index]))

            result = []
            for index in range(len(targets)):
                p = exponents[index] / (1 + exponents[index])
                der1 = (1 - p) if targets[index] > 0.0 else -p
                der2 = -p * (1 - p)

                if weights is not None:
                    der1 *= weights[index]
                    der2 *= weights[index]

                result.append((der1, der2))

            return result

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True,
                               loss_function=LoglossObjective(), eval_metric="Logloss",
                               # Leaf estimation method and gradient iteration are set to match
                               # defaults for Logloss.
                               leaf_estimation_method="Newton", leaf_estimation_iterations=10)
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool, prediction_type='RawFormulaVal')

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, loss_function="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool, prediction_type='RawFormulaVal')

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
Example #38
def test_custom_eval():
    class LoglossMetric(object):
        def get_final_error(self, error, weight):
            return error / (weight + 1e-38)

        def is_max_optimal(self):
            return True

        def evaluate(self, approxes, target, weight):
            assert len(approxes) == 1
            assert len(target) == len(approxes[0])

            approx = approxes[0]

            error_sum = 0.0
            weight_sum = 0.0

            for i in range(len(approx)):
                w = 1.0 if weight is None else weight[i]
                weight_sum += w
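                # accumulate the weighted log-likelihood: target * approx - log(1 + exp(approx))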
                error_sum += w * (target[i] * approx[i] - math.log(1 + math.exp(approx[i])))

            return error_sum, weight_sum

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, eval_metric=LoglossMetric())
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool)

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, eval_metric="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool)

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
Example #39
def test_predict_without_fit():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier()
        model.predict(pool)