Example 1
def CAT_test(train_x, train_y, val_x, val_y):
    import pandas as pd
    from catboost import CatBoostClassifier
    initial_params = {
        "verbose": 100,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "iterations": 1000,
        "random_seed": 42,
        "learning_rate": 0.02,
        # "one_hot_max_size": 2,
        "depth": 6,
        # "border_count": 128,
        "thread_count": 16,
        # "class_weights":[0.1,1.8],
        # "l2_leaf_reg": 6,
        "use_best_model": True,
        # "save_snapshot":True,
        # "leaf_estimation_method": 'Newton',
        "od_type": 'Iter',
        "od_wait": 30,
        # "od_pval":0.0000001,
        # "used_ram_limit":1024*1024*1024*12,
        # "max_ctr_complexity":3,
        # "model_size_reg":10,
    }
    clf = CatBoostClassifier(**initial_params)
    clf.fit(X=train_x, y=train_y, eval_set=(val_x, val_y), verbose_eval=100)
    feature_importances = sorted(zip(train_x.columns,
                                     clf.feature_importances_),
                                 key=lambda x: x[1],
                                 reverse=True)
    feature_importances = pd.DataFrame([list(f) for f in feature_importances],
                                       columns=["features", "importance"])
    return clf.score(val_x, val_y), feature_importances
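A quick way to exercise CAT_test, assuming X is a pandas DataFrame of features and y a binary target (both hypothetical here):

from sklearn.model_selection import train_test_split

train_x, val_x, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42)
val_score, importances = CAT_test(train_x, train_y, val_x, val_y)
print(val_score)
print(importances.head())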
def tdetect2(no, clf):
    customer_meter = c_no[no]
    X,y = ccnc2(no)
#    clf = XGBClassifier()
#    clf = SVC(kernel='rbf',probability=True)
#    clf = LGBMClassifier()
    clf = CatBoostClassifier(logging_level="Silent")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.14, random_state=0)
    sm = SMOTE(random_state=42)
    # fit_sample was removed from imbalanced-learn; fit_resample is the current API
    X_res_train, y_res_train = sm.fit_resample(X_train, y_train)
    X_res_test, y_res_test = sm.fit_resample(X_test, y_test)
    clf.fit(X_res_train, y_res_train)
    score = clf.score(X_res_test, y_res_test)
    #print(Counter(y),Counter(y_train),Counter(y_test),Counter(y_res_train),Counter(y_res_test))
    #print("The score for customer :", customer_input, " is ",  score)
    y_pred = clf.predict(X_res_test)
    probs = clf.predict_proba(X_res_test)
    preds = probs[:,1]
#    print(confusion_matrix(y_res_test, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_res_test, y_pred).ravel()
#    print("tn, fp, fn, tp",tn, fp, fn, tp)
    specificity = tn / (tn+fp)
    sensitivity =  tp/ (tp+fn)
    fpr =  1 - specificity
    print ("sensi = %.2f" %sensitivity, "fpr= %.2f" % fpr )
    total =sensitivity
    print("The score for customer :", customer_meter, " is %.2f" %  total)
#    plot_importance(clf,importance_type="weight", ax=plt.gca())
    return sensitivity,fpr
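Note that tdetect2 oversamples the test split as well, which inflates the reported sensitivity. A minimal sketch of the more conventional pattern, resampling only the training data (fit_resample is the current imbalanced-learn API):

from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.14, random_state=0)
X_res, y_res = SMOTE(random_state=42).fit_resample(X_train, y_train)  # resample the training split only
clf = CatBoostClassifier(logging_level="Silent")
clf.fit(X_res, y_res)
print(clf.score(X_test, y_test))  # evaluate on the untouched test split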
Example 3
def getCatBoost():
    import seaborn as sns
    from catboost import CatBoostClassifier
    from sklearn.metrics import confusion_matrix
    # relies on X_train, y_train, X_test, y_test from the surrounding scope
    model = CatBoostClassifier(iterations=10, learning_rate=1, depth=4,
                               loss_function='Logloss', random_state=20)
    model.fit(X_train, y_train)
    print("score %s" % model.score(X_test, y_test))
    y_pred = model.predict(X_test)
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d")
    return model
Example 4
def evaluate_cb(trainX, trainy, testX, testy, params):
    sc = StandardScaler()
    trainX = sc.fit_transform(trainX)
    testX = sc.transform(testX)
    model = CatBoostClassifier(**params)
    model.fit(trainX, trainy)
    test_acc = model.score(testX, testy)
    pred = model.predict_proba(testX)
    return model, test_acc, pred
def get_cat_score(X_train,y_train,X_test,y_test):
    cat_default = CatBoostClassifier(logging_level="Silent")
    cat_cross = CatBoostClassifier(logging_level="Silent")
    np.random.seed(200)
    cross_score = np.mean(cross_val_score(cat_cross, X_train, y_train, cv=5))

    cat_default.fit(X_train, y_train)
    score_cat = cat_default.score(X_test, y_test)

    neptune.log_metric('cat', score_cat)
    neptune.log_metric('cat_cross_score', cross_score)
    return score_cat
Example 6
def main(args):
    # get data
    X, y = get_gbm_database(args.telemetry_path,
                            args.maint_path,
                            args.machines_path,
                            args.errors_path,
                            args.failures_path,
                            seq_len=args.out_seq_len,
                            machine_id=args.machine_id,
                            )
    X_gbm = X.iloc[args.seq_len:-args.out_seq_len]
    y_target = y.iloc[args.seq_len:-args.out_seq_len]

    dm = TelemetryDataModule(path=args.telemetry_path,
                             seq_len=args.seq_len,
                             out_seq_len=args.out_seq_len,
                             batch_size=X_gbm.shape[0],
                             num_workers=args.num_workers,)
    dm.setup(stage="prodaction")
    X_lstm = dm.prodaction_dataset()
    
    # load models
    lstm = LSTM.load_from_checkpoint(checkpoint_path=args.checkpoint_path + '/lstm.ckpt',
                                     n_features=args.n_features,
                                     hidden_size=args.hidden_size,
                                     seq_len=args.seq_len,
                                     out_seq_len=args.out_seq_len,
                                     batch_size=X_gbm.shape[0],
                                     criterion=args.criterion,
                                     num_layers=args.num_layers,
                                     dropout=args.dropout,
                                     learning_rate=args.learning_rate,
                                     )
    lstm.freeze()
    
    gbm = CatBoostClassifier()
    gbm.load_model(args.checkpoint_path + '/gbm.cbm')

    # prediction
    y_hat_lstm = None
    for (x, _) in X_lstm:
        y_hat_lstm = lstm(x)

    X_gbm = get_lstm_feature(X_gbm, y_hat_lstm)

    score = gbm.score(X_gbm, y_target)

    print('Model accuracy: {0:.2f}%'.format(score*100))
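get_lstm_feature is not shown in this example; a plausible minimal sketch (the function body and column names are assumptions) appends the LSTM output as extra input features for the GBM:

import pandas as pd

def get_lstm_feature(X_gbm: pd.DataFrame, y_hat_lstm) -> pd.DataFrame:
    # Hypothetical implementation: one new column per LSTM output step.
    X_gbm = X_gbm.copy()
    preds = y_hat_lstm.detach().cpu().numpy().reshape(len(X_gbm), -1)
    for i in range(preds.shape[1]):
        X_gbm[f"lstm_pred_{i}"] = preds[:, i]
    return X_gbm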
Example 7
def main(**args):
    titanic_train, _ = titanic()
    titanic_train.fillna(-999, inplace=True)
    cols = [
        'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
        'Cabin', 'Embarked'
    ]
    train_sz = int(titanic_train.shape[0] * 0.1)
    x_train = titanic_train[:train_sz][cols]
    y_train = titanic_train[:train_sz]['Survived'].astype(int)
    x_test = titanic_train[train_sz:][cols]
    y_test = titanic_train[train_sz:]['Survived'].astype(int)
    try:
        model = CatBoostClassifier(random_seed=42, **args)
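        # columns 1, 2, 6, 8, 9 are the categorical features: Name, Sex, Ticket, Cabin, Embarked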
        model.fit(x_train, y_train, [1, 2, 6, 8, 9], silent=True)
        accuracy = model.score(x_test, y_test)
        print(-accuracy)
    except Exception:
        print(0)
Example 8
def main():

    train_set = join(DATA_SPLIT_ROOT, 'train.csv')
    test_set = join(DATA_SPLIT_ROOT, 'test.csv')

    train = pd.read_csv(train_set, encoding='latin1', low_memory=True)
    test = pd.read_csv(test_set, encoding='latin1', low_memory=True)

    train_features = train.drop(['success'], axis=1)
    train_targets = train['success']

    test_features = test.drop(['success'], axis=1)
    test_targets = test['success']

    parser = argparse.ArgumentParser()
    # For whole folder processing
    parser.add_argument('--alg', help='The training algorithm')

    args = parser.parse_args()

    if args.alg == 'CART':
        carl = DecisionTreeClassifier()
        tree = carl.fit(train_features, train_targets)
        print("The CART accuracy is: ",
              tree.score(test_features, test_targets) * 100, "%")
    elif args.alg == 'xgboost':
        xgb = XGBClassifier()
        forest = xgb.fit(train_features, train_targets)
        print("The XGBoost accuracy is: ",
              forest.score(test_features, test_targets) * 100, "%")
        plot_importance(xgb)
        plt.show()
    elif args.alg == 'rf':
        rf = RandomForestClassifier()
        forest = rf.fit(train_features, train_targets)
        print("The Random Forest accuracy is: ",
              forest.score(test_features, test_targets) * 100, "%")
    elif args.alg == 'catboost':
        cb = CatBoostClassifier().fit(train_features, train_targets)
        print("The Cat Boost accuracy is: ",
              cb.score(test_features, test_targets) * 100, "%")
Example 9
class CatBoost(ClassifierAbstract):
    def __init__(self, **kwargs):
        self.iterations = kwargs['cat_boost_iterations']
        self.depth = kwargs['cat_boost_depth']
        self.learning_rate = kwargs['cat_boost_learning_rate']
        self.loss_function = kwargs['cat_boost_loss_function']
        self.name = 'CatBoost'
        self.model = CatBoostClassifier(iterations=self.iterations,
                                        depth=self.depth,
                                        learning_rate=self.learning_rate,
                                        loss_function=self.loss_function,
                                        verbose=True)

    def fit(self, x_train, y_train):
        self.model.fit(x_train, y_train)

    def predict(self, x_test):
        return self.model.predict(x_test)

    def evaluate(self, x_test, y_test):
        return self.model.score(x_test, y_test)
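Example usage of the wrapper (the hyperparameter values are illustrative, and x_train/y_train are assumed to exist):

clf = CatBoost(cat_boost_iterations=500,
               cat_boost_depth=6,
               cat_boost_learning_rate=0.05,
               cat_boost_loss_function='Logloss')
clf.fit(x_train, y_train)
print(clf.name, clf.evaluate(x_test, y_test))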
Example 10
# LIGHTGBM
lgb = lgbm.LGBMClassifier()
lgb.fit(x_train, y_train)

lgb_pred = lgb.predict(x_val)
lgb_pred_p = lgb.predict_proba(x_val)
lgb.score(x_val, y_val)  # 0.7896666666666666
cross_val_score(lgb, x_val, y_val).mean()  #0.7824333333333333

# Catboost
cat = CatBoostClassifier()
cat.fit(x_train, y_train)

cat_pred = cat.predict(x_val)
cat.score(x_val, y_val)
cross_val_score(cat, x_val, y_val).mean()  #0.7828333333333334

# most voting
temp = pd.DataFrame({'gbc': pred, 'lgbm': lgb_pred, 'cat': cat_pred})

result_survival = np.argmax((pred_p + lgb_pred_p) / 2, axis=1)
result_survival

submission = pd.read_csv(
    'C:/Users/10188/local_git/tabular-playground-series-apr-2021/sample_submission.csv'
)
submission['Survived'] = temp.mode(axis=1)[0].astype(int)  # majority vote across the three models

submission.to_csv(
    'C:/Users/10188/local_git/tabular-playground-series-apr-2021/submission_files/20210413_GBC_lgbm_cat_freqvoting.csv')

plt.rcParams["figure.figsize"] = (502, 7)
ax = feature_score.plot('Feature', 'Score', kind='bar', color='r')
ax.set_title("Catboost Feature Importance Ranking", fontsize = 6)
ax.set_xlabel('')
rects = ax.patches
labels = feature_score['Score'].round(2)
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 0.35, label, ha='center', va='bottom')
plt.show()


# In[ ]:


model.score(P_test, y_test)


# CatBoost hyperparameter tuning
# 

# In[372]:


model = CatBoostClassifier(
    l2_leaf_reg=3,
    iterations=1000,
    fold_len_multiplier=1.05,
    learning_rate=0.03,
    custom_loss=['Accuracy'],
    random_seed=100,
)

(thresholds, fpr) = get_fpr_curve(curve=curve)  # fpr is plotted below
(thresholds, fnr) = get_fnr_curve(curve=curve)
plt.figure(figsize=(16, 8))
lw = 2
plt.plot(thresholds, fpr, color='blue', lw=lw, label='FPR', alpha=0.5)
plt.plot(thresholds, fnr, color='green', lw=lw, label='FNR', alpha=0.5)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Error Rate', fontsize=16)
#plt.title('FPR-FNR curves', fontsize=20)
plt.legend(loc="lower left", fontsize=16)
plt.show()
# find threshold
from catboost.utils import select_threshold
print(select_threshold(model=model, data=eval_train_pool, FNR=0.2))
print(select_threshold(model=model, data=eval_train_pool, FPR=0.4))

# confusion matrix
from catboost.utils import get_confusion_matrix
print(get_confusion_matrix(model, data=eval_pool))

# result show
from catboost import Pool
test_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
model.get_all_params()  # params

model.eval_metrics(data=eval_pool, metrics='Recall')
model.score(test_pool)
result = model.predict_proba(eval_test_pool)
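The curve object passed to get_fpr_curve/get_fnr_curve above comes from catboost.utils.get_roc_curve; a minimal sketch, assuming the model and eval_pool from this example:

from catboost.utils import get_roc_curve, get_fpr_curve, get_fnr_curve

curve = get_roc_curve(model, eval_pool)          # (fpr, tpr, thresholds)
(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)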
Example 13
        continue
    X.append((dot.log, dot.lat, log(dot.trans_ts - b,
                                    a), log(dot.request_ts - b, c)))
    y.append(dot.label)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=42)

train_p = Pool(X_train, y_train)  # train on the training split rather than the full X, y
test_p = Pool(X_test, y_test)
decision = CatBoostClassifier(iterations=35,
                              learning_rate=1,
                              depth=10,
                              loss_function='MultiClass',
                              custom_metric='MultiClassOneVsAll',
                              best_model_min_trees=10000)
decision.fit(train_p)

print('Accuracy: \n', decision.score(test_p))
pred = decision.predict(TEST)
print(decision.feature_importances_)
plt.bar(np.arange(len(decision.feature_importances_)),
        decision.feature_importances_,
        color='black')
plt.show()

with open("answerboost2.txt", 'w') as f:
    for item in pred:
        f.write(f"{int(item)}\n")
#pl_clf = RandomForestClassifier(n_estimators=200,max_depth=300,
#                                                            n_jobs=-1, \
#                                                  verbose=True,random_state=RS)
#pl_clf = LGBMClassifier(n_estimators=200,max_depth=200,
#                        n_jobs=-1,
#                        silent=False,
#                        random_state=RS)
pl_clf = CatBoostClassifier(iterations=1500,
                        task_type="GPU",
                        depth=8,
                        learning_rate=0.1,
                        random_seed=RS)

pl_clf.fit(df_x,df_y,eval_set=(x_val,y_val));

print('Train score', pl_clf.score(df_x, df_y))
print('Validation score', pl_clf.score(x_val, y_val))

# let's save our model to use on the server
filename = 'model_recommend_cb01.pickle'
pickle.dump(pl_clf, open(filename, 'wb'))

# This is an example of how to load the saved model
# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# prediction = loaded_model.predict(X_test)
# print(prediction)

# let's save the model from Colaboratory to our PC
from google.colab import files
files.download('model_recommend_cb01.pickle')
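Pickle works, but CatBoost also has a native serialization format that sidesteps pickle compatibility issues; a minimal sketch with an illustrative file name:

pl_clf.save_model('model_recommend_cb01.cbm')   # native CatBoost binary format
loaded_clf = CatBoostClassifier()
loaded_clf.load_model('model_recommend_cb01.cbm')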
Example 15
    "Breed1",
    "Breed2",
    "Breed3",
    "Breed4",
    "Breed5",
    "Breed6",
    "Breed7",
    "Breed8",
    "Breed9",
    "Breed10",
    "Color-light",
    "Color-medium",
    "Color-dark",
    "Color-warm",
    "Color-medium",
    "Color-cold",
    "Color_feature1",
    "Color_feature2",
]

print(model.score(x_test, y_test))
plt.figure(num=None, figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')
plt.bar(range(len(model.get_feature_importance(prettified=False))),
        model.get_feature_importance(prettified=False))
plt.title("Cat Feature Importance")
plt.xticks(range(len(model.get_feature_importance(prettified=False))),
           features,
           rotation='vertical')
plt.gcf().savefig('feature_importance_catboost.png')
plt.show()

clf = CatBoostClassifier(
    eval_metric= 'Logloss',
    #eval_metric='F1',
    task_type= 'GPU',
    early_stopping_rounds= 100,
    #class_weights=[0.95,0.05],
    use_best_model= True,
    random_seed=RS,
    verbose= 10
)
#clf=CatBoostClassifier(iterations=300, random_seed=RS,learning_rate=0.1,
#                       class_weights='balanced',task_type="GPU",eval_metric=f1_score)

clf.fit(train_pool, eval_set=valid_pool,plot=True)
#clf.fit(X_train,y_train,cat_features=categorical_cols,text_features=text_cols)

print('Training set accuracy: {:.4f}'.format(clf.score(X_train,y_train)))

print('Validation set accuracy: {:.4f}'.format(clf.score(X_test,y_test)))

clf.feature_importances_

y_pred=clf.predict(X_test)
print(y_pred[:10])

print('F1 on the validation set: {:.4f}'.format(f1_score(y_test, y_pred)))

y_out=clf.predict(df_test[X_features])
print(y_out[0])

len(y_out)
Example 17
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

cat_featuresind = list(range(0, 11))

clf = CatBoostClassifier(iterations=10,
                         random_seed=rnd_state,
                         custom_metric='Accuracy')

clf.fit(X_train, y_train, cat_features=cat_featuresind, plot=True)

clf.score(X_test, y_test)

from sklearn.metrics import confusion_matrix
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)


def home(request):
    return render(request, 'predictpage.html', {"title": "Disease Predict"})


def predict(request):
    values = []  # avoid shadowing the built-in list
    comment = request.GET['menarchestarts1']
    data = int(comment)
Example 18
    def model_catboost(self, X, y, X_train, y_train, X_test, y_test,
                       categorical_features_indices, target, file):
        print("Processing CATBOOST....")

        # Added this: start
        train_pool = Pool(X_train,
                          y_train,
                          cat_features=categorical_features_indices)
        validate_pool = Pool(X_test,
                             y_test,
                             cat_features=categorical_features_indices)
        # end

        #         model=CatBoostClassifier(loss_function='MultiClass',use_best_model=True, random_seed=42)#, class_weights=[1,2,3,4,5,6,7,8,9,10,11])
        model = CatBoostClassifier(loss_function='MultiClass',
                                   eval_metric='TotalF1',
                                   use_best_model=True,
                                   random_seed=42,
                                   leaf_estimation_method='Newton')

        model.fit(train_pool,
                  eval_set=validate_pool,
                  use_best_model=True,
                  verbose=50,
                  plot=False,
                  early_stopping_rounds=100)

        # cross-validation
        cv_params = model.get_params()
        cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                     cv_params,
                     fold_count=10,
                     plot=False)
        print('Precise validation accuracy score: {}'.format(
            np.max(cv_data['test-TotalF1-mean'])))
        # end

        print("PRIMER prediccion")
        print()
        print(model)
        # make predictions
        expected_y = y_test
        predicted_y = model.predict(X_test)
        # summarize the fit of the model
        print()
        print(metrics.classification_report(expected_y, predicted_y))
        print()
        print(metrics.confusion_matrix(expected_y, predicted_y))

        print("SEGUNDO prediccion")
        print(model.best_iteration_, model.best_score_)
        print(model.evals_result_['validation']['MultiClass'][-10:])

        # prediction
        pred = model.predict(X_test)
        print("PREDICT")
        print(pred)

        print("print dataframe predictions:")
        cm = pd.DataFrame()
        #         cm['DAMAGE'] = y_test
        cm[target] = y_test
        cm['Predict'] = model.predict(X_test)
        print(cm)

        print("SCORES")
        print(model.score(X_test, y_test))
        cm.to_csv(file)  # , index=False)
        #         cm.to_csv("catboost_prediction.csv")#, index=False)

        # confusion matrix
        print("confusion matrix:")
        #         conf_mat = get_confusion_matrix(model, Pool(X_train, y_train, cat_features=categorical_features_indices))
        conf_mat = get_confusion_matrix(
            model,
            Pool(X_test, y_test, cat_features=categorical_features_indices))
        print(conf_mat)

        # feature selection
        print(model.get_feature_importance(prettified=True))
        # feature_importances = model.get_feature_importance(train_pool)
        # feature_names = X_train.columns
        # for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
        #     print('{}: {}'.format(name, score))
        ##

        return model, cv_data
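catboost.cv returns a DataFrame whose columns follow the pattern test-<metric>-mean, so the best fold-averaged TotalF1 from the cv_data above can be read with a sketch like this:

best_value = cv_data['test-TotalF1-mean'].max()
best_iter = cv_data['test-TotalF1-mean'].idxmax()
print('Best mean TotalF1 = {:.4f} on row {}'.format(best_value, best_iter))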
Example 19
# discrete_names=['AT_05','GENDER','teacherRel_cate']
# discrete_data=df[discrete_names]
# discrete_data['BYSID']=df['BYSID']
# categorical_data=df[categorical_names]
#
# categorical_data=pd.get_dummies(categorical_data.astype('str'))
# categorical_data['BYSID']=df['BYSID']
# data=pd.merge(discrete_data,categorical_data, on='BYSID')
# print(data)

target=df.iloc[:,-1]
data=df.iloc[:,1:-1]
X_trainval, X_test, y_trainval, y_test=train_test_split(data, target, random_state=7)
cb=CatBoostClassifier(iterations=8, learning_rate=0.1,depth=6,loss_function='MultiClass')
cb.fit(X_trainval,y_trainval)
print(cb.score(X_trainval,y_trainval))
print(cb.score(X_test, y_test))
fi=cb.feature_importances_
feat_importance=pd.Series(fi, index=data.columns,)
print(feat_importance)

# result=cb.predict(X_test.values[1])
# label='경영, 사무, 금융, 공공','미용, 여행, 음식','영업, 판매, 운송직','기술, 정비, 생산직'
# sizes=cb.predict_proba(predict_data)
# print(result[0][0])
# explode=dict_ex[result[0][0]-1]
# plt.figure(figsize=(14,7))
# plt.pie(sizes,explode=explode,labels=label,counterclock=False, autopct='%1.1f%%', shadow=True,startangle=90)
# plt.axis('equal')
#
# plt.legend(label, loc="right", bbox_transform=plt.gcf().transFigure)
Example 20
                          eval_set=_valid,
                          use_best_model=True,
                          verbose=200,
                          plot=True
                         )
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  auc = ", roc_auc_score(y_valid, pred) )
    y_valid_pred.iloc[valid_index] = pred
    y_test_pred += fit_model.predict_proba(test)[:,1]
    
y_test_pred /= n_split
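The head of this out-of-fold loop is cut off above; a minimal reconstruction of the usual StratifiedKFold pattern the tail implies (X, y, test, and the model settings are assumptions):

import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

n_split = 5
y_valid_pred = pd.Series(0.0, index=y.index)   # out-of-fold predictions
y_test_pred = 0.0
skf = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=42)
for train_index, valid_index in skf.split(X, y):
    X_tr, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_tr, y_valid = y.iloc[train_index], y.iloc[valid_index]
    _valid = Pool(X_valid, y_valid)
    fit_model = CatBoostClassifier(iterations=1000, use_best_model=True, verbose=200)
    fit_model.fit(X_tr, y_tr, eval_set=_valid)
    pred = fit_model.predict_proba(X_valid)[:, 1]
    print("  auc = ", roc_auc_score(y_valid, pred))
    y_valid_pred.iloc[valid_index] = pred
    y_test_pred += fit_model.predict_proba(test)[:, 1]

y_test_pred /= n_split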


modelCatBoost = modelCatBoost.fit(X_train, y_train)
print("Results For CatBoost")
scoreCatBoost=modelCatBoost.score(X_test, y_test)
print("\nScore", scoreCatBoost*100)
y_Pred_Cat = modelCatBoost.predict(X_train_sub)


#LightGBM
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

params = {
        'num_leaves': 8,
         'min_data_in_leaf': 42,
         'objective': 'binary',
         'max_depth': 16,
         'learning_rate': 0.0123,
         'boosting': 'gbdt',
Example 21
print(X_test.shape, y_test.shape)

X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE)
# MODEL

# Catboost
cat_features = [
    'race', 'sex', 'relationship', 'occupation', 'education', 'workclass'
]

cat = CatBoostClassifier(cat_features=cat_features, random_seed=RANDOM_STATE)
#cat.load_model('cat')

cat.fit(X_train_cat, y_train_cat)
print('CatBoost train score: {:.3f}'.format(cat.score(X_train_cat,
                                                      y_train_cat)))
print('CatBoost test score: {:.3f}'.format(cat.score(X_test_cat, y_test_cat)))
print(classification_report(y_test_cat, cat.predict(X_test_cat)))
#cat.save_model('cat',pool=X_train_cat)

# HistGradientBoostingClassifier
param_distributions_hgb = {
    'learning_rate': np.logspace(-3, -1, 25),
    'max_iter': np.arange(100, 300, 50),
    'min_samples_leaf': np.arange(10, 50, 10),
    'random_state': [RANDOM_STATE]
}
hgb = HistGradientBoostingClassifier()
hgb_CV = RandomizedSearchCV(hgb,
                            param_distributions=param_distributions_hgb,
                            cv=10,
Example 22
# classification
wine = load_wine()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,
                                                wine.target,
                                                test_size=0.3)
print(wine.keys())
categorical_features_indices = np.where(Xtrain.dtype != np.float64)[0]  # np.float was removed from NumPy
#catboost = CatBoostClassifier(iterations=100, depth=5, cat_features=categorical_features_indices, learning_rate=0.5, loss_function='Logloss', logging_level='Verbose')  # binary classification
catboost = CatBoostClassifier(iterations=10,
                              depth=5,
                              cat_features=categorical_features_indices,
                              learning_rate=0.5,
                              loss_function='MultiClass',
                              logging_level='Verbose')  # multiclass
catboost.fit(Xtrain, Ytrain)
score_r = catboost.score(Xtest, Ytest)

print("catboost:{}".format(score_r))
# save the model
joblib.dump(catboost, "catboost.model")
# load the model
catboost = joblib.load("catboost.model")

# cross-validation
catboost_score = cross_val_score(catboost, wine.data, wine.target,
                                 cv=10).mean()
print("10 folder val score: ", catboost_score)

# predict on the test set
y_pred = catboost.predict(Xtest)
predictions = [np.round(value) for value in y_pred]

clf = CatBoostClassifier(
    learning_rate= 0.2,
    depth=12,
    eval_metric= 'Logloss',  # logLoss
    task_type= 'CPU',
    early_stopping_rounds= 100,
    #class_weights=[0.95,0.05],
    use_best_model= True,
    random_seed=RS,
    verbose= 10
)

clf.fit(train_pool, eval_set=valid_pool,plot=True)

clf.save_model('cb_clf01.cbm',format='cbm',pool=train_pool)

clf.score(valid_pool)

X_test[1:2]

clf.predict(X_test[2:3])

!ls

from google.colab import files

files.download('cb_clf01.cbm')




# Define the categorical features for the CatBoost model
cat_features = np.where(x_train.dtypes != np.float64)[0]
# Use the CatBoost Pool() function to pool together the training data and categorical feature labels
train_pool = Pool(x_train, y_train, cat_features)

# CatBoost model definition
catboost_model = CatBoostClassifier(iterations=200,
                                    custom_loss=['Accuracy'],
                                    loss_function='Logloss')

# Fit CatBoost model
catboost_model.fit(train_pool)  #,plot=True)

# CatBoost accuracy
acc_catboost = round(catboost_model.score(x_train, y_train) * 100, 2)

# How long will this take?
start_time = time.time()

# Set params for cross-validation as same as initial model
cv_params = catboost_model.get_params()

# Run the cross-validation for 10-folds (same as the other models)
cv_data = cv(train_pool, cv_params, fold_count=10)  #,plot=True)

# How long did it take?
catboost_time = (time.time() - start_time)

# CatBoost CV results save into a dataframe (cv_data), let's withdraw the maximum accuracy score
acc_cv_catboost = round(np.max(cv_data['test-Accuracy-mean']) * 100, 2)
Example 25
    catIndicies = [len(masterList[0]) - 1]
    masterTrainList, masterTestList, zScoreTrainList, zScoreTestList, fiveDayChangeTrainList, fiveDayChangeTestList =\
    train_test_split(masterList,zScoreAnswer,fiveDayChangeAnswer,test_size = .3)

    trainPools = [
        Pool(data=masterTrainList,
             label=zScoreTrainList,
             cat_features=catIndicies),
        Pool(data=masterTrainList,
             label=fiveDayChangeTrainList,
             cat_features=catIndicies)
    ]
    testPools = [
        Pool(data=masterTestList,
             label=zScoreTestList,
             cat_features=catIndicies),
        Pool(data=masterTestList,
             label=fiveDayChangeTestList,
             cat_features=catIndicies)
    ]
    modelNames = ['ZScorePredictor', 'FiveDayPredictor']

    for name, train, test in zip(modelNames, trainPools, testPools):
        print(name)
        model = CatBoostClassifier()
        model.fit(train, eval_set=test, logging_level='Silent')
        sector = sector.replace(' ', '_')
        model.save_model(name + sector + '.mlmodel')
        print('Score: ', model.score(test))
        errorScoreCalculator(model, test, test.get_label())
from sklearn import metrics

from catboost import CatBoostClassifier

# initialize data
X_train = pd.read_csv("Train_Test_Data/X_train.csv")
X_test = pd.read_csv("Train_Test_Data/X_test.csv")
X_predict = pd.read_csv("Train_Test_Data/X_predict.csv")
y_train = pd.read_csv("Train_Test_Data/y_train.csv")
y_test = pd.read_csv("Train_Test_Data/y_test.csv")

X_train_a = X_train.drop(columns=['Number']).values
X_test_a = X_test.drop(columns=['Number']).values
X_predict_a = X_predict.drop(columns=['Number']).values
y_train_a = y_train.drop(columns=['Number']).values.flatten()
y_test_a = y_test.drop(columns=['Number']).values.flatten()

model = CatBoostClassifier(iterations=50,
                           bagging_temperature=2,
                           random_strength=10,
                           boosting_type='Ordered',
                           depth=9,
                           loss_function='Logloss',
                           logging_level='Verbose')
model.fit(X_train_a, y_train_a)

prediction = model.predict(X_test_a)

acc_catboost = round(model.score(X_test_a, y_test_a) * 100, 2)
metrics.accuracy_score(prediction, y_test_a)
Example 27
display_classification_report(y_test, y_pred)

_, axs = plt.subplots(1, 2,figsize=(10,5))
axs = axs.ravel()
plot_pr(y_test, y_pred, ax=axs[0], label="DecisionTreeClassifier")
plot_roc(y_test, y_pred, ax=axs[1], label="DecisionTreeClassifier")


# #### CatBoost

# In[32]:


cb = CatBoostClassifier(verbose=0, random_state=rnd_state).fit(X_train, y_train)
y_pred = cb.predict(X_test)
print(cb.score(X_train, y_train))
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
display_classification_report(y_test, y_pred)

_, axs = plt.subplots(1, 2,figsize=(10,5))
axs = axs.ravel()
plot_pr(y_test, y_pred, ax=axs[0], label="CatBoostClassifier")
plot_roc(y_test, y_pred, ax=axs[1], label="CatBoostClassifier")


# #### XGBoost

# In[33]:

Example 28
# In[154]:

categorical_features_indices2 = np.where(X2.dtypes != np.float64)[0]

# In[155]:

model2 = CatBoostClassifier()
model2.fit(X_train2,
           y_train2,
           cat_features=categorical_features_indices2,
           eval_set=(X_test2, y_test2))

# In[156]:

print('Accuracy of CatBoost classifier on training set: {:.2f}'.format(
    model2.score(X_train2, y_train2)))
print('Accuracy of CatBoost classifier on test set: {:.2f}'.format(
    model2.score(X_test2, y_test2)))

# In[157]:

model2.get_feature_importance()

# In[158]:

X2.columns

# In[183]:

X_test2.shape
Example 29
config = {
    'leaf_estimation_method': 'Gradient',
    'l2_leaf_reg': 2,
    'fold_len_multiplier': 1.2,
    'od_type': 'IncToDec',
    'train_dir': 'log'
}

# pass the parameters via dict unpacking
model = CatBoostClassifier(**config)

# train
model.fit(X_train, y_train, use_best_model=True, eval_set=[(X_valid, y_valid)], verbose=False, early_stopping_rounds=10)

# make the prediction using the resulting model
preds_class = model.predict(X_valid, prediction_type='Class')
score = model.score(X_valid, y_valid)
print(f'CatBoostClassifier accuracy is {score}')

# 4. LightGBM
# parameter settings
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',  # a dict keeps only one 'objective'; the duplicate 'multiclass' entry was overridden
    'n_estimators': 150,
    'random_state': 123,
    'num_leaves': 31,
    'learning_rate': 0.1,
}
gbm = LGBMClassifier(**params)
Example 30
print("VALIDATION : ", x_val.shape, " and ", y_val.shape)
print("MAIN TO PREDICT ", test.shape)

#Random Oversampling
ros = RandomOverSampler(random_state=0)
# fit_sample was removed from imbalanced-learn; fit_resample fits and resamples in one call
X_resampledo, y_resampledo = ros.fit_resample(x_train, y_train)
print(X_resampledo.shape, y_resampledo.shape)

#model_selection
catboost_pool = Pool(X_resampledo, y_resampledo)
cat_model = CatBoostClassifier(task_type='CPU', iterations=20000, learning_rate=0.03, early_stopping_rounds=5)
cat_model.fit(X_resampledo, y_resampledo, verbose=True, plot=False, eval_set=(x_val, y_val),)

#accuracy on test categories
print(cat_model.score(x_test,y_test))

#metrics and score
y_pred = cat_model.predict(x_test)
print("ACCURACY SCORE : ", accuracy_score(y_test, y_pred))
print("MAE : ",mean_absolute_error(y_test, y_pred))
print("MSE : ", mean_squared_error(y_test, y_pred))
print("LOG LOSS : ", log_loss(y_test, y_pred))
print("COHEN KAPPA : ", cohen_kappa_score(y_test, y_pred))

#uncomment next lines to generate new csv results.
'''
y_proba = cat_model.predict_proba(test)
result = pd.DataFrame(data=y_proba, index=test.index)
df.drop("0", axis=1, inplace=True)
df['id'] = df["Unnamed: 0"]