def CAT_test(train_x, train_y, val_x, val_y):
    """Train a CatBoost binary classifier and report validation results.

    Fits on (train_x, train_y) using (val_x, val_y) as the eval set, then
    returns the validation accuracy together with a DataFrame of feature
    importances ordered from most to least important.
    """
    import pandas as pd
    from catboost import CatBoostClassifier

    # Baseline hyper-parameters; early stopping via the 'Iter' overfitting
    # detector with a 30-round patience window.
    initial_params = {
        "verbose": 100,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "iterations": 1000,
        "random_seed": 42,
        "learning_rate": 0.02,
        "depth": 6,
        "thread_count": 16,
        "use_best_model": True,
        "od_type": 'Iter',
        "od_wait": 30,
    }

    clf = CatBoostClassifier(**initial_params)
    clf.fit(X=train_x, y=train_y, eval_set=(val_x, val_y), verbose_eval=100)

    # Pair every column with its importance score, highest first.
    ranked = sorted(
        zip(train_x.columns, clf.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    feature_importances = pd.DataFrame(
        [list(pair) for pair in ranked],
        columns=["features", "importance"],
    )
    return clf.score(val_x, val_y), feature_importances
def tdetect2(no, clf=None):
    """Train a theft detector for one customer and report sensitivity / FPR.

    Parameters
    ----------
    no : key into the module-level ``c_no`` mapping (customer index).
    clf : optional pre-built classifier. Bug fix: the argument used to be
        unconditionally overwritten with a fresh CatBoostClassifier; it is
        now honoured, and only defaulted when the caller omits it.

    Returns
    -------
    (sensitivity, fpr) computed on the oversampled test split.
    """
    customer_meter = c_no[no]
    X, y = ccnc2(no)
    if clf is None:
        clf = CatBoostClassifier(logging_level="Silent")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.14, random_state=0)
    sm = SMOTE(random_state=42)
    # fit_sample() was removed in imbalanced-learn 0.6; fit_resample() is
    # the supported, behaviour-identical replacement.
    X_res_train, y_res_train = sm.fit_resample(X_train, y_train)
    # NOTE(review): oversampling the *test* split inflates the reported
    # metrics; kept for parity with the original evaluation protocol.
    X_res_test, y_res_test = sm.fit_resample(X_test, y_test)
    clf.fit(X_res_train, y_res_train)
    score = clf.score(X_res_test, y_res_test)
    y_pred = clf.predict(X_res_test)
    probs = clf.predict_proba(X_res_test)
    preds = probs[:, 1]
    tn, fp, fn, tp = confusion_matrix(y_res_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    fpr = 1 - specificity
    print("sensi = %.2f" % sensitivity, "fpr= %.2f" % fpr)
    total = sensitivity
    print("The score for customer :", customer_meter, " is %.2f" % total)
    return sensitivity, fpr
def getCatBoost():
    """Fit a small CatBoost model on the module-level train split.

    Prints the test accuracy, draws a confusion-matrix heatmap for the
    test predictions, and returns the fitted model.
    """
    from catboost import CatBoostClassifier

    booster = CatBoostClassifier(
        iterations=10,
        learning_rate=1,
        depth=4,
        loss_function='Logloss',
        random_state=20,
    )
    booster.fit(X_train, y_train)
    print("score %s" % (booster.score(X_test, y_test)))

    predictions = booster.predict(X_test)
    sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt="d")
    return booster
def evaluate_cb(trainX, trainy, testX, testy, params):
    """Standardize features, fit a CatBoost classifier, and evaluate it.

    The scaler is fit on the training data only, then applied to the test
    data. Returns the fitted model, its test accuracy, and the predicted
    class probabilities for the test set.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(trainX)
    scaled_test = scaler.transform(testX)

    booster = CatBoostClassifier(**params)
    booster.fit(scaled_train, trainy)

    accuracy = booster.score(scaled_test, testy)
    probabilities = booster.predict_proba(scaled_test)
    return booster, accuracy, probabilities
def get_cat_score(X_train, y_train, X_test, y_test):
    """Score CatBoost on a held-out split and via 5-fold cross-validation.

    Logs both numbers to Neptune ('cat' and 'cat_cross_score') and returns
    the held-out test accuracy.
    """
    holdout_model = CatBoostClassifier(logging_level="Silent")
    cv_model = CatBoostClassifier(logging_level="Silent")

    np.random.seed(200)  # reproducible CV fold shuffling
    cross_score = np.mean(cross_val_score(cv_model, X_train, y_train, cv=5))

    holdout_model.fit(X_train, y_train)
    score_cat = holdout_model.score(X_test, y_test)

    neptune.log_metric('cat', score_cat)
    neptune.log_metric('cat_cross_score', cross_score)
    return score_cat
def main(args):
    """Score the saved LSTM + CatBoost pipeline on telemetry data.

    Loads the GBM feature table, rebuilds the LSTM from its checkpoint to
    produce sequence features, augments the GBM frame with them, and
    prints the CatBoost model's accuracy.
    """
    # get data
    X, y = get_gbm_database(args.telemetry_path, args.maint_path, args.machines_path, args.errors_path, args.failures_path, seq_len=args.out_seq_len, machine_id=args.machine_id, )
    # Trim the warm-up window at the start and the forecast horizon at the
    # end so the GBM rows align with the LSTM's sliding-window outputs.
    X_gbm = X.iloc[args.seq_len:-args.out_seq_len]
    y_target = y.iloc[args.seq_len:-args.out_seq_len]
    # Batch size = full row count, so the prediction loop below runs once.
    dm = TelemetryDataModule(path=args.telemetry_path, seq_len=args.seq_len, out_seq_len=args.out_seq_len, batch_size=X_gbm.shape[0], num_workers=args.num_workers,)
    # NOTE(review): "prodaction" [sic] matches TelemetryDataModule's own
    # API spelling — keep in sync with that class if ever renamed.
    dm.setup(stage="prodaction")
    X_lstm = dm.prodaction_dataset()
    # load models
    lstm = LSTM.load_from_checkpoint(checkpoint_path=args.checkpoint_path + '/lstm.ckpt', n_features=args.n_features, hidden_size=args.hidden_size, seq_len=args.seq_len, out_seq_len=args.out_seq_len, batch_size=X_gbm.shape[0], criterion=args.criterion, num_layers=args.num_layers, dropout=args.dropout, learning_rate=args.learning_rate, )
    lstm.freeze()  # inference only: disable gradients
    gbm = CatBoostClassifier()
    gbm.load_model(args.checkpoint_path + '/gbm.cbm')
    # prediction
    y_hat_lstm = None
    # Single full-size batch, so only one forward pass happens here.
    for (x, _) in X_lstm:
        y_hat_lstm = lstm(x)
    # Append the LSTM outputs as extra features for the GBM.
    X_gbm = get_lstm_feature(X_gbm, y_hat_lstm)
    score = gbm.score(X_gbm, y_target)
    print('Model accuracy: {0:.2f}%'.format(score*100))
def main(**args):
    """Fit CatBoost on a 10% slice of the Titanic training data.

    Hyper-parameters arrive via **args. Prints the negated test accuracy
    (negated so an external optimizer can minimize it), or 0 when
    training fails.
    """
    titanic_train, _ = titanic()
    titanic_train.fillna(-999, inplace=True)

    cols = [
        'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'
    ]
    train_sz = int(titanic_train.shape[0] * 0.1)
    x_train = titanic_train[:train_sz][cols]
    y_train = titanic_train[:train_sz]['Survived'].astype(int)
    x_test = titanic_train[train_sz:][cols]
    y_test = titanic_train[train_sz:]['Survived'].astype(int)

    try:
        model = CatBoostClassifier(random_seed=42, **args)
        # Columns 1, 2, 6, 8, 9 (Name, Sex, Ticket, Cabin, Embarked) are
        # the categorical features.
        model.fit(x_train, y_train, cat_features=[1, 2, 6, 8, 9], silent=True)
        accuracy = model.score(x_test, y_test)
        print(-accuracy)
    except Exception:
        # Bug fix: the bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit; Exception keeps the "report 0 on failure" contract
        # without blocking process shutdown.
        print(0)
def main():
    """Train the classifier selected via --alg on the success dataset.

    Reads the train/test CSV splits from DATA_SPLIT_ROOT, fits the
    requested model, and prints its test accuracy.
    """
    train_path = join(DATA_SPLIT_ROOT, 'train.csv')
    test_path = join(DATA_SPLIT_ROOT, 'test.csv')
    train = pd.read_csv(train_path, encoding='latin1', low_memory=True)
    test = pd.read_csv(test_path, encoding='latin1', low_memory=True)

    # 'success' is the binary target; everything else is a feature.
    train_features = train.drop(['success'], axis=1)
    train_targets = train['success']
    test_features = test.drop(['success'], axis=1)
    test_targets = test['success']

    parser = argparse.ArgumentParser()
    parser.add_argument('--alg', help='The training algorithm')
    args = parser.parse_args()

    if args.alg == 'CART':
        fitted = DecisionTreeClassifier().fit(train_features, train_targets)
        print("The CART accuracy is: ",
              fitted.score(test_features, test_targets) * 100, "%")
    elif args.alg == 'xgboost':
        xgb = XGBClassifier()
        fitted = xgb.fit(train_features, train_targets)
        print("The XGBoost accuracy is: ",
              fitted.score(test_features, test_targets) * 100, "%")
        plot_importance(xgb)
        plt.show()
    elif args.alg == 'rf':
        fitted = RandomForestClassifier().fit(train_features, train_targets)
        print("The Random Forest accuracy is: ",
              fitted.score(test_features, test_targets) * 100, "%")
    elif args.alg == 'catboost':
        fitted = CatBoostClassifier().fit(train_features, train_targets)
        print("The Cat Boost accuracy is: ",
              fitted.score(test_features, test_targets) * 100, "%")
class CatBoost(ClassifierAbstract):
    """Adapter exposing CatBoostClassifier through the project's
    ClassifierAbstract interface (fit / predict / evaluate)."""

    def __init__(self, **kwargs):
        # Hyper-parameters arrive namespaced with a `cat_boost_` prefix.
        self.iterations = kwargs['cat_boost_iterations']
        self.depth = kwargs['cat_boost_depth']
        self.learning_rate = kwargs['cat_boost_learning_rate']
        self.loss_function = kwargs['cat_boost_loss_function']
        self.name = 'CatBoost'
        self.model = CatBoostClassifier(
            iterations=self.iterations,
            depth=self.depth,
            learning_rate=self.learning_rate,
            loss_function=self.loss_function,
            verbose=True,
        )

    def fit(self, x_train, y_train):
        """Train the wrapped CatBoost model."""
        self.model.fit(x_train, y_train)

    def predict(self, x_test):
        """Return class predictions for x_test."""
        return self.model.predict(x_test)

    def evaluate(self, x_test, y_test):
        """Return mean accuracy on (x_test, y_test)."""
        return self.model.score(x_test, y_test)
# LIGHTGBM lgb = lgbm.LGBMClassifier() lgb.fit(x_train, y_train) lgb_pred = lgb.predict(x_val) lgb_pred_p = lgb.predict_proba(x_val) lgb.score(x_val, y_val) # 0.7896666666666666 cross_val_score(lgb, x_val, y_val).mean() #0.7824333333333333 # Catboost cat = CatBoostClassifier() cat.fit(x_train, y_train) cat_pred = cat.predict(x_val) cat.score(x_val, y_val) cross_val_score(cat, x_val, y_val).mean() #0.7828333333333334 # most voting temp = pd.DataFrame({'gbc': pred, 'lgbm': lgb_pred, 'cat': cat_pred}) result_survival = np.argmax((pred_p + lgb_pred_p) / 2, axis=1) result_survival submission = pd.read_csv( 'C:/Users/10188/local_git/tabular-playground-series-apr-2021/sample_submission.csv' ) submission['Survived'] = temp submission.to_csv( 'C:/Users/10188/local_git/tabular-playground-series-apr-2021/submission_files/20210413_GBC_lgbm_cat_freqvoting.csv',
plt.rcParams["figure.figsize"] = (502,7)
# NOTE(review): a 502-inch-wide figure looks like a typo (5 or 5.2?).
ax = feature_score.plot('Feature', 'Score', kind='bar', color='r')
ax.set_title("Catboost Feature Importance Ranking", fontsize = 6)
ax.set_xlabel('')

# Annotate each bar with its rounded importance score.
rects = ax.patches
labels = feature_score['Score'].round(2)
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 0.35, label, ha='center', va='bottom')

plt.show()


# In[ ]:

model.score(P_test, y_test)


# CatBoost model hyper-parameter tuning
#
# In[372]:

# NOTE(review): this constructor call continues beyond this chunk.
model = CatBoostClassifier(
    l2_leaf_reg = 3,
    iterations = 1000,
    fold_len_multiplier = 1.05,
    learning_rate = 0.03,
    custom_loss = ['Accuracy'],
    random_seed = 100,
# Threshold selection and model inspection (notebook fragment).
# Bug fix: `Pool`, `select_threshold` and `get_confusion_matrix` were
# imported AFTER their first use in this chunk (a NameError when run
# top-to-bottom as a script); the imports are hoisted above all uses.
from catboost import Pool
from catboost.utils import select_threshold, get_confusion_matrix

(thresholds, fnr) = get_fnr_curve(curve=curve)

# FPR / FNR as a function of the decision threshold.
plt.figure(figsize=(16, 8))
lw = 2
plt.plot(thresholds, fpr, color='blue', lw=lw, label='FPR', alpha=0.5)
plt.plot(thresholds, fnr, color='green', lw=lw, label='FNR', alpha=0.5)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Error Rate', fontsize=16)
#plt.title('FPR-FNR curves', fontsize=20)
plt.legend(loc="lower left", fontsize=16)
plt.show()

# find thresholds achieving the requested FNR / FPR
print(select_threshold(model=model, data=eval_train_pool, FNR=0.2))
print(select_threshold(model=model, data=eval_train_pool, FPR=0.4))

# confusion matrix on the evaluation pool
print(get_confusion_matrix(model, data=eval_pool))

# result show
test_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
model.get_all_params()  # params
model.eval_metrics(data=eval_pool, metrics='Recall')
model.score(test_pool)
result = model.predict_proba(eval_test_pool)
        # NOTE(review): this fragment begins inside a data-loading loop whose
        # header (and the `a`, `b`, `c` log bases/offsets) lie outside this chunk.
        continue
    X.append((dot.log, dot.lat, log(dot.trans_ts - b, a), log(dot.request_ts - b, c)))
    y.append(dot.label)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# NOTE(review): train_p is built from the FULL X/y, so the held-out test
# rows are also in the training pool — the accuracy below is optimistic.
train_p = Pool(X, y)
test_p = Pool(X_test, y_test)

decision = CatBoostClassifier(iterations=35, learning_rate=1, depth=10, loss_function='MultiClass', custom_metric='MultiClassOneVsAll', best_model_min_trees=10000)
decision.fit(train_p)
print('Accuracy: \n', decision.score(test_p))

pred = decision.predict(TEST)
print(decision.feature_importances_)

plt.bar(np.arange(len(decision.feature_importances_)), decision.feature_importances_, color='black')
plt.show()

# Write one predicted label per line.
with open("answerboost2.txt", 'w') as f:
    for item in pred:
        f.write(f"{int(item)}\n")
#pl_clf = RandomForestClassifier(n_estimators=200,max_depth=300, # n_jobs=-1, \ # verbose=True,random_state=RS) #pl_clf = LGBMClassifier(n_estimators=200,max_depth=200, # n_jobs=-1, # silent=False, # random_state=RS) pl_clf = CatBoostClassifier(iterations=1500, task_type="GPU", depth=8, learning_rate=0.1, random_seed=RS) pl_clf.fit(df_x,df_y,eval_set=(x_val,y_val)); print('Скор на трейне',pl_clf.score(df_x, df_y)) print('Скор по валидации',pl_clf.score(x_val, y_val)) #let s save our model to use on server filename = 'model_recommend_cb01.pickle' pickle.dump(pl_clf, open(filename, 'wb')) # This is example of code how to load saved model # load the model from disk # loaded_model = pickle.load(open(filename, 'rb')) # prediction = loaded_model.predict(X_test) # print(prediction) #let s save model from colaboratory oto our PC from google.colab import files files.download('model_recommend_cb01.pickle')
    "Breed1", "Breed2", "Breed3", "Breed4", "Breed5",
    "Breed6", "Breed7", "Breed8", "Breed9", "Breed10",
    "Color-light", "Color-medium", "Color-dark",
    # NOTE(review): "Color-medium" appears twice in this label list.
    "Color-warm", "Color-medium", "Color-cold",
    "Color_feature1", "Color_feature2",
]
# NOTE(review): the strings above are the tail of a feature-label list
# whose opening lies outside this chunk.

print(model.score(x_test, y_test))

# Bar chart of CatBoost feature importances, labelled with the list above.
plt.figure(num=None, figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')
plt.bar(range(len(model.get_feature_importance(prettified=False))), model.get_feature_importance(prettified=False))
plt.title("Cat Feature Importance")
plt.xticks(range(len(model.get_feature_importance(prettified=False))), features, rotation='vertical')
plt.gcf().savefig('feature_importance_catboost.png')
plt.show()
eval_metric= 'Logloss', #eval_metric='F1', task_type= 'GPU', early_stopping_rounds= 100, #class_weights=[0.95,0.05], use_best_model= True, random_seed=RS, verbose= 10 ) #clf=CatBoostClassifier(iterations=300, random_seed=RS,learning_rate=0.1, # class_weights='balanced',task_type="GPU",eval_metric=f1_score) clf.fit(train_pool, eval_set=valid_pool,plot=True) #clf.fit(X_train,y_train,cat_features=categorical_cols,text_features=text_cols) print('Правильность на обучающей выборке: {:.4f}'.format(clf.score(X_train,y_train))) print('Правильность на валидационной выборке: {:.4f}'.format(clf.score(X_test,y_test))) clf.feature_importances_ y_pred=clf.predict(X_test) print(y_pred[:10]) print('Метрика ф1 на валидационной выборке: {:.4f}'.format(f1_score(y_pred,y_test))) y_out=clf.predict(df_test[X_features]) print(y_out[0]) len(y_out)
# Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) cat_featuresind = list(range(0, 11)) clf = CatBoostClassifier(iterations=10, random_seed=rnd_state, custom_metric='Accuracy') clf.fit(X_train, y_train, cat_features=cat_featuresind, plot=True) clf.score(X_test, y_test) from sklearn.metrics import confusion_matrix y_pred = clf.predict(X_test) cm = confusion_matrix(y_test, y_pred) def home(request): return render(request, 'predictpage.html', {"title": "Desease Predict"}) def predict(request): list = [] comment = request.GET['menarchestarts1'] data = int(comment)
def model_catboost(self, X, y, X_train, y_train, X_test, y_test, categorical_features_indices, target, file):
    """Train, cross-validate and report a multi-class CatBoost model.

    Fits on the train split with the test split as the eval set, runs
    10-fold CV on the full data, prints classification reports and
    confusion matrices, and writes per-row predictions to `file`.

    NOTE(review): the trailing `return model, cv_data` is commented out
    in the original, so callers currently receive None.
    """
    print("Processing CATBOOST....")
    # Wrap both splits in Pools so categorical columns are handled natively.
    train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
    validate_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
    # Earlier configuration kept for reference:
    # model=CatBoostClassifier(loss_function='MultiClass',use_best_model=True, random_seed=42)#, class_weights=[1,2,3,4,5,6,7,8,9,10,11])
    model = CatBoostClassifier(loss_function='MultiClass', eval_metric='TotalF1', use_best_model=True, random_seed=42, leaf_estimation_method='Newton')
    model.fit(train_pool, eval_set=validate_pool, use_best_model=True, verbose=50, plot=False, early_stopping_rounds=100)
    # cross-validation over the FULL dataset (train + test)
    cv_params = model.get_params()
    cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), cv_params, fold_count=10, plot=False)
    # NOTE(review): np.max over the whole CV frame mixes every metric column;
    # np.max(cv_data['test-TotalF1-mean']) was probably intended.
    print('Precise validation accuracy score: {}'.format(np.max(cv_data)))  # ['TotalF1']
    print("PRIMER prediccion")
    print()
    print(model)
    # make predictions
    expected_y = y_test
    predicted_y = model.predict(X_test)
    # summarize the fit of the model
    print()
    print(metrics.classification_report(expected_y, predicted_y))
    print()
    print(metrics.confusion_matrix(expected_y, predicted_y))
    print("SEGUNDO prediccion")
    print(model.best_iteration_, model.best_score_)
    print(model.evals_result_['validation']['MultiClass'][-10:])
    # prediction
    pred = model.predict(X_test)
    print("PREDICT")
    print(pred)
    print("print dataframe predictions:")
    # Per-row actual vs. predicted table, persisted to `file`.
    cm = pd.DataFrame()
    # cm['DAMAGE'] = y_test
    cm[target] = y_test
    cm['Predict'] = model.predict(X_test)
    print(cm)
    print("SCORES")
    print(model.score(X_test, y_test))
    cm.to_csv(file)  # , index=False)
    # cm.to_csv("catboost_prediction.csv")#, index=False)
    # confusion matrix
    print("confusion matrix:")
    # conf_mat = get_confusion_matrix(model, Pool(X_train, y_train, cat_features=categorical_features_indices))
    conf_mat = get_confusion_matrix(
        model, Pool(X_test, y_test,
                    cat_features=categorical_features_indices))
    print(conf_mat)
    # feature selection
    print(model.get_feature_importance(prettified=True))
    # feature_importances = model.get_feature_importance(train_pool)
    # feature_names = X_train.columns
    # for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    #     print('{}: {}'.format(name, score))
    ## return model, cv_data
# discrete_names=['AT_05','GENDER','teacherRel_cate'] # discrete_data=df[discrete_names] # discrete_data['BYSID']=df['BYSID'] # categorical_data=df[categorical_names] # # categorical_data=pd.get_dummies(categorical_data.astype('str')) # categorical_data['BYSID']=df['BYSID'] # data=pd.merge(discrete_data,categorical_data, on='BYSID') # print(data) target=df.iloc[:,-1] data=df.iloc[:,1:-1] X_trainval, X_test, y_trainval, y_test=train_test_split(data, target, random_state=7) cb=CatBoostClassifier(iterations=8, learning_rate=0.1,depth=6,loss_function='MultiClass') cb.fit(X_trainval,y_trainval) print(cb.score(X_trainval,y_trainval)) print(cb.score(X_test, y_test)) fi=cb.feature_importances_ feat_importance=pd.Series(fi, index=data.columns,) print(data) # result=cb.predict(X_test.values[1]) # label='경영, 사무, 금융, 공공','미용, 여행, 음식','영업, 판매, 운송직','기술, 정비, 생산직' # sizes=cb.predict_proba(predict_data) # print(result[0][0]) # explode=dict_ex[result[0][0]-1] # plt.figure(figsize=(14,7)) # plt.pie(sizes,explode=explode,labels=label,counterclock=False, autopct='%1.1f%%', shadow=True,startangle=90) # plt.axis('equal') # # plt.legend(label, loc="right", bbox_transform=plt.gcf().transFigure)
        eval_set=_valid,
        use_best_model=True,
        verbose=200,
        plot=True
)
# NOTE(review): the lines above are the tail of a fit(...) call whose
# opening — and the surrounding CV-fold loop — lie outside this chunk.

# Out-of-fold AUC for this fold, then accumulate OOF / test predictions.
pred = fit_model.predict_proba(X_valid)[:,1]
print( " auc = ", roc_auc_score(y_valid, pred) )
y_valid_pred.iloc[valid_index] = pred
y_test_pred += fit_model.predict_proba(test)[:,1]

# Average the per-fold test predictions.
y_test_pred /= n_split

modelCatBoost = modelCatBoost.fit(X_train, y_train)
print("Results For CatBoost")
scoreCatBoost=modelCatBoost.score(X_test, y_test)
print("\nScore", scoreCatBoost*100)
y_Pred_Cat = modelCatBoost.predict(X_train_sub)

#LightGBM
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

# LightGBM hyper-parameters (this dict continues beyond the chunk).
params = {
    'num_leaves': 8,
    'min_data_in_leaf': 42,
    'objective': 'binary',
    'max_depth': 16,
    'learning_rate': 0.0123,
    'boosting': 'gbdt',
print(X_test.shape, y_test.shape)

# Separate split for CatBoost (it consumes the raw, un-encoded frame).
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE)

# MODEL

# Catboost
# Columns CatBoost should treat as categorical.
cat_features = [
    'race', 'sex', 'relationship', 'occupation', 'education', 'workclass'
]
cat = CatBoostClassifier(cat_features=cat_features, random_seed=RANDOM_STATE)
#cat.load_model('cat')
cat.fit(X_train_cat, y_train_cat)
print('CatBoost train score: {:.3f}'.format(cat.score(X_train_cat, y_train_cat)))
print('CatBoost test score: {:.3f}'.format(cat.score(X_test_cat, y_test_cat)))
print(classification_report(y_test_cat, cat.predict(X_test_cat)))
#cat.save_model('cat',pool=X_train_cat)

# HistGradientBoostingClassifier
# Randomized-search space for the sklearn histogram GBM.
param_distributions_hgb = {
    'learning_rate': np.logspace(-3, -1, 25),
    'max_iter': np.arange(100, 300, 50),
    'min_samples_leaf': np.arange(10, 50, 10),
    'random_state': [RANDOM_STATE]
}
hgb = HistGradientBoostingClassifier()
# NOTE(review): this call continues beyond this chunk.
hgb_CV = RandomizedSearchCV(hgb,
                            param_distributions=param_distributions_hgb,
                            cv=10,
# Classification demo on the sklearn wine dataset.
wine = load_wine()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)
# NOTE(review): this prints the bound dict method; wine.keys() was likely intended.
print(wine.items)

# wine.data is a single-dtype float ndarray, so this yields no categorical
# columns. Fixes: `np.float` was removed in NumPy 1.24 (use builtin `float`),
# and calling np.where on a 0-d/scalar condition is disallowed in modern
# NumPy, so the comparison is lifted to a 1-d array first.
categorical_features_indices = np.where(np.atleast_1d(Xtrain.dtype != float))[0]

# Multi-class CatBoost model (a binary Logloss variant was used previously).
catboost = CatBoostClassifier(iterations=10, depth=5,
                              cat_features=categorical_features_indices,
                              learning_rate=0.5, loss_function='MultiClass',
                              logging_level='Verbose')
catboost.fit(Xtrain, Ytrain)
score_r = catboost.score(Xtest, Ytest)
print("catboost:{}".format(score_r))

# Persist and reload the model.
joblib.dump(catboost, "catboost.model")
catboost = joblib.load("catboost.model")

# 10-fold cross-validation on the full dataset.
catboost_score = cross_val_score(catboost, wine.data, wine.target, cv=10).mean()
print("10 folder val score: ", catboost_score)

# Predict on the held-out split.
y_pred = catboost.predict(Xtest)
predictions = [np.round(value) for value in y_pred]
        learning_rate= 0.2,
        depth=12,
        eval_metric= 'Logloss', # logLoss
        task_type= 'CPU',
        early_stopping_rounds= 100,
        #class_weights=[0.95,0.05],
        use_best_model= True,
        random_seed=RS,
        verbose= 10
)
# NOTE(review): the lines above are the tail of a CatBoostClassifier(...)
# call whose opening lies outside this chunk.

clf.fit(train_pool, eval_set=valid_pool,plot=True)

# Persist the trained model in CatBoost's native format.
clf.save_model('cb_clf01.cbm',format='cbm',pool=train_pool)

clf.score(valid_pool)  # NOTE(review): value discarded outside a notebook
X_test[1:2]
clf.predict(X_test[2:3])

# NOTE(review): `!ls` is IPython/Colab shell magic, not valid plain Python.
!ls

# Download the saved model from Colab to the local machine.
from google.colab import files
files.download('cb_clf01.cbm')
# Indices of non-float columns — these are the categorical features.
# Fix: `np.float` was a deprecated alias removed in NumPy 1.24; the
# builtin `float` compares identically against pandas dtypes.
cat_features = np.where(x_train.dtypes != float)[0]

# Use the CatBoost Pool() function to pool together the training data and
# categorical feature labels.
train_pool = Pool(x_train, y_train, cat_features)

# CatBoost model definition
catboost_model = CatBoostClassifier(iterations=200,
                                    custom_loss=['Accuracy'],
                                    loss_function='Logloss')

# Fit CatBoost model
catboost_model.fit(train_pool)  # ,plot=True)

# Training-set accuracy as a percentage, two decimals.
acc_catboost = round(catboost_model.score(x_train, y_train) * 100, 2)

# How long will this take?
start_time = time.time()

# Set params for cross-validation as same as initial model
cv_params = catboost_model.get_params()

# Run the cross-validation for 10-folds (same as the other models)
cv_data = cv(train_pool, cv_params, fold_count=10)  # ,plot=True)

# How long did it take?
catboost_time = (time.time() - start_time)

# Best mean CV accuracy across boosting iterations, as a percentage.
acc_cv_catboost = round(np.max(cv_data['test-Accuracy-mean']) * 100, 2)
# The last column of each row holds the (categorical) sector feature.
catIndicies = [len(masterList[0]) - 1]

# One shared split so the two targets stay row-aligned with the features.
masterTrainList, masterTestList, zScoreTrainList, zScoreTestList, \
    fiveDayChangeTrainList, fiveDayChangeTestList = \
    train_test_split(masterList, zScoreAnswer, fiveDayChangeAnswer, test_size=.3)

trainPools = [
    Pool(data=masterTrainList, label=zScoreTrainList, cat_features=catIndicies),
    Pool(data=masterTrainList, label=fiveDayChangeTrainList, cat_features=catIndicies)
]
testPools = [
    Pool(data=masterTestList, label=zScoreTestList, cat_features=catIndicies),
    Pool(data=masterTestList, label=fiveDayChangeTestList, cat_features=catIndicies)
]

modelNames = ['ZScorePredictor', 'FiveDayPredictor']
# Filenames use underscores instead of spaces; hoisted out of the loop
# since the replacement is idempotent.
sector = sector.replace(' ', '_')
for name, train, test in zip(modelNames, trainPools, testPools):
    # Bug fix: previously printed the whole modelNames list each
    # iteration instead of the current model's name.
    print(name)
    model = CatBoostClassifier()
    model.fit(train, eval_set=test, logging_level='Silent')
    model.save_model(name + sector + '.mlmodel')
    print('Score: ', model.score(test))
    errorScoreCalculator(model, test, test.get_label())
from sklearn import metrics
from catboost import CatBoostClassifier

# initialize data: prepared train / test / prediction CSV splits
X_train = pd.read_csv("Train_Test_Data/X_train.csv")
X_test = pd.read_csv("Train_Test_Data/X_test.csv")
X_predict = pd.read_csv("Train_Test_Data/X_predict.csv")
y_train = pd.read_csv("Train_Test_Data/y_train.csv")
y_test = pd.read_csv("Train_Test_Data/y_test.csv")

# Drop the 'Number' row-identifier column and convert to plain arrays.
X_train_a = X_train.drop(columns=['Number']).values
X_test_a = X_test.drop(columns=['Number']).values
X_predict_a = X_predict.drop(columns=['Number']).values
y_train_a = y_train.drop(columns=['Number']).values.flatten()
y_test_a = y_test.drop(columns=['Number']).values.flatten()

# Ordered boosting, depth-9 trees, binary Logloss objective.
model = CatBoostClassifier(iterations=50, bagging_temperature=2, random_strength=10, boosting_type='Ordered', depth=9, loss_function='Logloss', logging_level='Verbose')
model.fit(X_train_a, y_train_a)

prediction = model.predict(X_test_a)
# Test accuracy as a percentage with two decimals.
acc_catboost = round(model.score(X_test_a, y_test_a) * 100, 2)
# NOTE(review): the value below is discarded outside a notebook; the
# arguments are also in (y_pred, y_true) order, though accuracy is
# symmetric so the result is unaffected.
metrics.accuracy_score(prediction, y_test_a)
display_classification_report(y_test, y_pred)

# Precision-recall and ROC curves for the decision tree, side by side.
_, axs = plt.subplots(1, 2,figsize=(10,5))
axs = axs.ravel()
plot_pr(y_test, y_pred, ax=axs[0], label="DecisionTreeClassifier")
plot_roc(y_test, y_pred, ax=axs[1], label="DecisionTreeClassifier")


# #### CatBoost

# In[32]:

# Silent CatBoost fit; rnd_state is defined earlier in the notebook.
cb = CatBoostClassifier(verbose=0, random_state=rnd_state).fit(X_train, y_train)
y_pred = cb.predict(X_test)
print(cb.score(X_train, y_train))      # training accuracy
print(accuracy_score(y_test, y_pred))  # test accuracy
print(confusion_matrix(y_test, y_pred))
display_classification_report(y_test, y_pred)

# Same PR / ROC pair of plots for CatBoost.
_, axs = plt.subplots(1, 2,figsize=(10,5))
axs = axs.ravel()
plot_pr(y_test, y_pred, ax=axs[0], label="CatBoostClassifier")
plot_roc(y_test, y_pred, ax=axs[1], label="CatBoostClassifier")


# #### XGBoost

# In[33]:
# In[154]:

# Treat every non-float column as categorical.
# Fix: `np.float` was a deprecated alias removed in NumPy 1.24; the
# builtin `float` compares identically against pandas dtypes.
categorical_features_indices2 = np.where(X2.dtypes != float)[0]


# In[155]:

model2 = CatBoostClassifier()
model2.fit(X_train2, y_train2, cat_features=categorical_features_indices2,
           eval_set=(X_test2, y_test2))


# In[156]:

print('Accuracy of CatBoost classifier on training set: {:.2f}'.format(
    model2.score(X_train2, y_train2)))
print('Accuracy of CatBoost classifier on test set: {:.2f}'.format(
    model2.score(X_test2, y_test2)))


# In[157]:

model2.get_feature_importance()


# In[158]:

X2.columns


# In[183]:

X_test2.shape
    'leaf_estimation_method': 'Gradient',
    'l2_leaf_reg': 2,
    'fold_len_multiplier': 1.2,
    'od_type': 'IncToDec',
    'train_dir': 'log'
}
# NOTE(review): the entries above are the tail of a `config` dict whose
# opening lies outside this chunk.

# Pass the configuration via ** unpacking.
model = CatBoostClassifier(**config)

# train
model.fit(X_train, y_train, use_best_model=True, eval_set=[(X_valid, y_valid)], verbose=False, early_stopping_rounds=10)

# make the prediction using the resulting model
preds_class = model.predict(X_valid, prediction_type='Class')
score = model.score(X_valid, y_valid)
print(f'CatBoostClassifier accuracy is {score}')

# 4 lightGBM
# Parameter settings.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'n_estimators': 150,
    'random_state': 123,
    # NOTE(review): duplicate 'objective' key — this entry silently
    # overrides 'multiclass' above.
    'objective': 'binary',
    'num_leaves': 31,
    'learning_rate': 0.1,
}
gbm = LGBMClassifier(**params)
print("VALIDATION : ", x_val.shape, " and ", y_val.shape)
print("MAIN TO PREDICT ", test.shape)

#Random Oversampling
ros = RandomOverSampler(random_state=0)
# NOTE(review): the fit() below is redundant — fit_sample() refits anyway;
# also fit_sample() was removed in imbalanced-learn 0.6 (use fit_resample()).
ros.fit(x_train, y_train)
X_resampledo, y_resampledo = ros.fit_sample(x_train, y_train)
print(X_resampledo.shape, y_resampledo.shape)

#model_selection
# NOTE(review): this pool is built but never used below.
catboost_pool = Pool(X_resampledo, y_resampledo)
cat_model = CatBoostClassifier(task_type='CPU', iterations=20000, learning_rate=0.03, early_stopping_rounds=5)
cat_model.fit(X_resampledo, y_resampledo, verbose=True, plot=False, eval_set=(x_val, y_val),)

#accuracy on test categories
print(cat_model.score(x_test,y_test))

#metrics and score
y_pred = cat_model.predict(x_test)
print("ACCURACY SCORE : ", accuracy_score(y_test, y_pred))
print("MAE : ",mean_absolute_error(y_test, y_pred))
print("MSE : ", mean_squared_error(y_test, y_pred))
# NOTE(review): log_loss expects probabilities; passing hard labels
# inflates the reported loss.
print("LOG LOSS : ", log_loss(y_test, y_pred))
print("COHEN KAPPA : ", cohen_kappa_score(y_test, y_pred))

#uncomment next lines to generate new csv results.
'''
y_proba = cat_model.predict_proba(test)
result = pd.DataFrame(data=y_proba, index=test.index)
df.drop("0", axis=1, inplace=True)
df['id'] = df["Unnamed: 0"]