Without much preprocessing and parameter tuning a simple LGBMClassifier should work decently. """ # Split training testing data enc = LabelEncoder() label_encoded = enc.fit_transform(label) X_train, X_test, y_train, y_test = train_test_split(tsne_data, label_encoded, random_state=3) # Create the model lgbm = LGBMClassifier(n_estimators=500, random_state=3) lgbm = lgbm.fit(X_train, y_train) # Test the model score = accuracy_score(y_true=y_test, y_pred=lgbm.predict(X_test)) print('Accuracy on testset:\t{:.4f}\n'.format(score)) """With a basic untuned model the **activity of the smartphone user** can be predicted with an **accuracy of 95%.**<br> This is pretty striking regarding six equally distributed labels. **Summary:**<br> If the smartphone or an App wants to know what you are doing, this is feasible. ## <a id=5>Participant Exploration</a> ### <a id=5.1>How Good Are the Participants Separable?</a> As we have seen in the second t-SNE plot the separability of the participants seem to vary regarding their activity. Let us investigate this a little bit by fitting the same basic model to the data of each activity separately. """ # Store the data data = []
random_state=19, max_depth=4, num_leaves=30, objective='binary', learning_rate=0.01, colsample_bytree=1, subsample=1, verbose=-1) } stack_train, stack_test = stack(k=5, models=models, train_X=train_X, train_y=train_y, test_X=test_X) # Main training Process lgb_stack = LGBMClassifier(n_estimators=2000, silent=False, random_state=19, max_depth=4, num_leaves=20, objective='binary', learning_rate=0.005, colsample_bytree=1, subsample=1, verbose=-1).fit(stack_train, train.label) stack_pred = lgb_stack.predict(stack_test) pd.DataFrame({ "a": stack_pred }).to_csv(PATH + "\submission.csv", header=None, index=None) # save submission to PATH
# Labels come from the loaded dataset; `x`, `dataset`, `tts` (train_test_split
# alias) and the metric imports are defined earlier in the file.
y = dataset.target
# 80/20 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = tts(x, y, train_size=0.8, random_state=66)

lgbm = LGBMClassifier(n_estimators=100, learning_rate=0.1, n_jobs=-1)
# Early stopping after 20 stagnant rounds on the eval sets.
# Other eval_metric options: rmse, mae, logloss, error, auc
lgbm.fit(x_train, y_train, verbose=True,
         eval_metric=["logloss", "rmse"],
         eval_set=[(x_train, y_train), (x_test, y_test)],
         early_stopping_rounds=20)

y_pre = lgbm.predict(x_test)
# NOTE(review): r2_score on hard class labels is unusual for a classifier —
# confirm this is intentional and not meant to use probabilities.
r2 = r2_score(y_test, y_pre)
score = lgbm.score(x_test, y_test)
print(__file__)
print("r2")
print(r2)
print("score")
print(score)

# 6) SelectFromModel: candidate thresholds are the sorted feature
# importances; `idx_max`/`max` track the best result in the sweep below.
thresholds = np.sort(lgbm.feature_importances_)
idx_max = -1
max = r2  # NOTE(review): shadows the builtin `max`; later (out-of-view) code appears to read this name, so it is kept
# Bayesian optimisation over the LightGBM hyper-parameter space; the
# evaluation function and param cleaning live in the `lt` helper module.
search_space = {
    'min_child_weight': (0.01, 1),
    'learning_rate': (1, 10),
    'max_depth': (-1, 15),
    'num_leaves': (5, 50),
}
lgbmBO = BayesianOptimization(lt.lgbm_evaluate, search_space, random_state=3)
lgbmBO.maximize(init_points=3, n_iter=10)

# In[24]:
# Refit with the best parameters found by the optimiser, then score on the
# held-out PCI split.
params = lt.clean_param(lgbmBO.res['max']['max_params'])
lgbm_model = LGBMClassifier(**params)
lgbm_model.fit(x_pci_train, y_pci_train)
y_pci_pred = lgbm_model.predict(x_pci_test)
predictions = [round(value) for value in y_pci_pred]
accuracy = accuracy_score(y_pci_test, predictions)
print(accuracy)

# In[13]:
# Refit once more with a hand-picked parameter set for comparison.
params = {
    'learning_rate': 0.099387,
    'max_depth': 14,
    'min_child_weight': 0,
    'num_leaves': 5,
}
lgbm_model = LGBMClassifier(**params)
lgbm_model.fit(x_pci_train, y_pci_train)
y_pci_pred = lgbm_model.predict(x_pci_test)
# %% Model fitting: train three boosters on the same TF-IDF features and
# print an identical block of metrics for each.
xgb = XGBClassifier()
xgb.fit(tfidf_train, y_train.values.ravel())
xgb_pred = xgb.predict(tfidf_test)

gbr = GradientBoostingClassifier()
gbr.fit(tfidf_train, y_train.values.ravel())
gbr_pred = gbr.predict(tfidf_test)

lgbm = LGBMClassifier()
lgbm.fit(tfidf_train, y_train.values.ravel())
lgbm_pred = lgbm.predict(tfidf_test)

# One metric block per model, in the original print order.
for title, predicted in (("xgboost", xgb_pred),
                         ("GBDT", gbr_pred),
                         ("LightGBM", lgbm_pred)):
    print(title)
    print(" Accuracy: ", accuracy_score(y_test, predicted))
    print(" Precision: ", precision_score(y_test, predicted, pos_label='1'))
    print(" Recall: ", recall_score(y_test, predicted, pos_label='1'))
    print(" F-measure: ", f1_score(y_test, predicted, pos_label='1'))
# %%
# Shuffle features and labels with the same RNG state so rows stay aligned.
state = np.random.get_state()
np.random.shuffle(data_x)
np.random.set_state(state)
np.random.shuffle(data_y)

# Train/test split (70/30).
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3)

# Native lgb.Dataset wrappers — not consumed by the sklearn-API fit below;
# kept in case a native lgb.train path is used elsewhere in the file.
train_data = lgb.Dataset(X_train, label=y_train)
validation_data = lgb.Dataset(X_test, label=y_test)

lgbm_model = LGBMClassifier(boosting_type='gbdt', num_leaves=300, max_depth=-1,
                            learning_rate=0.03, n_estimators=100,
                            subsample_for_bin=200000, objective='binary')
lgbm_model.fit(X_train, y_train)

# FIX: ROC-AUC must be computed from continuous scores — the original passed
# hard 0/1 labels from predict(), which collapses the ranking information and
# systematically understates AUC. Use the positive-class probability instead.
y_train_pred = lgbm_model.predict_proba(X_train)[:, 1]
y_test_pred = lgbm_model.predict_proba(X_test)[:, 1]
print('训练集:{:.4f}'.format(roc_auc_score(y_train, y_train_pred)))
print('测试集:{:.4f}'.format(roc_auc_score(y_test, y_test_pred)))
eval_metric = lgb_f1, early_stopping_rounds = 100, verbose = 10, ) print('best score', lgb.best_score_) # ============================================================== # 使用全部的 train data 和 调好迭代轮数训练模型,并用 test data 做预测 # ============================================================== print("=" * 25) print('predicting') lgb.n_estimators = lgb.best_iteration_ lgb.fit(all_train_x, all_train_y) test_y = lgb.predict(test_x) # ============================================================== # 创建submission.csv文件 # ============================================================== print("=" * 25) print("submission file") print("=" * 25) df_sub = pd.concat([df_test['sid'], pd.Series(test_y)], axis = 1) df_sub.columns = ['sid', 'label'] df_sub.to_csv('/Users/zfwang/project/mlproj/projects/move_ad_fraud/submission_file/submit-{}.csv' \ .format(datetime.now().strftime('%m%d_%H%M%S')), sep = ',', index = False)
from sklearn.metrics import * dia = pd.read_csv("10.1 diabetes.csv.csv") df = dia.copy() df = df.dropna() y = df["Outcome"] X = df.drop(["Outcome"], axis=1) #X=df["Pregnancies"] X = pd.DataFrame(X) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) lgbm_model = LGBMClassifier().fit(X_train, y_train) y_pred = lgbm_model.predict(X_test) print(accuracy_score(y_test, y_pred)) lgbm_params = { 'n_estimators': [100, 500, 1000, 2000], 'subsample': [0.6, 0.8, 1.0], 'max_depth': [3, 4, 5, 6], 'learning_rate': [0.1, 0.01, 0.02, 0.05], 'min_child_samples': [20, 5, 10] } lgbm = LGBMClassifier() lgbm_cv_model = GridSearchCV(lgbm, lgbm_params, cv=10, n_jobs=-1, verbose=2) lgbm_cv_model.fit(X_train, y_train) print("En iyi paramereler:" + str(lgbm_cv_model.best_params_)) xgb = LGBMClassifier(learning_rate=0.01, n_estimators=500, max_depth=3,
class LGBBaseline(BaseBaseline):
    """LightGBM baseline: drops all-NaN columns, encodes categoricals, fits an
    LGBMClassifier configured from ``self.config`` and reports accuracy metrics."""

    def __init__(self):
        super(LGBBaseline, self).__init__(name="lgb")

    def fit(self, X_train, y_train, X_val, y_val, categoricals=None):
        """Train on (X_train, y_train) with early stopping on (X_val, y_val).

        Returns a dict with validation predictions/labels and train/val
        (balanced) accuracies.  NOTE: the ``categoricals`` argument is
        ignored — it is recomputed below from the first row's cell types.
        """
        results = dict()
        self.num_classes = len(np.unique(y_train))
        self.config["num_class"] = self.num_classes
        # Mask of columns that are entirely NaN; stored so predict() can
        # apply the same column filter.
        self.all_nan = np.all(np.isnan(X_train), axis=0)
        X_train = X_train[:, ~self.all_nan]
        X_val = X_val[:, ~self.all_nan]
        # Remaining NaNs are zero-filled.
        X_train = np.nan_to_num(X_train)
        X_val = np.nan_to_num(X_val)
        # Scale early-stopping patience up for small datasets (floor of 10).
        early_stopping = 150 if X_train.shape[0] > 10000 else max(
            round(150 * 10000 / X_train.shape[0]), 10)
        self.config["early_stopping_rounds"] = early_stopping
        # Recompute categorical columns from the cell types of the first row,
        # overwriting whatever the caller passed in.
        categoricals = [
            ind for ind in range(X_train.shape[1])
            if isinstance(X_train[0, ind], str)
        ]
        X_train, X_val, self.encode_dicts = encode_categoricals(
            X_train, X_val, encode_dicts=None)
        self.model = LGBMClassifier(**self.config)
        self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
        pred_train = self.model.predict_proba(X_train)
        pred_val = self.model.predict_proba(X_val)
        # This fixes a bug
        # NOTE(review): for binary problems this transposes the (n, 2) proba
        # matrix and truncates rows — presumably a workaround for an upstream
        # shape issue; confirm before relying on results["val_preds"].
        if self.num_classes == 2:
            pred_train = pred_train.transpose()[0:len(y_train)]
            pred_val = pred_val.transpose()[0:len(y_val)]
        results["val_preds"] = pred_val.tolist()
        results["labels"] = y_val.tolist()
        # Hard labels for the accuracy metrics.
        pred_train = np.argmax(pred_train, axis=1)
        pred_val = np.argmax(pred_val, axis=1)
        results["train_acc"] = metrics.accuracy_score(y_train, pred_train)
        results["train_balanced_acc"] = metrics.balanced_accuracy_score(
            y_train, pred_train)
        results["val_acc"] = metrics.accuracy_score(y_val, pred_val)
        results["val_balanced_acc"] = metrics.balanced_accuracy_score(
            y_val, pred_val)
        return results

    def score(self, X_test, y_test):
        """Return accuracy and balanced accuracy of self.predict on a test split."""
        results = dict()
        y_pred = self.predict(X_test)
        results["test_acc"] = metrics.accuracy_score(y_test, y_pred)
        results["test_balanced_acc"] = metrics.balanced_accuracy_score(
            y_test, y_pred)
        return results

    def predict(self, X_test, predict_proba=False):
        """Predict labels (or probabilities when ``predict_proba=True``),
        applying the NaN-column mask and categorical encodings learned in fit()."""
        X_test = X_test[:, ~self.all_nan]
        X_test = np.nan_to_num(X_test)
        X_test, _, _ = encode_categoricals(X_test,
                                           encode_dicts=self.encode_dicts)
        if predict_proba:
            y_pred_proba = self.model.predict_proba(X_test)
            if self.num_classes == 2:
                # Same binary transpose workaround as in fit().
                y_pred_proba = y_pred_proba.transpose()[0:len(X_test)]
            return y_pred_proba
        y_pred = self.model.predict(X_test)
        if self.num_classes == 2:
            # NOTE(review): model.predict already returns 1-D labels, so this
            # transpose/argmax branch looks suspect — verify it is reachable
            # with the expected shapes before changing it.
            y_pred = y_pred.transpose()[0:len(X_test)]
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred
print('processing lightgbm.............')
# Shallow boosted trees used both as the classifier and as a feature ranker.
model2 = LGBMClassifier(learning_rate=0.1, max_depth=3, num_leaves=15, n_estimators=300)
model2.fit(x_train, y_train)
# Rank features by importance, descending, keeping names and scores aligned.
feature = model2.feature_importances_
idxsorted = np.argsort(-feature)
lgb_feature = [colnames[i] for i in idxsorted]
lgb_feature_score = [feature[i] for i in idxsorted]
lgb_feature_final = pd.DataFrame(lgb_feature_score, index=lgb_feature,
                                 columns=['feature_score'])
# Class probabilities and hard labels for both splits.
train_pred = model2.predict_proba(x_train)
test_pred = model2.predict_proba(x_test)
train_pred_label = model2.predict(x_train)
test_pred_label = model2.predict(x_test)
# Confusion matrix and accuracies; Counter(bool_array)[1] counts True entries.
lgb_cf = confusion_matrix_score(test_pred_label)
lgb_acc = Counter(test_pred_label == y_test['First_label'])[1] / len(
    y_test['First_label'])
lgb_acc_train = Counter(train_pred_label == y_train)[1] / len(y_train)
# Per-class ECG scores plus the overall F1 from the project helper.
Normal, AF, I_AVF, LBBB, RBBB, PAC, PVC, STD, STE, F1 = ecg_score(lgb_cf)
print('acc为', lgb_acc, 'f1为', F1)
#lgb_feature_final.to_csv('D:/ecg12/feture_score_train.csv')
# NOTE(review): time.clock() was removed in Python 3.8; `start` is set
# elsewhere with the same call — migrate both to time.perf_counter() together.
elapsed = (time.clock() - start)
print("Time used:", elapsed)
subsample=.8, max_depth=10, reg_alpha=.1, reg_lambda=.05, min_split_gain=.005 ) lgbm_class.fit(features_train, target_train, eval_set= [(features_train, target_train), (features_test, target_test)], eval_metric='auc', verbose=0, early_stopping_rounds=30 ) pred = lgbm_class.predict(features_test) print('\n Percentage accuracy') print(classification_report(pred, target_test)) #%% [markdown] # Now, since LGBM performed the best (as expected), train it on all of the data. I won't be able to see the accuracy this time. #%% final_model = LGBMClassifier( n_estimators=300, num_leaves=30, colsample_bytree=.8, subsample=.8, max_depth=10, reg_alpha=.1,
# Load the pre-extracted feature/label arrays for this trial.
lbs_valid = np.load(path_lbs_valid)
fts_train = np.load(path_fts_train)
lbs_train = np.load(path_lbs_train)
fts_train.shape, lbs_train.shape, fts_valid.shape, lbs_valid.shape

# %%
def report_intermediate_result(env):
    """LightGBM callback: forward the current eval-set metric value to NNI."""
    nni.report_intermediate_result(env.evaluation_result_list[1][2])
    # print(env.evaluation_result_list)

# %%
# Hyper-parameters for this trial come from the NNI tuner.
params = nni.get_next_parameter()
lgb = LGBMClassifier(n_jobs=-1, **params)
lgb.fit(fts_train, lbs_train,
        eval_set=[(fts_valid, lbs_valid)],
        eval_metric='multi_error',
        verbose=100,
        callbacks=[report_intermediate_result],
        early_stopping_rounds=50)

# %%
preds = lgb.predict(fts_valid)
score = accuracy_score(lbs_valid, preds)
# NNI minimises by default here, so report the error rate.
nni.report_final_result(1 - score)

# %%
# model = RandomForestClassifier(n_jobs=-1, **experiment) model = LGBMClassifier(boosting_type="rf", verbose=-1, **experiment) accuracies = [] kappas = [] # f1_scores = [] times = [] for folds in splits.values(): for fold in folds.values(): for repeat in fold.values(): start = time.time() train_index, test_index = repeat.train, repeat.test model.fit(X[train_index], y[train_index]) y_pred = model.predict(X[test_index]) end = time.time() times.append(end - start) accuracies.append(accuracy_score( y[test_index], y_pred)) kappas.append(cohen_kappa_score(y[test_index], y_pred)) # f1_scores.append(f1_score(y[test_index], y_pred)) eval_time = np.sum(times) mean_acc = np.mean(accuracies) # mean_f1 = np.mean(f1_scores) mean_kappa = np.mean(kappas) db.insert({ "task_id": task_id,
from pprint import pprint

# Show the hyper-parameters of the externally-built LightGBM classifier.
print('Parameters Currently In Use:\n')
pprint(classifier_lgbm_corr.get_params())

# Fit and predict, timing the training run.
import time
start = time.time()
classifier_lgbm_corr.fit(X_corr_train, Y_corr_train)
end = time.time()
print("Tempo de Execução: {} sec".format(end - start))
Y_pred_lgbm_corr = classifier_lgbm_corr.predict(X_corr_test)

# Metric analysis
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Accuracy Score
mtrc_accuracy_score_lgbm_corr = accuracy_score(Y_corr_test, Y_pred_lgbm_corr)
print('Accuracy Score : ' + str(mtrc_accuracy_score_lgbm_corr))

# Precision Score
mtrc_precision_score_lgbm_corr = precision_score(Y_corr_test, Y_pred_lgbm_corr)
print('Precision Score : ' + str(mtrc_precision_score_lgbm_corr))
# evals = [(x_test, y_test)] # model.fit(x_train, y_train, early_stopping_rounds= 100, eval_metric= 'logloss', eval_set=evals, verbose=True) # model.fit(x_train, y_train, eval_metric= 'logloss') model = LGBMClassifier() model.fit(x_train, y_train) # model & weight save pickle.dump(model, open('C:/nmb/nmb_data/h5/LGBM0.data', 'wb')) # wb : write # print("== save complete ==") # model load # model = pickle.load(open('E:/nmb/nmb_data/cp/m03_mels_SVC.data', 'rb')) # rb : read # time >> 0:01:07.868304 # evaluate y_pred = model.predict(x_test) # print(y_pred[:100]) # print(y_pred[100:]) accuracy = accuracy_score(y_test, y_pred) recall = recall_score(y_test, y_pred) precision = precision_score(y_test, y_pred) f1 = f1_score(y_test, y_pred) hamm_loss = hamming_loss(y_test, y_pred) hinge_loss = hinge_loss(y_test, y_pred) log_loss = log_loss(y_test, y_pred) print("accuracy : \t", accuracy) print("recall : \t", recall) print("precision : \t", precision) print("f1 : \t", f1)
score = []
# Columns excluded from the model: identifiers, the target itself and
# known leakage columns. Hoisted so train and test use the same list.
DROP_COLS = ['STUDENTCODE', 'tz_students', 'FACTTUITION',
             'STUDYMODE_1', 'STUDYMODE_2', 'earliestchoosefrom2']

for train_index, test_index in skf.split(data, data_y):
    # Training fold: labels from the raw frame, excluded columns dropped.
    fold_train = data.iloc[train_index]
    train_data_y = fold_train['tz_students'].values
    train_data = fold_train.drop(DROP_COLS, axis=1)

    # Held-out fold.
    test_data = data.iloc[test_index]
    test_y = data_y[test_index]

    # Fit a fresh classifier per fold.
    clf = LGBMClassifier(num_leaves=8, learning_rate=0.05, max_depth=8,
                         n_estimators=300, subsample=0.8, colsample_bytree=1,
                         min_child_weight=1)
    clf.fit(X=train_data, y=train_data_y)

    # Predict and score this fold with macro-F1.
    test_x = test_data.drop(DROP_COLS, axis=1)
    test_data['pre'] = clf.predict(test_x)
    # test_data.at[test_data[test_data.STUDYMODE_1 == 1].index, 'pre'] = 0
    tmp_score = metrics.f1_score(y_true=test_y, y_pred=test_data['pre'].values,
                                 average='macro')
    score.append(tmp_score)

print(score)
print('f1:', sum(score)/len(score))
def cv_scores(df, num_folds, params, stratified=False, verbose=-1,
              save_train_prediction=True,
              train_prediction_file_name='train_prediction.csv',
              save_test_prediction=True,
              test_prediction_file_name='test_prediction.csv'):
    """Run K-fold (optionally stratified) LightGBM cross-validation.

    Rows of ``df`` with a non-null TARGET are the training pool; rows with a
    null TARGET are the unlabeled test set whose fold-averaged probability is
    written out. Returns a tuple of (per-fold feature-importance DataFrame,
    [auc_train, auc_test, per-class precision/recall pairs, 0]).
    """
    warnings.simplefilter('ignore')
    clf = LGBMClassifier(**params)

    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Result containers. *_pred hold hard labels, *_pred_proba hold P(class 1).
    # Despite the name, "test_*" arrays are out-of-fold validation results
    # (sized like train_df); `prediction` is the real test-set probability,
    # averaged across fold models.
    train_pred = np.zeros(train_df.shape[0])
    train_pred_proba = np.zeros(train_df.shape[0])
    test_pred = np.zeros(train_df.shape[0])
    test_pred_proba = np.zeros(train_df.shape[0])
    prediction = np.zeros(test_df.shape[0])

    # Feature columns: everything except target/id/index columns.
    feats = [
        f for f in train_df.columns
        if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]
    df_feature_importance = pd.DataFrame(index=feats)

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        print('Fold', n_fold, 'started at', time.ctime())
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'TARGET'].iloc[valid_idx]

        # Refit on this fold with AUC early stopping on the validation part.
        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=verbose, early_stopping_rounds=200)

        # In-fold predictions on the training part...
        train_pred[train_idx] = clf.predict(train_x,
                                            num_iteration=clf.best_iteration_)
        train_pred_proba[train_idx] = clf.predict_proba(
            train_x, num_iteration=clf.best_iteration_)[:, 1]
        # ...and out-of-fold predictions on the validation part.
        test_pred[valid_idx] = clf.predict(valid_x,
                                           num_iteration=clf.best_iteration_)
        test_pred_proba[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]

        # Average each fold model's test-set probability into `prediction`.
        prediction += \
            clf.predict_proba(test_df[feats],
                              num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        df_feature_importance[n_fold] = pd.Series(clf.feature_importances_,
                                                  index=feats)

        print('Fold %2d AUC : %.6f' %
              (n_fold, roc_auc_score(valid_y, test_pred_proba[valid_idx])))
        # Free fold data before the next iteration.
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    # Aggregate metrics: "train" = in-fold, "test" = out-of-fold.
    roc_auc_train = roc_auc_score(train_df['TARGET'], train_pred_proba)
    precision_train = precision_score(train_df['TARGET'], train_pred,
                                      average=None)
    recall_train = recall_score(train_df['TARGET'], train_pred, average=None)
    roc_auc_test = roc_auc_score(train_df['TARGET'], test_pred_proba)
    precision_test = precision_score(train_df['TARGET'], test_pred,
                                     average=None)
    recall_test = recall_score(train_df['TARGET'], test_pred, average=None)
    print('Full AUC score %.6f' % roc_auc_test)

    # Mean importance across folds (missing folds counted as zero).
    df_feature_importance.fillna(0, inplace=True)
    df_feature_importance['mean'] = df_feature_importance.mean(axis=1)

    # Write prediction files
    if save_train_prediction:
        df_prediction = train_df[['SK_ID_CURR', 'TARGET']]
        df_prediction['Prediction'] = test_pred_proba
        df_prediction.to_csv(train_prediction_file_name, index=False)
        del df_prediction
        gc.collect()

    if save_test_prediction:
        df_prediction = test_df[['SK_ID_CURR']]
        df_prediction['TARGET'] = prediction
        df_prediction.to_csv(test_prediction_file_name, index=False)
        del df_prediction
        gc.collect()

    return df_feature_importance, \
        [roc_auc_train, roc_auc_test,
         precision_train[0], precision_test[0],
         precision_train[1], precision_test[1],
         recall_train[0], recall_test[0],
         recall_train[1], recall_test[1],
         0]
[ 25 26 33 46 47 48 49 51 59 62 63 64 70 71 83 85 85 94 95 98 110 112 112 124 128 138 146 168 175 302] ''' for thresh in thresholds: selection = SelectFromModel(model, threshold=thresh, prefit=True) select_x_train = selection.transform(x_train) select_x_test = selection.transform(x_test) selection_model = LGBMClassifier(n_estimators=300, learning_rate=0.1, n_jobs=-1) selection_model.fit(select_x_train, y_train, verbose=False, eval_metric='logloss', eval_set=[(select_x_train, y_train), (select_x_test, y_test)], early_stopping_rounds=20) y_pred = selection_model.predict(select_x_test) acc = accuracy_score(y_test, y_pred) # get_clf_eval(y_test, y_pred) print('Thresh=%.3f, n=%d, acc: %.2f%%' %(thresh, select_x_train.shape[1], acc*100.0)) # model.save_model('./model/xgb_save/cancer_n=%d_acc=%.3f.model' %(select_x_train.shape[1], acc)) def get_clf_eval(y_test, y_pred): confusion = confusion_matrix(y_test, y_pred) accuracy = accuracy_score(y_test, y_pred) precision = precision_score(y_test, y_pred) recall = recall_score(y_test, y_pred) F1 = f1_score(y_test, y_pred) AUC = roc_auc_score(y_test, y_pred)
import lightgbm as lgb

# Native Dataset wrappers with explicit categorical columns (retained for the
# commented-out lgb.train path below).
train_data = lgb.Dataset(x_train, y_train, free_raw_data=False,
                         categorical_feature=cat_feat)
valid_data = lgb.Dataset(x_valid, y_valid, free_raw_data=False,
                         categorical_feature=cat_feat)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
}

from lightgbm import LGBMClassifier

num_round = 1000
# NOTE: gpu_id/tree_method are XGBoost-style kwargs; LightGBM's sklearn API
# passes unknown keyword arguments straight through to the booster.
lgbm = LGBMClassifier(num_leaves=180, max_depth=-1, n_estimators=2000,
                      n_jobs=16, random_state=4, subsample=0.9, gpu_id=0,
                      colsample_bytree=0.85, max_bin=512,
                      tree_method='gpu_hist')
lgbm.fit(X=x_train, y=y_train,
         eval_set=[(x_train, y_train), (x_valid, y_valid)],
         eval_metric=['binary_logloss'],
         early_stopping_rounds=70)
# model = lgb.train(parameter, train_data, num_round, valid_sets = [train_data, valid_data], verbose_eval = 100, early_stopping_rounds = 50)

# Build the submission frame from row index + predicted class.
pred_lgb = lgbm.predict(X_test)
idx = list(range(X_test.shape[0]))
mysubmit = pd.DataFrame({'id': idx, 'up_down': pred_lgb})
mysubmit.to_csv('submission.csv', index=True)
# Persist the trained model.
pickle.dump(model, open(model_file, 'wb'))

# ROC inputs: positive-class probability on the validation images.
probs = model.predict_proba(images_validation)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(labels_validation, preds)
roc_score = roc_auc_score(labels_validation, preds)
print("ROC score: %s" % roc_score)
roc_auc = metrics.auc(fpr, tpr)

# Plot the ROC curve with the chance diagonal and save it to disk.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig(gw_roc_file, bbox_inches='tight', pad_inches=0)

# Text report on hard label predictions.
target_names = [1, 0]
pred_labels = model.predict(images_validation)
print(classification_report(labels_validation, pred_labels))
seed=42, feature_fraction_seed=42, bagging_seed=42, drop_seed=42, data_random_seed=42, boost_from_average=True, scale_pos_weight=w) clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc', early_stopping_rounds=200, verbose=10) ho_pred = clf.predict(valid_x) ho_proba = clf.predict_proba(valid_x)[:, 1] v_pred = roc_auc_score(valid_y, ho_pred) v_proba = roc_auc_score(valid_y, ho_proba) print('##################') print('Training : Single Model Hold Out Pred AUC=', v_pred) print('##################') print('Training : Single Model Hold out ProabA AUC=', v_proba) ct = pd.crosstab(valid_y, ho_pred, rownames=['Actual'], colnames=['Predicted'], margins=True) print(ct) print(classification_report(valid_y, ho_pred))
# Rank random-forest importances to select the strongest features.
feature_importance_df = pd.DataFrame({'feature': Xtrain.columns,
                                      'importances': rf.feature_importances_})
feature_importance_df = feature_importance_df.sort_values(by='importances',
                                                          ascending=False)
feature_importance_df.tail()

# Keep the top 56 features for both splits.
feature_names = feature_importance_df.head(56)['feature']
Xtrain = train[feature_names]
Xtest = test[feature_names]
print(Xtrain.shape, Ytrain.shape, Xtest.shape)

# Pre-tuned hyper-parameters for the final single model.
params = {
    'n_estimators': 1222,
    'learning_rate': 0.07307234151834806,
    'num_leaves': 96,
    'colsample_bytree': 0.8972376156262298,
    'subsample': 0.9312856106293543,
    'min_child_samples': 1,
}
lightgbm = LGBMClassifier(random_state=18, subsample_freq=1, silent=False,
                          **params)
lightgbm.fit(Xtrain, Ytrain)
predictions = lightgbm.predict(Xtest)

# Write the competition submission.
submission['Cover_Type'] = predictions
submission.to_csv('LGBSingleModel.csv')
submission.head()
cnt = 0 for flat_data in flats_data: x = [] y = flat_data['rating'] for k, v in flat_data.items(): if k != 'rating': x.append(v) X.append(x) Y.append(y) cnt += 1 if cnt == n: break return X, Y train_x, train_y = get_xy(train_flats_data) test_x, test_y = get_xy(test_flats_data) model = LGBMClassifier() model.fit(train_x, train_y) yhat = list(model.predict(test_x)) errs = {0:0, 1:0, 2:0, 3:0, 4:0} for i in range(len(test_y)): err = abs(test_y[i] - yhat[i]) errs[err] += 1 for k, v in errs.items(): print(k, v) print(get_quality_rmse(test_y, yhat))
def objective(config):
    """Ray Tune objective: train one LightGBM classifier with the sampled
    hyper-parameters, log train/validation metrics to MLflow, and report
    the validation AUROC back to Tune.

    ``config`` keys mirror LGBMClassifier parameters plus "feature_sel",
    which selects the feature set via the module-level ``fet_sel_dict`` /
    ``give_data`` helpers.
    """
    # Get and log parameters
    params = {
        "num_leaves": config["num_leaves"],
        "learning_rate": config["learning_rate"],
        "n_estimators": config["n_estimators"],
        "objective": config["objective"],
        "reg_alpha": config["reg_alpha"],
        "reg_lambda": config["reg_lambda"],
        "tree_learner": config["tree_learner"],
        "subsample": config["subsample"],
        "subsample_freq": config["subsample_freq"],
        "feature_sel": fet_sel_dict[config["feature_sel"]]
    }
    mlflow.log_params(params)
    model = LGBMClassifier(**params, random_state=0)
    # The split depends on the chosen feature-selection strategy.
    X_train, X_test, y_train, y_test = give_data(
        feature_sel=config["feature_sel"])
    model.fit(X_train, np.ravel(y_train),
              eval_set=[(X_test, np.ravel(y_test))],
              verbose=False,
              early_stopping_rounds=50,
              callbacks=[LightGBMCallback])
    # Validation metrics: per-class report plus accuracy and AUROC.
    eval_results = classification_report(np.ravel(y_test),
                                         model.predict(X_test),
                                         output_dict=True)
    eval_results["accuracy"] = accuracy_score(y_test, model.predict(X_test))
    eval_results["auroc"] = roc_auc_score(y_test,
                                          model.predict_proba(X_test)[:, 1])
    mlflow.log_metric("val_auroc", eval_results["auroc"])
    fold_accuracy = eval_results["accuracy"]
    mlflow.log_metric("val_accuracy", fold_accuracy)
    # "1"/"0" are the class keys produced by classification_report.
    fold_f1 = eval_results["1"]["f1-score"]
    mlflow.log_metric("val_f1-score-1", fold_f1)
    mlflow.log_metric("val_f1-score-0", eval_results["0"]["f1-score"])
    fold_precision = eval_results["1"]["precision"]
    mlflow.log_metric("val_precision", fold_precision)
    fold_recall = eval_results["1"]["recall"]
    mlflow.log_metric("val_recall", fold_recall)
    # Training metrics, for overfitting diagnostics.
    eval_results_tr = classification_report(np.ravel(y_train),
                                            model.predict(X_train),
                                            output_dict=True)
    eval_results_tr["accuracy"] = accuracy_score(y_train,
                                                 model.predict(X_train))
    fold_accuracy_tr = eval_results_tr["accuracy"]
    mlflow.log_metric("tr_accuracy", fold_accuracy_tr)
    fold_f1_tr = eval_results_tr["1"]["f1-score"]
    mlflow.log_metric("tr_f1-score", fold_f1_tr)
    fold_precision_tr = eval_results_tr["1"]["precision"]
    mlflow.log_metric("tr_precision", fold_precision_tr)
    fold_recall_tr = eval_results_tr["1"]["recall"]
    mlflow.log_metric("tr_recall", fold_recall_tr)
    tune.report(auroc=eval_results["auroc"], done=True)
class PHSICAdasynLGBM(BaseEstimator):
    """
    An estimator upsampling minority classes, finding a small set of stable
    biomarkers, and fitting a gradient boosting model over them

    Parameters
    ----------
    n_features : int, optional (default=30)
        Max. number of biomarkers (important features) to be selected
    adasyn_neighbors : int, optional (default=10)
        K neighbors for ADASYN upsampling algorithm
    B : int, optional (default=20)
        Block size for Block HSIC Lasso
    M : int, optional (default=10)
        Max allowed permutations of samples for Block HSIC Lasso
    hsic_splits : int, optional (default=3)
        number of folds for verifying feature stability
    stability_minimum_across_splits : int, optional (default=2)
        min. number of splits a feature must appear in to be kept
    feature_neighbor_threshold : float, optional (default=0.4)
        threshold for considering neighbors of important features in
        stability check
    """

    def __init__(self, n_features=30, adasyn_neighbors=10, B=20, M=10,
                 hsic_splits=3, stability_minimum_across_splits=2,
                 feature_neighbor_threshold=0.4):
        self.n_features = n_features
        self.adasyn_neighbors = adasyn_neighbors
        self.M = M
        self.B = B
        self.hsic_splits = hsic_splits
        self.neighbor_threshold = feature_neighbor_threshold
        self.stability_minimum_across_splits = stability_minimum_across_splits

    def fit(self, X, y):
        """Select stable features via Block HSIC Lasso across stratified
        splits, upsample minorities with ADASYN, and fit the final LGBM."""
        # For very wide data, pre-screen with a quick LightGBM importance
        # pass and keep only features with non-zero importance.
        if X.shape[1] > 10000:
            #clf = RandomForestClassifier(n_estimators=1000,n_jobs=-1).fit(X,y)
            clf = LGBMClassifier(n_estimators=1000, n_jobs=-1).fit(X, y)
            ftimp = clf.feature_importances_
            relevant = np.where(ftimp > 0)[0]
            print("relevant ft:", len(relevant), "/", X.shape[1])
        else:
            relevant = np.arange(X.shape[1])
        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        hsics = []
        # One HSIC Lasso run per stratified split; collect selected feature
        # indices (mapped back through `relevant`).
        for train_index, test_index in list(sss.split(X, y)):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[:, relevant][train_index], y[train_index])
            hsic_lasso2.classification(
                self.n_features, B=self.B, M=self.M)  #(self.n_features, B=self.B, M=self.M)
            hsics.append(hsic_lasso2)
            # not just best features - get their neighbors (similar features) too
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10), dtype=int)
                # NOTE(review): casting the neighbor scores to int truncates
                # them before the > threshold comparison — confirm intended.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10), dtype=int)
                idx = idx[np.where(score > self.neighbor_threshold)[0]]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)
            idxs.append(relevant[all_ft_idx])
        #if len(idxs) == 1:
        #    self.hsic_idx_ = idxs[0]
        #else:
        #    self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        # Keep features that occur in enough splits; progressively relax the
        # stability requirement until at least some features survive.
        self.hsic_idx_ = []
        stability_concession = 0
        while len(self.hsic_idx_) == 0:
            featurecandidates = np.unique(np.concatenate(idxs))
            for candidate in featurecandidates:
                occurrences = np.sum(
                    [1 if candidate in idx else 0 for idx in idxs])
                if occurrences > self.stability_minimum_across_splits - stability_concession:
                    self.hsic_idx_.append(candidate)
            if len(self.hsic_idx_) > 1:
                break
            else:
                # failed to find commonly occurring features - reduce threshold
                stability_concession += 1
        print("HSIC done.", len(self.hsic_idx_), "(out of ",
              len(featurecandidates), " candidates)")
        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors, n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                # NOTE(review): `len(np.unique(y) - 1)` subtracts 1 from the
                # label array before len(); almost certainly meant
                # `len(np.unique(y)) - 1` (once per remaining class).
                for i in range(len(np.unique(y) - 1)):
                    sX, sy = sm.fit_resample(sX, sy)
            except:
                # NOTE(review): bare except deliberately degrades to the
                # un-resampled data when ADASYN fails (e.g. too few minority
                # samples) — consider narrowing the exception type.
                pass
        print("ADASYN done. Starting clf")
        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self

    def predict_proba(self, X):
        """Class probabilities over the selected biomarker columns."""
        return self.clf_.predict_proba(X[:, self.hsic_idx_])

    def predict(self, X):
        """Class labels over the selected biomarker columns."""
        return self.clf_.predict(X[:, self.hsic_idx_])
learner.fit(cv=3, optimizer='scikit-bayes') score_auto_skopt = accuracy_score(y_test, learner.predict(X_test)) xgb_default = XGBClassifier() cat_default = CatBoostClassifier(logging_level='Silent') lgbm_default = LGBMClassifier() X_train, X_test, y_train = _feature_preprocessor.transform( X_train), _feature_preprocessor.transform( X_test), _target_preprocessor.transform( np.array(y_train).reshape(-1, 1)).ravel() print('training defaults') xgb_default.fit(X_train, y_train) cat_default.fit(X_train, y_train) lgbm_default.fit(X_train, y_train) score_xgb = accuracy_score(y_test, xgb_default.predict(X_test)) score_cat = accuracy_score(y_test, cat_default.predict(X_test)) score_lgbm = accuracy_score(y_test, lgbm_default.predict(X_test)) del xgb_default, cat_default, lgbm_default, learner results['name'].append(name) results['xgboost'].append(score_xgb) results['lightgbm'].append(score_lgbm) results['catboost'].append(score_cat) results['automl-grid'].append(score_auto_grid) results['automl-hyperopt'].append(score_auto_hyperopt) results['automl-skopt'].append(score_auto_skopt) print('RESULTS:') print( f"Name: {name}, XGB-default: {score_xgb}, LGBM-default: {score_lgbm}, CAT-default: {score_cat}, GRID: {score_auto_grid}, HYPEROPT: {score_auto_hyperopt}, SKOPT: {score_auto_skopt}" ) except Exception as ex: print(f'{name} failed') print(ex)
) x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.2, random_state=42, stratify=y1) lg1.fit(x1_train, y1_train) print("YOUR R2 MACHINE LEARNED WITH THIS ACCURACY : ", lg1.score(x1_test, y1_test)) print( "\n.................................................................................\n" ) y1_pred = lg1.predict(x1_test) print(classification_report(y1_test, y1_pred)) precision, recall, fscore, support = score(y1_test, y1_pred) print('precision: ', np.mean(precision)) print('recall: ', np.mean(recall)) print('fscore: ', np.mean(fscore)) print("---+++---+++---+++---+++---") # THIS PART FOR CHECK RESIDENT 2 LightGBM: for i in range(10): globals()["url0" + str(
# Standardise features; the scaler is fit on train only to avoid leakage.
sc = StandardScaler()
train_features = sc.fit_transform(train_features)
test_features = sc.transform(test_features)

# LightGBM classifier restricted to shallow trees (depth 4).
model = LGBMClassifier(max_depth=4)
model.fit(train_features, train_labels)

# Feature importances, highest first.
fi = pd.DataFrame({'feature': list(feature_cols),
                   'importance': model.feature_importances_})
fi = fi.sort_values('importance', ascending=False)
fi.head(20)

# Predicting the Test set results
predictions = model.predict(test_features)

# Confusion matrix via crosstab.
pd.crosstab(test_labels, predictions, rownames=['Actual'],
            colnames=['Predicted'])

# Accuracy Score
accuracy_score(test_labels, predictions)
tprs_knn[-1][0] = 0.0 roc_auc = auc(fpr, tpr) aucs_knn.append(roc_auc) clf_rf = clf_rf.fit(X[train], y[train]) ac_rf.append(accuracy_score(y[test], clf_rf.predict(X[test]))) mean_fpr = np.linspace(0, 1, 100) probas_ = clf_rf.fit(X[train], y[train]).predict_proba(X[test]) fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) tprs_rf.append(interp(mean_fpr, fpr, tpr)) tprs_rf[-1][0] = 0.0 roc_auc = auc(fpr, tpr) aucs_rf.append(roc_auc) clf_lgbc = clf_lgbc.fit(X[train], y[train]) ac_lgbc.append(accuracy_score(y[test], clf_lgbc.predict(X[test]))) mean_fpr = np.linspace(0, 1, 100) probas_ = clf_lgbc.fit(X[train], y[train]).predict_proba(X[test]) fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) tprs_lgbc.append(interp(mean_fpr, fpr, tpr)) tprs_lgbc[-1][0] = 0.0 roc_auc = auc(fpr, tpr) aucs_lgbc.append(roc_auc) clf_xgb = clf_xgb.fit(X[train], y[train]) ac_xgb.append(accuracy_score(y[test], clf_xgb.predict(X[test]))) mean_fpr = np.linspace(0, 1, 100) probas_ = clf_xgb.fit(X[train], y[train]).predict_proba(X[test]) fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) tprs_xgb.append(interp(mean_fpr, fpr, tpr)) tprs_xgb[-1][0] = 0.0
lowercase=True, use_idf=True) # %% # Apply to train and test tfidf_train = tfidf_vectorizer.fit_transform(x_train) tfidf_test = tfidf_vectorizer.transform(x_test) # %% # Model model = LGBMClassifier(learning_rate=0.1, num_leaves=128, min_child_samples=100, ubsample=0.96, colsample_bytree=0.28, random_state=0, subsample_freq=1, n_estimators=100) model.fit(tfidf_train, y_train) y_pred = model.predict(tfidf_test) score = accuracy_score(y_test, y_pred) print(f'Accuracy: {round(score*100,2)}%') # %% # Saving model jl.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl.z') jl.dump(model, 'model.pkl.z') # %%
#aa = model_lr.coef_

# Each branch trains one classifier family, selected by the cond01 flag set
# earlier in the file, and prints its test accuracy.
if cond01 == 3:
    # Gaussian Naive Bayes
    from sklearn.naive_bayes import GaussianNB
    model_nb = GaussianNB()
    model_nb.fit(X_train, Y_train)
    predicted_nb = model_nb.predict(X_test)
    print("Gaussian Naive Bayes",
          metrics.accuracy_score(Y_test, predicted_nb), "\n")

if cond01 == 4:
    # Gradient boosting
    from sklearn.ensemble import GradientBoostingClassifier
    model_gb = GradientBoostingClassifier()
    model_gb.fit(X_train, Y_train)
    predicted_gb = model_gb.predict(X_test)
    print("GradientBoosting",
          metrics.accuracy_score(Y_test, predicted_gb), "\n")

if cond01 == 5:
    # LightGBM
    from lightgbm import LGBMClassifier
    model_lgbm = LGBMClassifier()
    model_lgbm.fit(X_train, Y_train)
    predicted_lgbm = model_lgbm.predict(X_test)
    print("LightGBM",
          metrics.accuracy_score(Y_test, predicted_lgbm), "\n")

# Reference: http://myenigma.hatenablog.com/entry/2015/10/09/223629
#import seaborn as sns
#iris = sns.load_dataset("iris")  # sample dataset
##sns.pairplot(iris);
#sns.pairplot(iris, hue="species");
#sns.plt.savefig("iris.png")
#sns.plt.show()