# Training and prediction on the X_chegada / Y_chegada datasets
from lightgbm import LGBMClassifier

classifier_lgbm_chegada = LGBMClassifier(
    max_depth=1000,
    learning_rate=0.01,
    num_leaves=2000,
    min_data_in_leaf=200,
    n_estimators=5000,
    objective='binary',
    metric='binary_logloss'
)
classifier_lgbm_chegada.fit(X_chegada_train, Y_chegada_train)
Y_chegada_pred_lgbm = classifier_lgbm_chegada.predict(X_chegada_test)

# Metrics analysis (scored on the held-out test set)
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print('Accuracy Score : ' + str(accuracy_score(Y_chegada_test, Y_chegada_pred_lgbm)))
print('Precision Score : ' + str(precision_score(Y_chegada_test, Y_chegada_pred_lgbm)))
print('Recall Score : ' + str(recall_score(Y_chegada_test, Y_chegada_pred_lgbm)))
print('F1 Score : ' + str(f1_score(Y_chegada_test, Y_chegada_pred_lgbm)))
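# --- Hedged example (not from the original source) ---
# The block above assumes X_chegada_train / X_chegada_test / Y_chegada_train /
# Y_chegada_test already exist. A minimal sketch of producing them with
# scikit-learn, where X_chegada / Y_chegada are the full feature and label sets
# and the 0.25 test fraction is an arbitrary choice:
from sklearn.model_selection import train_test_split

X_chegada_train, X_chegada_test, Y_chegada_train, Y_chegada_test = train_test_split(
    X_chegada, Y_chegada, test_size=0.25, random_state=42, stratify=Y_chegada)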
def cv_lgbm_scores(df_, num_folds, params, target_name='TARGET', index_name='SK_ID_CURR',
                   stratified=False, rs=1001, verbose=-1):
    warnings.simplefilter('ignore')
    # Cleaning and defining parameters for LGBM
    params = int_lgbm_params(params)
    clf = LGBMClassifier(**params, n_estimators=20000, nthread=4, n_jobs=-1)
    # Divide into training/validation and test data
    df_train_ = df_[df_[target_name].notnull()]
    df_test_ = df_[df_[target_name].isnull()]
    print("Starting LightGBM cross-validation at {}".format(time.ctime()))
    print("Train shape: {}, test shape: {}".format(df_train_.shape, df_test_.shape))
    # Cross-validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=rs)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=rs)
    # Create arrays to store results
    train_pred = np.zeros(df_train_.shape[0])
    train_pred_proba = np.zeros(df_train_.shape[0])
    test_pred = np.zeros(df_train_.shape[0])
    test_pred_proba = np.zeros(df_train_.shape[0])
    prediction = np.zeros(df_test_.shape[0])  # prediction for test set
    feats = df_train_.columns.drop([target_name, index_name])
    df_feat_imp_ = pd.DataFrame(index=feats)
    # Cross-validation cycle
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train_[feats], df_train_[target_name])):
        print('--- Fold {} started at {}'.format(n_fold, time.ctime()))
        train_x, train_y = df_train_[feats].iloc[train_idx], df_train_[target_name].iloc[train_idx]
        valid_x, valid_y = df_train_[feats].iloc[valid_idx], df_train_[target_name].iloc[valid_idx]
        clf.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], eval_metric='auc',
                verbose=verbose, early_stopping_rounds=100)
        train_pred[train_idx] = clf.predict(train_x, num_iteration=clf.best_iteration_)
        train_pred_proba[train_idx] = clf.predict_proba(train_x, num_iteration=clf.best_iteration_)[:, 1]
        test_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration_)
        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        prediction += clf.predict_proba(df_test_[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
        df_feat_imp_[n_fold] = pd.Series(clf.feature_importances_, index=feats)
        del train_x, train_y, valid_x, valid_y
        gc.collect()
    # Computation of metrics
    roc_auc_train = roc_auc_score(df_train_[target_name], train_pred_proba)
    precision_train = precision_score(df_train_[target_name], train_pred, average=None)
    recall_train = recall_score(df_train_[target_name], train_pred, average=None)
    roc_auc_test = roc_auc_score(df_train_[target_name], test_pred_proba)
    precision_test = precision_score(df_train_[target_name], test_pred, average=None)
    recall_test = recall_score(df_train_[target_name], test_pred, average=None)
    print('Full AUC score {:.6f}'.format(roc_auc_test))
    # Filling the feature-importance table
    df_feat_imp_.fillna(0, inplace=True)
    df_feat_imp_['mean'] = df_feat_imp_.mean(axis=1)
    # Preparing results of prediction for saving
    prediction_train = df_train_[[index_name]]
    prediction_train[target_name] = test_pred_proba
    prediction_test = df_test_[[index_name]]
    prediction_test[target_name] = prediction
    del df_train_, df_test_
    gc.collect()
    # Returning the results and metrics in the format of the scores table
    return df_feat_imp_, prediction_train, prediction_test, \
           [roc_auc_train, roc_auc_test,
            precision_train[0], precision_test[0], precision_train[1], precision_test[1],
            recall_train[0], recall_test[0], recall_train[1], recall_test[1], 0]
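# --- Hedged usage example (not from the original source) ---
# A sketch of calling cv_lgbm_scores, assuming df stacks train and test rows
# with TARGET left as NaN for the test rows; lgbm_params is a placeholder for
# a parameter dict accepted by int_lgbm_params:
feat_imp, pred_train, pred_test, scores = cv_lgbm_scores(df, num_folds=5,
                                                         params=lgbm_params,
                                                         stratified=True)
pred_test.to_csv('submission.csv', index=False)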
test_pca = pca.transform(test)

# use StratifiedKFold(n_splits=3) to fit the model on 3 different folds
# and collect the test-set probabilities from each fold's model
lgb_params = dict()
lgb_params['learning_rate'] = 0.01
lgb_params['n_estimators'] = 1000
# lgb_params['max_depth'] = 10
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8
lgb_params['min_child_samples'] = 500

lgb = LGBMClassifier(**lgb_params)
skf = StratifiedKFold(n_splits=3, shuffle=True)
predictions = np.zeros((test_pca.shape[0], 3))
for i, (train_index, test_index) in enumerate(skf.split(train_pca, train_target)):
    lgb_train = train_pca[train_index]
    lgb_target = train_target[train_index]
    lgb.fit(lgb_train, lgb_target)
    y_pred = lgb.predict_proba(test_pca)[:, 1]
    predictions[:, i] = y_pred

# write the result to a csv
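# --- Hedged example (not from the original source) ---
# One way to finish the "write the result to a csv" step above: average the
# three fold columns and save them. The 'id' column and file name are
# assumptions, not taken from the snippet:
import pandas as pd

submission = pd.DataFrame({
    'id': np.arange(test_pca.shape[0]),  # replace with the real test ids
    'target': predictions.mean(axis=1),  # average of the 3 fold models
})
submission.to_csv('predictions.csv', index=False)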
def fit(self, X: pd.DataFrame, y: pd.Series) -> tuple:
    # process categorical columns
    if self.cat_validation == "None":
        encoder = MultipleEncoder(cols=self.cat_cols, encoders_names_tuple=self.encoders_names)
        X = encoder.fit_transform(X, y)

    for n_fold, (train_idx, val_idx) in enumerate(self.model_validation.split(X, y)):
        X_train, X_val = (
            X.iloc[train_idx].reset_index(drop=True),
            X.iloc[val_idx].reset_index(drop=True),
        )
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        if self.cat_validation == "Single":
            encoder = MultipleEncoder(
                cols=self.cat_cols, encoders_names_tuple=self.encoders_names)
            X_train = encoder.fit_transform(X_train, y_train)
            X_val = encoder.transform(X_val)
        if self.cat_validation == "Double":
            encoder = DoubleValidationEncoderNumerical(
                cols=self.cat_cols, encoders_names_tuple=self.encoders_names)
            X_train = encoder.fit_transform(X_train, y_train)
            X_val = encoder.transform(X_val)
        self.encoders_list.append(encoder)

        # cast OrdinalEncoder-encoded columns to pandas "category" dtype
        for col in [col for col in X_train.columns if "OrdinalEncoder" in col]:
            X_train[col] = X_train[col].astype("category")
            X_val[col] = X_val[col].astype("category")

        # fit model
        model = LGBMClassifier(**self.model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            early_stopping_rounds=50,
            verbose=False,
        )
        self.models_trees.append(model.best_iteration_)
        self.models_list.append(model)

        y_hat = model.predict_proba(X_train)[:, 1]
        score_train = roc_auc_score(y_train, y_hat)
        self.scores_list_train.append(score_train)
        y_hat = model.predict_proba(X_val)[:, 1]
        score_val = roc_auc_score(y_val, y_hat)
        self.scores_list_val.append(score_val)

    mean_score_train = np.mean(self.scores_list_train)
    mean_score_val = np.mean(self.scores_list_val)
    avg_num_trees = int(np.mean(self.models_trees))
    print(f"Mean score train : {np.round(mean_score_train, 4)}")
    print(f"Mean score val : {np.round(mean_score_val, 4)}")
    return mean_score_train, mean_score_val, avg_num_trees
def without_cv_transfer_a_to_b_modeling():
    """
    Train on A_train and predict B_train / B_test without cross-validation.
    """
    '''Data input'''
    data_a_train = pd.read_csv('../data/A_train_final.csv', index_col='no')
    data_b_train = pd.read_csv('../data/B_train_final.csv', index_col='no')
    y_of_b_train = data_b_train['flag']
    data_b_test = pd.read_csv('../data/B_test_final.csv', index_col='no')

    '''Feature engineering for A_train'''
    data_a_train_without_label = data_a_train.drop('flag', axis=1)
    data_a_train_without_label['UserInfo_222x82'] = data_a_train_without_label['UserInfo_82'] * data_a_train_without_label['UserInfo_222']

    '''Fill missing values'''
    data_a_train_filled = data_a_train_without_label.fillna(value=10)

    '''Feature names'''
    feature_name = list(data_a_train_without_label.columns.values)
    data_b_test_user_id = list(data_b_test.index.values)

    '''Build the training and test matrices'''
    x_temp = data_a_train_filled.iloc[:, :].values  # predictors
    y = data_a_train.iloc[:, -1].values  # target

    '''Feature selection. Note: if features are added, feature_name must be updated as well'''
    X, dropped_feature_name, len_feature_choose = lgb_feature_selection(feature_name, x_temp, y, "0.1*mean")

    '''Feature engineering for B_train'''
    data_b_train_without_label = data_b_train.drop('flag', axis=1)
    data_b_train_without_label['UserInfo_222x82'] = data_b_train_without_label['UserInfo_82'] * data_b_train_without_label['UserInfo_222']
    data_b_train_filled = data_b_train_without_label.fillna(value=10)

    '''Feature engineering for B_test'''
    data_b_test['UserInfo_222x82'] = data_b_test['UserInfo_82'] * data_b_test['UserInfo_222']
    data_b_test_filled = data_b_test.fillna(value=10)

    '''Feature filtering'''
    data_b_train_filled_after_feature_selection = data_test_feature_drop(data_b_train_filled, dropped_feature_name)
    data_b_test_filled_after_feature_selection = data_test_feature_drop(data_b_test_filled, dropped_feature_name)

    '''Model on A_train, then predict B_train'''
    print('Start time')
    print(time.time() * 1.0 / 60)
    parameter_n_estimators = 400
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators)

    a_model = classifier.fit(X, y)
    prob_of_b_train = a_model.predict_proba(data_b_train_filled_after_feature_selection)
    print('Training end time')
    print(time.time() * 1.0 / 60)

    '''Plot the ROC curve'''
    fpr, tpr, thresholds = roc_curve(y_of_b_train, prob_of_b_train[:, 1])
    roc_auc = auc(fpr, tpr)
    print('\nauc=' + str(roc_auc))

    '''Predict B_test'''
    prob_of_b_test = a_model.predict_proba(data_b_test_filled_after_feature_selection)
    result_file_name = '../result/B_test_predict_using_A_LGBLGB_without_cv_fillna_10' + '_N_' + str(parameter_n_estimators) + '_features_' + \
        str(len_feature_choose) + '_offline_' + str(roc_auc) + '.csv'
    write_predict_results_to_csv(result_file_name, data_b_test_user_id, prob_of_b_test[:, 1].tolist())
def kfold_lightgbm(df, debug=False):
    # Divide into training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    folds = KFold(n_splits=10, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])  # predicted valid_y
    sub_preds = np.zeros(test_df.shape[0])  # submission preds
    feature_importance_df = pd.DataFrame()  # feature importance
    fold_auc_best_df = pd.DataFrame(columns=["FOLD", "AUC", "BEST_ITER"])  # holds the best iteration per fold to save the model

    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index',
                                                      "APP_index", "BURO_index", "PREV_index", "INSTAL_index",
                                                      "CC_index", "POS_index"]]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            n_jobs=-1,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )

        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=200, early_stopping_rounds=200)

        # predicted valid_y
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # submission preds: predict the test set in each fold and average over all folds
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        # fold, auc and best iteration
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        # best auc & iteration
        fold_auc_best_df = pd.concat(
            [fold_auc_best_df,
             pd.DataFrame([{'FOLD': int(n_fold + 1),
                            'AUC': roc_auc_score(valid_y, oof_preds[valid_idx]),
                            'BEST_ITER': clf.best_iteration_}])],
            ignore_index=True)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # OUTPUTS
    print(fold_auc_best_df)
    print(feature_importance_df)

    # save the feature importances as DataFrames
    feature_importance_df.to_pickle("outputs/features/feature_importance_df.pkl")
    fold_auc_best_df.to_pickle("outputs/features/fold_auc_best_df.pkl")

    # Final Model
    best_iter_1 = int(fold_auc_best_df.sort_values(by="AUC", ascending=False)[:1]["BEST_ITER"].values)
    y_train = train_df["TARGET"]
    x_train = train_df[feats]

    final_model = LGBMClassifier(
        n_jobs=-1,
        n_estimators=best_iter_1,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1).fit(x_train, y_train)

    cur_dir = os.getcwd()
    os.chdir('models/reference/')
    pickle.dump(final_model, open("lightgbm_final_model.pkl", 'wb'))  # model
    os.chdir(cur_dir)

    # The valid_y values predicted for each fold are really out-of-fold
    # predictions of different slices of the training labels.
    cowsay.cow('Full Train(Validation) AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

    # Write submission file and plot feature importance
    if not debug:
        cur_dir = os.getcwd()
        os.chdir('outputs/predictions/')
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv("reference_submission.csv", index=False)
        os.chdir(cur_dir)
    display_importances(feature_importance_df)
    del x_train, y_train
    return feature_importance_df
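# --- Hedged usage example (not from the original source) ---
# A sketch of reloading the model pickled above for later scoring, assuming the
# same project-root working directory:
import pickle

with open('models/reference/lightgbm_final_model.pkl', 'rb') as f:
    final_model = pickle.load(f)
# scores = final_model.predict_proba(test_df[feats])[:, 1]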
for i in categorical_features:
    class_le.fit(alldata.iloc[:, i].values)
    alldata.iloc[:, i] = class_le.transform(alldata.iloc[:, i].values)

alldata.head()

# In[49]:

x_train = alldata.iloc[:, 1:]
y_train = alldata.iloc[:, 0]

# In[59]:

start = time.time()
estimator = LGBMClassifier(objective='binary',
                           colsample_bytree=0.8,
                           subsample=0.8,
                           metric='auc',
                           learning_rate=0.3,
                           n_estimators=25)
param_grid = {
    'max_depth': range(6, 18, 3),
    'num_leaves': range(1000, 10000, 2000)
}
gs = GridSearchCV(estimator, param_grid, cv=3)
print(gs.fit(x_train.head(100000), y_train.head(100000)))
print('{:.2f}'.format(time.time() - start) + ' sec')

# In[10]:

# Grid-search max_depth and num_leaves, keeping num_leaves < 2**max_depth
estimator = LGBMClassifier(objective='binary', colsample_bytree=0.8,
'''
# for fixing LightGBMError: Do not support special JSON characters in feature name.
#train_data.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in train_data.columns]
#train_data_label.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in train_data_label.columns]

# LightGBM Classifier
lgbm_param = {
    'max_depth': [30, 35, 40, 45, 50, 55, 60],
    'min_child_samples': [10, 15, 20, 30, 40, 45, 50],
    'n_estimators': [200, 300, 400, 500, 600, 650, 700, 800],
    'learning_rate': stats.uniform(0.2, 0.5),
    'num_leaves': [40, 45, 50, 55, 60, 65, 70, 80]}

lgbm = LGBMClassifier()
start = time()
random_search = RandomizedSearchCV(lgbm, param_distributions=lgbm_param, n_iter=N_ITER, n_jobs=4)
random_search.fit(train_data, mapped_labels)
print("RandomizedSearchCV took %.2f seconds for LGBM." % (time() - start))
report(random_search.cv_results_)
'''

# XGboost Classifier
xg_params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'learning_rate': stats.uniform(0, 0.02),
    'max_depth': [5, 6, 7, 8, 9, 10, 11, 12],
rfc_predict = rfc.predict(X_test_std)
rfc_predict_proba = rfc.predict_proba(X_test_std)[:, 1]
get_scores(y_test, rfc_predict, rfc_predict_proba)
print('')

# GBDT
print('GBDT:')
gdbt = GradientBoostingClassifier(random_state=2018)
gdbt.fit(X_train_std, y_train)
gdbt_predict = gdbt.predict(X_test_std)
gdbt_predict_proba = gdbt.predict_proba(X_test_std)[:, 1]
get_scores(y_test, gdbt_predict, gdbt_predict_proba)
print('')

# XGBoost
print('XGBoost:')
xgbs = XGBClassifier(random_state=2018)
xgbs.fit(X_train_std, y_train)
xgbs_predict = xgbs.predict(X_test_std)
xgbs_predict_proba = xgbs.predict_proba(X_test_std)[:, 1]
get_scores(y_test, xgbs_predict, xgbs_predict_proba)
print('')

# LightGBM
print('LightGBM:')
lgbm = LGBMClassifier(random_state=2018)
lgbm.fit(X_train_std, y_train)
lgbm_predict = lgbm.predict(X_test_std)
lgbm_predict_proba = lgbm.predict_proba(X_test_std)[:, 1]
get_scores(y_test, lgbm_predict, lgbm_predict_proba)
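# --- Hedged sketch (not from the original source) ---
# get_scores is defined elsewhere in the original script; a minimal sketch of a
# helper with this signature (the exact metrics printed are an assumption):
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def get_scores(y_true, y_pred, y_pred_proba):
    print('accuracy :', accuracy_score(y_true, y_pred))
    print('precision:', precision_score(y_true, y_pred))
    print('recall   :', recall_score(y_true, y_pred))
    print('f1       :', f1_score(y_true, y_pred))
    print('auc      :', roc_auc_score(y_true, y_pred_proba))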
# 3 Split off the test set
drop_feature = [
    'risk_time', 'consumer_no', 'month_status', 'first_status', 'data_type'
]
test_x = test_data.loc[:, ~test_data.columns.isin(drop_feature)]
test_y = test_data.loc[:, 'month_status']

# 4 Training on imbalanced samples
numericFeature = train_x.columns.tolist()
OversampleRandom5 = {'RandomSample': {'ratio': 0.5, 'random_state': 10}}
OversampleRandom4 = {'SMOTEENN': {'ratio': 0.4, 'random_state': 10}}
OversampleRandom3 = {'Smote': {'ratio': 0.3, 'random_state': 10}}

lgb = LGBMClassifier(boosting_type='gbdt', learning_rate=0.1,
                     max_depth=2, n_estimators=500,
                     n_jobs=-1, objective='binary', importance_type='gain',
                     random_state=10)

imbRandom5 = imbalanceOversampleProcess(numericFeature, OversampleRandom5, lgb)
imbRandom4 = imbalanceOversampleProcess(numericFeature, OversampleRandom4, lgb)
imbRandom3 = imbalanceOversampleProcess(numericFeature, OversampleRandom3, lgb)

# 5 Evaluation pipeline
data_dict = {
    'train': {
        'X': train_x,
        'y': train_y
    },
    'test': {
        'X': test_x,
        'y': test_y
    }
CASE = 1
titanic = pd.read_pickle('tests/data/clean_titanic.pkl')

if CASE == 1:
    features = ['Pclass', 'Survived', 'Embarked', 'Sex']
    encoder = one_hot.OneHotEncoder(titanic, cols=['Embarked', 'Sex'])
    X = titanic[features]
    y = titanic['Age'].to_frame()
    model = LGBMRegressor()
elif CASE == 2:
    features = ['Pclass', 'Age', 'Embarked', 'Sex']
    encoder = one_hot.OneHotEncoder(titanic, cols=['Embarked', 'Sex'])
    X = titanic[features]
    y = titanic['Survived'].to_frame()
    model = LGBMClassifier()
else:
    features = ['Survived', 'Age', 'Embarked', 'Sex']
    encoder = one_hot.OneHotEncoder(titanic, cols=['Embarked', 'Sex'])
    X = titanic[features]
    y = titanic['Pclass'].to_frame()
    model = LGBMClassifier()

titanic_enc = encoder.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    titanic_enc, y, test_size=0.2,
)
def lgbm_modeling_cross_validation(params, full_train, y, classes, class_weights,
                                   nr_fold=10, random_state=7):
    unique_y = np.unique(y)
    class_map = dict()
    for i, val in enumerate(unique_y):
        class_map[val] = i
    # y = np.array([class_map[val] for val in y])
    y = y.apply(lambda x: class_map[x])

    # Compute weights
    w = y.value_counts()
    weights = {i: np.sum(w) / w[i] for i in w.index}

    clfs = []
    importances = pd.DataFrame()
    folds = StratifiedKFold(n_splits=nr_fold,
                            shuffle=True,
                            random_state=random_state)
    oof_preds = np.zeros((len(full_train), np.unique(y).shape[0]))

    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = full_train.iloc[trn_], y.iloc[trn_]
        val_x, val_y = full_train.iloc[val_], y.iloc[val_]

        trn_xa, trn_y, val_xa, val_y = smoteAdataset(trn_x.values, trn_y.values,
                                                     val_x.values, val_y.values)
        trn_x = pd.DataFrame(data=trn_xa, columns=trn_x.columns)
        val_x = pd.DataFrame(data=val_xa, columns=val_x.columns)

        clf = LGBMClassifier(**params)
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric=lgbm_multi_weighted_logloss,
                verbose=100,
                early_stopping_rounds=50,
                sample_weight=trn_y.map(weights))
        clf.my_name = "lgbm"
        clfs.append(clf)

        oof_preds[val_, :] = clf.predict_proba(val_x)  # , num_iteration=clf.best_iteration_)
        print('no {}-fold loss: {}'.format(
            fold_ + 1,
            multi_weighted_logloss(val_y, oof_preds[val_, :], classes, class_weights)))

        imp_df = pd.DataFrame({
            'feature': full_train.columns,
            'gain': clf.feature_importances_,
            'fold': [fold_ + 1] * len(full_train.columns),
        })
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

    score = multi_weighted_logloss(y_true=y, y_preds=oof_preds,
                                   classes=classes, class_weights=class_weights)
    print('MULTI WEIGHTED LOG LOSS: {:.5f}'.format(score))

    df_importances = save_importances(importances_=importances)
    df_importances.to_csv('lgbm_importances.csv', index=False)

    cnf = confusion_matrix(y, np.argmax(oof_preds, axis=1))
    plot_confusion_matrix(cnf, classes=classes, normalize=True, filename="lgbm")

    return clfs, score, oof_preds
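# --- Hedged sketch (not from the original source) ---
# multi_weighted_logloss / lgbm_multi_weighted_logloss are defined elsewhere in
# the original project. A minimal sketch of the class-weighted multi-class log
# loss the function above refers to (mean log loss of the true class, per
# class, combined with class weights); an illustration, not the project's
# exact implementation:
import numpy as np

def multi_weighted_logloss_sketch(y_true, y_preds, classes, class_weights):
    y_p = np.clip(y_preds, 1e-15, 1 - 1e-15)
    y_ohe = np.eye(len(classes))[np.asarray(y_true)]    # one-hot true labels
    class_counts = np.clip(y_ohe.sum(axis=0), 1, None)  # guard empty classes
    per_class = (y_ohe * np.log(y_p)).sum(axis=0) / class_counts
    w = np.array([class_weights[c] for c in classes])
    return -np.sum(w * per_class) / np.sum(w)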
'''
for each day build a model
'''
for i in tqdm(range(start - 1, end, 1)):
    output = ''

    '''import dataframe'''
    train = pd.read_csv(train_path[i], dtype=train_dtypes, usecols=train_fields)
    X_train = train.loc[:, train.columns != 'skip_2']
    y_train = train['skip_2']
    X_test = pd.read_csv(test_path[i], dtype=test_dtypes, usecols=test_fields)

    clf = LGBMClassifier(n_estimators=100, objective='binary',
                         learning_rate=0.05, n_jobs=-1, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    '''file format'''
    position = X_test['session_position'].tolist()
    length = X_test['session_length'].tolist()
    for pos in range(len(y_pred)):
        output += str(y_pred[pos])
        if position[pos] == length[pos]:
            output += '\n'

    '''save file'''
    output_path = 'output_1224/' + date_of_name[i] + '.txt'
X_test = test.drop(columns='id')

from sklearn.preprocessing import StandardScaler
_ = StandardScaler().fit_transform(X_test)
X_test = pd.DataFrame(_, columns=X_test.columns)

clf_feature_selection = LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
                                       colsample_bytree=1.0, importance_type='split',
                                       learning_rate=0.1, max_depth=10,
                                       min_child_samples=20, min_child_weight=0.001,
                                       min_split_gain=0.01, n_estimators=200,
                                       n_jobs=-1, num_leaves=31, objective='binary',
                                       random_state=42, reg_alpha=0.5, reg_lambda=0,
                                       silent=True, subsample=1.0,
                                       subsample_for_bin=200000, subsample_freq=0)

rfecv = RFECV(estimator=clf_feature_selection, step=1, cv=5, scoring='roc_auc')

params = {
    'random_state': [42],
    'objective': ['binary'],
    'class_weight': ['balanced', None],
real = pd.read_pickle(os.path.join(data_dir, "eval_real.p"))
train_users = pd.read_pickle(os.path.join(data_dir, "train_users.p"))
test_users = pd.read_pickle(os.path.join(data_dir, "test_users.p"))
trainset = pd.read_pickle(os.path.join(data_dir, "train.p"))
evalset = pd.read_pickle(os.path.join(data_dir, "eval.p"))

real = real.loc[train_users]
evalset = evalset[evalset.user_id.isin(train_users)]

mds = [3, 5, 8, 10]
eval_ps = [1 / i for i in range(1, 20)]
res = pd.DataFrame([], index=mds, columns=eval_ps)
for md in mds:
    learner = LGBMClassifier(n_estimators=10000, max_depth=md)
    learner.fit(trainset.drop("reordered", axis=1), trainset.reordered,
                eval_metric="auc",
                early_stopping_rounds=10,
                eval_set=[(trainset.drop("reordered", axis=1), trainset.reordered),
                          (evalset.drop("reordered", axis=1), evalset.reordered)])
    preds = learner.predict_proba(evalset.drop("reordered", axis=1))[:, -1]
    for p in eval_ps:
        ppreds = evalset[preds > p]
        ppreds = ppreds.groupby("user_id").product_id.apply(set)
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    # Divide into training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross-validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns
        if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            # is_unbalance=True,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,
            # scale_pos_weight=11
        )

        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=100, early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
def __buildModel__(self, modelParams):
    # Note that LGBMClassifier does not support multi-output classification
    # natively, so wrap it in MultiOutputClassifier.
    self.model = MultiOutputClassifier(LGBMClassifier(n_jobs=-1, **modelParams))
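# --- Hedged usage example (not from the original source) ---
# MultiOutputClassifier fits one LGBMClassifier per target column, which is the
# workaround the comment above refers to. A minimal sketch with toy data:
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier

X = np.random.rand(100, 5)
Y = np.random.randint(0, 2, size=(100, 3))  # three binary targets
clf = MultiOutputClassifier(LGBMClassifier(n_jobs=-1)).fit(X, Y)
print(clf.predict(X[:2]))  # one prediction per target column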
# print(train_y)

# Define the feature labels
feat_labels = train_x.columns[:]
# print(feat_labels)

# Split into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y,
                                                    test_size=0.2,
                                                    random_state=0)

# Modeling / training
from lightgbm import LGBMClassifier
LGBM = LGBMClassifier()
LGBM.fit(X_train, y_train, verbose=True)

# Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
test_x_scaled = scaler.transform(test_x)
print(X_train_scaled)

# Visualization
import matplotlib.pyplot as plt
plt.hist(X_train_scaled)
plt.title('StandardScaler')
GB_Classifier.fit(X_train, Y_train)

# AdaBoost
AD_Classifier = ske.AdaBoostClassifier(n_estimators=100)
AD_Classifier.fit(X_train, Y_train)

# GaussianNB
GS_Classifier = GaussianNB()
GS_Classifier.fit(X_train, Y_train)

# XGBoost
XGB_Classifier = xgb.XGBClassifier()
XGB_Classifier.fit(X_train, Y_train)

# LightGBM
lgbm_Classifier = LGBMClassifier()
lgbm_Classifier.fit(X_train, Y_train)

models = []
models.append(('Naive Bayes Classifier', BNB_Classifier))
models.append(('Decision Tree Classifier', DTC_Classifier))
models.append(('KNeighborsClassifier', KNN_Classifier))
models.append(('LogisticRegression', LGR_Classifier))
models.append(('RandomForest', RD_Classifier))
models.append(('GradientBoosting', GB_Classifier))
models.append(('AdaBoost', AD_Classifier))
models.append(('GaussianNB', GS_Classifier))
models.append(('XGBoost', XGB_Classifier))
models.append(('LightGBM', lgbm_Classifier))

for i, v in models:
xgb.fit(x_train, y_train)
test_xg_prob = xgb.predict_proba(x_test)
train_xg_prob = xgb.predict_proba(x_train)
print('XGBoost train-set log loss', log_loss(y_train, train_xg_prob))
print('XGBoost test-set log loss', log_loss(y_test, test_xg_prob))
time2_1 = time.time()
print('XGBoost run time', time2_1 - time2_0)

# Model with LightGBM
if flag == 3 or flag == 0:
    print("Starting LightGBM training")
    time3_0 = time.time()
    lgb = LGBMClassifier(objective='binary',
                         learning_rate=0.02,
                         n_estimators=100,
                         num_leaves=45,
                         max_depth=12,
                         colsample_bytree=0.8,
                         min_child_samples=14,
                         subsample=0.9)
    lgb.fit(x_train, y_train)
    test_lgb_prob = lgb.predict_proba(x_test)
    train_lgb_prob = lgb.predict_proba(x_train)
    print('LightGBM train-set log loss', log_loss(y_train, train_lgb_prob))
    print('LightGBM test-set log loss', log_loss(y_test, test_lgb_prob))
    time3_1 = time.time()
    print('LightGBM run time', time3_1 - time3_0)

'''
# Output validation-set results for online testing
import getFearures01
path_test = '../data/round1_ijcai_18_test_b_20180418.txt'
test_df = getFearures01.cpfeature(path_test)
    'Arbitration_ID', 'Data0', 'Data1', 'Data2', 'Data3', 'Data4',
    'Data5', 'Data6', 'Data7'
]]
# test_ys = test_s_df['Class']

'''
scaler = StandardScaler()
scaler.fit(df_x)
scaler.fit(test_x)
x_test_scaled = scaler.transform(test_x)
x_scaled = scaler.transform(df_x)
'''

model_d = LGBMClassifier(random_state=0,
                         metric='binary_error',
                         boosting_type='gbdt',
                         learning_rate=0.1,
                         n_estimators=100,
                         num_leaves=16,
                         objective='binary')
model_d.fit(df_xd, df_yd, verbose=2)
pred_yd = model_d.predict(test_xd)

model_s = LGBMClassifier(random_state=0,
                         metric='binary_error',
                         boosting_type='gbdt',
                         learning_rate=0.1,
                         n_estimators=100,
                         num_leaves=16,
                         objective='binary')
model_s.fit(df_xs, df_ys, verbose=2)
pred_ys = model_s.predict(test_xs)
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import sys
sys.path.insert(1, '../paragrid')

# %%
from paradec import parallel

@parallel
def ml_model(X, y, model):
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.33,
                                                        random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return np.sum(y_pred == y_test) / len(y_test)

# spaces
space_gpdt = {'learning_rate': [0.001, 0.1],
              'n_estimators': [2, 70, 5],
              'max_depth': [2, 50, 4]}

# Classification
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

args = [[X, y, LGBMClassifier(n_estimators=i)] for i in [5, 10, 15, 25]]
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(ml_model, args)
    for i in results:
        print(i)
def fill_nan_lgbm(dataframe, feature, target):
    from sklearn import preprocessing
    from lightgbm import LGBMClassifier
    from lightgbm import LGBMRegressor
    from sklearn.model_selection import KFold

    PREDICT_NAME = 'predict'
    df = dataframe
    df = df.reset_index(drop=True)
    feature_list = [
        f_ for f_ in dataframe.columns if f_ != feature and f_ != target
    ]
    train = df[feature_list]
    train, _ = one_hot_encoder(train, True)
    for f_ in train.columns:
        train, _ = fill_nan_mean(train, f_, intern=True)
    train = pd.concat([train, df[feature]], axis=1)

    train_df = train[train[feature].notnull()].drop(feature, axis=1)
    train_target = train.loc[train[feature].notnull(), feature]
    test_df = train[train[feature].isnull()].drop(feature, axis=1)
    test_target = train[train[feature].isnull()][[feature]]

    valid = train[train[feature].notnull()][[feature]]
    valid[PREDICT_NAME] = 0
    valid.reset_index(inplace=True)

    folds = KFold(n_splits=5, shuffle=True, random_state=1001)
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, train_target)):
        train_x, train_y = train_df.iloc[train_idx], train_target.iloc[train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], train_target.iloc[valid_idx]

        # Fix labels in valid_y that were never seen in train_y
        if train_y.dtype == 'object':
            train_y_value_list = train_y.unique()
            train_y_value_mode = train_y.mode().values[0]
            valid_y = valid_y.apply(lambda x: x if x in train_y_value_list
                                    else train_y_value_mode)

        lgbm = 0
        if train_target.dtype == 'object':
            lgbm = LGBMClassifier(
                nthread=4,
                n_estimators=1000,
                learning_rate=0.02,
                # num_leaves=34,
                num_leaves=6,
                colsample_bytree=0.95,
                subsample=0.87,
                # max_depth=8,
                reg_alpha=0.04,
                reg_lambda=0.07,
                min_split_gain=0.02,
                min_child_weight=40,
                silent=-1,
                verbose=-1,
            )
        else:
            lgbm = LGBMRegressor(
                nthread=4,
                n_estimators=1000,
                learning_rate=0.02,
                # num_leaves=34,
                num_leaves=6,
                colsample_bytree=0.95,
                subsample=0.87,
                # max_depth=8,
                reg_alpha=0.04,
                reg_lambda=0.07,
                min_split_gain=0.02,
                min_child_weight=40,
                silent=-1,
                verbose=-1,
            )

        debug('++++++++++++++++++++LGBM++++++++++' + feature + '+++++++++++++++++++++++++++++++++')
        lgbm.fit(train_x, train_y,
                 eval_set=[(train_x, train_y), (valid_x, valid_y)],
                 verbose=1000, early_stopping_rounds=200)

        valid.loc[valid_idx, [PREDICT_NAME]] = lgbm.predict(valid_x)
        if test_target.shape[0] > 0:
            test_target.loc[:, PREDICT_NAME] = lgbm.predict(test_df)

    acc = fill_na_performance(valid, feature, PREDICT_NAME)
    if test_target.shape[0] > 0:
        for i in test_target.index.values:
            df.loc[i, feature] = test_target.loc[i, PREDICT_NAME]

    trace('fill_nan_lgbm ' + feature + ' acc: ' + str(acc))
    return df, acc
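# --- Hedged usage example (not from the original source) ---
# A sketch of calling fill_nan_lgbm to impute a single column; the column names
# are placeholders, not from the original project:
df_filled, acc = fill_nan_lgbm(df, feature='OCCUPATION_TYPE', target='TARGET')
print('imputation accuracy:', acc)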
lgb_params4 = {}
lgb_params4['n_estimators'] = 1450
lgb_params4['max_bin'] = 20
lgb_params4['max_depth'] = 6
lgb_params4['learning_rate'] = 0.25  # shrinkage_rate
lgb_params4['boosting_type'] = 'gbdt'
lgb_params4['objective'] = 'binary'
lgb_params4['min_data'] = 500  # min_data_in_leaf
lgb_params4['min_hessian'] = 0.05  # min_sum_hessian_in_leaf
lgb_params4['num_leaves'] = 64
lgb_params4['verbose'] = 0
lgb_params4['device'] = 'gpu'
lgb_params4['gpu_platform_id'] = 0
lgb_params4['gpu_device_id'] = 0

lgb_model = LGBMClassifier(**lgb_params)
lgb_model2 = LGBMClassifier(**lgb_params2)
lgb_model3 = LGBMClassifier(**lgb_params3)
lgb_model4 = LGBMClassifier(**lgb_params4)

log_model = LogisticRegression()

stack = Ensemble(n_splits=5,
                 stacker=log_model,
                 base_models=(lgb_model, lgb_model2, lgb_model3, lgb_model4))

y_pred = stack.fit_predict(train, target_train, test)
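# --- Hedged sketch (not from the original source) ---
# The Ensemble class is defined elsewhere in the original project. A minimal
# sketch of the usual stacking pattern behind fit_predict(train, target, test):
# out-of-fold predictions from each base model become features for the stacker.
import numpy as np
from sklearn.model_selection import StratifiedKFold

class EnsembleSketch:
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X, y, T = np.asarray(X), np.asarray(y), np.asarray(T)
        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True,
                                     random_state=2016).split(X, y))
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], len(folds)))
            for j, (train_idx, valid_idx) in enumerate(folds):
                clf.fit(X[train_idx], y[train_idx])
                S_train[valid_idx, i] = clf.predict_proba(X[valid_idx])[:, 1]
                S_test_i[:, j] = clf.predict_proba(T)[:, 1]
            S_test[:, i] = S_test_i.mean(axis=1)  # average fold predictions
        self.stacker.fit(S_train, y)
        return self.stacker.predict_proba(S_test)[:, 1]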
                       min_samples_leaf=10,
                       n_estimators=300,
                       verbose=True)
clf.fit(train_x, train_y)
selector = SelectFromModel(clf, prefit=True)
new_train_x = selector.transform(train_x)

# Train stacking model
rf = RandomForestClassifier(max_depth=6,
                            random_state=0,
                            min_samples_leaf=10,
                            n_estimators=300)
lgb1 = LGBMClassifier(n_estimators=400,
                      num_leaves=50,
                      max_depth=6,
                      learning_rate=0.03,
                      subsample=0.8,
                      reg_alpha=1.0,
                      reg_lambda=0.5,
                      n_jobs=6)
lgb2 = LGBMClassifier(n_estimators=300,
                      num_leaves=60,
                      max_depth=3,
                      learning_rate=0.07,
                      subsample=0.8,
                      reg_alpha=0,
                      reg_lambda=1,
                      n_jobs=6)
estimators = [('rf', rf), ('lgbt1', lgb1), ('lgbt2', lgb2)]
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=400,
                               min_child_weight=10,
                               zero_as_missing=True,
                               learning_rate=0.01,
                               num_leaves=100,
                               feature_fraction=0.7,
                               bagging_fraction=0.7,
                               n_estimators=800,
                               n_jobs=-1,
                               min_child_samples=30)
    else:
        model = LGBMClassifier(reg_alpha=0.3,
                               reg_lambda=0.1,
                               min_child_weight=10,
                               zero_as_missing=True,
                               learning_rate=0.01,
                               num_leaves=100,
                               feature_fraction=0.7,
                               bagging_fraction=0.7,
                               n_estimators=800,
                               n_jobs=-1,
                               min_child_samples=30)

    train_x, test_x, train_y, test_y = df_X[train], df_X[test], df_y[train], df_y[test]
    # train_test_split(df_X, df_y, test_size=0.15)
    model.fit(train_x, train_y,
              eval_set=(test_x, test_y),
              early_stopping_rounds=7)
    models.append(model)
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase '
    #           'training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {'n_jobs': -2, 'n_estimators': 30},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2, 'n_estimators': 30},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'presort': False, 'learning_rate': 0.1, 'warm_start': True},
        'GradientBoostingClassifier': {'presort': False, 'learning_rate': 0.1, 'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {},
        'LGBMRegressor': {'n_estimators': 2000, 'learning_rate': 0.15, 'num_leaves': 8,
                          'lambda_l2': 0.001, 'histogram_pool_size': 16384},
        'LGBMClassifier': {'n_estimators': 2000, 'learning_rate': 0.15, 'num_leaves': 8,
                           'lambda_l2': 0.001, 'histogram_pool_size': 16384},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search is True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the
        # user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the final params that will '
              'be used to initialize the model:')
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),
        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),
        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier()
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor()
        model_map['CatBoostClassifier'] = CatBoostClassifier()

    if model_name[:12] == 'DeepLearning':
        if keras_installed is False:
            # Suppress some level of logs if TF is installed (but allow it to not be
            # installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except:  # TODO: fix bare except
                pass

        model_map['DeepLearningClassifier'] = KerasClassifier(build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print('It appears you are trying to use a library that is not available when we try to '
              'import it, or using a value for model_names that we do not recognize.')
        raise e

    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1

    model_with_params = model_without_params.set_params(**model_params)
    return model_with_params
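# --- Hedged usage example (not from the original source) ---
# A sketch of requesting a configured LightGBM classifier from the factory:
clf = get_model_from_name('LGBMClassifier', training_params={'n_estimators': 100})
print(clf)  # stock LGBMClassifier params with the n_estimators override applied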
import shap
import os
import time
from matplotlib import pyplot as plt
import matplotlib
matplotlib.use('TkAgg')

lgbm = LGBMClassifier(boosting_type='gbdt',
                      num_leaves=31,
                      max_depth=-1,
                      learning_rate=0.001,
                      n_estimators=2000,
                      objective=None,
                      min_split_gain=0,
                      min_child_weight=3,
                      min_child_samples=10,
                      subsample=0.8,
                      subsample_freq=1,
                      colsample_bytree=0.7,
                      reg_alpha=0.3,
                      reg_lambda=0,
                      seed=17)

path_to_features = r'C:\Users\kotov-d\Documents\TASKS\feature_selection\\features_to_calc'
path_to_calculated = r'\\Zstorage\!z\Shuranov\calculated_features'

for xXx in os.listdir(path_to_features):
    print(xXx[:-4])
    start = time.time()
    with open(os.path.join(path_to_features, xXx), "rb") as f:
    'mean_word_len'
]]
feature_inputs = X_data.columns
label = data['encoded_sentiment']

X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                    label,
                                                    test_size=0.3,
                                                    random_state=42)

clf = LGBMClassifier(
    n_estimators=10000,
    learning_rate=0.10,
    num_leaves=30,
    subsample=.9,
    max_depth=7,
    reg_alpha=.1,
    reg_lambda=.1,
    min_split_gain=.01,
    min_child_weight=2,
    silent=-1,
    verbose=-1,
)

print('Start training')
LGBM_model = clf.fit(X_train, y_train)

filename = 'lgbm.pkl'
with open(filename, 'wb') as file:
    pickle.dump(LGBM_model, file)
print('Model saved successfully')
def trainModel(x, y):
    # Custom evaluation metric for lightgbm/xgboost
    def self_metric(y_true, y_pred):
        # y_pred arrives as probabilities, so binarize at 0.5 before scoring
        score = f1_score(y_true, 1 * (y_pred >= 0.5))
        return 'f1', score, False

    from sklearn.ensemble import BaggingClassifier

    params = {"num_leaves": 81, "n_estimators": 100, "learning_rate": 0.2,  # essential parameters
              "subsample": 0.9, "class_weight": {1: 1, 0: 1}, "reg_lambda": 2  # exploratory only
              }
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=345)
    lgb = LGBMClassifier(**params)
    """
    boosting_type='gbdt',     default is 'gbdt' (gradient-boosted decision trees);
                              'dart' combines dropout with MART (multiple additive
                              regression trees); 'goss' is gradient-based one-side
                              sampling; 'rf' is random forest
    num_leaves=31,            maximum number of leaves per base learner
    max_depth=-1,             maximum tree depth; <= 0 means no limit
    learning_rate=0.1,        boosting shrinkage rate
    n_estimators=100,         number of base learners
    subsample_for_bin=200000, number of samples used to construct bins
    objective=None,           task type: 'binary' or 'multiclass' for classification,
                              'regression' for regression, 'lambdarank' for ranking
    class_weight=None,        per-class sample weights
    min_split_gain=0.,
    min_child_weight=1e-3,
    min_child_samples=20,
    subsample=1.,
    subsample_freq=0,
    colsample_bytree=1.,
    reg_alpha=0.,
    reg_lambda=0.,
    random_state=None,
    n_jobs=-1,
    silent=True,
    importance_type='split'
    """
    # model = BaggingClassifier(base_estimator=lgb, n_estimators=100, max_samples=0.8, max_features=0.8)
    model = lgb
    # F1 is not a built-in metric, so pass the custom one
    model.fit(x_train, y_train, eval_metric=self_metric,
              eval_set=[(x_train, y_train), (x_test, y_test)])
    # model.fit(x_train, y_train)
    """
    sample_weight=None, init_score=None, eval_set=None, eval_names=None,
    eval_sample_weight=None, eval_class_weight=None, eval_init_score=None,
    eval_metric=None,
    early_stopping_rounds=None,  normally validation performance keeps improving;
                                 if it worsens for n consecutive rounds, stop early
    verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None
    """
    """
    If early_stopping is set, feed the chosen round count back to the model and
    discard the rounds that did not help:
    model.n_estimators = model.best_iteration_
    """

    # Key step: pick a sensible threshold before computing the F1 score.
    # TODO: one could instead sweep many thresholds (2000+) and compute F1 for
    # each, picking the best one for more precise results.
    pre_train = model.predict_proba(x_train)[:, 1]
    pre_test = model.predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_train, pre_train)
    thre_index = (tpr - fpr).argmax()
    thres = thresholds[thre_index]
    print("training-set threshold", thres)

    pre_train = 1 * (pre_train >= thres)
    pre_test = 1 * (pre_test >= thres)

    print("train f1_score", f1_score(y_train, pre_train))
    print("test f1_score", f1_score(y_test, pre_test))
    print("train recall_score", recall_score(y_train, pre_train))
    print("test recall_score", recall_score(y_test, pre_test))
    print("train precision_score", precision_score(y_train, pre_train))
    print("test precision_score", precision_score(y_test, pre_test))
    return model, thres
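# --- Hedged sketch (not from the original source) ---
# The TODO in trainModel suggests sweeping many thresholds (2000+) and picking
# the one with the best F1 directly, instead of the ROC-based choice. A minimal
# sketch of that sweep:
import numpy as np
from sklearn.metrics import f1_score

def best_f1_threshold(y_true, proba, n_grid=2000):
    thresholds = np.linspace(proba.min(), proba.max(), n_grid)
    scores = [f1_score(y_true, (proba >= t).astype(int)) for t in thresholds]
    best = int(np.argmax(scores))
    return thresholds[best], scores[best]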