def tune_params():
    """Grid-search four LGBMClassifier hyper-parameters and log micro-F1.

    Every combination of ``max_depth`` (6..14), ``subsample``,
    ``colsample_bytree`` and ``reg_alpha`` is fitted on the module-level
    ``X_t``/``y_t`` and scored on ``X_t``/``y_t`` and ``X_v``/``y_v``.
    Parameters and scores are printed and appended to two text files.

    Returns:
        ``(f1_t_total, f1_v_total)``: lists with the training and validation
        micro-F1 of each combination, in iteration order.
    """
    from itertools import product

    save_dir = 'D:\\workspace python\\contest\\accu_save\\'
    params_path = save_dir + 'lgbbase_saveparams_f1_0418.txt'
    scores_path = save_dir + 'lgbbase_tunparms_f1_0418.txt'

    f1_t_total, f1_v_total = [], []
    # product() varies the last factor fastest, i.e. the same visiting order
    # as four nested loops with max_depth outermost.
    grid = product(range(6, 15), [0.6, 0.7, 0.8], [0.6, 0.7, 0.8], [0.1, 1, 10])
    for max_depth, subsample, colsample_bytree, reg_alpha in grid:
        lgb_base = LGBMClassifier(n_estimators=150, objective='binary',
                                  random_state=1234, n_jobs=3,
                                  colsample_bytree=colsample_bytree,
                                  reg_alpha=reg_alpha, max_depth=max_depth,
                                  subsample=subsample)
        _params = {
            'max_depth': max_depth,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'reg_alpha': reg_alpha,
        }
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        f1_t_each = f1_score(y_t, y_t_pre, average='micro')
        f1_v_each = f1_score(y_v, y_v_pre, average='micro')
        f1_t_total.append(f1_t_each)
        f1_v_total.append(f1_v_each)

        print(_params)
        # Context managers guarantee the handles are closed even if a write
        # fails half-way through the (long-running) search.
        with open(params_path, 'a', encoding='utf-8') as params_file:
            print(_params['max_depth'], _params['subsample'],
                  _params['colsample_bytree'], _params['reg_alpha'],
                  file=params_file)
        print(f1_t_each, f1_v_each)
        with open(scores_path, 'a', encoding='utf-8') as scores_file:
            print(f1_t_each, ',', f1_v_each, file=scores_file)
    return f1_t_total, f1_v_total
def baseline_xiong(self, profile: Profile, shared: Storage, logger: Logger,
                   converted):
    """Fit a default LGBMClassifier on summary statistics and log its scores.

    Features are per-row statistics over the last axis of ``converted[1]``,
    ``converted[2]`` and ``converted[3]``; labels are ``converted[0]``.
    The first 70 % of the rows train the model, the remainder is used to log
    accuracy and per-class precision/recall.

    Args:
        profile: Not used by this method.
        shared: Not used by this method.
        logger: Receives the metric lines via ``logger.info``.
        converted: Indexable holding labels at 0 and three arrays at 1..3.
    """
    labels = converted[0]
    block_a, block_m, block_g = converted[1], converted[2], converted[3]

    def l2_std(block):
        # Std (last axis) of the Euclidean norm of the first three channels,
        # kept as a column so it can be concatenated with the 2-D features.
        norm = np.sqrt(block[:, 0, :]**2 + block[:, 1, :]**2 +
                       block[:, 2, :]**2)
        return norm.std(-1)[:, np.newaxis]

    columns = (
        block_a.std(-1),
        block_g.mean(-1),
        block_g.std(-1),
        (block_m >= 0.0).sum(-1).astype(np.float32),
        block_a.mean(-1),
        l2_std(block_a),
        l2_std(block_m),
    )
    features = np.concatenate(columns, axis=1)

    # Chronological 70/30 split (no shuffling).
    cut = int(labels.shape[0] * 0.7)
    classifier = LGBMClassifier()
    classifier.fit(features[:cut], labels[:cut])
    validate_y = labels[cut:]
    predict_y = classifier.predict(features[cut:])

    logger.info('Xiong')
    logger.info(f'Accuracy: {accuracy_score(validate_y, predict_y)}')
    logger.info(
        f'Precision: {precision_score(validate_y, predict_y, average=None)}')
    logger.info(
        f'Recall: {recall_score(validate_y, predict_y, average=None)}')
def lgb_initialise(param=None):
    """Build an LGBMClassifier from project defaults plus optional overrides.

    Args:
        param: Optional mapping of parameter overrides applied on top of the
            defaults below. ``None`` (the default) means no overrides.

    Returns:
        A new, unfitted ``LGBMClassifier``.
    """
    config = LGBMClassifier().get_params()
    # NOTE(review): 'nthread'/'n_jobs' and 'seed'/'random_state' look like
    # LightGBM aliases of each other - confirm which value actually wins.
    config.update({
        'boosting_type': 'gbdt',
        'class_weight': None,
        'colsample_bytree': 0.7,
        'importance_type': 'split',
        'is_unbalance': True,
        'learning_rate': 0.05,
        'max_depth': 4,
        'min_child_samples': 20,
        'min_child_weight': 0.001,
        'min_split_gain': 0.0,
        'n_estimators': 600,
        'n_jobs': -1,
        'nthread': 3,
        'num_leaves': 8,
        'objective': 'binary',
        'random_state': None,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'seed': 777,
        'silent': False,
        'subsample': 0.8,
        'subsample_for_bin': 200000,
        'subsample_freq': 0,
    })
    # A None sentinel replaces the former mutable ``{}`` default.
    if param is not None:
        config.update(param)
    return LGBMClassifier(**config)
def runTrain(train_df, workspace, debug, model_config):
    """Train either a single LightGBM model or a K-fold ensemble and save it.

    Args:
        train_df: DataFrame with a ``TARGET`` column; ID/index columns are
            excluded from the feature set.
        workspace: Object providing ``save(model, name)`` and
            ``gen_report(name)``.
        debug: When truthy, train with 100 estimators and learning rate 0.3.
        model_config: Mapping with ``'kfold_setting'`` (keyword arguments for
            ``gen_cv``; ``num_folds < 1`` selects the single-model path) and
            ``'model_param'`` (LGBMClassifier keyword arguments).

    Returns:
        The fitted classifier (single model) or ``KFoldClassifier``.
    """
    kfold_setting = model_config['kfold_setting']
    is_single = kfold_setting['num_folds'] < 1

    # Work on a copy so the debug overrides never leak back into the
    # caller's configuration object.
    model_param = dict(model_config['model_param'])
    if debug:
        model_param['n_estimators'] = 100
        model_param['learning_rate'] = 0.3
    clf = LGBMClassifier(**model_param)

    non_features = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV',
                    'index']
    feats = [f for f in train_df.columns if f not in non_features]
    X, y = train_df[feats], train_df['TARGET']

    if is_single:
        print("INFO: num_folds is less than 1, SINGLE MODEL would be trained.")
        model = clf
        with timer('Train Single LGB Model'):
            clf.fit(X, y, eval_set=[(X, y)], eval_metric='auc', verbose=100)
        # No analysis report is generated for the single model.
        workspace.save(model, 'single_model.pkl')
    else:
        cv = gen_cv(**kfold_setting)
        model = KFoldClassifier(clf, cv)
        with timer('Train KFold LGB Model'):
            model.fit(X, y)
        workspace.save(model, 'kfold_model.pkl')
        workspace.gen_report('kfold')
    return model
def fit_lgb_model(model_spec, early_stopping_rounds=10):
    """Fit, refit and save a monotone-constrained LightGBM classifier.

    The model is first trained with early stopping against the last
    ``validation_size`` rows, then refitted on all rows with the best
    iteration count, and finally the booster is written to
    ``FILEPATH/models/<filename>``.

    Args:
        model_spec: Mapping with keys ``filename``, ``dataset`` (callable
            returning a DataFrame), ``features`` (mapping whose values are
            column names), ``monotone_constraints`` and ``validation_size``.
        early_stopping_rounds: Patience for early stopping; also used as the
            logging period.
    """
    spec = model_spec
    feature_map = spec["features"]

    print("loading data using", spec["filename"])
    frame = spec["dataset"]()

    print("converting all feature columns to float32")
    for column in feature_map.values():
        frame[column] = frame[column].astype("float32")
    print(frame.describe().T)

    null_targets = frame["target"].isna().sum()
    print("dropping", null_targets, "rows with null in target")
    frame = frame[frame["target"].notna()]

    # Constraint per feature key, 0 (unconstrained) when none is configured.
    constraint_lookup = spec["monotone_constraints"]
    monotone_constraints = [constraint_lookup.get(key, 0)
                            for key in feature_map]
    print("monotone constraints:", monotone_constraints)

    model = LGBMClassifier(
        n_estimators=5_000,
        num_leaves=11,
        learning_rate=0.01,
        monotone_constraints=monotone_constraints,
        monotone_constraints_method="advanced",
    )

    X = frame[feature_map.values()]
    y = frame["target"]

    # Sample weight grows linearly with game_id, starting from
    # GAME_WEIGHTING_FACTOR and reaching 1 at the largest game_id.
    game_pct = (frame["game_id"] + 1) / (frame["game_id"].max() + 1)
    base_weight = config.GAME_WEIGHTING_FACTOR
    w = base_weight + (1 - base_weight) * game_pct

    holdout = spec["validation_size"]

    def head_tail(obj):
        # Split off the trailing ``holdout`` rows as the evaluation part.
        return obj.iloc[:-holdout], obj.iloc[-holdout:]

    X_tr, X_te = head_tail(X)
    y_tr, y_te = head_tail(y)
    w_tr, w_te = head_tail(w)

    model.fit(
        X_tr,
        y_tr,
        sample_weight=w_tr,
        eval_set=[(X_te.values, y_te.values)],
        eval_sample_weight=[w_te],
        early_stopping_rounds=early_stopping_rounds,
        verbose=early_stopping_rounds,
    )

    print("refitting model with full dataset")
    model.set_params(n_estimators=model.best_iteration_)
    model.fit(X, y, sample_weight=w)

    pred = pd.Series(model.predict_proba(X)[:, 1])
    print("distribution of predictions:")
    print(pred.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]))

    booster = model.booster_
    gains = list(zip(booster.feature_name(),
                     booster.feature_importance(importance_type="gain")))
    print("feature importance (by gain):")
    for name, gain in sorted(gains, key=lambda pair: -pair[1]):
        print(f" {name}: {gain}")

    filepath = os.path.join(FILEPATH, "models", spec["filename"])
    print("saving to", filepath)
    model.booster_.save_model(filepath)
def lgb(x_train, y_train, x_val, y_val):
    """Train a fixed-configuration LGBMClassifier and return its F1 score.

    Args:
        x_train, y_train: Training features and labels.
        x_val, y_val: Validation features and labels.

    Returns:
        ``f1_score`` of the validation predictions (default averaging).
    """
    # The local is deliberately not called ``lgb`` so it does not shadow
    # this function's own name.
    model = LGBMClassifier(n_estimators=1000, max_depth=10, subsample=0.7,
                           colsample_bytree=0.7, learning_rate=0.01,
                           random_state=2020)
    model.fit(x_train, y_train)
    predictions = model.predict(x_val)
    # f1_score expects (y_true, y_pred); the original call had them swapped.
    return f1_score(y_val, predictions)
def setUp(self):
    """Fit a default LGBMClassifier and wrap it in a ClassifierExplainer.

    The explainer is built with grouped categorical columns (``Gender`` given
    as an explicit mapping, ``Deck`` and ``Embarked`` by name) and is stored
    on ``self.explainer``.
    """
    X_train, y_train, X_test, y_test = titanic_survive()
    _, test_names = titanic_names()

    model = LGBMClassifier()
    model.fit(X_train, y_train)

    categorical_groups = [
        {'Gender': ['Sex_female', 'Sex_male', 'Sex_nan']},
        'Deck',
        'Embarked',
    ]
    self.explainer = ClassifierExplainer(
        model, X_test, y_test,
        cats=categorical_groups,
        labels=['Not survived', 'Survived'],
        idxs=test_names)
def setUp(self):
    """Fit a default LGBMClassifier and wrap it in a tree-SHAP explainer.

    ``roc_auc_score`` is passed as the fourth positional argument of
    ``ClassifierExplainer``; the result is stored on ``self.explainer``.
    """
    X_train, y_train, X_test, y_test = titanic_survive()
    _, test_names = titanic_names()

    model = LGBMClassifier()
    model.fit(X_train, y_train)

    explainer_options = dict(
        shap='tree',
        cats=['Sex', 'Cabin', 'Embarked'],
        labels=['Not survived', 'Survived'],
        idxs=test_names,
    )
    self.explainer = ClassifierExplainer(
        model, X_test, y_test, roc_auc_score, **explainer_options)
def fit(self, X, y):
    """Select features with HSIC Lasso, oversample with ADASYN, fit LightGBM.

    For each stratified shuffle split, HSIC Lasso picks ``n_features``
    features and, for each of them, neighbours whose score exceeds
    ``neighbor_threshold``. The feature sets of all splits are intersected
    into ``self.hsic_idx_``. The reduced data is optionally oversampled and
    used to fit ``self.clf_``.

    Args:
        X: 2-D array indexable as ``X[rows]`` and ``X[:, cols]``.
        y: 1-D label array indexable as ``y[rows]``.

    Returns:
        ``self``.
    """
    splitter = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                      random_state=42)
    for split_no, (train_index, _) in enumerate(splitter.split(X, y)):
        hsic = HSICLasso()
        hsic.input(X[train_index], y[train_index])
        hsic.classification(self.n_features, B=self.B, M=self.M)

        # Not just the best features - collect their neighbours (similar
        # features) too.
        selected = np.array(hsic.get_index(), dtype=int).ravel()
        candidates = [selected]
        for feat_pos in range(len(selected)):
            neighbors = np.array(
                hsic.get_index_neighbors(feat_index=feat_pos,
                                         num_neighbors=10), dtype=int)
            # Scores must stay floating point: casting them to int (as the
            # previous code did) truncates fractional scores before the
            # threshold comparison.
            scores = np.array(
                hsic.get_index_neighbors_score(feat_index=feat_pos,
                                               num_neighbors=10), dtype=float)
            keep = np.where(scores > self.neighbor_threshold)[0]
            candidates.append(neighbors[keep])
        fold_idx = np.unique(np.concatenate(candidates))

        if split_no == 0:
            self.hsic_idx_ = fold_idx
        else:
            self.hsic_idx_ = np.intersect1d(fold_idx, self.hsic_idx_)
    print("HSIC done.", len(self.hsic_idx_))

    print("Upsampling with ADASYN... (features: " + str(len(self.hsic_idx_)) +
          ")")
    sX, sy = X[:, self.hsic_idx_], y
    if self.adasyn_neighbors > 0:
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors, n_jobs=-1)
        try:
            sX, sy = sm.fit_resample(sX, sy)
            # One extra pass per remaining class; the "- 1" belongs outside
            # len(), not inside np.unique's result.
            for _ in range(len(np.unique(y)) - 1):
                sX, sy = sm.fit_resample(sX, sy)
        except Exception as exc:
            # Best effort: keep whatever resampling already succeeded, but
            # report the reason instead of hiding it.
            print("ADASYN stopped early:", repr(exc))
    print("ADASYN done. Starting clf")
    self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
    print("done")
    return self
def train_LGBM(src_folder, cols, model_save_folder, lr):
    '''
    Load the train and validation sets from src_folder, train an LGBM
    classifier and save the trained model into model_save_folder.
    :param src_folder: folder passed to prepare_Train_Val_set
    :param cols: columns passed to prepare_Train_Val_set
    :param model_save_folder: output folder (created if missing)
    :param lr: learning rate of the classifier
    :return: None
    '''
    os.makedirs(model_save_folder, exist_ok=True)

    def auc_prc(y_true, y_pred):
        # Custom eval metric tuple: (name, value, is_higher_better).
        return 'AUC_PRC', average_precision_score(y_true, y_pred), True

    def joined(months):
        return ', '.join([str(month) for month in months])

    month_pairs = zip([[201803, 201804]], [[201806]])
    for train_months, val_months in month_pairs:
        print('************** train Months: {}, val Months: {}**************'.
              format(joined(train_months), joined(val_months)))
        trainX, trainy, valX, valy = prepare_Train_Val_set(
            src_folder, train_months, val_months, cols)
        print('trainX shape: {}, valX shape: {}'.format(
            trainX.shape, valX.shape))
        print('trainy value_counts: {}'.format(trainy.value_counts()))
        print('valy value_counts: {}'.format(valy.value_counts()))

        clf = LGBMClassifier(num_leaves=127,
                             learning_rate=lr,
                             n_estimators=10000,
                             objective='binary',
                             is_unbalance=True,
                             subsample=0.8,
                             colsample_bytree=0.8,
                             device_type='gpu',
                             gpu_platform_id=1,
                             gpu_device_id=0)
        started = time.time()
        clf.fit(trainX,
                trainy,
                eval_set=[(valX, valy)],
                eval_metric=auc_prc,
                early_stopping_rounds=50,
                verbose=100)
        print('fit time: {:.4f}'.format(time.time() - started))

        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        save_name = 'LGBM_' + 'Val_M' + joined(val_months) + stamp
        joblib.dump(clf, os.path.join(model_save_folder, save_name))
        print('model is saved to {}'.format(save_name))
        gc.collect()
def find_best_cv(self):
    """Compare accuracy mean/variance across the fold counts in n_folds_list.

    ``Util.split_cv`` is called first with ``ORG_DATA_DIR``; the per-fold
    CSVs are then read back from ``ORG_DATA_DIR/n_folds_<k>/``. For every
    fold a 20-tree binary LGBMClassifier is fitted with the sample weights
    from ``self._calc_w`` and its validation accuracy recorded. Results are
    only logged; nothing is returned.
    """
    Util.split_cv(self.X, self.y, self.n_folds_list, ORG_DATA_DIR)

    def load_matrix(cv_dir, stem, fold):
        # Format the file name first, then join: formatting the joined path
        # would break on a directory name that contains '%'.
        path = os.path.join(cv_dir, "%s_%s.csv" % (stem, fold))
        return pd.read_csv(path, header=None, sep="\t").values

    def load_vector(cv_dir, stem, fold):
        # Labels are stored as a single column; flatten to 1-D.
        column = load_matrix(cv_dir, stem, fold)
        n_rows, _ = column.shape
        return column.reshape(n_rows, )

    acc_score_means = []
    acc_score_vars = []
    for num_of_fold in self.n_folds_list:
        print("============")
        logger.info("==evaluating %s fold==" % num_of_fold)
        CV_DIR = os.path.join(ORG_DATA_DIR, "n_folds_%s/" % num_of_fold)
        acc_score = []
        for i in range(num_of_fold):
            logger.info("loading %s th cv data in %s folds" % (i, num_of_fold))
            X_train = load_matrix(CV_DIR, "X_train", i)
            X_val = load_matrix(CV_DIR, "X_val", i)
            y_train = load_vector(CV_DIR, "y_train", i)
            y_val = load_vector(CV_DIR, "y_val", i)
            logger.info("end loading %s th cv data in %s folds" %
                        (i, num_of_fold))
            logger.info("X_train.shape: %s %s" % X_train.shape)
            logger.info("X_val.shape: %s %s" % X_val.shape)
            logger.info("y_train.shape: %s" % y_train.shape)
            logger.info("y_val.shape: %s" % y_val.shape)

            clf = LGBMClassifier(objective="binary", n_estimators=20)
            weight_train = self._calc_w(y_train)
            clf.fit(X_train, y_train, sample_weight=weight_train,
                    eval_set=[(X_val, y_val)], verbose=True)
            y_pred = clf.predict(X_val)
            logger.info("acc socore: %s folds, %s iteration" %
                        (num_of_fold, i))
            acc_score.append(accuracy_score(y_val, y_pred))

        logger.info("mean acc score of %s folds is %s" %
                    (num_of_fold, np.mean(acc_score)))
        acc_score_means.append(np.mean(acc_score))
        logger.info("variance of acc score of %s folds is %s" %
                    (num_of_fold, np.var(acc_score)))
        acc_score_vars.append(np.var(acc_score))

    summary = zip(self.n_folds_list, acc_score_means, acc_score_vars)
    for n_folds, mean_acc, var_acc in summary:
        logger.info("===%s_folds=== mean acc:%s, var acc: %s " %
                    (n_folds, mean_acc, var_acc))
def fit():
    """Train a default LGBMClassifier for ``is_close`` and print its metrics.

    Reads ``./data/train_set.csv``, fits the preprocessing pipeline on the
    training part, trains on four feature columns and prints accuracy,
    log-loss, a classification report and top-1/5/10 accuracy per customer
    on the validation part. Nothing is returned.
    """
    train, validation, _ = train_validation_holdout_split(
        read('./data/train_set.csv'))

    steps = [
        preprocess,
        russia_only,
        rouble_only,
        with_transaction_location,
        with_job,
        (partial(fit_categories, ['mcc', 'city', 'terminal_id']),
         transform_categories),
        partial(calc_is_close, ['transaction_lat', 'transaction_lon'],
                ['work_add_lat', 'work_add_lon']),
    ]
    pipeline, train = fit_pipeline(steps, train)
    validation = pipeline(validation)

    feature_columns = ['mcc', 'city', 'amount', 'terminal_id']
    print(f'Train size: {len(train)}, Validation size: {len(validation)}')
    print(f'Features: {feature_columns}')

    model = LGBMClassifier()
    model.fit(train[feature_columns], train['is_close'])

    predictions = model.predict_proba(validation[feature_columns])
    predicted_labels = np.argmax(predictions, axis=1)
    accuracy_value = accuracy_score(validation['is_close'], predicted_labels)
    logloss_value = log_loss(validation['is_close'], predictions)
    print(f'Accuracy: {accuracy_value:.5f}, Logloss: {logloss_value:.5f}')
    print(classification_report(validation['is_close'], predicted_labels))

    validation['probs'] = predictions[:, 1]

    def top_k_accuracy(k):
        # Share of customers whose k highest-probability rows contain at
        # least one row with is_close set.
        hits = validation.groupby('customer_id').apply(
            lambda group: group.sort_values('probs').tail(k).is_close.max())
        return hits.mean()

    top1_accuracy = top_k_accuracy(1)
    top5_accuracy = top_k_accuracy(5)
    top10_accuracy = top_k_accuracy(10)
    print(f'Top1: {top1_accuracy:.5f}')
    print(f'Top5: {top5_accuracy:.5f}')
    print(f'Top10: {top10_accuracy:.5f}')
    # The leftover ``pdb.set_trace()`` breakpoint that used to end this
    # function was removed: it blocks any non-interactive run.
def lgb_model(X_train, y_train, X_val, y_val, save_file, folds, param_comb,
              n_jobs, scoring):
    """Randomized search over a small, mostly fixed LightGBM grid.

    Only ``num_leaves``, ``n_estimators`` and ``class_weight`` have more
    than one candidate value. The best estimator is dumped with joblib next
    to this module as ``best_<save_file>``.

    Args:
        X_train, y_train: Data used for the cross-validated search.
        X_val, y_val: Evaluation set for early stopping (AUC, 50 rounds).
        save_file: Suffix of the output file name.
        folds: Number of stratified folds.
        param_comb: Number of sampled parameter combinations (``n_iter``).
        n_jobs: Parallelism for both the estimator and the search.
        scoring: Scoring passed to ``RandomizedSearchCV``.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    search_space = {
        "num_leaves": [3, 5, 10],
        "max_depth": [-1],
        "learning_rate": [0.2575640770995011],
        "n_estimators": [5000, 10000, 50000],
        "objective": ["binary"],
        "class_weight": ["balanced", None],
        "subsample": [0.7],
        "colsample_bytree": [0.6],
        "reg_lambda": [1.6599030323415402],
        "reg_alpha": [0.7044747533204038],
        "min_child_weight": [7],
    }

    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=123)
    base_estimator = LGBMClassifier(n_jobs=n_jobs, random_state=123)
    model_lgb = RandomizedSearchCV(
        estimator=base_estimator,
        param_distributions=search_space,
        n_iter=param_comb,
        scoring=scoring,
        n_jobs=n_jobs,
        cv=splitter.split(X_train, y_train),
        verbose=0,
        random_state=123,
    )
    model_lgb.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='auc',
                  early_stopping_rounds=50)

    target = os.path.join(os.path.dirname(__file__), f'best_{save_file}')
    joblib.dump(model_lgb.best_estimator_, target, compress=1)
    return model_lgb
def __init__(self, data, continuous_feature_names, label, score, t='R',
             showFig=False):
    """Store the data set and build the pools of candidate models.

    Args:
        data: Table indexable by a list of column names and by ``label``.
        continuous_feature_names: Names of the candidate feature columns.
        label: Name of the target column.
        score: Stored as ``self.score``.
        t: Stored as ``self.T`` (default ``'R'``).
        showFig: Stored as ``self.showFig``.
    """
    self.data = data
    self.continuous_feature_names = continuous_feature_names
    self.label = label
    self.score = score
    self.K = len(continuous_feature_names)
    self.T = t
    self.showFig = showFig

    # Candidate features
    candidate_features = data[continuous_feature_names]
    self.train_X = candidate_features
    self.train_y = data[label]
    self.numNull = candidate_features.isnull().sum().sum()
    self.numInf = np.isinf(candidate_features.values).sum()

    # Candidate models
    self.linearRegressionModel = [
        LinearRegression(), Ridge(), Lasso(), LinearSVR(),
    ]
    self.linearClassModel = [
        LogisticRegression(), LinearSVC(), RidgeClassifier(),
    ]
    tree_regressors = [
        ExtraTreesRegressor(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),  # RF is comparatively slow
        GradientBoostingRegressor(),
        XGBRegressor(n_estimators=100, objective='reg:squarederror'),
        LGBMRegressor(n_estimators=100),
    ]
    self.treeRegressionModel = tree_regressors
    tree_classifiers = [
        ExtraTreesClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        GradientBoostingClassifier(),
        XGBClassifier(n_estimators=100, objective="binary:logistic"),
        LGBMClassifier(n_estimators=100),
    ]
    self.treeClassModel = tree_classifiers
    self.nonlinearRegressionModel = tree_regressors + [
        SVR(), MLPRegressor(solver='lbfgs', max_iter=100),
    ]
    self.nonlinearClassModel = tree_classifiers + [SVC(), MLPClassifier()]
def lgb_model(X_train, y_train, X_val, y_val, save_file, folds, param_comb,
              n_jobs, scoring):
    """Randomized hyper-parameter search for LightGBM over wide distributions.

    Continuous and integer parameters are drawn from ``ss`` distributions,
    the rest from explicit lists. The best estimator is dumped with joblib
    next to this module as ``best_<save_file>``.

    Args:
        X_train, y_train: Data used for the cross-validated search.
        X_val, y_val: Evaluation set for early stopping (AUC, 50 rounds).
        save_file: Suffix of the output file name.
        folds: Number of stratified folds.
        param_comb: Number of sampled parameter combinations (``n_iter``).
        n_jobs: Parallelism for both the estimator and the search.
        scoring: Scoring passed to ``RandomizedSearchCV``.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    fold_splitter = StratifiedKFold(n_splits=folds, shuffle=True,
                                    random_state=123)
    estimator = LGBMClassifier(n_jobs=n_jobs, random_state=123)

    distributions = {
        "num_leaves": ss.randint(2, 50),
        "max_depth": ss.randint(3, 10),
        "learning_rate": ss.uniform(0.001, 0.5),
        "n_estimators": [1000],
        "objective": ["binary"],
        "class_weight": ["balanced", None],
        "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "reg_lambda": ss.uniform(0.05, 5),
        "reg_alpha": ss.uniform(0.05, 5),
        "min_child_weight": ss.randint(1, 15),
    }
    search_options = dict(
        estimator=estimator,
        param_distributions=distributions,
        n_iter=param_comb,
        scoring=scoring,
        n_jobs=n_jobs,
        cv=fold_splitter.split(X_train, y_train),
        verbose=0,
        random_state=123,
    )
    model_lgb = RandomizedSearchCV(**search_options)
    model_lgb.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                  eval_metric='auc', early_stopping_rounds=50)

    out_dir = os.path.dirname(__file__)
    joblib.dump(model_lgb.best_estimator_,
                os.path.join(out_dir, f'best_{save_file}'),
                compress=1)
    return model_lgb
def autoengineer_ratios(self, ae_params=None, n_iter=1000):
    """Randomly search for ratio features that improve cross-validated AUC.

    In each iteration two columns are sampled (weighted by
    ``self.ae_feature_importances``), the ratio ``a / (b + 1)`` is appended
    as a third column and a 5-fold grid search compares the model on the
    first two columns with the model on all three. Pairs whose ratio helps
    are appended to ``self.ae_discovery_ratios`` as
    ``(a, b, auc_with_ratio / auc_without)``.

    Args:
        ae_params: LGBMClassifier keyword arguments; a built-in GPU
            configuration is used when ``None``.
        n_iter: Number of random column pairs to try.
    """
    if ae_params is None:
        ae_params = {
            'boosting_type': 'gbdt',
            'max_depth': -1,
            'objective': 'binary',
            'learning_rate': 0.0212,
            'reg_alpha': 0.8,
            'reg_lambda': 0.4,
            'subsample': 1,
            'feature_fraction': 0.3,
            'device_type': 'gpu',
            'metric': 'auc',
            'random_state': 123,
            'n_estimators': 300,
            'num_leaves': 40,
            'max_bin': 255,
            'min_data_in_leaf': 2400,
            'min_data_in_bin': 5
        }

    def _fn_column_selector(X, k):
        ''' select up to kth column '''
        return X[:, :k]

    ColumnSelector = FunctionTransformer(_fn_column_selector, validate=False)
    importance_weights = (self.ae_feature_importances /
                          self.ae_feature_importances.sum())
    # random_state is omitted on purpose: without shuffle=True it has no
    # effect on the folds and current scikit-learn rejects the combination.
    kfold = StratifiedKFold(n_splits=5)
    model = Pipeline([('selector', ColumnSelector),
                      ('clf', LGBMClassifier(**ae_params))])

    for _ in range(n_iter):
        random_vars = list(
            choice(self.X_train.columns,
                   size=2,
                   p=importance_weights,
                   replace=False))
        # Explicit copy so adding the ratio column never targets a view of
        # self.X_train.
        X_tmp = self.X_train.loc[:, random_vars].copy()
        X_tmp['_DIV_'.join(random_vars)] = (
            X_tmp.iloc[:, 0] / (X_tmp.iloc[:, 1] + 1))

        # k=2 scores the two raw columns, k=3 adds the ratio column.
        gs = GridSearchCV(
            estimator=model,
            param_grid={'selector__kw_args': [{'k': 2}, {'k': 3}]},
            scoring='roc_auc',
            cv=kfold)
        gs.fit(X_tmp.values, self.y_train)
        perf_1, perf_2 = gs.cv_results_.get('mean_test_score')
        if perf_2 > perf_1:
            self.ae_discovery_ratios.append(
                (random_vars[0], random_vars[1], perf_2 / perf_1))
def optimize_model(character_model):
    """
    Run hyperparameter optimization for a LightGBM classifier.

    The optimization results returned by ``character_model.optimize_model``
    are saved as an image named "optimized_model".

    Parameters:
    -----------
    character_model: CharacterPredictiveModel
        Fitted character model for optimizing

    Returns:
    -----------
    CharacterPredictiveModel
        The same ``character_model`` object that was passed in
    """
    if verbose:
        print("Performing hyperparameter optimization on best model")

    # Search space for the LightGBM step, keyed by bare parameter name and
    # prefixed with the step name below.
    candidate_values = {
        'n_estimators': [5, 100, 500, 700, 1000, 1500, 4000],
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [1, 3, 5, 6, 10],
        'subsample': [0.15, 0.25, 0.5, 0.75, 1],
        'num_leaves': [31, 64, 128],
    }
    param_grid = {'best_model__' + name: values
                  for name, values in candidate_values.items()}
    estimators = {"best_model": LGBMClassifier(random_state=123)}

    results_df = character_model.optimize_model(estimators, param_grid)
    save_img_large(results_df, output_dir, "optimized_model", filename_prefix)

    if verbose:
        print("Model optimization complete!")
    return character_model
def compare_decision_tree_forests(character_model):
    """
    Compare four tree-ensemble classifiers and save the comparison image.

    Parameters:
    -----------
    character_model: CharacterPredictiveModel
        Fitted character model for testing

    Returns:
    -----------
    None
    """
    if verbose:
        print("Comparing random forest type model(s)")

    # Tree-ensemble classifiers to compare, keyed by display name.
    seed = 123
    candidates = {}
    candidates["Random Forest Classifier"] = RandomForestClassifier(
        random_state=seed)
    candidates["XGBClassifier"] = XGBClassifier(eval_metric="mlogloss",
                                                random_state=seed)
    candidates["LGBMClassifier"] = LGBMClassifier(num_leaves=31,
                                                  random_state=seed)
    candidates["CatBoostClassifier"] = CatBoostClassifier(random_state=seed,
                                                          verbose=0)

    comparison_df = character_model.model_compare(candidates)
    save_img(comparison_df, output_dir, "forest_model_comparison",
             filename_prefix)

    if verbose:
        print("Trained Forest model(s)")
def LGB_train(self, X_train, X_valid, labels_train, labels_valid, X_test,
              lgb_param_all):
    """Fit a LightGBM regressor/classifier and predict validation and test.

    Args:
        X_train, labels_train: Training data.
        X_valid, labels_valid: Validation data (early-stopping eval set).
        X_test: Test features.
        lgb_param_all: Model parameters plus the control keys
            ``objective_type`` (required; ``'regressor'`` selects
            ``LGBMRegressor``), ``early_stopping_rounds`` and
            ``categorical_feature`` (optional). The mapping is not modified.

    Returns:
        ``(val_lgb_pre, test_lgb_pre, score_lgb)``.
    """
    lgb_param = lgb_param_all.copy()
    objective_type = lgb_param.pop('objective_type')

    # Fit-time controls are split off from the constructor parameters.
    fit_control = {'early_stopping_rounds': 100, 'categorical_feature': 'auto'}
    for key in ('early_stopping_rounds', 'categorical_feature'):
        if key in lgb_param:
            fit_control[key] = lgb_param.pop(key)

    # Reuse a previously saved model unless a retrain is requested.
    # NOTE(review): fit() is called without init_model, so whether this
    # really continues training the loaded model should be confirmed.
    existing = None
    if not self.config.retrain:
        existing = self.load_model()
        if not existing:
            print('不存在模型:{},从头训练'.format(self.modelName))

    if existing:
        clf = existing
    elif objective_type == 'regressor':
        clf = LGBMRegressor(**lgb_param)
    else:
        clf = LGBMClassifier(**lgb_param)

    clf = clf.fit(X_train, labels_train,
                  eval_set=[(X_valid, labels_valid)],
                  eval_metric='rmse',
                  early_stopping_rounds=fit_control['early_stopping_rounds'],
                  categorical_feature=fit_control['categorical_feature'])

    best_iter = clf.best_iteration_
    val_lgb_pre = clf.predict(X_valid.values, num_iteration=best_iter)
    test_lgb_pre = clf.predict(X_test.values, num_iteration=best_iter)

    myMetrics = defindMetrics.MyMetrics(self.config.metrics_name)
    score_lgb = myMetrics.metricsFunc(val_lgb_pre, labels_valid)
    self.save_model(clf, self.config.saveModel)
    return val_lgb_pre, test_lgb_pre, score_lgb
def feature_selcetion(df_final, train_y, num=train_num):
    """Keep the (up to) 1000 most important feature columns of ``df_final``.

    A LightGBM classifier is trained on an 80/20 stratified split of the
    first ``num`` rows; features are ranked by ``feature_importances_`` and
    the ranking is written to ``path + 'feature_impo.csv'``.

    Args:
        df_final: 2-D matrix supporting ``[:n]`` and ``[:, cols]`` indexing.
        train_y: Table with a ``'label'`` column.
        num: Number of leading rows that carry labels.

    Returns:
        ``df_final`` restricted to the selected columns, ordered by
        ascending importance.
    """
    trn_x, trn_y = df_final[:num], train_y['label'][:num]
    x_train, x_val, y_train, y_val = train_test_split(
        trn_x, trn_y, train_size=0.8, random_state=2019, stratify=trn_y)
    clf = LGBMClassifier(learning_rate=0.05, n_estimators=10000,
                         subsample=0.8, subsample_freq=1,
                         colsample_bytree=0.8, random_state=2019)
    t = time.time()
    clf.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            early_stopping_rounds=20, verbose=5)
    print('runtime: {}\n'.format(time.time() - t))

    # Rows are sorted by ascending importance, so the best features are last.
    feature_impo = pd.DataFrame(
        sorted(zip(clf.feature_importances_, range(df_final.shape[1]))),
        columns=['Value', 'Feature'])
    feature_impo.to_csv(path + 'feature_impo.csv', index=False)

    # tail() copes with fewer than 1000 features, where the former
    # ``[n - 1000:n]`` slice started at a negative index.
    head_1000_fea = feature_impo['Feature'].tail(1000).tolist()
    df_final = df_final[:, head_1000_fea]
    print('特征选择已完成')
    return df_final
def __init__(self, estimator=LGBMClassifier(), cv=5, random_state=None,
             n_repeats=None):
    """Configure the estimator and the stratified cross-validation splitter.

    Args:
        estimator: Estimator to cross-validate.
            NOTE(review): the default instance is created once at definition
            time and shared by every object that relies on the default.
        cv: Number of folds.
        random_state: Seed forwarded to the splitter.
        n_repeats: When truthy, use repeated stratified K-fold with this many
            repetitions; otherwise a single shuffled stratified K-fold.
    """
    self.estimator = estimator
    if n_repeats:
        # Keywords are required here: positionally, the second argument of
        # RepeatedStratifiedKFold is n_repeats, so the former ``True`` made
        # it run a single repetition regardless of the caller's n_repeats.
        self._kf = RepeatedStratifiedKFold(n_splits=cv,
                                           n_repeats=n_repeats,
                                           random_state=random_state)
        self._num_preds = cv * n_repeats
    else:
        self._kf = StratifiedKFold(n_splits=cv, shuffle=True,
                                   random_state=random_state)
        self._num_preds = cv
def main():
    """Train a LightGBM fraud model and write ``submit6.csv``.

    ``transaction_new.csv`` is merged with the ``score`` column of
    ``submit_disv1.csv``. Rows with ``split == 1`` are divided 70/30 into
    train/test, KS and AUC are printed for both, and rows with
    ``split == 2`` are scored and written out.
    """
    transaction = pd.read_csv('transaction_new.csv')
    dis = pd.read_csv('submit_disv1.csv')
    transaction_new = pd.merge(transaction,
                               dis[['TransactionID', 'score']],
                               on='TransactionID')

    # Feature columns are renamed so that they contain no spaces.
    non_features = ('TransactionID', 'split', 'isFraud')
    fmap = {f: f.replace(' ', '_')
            for f in transaction_new.columns if f not in non_features}
    transaction_new = transaction_new.rename(columns=fmap)
    feature_cols = list(fmap.values())

    data = transaction_new[transaction_new['split'] == 1]
    # Explicit copy: a column is assigned to this frame further down.
    valid = transaction_new[transaction_new['split'] == 2].copy()

    train, test = train_test_split(data, test_size=0.3, random_state=42)
    train_x = train[feature_cols]
    test_x = test[feature_cols]
    train_y = train['isFraud'].astype('int')
    test_y = test['isFraud'].astype('int')

    clf = LGBMClassifier(
        boosting_type='gbdt',
        colsample_bytree=0.2,
        drop_rate=0.1,
        importance_type='split',
        learning_rate=0.04,
        max_bin=500,
        max_depth=4,
        min_child_samples=50,
        min_split_gain=0.1,
        n_estimators=500,
        n_jobs=-1,
        num_leaves=9,
        objective=None,
        random_state=24,
        reg_alpha=40,
        reg_lambda=10,
        sigmoid=0.4,
        silent=True,
        subsample_for_bin=24000,
        is_unbalance=True,
        subsample_freq=1)
    clf.fit(train_x, train_y)

    train_y_pred = clf.predict_proba(train_x)[:, 1]
    train_ks = cal_ks_scipy(train_y_pred, train_y)
    y_pred = clf.predict_proba(test_x)[:, 1]
    test_ks = cal_ks_scipy(y_pred, test_y)
    print(train_ks, test_ks)

    tr_auc = metrics.roc_auc_score(train_y, train_y_pred)
    te_auc = metrics.roc_auc_score(test_y, y_pred)
    print(tr_auc, te_auc)

    # Score with the columns the booster was trained on, via the public
    # ``booster_`` accessor instead of the private ``_Booster``.
    model_features = clf.booster_.feature_name()
    valid['isFraud'] = clf.predict_proba(valid[model_features])[:, 1]
    valid[['TransactionID', 'isFraud']].to_csv('submit6.csv', index=False)
def _set_algorithm(self, prms):
    """Return a binary LGBMClassifier with fixed settings plus ``prms``.

    Args:
        prms: Extra keyword arguments for ``LGBMClassifier``. A key that
            repeats one of the fixed settings raises ``TypeError``.
    """
    fixed_settings = dict(
        objective="binary",
        n_estimators=1000,
        learning_rate=0.3,
        min_child_samples=40,
        reg_alpha=0.5,
        reg_lambda=0.5,
    )
    return LGBMClassifier(**fixed_settings, **prms)
def single_model(df_final, train_y, weight=None, metric=None):
    """Average 5-fold LightGBM class probabilities for the test rows.

    Args:
        df_final: 2-D matrix; the first ``train_num`` rows are training
            rows, rows from ``test_num`` onwards are test rows (both are
            module-level names).
        train_y: Table with a ``'label'`` column.
        weight: Optional per-training-row sample weights, aligned with the
            training rows.
        metric: Optional ``eval_metric`` forwarded to ``fit``.

    Returns:
        Array of shape ``(n_test_rows, 33)`` with the fold-averaged
        probabilities.
    """
    train_values, test_values = df_final[:train_num], df_final[test_num:]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    clf = LGBMClassifier(learning_rate=0.05, n_estimators=10000,
                         subsample=0.8, subsample_freq=1,
                         colsample_bytree=0.8, random_state=2019)
    # 33 output columns are hard-coded; predict_proba must return that many.
    test_pred_prob = np.zeros((test_values.shape[0], 33))
    all_weights = None if weight is None else np.asarray(weight)

    folds = skf.split(train_values, train_y['label'])
    for i, (trn_idx, val_idx) in enumerate(folds):
        print(i, 'fold...')
        t = time.time()
        trn_x, trn_y = train_values[trn_idx], train_y['label'][trn_idx]
        val_x, val_y = train_values[val_idx], train_y['label'][val_idx]
        # Weights must be restricted to the rows of this fold; passing the
        # full-length vector does not match trn_x.
        fold_weight = None if all_weights is None else all_weights[trn_idx]
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                sample_weight=fold_weight,
                eval_metric=metric,
                early_stopping_rounds=100,
                verbose=5)
        test_pred_prob += clf.predict_proba(
            test_values, num_iteration=clf.best_iteration_) / skf.n_splits
        print('runtime: {}\n'.format(time.time() - t))
    print('单模型拟合已完成')
    return test_pred_prob
def make_cls(self):
    """Build the LightGBM classifier wrapped in ``AutoExitingGBMLike``.

    ``learning_rate`` and ``min_child_samples`` are read through the
    ``.get()`` accessors on ``self``; the wrapper is told to pass
    ``verbose=False`` to ``fit``.
    """
    tunable = {
        'learning_rate': self.learning_rate.get(),
        'min_child_samples': self.min_child_samples.get(),
    }
    booster = LGBMClassifier(n_estimators=2048, num_leaves=1024,
                             subsample=0.75, **tunable)
    quiet_fit = {'verbose': False}
    return AutoExitingGBMLike(booster, additional_fit_args=quiet_fit)
def __init__(self, x, hyper_search_strat, hyper_search_kwargs, n_trees=512,
             preprocessor=None, transformer=None,
             additional_hyper_dists=None):
    """Initialise the base class with an ``n_trees``-tree LGBMClassifier.

    Args:
        x: Stored as ``self.x``.
        hyper_search_strat: Forwarded to the base class.
        hyper_search_kwargs: Forwarded to the base class.
        n_trees: ``n_estimators`` of the LightGBM classifier.
        preprocessor: Accepted but not forwarded anywhere.
            NOTE(review): confirm whether the base class should receive it.
        transformer: Forwarded to the base class.
        additional_hyper_dists: Forwarded to the base class.
    """
    base_options = dict(
        estimator=LGBMClassifier(n_estimators=n_trees),
        hyper_search_strat=hyper_search_strat,
        hyper_search_kwargs=hyper_search_kwargs,
        transformer=transformer,
        additional_hyper_dists=additional_hyper_dists,
    )
    super().__init__(**base_options)
    self.x = x
def train_stack(self, n_features, n_classes, estimators, tr_stack, y_train,
                val_stack, y_val, test_stack, score_name):
    """Train one stacking-layer model per estimator name.

    Each model is tuned by ``random_model`` and then fitted on the
    validation stack; probabilities are predicted for the train, validation
    and test stacks.

    Args:
        n_features, n_classes: Forwarded to ``select_model``.
        estimators: Iterable of short names ('xgb', 'lgb', 'rfo', 'log',
            'svc', 'knn', 'ada', 'ext').
        tr_stack, y_train: Stacked training features and labels.
        val_stack, y_val: Stacked validation features and labels.
        test_stack: Stacked test features.
        score_name: Score identifier forwarded to ``random_model``.

    Returns:
        ``(tr_pred, val_pred, test_pred)``: three lists with one probability
        array per estimator name.

    Raises:
        ValueError: If an estimator name is not one of the supported ones.
    """
    classifier_types = {
        'xgb': XGBClassifier,
        'lgb': LGBMClassifier,
        'rfo': RandomForestClassifier,
        'log': LogisticRegression,
        'svc': SVC,
        'knn': KNeighborsClassifier,
        'ada': AdaBoostClassifier,
        'ext': ExtraTreesClassifier,
    }

    tr_pred = []
    val_pred = []
    test_pred = []
    for estimator_name in estimators:
        estimator, params = select_model(estimator_name, n_features,
                                         n_classes)
        # Train 2nd and 3rd layer with val_stack and evaluate with tr_stack
        train_kwargs = {
            'estimator': estimator,
            'params': params,
            'X_train': val_stack,
            'y_train': y_val,
            'X_val': tr_stack,
            'y_val': y_train,
            'n_iter': 100,
            'score_name': score_name,
            'report': False,
            'cv': 3,
            'random_state': 42,
        }
        # Random train with stacked data and get best_params
        params, _, _ = random_model(**train_kwargs)

        # An unknown name used to fall through the if/elif chain and either
        # crash on an unbound ``clf`` or silently reuse the classifier of
        # the previous iteration.
        if estimator_name not in classifier_types:
            raise ValueError(
                'Unknown estimator name: {!r}'.format(estimator_name))
        clf = classifier_types[estimator_name](**params)

        clf.fit(val_stack, y_val)
        tr_pred.append(clf.predict_proba(tr_stack))
        val_pred.append(clf.predict_proba(val_stack))
        test_pred.append(clf.predict_proba(test_stack))
    return tr_pred, val_pred, test_pred
def get_ntree():
    """Scan ``n_estimators`` from 10 to 800 (step 10) and log micro-F1.

    Each model is fitted on the module-level ``X_t``/``y_t`` and scored on
    ``X_t``/``y_t`` and ``X_v``/``y_v``; every score pair is appended to a
    text file.

    Returns:
        ``(f1_t_total, f1_v_total)``: lists with the training and validation
        micro-F1 for each tree count, in iteration order.
    """
    scores_path = ('D:\\workspace python\\contest\\accu_save\\' +
                   'lgbbase_810_2.txt')
    f1_t_total, f1_v_total = [], []
    for ntree in range(10, 810, 10):
        lgb_base = LGBMClassifier(n_estimators=ntree, objective='binary',
                                  random_state=1234, n_jobs=2,
                                  colsample_bytree=0.8, reg_alpha=1,
                                  max_depth=15, subsample=0.8)
        print('此时 ntree = %s' % ntree)
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        f1_t_each = f1_score(y_t, y_t_pre, average='micro')
        f1_v_each = f1_score(y_v, y_v_pre, average='micro')
        f1_t_total.append(f1_t_each)
        f1_v_total.append(f1_v_each)
        # The context manager closes the handle even if the write fails.
        with open(scores_path, 'a', encoding='utf-8') as scores_file:
            print(f1_t_each, ',', f1_v_each, file=scores_file)
    return f1_t_total, f1_v_total
def evaluate_age():
    """Train and report a LightGBM classifier for the ``age`` column.

    Reads the combined feature CSV, holds out 20 % of the rows, trains with
    early stopping on that hold-out, prints a classification report for it
    and dumps the model to ``data/lgb_age``.
    """
    source_csv = 'data/combine_feature/part-00000-380aaa4b-c838-43f4-8cb7-80164a4256f2-c000.csv'
    table = pd.read_csv(source_csv)
    y = table.age.values
    features = table.drop(['user_id', 'age', 'gender'], axis=1)
    print(features.shape)

    X_train, X_test, y_train, y_test = train_test_split(features, y,
                                                        test_size=0.2)
    model_options = dict(
        n_estimators=200,
        num_leaves=100,
        feature_fraction=0.75,
        bagging_fraction=0.75,
        learning_rate=0.1,
    )
    lightgbm = LGBMClassifier(**model_options)
    lightgbm.fit(X_train, y_train,
                 eval_set=[(X_test, y_test)],
                 early_stopping_rounds=5)

    print(classification_report(y_test, lightgbm.predict(X_test)))
    joblib.dump(lightgbm, 'data/lgb_age')
def get_base_models():
    """Return the baseline classifiers as ``(short_name, estimator)`` pairs.

    All estimators use their default settings, except the SVM, which enables
    probability estimates and caps ``max_iter`` at 100.
    """
    return [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('DTC', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC(probability=True, max_iter=100)),
        ('AB', AdaBoostClassifier()),
        ('RF', RandomForestClassifier()),
        ('ET', ExtraTreesClassifier()),
        ('LGBM', LGBMClassifier()),
    ]