def fit_lightgbm(self, x, y, early_stopping_rounds):
    """Tune, build and fit the LightGBM base model.

    Hyper-parameters come from ``self.lgb_opt``. When
    ``early_stopping_rounds`` is a positive int, a stratified validation
    split is held out and used for early stopping; otherwise the model
    is fitted on all of ``x``/``y``.

    :param x: training features
    :param y: binary training labels (used for stratification too)
    :param early_stopping_rounds: rounds without improvement before
        stopping, or ``None``/``0`` to disable
    """
    print('fit lightgbm')
    optimized_params = self.lgb_opt.optimize(x, y)
    optimized_params['objective'] = 'binary'
    optimized_params['random_state'] = self.random_state
    optimized_params['n_jobs'] = -1
    self.lgb_model = LGBMModel(**optimized_params)
    # BUG FIX: the original called self.lgb_model.fit(x, y) unconditionally
    # here, training a model that was immediately refit below (wasted work);
    # it also never forwarded early_stopping_rounds to fit(), so early
    # stopping was silently disabled.
    if early_stopping_rounds is not None and early_stopping_rounds > 0:
        x_train, x_valid, y_train, y_valid = train_test_split(
            x, y, stratify=y, shuffle=True, test_size=self.test_size,
            random_state=self.random_state)
        self.lgb_model.fit(x_train, y_train,
                           eval_set=[(x_valid, y_valid)],
                           early_stopping_rounds=early_stopping_rounds,
                           verbose=self.verbose)
    else:
        self.lgb_model.fit(x, y)
def fit(self, x, y, early_stopping_rounds=None):
    """Fit all base models, then fit the LightGBM meta-model on the
    stacked base-model predictions.

    :param x: training features
    :param y: binary training labels
    :param early_stopping_rounds: if a positive int, the meta-model is
        trained with a held-out validation split and early stopping
    """
    self.fit_lightgbm(x, y, early_stopping_rounds)
    self.fit_knn(x, y)
    self.fit_mlp(x, y, early_stopping_rounds)
    self.fit_svm(x, y)
    x_stack = self.stack_predict(x)
    print('fit stack')
    # NOTE(review): meta-model parameters are tuned on the raw features;
    # tuning on x_stack may be what was intended — confirm.
    optimized_params = self.opt.optimize(x, y)
    optimized_params['objective'] = 'binary'
    self.model = LGBMModel(**optimized_params)
    if early_stopping_rounds is not None and early_stopping_rounds > 0:
        # BUG FIX: this branch used to re-fit self.lgb_model on the raw
        # features, clobbering the already-trained base model and leaving
        # the meta-model trained without validation. Split the *stacked*
        # features and fit the meta-model instead.
        x_train, x_valid, y_train, y_valid = train_test_split(
            x_stack, y, stratify=y, shuffle=True, test_size=self.test_size,
            random_state=self.random_state)
        self.model.fit(x_train, y_train,
                       eval_set=[(x_valid, y_valid)],
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=self.verbose)
    else:
        self.model.fit(x_stack, y)
def find_n_estimators(model: LGBMModel,
                      X_train,
                      y_train,
                      eval_metric,
                      learning_rates=(0.01, 0.03, 0.1, 0.3),
                      max_n_estimators=20000,
                      early_stopping_rounds=200,
                      validation_size=0.25,
                      random_state=None,
                      verbose=True):
    """Optimize the 'n_estimators' parameter using the early stopping method.

    For each candidate 'learning_rate', the model is trained with up to
    ``max_n_estimators`` trees and LightGBM's early stopping on a held-out
    validation split; the best score and best iteration are recorded.

    Returns a DataFrame with columns ``learning_rate``, ``best_score``
    and ``best_n_estimators``.
    """
    X_dev, X_val, y_dev, y_val = train_test_split(
        X_train, y_train, test_size=validation_size,
        random_state=random_state)
    # Collect rows in a list: DataFrame.append was deprecated and removed
    # in pandas >= 2.0, and building once at the end is O(n) instead of
    # O(n^2).
    records = []
    for learning_rate in tqdm(learning_rates, desc='learning rates',
                              disable=not verbose):
        model.learning_rate = learning_rate
        model.n_estimators = max_n_estimators
        model.random_state = random_state
        model.fit(X_dev, y_dev,
                  eval_set=(X_val, y_val),
                  eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=False)
        records.append({
            'learning_rate': learning_rate,
            'best_score': model.best_score_['valid_0'][eval_metric],
            'best_n_estimators': model.best_iteration_,
        })
    results = pd.DataFrame(
        records,
        columns=['learning_rate', 'best_score', 'best_n_estimators'])
    results['best_n_estimators'] = results['best_n_estimators'].astype(int)
    return results
def test_update_booster():
    """The wrapped booster should gain exactly one tree per update
    iteration, so n_estimators grows by n_iterations."""
    X_fit = np.random.randn(100, 10)
    y_fit = np.random.randint(0, 2, 100)
    X_holdout = np.random.randn(20, 10)
    y_holdout = np.random.randint(0, 2, 20)
    init_n_estimators, n_new_iterations = 2, 5
    scoring = make_scorer(accuracy_score)
    base_model = LGBMModel(objective='binary',
                           n_estimators=init_n_estimators,
                           max_depth=1)
    classifier = SHLGBMEstimator(model=base_model,
                                 ressource_name='n_estimators')
    classifier.fit(X_fit, y_fit)
    classifier.update(X_fit, y_fit, X_holdout, y_holdout,
                      scoring=scoring, n_iterations=n_new_iterations)
    expected = init_n_estimators + n_new_iterations
    assert classifier.get_params()['n_estimators'] == expected
def lgbm_model(self):
    """Build the LightGBM regressor and fit it on the log-transformed
    target, early-stopping on the validation set."""
    params = dict(
        objective='regression',
        metric='rmse',
        n_estimators=1000,
        learning_rate=0.01,
        min_child_samples=100,
        bagging_fraction=0.7,
        feature_fraction=0.5,
        bagging_freq=5,
        bagging_seed=2020,
    )
    estimator = LGBMModel(**params)
    # fit() returns the estimator itself; keep the fitted instance.
    self.model = estimator.fit(
        self.train_X,
        self.train_log_y,
        eval_set=(self.val_X, self.val_log_y),
        early_stopping_rounds=100,
        verbose=100,
    )
def main():
    """Train a LightGBM regressor on the training sample, pick the best
    assessment step, predict on the test set and write raw + adjusted
    submission CSVs."""
    from lightgbm import LGBMModel
    from sklearn.metrics import mean_absolute_error
    from assess import assess
    # Work on a 10k-row sample of the training data.
    df = load_data('train', '../input', sample_size=10000)
    columns = split_columns_by_types(df)
    # Rows with a missing target cannot be used for supervised training.
    df.drop(df[df['win_place_perc'].isnull()].index, inplace=True)
    model_params = dict(
        objective='regression',
        metric='mae',
        # n_estimators=20000,
        n_estimators=2000,
        num_leaves=31,
        learning_rate=0.05,
        bagging_fraction=0.7,
        bagging_seed=0,
        num_threads=4,
        colsample_bytree=0.7)
    assessment_log = assess(
        LGBMModel(**model_params),
        df,
        columns,
        metrics=mean_absolute_error,
        n_splits=1,
        early_stopping_rounds=200,
        # early_stopping_rounds=20000,
        verbose=1,
    )
    del df  # free the training frame before loading the test set
    # Take the last step flagged as best by assess().
    best_model = [step for step in assessment_log if step['best']].pop()
    df_test = load_data('test', '../input')
    pipeline = best_model['pipeline']
    model = best_model['model']
    x_test = pipeline.transform(df_test)
    pred_test = model.predict(x_test)
    del df_test, x_test
    df_sub = load_data('sub', '../input', normilize_names=False)
    df_sub['winPlacePerc'] = pred_test
    # NOTE(review): postprocessing presumably adjusts predictions using
    # the raw input data — confirm against its definition.
    df_sub_adjusted = postprocessing(pred_test, '../input')
    df_sub.to_csv('submission.csv', index=False)
    df_sub_adjusted.to_csv('submission_adjusted.csv', index=False)
    # Sanity check: how correlated are the raw and adjusted predictions.
    print(
        np.corrcoef(df_sub['winPlacePerc'], df_sub_adjusted['winPlacePerc']))
def test_class_name(self):
    """Family detection should accept every LightGBM estimator flavour
    and correctly distinguish CatBoost models."""
    lgbm_clf = LGBMClassifier()
    lgbm_reg = LGBMRegressor()
    lgbm_base = LGBMModel()
    cat_reg = CatBoostRegressor()
    cat_clf = CatBoostClassifier()
    for lgbm in (lgbm_clf, lgbm_reg, lgbm_base):
        assert is_lightgbm_model(lgbm) == True
    assert is_lightgbm_model(cat_reg) == False
    assert is_catboost_model(lgbm_clf) == False
    assert is_catboost_model(cat_reg) == True
    assert is_catboost_model(cat_clf) == True
def fit_lightgbm(self, x, y, early_stopping_rounds):
    """Build and fit the LightGBM model from the pre-optimized params.

    When ``early_stopping_rounds`` is not ``None``, a stratified
    validation split is held out for early stopping; otherwise the model
    is fitted on the full data.

    :param x: training features
    :param y: training labels (also used for stratification)
    :param early_stopping_rounds: rounds without improvement before
        stopping, or ``None`` to disable
    """
    self.model = LGBMModel(**self.optimized_params)
    if early_stopping_rounds is not None:
        # BUG FIX: the original split only `x`, so the variable named
        # y_valid was actually a slice of the features. Split x and y
        # together and keep the validation rows out of the training set.
        x_train, x_valid, y_train, y_valid = train_test_split(
            x, y, stratify=y, shuffle=True, test_size=self.test_size,
            random_state=self.random_state)
        # BUG FIX: the sklearn-style fit() takes eval_set as a list of
        # (X, y) tuples, not a lightgbm.Dataset.
        self.model.fit(x_train, y_train,
                       eval_set=[(x_valid, y_valid)],
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=self.verbose)
    else:
        self.model.fit(x, y)
def lightgbm(X_train, y_train, X_test, y_test):
    """Fit a default LightGBM regressor and time training/prediction.

    Returns a 5-tuple: train predictions, test predictions, training
    seconds, test-prediction seconds, and feature importances.
    """
    reg = LGBMModel(objective='regression')
    t0 = time.time()
    reg.fit(X_train, y_train)
    time_train = time.time() - t0
    pred_train = reg.predict(X_train)
    t0 = time.time()
    pred_test = reg.predict(X_test)
    time_test = time.time() - t0
    return (pred_train, pred_test, time_train, time_test,
            reg.feature_importances_)
def objective(
    num_leaves,
    scale_pos_weight,
    min_child_samples,
    bin_construct_sample_cnt,
    max_bin,
    min_sum_hessian_in_leaf,
    max_depth,
    min_split_gain,
    min_child_weight,
):
    """Cross-validated log-loss for one LightGBM hyper-parameter point.

    Closes over ``self`` (optimizer configuration), ``x`` and ``y``.
    Integer-valued parameters arrive as floats from the optimizer and
    are rounded. Returns the mean validation log-loss over the folds,
    or a large penalty (999.99) when training fails so the optimizer
    steers away from the failing configuration.
    """
    try:
        scores = []
        params = {
            'num_leaves': int(round(num_leaves, ndigits=0)),
            'scale_pos_weight': scale_pos_weight,
            'min_child_samples': int(round(min_child_samples, ndigits=0)),
            'bin_construct_sample_cnt':
                int(round(bin_construct_sample_cnt, ndigits=0)),
            'max_bin': int(round(max_bin, ndigits=0)),
            'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
            'max_depth': int(round(max_depth, ndigits=0)),
            'min_split_gain': min_split_gain,
            'min_child_weight': min_child_weight,
            'n_jobs': self.n_jobs,
            'silent': self.verbose < 1,
            'random_state': self.random_state}
        if isinstance(self.fixed_parameters, dict):
            params.update(self.fixed_parameters)
        if self.use_gpu:
            params.update({'device': 'gpu',
                           'gpu_platform_id': 1,
                           'gpu_device_id': 0})
        skf = StratifiedKFold(
            self.n_folds, shuffle=self.shuffle,
            random_state=self.random_state)
        for train_index, valid_index in skf.split(x, y):
            x_train, y_train = x[train_index, :], y[train_index]
            x_valid, y_valid = x[valid_index, :], y[valid_index]
            params['objective'] = 'binary'
            gbm = LGBMModel(**params)
            gbm.fit(x_train, y_train,
                    eval_set=[(x_valid, y_valid)],
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=int(self.verbose > 0))
            y_valid_hat = gbm.predict(x_valid,
                                      num_iteration=gbm.best_iteration_)
            loss_valid = log_loss(y_valid, y_valid_hat)
            scores.append(loss_valid)
        result = np.mean(scores)
        self.iterations.append((params, result))
        return result
    # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
    # SystemExit; narrow it while keeping the penalty behaviour.
    except Exception:
        return 999.99
class Stack(object):
    """Two-level stacking classifier.

    Four tuned base models (LightGBM, MLP, KNN, SVM) produce scores that
    are stacked column-wise and fed to a LightGBM meta-model.
    """

    def __init__(self, random_state=None, test_size=0.2, verbose=None,
                 optimization_n_call=50, optimization_n_folds=2,
                 optimization_early_stopping_rounds=1,
                 optimization_shuffle=True):
        # One hyper-parameter optimizer per base model, plus one for the
        # meta-model (self.opt).
        self.opt = LightGBMOptimizer(
            n_folds=optimization_n_folds,
            n_calls=optimization_n_call,
            early_stopping_rounds=optimization_early_stopping_rounds,
            shuffle=optimization_shuffle,
            n_jobs=-1)
        self.lgb_opt = LightGBMOptimizer(
            n_folds=optimization_n_folds,
            n_calls=optimization_n_call,
            early_stopping_rounds=optimization_early_stopping_rounds,
            shuffle=optimization_shuffle,
            n_jobs=-1)
        self.mlp_opt = MLPOptimizer(n_folds=optimization_n_folds,
                                    n_calls=optimization_n_call,
                                    shuffle=optimization_shuffle,
                                    n_jobs=-1)
        self.knn_opt = KNNOptimizer(n_folds=optimization_n_folds,
                                    n_calls=optimization_n_call,
                                    shuffle=optimization_shuffle,
                                    n_jobs=-1)
        self.svm_opt = SVMOptimizer(
            n_folds=optimization_n_folds,
            n_calls=optimization_n_call,
            early_stopping_rounds=optimization_early_stopping_rounds,
            shuffle=optimization_shuffle,
            n_jobs=-1)
        self.model = None
        self.lgb_model = None
        self.mlp_model = None
        self.knn_model = None
        self.svm_model = None
        self.random_state = random_state
        self.test_size = test_size
        self.verbose = verbose

    def stack_predict(self, x):
        """Return an (n_samples, 4) array of base-model scores."""
        lgb_y_hat = self.lgb_model.predict(
            x, num_iteration=self.lgb_model.best_iteration_)
        print(lgb_y_hat.shape)
        mlp_y_hat = self.mlp_model.predict_proba(x)[:, -1]
        print(mlp_y_hat.shape)
        knn_y_hat = self.knn_model.predict_proba(x)[:, -1]
        print(knn_y_hat.shape)
        svm_y_hat = self.svm_model.predict_proba(x)[:, -1]
        print(svm_y_hat.shape)
        return np.array([lgb_y_hat, mlp_y_hat, knn_y_hat, svm_y_hat]).T

    def fit(self, x, y, early_stopping_rounds=None):
        """Fit all base models, then the meta-model on stacked scores."""
        self.fit_lightgbm(x, y, early_stopping_rounds)
        self.fit_knn(x, y)
        self.fit_mlp(x, y, early_stopping_rounds)
        self.fit_svm(x, y)
        x_stack = self.stack_predict(x)
        print('fit stack')
        # NOTE(review): meta-model params are tuned on raw features;
        # tuning on x_stack may be what was intended — confirm.
        optimized_params = self.opt.optimize(x, y)
        optimized_params['objective'] = 'binary'
        self.model = LGBMModel(**optimized_params)
        if early_stopping_rounds is not None and early_stopping_rounds > 0:
            # BUG FIX: this branch used to re-fit self.lgb_model on the
            # raw features, clobbering the trained base model and leaving
            # the meta-model without validation. Split the *stacked*
            # features and fit the meta-model with early stopping.
            x_train, x_valid, y_train, y_valid = train_test_split(
                x_stack, y, stratify=y, shuffle=True,
                test_size=self.test_size, random_state=self.random_state)
            self.model.fit(x_train, y_train,
                           eval_set=[(x_valid, y_valid)],
                           early_stopping_rounds=early_stopping_rounds,
                           verbose=self.verbose)
        else:
            self.model.fit(x_stack, y)

    def fit_lightgbm(self, x, y, early_stopping_rounds):
        """Tune and fit the LightGBM base model."""
        print('fit lightgbm')
        optimized_params = self.lgb_opt.optimize(x, y)
        optimized_params['objective'] = 'binary'
        optimized_params['random_state'] = self.random_state
        optimized_params['n_jobs'] = -1
        self.lgb_model = LGBMModel(**optimized_params)
        # BUG FIX: removed the unconditional fit(x, y) that trained a
        # model immediately discarded by the refit below, and forward
        # early_stopping_rounds so early stopping actually happens.
        if early_stopping_rounds is not None and early_stopping_rounds > 0:
            x_train, x_valid, y_train, y_valid = train_test_split(
                x, y, stratify=y, shuffle=True, test_size=self.test_size,
                random_state=self.random_state)
            self.lgb_model.fit(x_train, y_train,
                               eval_set=[(x_valid, y_valid)],
                               early_stopping_rounds=early_stopping_rounds,
                               verbose=self.verbose)
        else:
            self.lgb_model.fit(x, y)

    def fit_svm(self, x, y):
        """Tune and fit the SVM base model with probability outputs."""
        print('fit svm')
        optimized_params = self.svm_opt.optimize(x, y)
        optimized_params['random_state'] = self.random_state
        self.svm_model = SVC(**optimized_params, probability=True)
        self.svm_model.fit(x, y)

    def fit_mlp(self, x, y, early_stopping_rounds):
        """Tune and fit the MLP base model; early stopping mirrors the
        boolean flag derived from early_stopping_rounds."""
        print('fit mlp')
        optimized_params = self.mlp_opt.optimize(x, y)
        optimized_params['random_state'] = self.random_state
        esr = early_stopping_rounds is not None and early_stopping_rounds > 0
        self.mlp_model = MLPClassifier(**optimized_params,
                                       early_stopping=esr,
                                       validation_fraction=self.test_size)
        self.mlp_model.fit(x, y)

    def fit_knn(self, x, y):
        """Tune and fit the KNN base model."""
        print('fit knn')
        optimized_params = self.knn_opt.optimize(x, y)
        optimized_params['n_jobs'] = -1
        self.knn_model = KNeighborsClassifier(**optimized_params)
        self.knn_model.fit(x, y)

    def predict(self, x):
        """Score new samples with the meta-model on stacked base scores."""
        x_stack = self.stack_predict(x)
        return self.model.predict(x_stack,
                                  num_iteration=self.model.best_iteration_)
class MyLGBMModel(BaseModel):
    """
    Parameters
    -----------
    ref: https://lightgbm.readthedocs.io/en/latest/Parameters.html
    model_params:
        objective: default=regression
            mae rmse poison binary multiclass cross_entropy
        boosting: default=gbdt
            gbdt rf dart goss
        learning_rate: default=0.1
        num_leaves: default=31
        num_threads: default=0
        num_class: default=1
        max_depth: default=-1
        bagging_fraction: default=1.0
        feature_fraction: default=1.0 alias=>colsample_bytree
        lambda_l1: default=0.0
        lambda_l2: default=0.0
        is_unbalance: default=False
    fit_params:
        verbose: default=1
        early_stopping_rounds: None
        eval_metric: mae mse rmse poison auc average_precision
            binary_logloss binary_error multi_logloss cross_entropy
    """
    # ref: https://qiita.com/tubo/items/f83a97f2488cc1f40088 (tubo's baseline)
    #    : https://signate.jp/competitions/402/discussions/lgbm-baseline-except-text-vs-include-text-lb07994-1 (masato's baseline)

    def __init__(self, model_params,
                 fit_params: Optional[Dict],
                 categorical_features: Optional[Union[List[str], List[int]]]):
        self.model_params = model_params
        # Normalise a missing fit_params to an empty dict once, up front.
        self.fit_params = fit_params if fit_params is not None else {}
        self.categorical_features = categorical_features

    def build_model(self):
        """Instantiate a fresh LGBMModel from the stored parameters."""
        self.model = LGBMModel(**self.model_params)
        return self.model

    def fit(self, train_x, train_y, valid_x=None, valid_y=None):
        """Build and fit a fresh model; uses (valid_x, valid_y) for
        evaluation when both are provided."""
        self.model = self.build_model()
        # BUG FIX: previously eval_set=[[None, None]] was passed whenever
        # no validation data was supplied, which crashes LightGBM.
        eval_set = ([(valid_x, valid_y)]
                    if valid_x is not None and valid_y is not None
                    else None)
        self.model.fit(train_x, train_y,
                       eval_set=eval_set,
                       categorical_feature=self.categorical_features,
                       **self.fit_params)
        return self.model

    def predict(self, est, valid_x):
        """Predict with an already-fitted estimator."""
        return est.predict(valid_x)

    def get_feature_importance(self, train_feat_df: pd.DataFrame,
                               is_save=False, filepath=None):
        """Plot (and optionally save) per-fold feature importances for
        the top-50 columns."""
        feature_importance_df = pd.DataFrame()
        # enumerate replaces the hand-rolled `num` counter.
        for num, (i, model) in enumerate(self.models.items()):
            _df = pd.DataFrame()
            _df['feature_importance'] = model.feature_importances_
            _df['column'] = train_feat_df.columns
            _df['fold'] = num + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, _df], axis=0, ignore_index=True)
        order = feature_importance_df.groupby('column')\
            .sum()[['feature_importance']]\
            .sort_values('feature_importance', ascending=False).index[:50]
        fig, ax = plt.subplots(figsize=(8, max(6, len(order) * .25)))
        sns.boxenplot(data=feature_importance_df, x='feature_importance',
                      y='column', order=order, ax=ax, palette='viridis',
                      orient='h')
        ax.tick_params(axis='x', rotation=90)
        ax.set_title('Lightgbm Feature Importance')
        ax.grid()
        # BUG FIX: the figure used to be saved *before* anything was
        # plotted (an empty PNG), and the CSV wrote `_df` — only the last
        # fold — instead of all folds.
        if is_save:
            fig.savefig(filepath + "lgbm_feature_importance.png")
            feature_importance_df.to_csv(
                filepath + "lgbm_feature_importance.csv", index=False)
        plt.show()
def build_model(self):
    """Create a fresh LGBMModel from self.model_params, store it on the
    instance and return it."""
    model = LGBMModel(**self.model_params)
    self.model = model
    return model
min_child_weight=4, objective='reg:linear', subsample=0.8)) XGBRegressor()._get_param_names() model = PipelineXGB.fit(predictor[modelList], response) resultTrain = PipelineXGB.predict(predictor[modelList]) rmsle(response, resultTrain) # rmsle2(response, resultTrain) # rmsle3(response, resultTrain) # %% PipelineLGB = make_pipeline( preprocess, LGBMModel(colsample_bytree=0.7, max_depth=4, min_child_weight=3, objective='regression', subsample=0.6)) LGBMModel()._get_param_names() model = PipelineLGB.fit(predictor[modelList], response) resultTrain = PipelineLGB.predict(predictor[modelList]) rmsle(response, resultTrain) # %% # create a function that returns a model, taking as parameters things you # want to verify using cross-valdiation and model selection def create_model(optimizer='Adagrad', kernel_initializer='he_normal',
class LGBMPredictor:
    """Loads zipped train/val/test CSVs, fits a LightGBM revenue model on
    the log target, and writes per-visitor prediction files."""

    def __init__(self):
        self.data_dir = '../../datasets'
        if not path.exists(self.data_dir):
            raise Exception(
                '{} directory not found.'.format(self.data_dir)
            )
        self.train_file = '{}/{}'.format(self.data_dir, 'train.zip')
        self.val_file = '{}/{}'.format(self.data_dir, 'val.zip')
        self.pred_val_file = '{}/{}'.format(
            self.data_dir, 'lgbm_pred_val.zip'
        )
        self.test_file = '{}/{}'.format(self.data_dir, 'test.zip')
        self.pred_test_file = '{}/{}'.format(
            self.data_dir, 'lgbm_pred_test.zip'
        )

    def load_data(self, zip_path):
        """Read one zipped CSV; fullVisitorId is kept as a string so
        leading zeros survive."""
        df = pd.read_csv(
            zip_path,
            dtype={'fullVisitorId': 'str'},
            compression='zip'
        )
        [rows, columns] = df.shape
        print('\nLoaded {} rows with {} columns from {}.\n'.format(
            rows, columns, zip_path
        ))
        return df

    def load(self):
        """Load the three datasets into dataframes."""
        print('Loading train data from {}'.format(self.train_file))
        self.train_df = self.load_data(self.train_file)
        print('Loading val data from {}'.format(self.val_file))
        self.val_df = self.load_data(self.val_file)
        print('Loading test data from {}'.format(self.test_file))
        self.test_df = self.load_data(self.test_file)

    def prepare_data(self):
        """Split ids, targets (raw and log1p) and feature matrices."""
        train_df = self.train_df
        val_df = self.val_df
        test_df = self.test_df
        self.train_id = train_df['fullVisitorId'].values
        self.val_id = val_df['fullVisitorId'].values
        self.test_id = test_df['fullVisitorId'].values
        self.train_y = train_df['totals.transactionRevenue'].values
        self.train_log_y = np.log1p(self.train_y)
        self.val_y = val_df['totals.transactionRevenue'].values
        self.val_log_y = np.log1p(self.val_y)
        self.train_X = train_df.drop(
            ['totals.transactionRevenue', 'fullVisitorId'], axis=1
        )
        self.val_X = val_df.drop(
            ['totals.transactionRevenue', 'fullVisitorId'], axis=1
        )
        self.test_X = test_df.drop(['fullVisitorId'], axis=1)
        print('\nShape of the train dataset: {}'.format(self.train_X.shape))
        print('\nShape of the val dataset: {}'.format(self.val_X.shape))
        print('\nShape of the test dataset: {}\n'.format(self.test_X.shape))

    def lgbm_model(self):
        """Build and fit the LightGBM regressor with early stopping on
        the validation set."""
        self.model = LGBMModel(
            objective='regression',
            metric='rmse',
            n_estimators=1000,
            learning_rate=0.01,
            min_child_samples=100,
            bagging_fraction=0.7,
            feature_fraction=0.5,
            bagging_freq=5,
            bagging_seed=2020
        )
        self.model = self.model.fit(
            self.train_X,
            self.train_log_y,
            eval_set=(self.val_X, self.val_log_y),
            early_stopping_rounds=100,
            verbose=100
        )

    def lgbm_predict(self, X):
        """Predict using the best iteration found by early stopping."""
        # BUG FIX: best_iteration_ was passed positionally, which bound
        # it to `raw_score` instead of `num_iteration`.
        return self.model.predict(X,
                                  num_iteration=self.model.best_iteration_)

    def lgbm_train(self):
        self.lgbm_model()

    def predict(self):
        # BUG FIX: results used to be stored as `prev_*` while the
        # evaluation methods read `pred_*` (AttributeError); one
        # consistent naming is used throughout now.
        self.pred_val = self.lgbm_predict(self.val_X)
        self.pred_test = self.lgbm_predict(self.test_X)

    def evaluate_val_prediction(self):
        """Aggregate validation predictions per visitor and compute the
        log-revenue RMSE."""
        pred_val = self.pred_val
        pred_val[pred_val < 0] = 0  # revenue cannot be negative
        pred_val_data = {
            'fullVisitorId': self.val_id,
            'transactionRevenue': self.val_y,
            'predictedRevenue': np.expm1(pred_val)
        }
        pred_val_df = pd.DataFrame(pred_val_data)
        pred_val_df = pred_val_df.groupby('fullVisitorId')
        # BUG FIX: select columns with a list — tuple selection on a
        # groupby is removed in modern pandas.
        pred_val_df = pred_val_df[['transactionRevenue',
                                   'predictedRevenue']]\
            .sum().reset_index()
        rsme_val = np.sqrt(
            mean_squared_error(
                np.log1p(pred_val_df['transactionRevenue'].values),
                np.log1p(pred_val_df['predictedRevenue'].values)
            )
        )
        self.rsme_val = rsme_val
        self.pred_val_df = pred_val_df

    def evaluate_test_prediction(self):
        """Aggregate test predictions per visitor."""
        pred_test = self.pred_test
        pred_test[pred_test < 0] = 0
        pred_test_data = {
            'fullVisitorId': self.test_id,
            'predictedRevenue': np.expm1(pred_test)
        }
        pred_test_df = pd.DataFrame(pred_test_data)
        pred_test_df = pred_test_df.groupby('fullVisitorId')
        pred_test_df = pred_test_df['predictedRevenue'].sum().reset_index()
        self.pred_test_df = pred_test_df

    def write_to_csv(self):
        """Write the aggregated val/test prediction frames as zipped CSVs."""
        self.pred_val_df.to_csv(
            self.pred_val_file, index=False, compression='zip'
        )
        self.pred_test_df.to_csv(
            self.pred_test_file, index=False, compression='zip'
        )
def run_experiment(experiment_params: Dict[str, any]) -> Dict[str, float]:
    """Run one feature-importance experiment on synthetic data.

    Generates correlated normal data and gamma-distributed feature
    weights, builds a target, then measures how well several importance
    methods (permutation, SHAP, gain, and optionally drop/permute-and-
    relearn) recover the expected feature ranking, as Spearman rank
    correlations. Returns an ordered dict of named scalar results.
    """
    # NOTE(review): the annotation uses builtin `any`; `typing.Any` is
    # probably intended — confirm imports before changing.
    experiment_results = collections.OrderedDict()
    # generate data
    data = generate_normal_correlated_data(
        mu=experiment_params["mu"],
        var=experiment_params["var"],
        n_features=experiment_params["n_features"],
        n_samples=experiment_params["n_samples"],
        max_correlation=experiment_params["max_correlation"],
        noise_magnitude_max=experiment_params["noise_magnitude_max"],
        seed=experiment_params["seed"])
    # get data's correlation statistics
    data_stats = get_correlated_data_stats(data)
    for key, value in data_stats.items():
        experiment_results[f"corr_data_{key}"] = value
    # generate weights of features
    weights = generate_weights_gamma(n_features=data.shape[1],
                                     gamma=experiment_params["gamma"],
                                     scale=experiment_params["scale"],
                                     seed=experiment_params["seed"])
    # Larger weight => more important => lower (better) rank.
    expected_ranks = rank_array(-weights)
    # generate target
    y = generate_normal_target(data, weights, task=experiment_params["task"])
    data = pd.DataFrame(data)
    # permutation importance
    model, score, importances, importances_ranks = calculate_permutation_importance(
        LGBMModel(**experiment_params["model_params"]),
        data,
        y,
        scoring_function=experiment_params["metric"],
        n_repeats=experiment_params["n_repeats_permutations"],
    )
    permutation_ranks_corr = spearmanr(expected_ranks, importances_ranks)[0]
    experiment_results["model_roc_auc"] = score
    experiment_results["permutation_ranks_corr"] = permutation_ranks_corr
    # shap
    explainer = shap.TreeExplainer(model.booster_,
                                   feature_perturbation="tree_path_dependent")
    shap_values = explainer.shap_values(data)
    # 2 - list of 2 elements for classification, select class 1.
    # NOTE(review): this assumes classification yields a length-2 list;
    # a 2-row array would also pass this check — confirm.
    if len(shap_values) == 2:
        shap_values = shap_values[1]
    shap_values = abs(shap_values)
    shap_fe = shap_values.sum(axis=0)
    shap_ranks_corr = spearmanr(expected_ranks, -shap_fe)[0]
    experiment_results["shap_ranks_corr"] = shap_ranks_corr
    # gain
    model_fe = model.booster_.feature_importance(importance_type='gain')
    gain_ranks_corr = spearmanr(expected_ranks, -model_fe)[0]
    experiment_results["gain_ranks_corr"] = gain_ranks_corr
    if experiment_params["apply_relearn"]:
        # drop and relearn
        _, _, _, importances_ranks_drop = calculate_drop_and_relearn_importance(
            LGBMModel(**experiment_params["model_params"]),
            data,
            y,
            scoring_function=experiment_params["metric"],
        )
        drop_and_relearn_ranks_corr = spearmanr(expected_ranks,
                                                importances_ranks_drop)[0]
        experiment_results[
            "drop_and_relearn_ranks_corr"] = drop_and_relearn_ranks_corr
        # permute and relearn
        _, _, _, importances_ranks_permute = calculate_permute_and_relearn_importance(
            LGBMModel(**experiment_params["model_params"]),
            data,
            y,
            scoring_function=experiment_params["metric"],
        )
        permute_and_relearn_ranks_corr = spearmanr(
            expected_ranks, importances_ranks_permute)[0]
        experiment_results[
            "permute_and_relearn_ranks_corr"] = permute_and_relearn_ranks_corr
    return experiment_results