def test():
    """Smoke-test create_model / tune_model on the Boston housing dataset.

    Each of the three boosting regressors (``gbr``, ``xgboost``,
    ``lightgbm``) is created and then tuned in turn. The test passes as
    long as none of those calls raises.
    """
    from pycaret.datasets import get_data

    data = get_data("boston")

    from pycaret.regression import setup, create_model, tune_model

    # Experiment initialisation; the fixed session_id seeds the run.
    setup(data, target="medv", silent=True, html=False, session_id=123)

    # Same estimators, same order: create each one, then tune it.
    for estimator_id in ("gbr", "xgboost", "lightgbm"):
        base_model = create_model(estimator_id)
        tune_model(base_model)

    # Reaching this point without an exception is the pass condition.
    assert 1 == 1
import os

import pandas as pd
from pycaret.regression import setup, create_model, tune_model, save_model

# Train, tune and persist a linear-regression model on the insurance data.

# Load the raw data and show the first rows as a quick sanity check.
data = pd.read_csv('C:/tmp/insurance.csv', delimiter=',')
print(data.head())

# Initialise the experiment with 'charges' as the target. Normalisation and
# the extra feature-engineering steps are switched on; 'age' and 'bmi' are
# binned.
r2 = setup(
    data,
    target='charges',
    session_id=123,
    normalize=True,
    polynomial_features=True,
    trigonometry_features=True,
    feature_interaction=True,
    bin_numeric_features=['age', 'bmi'],
)

# Fit the baseline linear regression, then tune it.
lr = create_model('lr')
tuned_lr = tune_model(lr)

# The model is written under ./models; make sure that directory exists so
# the run does not fail on a missing path after training has finished.
model_name = './models/lr_deployment_20210521'
os.makedirs(os.path.dirname(model_name), exist_ok=True)
save_model(tuned_lr, model_name=model_name)
def regression_model(*, y_col, training_set, normalize, test_size, folds,
                     metric, model_name, testing_set, imbalanced, seed,
                     include_models, normalize_method):
    """
    Build a regression model for prediction.

    The candidate models are compared, the best one is re-created, tuned
    and finalized, and the result is saved under ``model_name``. Along
    the way the comparison grid and the tuning grid are written to
    ``<model_name>_compare_models.tsv`` and ``<model_name>_tuned_model.tsv``
    and three plots (default, ``feature`` and ``error``) are saved. When an
    external testing set is supplied, the saved model is evaluated on it
    and the predictions are written to
    ``<model_name>_external_testing_results.tsv``.

    Parameters
    ----------
    y_col : str
        the name of the target column.
    training_set : pd.DataFrame
        DataFrame containing the training data.
    normalize : bool
        if True the dataset will be normalized before training.
    test_size : float
        Between [0.0-1.0]. The size of the split for test within the
        training set.
    folds : int
        number of folds for cross validation.
    metric : str
        the metric used for evaluating the best model. Falls back to
        'RMSE' when empty or None.
    model_name : str
        the name to save the model; also used as the prefix of every
        output file.
    testing_set : pd.DataFrame or None
        the external dataset for evaluating the best model. The external
        evaluation is skipped when it is None or has no rows.
    imbalanced
        accepted for interface compatibility; not used by this function.
    seed : int
        random number to initialize the process.
    include_models : List
        a list of models to be included in the process.
    normalize_method : str
        The method used for normalizing the data.

    Returns
    -------
    Final regression model
    """
    if not metric:
        metric = 'RMSE'

    # The return value of setup is not needed here; the call configures
    # the experiment used by the pyreg calls below.
    pyreg.setup(target=y_col, data=training_set, normalize=normalize,
                normalize_method=normalize_method,
                train_size=1 - test_size, fold=folds, silent=True,
                session_id=seed)

    # Rank the candidate models and keep the comparison grid.
    best_model = pyreg.compare_models(sort=metric, include=include_models)
    pyreg.pull().to_csv(model_name + '_compare_models.tsv', sep='\t',
                        index=False)

    # Re-create the winner, tune it and keep the tuning grid.
    reg_model = pyreg.create_model(best_model)
    reg_tuned_model = pyreg.tune_model(reg_model, optimize=metric)
    pyreg.pull().to_csv(model_name + '_tuned_model.tsv', sep='\t',
                        index=False)

    final_model = pyreg.finalize_model(reg_tuned_model)

    pyreg.plot_model(final_model, save=True)
    pyreg.plot_model(final_model, plot='feature', save=True)
    pyreg.plot_model(final_model, plot='error', save=True)

    pyreg.save_model(final_model, model_name)

    # A missing testing set is treated like an empty one instead of
    # crashing on `.index` after the whole pipeline has already run.
    if testing_set is not None and len(testing_set.index) != 0:
        # Note: test_regressor receives the target *values* via `y_col`,
        # not the column name.
        unseen_predictions = test_regressor(
            model_path=model_name + '.pkl',
            x_set=testing_set.drop(columns=[y_col]),
            y_col=testing_set[y_col],
            output=model_name)
        unseen_predictions.to_csv(
            model_name + '_external_testing_results.tsv',
            sep='\t', index=True)

    return final_model
fold=3, round=5, )  # NOTE(review): closes a call that begins outside this view; left untouched

# ---- Model tuning ---------------------------------------------------------------------------------
# Initialize the model with fixed parameters.
params = {'max_features': 'auto'}
rgsr = create_model('rf', verbose=False, **params)

# Tune the model over the custom search grid defined below.
params4tuning = {
    "n_estimators": np.arange(30, 250, 30),
    "min_samples_leaf": [10, 15, 20, 30, 40, 50],
    "min_samples_split": [20, 30, 40],
}
rgsr_tuned = tune_model(rgsr, optimize='R2', n_iter=2, fold=5, round=2, custom_grid=params4tuning)

# ---- Model training and prediction ---------------------------------------------------------------------------
# evaluate_model(rgsr_tuned)

# ---- Model interpretability -----------------------------------------------------------------------------
interpret_model(rgsr_tuned, plot='summary')