Example #1
0
def test():
    """Smoke-test creating and tuning boosting regressors on the Boston data.

    Loads the ``boston`` dataset through PyCaret, initialises a regression
    experiment with ``medv`` as the target, then creates and tunes the
    ``gbr``, ``xgboost`` and ``lightgbm`` models in that order.

    Each tuned estimator is checked for a ``predict`` method so that the
    test asserts something about the result instead of a tautology.
    """
    from pycaret.datasets import get_data
    from pycaret.regression import setup, create_model, tune_model

    data = get_data("boston")

    # The return value of setup() is not needed: the create/tune calls below
    # take no experiment handle.
    setup(data, target="medv", silent=True, html=False, session_id=123)

    for model_id in ("gbr", "xgboost", "lightgbm"):
        tuned = tune_model(create_model(model_id))
        assert hasattr(tuned, "predict"), (
            "tune_model returned an estimator without predict() for %r"
            % model_id
        )
Example #2
0
from pathlib import Path

import pandas as pd
from pycaret.regression import setup, create_model, tune_model, save_model

# Train, tune and persist a linear-regression model on the insurance data.

# Input CSV and output model location (save_model is given the path without
# a file extension).
DATA_PATH = 'C:/tmp/insurance.csv'
MODEL_PATH = './models/lr_deployment_20210521'

data = pd.read_csv(DATA_PATH, delimiter=',')
print(data.head())

# Initialise the experiment with 'charges' as the target; a fixed session_id
# keeps runs reproducible.
r2 = setup(data, target='charges', session_id=123,
           normalize=True,
           polynomial_features=True, trigonometry_features=True,
           feature_interaction=True,
           bin_numeric_features=['age', 'bmi'])

lr = create_model('lr')
tuned_lr = tune_model(lr)

# Make sure the target directory exists first; otherwise the script fails at
# the very last step, after all training has been done.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
save_model(tuned_lr, model_name=MODEL_PATH)
Example #3
0
def regression_model(*, y_col, training_set, normalize, test_size, folds,
                     metric, model_name, testing_set, imbalanced, seed,
                     include_models, normalize_method):
    """
    Build, tune, finalize and save a regression model for prediction.

    The candidate models are compared, the best one is re-created and tuned,
    the tuned model is finalized, plotted and saved, and, when an external
    testing set is supplied, it is evaluated through ``test_regressor``.

    Files written (all prefixed with ``model_name``):
    ``_compare_models.tsv``, ``_tuned_model.tsv`` and, when an external
    testing set is given, ``_external_testing_results.tsv``. The saved model
    is later read back from ``model_name + '.pkl'``. The plots are saved by
    ``pyreg.plot_model(..., save=True)``.

    Parameters
    ----------
    y_col : str
        the name of the target column.
    training_set : pd.DataFrame
        DataFrame containing the training data.
    normalize : bool
        if True the dataset will be normalized before training.
    test_size : float
        Between [0.0-1.0]. The size of the split for test within the
        training set; ``1 - test_size`` is passed on as ``train_size``.
    folds : int
        number of folds for cross validation.
    metric : str
        the metric used for evaluating the best model. Falls back to
        ``'RMSE'`` when empty or None.
    model_name : str
        the name to save the model; also the prefix of every output file.
    testing_set : pd.DataFrame or None
        the external dataset for evaluating the best model. When None or
        without rows, the external evaluation is skipped.
    imbalanced
        accepted for interface compatibility; not used by this function.
    seed : int
        random number to initialize the process.
    include_models : List
        a list of models to be included in the process.
    normalize_method : str
        The method used for normalizing the data.

    Returns
    -------
    Final regression model

    """
    if not metric:
        metric = 'RMSE'

    # The value returned by setup() was never used, and the later pyreg calls
    # take no experiment handle, so it is not bound to a local.
    pyreg.setup(target=y_col,
                data=training_set,
                normalize=normalize,
                normalize_method=normalize_method,
                train_size=1 - test_size,
                fold=folds,
                silent=True,
                session_id=seed)

    # Rank the candidate models and keep the comparison table.
    best_model = pyreg.compare_models(sort=metric, include=include_models)
    pyreg.pull().to_csv(model_name + '_compare_models.tsv',
                        sep='\t',
                        index=False)

    # Re-create the winner, tune it on the same metric and keep that table.
    reg_model = pyreg.create_model(best_model)
    reg_tuned_model = pyreg.tune_model(reg_model, optimize=metric)
    pyreg.pull().to_csv(model_name + '_tuned_model.tsv', sep='\t', index=False)

    final_model = pyreg.finalize_model(reg_tuned_model)

    # Default plot first, then feature and error plots.
    pyreg.plot_model(final_model, save=True)
    pyreg.plot_model(final_model, plot='feature', save=True)
    pyreg.plot_model(final_model, plot='error', save=True)

    pyreg.save_model(final_model, model_name)

    # Evaluate on the external set only when one with rows was supplied.
    # A None testing set used to crash here, after the model was saved.
    if testing_set is not None and len(testing_set.index) != 0:
        unseen_predictions = test_regressor(
            model_path=model_name + '.pkl',
            x_set=testing_set.drop(columns=[y_col]),
            y_col=testing_set[y_col],
            output=model_name)
        unseen_predictions.to_csv(model_name + '_external_testing_results.tsv',
                                  sep='\t',
                                  index=True)
    return final_model
        fold=3,
        round=5,
    )

    # ---- 模型调参 ---------------------------------------------------------------------------------

    # 初始化模型, 固定参数.
    params = {'max_features': 'auto'}
    rgsr = create_model('rf', verbose=False, **params)

    # 模型调参.
    params4tuning = {
        "n_estimators": np.arange(30, 250, 30),
        "min_samples_leaf": [10, 15, 20, 30, 40, 50],
        "min_samples_split": [20, 30, 40],
    }
    rgsr_tuned = tune_model(rgsr,
                            optimize='R2',
                            n_iter=2,
                            fold=5,
                            round=2,
                            custom_grid=params4tuning)

    # ---- 模型训练和预测 ---------------------------------------------------------------------------

    # evaluate_model(rgsr_tuned)

    # ---- 模型可解释性 -----------------------------------------------------------------------------

    interpret_model(rgsr_tuned, plot='summary')