Example no. 1
def test():
    from pycaret.datasets import get_data

    # load the Boston housing dataset bundled with PyCaret
    data = get_data("boston")
    from pycaret.regression import setup, create_model, tune_model

    # initialize the regression experiment; silent/html suppress interactive prompts and HTML output
    s = setup(data, target="medv", silent=True, html=False, session_id=123)
    # create and tune three gradient-boosting regressors
    gbr = create_model("gbr")
    tuned_gbr = tune_model(gbr)
    xgboost = create_model("xgboost")
    tuned_xgboost = tune_model(xgboost)
    lightgbm = create_model("lightgbm")
    tuned_lightgbm = tune_model(lightgbm)
    # smoke test: reaching this point without an exception is the pass condition
    assert 1 == 1
Example no. 2
def _regression(self, combined_data):
    self._regression_setup(combined_data)
    # Train
    regresser = regression.create_model(self.task.pycaret_model,
                                        verbose=False)
    # Store
    if self.task.output_dir:
        self._store_regresser(regresser)
    # Predict on Test set
    predictions = regression.predict_model(
        regresser, verbose=False)  # TODO get raw_scores for AUC
    return predictions
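The two helpers this method calls are not shown in the snippet. Below is a minimal sketch of what they might look like, assuming pycaret.regression is imported as regression and that self.task also exposes a target-column attribute (named target_column here purely for illustration) alongside the pycaret_model and output_dir attributes used above.

import os
from pycaret import regression

class RegressionRunner:
    """Hypothetical host class for the _regression method shown above."""

    def __init__(self, task):
        # `task` is expected to expose pycaret_model, output_dir and (assumed) target_column
        self.task = task

    def _regression_setup(self, combined_data):
        # sketch: initialize the PyCaret regression experiment on the combined data
        regression.setup(data=combined_data,
                         target=self.task.target_column,
                         silent=True,
                         verbose=False,
                         html=False)

    def _store_regresser(self, regresser):
        # sketch: persist the trained pipeline under the task's output directory
        regression.save_model(regresser, os.path.join(self.task.output_dir, "regresser"))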
Example no. 3
from pycaret.regression import setup, create_model, tune_model, save_model
import pandas as pd

data = pd.read_csv('C:/tmp/insurance.csv', delimiter=',')
print(data.head())

# initialize the experiment: normalization plus polynomial/trigonometric features,
# feature interactions, and binning of the age and bmi columns
r2 = setup(data, target='charges', session_id=123,
           normalize=True,
           polynomial_features=True, trigonometry_features=True,
           feature_interaction=True,
           bin_numeric_features=['age', 'bmi'])

lr = create_model('lr')
tuned_lr = tune_model(lr)
save_model(tuned_lr, model_name='./models/lr_deployment_20210521')
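As a follow-up, a minimal sketch of how the saved pipeline could be reloaded and used for scoring. The new_data frame below is hypothetical; it just needs the same feature columns as insurance.csv (the name of the prediction column depends on the PyCaret version).

from pycaret.regression import load_model, predict_model
import pandas as pd

# reload the persisted pipeline (PyCaret appends the .pkl extension itself)
pipeline = load_model('./models/lr_deployment_20210521')

# score unseen rows; predict_model returns the input plus a prediction column
new_data = pd.read_csv('C:/tmp/insurance.csv', delimiter=',').drop(columns=['charges'])
scored = predict_model(pipeline, data=new_data)
print(scored.head())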
Example no. 4
def app_main():
    st.title("Machine learning analysis platform")
    if st.sidebar.checkbox('Define Data Source'):
        filesFolder = st.sidebar.text_input('folder', value="data")
        dataList = list_files(filesFolder, 'csv')
        if len(dataList) == 0:
            st.warning('No data set available')
        else:
            file_selected = st.sidebar.selectbox('Select a document', dataList)
            file_selected_path = concat_file_path(filesFolder, file_selected)
            nrows = st.sidebar.number_input('Number of lines', value=-1)
            n_rows_str = 'All' if nrows == -1 else str(nrows)
            st.info(
                f'Selected file: {file_selected_path}, rows read: {n_rows_str}'
            )
    else:
        file_selected_path = None
        nrows = 100
        st.warning('No file is currently selected, please choose one.')
    if st.sidebar.checkbox('Exploratory Analysis'):
        if file_selected_path is not None:
            if st.sidebar.button('Report Generation'):
                df = load_csv(file_selected_path, nrows)
                pr = ProfileReport(df, explorative=True)
                st_profile_report(pr)
        else:
            st.info('No file selected, analysis cannot be performed')
    if st.sidebar.checkbox('Modeling'):
        if file_selected_path is not None:
            task = st.sidebar.selectbox('Select Task', ML_LIST)
            if task == 'Regression':
                model = st.sidebar.selectbox('Select Model', RG_LIST)
            elif task == 'Classification':
                # NOTE: the classification branch also draws its choices from RG_LIST
                model = st.sidebar.selectbox('Select Model', RG_LIST)
            df = load_csv(file_selected_path, nrows)
            try:
                cols = df.columns.to_list()
                target_col = st.sidebar.selectbox('Select Prediction Object',
                                                  cols)
            except BaseException:
                st.sidebar.warning('The data format cannot be read correctly')
                target_col = None

            if target_col is not None and st.sidebar.button('Training Model'):
                if task == 'Regression':
                    st.success('Data preprocessing...')
                    pc_rg.setup(df,
                                target=target_col,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing is complete')
                    st.success('Training model...')
                    pc_rg.create_model(model, verbose=False)
                    st.success('Model training is complete.')
                    #pc_rg.finalize_model(model)
                    st.success('Model has been created')
                elif task == 'Classification':
                    st.success('Data preprocessing...')
                    pc_cl.setup(df,
                                target=target_col,
                                fix_imbalance=True,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing is complete.')
                    st.success('Training model...')
                    pc_cl.create_model(model, verbose=False)
                    st.success('Model training is complete.')
                    #pc_cl.finalize_model(model)
                    st.success('Model has been created')

    if st.sidebar.checkbox('View System Log'):
        n_lines = st.sidebar.slider(label='Number of lines',
                                    min_value=3,
                                    max_value=50)
        if st.sidebar.button("View"):
            logs = get_model_training_logs(n_lines=n_lines)
            st.text('System log')
            st.write(logs)
    try:
        allOfRuns = mlflow.search_runs(experiment_ids=0)
    except Exception:
        allOfRuns = []
    if len(allOfRuns) != 0:
        if st.sidebar.checkbox('Preview model'):
            ml_logs = 'Open the mlflow UI (run `mlflow ui` from the command line): http://kubernetes.docker.internal:5000/'
            st.markdown(ml_logs)
            st.dataframe(allOfRuns)
        if st.sidebar.checkbox('Choose a model'):
            selected_run_id = st.sidebar.selectbox(
                'Choose from saved models',
                allOfRuns[allOfRuns['tags.Source'] ==
                          'create_model']['run_id'].tolist())
            selected_run_info = allOfRuns[(
                allOfRuns['run_id'] == selected_run_id)].iloc[0, :]
            st.code(selected_run_info)
            if st.sidebar.button('Forecast data'):
                model_uri = 'runs:/' + selected_run_id + '/model/'
                model_loaded = mlflow.sklearn.load_model(model_uri)
                df = pd.read_csv(file_selected_path, nrows=nrows)
                #st.success('Model prediction. . .')
                pred = model_loaded.predict(df)
                pred_df = pd.DataFrame(pred, columns=['Predictive Data'])
                st.dataframe(pred_df)
                pred_df.plot()
                st.pyplot()
    else:
        st.sidebar.warning('Did not find a trained model')
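For context, a minimal sketch of the imports and entry point this Streamlit app appears to assume; the helpers (list_files, concat_file_path, load_csv, get_model_training_logs) and the ML_LIST/RG_LIST constants are defined elsewhere in the project and are not reproduced here.

import streamlit as st
import pandas as pd
import mlflow
import mlflow.sklearn
import pycaret.regression as pc_rg
import pycaret.classification as pc_cl
from pandas_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report

if __name__ == '__main__':
    app_main()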
Example no. 5
def app_main():
    st.title("自动化机器学习平台")
    if st.sidebar.checkbox('定义数据源'):
        file_folder = st.sidebar.text_input('文件夹', value="data")
        data_file_list = list_files(file_folder, 'csv')
        if len(data_file_list) == 0:
            st.warning(f'当路径无可用数据集')
        else:
            file_selected = st.sidebar.selectbox('选择文件', data_file_list)
            file_selected_path = concat_file_path(file_folder, file_selected)
            nrows = st.sidebar.number_input('行数', value=-1)
            n_rows_str = '全部' if nrows == -1 else str(nrows)
            st.info(f'已选择文件:{file_selected_path},读取行数为{n_rows_str}')
    else:
        file_selected_path = None
        nrows = 100
        st.warning(f'当前选择文件为空,请选择。')
    if st.sidebar.checkbox('探索性分析'):
        if file_selected_path is not None:
            if st.sidebar.button('一键生成报告'):
                df = load_csv(file_selected_path, nrows)
                pr = ProfileReport(df, explorative=True)
                st_profile_report(pr)
        else:
            st.info(f'没有选择文件,无法进行分析。')

    if st.sidebar.checkbox('快速建模'):
        if file_selected_path is not None:
            task = st.sidebar.selectbox('选择任务', ML_TASK_LIST)
            if task == '回归':
                model = st.sidebar.selectbox('选取模型', RG_MODEL_LIST)
            elif task == '分类':
                model = st.sidebar.selectbox('选取模型', RG_MODEL_LIST)
            df = load_csv(file_selected_path, nrows)
            try:
                cols = df.columns.to_list()
                target_col = st.sidebar.selectbox('选取预测对象', cols)
            except BaseException:
                st.sidebar.warning(f'数据格式无法正确读取')
                target_col = None

            if target_col is not None and st.sidebar.button('训练模型'):
                if task == '回归':
                    st.success(f'数据预处理。。。')
                    pc_rg.setup(df,
                                target=target_col,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success(f'数据预处理完毕。')
                    st.success(f'训练模型。。。')
                    pc_rg.create_model(model, verbose=False)
                    st.success(f'模型训练完毕。。。')
                    #pc_rg.finalize_model(model)
                    st.success(f'模型已经创建')
                elif task == '分类':
                    st.success(f'数据预处理。。。')
                    pc_cl.setup(df,
                                target=target_col,
                                fix_imbalance=True,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success(f'数据预处理完毕。')
                    st.success(f'训练模型。。。')
                    pc_cl.create_model(model, verbose=False)
                    st.success(f'模型训练完毕。。。')
                    #pc_cl.finalize_model(model)
                    st.success(f'模型已经创建')
    if st.sidebar.checkbox('查看系统日志'):
        n_lines = st.sidebar.slider(label='行数', min_value=3, max_value=50)
        if st.sidebar.button("查看"):
            logs = get_model_training_logs(n_lines=n_lines)
            st.text('系统日志')
            st.write(logs)
    try:
        all_runs = mlflow.search_runs(experiment_ids=0)
    except Exception:
        all_runs = []
    if len(all_runs) != 0:
        if st.sidebar.checkbox('预览模型'):
            ml_logs = 'http://kubernetes.docker.internal:5000/  -->开启mlflow,命令行输入:mlflow ui'
            st.markdown(ml_logs)
            st.dataframe(all_runs)
        if st.sidebar.checkbox('选择模型'):
            selected_run_id = st.sidebar.selectbox(
                '从已保存模型中选择', all_runs[all_runs['tags.Source'] ==
                                      'create_model']['run_id'].tolist())
            selected_run_info = all_runs[(
                all_runs['run_id'] == selected_run_id)].iloc[0, :]
            st.code(selected_run_info)
            if st.sidebar.button('预测数据'):
                model_uri = 'runs:/' + selected_run_id + '/model/'
                model_loaded = mlflow.sklearn.load_model(model_uri)
                df = pd.read_csv(file_selected_path, nrows=nrows)
                #st.success(f'模型预测中。。。   ')
                pred = model_loaded.predict(df)
                pred_df = pd.DataFrame(pred, columns=['预测值'])
                st.dataframe(pred_df)
                pred_df.plot()
                st.pyplot()
    else:
        st.sidebar.warning('没有找到训练好的模型')
Example no. 6
def regression_model(*, y_col, training_set, normalize, test_size, folds,
                     metric, model_name, testing_set, imbalanced, seed,
                     include_models, normalize_method):
    """
    Build a regression model for prediction.

    Parameters
    ----------
    y_col : str
        The name of the target column.
    training_set : pd.DataFrame
        DataFrame containing the training data.
    normalize : bool
        If True, the dataset will be normalized before training.
    test_size : float
        A value in [0.0, 1.0]: the fraction of the training set held out for testing.
    folds : int
        Number of folds for cross-validation.
    metric : str
        The metric used for selecting the best model.
    model_name : str
        The name under which to save the model.
    testing_set : pd.DataFrame
        An external dataset for evaluating the best model.
    imbalanced
        Whether to handle imbalanced data (not used in the regression workflow shown here).
    seed : int
        Random seed used to initialize the process.
    include_models : list
        A list of models to be included in the comparison.
    normalize_method : str
        The method used for normalizing the data.

    Returns
    -------
    Final regression model

    """
    if not metric:
        metric = 'RMSE'
    setup = pyreg.setup(target=y_col,
                        data=training_set,
                        normalize=normalize,
                        normalize_method=normalize_method,
                        train_size=1 - test_size,
                        fold=folds,
                        silent=True,
                        session_id=seed)
    best_model = pyreg.compare_models(sort=metric, include=include_models)
    pyreg.pull().to_csv(model_name + '_compare_models.tsv',
                        sep='\t',
                        index=False)
    reg_model = pyreg.create_model(best_model)
    reg_tuned_model = pyreg.tune_model(reg_model, optimize=metric)
    pyreg.pull().to_csv(model_name + '_tuned_model.tsv', sep='\t', index=False)
    final_model = pyreg.finalize_model(reg_tuned_model)
    pyreg.plot_model(final_model, save=True)
    pyreg.plot_model(final_model, plot='feature', save=True)
    pyreg.plot_model(final_model, plot='error', save=True)
    pyreg.save_model(final_model, model_name)
    if len(testing_set.index) != 0:
        unseen_predictions = test_regressor(
            model_path=model_name + '.pkl',
            x_set=testing_set.drop(columns=[y_col]),
            y_col=testing_set[y_col],
            output=model_name)
        unseen_predictions.to_csv(model_name + '_external_testing_results.tsv',
                                  sep='\t',
                                  index=True)
    return final_model
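A hypothetical invocation of this function is shown below. It assumes pyreg is pycaret.regression, that the test_regressor helper referenced above is importable, and that train_df / test_df are DataFrames sharing a 'target' column; all argument values are illustrative only.

import pandas as pd

# illustrative data; in practice these would be real train/test splits
train_df = pd.read_csv('train.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')

final_model = regression_model(
    y_col='target',                 # hypothetical name of the label column
    training_set=train_df,
    normalize=True,
    test_size=0.2,                  # 80/20 split inside the PyCaret setup
    folds=5,
    metric='RMSE',
    model_name='my_regressor',      # prefix for the saved model and the TSV reports
    testing_set=test_df,
    imbalanced=False,
    seed=42,
    include_models=['lr', 'rf', 'lightgbm'],  # PyCaret model IDs
    normalize_method='zscore',
)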
Example no. 7
import numpy as np
from pycaret.regression import compare_models, create_model, tune_model


def train_xgboost_regression(fold, cross_validation):
    # train an XGBoost regressor (the return value is discarded here)
    create_model('xgboost', fold=fold, cross_validation=cross_validation)
    best_model = compare_models(
        include=[
            'rf', 'lightgbm', 'lasso', 'ridge', 'xgboost', 'en', 'knn', 'mlp',
            'lr', 'dt'
        ],
        sort='R2',
        verbose=True,
        fold=3,
        round=5,
    )

    # ---- Hyperparameter tuning -----------------------------------------------------------------

    # Initialize the model with fixed parameters.
    params = {'max_features': 'auto'}
    rgsr = create_model('rf', verbose=False, **params)

    # Tune the model on a custom search grid.
    params4tuning = {
        "n_estimators": np.arange(30, 250, 30),
        "min_samples_leaf": [10, 15, 20, 30, 40, 50],
        "min_samples_split": [20, 30, 40],
    }
    rgsr_tuned = tune_model(rgsr,
                            optimize='R2',
                            n_iter=2,
                            fold=5,
                            round=2,
                            custom_grid=params4tuning)

    # ---- Model training and prediction -----------------------------------------------------------
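The section announced by that last comment is not included in the snippet. A minimal sketch of what it could look like with PyCaret, assuming the tuned model rgsr_tuned from above and a hypothetical new_samples DataFrame of unseen feature rows:

from pycaret.regression import finalize_model, predict_model

# refit the tuned pipeline on the full dataset, then score unseen rows
final_rgsr = finalize_model(rgsr_tuned)
predictions = predict_model(final_rgsr, data=new_samples)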
Example no. 9
def run_model():
    # assumes `import pycaret.regression as py` and a `data` DataFrame defined at module level
    clf = py.setup(data, target='rent', silent=True)
    rf_model = py.create_model('rf', fold=5, verbose=False)
    # refit the random forest on the full dataset before returning it
    model = py.finalize_model(rf_model)
    return model
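A short usage sketch for the function above, assuming py is pycaret.regression and new_listings is a hypothetical DataFrame with the same feature columns as data (minus the 'rent' target):

model = run_model()

# predict_model appends the model's predictions to the supplied rows
scored = py.predict_model(model, data=new_listings)
print(scored.head())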