def test():
    from pycaret.datasets import get_data
    from pycaret.regression import setup, create_model, tune_model

    data = get_data("boston")
    s = setup(data, target="medv", silent=True, html=False, session_id=123)

    gbr = create_model("gbr")
    tuned_gbr = tune_model(gbr)

    xgboost = create_model("xgboost")
    tuned_xgboost = tune_model(xgboost)

    lightgbm = create_model("lightgbm")
    tuned_lightgbm = tune_model(lightgbm)

    assert 1 == 1
def _regression(self, combined_data):
    self._regression_setup(combined_data)

    # Train
    regresser = regression.create_model(self.task.pycaret_model, verbose=False)

    # Store
    if self.task.output_dir:
        self._store_regresser(regresser)

    # Predict on Test set
    predictions = regression.predict_model(regresser, verbose=False)
    # TODO get raw_scores for AUC
    return predictions
import pandas as pd
from pycaret.regression import setup, create_model, tune_model, save_model

# Load the insurance dataset.
data = pd.read_csv('C:/tmp/insurance.csv', delimiter=',')
print(data.head())

# Initialise the experiment with normalisation and feature engineering.
r2 = setup(data,
           target='charges',
           session_id=123,
           normalize=True,
           polynomial_features=True,
           trigonometry_features=True,
           feature_interaction=True,
           bin_numeric_features=['age', 'bmi'])

# Train and tune a linear regression model, then save the full pipeline.
lr = create_model('lr')
tuned_lr = tune_model(lr)
save_model(tuned_lr, model_name='./models/lr_deployment_20210521')
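# Hedged follow-up sketch (not part of the original script): reload the saved pipeline
# and score unseen records. load_model and predict_model are standard pycaret.regression
# calls; the slice used as "new" data here is only illustrative.
from pycaret.regression import load_model, predict_model

deployed_lr = load_model('./models/lr_deployment_20210521')
new_records = data.drop(columns=['charges']).head()      # stand-in for unseen data
predictions = predict_model(deployed_lr, data=new_records)
print(predictions.head())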
def app_main(): st.title("Machine learning analysis platform") if st.sidebar.checkbox('Define Data Source'): filesFolder = st.sidebar.text_input('folder', value="data") dataList = list_files(filesFolder, 'csv') if len(dataList) == 0: st.warning('No data set available') else: file_selected = st.sidebar.selectbox('Select a document', dataList) file_selected_path = concat_file_path(filesFolder, file_selected) nrows = st.sidebar.number_input('Number of lines', value=-1) n_rows_str = 'All' if nrows == -1 else str(nrows) st.info( 'Selected file:{file_selected_path},The number of rows read is{n_rows_str}' ) else: file_selected_path = None nrows = 100 st.warning('The currently selected file is empty, please select:') if st.sidebar.checkbox('Exploratory Analysis'): if file_selected_path is not None: if st.sidebar.button('Report Generation'): df = load_csv(file_selected_path, nrows) pr = ProfileReport(df, explorative=True) st_profile_report(pr) else: st.info('No file selected, analysis cannot be performed') if st.sidebar.checkbox('Modeling'): if file_selected_path is not None: task = st.sidebar.selectbox('Select Task', ML_LIST) if task == 'Regression': model = st.sidebar.selectbox('Select Model', RG_LIST) elif task == 'Classification': model = st.sidebar.selectbox('Select Model', RG_LIST) df = load_csv(file_selected_path, nrows) try: cols = df.columns.to_list() target_col = st.sidebar.selectbox('Select Prediction Object', cols) except BaseException: st.sidebar.warning('The data format cannot be read correctly') target_col = None if target_col is not None and st.sidebar.button('Training Model'): if task == 'Regression': st.success('Data preprocessing...') pc_rg.setup(df, target=target_col, log_experiment=True, experiment_name='ml_', log_plots=True, silent=True, verbose=False, profile=True) st.success('Data preprocessing is complete') st.success('Training model. . .') pc_rg.create_model(model, verbose=False) st.success('The model training is complete. . .') #pc_rg.finalize_model(model) st.success('Model has been created') elif task == 'Classification': st.success('Data preprocessing. . .') pc_cl.setup(df, target=target_col, fix_imbalance=True, log_experiment=True, experiment_name='ml_', log_plots=True, silent=True, verbose=False, profile=True) st.success('Data preprocessing is complete.') st.success('Training model. . .') pc_cl.create_model(model, verbose=False) st.success('The model training is complete. . 
.') #pc_cl.finalize_model(model) st.success('Model has been created') if st.sidebar.checkbox('View System Log'): n_lines = st.sidebar.slider(label='Number of lines', min_value=3, max_value=50) if st.sidebar.button("Check View"): logs = get_model_training_logs(n_lines=n_lines) st.text('System log') st.write(logs) try: allOfRuns = mlflow.search_runs(experiment_ids=0) except: allOfRuns = [] if len(allOfRuns) != 0: if st.sidebar.checkbox('Preview model'): ml_logs = 'http://kubernetes.docker.internal:5000/ -->Open mlflow, enter the command line: mlflow ui' st.markdown(ml_logs) st.dataframe(allOfRuns) if st.sidebar.checkbox('Choose a model'): selected_run_id = st.sidebar.selectbox( 'Choose from saved models', allOfRuns[allOfRuns['tags.Source'] == 'create_model']['run_id'].tolist()) selected_run_info = allOfRuns[( allOfRuns['run_id'] == selected_run_id)].iloc[0, :] st.code(selected_run_info) if st.sidebar.button('Forecast data'): model_uri = 'runs:/' + selected_run_id + '/model/' model_loaded = mlflow.sklearn.load_model(model_uri) df = pd.read_csv(file_selected_path, nrows=nrows) #st.success('Model prediction. . .') pred = model_loaded.predict(df) pred_df = pd.DataFrame(pred, columns=['Predictive Data']) st.dataframe(pred_df) pred_df.plot() st.pyplot() else: st.sidebar.warning('Did not find a trained model')
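# Minimal entry-point sketch (an assumption, not in the original): save the script as
# app.py and start it with `streamlit run app.py`.
if __name__ == '__main__':
    app_main()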
def regression_model(*, y_col, training_set, normalize, test_size, folds, metric,
                     model_name, testing_set, imbalanced, seed, include_models,
                     normalize_method):
    """
    Build a regression model for prediction.

    Parameters
    ----------
    y_col : str
        The name of the target column.
    training_set : pd.DataFrame
        DataFrame containing the training data.
    normalize : bool
        If True, the dataset is normalized before training.
    test_size : float
        Between 0.0 and 1.0; the size of the test split within the training set.
    folds : int
        Number of folds for cross-validation.
    metric : str
        The metric used for evaluating the best model.
    model_name : str
        The name under which to save the model.
    testing_set : pd.DataFrame
        The external dataset for evaluating the best model.
    imbalanced
    seed : int
        Random seed to initialize the process.
    include_models : list
        A list of models to be included in the process.
    normalize_method : str
        The method used for normalizing the data.

    Returns
    -------
    Final regression model
    """
    if not metric:
        metric = 'RMSE'

    # Initialise the PyCaret regression experiment.
    setup = pyreg.setup(target=y_col,
                        data=training_set,
                        normalize=normalize,
                        normalize_method=normalize_method,
                        train_size=1 - test_size,
                        fold=folds,
                        silent=True,
                        session_id=seed)

    # Select the best algorithm by cross-validated score and log the comparison table.
    best_model = pyreg.compare_models(sort=metric, include=include_models)
    pyreg.pull().to_csv(model_name + '_compare_models.tsv', sep='\t', index=False)

    # Retrain and tune the selected model.
    reg_model = pyreg.create_model(best_model)
    reg_tuned_model = pyreg.tune_model(reg_model, optimize=metric)
    pyreg.pull().to_csv(model_name + '_tuned_model.tsv', sep='\t', index=False)

    # Finalize (refit on the full training set), plot diagnostics, and save the pipeline.
    final_model = pyreg.finalize_model(reg_tuned_model)
    pyreg.plot_model(final_model, save=True)
    pyreg.plot_model(final_model, plot='feature', save=True)
    pyreg.plot_model(final_model, plot='error', save=True)
    pyreg.save_model(final_model, model_name)

    # Optionally evaluate on an external testing set.
    if len(testing_set.index) != 0:
        unseen_predictions = test_regressor(model_path=model_name + '.pkl',
                                            x_set=testing_set.drop(columns=[y_col]),
                                            y_col=testing_set[y_col],
                                            output=model_name)
        unseen_predictions.to_csv(model_name + '_external_testing_results.tsv',
                                  sep='\t', index=True)

    return final_model
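# Hedged usage sketch: every argument below is a placeholder chosen for illustration;
# it assumes pandas is imported as `pd` and that `pyreg` and `test_regressor` come from
# the same module as regression_model.
train_df = pd.read_csv('train.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')
final = regression_model(y_col='target',
                         training_set=train_df,
                         normalize=True,
                         test_size=0.2,
                         folds=10,
                         metric='RMSE',
                         model_name='my_regressor',
                         testing_set=test_df,
                         imbalanced=False,
                         seed=123,
                         include_models=['lr', 'ridge', 'rf'],
                         normalize_method='zscore')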
from pycaret.regression import create_model


def train_xgboost_regression(fold, cross_validation):
    # Assumes setup() has already been called on the target dataset.
    return create_model('xgboost', fold=fold, cross_validation=cross_validation)
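# Sketch of the assumed calling context: create_model only works after a
# pycaret.regression experiment has been initialised with setup(). `df` and its
# target column are placeholders, not defined in the original snippet.
from pycaret.regression import setup

setup(df, target='target', silent=True, session_id=42)
xgb_model = train_xgboost_regression(fold=5, cross_validation=True)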
import numpy as np
from pycaret.regression import compare_models, create_model, tune_model

best_model = compare_models(
    include=[
        'rf', 'lightgbm', 'lasso', 'ridge', 'xgboost', 'en', 'knn', 'mlp', 'lr', 'dt'
    ],
    sort='R2',
    verbose=True,
    fold=3,
    round=5,
)

# ---- Model tuning ---------------------------------------------------------------------------
# Initialise the model with fixed parameters.
params = {'max_features': 'auto'}
rgsr = create_model('rf', verbose=False, **params)

# Tune the model.
params4tuning = {
    "n_estimators": np.arange(30, 250, 30),
    "min_samples_leaf": [10, 15, 20, 30, 40, 50],
    "min_samples_split": [20, 30, 40],
}
rgsr_tuned = tune_model(rgsr,
                        optimize='R2',
                        n_iter=2,
                        fold=5,
                        round=2,
                        custom_grid=params4tuning)

# ---- Model training and prediction ----------------------------------------------------------
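# Hedged sketch of the training-and-prediction step announced above: finalize the tuned
# random forest and score a hold-out frame. `holdout_df` is a placeholder DataFrame that
# is not defined in the original snippet.
from pycaret.regression import finalize_model, predict_model

final_rgsr = finalize_model(rgsr_tuned)
holdout_pred = predict_model(final_rgsr, data=holdout_df)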
import pycaret.regression as py


def run_model(data):
    # Initialise the experiment, train a 5-fold random forest, then refit on all data.
    clf = py.setup(data, target='rent', silent=True)
    rf_model = py.create_model('rf', fold=5, verbose=False)
    model = py.finalize_model(rf_model)
    return model
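# Hedged usage sketch under the snippet's assumptions: pycaret.regression is imported as
# `py`, and the rental DataFrame is loaded from a placeholder path.
import pandas as pd

data = pd.read_csv('rent.csv')                        # placeholder path
model = run_model(data)
preds = py.predict_model(model, data=data.drop(columns=['rent']))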