def _classifier_setup(self, combined_data):
    classification.setup(
        combined_data.sample(frac=1),  # shuffles the data
        target=self.task.target,
        test_data=self.test_data,
        fold_strategy="kfold",  # TODO allow more strategies as hyperparam
        silent=True,
        verbose=False)
Example #2
    def fit(self,
            train: pd.DataFrame,
            test: pd.DataFrame,
            target: str = "name",
            finetune: bool = False,
            text_feature: str = "text",
            **kwargs) -> Pipeline:
        """Trains and finetunes model for project prediction.

        Args:
            train (pd.DataFrame): training data
            test (pd.DataFrame): test dataset
            finetune (bool, optional): Performs model finetuning if selected. Defaults to False.

        Returns:
            Pipeline: trained sklearn pipeline
        """

        text_pipeline = Pipeline([
            ('vect', CountVectorizer(lowercase=True)),
            ('tfidf', TfidfTransformer()),
        ])
        custom_transformer = make_column_transformer(
            (text_pipeline, text_feature),
            (OneHotEncoder(handle_unknown="ignore"),
             make_column_selector(dtype_include=object)))

        self.clf = setup(train,
                         target=target,
                         test_data=test,
                         session_id=123,
                         custom_pipeline=custom_transformer,
                         preprocess=False,
                         numeric_features=["duration", "attendee_cnt"],
                         silent=True,
                         **kwargs)

        model = create_model('svm', fold=3)
        if finetune:
            model = tune_model(model,
                               search_library="optuna",
                               search_algorithm="tpe",
                               n_iter=200,
                               fold=3)

        final_model = finalize_model(model)

        self.pipeline, self.filename = save_model(final_model, "trained_model")
        return self.pipeline
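A brief, hedged usage sketch for the pipeline saved above (new_events is a hypothetical DataFrame with the same columns the model was fit on):

from pycaret.classification import load_model, predict_model

pipeline = load_model("trained_model")             # artifact written by save_model above
scored = predict_model(pipeline, data=new_events)  # new_events: hypothetical unseen rows
print(scored[["Label", "Score"]].head())           # predicted class and score columns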
Example #3
def fit(self):
    """
    Performs the search
    """
    self.train, self.test = train_test_split(self.dataset, test_size=0.1)
    self.X_train = self.train.drop(columns=self.target)
    self.y_train = self.train[self.target]
    self.X_test = self.test.drop(columns=self.target)
    self.y_test = self.test[self.target]
    if self.problem_type == "classification":
        from pycaret.classification import automl, compare_models, setup
    else:
        from pycaret.regression import automl, compare_models, setup
    experiment = setup(data=self.train, target=self.target, silent=True, html=False)
    compare_models(**self.automl_settings)
    self.automl_pipeline = automl()
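To inspect the comparison leaderboard or score the held-out split created in fit(), a hedged follow-up could look like this (assuming the classification branch and a hypothetical instance called searcher on which fit() has already run):

from pycaret.classification import pull, predict_model

leaderboard = pull()                               # metrics table left by compare_models
holdout = predict_model(searcher.automl_pipeline,  # searcher: hypothetical fitted instance
                        data=searcher.test)        # the 10% split made in fit()
print(leaderboard.head())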
Example #4
#import dataset
from pycaret.datasets import get_data
data = get_data('juice')

#init setup
from pycaret.classification import setup, compare_models, save_model, deploy_model
reg1 = setup(data,
             target='Purchase',
             log_experiment=True,
             experiment_name='juice-script1',
             silent=True,
             html=False)

#compare models
c = compare_models(n_select=1)

#save model
save_model(c, model_name='tfdemo')

#deploy model on S3
deploy_model(c,
             model_name='tfdemo',
             platform='aws',
             authentication={'bucket': 'pycaret-test'})
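The deployed model can later be pulled back from S3; a hedged sketch assuming the same bucket and AWS credentials are available:

from pycaret.classification import load_model, predict_model

model_s3 = load_model('tfdemo',
                      platform='aws',
                      authentication={'bucket': 'pycaret-test'})
preds = predict_model(model_s3, data=data.drop(columns=['Purchase']))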
from pycaret.classification import setup
from pycaret.classification import create_model
from pycaret.classification import finalize_model
from pycaret.classification import save_model
from pycaret.classification import blend_models
from pycaret.classification import compare_models
from pandas import read_csv

#data = read_csv('data/envtrain_xv.csv')
data = read_csv('data_2.0/envtrain_xv.csv')

#data = data.drop(['Unnamed: 0'], axis=1)
exp_clf = setup(data,
                target='pa',
                log_experiment=True,
                experiment_name='xv-21',
                session_id=110,
                numeric_features=['bclim14'])

# create models
etrees = create_model('et')
xgboost = create_model('xgboost')
catboost = create_model('catboost')
rf = create_model('rf')
lgbm = create_model('lightgbm')
log = create_model('lr')

# save models as .pkl files
etrees_final = finalize_model(etrees)
save_model(etrees_final, 'classifier_models(pkl)/xant_etrees')
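The comment above speaks of models in the plural, but only the extra-trees model is persisted; a hedged sketch for the rest (the file-name pattern is an assumption following xant_etrees):

for name, mdl in [('xgboost', xgboost), ('catboost', catboost),
                  ('rf', rf), ('lightgbm', lgbm), ('lr', log)]:
    final = finalize_model(mdl)                               # refit on the full data
    save_model(final, f'classifier_models(pkl)/xant_{name}')  # hypothetical naming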
Example #6
import sys
import time

dataset = sys.argv[1]
module = sys.argv[2]
target = sys.argv[3]
exp_name = str(dataset) + '_exp'

# import dataset
from pycaret.datasets import get_data
data = get_data(dataset)

#initialize setup

t0 = time.time()

if module == 'classification':
    from pycaret.classification import setup, compare_models, automl, save_model
    setup(data,
          target=target,
          silent=True,
          html=False,
          log_experiment=True,
          experiment_name=exp_name)
    best_model = compare_models()
    model = automl()
    save_model(model, model_name='pycaret-clf-best')

elif module == 'regression':
    from pycaret.regression import setup, compare_models, automl, save_model
    setup(data,
          target=target,
          silent=True,
          html=False,
          log_experiment=True,
          experiment_name=exp_name)
    best_model = compare_models()
Example #7
        # Setup PyCaret
        with st.spinner('PyCaret setup is running...'):
            pycset = regression.setup(data=df, target=df_target)

        # Compare models
        st.dataframe(regression.compare_models())

        # End
        st.success('End of execution!')

    if df_task in ['Classification (Binary)', 'Classification (Multiclass)']:

        # Setup PyCaret
        with st.spinner('PyCaret setup is running...'):
            pycset = classification.setup(data=df, target=df_target)

        # Compare models
        st.dataframe(classification.compare_models())

        # End
        st.success('End of execution!')

    if df_task in ['NLP']:

        # Setup PyCaret
        with st.spinner('PyCaret setup is running...'):
            pycset = nlp.setup(data=df, target=df_target)

        # Compare models
        #st.dataframe(classification.compare_models())
Example #8
"""#### No missing values

## Visualization
"""

plt.figure(figsize=(12,12))
plt.pie(data['Type'].value_counts(), labels=data['Type'].value_counts().index)
plt.title('Class Distribution')
plt.show()

"""## Setup, it is like pipeline where we can pass peprocessing and transformation steps."""

setup(
       data = data,
       target = 'Type',  # the target column to predict
       normalize = True,
       train_size = 0.7
)

models()

best_model = compare_models()

"""### Extra Trees Classifier and Random Forest are better as shwon by PyCaret."""

best_model

predict_model(best_model)
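A hedged continuation of the notebook: refit the winning model on the full data and persist it (the artifact name below is hypothetical).

from pycaret.classification import finalize_model, save_model

final_best = finalize_model(best_model)    # retrain on train + hold-out
save_model(final_best, 'type_classifier')  # hypothetical file name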

Example #9
project_path = base_dir.joinpath('Project3Data')
data_file = project_path.joinpath('creditcard.csv')
df = pd.read_csv(data_file)

# define the location of the dataset
#url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
# load the dataset
#df = read_csv(url, header=None)

# set column names as the column number
n_cols = df.shape[1]
df.columns = [str(i) for i in range(n_cols)]
# setup the dataset
grid = setup(data=df,
             target=df.columns[-1],
             html=False,
             silent=True,
             verbose=False)
# evaluate models and compare models
best = compare_models()
# report the best model
print("best")
print(best)

# tune model hyperparameters on the sonar classification dataset
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
from pycaret.classification import setup
from pycaret.classification import tune_model
# define the location of the dataset
#url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
Example #10
def app_main():
    st.title("Automated machine learning platform")
    if st.sidebar.checkbox('Define data source'):
        file_folder = st.sidebar.text_input('Folder', value="data")
        data_file_list = list_files(file_folder, 'csv')
        if len(data_file_list) == 0:
            st.warning('No datasets are available under the current path')
        else:
            file_selected = st.sidebar.selectbox('Select a file', data_file_list)
            file_selected_path = concat_file_path(file_folder, file_selected)
            nrows = st.sidebar.number_input('Number of rows', value=-1)
            n_rows_str = 'all' if nrows == -1 else str(nrows)
            st.info(f'Selected file: {file_selected_path}, rows to read: {n_rows_str}')
    else:
        file_selected_path = None
        nrows = 100
        st.warning('The currently selected file is empty, please select one.')
    if st.sidebar.checkbox('Exploratory analysis'):
        if file_selected_path is not None:
            if st.sidebar.button('Generate report'):
                df = load_csv(file_selected_path, nrows)
                pr = ProfileReport(df, explorative=True)
                st_profile_report(pr)
        else:
            st.info('No file selected, analysis cannot be performed.')

    if st.sidebar.checkbox('Quick modeling'):
        if file_selected_path is not None:
            task = st.sidebar.selectbox('Select task', ML_TASK_LIST)
            if task == 'Regression':
                model = st.sidebar.selectbox('Select model', RG_MODEL_LIST)
            elif task == 'Classification':
                model = st.sidebar.selectbox('Select model', RG_MODEL_LIST)
            df = load_csv(file_selected_path, nrows)
            try:
                cols = df.columns.to_list()
                target_col = st.sidebar.selectbox('Select prediction target', cols)
            except BaseException:
                st.sidebar.warning('The data format cannot be read correctly')
                target_col = None

            if target_col is not None and st.sidebar.button('Train model'):
                if task == 'Regression':
                    st.success('Preprocessing data...')
                    pc_rg.setup(df,
                                target=target_col,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing finished.')
                    st.success('Training model...')
                    pc_rg.create_model(model, verbose=False)
                    st.success('Model training finished.')
                    #pc_rg.finalize_model(model)
                    st.success('The model has been created')
                elif task == 'Classification':
                    st.success('Preprocessing data...')
                    pc_cl.setup(df,
                                target=target_col,
                                fix_imbalance=True,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing finished.')
                    st.success('Training model...')
                    pc_cl.create_model(model, verbose=False)
                    st.success('Model training finished.')
                    #pc_cl.finalize_model(model)
                    st.success('The model has been created')
    if st.sidebar.checkbox('View system log'):
        n_lines = st.sidebar.slider(label='Number of lines', min_value=3, max_value=50)
        if st.sidebar.button("View"):
            logs = get_model_training_logs(n_lines=n_lines)
            st.text('System log')
            st.write(logs)
    try:
        all_runs = mlflow.search_runs(experiment_ids=0)
    except:
        all_runs = []
    if len(all_runs) != 0:
        if st.sidebar.checkbox('Preview models'):
            ml_logs = 'http://kubernetes.docker.internal:5000/  --> start mlflow from the command line with: mlflow ui'
            st.markdown(ml_logs)
            st.dataframe(all_runs)
        if st.sidebar.checkbox('Select a model'):
            selected_run_id = st.sidebar.selectbox(
                'Choose from saved models',
                all_runs[all_runs['tags.Source'] ==
                         'create_model']['run_id'].tolist())
            selected_run_info = all_runs[(
                all_runs['run_id'] == selected_run_id)].iloc[0, :]
            st.code(selected_run_info)
            if st.sidebar.button('Predict data'):
                model_uri = 'runs:/' + selected_run_id + '/model/'
                model_loaded = mlflow.sklearn.load_model(model_uri)
                df = pd.read_csv(file_selected_path, nrows=nrows)
                #st.success('Predicting...')
                pred = model_loaded.predict(df)
                pred_df = pd.DataFrame(pred, columns=['prediction'])
                st.dataframe(pred_df)
                pred_df.plot()
                st.pyplot()
    else:
        st.sidebar.warning('No trained model was found')
Example #11
def train_trad_ml_baseline(train_set_name,
                           val_set_name,
                           use_eiz=True,
                           demographic_features=False):
    '''
    Trains an ensemble-based classifier on a distribution-based feature representation of EI or EIZ scores to predict
    whether or not a patient has an NMD.
    :param train_set_name: The name of the training set to use
    :param val_set_name: The name of the validation set to use
    :param use_eiz: Whether to use EIZ or raw EI scores
    :param demographic_features: Whether to include demographic features.
    :return: A dictionary with the path to the stored model and its best operating threshold.
    '''
    additional_features = ['Age', 'Sex', 'BMI'] if demographic_features else []
    # obtain feature representations
    train_set = obtain_feature_rep_ml_experiment(
        train_set_name,
        use_eiz=use_eiz,
        additional_features=additional_features)
    val_set = obtain_feature_rep_ml_experiment(
        val_set_name, use_eiz=use_eiz, additional_features=additional_features)
    # map class labels to numeric values (0 = no NMD, 1 = NMD)
    train_set['Class'] = train_set['Class'].replace({'no NMD': 0, 'NMD': 1})
    val_set['Class'] = val_set['Class'].replace({'no NMD': 0, 'NMD': 1})
    # use only ensemble models
    models_to_use = models(type='ensemble')
    models_to_use = models_to_use.index.to_list()
    # get the set of all features in the dataset
    features = set(train_set.columns)
    features.remove('Class')

    # set the experiment up
    exp = setup(train_set,
                target='Class',
                numeric_features=list(features),  # setup expects a list of column names
                html=False,
                session_id=123,
                train_size=0.7)
    # sidestep the fact that the lib makes another validation set

    # manually get the pipeline pycaret uses for transforming the data
    pipeline = exp[7]
    X_train = train_set.drop(columns='Class')
    # transform into the format pycaret expects
    X_train = pipeline.transform(X_train)
    # overwrite the selected train set to use the entire training set instead
    set_config('X_train', X_train)
    set_config('y_train', train_set['Class'])
    # same logic with the val set, use our own instead of the pre-sliced one
    X_test = val_set.drop(columns='Class')
    # transform and set as the validation set
    X_test = pipeline.transform(X_test)
    # overwrite config
    set_config('X_test', X_test)
    set_config('y_test', val_set['Class'])

    # obtain the best model from the list, sorted by val set AUC
    best_model = compare_models(whitelist=models_to_use,
                                sort='AUC',
                                n_select=1)
    # interpretability output, get SHAP plots to judge feature importance
    interpret_model(best_model)

    # now, do some additional tuning, compare different hyperparameters, maximize AUC
    best_model = tune_model(best_model, optimize='AUC')
    # interpret the best model
    interpret_model(best_model)
    # the path to save the model at
    model_path = get_model_name(train_set_name, use_eiz, demographic_features)
    # save the model
    save_model(best_model, model_path)
    # get results on val set as dataframe
    results = predict_model(best_model, verbose=False)
    # get the threshold at which the model performed best on the val set
    best_threshold = evaluate_roc(results['Class'],
                                  results['Score'],
                                  method='val_set_training')

    return {'best_threshold': best_threshold, 'model_path': model_path}
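A hedged invocation sketch for the function above; the split names are hypothetical placeholders:

result = train_trad_ml_baseline('train_split',            # hypothetical set name
                                'val_split',              # hypothetical set name
                                use_eiz=True,
                                demographic_features=False)
print(result['model_path'], result['best_threshold'])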
Example #12
def classification_model(
    *,
    y_col,
    training_set,
    normalize,
    test_size,
    folds,
    metric,
    model_name,
    testing_set,
    imbalanced,
    seed,
    include_models,
    normalize_method,
):
    """
    Build a classification model for prediction.

    Parameters
    ----------
    y_col : str
        the name of the target column.
    training_set : pd.DataFrame
        DataFrame containing the training data.
    normalize : bool
        if True the dataset will be normalized before training.
    test_size : float
        Between 0.0 and 1.0; the fraction of the training set held out for internal testing.
    folds : int
        number of folds for cross validation.
    metric : str
        the metric used for evaluating the best model.
    model_name : str
        the name to save the model.
    testing_set : pd.DataFrame
        the external dataset for evaluating the best model.
    imbalanced : bool
        if True the imbalance will be fixed before the training.
    seed : int
        random seed to initialize the process.
    include_models : List
        a list of models to be included in the process.
    normalize_method : str
        The method used for normalizing the data.

    Returns
    -------
    Final classification model

    """
    if not metric:
        metric = 'AUC'
    setup = pycl.setup(target=y_col,
                       fix_imbalance=imbalanced,
                       normalize=normalize,
                       normalize_method=normalize_method,
                       data=training_set,
                       train_size=1 - test_size,
                       silent=True,
                       fold=folds,
                       session_id=seed)
    best_model = pycl.compare_models(sort=metric, include=include_models)
    pycl.pull().to_csv(model_name + '_compare_models.tsv',
                       sep='\t',
                       index=False)
    cl_model = pycl.create_model(best_model)
    cl_tuned_model = pycl.tune_model(cl_model, optimize=metric)
    pycl.pull().to_csv(model_name + '_tuned_model.tsv', sep='\t', index=False)
    final_model = pycl.finalize_model(cl_tuned_model)
    pycl.plot_model(final_model, plot='pr', save=True)
    pycl.plot_model(final_model, plot='confusion_matrix', save=True)
    pycl.plot_model(final_model, plot='feature', save=True)
    pycl.save_model(final_model, model_name)
    if len(testing_set.index) != 0:
        unseen_predictions = test_classifier(
            model_path=model_name + '.pkl',
            x_set=testing_set.drop(columns=[y_col]),
            y_col=testing_set[y_col],
            output=model_name)
        unseen_predictions.to_csv(model_name + '_external_testing_results.tsv',
                                  sep='\t',
                                  index=True)
    return final_model
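A hedged invocation sketch, with train_df and test_df standing in as hypothetical DataFrames that contain a 'label' column plus feature columns:

final_clf = classification_model(y_col='label',
                                 training_set=train_df,
                                 normalize=True,
                                 test_size=0.2,
                                 folds=5,
                                 metric='AUC',
                                 model_name='demo_clf',
                                 testing_set=test_df,
                                 imbalanced=False,
                                 seed=42,
                                 include_models=['lr', 'rf', 'lightgbm'],
                                 normalize_method='zscore')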
Example #13
    def do_modeling(self, dataFrame, pipeline_dict):

        prob_type = st.selectbox('Select type of problem',
                                 ['Classification', 'Regression'])
        target_variable = st.selectbox('Select target variable',
                                       dataFrame.columns)

        classification_model_library = [
            'lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge',
            'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm',
            'catboost'
        ]

        tree_based_models = [
            'Random Forest Classifier', 'Decision Tree Classifier',
            'Extra Trees Classifier', 'Gradient Boosting Classifier',
            'Extreme Gradient Boosting', 'Light Gradient Boosting Machine',
            'CatBoost Classifier'
        ]

        classification_model_names = [
            'Logistic Regression', 'K Neighbors Classifier', 'Naive Bayes',
            'Decision Tree Classifier', 'SVM - Linear Kernel',
            'SVM - Radial Kernel', 'Gaussian Process Classifier',
            'MLP Classifier', 'Ridge Classifier', 'Random Forest Classifier',
            'Quadratic Discriminant Analysis', 'Ada Boost Classifier',
            'Gradient Boosting Classifier', 'Linear Discriminant Analysis',
            'Extra Trees Classifier', 'Extreme Gradient Boosting',
            'Light Gradient Boosting Machine', 'CatBoost Classifier'
        ]

        classification_models = dict(
            zip(classification_model_names, classification_model_library))

        if st.checkbox('X and y Split'):
            X = self.get_features(dataFrame, target_variable)
            y = dataFrame[target_variable]
            st.write('Done!')

        if st.checkbox('X,y Info'):
            st.write(X)
            st.write(y)

        if st.checkbox('Scaling of data'):
            scale_X = self.do_standardScale(X)
            columns = X.columns
            pipeline_dict['Scaling'] = True
            for col in scale_X:
                X[col] = scale_X[col].values
            #X.drop(columns,axis=1,inplace=True)
            st.write(X)
            st.write('Done!')

        if st.checkbox('Dimensionality Reduction'):
            if st.checkbox('PCA'):
                information_loss = st.text_input(
                    'Enter Information loss in percentage(%)')

                if st.button('PCA'):
                    pipeline_dict['PCA_info_loss'] = information_loss
                    pca_X = self.dimred_PCA(X, information_loss)
                    columns = X.columns
                    for i, val in enumerate(pca_X.T):
                        X[i] = val
                    X.drop(columns, axis=1, inplace=True)
                    st.write('Done!')

            if st.checkbox('LDA'):
                number_components = st.text_input(
                    'Enter the number of components')
                if st.button('LDA'):
                    pipeline_dict['LDA_number_components'] = number_components
                    lda = LDA(n_components=int(number_components))
                    lda_X = lda.fit_transform(X, y)
                    columns = X.columns
                    for i, val in enumerate(lda_X.T):
                        X[i] = val
                    X.drop(columns, axis=1, inplace=True)
                    st.write('Done!')

        if st.checkbox('Start Base-Line modeling Classification'):
            py_data = X
            py_data[target_variable] = y
            st.write('Name :' + str(target_variable))
            st.write('Type :' + str(prob_type))
            if st.checkbox('Start Modeling'):
                exp1 = cl.setup(data=py_data,
                                target=target_variable,
                                session_id=123,
                                silent=True)
                st.write('Compare Models...')
                #models_info = cl.create_model('lr',verbose = False)
                models_info = cl.compare_models()
                st.write(models_info)
            if st.checkbox('Tuning Models'):
                tuning_model_name = st.selectbox('Select Model for Tuning',
                                                 classification_model_names)
                if st.button('Start'):
                    st.write(tuning_model_name)
                    base_model = cl.create_model(
                        classification_models[tuning_model_name], verbose=False)
                    tuned_model = cl.tune_model(base_model, verbose=False)
                    result = cl.pull()  # tuning results grid
                    st.write(result)
                    if tuning_model_name in tree_based_models:
                        cl.interpret_model(tuned_model)
                        st.pyplot()
                        cl.plot_model(tuned_model, plot='confusion_matrix')
                        st.pyplot()
                    else:
                        cl.plot_model(tuned_model, plot='confusion_matrix')
                        st.pyplot()

            if st.checkbox('Finalize Model'):
                final_model_name = st.selectbox('Select Model for Tuning',
                                                classification_model_names)
                if st.checkbox('Finalize'):
                    base_model = cl.create_model(
                        classification_models[final_model_name], verbose=False)
                    tuned_model = cl.tune_model(base_model, verbose=False)
                    result = cl.pull()  # tuning results grid
                    st.write(result)
                    finalize_model = cl.finalize_model(tuned_model)
                    st.write(final_model_name)
                    st.write(finalize_model.get_params())
                    st.write('Done!')
                    st.write(pipeline_dict)
                    url = st.text_input(
                        "Enter Test Data Url(Must be csv file)")

                    if st.button('Click'):
                        test_dataFrame = self.get_test_data_csv(url)
                        st.write(test_dataFrame)
                        for k, v in pipeline_dict.items():
                            if k == 'Convert_Data_Type':
                                st.write('Convert_Data_Type')
                                self.convert_type(
                                    test_dataFrame,
                                    pipeline_dict['Convert_Data_Type']
                                    ['column_name'],
                                    pipeline_dict['Convert_Data_Type']
                                    ['data_type'])

                            elif k == 'remove_columns':
                                st.write('remove_columns')
                                test_dataFrame.drop(
                                    pipeline_dict['remove_columns'],
                                    axis=1,
                                    inplace=True)

                            elif k == 'remove_columns_threshold':
                                st.write('remove_columns_threshold..')
                                for threshold in pipeline_dict[
                                        'remove_columns_threshold']:
                                    remove_columns = self.remove_null_columns(
                                        test_dataFrame, float(threshold))
                                    test_dataFrame.drop(remove_columns,
                                                        axis=1,
                                                        inplace=True)

                            elif k == 'Fill_Median_Mode_Columns':
                                st.write('Fill_Median_Mode_Columns..')
                                test_dataFrame = self.replace_null_columns(
                                    test_dataFrame,
                                    pipeline_dict['Fill_Median_Mode_Columns'])

                            elif k == 'Create_Bins':
                                st.write('Create_Bins..')
                                column = pipeline_dict['Create_Bins'][
                                    'column_Name']
                                bins = pipeline_dict['Create_Bins'][
                                    'Numbers_bin']
                                for i, c in enumerate(column):
                                    test_dataFrame[c] = self.do_bining(
                                        test_dataFrame, c, int(bins[i]))

                            elif k == 'OneHotEncoding':
                                st.write('OneHotEncoding..')
                                list_columns = pipeline_dict['OneHotEncoding']
                                for col in list_columns:
                                    tempdf = pd.get_dummies(
                                        data=test_dataFrame[col])
                                    for in_col in tempdf.columns:
                                        colName = str(col) + '_' + str(in_col)
                                        test_dataFrame[colName] = tempdf[
                                            in_col].values
                                test_dataFrame.drop(list_columns,
                                                    axis=1,
                                                    inplace=True)

                            elif k == 'LabelEncoding':
                                st.write('LabelEncoding..')
                                test_dataFrame = self.do_label_Encoding(
                                    test_dataFrame,
                                    pipeline_dict['LabelEncoding'])

                            elif k == 'BinaryEncoding':
                                st.write('BinaryEncoding..')
                                binary_encoding_columns = pipeline_dict[
                                    'BinaryEncoding']
                                for col in binary_encoding_columns:
                                    encoder = ce.BinaryEncoder(cols=[col])
                                    dfbin = encoder.fit_transform(
                                        test_dataFrame[col])
                                    for col in dfbin.columns:
                                        test_dataFrame[col] = dfbin[col].values
                                test_dataFrame.drop(binary_encoding_columns,
                                                    axis=1,
                                                    inplace=True)

                            elif k == 'Scaling':
                                st.write('Scaling..')
                                scale_X = self.do_standardScale(test_dataFrame)
                                columns = test_dataFrame.columns
                                for col in scale_X:
                                    test_dataFrame[col] = scale_X[col].values

                        st.write(test_dataFrame)
                        unseen_predictions = cl.predict_model(
                            finalize_model, data=test_dataFrame)
                        st.write(unseen_predictions.head())
                        unseen_predictions.to_csv('result.csv')
Example #14
target = 'Purchase'

#checking version
from pycaret.utils import version
version()

import time
t0 = time.time()

#loading dataset
from pycaret.datasets import get_data
data = get_data(dataset, verbose=False)

#init classification
from pycaret.classification import setup
exp1 = setup(data, target=target, silent=True, html=False, verbose=False)

#RECIPE #1 - SELECT TOP 5 MODELS
from pycaret.classification import compare_models
top5 = compare_models(n_select=5,
                      whitelist=['dt', 'lr', 'rf', 'lightgbm', 'xgboost'])

#RECIPE #2 - TUNE TOP 5 MODELS
from pycaret.classification import tune_model
tuned_top5 = [tune_model(i) for i in top5]
print(len(tuned_top5))

#RECIPE #3
from pycaret.classification import blend_models
blender = blend_models(top5, verbose=False)
print(blender)
import sys

dataset = sys.argv[1]
target = sys.argv[2]
exp_name = str(dataset) + '_exp'

#import dataset using sys arg
from pycaret.datasets import get_data

data = get_data(dataset)

#initialize setup
from pycaret.classification import setup, compare_models, blend_models, tune_model, save_model, deploy_model, automl

clf1 = setup(data,
             target=target,
             silent=True,
             html=False,
             log_experiment=True,
             experiment_name=exp_name)

#compare models and select top5
top5 = compare_models(n_select=5, blacklist=['catboost'])

#blend top 5 models
blender = blend_models(estimator_list=top5)

#tune best model
tuned_best_model = tune_model(top5[0])

#select best model
a = automl()
save_model(a, 'best_model')
Example #16
import pandas as pd
from pycaret import classification

data_classification = pd.read_csv('./db_GIST.csv')
classification_setup = classification.setup(data=data_classification, target='strok')
xgboost_model = classification.create_model('xgboost')
tuned_xgboost = classification.tune_model(xgboost_model)
classification.pull().to_csv('./data.csv')  # persist the tuning results grid
Example #17
import pandas as pd
import seaborn as sns
import plotly.express as px
import pycaret.classification as pyclf
import matplotlib.pyplot as plt
import streamlit as st
import phik  # registers the DataFrame.phik_matrix() accessor used below
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

st.set_page_config(layout="wide")

# load data
df = pd.read_excel('data/default of credit card clients.xls',
                   skiprows=1,
                   index_col='ID').sample(1000)

setup = pyclf.setup(df, target='default payment next month', silent=True)
lgbm = pyclf.create_model('lightgbm')
lgbm, tuner = pyclf.tune_model(lgbm, return_tuner=True)

cv_acc = round(tuner.cv_results_['mean_test_score'].mean(), 3)
st.title(f"CV Accuracy is {cv_acc}")

# EDA plots
phik_corr = df.phik_matrix()
correlogram = sns.heatmap(phik_corr)

barchart = px.histogram(df,
                        x='PAY_0',
                        color='default payment next month',
                        barmode='group')
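To actually surface these EDA figures in the Streamlit page, a hedged sketch (correlogram is a matplotlib Axes from seaborn, barchart a plotly figure):

st.pyplot(correlogram.figure)  # seaborn heatmap lives on a matplotlib figure
st.plotly_chart(barchart)      # plotly histogram grouped by default status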
import pandas as pd
import numpy
from pycaret.classification import setup, create_model, tune_model, save_model

train_data = pd.read_csv("../data/HR_training_data.csv")

#initializing pycaret environment
employee_class = setup(data=train_data, target='left', session_id=123)

#creating model
lightgbm = create_model('lightgbm')

#tune the model, optimizing for AUC
tuned_lightgbm = tune_model(lightgbm, optimize='AUC')

#saving the model
save_model(tuned_lightgbm, '../model/employees_churn_model')
def app_main():
    st.title("Machine learning analysis platform")
    if st.sidebar.checkbox('Define Data Source'):
        filesFolder = st.sidebar.text_input('folder', value="data")
        dataList = list_files(filesFolder, 'csv')
        if len(dataList) == 0:
            st.warning('No data set available')
        else:
            file_selected = st.sidebar.selectbox('Select a document', dataList)
            file_selected_path = concat_file_path(filesFolder, file_selected)
            nrows = st.sidebar.number_input('Number of lines', value=-1)
            n_rows_str = 'All' if nrows == -1 else str(nrows)
            st.info(
                f'Selected file: {file_selected_path}, rows read: {n_rows_str}'
            )
    else:
        file_selected_path = None
        nrows = 100
        st.warning('The currently selected file is empty, please select one.')
    if st.sidebar.checkbox('Exploratory Analysis'):
        if file_selected_path is not None:
            if st.sidebar.button('Report Generation'):
                df = load_csv(file_selected_path, nrows)
                pr = ProfileReport(df, explorative=True)
                st_profile_report(pr)
        else:
            st.info('No file selected, analysis cannot be performed')
    if st.sidebar.checkbox('Modeling'):
        if file_selected_path is not None:
            task = st.sidebar.selectbox('Select Task', ML_LIST)
            if task == 'Regression':
                model = st.sidebar.selectbox('Select Model', RG_LIST)
            elif task == 'Classification':
                model = st.sidebar.selectbox('Select Model', RG_LIST)
            df = load_csv(file_selected_path, nrows)
            try:
                cols = df.columns.to_list()
                target_col = st.sidebar.selectbox('Select Prediction Object',
                                                  cols)
            except BaseException:
                st.sidebar.warning('The data format cannot be read correctly')
                target_col = None

            if target_col is not None and st.sidebar.button('Training Model'):
                if task == 'Regression':
                    st.success('Data preprocessing...')
                    pc_rg.setup(df,
                                target=target_col,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing is complete')
                    st.success('Training model. . .')
                    pc_rg.create_model(model, verbose=False)
                    st.success('The model training is complete. . .')
                    #pc_rg.finalize_model(model)
                    st.success('Model has been created')
                elif task == 'Classification':
                    st.success('Data preprocessing. . .')
                    pc_cl.setup(df,
                                target=target_col,
                                fix_imbalance=True,
                                log_experiment=True,
                                experiment_name='ml_',
                                log_plots=True,
                                silent=True,
                                verbose=False,
                                profile=True)
                    st.success('Data preprocessing is complete.')
                    st.success('Training model. . .')
                    pc_cl.create_model(model, verbose=False)
                    st.success('The model training is complete. . .')
                    #pc_cl.finalize_model(model)
                    st.success('Model has been created')

    if st.sidebar.checkbox('View System Log'):
        n_lines = st.sidebar.slider(label='Number of lines',
                                    min_value=3,
                                    max_value=50)
        if st.sidebar.button("Check View"):
            logs = get_model_training_logs(n_lines=n_lines)
            st.text('System log')
            st.write(logs)
    try:
        allOfRuns = mlflow.search_runs(experiment_ids=0)
    except:
        allOfRuns = []
    if len(allOfRuns) != 0:
        if st.sidebar.checkbox('Preview model'):
            ml_logs = 'http://kubernetes.docker.internal:5000/  -->Open mlflow, enter the command line: mlflow ui'
            st.markdown(ml_logs)
            st.dataframe(allOfRuns)
        if st.sidebar.checkbox('Choose a model'):
            selected_run_id = st.sidebar.selectbox(
                'Choose from saved models',
                allOfRuns[allOfRuns['tags.Source'] ==
                          'create_model']['run_id'].tolist())
            selected_run_info = allOfRuns[(
                allOfRuns['run_id'] == selected_run_id)].iloc[0, :]
            st.code(selected_run_info)
            if st.sidebar.button('Forecast data'):
                model_uri = 'runs:/' + selected_run_id + '/model/'
                model_loaded = mlflow.sklearn.load_model(model_uri)
                df = pd.read_csv(file_selected_path, nrows=nrows)
                #st.success('Model prediction. . .')
                pred = model_loaded.predict(df)
                pred_df = pd.DataFrame(pred, columns=['Predictive Data'])
                st.dataframe(pred_df)
                pred_df.plot()
                st.pyplot()
    else:
        st.sidebar.warning('Did not find a trained model')
if __name__ == "__main__":
    df = load_combine_saif_liu_data()
    sample = False
    if sample:
        df = df.groupby(
            'emotion',
            group_keys=False).apply(lambda x: x.sample(min(len(x), 100)))

    vector_df_filepath = 'data/vector_df.csv'
    if os.path.exists(vector_df_filepath):
        vector_df = pd.read_csv(vector_df_filepath)
    else:
        vectors = df['tweet'].swifter.apply(get_text_vector)
        vector_df = pd.DataFrame(
            vectors.array,
            columns=[f'v{r}' for r in range(vectors.iloc[0].shape[0])])
        vector_df.to_csv(vector_df_filepath, index=False)  # avoid writing the index as an extra feature column

    data_df = pd.concat(
        [vector_df,
         pd.DataFrame(df['emotion']).reset_index(drop=True)],
        axis=1)

    models = pycc.setup(data=data_df, target='emotion', session_id=123)

    best = pycc.compare_models()

    pycc.save_model(best, 'lr_emotion.model')
    with open('pycc_setup.pk', 'wb') as f:
        pk.dump(models, f)
Example #21
def create_model(self, model, target):
    classification.setup(data=self.data,
                         target=target,
                         silent=True,
                         html=False)
    return classification.create_model(model)