Ejemplo n.º 1
0
def wrong_module(pycaret_model):
    """Store ``pycaret_model`` in a BentoML model named "wrong_module".

    The BentoML model is created with ``module=__name__`` and no options,
    context or metadata. The PyCaret model is written to ``saved_model.pkl``
    inside it, and the BentoML model's ``path`` is returned.
    """
    creation_args = dict(
        module=__name__,
        options=None,
        context=None,
        metadata=None,
    )
    with bentoml.models.create("wrong_module", **creation_args) as bento_model:
        pickle_target = bento_model.path_of("saved_model.pkl")
        save_model(pycaret_model, pickle_target)
        # Return from inside the context so the path is read before exit.
        return bento_model.path
Ejemplo n.º 2
0
    def fit(self,
            train: pd.DataFrame,
            test: pd.DataFrame,
            target: str = "name",
            finetune: bool = False,
            text_feature: str = "text",
            **kwargs) -> Pipeline:
        """Train an SVM through PyCaret, optionally tune it, and save it.

        Args:
            train (pd.DataFrame): training data
            test (pd.DataFrame): test dataset, handed to ``setup`` as ``test_data``
            target (str, optional): name of the label column. Defaults to "name".
            finetune (bool, optional): when True, runs ``tune_model`` with the
                Optuna TPE search (200 iterations, 3 folds). Defaults to False.
            text_feature (str, optional): column sent through the
                CountVectorizer + TF-IDF steps. Defaults to "text".
            **kwargs: forwarded unchanged to ``setup``.

        Returns:
            Pipeline: first element returned by ``save_model``; it is also
                stored on ``self.pipeline`` (the second element goes to
                ``self.filename``).
        """

        # Text column: token counts followed by TF-IDF weighting.
        bag_of_words = Pipeline([
            ('vect', CountVectorizer(lowercase=True)),
            ('tfidf', TfidfTransformer()),
        ])
        # Object-dtype columns are one-hot encoded; unknown categories ignored.
        preprocessing = make_column_transformer(
            (bag_of_words, text_feature),
            (OneHotEncoder(handle_unknown="ignore"),
             make_column_selector(dtype_include=object)),
        )

        # PyCaret's own preprocessing is switched off in favour of the
        # custom column transformer built above.
        self.clf = setup(
            train,
            target=target,
            test_data=test,
            session_id=123,
            custom_pipeline=preprocessing,
            preprocess=False,
            numeric_features=["duration", "attendee_cnt"],
            silent=True,
            **kwargs,
        )

        estimator = create_model('svm', fold=3)
        if finetune:
            estimator = tune_model(
                estimator,
                search_library="optuna",
                search_algorithm="tpe",
                n_iter=200,
                fold=3,
            )

        saved = save_model(finalize_model(estimator), "trained_model")
        self.pipeline, self.filename = saved
        return self.pipeline
if __name__ == "__main__":
    # Load the combined tweet/emotion dataset.
    df = load_combine_saif_liu_data()

    # Flip to True to cap every emotion class at 100 rows for a quick run.
    sample = False
    if sample:
        df = df.groupby(
            'emotion',
            group_keys=False).apply(lambda x: x.sample(min(len(x), 100)))

    # Tweet vectors are cached on disk and reused when the file exists.
    vector_df_filepath = 'data/vector_df.csv'
    if os.path.exists(vector_df_filepath):
        vector_df = pd.read_csv(vector_df_filepath)
    else:
        vectors = df['tweet'].swifter.apply(get_text_vector)
        vector_df = pd.DataFrame(
            vectors.array,
            columns=[f'v{r}' for r in range(vectors.iloc[0].shape[0])])
        # index=False keeps the row index out of the cache. Otherwise a later
        # read_csv returns it as an extra "Unnamed: 0" column, which would be
        # fed to the model as a feature on cached runs only.
        # NOTE: caches written before this fix still contain that column;
        # delete the file to regenerate it.
        vector_df.to_csv(vector_df_filepath, index=False)

    # Put the label column next to the vectors; the label index is reset so
    # the two frames align by position.
    data_df = pd.concat(
        [vector_df,
         pd.DataFrame(df['emotion']).reset_index(drop=True)],
        axis=1)

    models = pycc.setup(data=data_df, target='emotion', session_id=123)

    best = pycc.compare_models()

    # Persist the best model and the object returned by setup().
    pycc.save_model(best, 'lr_emotion.model')
    with open('pycc_setup.pk', 'wb') as f:
        pk.dump(models, f)
import pandas as pd
import numpy
from pycaret.classification import setup, create_model, tune_model, save_model

# Load the HR training data (path is relative to the working directory).
train_data = pd.read_csv("../data/HR_training_data.csv")

# Initialize the PyCaret environment with 'left' as the target column and a
# fixed session_id (presumably for reproducibility).
employee_class = setup(data=train_data, target='left', session_id=123)

# Create a LightGBM model; no arguments beyond the model id are passed.
lightgbm = create_model('lightgbm')

# Tune the model, optimizing for AUC.
tuned_lightgbm = tune_model(lightgbm, optimize='AUC')

# Save the tuned model under ../model/employees_churn_model.
save_model(tuned_lightgbm, '../model/employees_churn_model')
 def _store_classifier(self, classifier_model):
     """Save ``classifier_model`` into the task's output directory.

     The file name is ``classifier_`` followed by ``self.task.pycaret_model``.
     """
     destination = os.path.join(
         self.task.output_dir,
         f"classifier_{self.task.pycaret_model}",
     )
     classification.save_model(classifier_model, destination)
Ejemplo n.º 6
0
from pycaret.datasets import get_data

# `dataset` is not defined in this snippet; presumably a dataset name set
# earlier in the script (TODO confirm).
data = get_data(dataset)

# Initialize the classification experiment. `target` and `exp_name` also come
# from outside this snippet.
from pycaret.classification import setup, compare_models, blend_models, tune_model, save_model, deploy_model, automl

clf1 = setup(data,
             target=target,
             silent=True,
             html=False,
             log_experiment=True,
             experiment_name=exp_name)

# Compare models and keep the top 5, leaving out catboost.
# NOTE(review): `blacklist` looks like the older PyCaret keyword (later
# releases use `exclude`); confirm against the installed version.
top5 = compare_models(n_select=5, blacklist=['catboost'])

# Blend the top 5 models.
blender = blend_models(estimator_list=top5)

# Tune the first model of the top-5 list.
tuned_best_model = tune_model(top5[0])

# Let automl() select the best model, then save it as 'best_model'.
a = automl()
save_model(a, 'best_model')

# Deploy the selected model to the 'pycaret-test' bucket; no `platform`
# argument is passed here.
deploy_model(a,
             model_name='best-model-aws',
             authentication={'bucket': 'pycaret-test'})
Ejemplo n.º 7
0
# Load the 'juice' sample dataset.
from pycaret.datasets import get_data
data = get_data('juice')

# Initialize the classification experiment with 'Purchase' as the target.
# NOTE(review): the result is named reg1 although the classification module
# is used.
# NOTE(review): confirm that setup() in the installed PyCaret version accepts
# the `logging` keyword.
from pycaret.classification import setup, compare_models, save_model, deploy_model
reg1 = setup(data,
             target='Purchase',
             logging=True,
             experiment_name='juice-script1',
             silent=True,
             html=False)

# Compare models and keep a single one (n_select=1).
c = compare_models(n_select=1)

# Save the selected model locally as 'tfdemo'.
save_model(c, model_name='tfdemo')

# Deploy the same model to AWS, into the 'pycaret-test' bucket.
deploy_model(c,
             model_name='tfdemo',
             platform='aws',
             authentication={'bucket': 'pycaret-test'})
                log_experiment=True,
                experiment_name='xv-21',
                session_id=110,
                numeric_features=['bclim14'])

# Create one model per algorithm id.
etrees = create_model('et')
xgboost = create_model('xgboost')
catboost = create_model('catboost')
rf = create_model('rf')
lgbm = create_model('lightgbm')
log = create_model('lr')

# Save models as .pkl files.
# finalize_model() returns the finalized estimator rather than updating its
# argument, so its return value is what must be passed to save_model().
# Previously the result was discarded and the non-finalized model was saved.
save_model(finalize_model(etrees), 'classifier_models(pkl)/xant_etrees')

save_model(finalize_model(xgboost), 'classifier_models(pkl)/xant_xgb')

save_model(finalize_model(catboost), 'classifier_models(pkl)/xant_cboost')

save_model(finalize_model(rf), 'classifier_models(pkl)/xant_rf')

save_model(finalize_model(lgbm), 'classifier_models(pkl)/xant_lgbm')

save_model(finalize_model(log), 'classifier_models(pkl)/xant_log')
Ejemplo n.º 9
0
# Initialize the experiment for the PyCaret module named by `module`.
# `module`, `data`, `target` and `exp_name` are defined outside this snippet.

# Record the start time; nothing in the visible code reads t0.
t0 = time.time()

if module == 'classification':
    from pycaret.classification import setup, compare_models, automl, save_model
    setup(data,
          target=target,
          silent=True,
          html=False,
          log_experiment=True,
          experiment_name=exp_name)
    # compare_models() is run first; the model that gets saved is the one
    # returned by automl().
    best_model = compare_models()
    model = automl()
    save_model(model, model_name='pycaret-clf-best')

elif module == 'regression':
    from pycaret.regression import setup, compare_models, automl, save_model
    setup(data,
          target=target,
          silent=True,
          html=False,
          log_experiment=True,
          experiment_name=exp_name)
    # Same flow as the classification branch, saved under a different name.
    best_model = compare_models()
    model = automl()
    save_model(model, model_name='pycaret-reg-best')

elif module == 'clustering':
    # NOTE(review): only the import is visible for this branch; the snippet
    # appears to be cut off here.
    from pycaret.clustering import setup, create_model, save_model
Ejemplo n.º 10
0
def train_trad_ml_baseline(train_set_name,
                           val_set_name,
                           use_eiz=True,
                           demographic_features=False):
    '''
    Fits an ensemble classifier on a distribution-based feature representation of EI or EIZ scores
    to predict whether or not a patient has an NMD, then tunes and stores the best model.
    :param train_set_name: The name of the training set to use
    :param val_set_name: The name of the validation set to use
    :param use_eiz: Whether to use EIZ or raw EI scores
    :param demographic_features: Whether to add the 'Age', 'Sex' and 'BMI' features.
    :return: A dictionary with the keys 'best_threshold' and 'model_path'.
    '''
    if demographic_features:
        additional_features = ['Age', 'Sex', 'BMI']
    else:
        additional_features = []

    # build the feature representation for both splits
    train_set = obtain_feature_rep_ml_experiment(
        train_set_name,
        use_eiz=use_eiz,
        additional_features=additional_features,
    )
    val_set = obtain_feature_rep_ml_experiment(
        val_set_name,
        use_eiz=use_eiz,
        additional_features=additional_features,
    )

    # encode the string labels as 0/1
    label_to_int = {'no NMD': 0, 'NMD': 1}
    train_set['Class'] = train_set['Class'].replace(label_to_int)
    val_set['Class'] = val_set['Class'].replace(label_to_int)

    # restrict the comparison to ensemble models
    ensemble_ids = models(type='ensemble').index.to_list()

    # every column except the label is declared as a numeric feature
    features = set(train_set.columns)
    features.remove('Class')

    exp = setup(
        train_set,
        target='Class',
        numeric_features=features,
        html=False,
        session_id=123,
        train_size=0.7,
    )

    # setup() makes its own train/validation split. Replace it with the full
    # training set and our own validation set, both passed through the
    # transformation pipeline taken from the setup() result.
    pipeline = exp[7]
    set_config('X_train', pipeline.transform(train_set.drop(columns='Class')))
    set_config('y_train', train_set['Class'])
    set_config('X_test', pipeline.transform(val_set.drop(columns='Class')))
    set_config('y_test', val_set['Class'])

    # pick the single best ensemble model, ranked by AUC
    best_model = compare_models(
        whitelist=ensemble_ids,
        sort='AUC',
        n_select=1,
    )
    # interpretability output (SHAP plots) before tuning ...
    interpret_model(best_model)

    # ... and again after hyperparameter tuning that maximizes AUC
    best_model = tune_model(best_model, optimize='AUC')
    interpret_model(best_model)

    # store the tuned model
    model_path = get_model_name(train_set_name, use_eiz, demographic_features)
    save_model(best_model, model_path)

    # score the validation set and find the best operating threshold on it
    results = predict_model(best_model, verbose=False)
    best_threshold = evaluate_roc(
        results['Class'],
        results['Score'],
        method='val_set_training',
    )

    return {'best_threshold': best_threshold, 'model_path': model_path}
Ejemplo n.º 11
0
def classification_model(
    *,
    y_col,
    training_set,
    normalize,
    test_size,
    folds,
    metric,
    model_name,
    testing_set,
    imbalanced,
    seed,
    include_models,
    normalize_method,
):
    """
    Train, tune, finalize and save a PyCaret classification model.

    Parameters
    ----------
    y_col : str
        Name of the target column.
    training_set : pd.DataFrame
        DataFrame holding the training data.
    normalize : bool
        If True the dataset is normalized before training.
    test_size : float
        Between [0.0-1.0]. Share of the training set held out for testing;
        ``1 - test_size`` is passed on as the train size.
    folds : int
        Number of cross-validation folds.
    metric : str
        Metric used to rank and tune models. A falsy value means 'AUC'.
    model_name : str
        Name the model is saved under; also the prefix of every TSV written.
    testing_set : pd.DataFrame
        External dataset for evaluating the final model. It is skipped when
        it has no rows.
    imbalanced : bool
        If True class imbalance is fixed before training.
    seed : int
        Random number used to initialize the process.
    include_models : List
        Models to be included in the comparison.
    normalize_method : str
        Method used for normalizing the data.

    Returns
    -------
    Final classification model

    """
    metric = metric or 'AUC'

    setup_options = dict(
        target=y_col,
        fix_imbalance=imbalanced,
        normalize=normalize,
        normalize_method=normalize_method,
        data=training_set,
        train_size=1 - test_size,
        silent=True,
        fold=folds,
        session_id=seed,
    )
    pycl.setup(**setup_options)

    # Rank the candidate models and write out the comparison grid.
    top_model = pycl.compare_models(sort=metric, include=include_models)
    comparison_grid = pycl.pull()
    comparison_grid.to_csv(model_name + '_compare_models.tsv',
                           sep='\t',
                           index=False)

    # Re-create the winner, tune it, and write out the tuning grid.
    tuned = pycl.tune_model(pycl.create_model(top_model), optimize=metric)
    tuning_grid = pycl.pull()
    tuning_grid.to_csv(model_name + '_tuned_model.tsv', sep='\t', index=False)

    final_model = pycl.finalize_model(tuned)
    for plot_kind in ('pr', 'confusion_matrix', 'feature'):
        pycl.plot_model(final_model, plot=plot_kind, save=True)
    pycl.save_model(final_model, model_name)

    # No external rows: nothing more to evaluate.
    if len(testing_set.index) == 0:
        return final_model

    unseen_predictions = test_classifier(
        model_path=model_name + '.pkl',
        x_set=testing_set.drop(columns=[y_col]),
        y_col=testing_set[y_col],
        output=model_name,
    )
    unseen_predictions.to_csv(model_name + '_external_testing_results.tsv',
                              sep='\t',
                              index=True)
    return final_model
Ejemplo n.º 12
0
 def save(self, model):
     """Save ``model`` through PyCaret under ``self.model_name``."""
     target_name = self.model_name
     classification.save_model(model, target_name)