def wrong_module(pycaret_model):
    """Store ``pycaret_model`` in a BentoML model entry named "wrong_module".

    The entry is created with ``module=__name__`` (i.e. the calling module,
    not a framework module) and the PyCaret model is pickled to
    ``saved_model.pkl`` inside it.

    Args:
        pycaret_model: The trained PyCaret model handed to ``save_model``.

    Returns:
        The ``path`` attribute of the created BentoML model.
    """
    pending_entry = bentoml.models.create(
        "wrong_module",
        module=__name__,
        options=None,
        context=None,
        metadata=None,
    )
    with pending_entry as _model:
        pickle_destination = _model.path_of("saved_model.pkl")
        save_model(pycaret_model, pickle_destination)
        return _model.path
def fit(self, train: pd.DataFrame, test: pd.DataFrame, target: str = "name", finetune: bool = False, text_feature: str = "text", **kwargs) -> Pipeline:
    """Train (and optionally tune) the project-prediction model.

    Args:
        train (pd.DataFrame): training data
        test (pd.DataFrame): test dataset, passed to ``setup`` as ``test_data``
        target (str, optional): name of the label column. Defaults to "name".
        finetune (bool, optional): run ``tune_model`` on the created model
            when True. Defaults to False.
        text_feature (str, optional): column routed through the
            CountVectorizer/TfidfTransformer pipeline. Defaults to "text".
        **kwargs: forwarded unchanged to ``setup``.

    Returns:
        Pipeline: the pipeline returned by ``save_model``; it is also stored
        on ``self.pipeline`` (with the file name on ``self.filename``).
    """
    # Text column: token counts followed by TF-IDF weighting.
    bag_of_words_steps = [
        ('vect', CountVectorizer(lowercase=True)),
        ('tfidf', TfidfTransformer()),
    ]
    text_pipeline = Pipeline(bag_of_words_steps)

    # Every object-dtype column is one-hot encoded; unseen categories are ignored.
    categorical_encoder = OneHotEncoder(handle_unknown="ignore")
    object_columns = make_column_selector(dtype_include=object)
    custom_transformer = make_column_transformer(
        (text_pipeline, text_feature),
        (categorical_encoder, object_columns),
    )

    self.clf = setup(
        train,
        target=target,
        test_data=test,
        session_id=123,
        custom_pipeline=custom_transformer,
        preprocess=False,
        numeric_features=["duration", "attendee_cnt"],
        silent=True,
        **kwargs,
    )

    estimator = create_model('svm', fold=3)
    if finetune:
        estimator = tune_model(
            estimator,
            search_library="optuna",
            search_algorithm="tpe",
            n_iter=200,
            fold=3,
        )

    fitted = finalize_model(estimator)
    self.pipeline, self.filename = save_model(fitted, "trained_model")
    return self.pipeline
if __name__ == "__main__":
    # Build (or load cached) text vectors for every tweet, attach the emotion
    # label, let PyCaret compare classifiers, then persist the best model and
    # the value returned by setup().
    df = load_combine_saif_liu_data()

    # Flip to True to cap every emotion class at 100 rows.
    # NOTE(review): the vector cache below is not keyed on this flag, so a
    # cache built with one setting will not line up with the other.
    sample = False
    if sample:
        df = df.groupby(
            'emotion',
            group_keys=False).apply(lambda x: x.sample(min(len(x), 100)))

    vector_df_filepath = 'data/vector_df.csv'
    if os.path.exists(vector_df_filepath):
        vector_df = pd.read_csv(vector_df_filepath)
        # Caches written before the index=False fix carry the old row index
        # as an "Unnamed: 0" column; drop it so it is not used as a feature.
        vector_df = vector_df.drop(columns=['Unnamed: 0'], errors='ignore')
    else:
        vectors = df['tweet'].swifter.apply(get_text_vector)
        vector_df = pd.DataFrame(
            vectors.array,
            columns=[f'v{r}' for r in range(vectors.iloc[0].shape[0])])
        # index=False keeps the cache column-for-column identical to the
        # freshly computed frame when it is read back.
        vector_df.to_csv(vector_df_filepath, index=False)

    # Align labels positionally with the vectors (which use a fresh 0..n-1 index).
    data_df = pd.concat(
        [vector_df, pd.DataFrame(df['emotion']).reset_index(drop=True)],
        axis=1)

    models = pycc.setup(data=data_df, target='emotion', session_id=123)
    best = pycc.compare_models()
    pycc.save_model(best, 'lr_emotion.model')
    with open('pycc_setup.pk', 'wb') as f:
        pk.dump(models, f)
import pandas as pd
import numpy
from pycaret.classification import (
    create_model,
    save_model,
    setup,
    tune_model,
)

# Read the HR training data (path is relative to the working directory).
train_data = pd.read_csv("../data/HR_training_data.csv")

# Initialise the PyCaret environment with 'left' as the target column.
employee_class = setup(
    data=train_data,
    target='left',
    session_id=123,
)

# Create the LightGBM model.
lightgbm = create_model('lightgbm')

# Tune the model, optimising on AUC.
tuned_lightgbm = tune_model(lightgbm, optimize='AUC')

# Save the tuned model.
save_model(tuned_lightgbm, '../model/employees_churn_model')
def _store_classifier(self, classifier_model):
    """Save ``classifier_model`` into the task's output directory.

    The file stem is ``classifier_<self.task.pycaret_model>``; the joined
    path is handed to ``classification.save_model``.
    """
    destination = os.path.join(
        self.task.output_dir,
        f"classifier_{self.task.pycaret_model}",
    )
    classification.save_model(classifier_model, destination)
from pycaret.datasets import get_data
from pycaret.classification import (
    setup,
    compare_models,
    blend_models,
    tune_model,
    save_model,
    deploy_model,
    automl,
)

# Load the dataset named by `dataset`.
data = get_data(dataset)

# Initialize setup.
clf1 = setup(
    data,
    target=target,
    silent=True,
    html=False,
    log_experiment=True,
    experiment_name=exp_name,
)

# Compare models and keep the top 5 (catboost excluded).
top5 = compare_models(n_select=5, blacklist=['catboost'])

# Blend the top 5 models.
blender = blend_models(estimator_list=top5)

# Tune the first of the selected models.
tuned_best_model = tune_model(top5[0])

# Select the best model and save it locally.
a = automl()
save_model(a, 'best_model')

# Deploy the best model to the 'pycaret-test' bucket.
bucket_auth = {'bucket': 'pycaret-test'}
deploy_model(a, model_name='best-model-aws', authentication=bucket_auth)
# Import dataset.
from pycaret.datasets import get_data
data = get_data('juice')

# Init setup.
from pycaret.classification import setup, compare_models, save_model, deploy_model
# `log_experiment` is the setup() keyword that turns on experiment logging;
# the previous `logging=True` is not a setup() parameter.
reg1 = setup(
    data,
    target='Purchase',
    log_experiment=True,
    experiment_name='juice-script1',
    silent=True,
    html=False,
)

# Compare models and keep the single best one.
c = compare_models(n_select=1)

# Save model locally.
save_model(c, model_name='tfdemo')

# Deploy model on S3.
deploy_model(c, model_name='tfdemo', platform='aws', authentication={'bucket': 'pycaret-test'})
log_experiment=True, experiment_name='xv-21', session_id=110, numeric_features=['bclim14']) # create models etrees = create_model('et') xgboost = create_model('xgboost') catboost = create_model('catboost') rf = create_model('rf') lgbm = create_model('lightgbm') log = create_model('lr') # save models as .pkl files finalize_model(etrees) save_model(etrees, 'classifier_models(pkl)/xant_etrees') finalize_model(xgboost) save_model(xgboost, 'classifier_models(pkl)/xant_xgb') finalize_model(catboost) save_model(catboost, 'classifier_models(pkl)/xant_cboost') finalize_model(rf) save_model(rf, 'classifier_models(pkl)/xant_rf') finalize_model(lgbm) save_model(lgbm, 'classifier_models(pkl)/xant_lgbm') finalize_model(log) save_model(log, 'classifier_models(pkl)/xant_log')
#initialize setup t0 = time.time() if module == 'classification': from pycaret.classification import setup, compare_models, automl, save_model setup(data, target=target, silent=True, html=False, log_experiment=True, experiment_name=exp_name) best_model = compare_models() model = automl() save_model(model, model_name='pycaret-clf-best') elif module == 'regression': from pycaret.regression import setup, compare_models, automl, save_model setup(data, target=target, silent=True, html=False, log_experiment=True, experiment_name=exp_name) best_model = compare_models() model = automl() save_model(model, model_name='pycaret-reg-best') elif module == 'clustering': from pycaret.clustering import setup, create_model, save_model
def train_trad_ml_baseline(train_set_name, val_set_name, use_eiz=True, demographic_features=False):
    '''
    Trains an ensemble-based classifier on a distribution-based feature
    representation of EI or EIZ scores to predict whether or not a patient
    has an NMD.

    :param train_set_name: The name of the training set to use
    :param val_set_name: The name of the validation set to use
    :param use_eiz: Whether to use EIZ or raw EI scores
    :param demographic_features: Whether to include demographic features
        ('Age', 'Sex', 'BMI').
    :return: A dictionary with the path to the stored model ('model_path')
        and its best operating threshold ('best_threshold').
    '''
    extra_columns = ['Age', 'Sex', 'BMI'] if demographic_features else []
    label_encoding = {'no NMD': 0, 'NMD': 1}

    # obtain feature representations
    train_set = obtain_feature_rep_ml_experiment(
        train_set_name, use_eiz=use_eiz, additional_features=extra_columns)
    val_set = obtain_feature_rep_ml_experiment(
        val_set_name, use_eiz=use_eiz, additional_features=extra_columns)

    # map the class labels to 0/1
    train_set['Class'] = train_set['Class'].replace(label_encoding)
    val_set['Class'] = val_set['Class'].replace(label_encoding)

    # restrict the comparison to ensemble models
    ensemble_ids = models(type='ensemble').index.to_list()

    # every column except the label is treated as a numeric feature
    features = set(train_set.columns)
    features.remove('Class')

    # set the experiment up
    exp = setup(
        train_set,
        target='Class',
        numeric_features=features,
        html=False,
        session_id=123,
        train_size=0.7,
    )

    # Sidestep the fact that the lib carves out its own validation split:
    # grab the preprocessing pipeline from the setup result and overwrite the
    # stored splits with our own train / val sets.
    pipeline = exp[7]

    # the entire training set becomes X_train / y_train
    train_matrix = pipeline.transform(train_set.drop(columns='Class'))
    set_config('X_train', train_matrix)
    set_config('y_train', train_set['Class'])

    # our own validation set becomes X_test / y_test
    val_matrix = pipeline.transform(val_set.drop(columns='Class'))
    set_config('X_test', val_matrix)
    set_config('y_test', val_set['Class'])

    # obtain the best model from the list, sorted by AUC
    best_model = compare_models(whitelist=ensemble_ids, sort='AUC', n_select=1)
    # interpretability output, get SHAP plots to judge feature importance
    interpret_model(best_model)

    # additional tuning: compare different hyperparameters, maximize AUC
    best_model = tune_model(best_model, optimize='AUC')
    # interpret the tuned model as well
    interpret_model(best_model)

    # save the model under its experiment-specific path
    model_path = get_model_name(train_set_name, use_eiz, demographic_features)
    save_model(best_model, model_path)

    # get results on the val set as a dataframe
    results = predict_model(best_model, verbose=False)
    # threshold at which the model performed best on the val set
    best_threshold = evaluate_roc(
        results['Class'], results['Score'], method='val_set_training')

    return {'best_threshold': best_threshold, 'model_path': model_path}
def classification_model(
    *,
    y_col,
    training_set,
    normalize,
    test_size,
    folds,
    metric,
    model_name,
    testing_set,
    imbalanced,
    seed,
    include_models,
    normalize_method,
):
    """
    Build, tune, finalize and save a classification model.

    Parameters
    ----------
    y_col : str
        Name of the target column.
    training_set : pd.DataFrame
        Training data.
    normalize : bool
        If True the dataset is normalized before training.
    test_size : float
        Between [0.0-1.0]. Share of the training set held out for testing;
        ``1 - test_size`` is passed on as ``train_size``.
    folds : int
        Number of folds for cross validation.
    metric : str
        Metric used to rank and tune models; falls back to 'AUC' when falsy.
    model_name : str
        Name the model is saved under; also the prefix of every report file.
    testing_set : pd.DataFrame
        External dataset for evaluating the final model; skipped when it has
        no rows.
    imbalanced : bool
        If True class imbalance is fixed before training.
    seed : int
        Random seed used to initialize the process.
    include_models : List
        Models to be included in the comparison.
    normalize_method : str
        Method used for normalizing the data.

    Returns
    -------
    Final classification model
    """
    metric = metric or 'AUC'

    experiment = pycl.setup(
        target=y_col,
        fix_imbalance=imbalanced,
        normalize=normalize,
        normalize_method=normalize_method,
        data=training_set,
        train_size=1 - test_size,
        silent=True,
        fold=folds,
        session_id=seed,
    )

    # Rank the candidate models and keep the comparison table.
    top_candidate = pycl.compare_models(sort=metric, include=include_models)
    pycl.pull().to_csv(f'{model_name}_compare_models.tsv', sep='\t', index=False)

    # Re-create the winner, tune it and keep the tuning table.
    created = pycl.create_model(top_candidate)
    tuned = pycl.tune_model(created, optimize=metric)
    pycl.pull().to_csv(f'{model_name}_tuned_model.tsv', sep='\t', index=False)

    final_model = pycl.finalize_model(tuned)
    for plot_kind in ('pr', 'confusion_matrix', 'feature'):
        pycl.plot_model(final_model, plot=plot_kind, save=True)
    pycl.save_model(final_model, model_name)

    # Nothing more to do when no external test rows were supplied.
    if len(testing_set.index) == 0:
        return final_model

    unseen_predictions = test_classifier(
        model_path=f'{model_name}.pkl',
        x_set=testing_set.drop(columns=[y_col]),
        y_col=testing_set[y_col],
        output=model_name,
    )
    unseen_predictions.to_csv(
        f'{model_name}_external_testing_results.tsv', sep='\t', index=True)
    return final_model
def save(self, model):
    """Save ``model`` via ``classification.save_model`` under ``self.model_name``."""
    destination = self.model_name
    classification.save_model(model, destination)