def fit(self):
    """Run the PyCaret model search on a 90/10 split of ``self.dataset``.

    The split and its feature/target views are stored on the instance
    (``train``, ``test``, ``X_train``, ``y_train``, ``X_test``, ``y_test``).
    PyCaret is then set up on the training part only, ``compare_models`` is
    called with ``self.automl_settings`` and whatever ``automl()`` returns is
    kept in ``self.automl_pipeline``.
    """
    train_part, holdout_part = train_test_split(self.dataset, test_size=0.1)
    self.train, self.test = train_part, holdout_part

    label = self.target
    self.X_train, self.y_train = train_part.drop(columns=label), train_part[label]
    self.X_test, self.y_test = holdout_part.drop(columns=label), holdout_part[label]

    # Anything that is not "classification" is handled by the regression module.
    if self.problem_type != "classification":
        from pycaret.regression import automl, compare_models, setup
    else:
        from pycaret.classification import automl, compare_models, setup

    # The following PyCaret calls take no experiment handle, so the value
    # returned by setup() does not need to be kept.
    setup(data=train_part, target=label, silent=True, html=False)
    compare_models(**self.automl_settings)
    self.automl_pipeline = automl()
if __name__ == "__main__":
    # Train an emotion classifier on tweet vectors.
    #
    # Steps: load the combined data set, vectorise every tweet (or reuse the
    # CSV cache of vectors), let PyCaret compare models, then persist the
    # best model and the object returned by setup().
    df = load_combine_saif_liu_data()

    # Flip to True to work on at most 100 tweets per emotion.
    sample = False
    if sample:
        df = df.groupby('emotion', group_keys=False).apply(
            lambda x: x.sample(min(len(x), 100)))

    vector_df_filepath = 'data/vector_df.csv'
    if os.path.exists(vector_df_filepath):
        vector_df = pd.read_csv(vector_df_filepath)
        # Caches written by earlier versions of this script also stored the
        # row index; pandas reloads it as an "Unnamed: 0" column which would
        # otherwise be fed to the model as a feature.
        vector_df = vector_df.drop(columns='Unnamed: 0', errors='ignore')
    else:
        vectors = df['tweet'].swifter.apply(get_text_vector)
        # A plain list of equal-length 1-D vectors is laid out one row per
        # tweet regardless of how the installed pandas treats object arrays.
        vector_df = pd.DataFrame(
            vectors.tolist(),
            columns=[f'v{r}' for r in range(vectors.iloc[0].shape[0])])
        # index=False keeps the cached file column-for-column identical to
        # the freshly computed frame.
        vector_df.to_csv(vector_df_filepath, index=False)

    # Positional alignment: both frames get a fresh 0..n-1 index.
    data_df = pd.concat(
        [vector_df, pd.DataFrame(df['emotion']).reset_index(drop=True)],
        axis=1)

    models = pycc.setup(data=data_df, target='emotion', session_id=123)
    best = pycc.compare_models()
    pycc.save_model(best, 'lr_emotion.model')
    with open('pycc_setup.pk', 'wb') as f:
        pk.dump(models, f)
"""#### No missing values ## Visualization """

# Count the classes once and reuse the result for both wedge sizes and labels.
type_counts = data['Type'].value_counts()
plt.figure(figsize=(12,12))
plt.pie(type_counts, labels=type_counts.index)
plt.title('Class Distribution')
plt.show()

"""## Setup, it is like pipeline where we can pass peprocessing and transformation steps."""

# 'Type' is the target feature; features are normalized and 70% of the rows
# are used for training.
setup(data=data, target='Type', normalize=True, train_size=0.7)

models()
best_model = compare_models()

"""### Extra Trees Classifier and Random Forest are better as shwon by PyCaret."""

# Bare expression kept from the notebook export (it displays the model there).
best_model
predict_model(best_model)
# Load the dataset
from pycaret.datasets import get_data

data = get_data(dataset)

# Initialise the classification experiment with experiment logging enabled
from pycaret.classification import (
    setup,
    compare_models,
    blend_models,
    tune_model,
    save_model,
    deploy_model,
    automl,
)

clf1 = setup(
    data,
    target=target,
    silent=True,
    html=False,
    log_experiment=True,
    experiment_name=exp_name,
)

# Compare all models except CatBoost and keep the five best
top5 = compare_models(n_select=5, blacklist=['catboost'])

# Blend the five selected models
blender = blend_models(estimator_list=top5)

# Tune the first entry of the top-5 list
tuned_best_model = tune_model(top5[0])

# Let PyCaret pick the best model of the session and save it locally
a = automl()
save_model(a, 'best_model')

# Deploy that model to the 'pycaret-test' bucket
deploy_model(
    a,
    model_name='best-model-aws',
    authentication={'bucket': 'pycaret-test'},
)
# Load the 'juice' sample dataset
from pycaret.datasets import get_data

data = get_data('juice')

# Set up a classification experiment predicting the 'Purchase' column
from pycaret.classification import (
    setup,
    compare_models,
    save_model,
    deploy_model,
)

reg1 = setup(
    data,
    target='Purchase',
    logging=True,
    experiment_name='juice-script1',
    silent=True,
    html=False,
)

# Compare models and keep the single best one
c = compare_models(n_select=1)

# Save the model locally
save_model(c, model_name='tfdemo')

# Deploy the model to the 'pycaret-test' S3 bucket
deploy_model(
    c,
    model_name='tfdemo',
    platform='aws',
    authentication={'bucket': 'pycaret-test'},
)
# Save the finalized models as .pkl files.
#
# finalize_model() hands back the estimator refit on the complete dataset;
# that returned object is what has to be saved. Previously the return value
# was discarded and the pre-finalization estimator was written to disk.
# The original variables are left untouched so that blend_models() below
# receives the same estimators as before.
save_model(finalize_model(etrees), 'classifier_models(pkl)/xant_etrees')
save_model(finalize_model(xgboost), 'classifier_models(pkl)/xant_xgb')
save_model(finalize_model(catboost), 'classifier_models(pkl)/xant_cboost')
save_model(finalize_model(rf), 'classifier_models(pkl)/xant_rf')
save_model(finalize_model(lgbm), 'classifier_models(pkl)/xant_lgbm')
save_model(finalize_model(log), 'classifier_models(pkl)/xant_log')

# Soft-voting blend of extra trees, LightGBM and random forest.
# Alternative kept for reference:
#blender_specific = blend_models(estimator_list=[
#etrees, lgbm, catboost], method='soft')
blender_specific = blend_models(estimator_list=[etrees, lgbm, rf],
                                method='soft')
save_model(finalize_model(blender_specific),
           'classifier_models(pkl)/xant_blended')

print('PyCaret training ended \n\n')
compare_models()  # print ordered 10-fold cv scores
# Regression: set up the experiment, then show the model comparison table.
pycset = regression.setup(data=df, target=df_target)
st.dataframe(regression.compare_models())
st.success('End of execution!')

# Binary and multiclass classification share the same flow.
if df_task in ('Classification (Binary)', 'Classification (Multiclass)'):
    with st.spinner('PyCaret setup is running...'):
        pycset = classification.setup(data=df, target=df_target)
    # Show the model comparison table
    st.dataframe(classification.compare_models())
    st.success('End of execution!')

# NLP: only the setup step runs; the model comparison is disabled.
if df_task == 'NLP':
    with st.spinner('PyCaret setup is running...'):
        pycset = nlp.setup(data=df, target=df_target)
    #st.dataframe(classification.compare_models())
    st.success('End of execution!')
# Dataset location and loading are disabled here, so `df` has to be defined
# before this point.
#url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
#df = read_csv(url, header=None)

# Name every column after its position ("0", "1", ...)
n_cols = df.shape[1]
df.columns = list(map(str, range(n_cols)))

# Set up the experiment with the last column as the target
grid = setup(
    data=df,
    target=df.columns[-1],
    html=False,
    silent=True,
    verbose=False,
)

# Evaluate and compare the models, then report the winner
best = compare_models()
print("best")
print(best)

# tune model hyperparameters on the sonar classification dataset
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
from pycaret.classification import setup
from pycaret.classification import tune_model

# Dataset location and loading are disabled here as well.
#url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
#df = read_csv(url, header=None)

# Column count, used to name the columns after their position
n_cols = df.shape[1]
def train_trad_ml_baseline(train_set_name, val_set_name, use_eiz=True, demographic_features=False):
    '''
    Trains an ensemble based classifier on a distribution based feature
    representation of EI or EIZ scores to predict whether or not a patient
    has an NMD.

    The PyCaret experiment is set up on the training set, after which the
    train/test data held in PyCaret's config is overwritten so that the
    complete training set is used for fitting and the given validation set
    is used for evaluation.

    :param train_set_name: The name of the training set to use
    :param val_set_name: The name of the validation set to use
    :param use_eiz: Whether to use EIZ or raw EI scores
    :param demographic_features: Whether to include demographic features
        ('Age', 'Sex', 'BMI').
    :return: A dictionary with the path to the stored model ('model_path')
        and its best operating threshold ('best_threshold').
    '''
    additional_features = ['Age', 'Sex', 'BMI'] if demographic_features else []

    # obtain feature representations
    train_set = obtain_feature_rep_ml_experiment(
        train_set_name, use_eiz=use_eiz,
        additional_features=additional_features)
    val_set = obtain_feature_rep_ml_experiment(
        val_set_name, use_eiz=use_eiz,
        additional_features=additional_features)

    # map the class labels to 0/1, identically for both sets
    label_map = {'no NMD': 0, 'NMD': 1}
    train_set['Class'] = train_set['Class'].replace(label_map)
    val_set['Class'] = val_set['Class'].replace(label_map)

    # use only ensemble models
    models_to_use = models(type='ensemble').index.to_list()

    # All columns except the target, as a list in column order. A set was
    # used here before, which has no defined order and is not the list that
    # numeric_features is meant to receive.
    features = [col for col in train_set.columns if col != 'Class']

    # set the experiment up
    exp = setup(train_set, target='Class', numeric_features=features,
                html=False, session_id=123, train_size=0.7)

    # sidestep the fact that the lib makes another validation set:
    # manually get the pipeline pycaret uses for transforming the data
    # NOTE(review): index 7 of the setup() result is taken to be the
    # preprocessing pipeline; this depends on the installed PyCaret
    # version, so confirm it when upgrading.
    pipeline = exp[7]

    # transform the complete training set into the format pycaret expects and
    # overwrite the selected train split with it
    X_train = pipeline.transform(train_set.drop(columns='Class'))
    set_config('X_train', X_train)
    set_config('y_train', train_set['Class'])

    # same logic with the val set: use our own instead of the pre-sliced one
    X_test = pipeline.transform(val_set.drop(columns='Class'))
    set_config('X_test', X_test)
    set_config('y_test', val_set['Class'])

    # obtain the best model from the list, sorted by val set AUC
    best_model = compare_models(whitelist=models_to_use, sort='AUC', n_select=1)
    # interpretability output, get SHAP plots to judge feature importance
    interpret_model(best_model)

    # now, do some additional tuning, compare different hyperparameters,
    # maximize AUC
    best_model = tune_model(best_model, optimize='AUC')
    # interpret the tuned model
    interpret_model(best_model)

    # save the model under its experiment-specific path
    model_path = get_model_name(train_set_name, use_eiz, demographic_features)
    save_model(best_model, model_path)

    # get results on val set as dataframe
    results = predict_model(best_model, verbose=False)
    # get the threshold at which the model performed best on the val set
    best_threshold = evaluate_roc(results['Class'], results['Score'],
                                  method='val_set_training')

    return {'best_threshold': best_threshold, 'model_path': model_path}
def classification_model(
    *,
    y_col,
    training_set,
    normalize,
    test_size,
    folds,
    metric,
    model_name,
    testing_set,
    imbalanced,
    seed,
    include_models,
    normalize_method,
):
    """
    Build a classification model for prediction.

    The best model found by ``compare_models`` is re-created, tuned and
    finalized. Comparison and tuning tables are written to
    ``<model_name>_compare_models.tsv`` and ``<model_name>_tuned_model.tsv``,
    three diagnostic plots are saved, and the final model is saved under
    ``model_name``. When an external testing set is supplied, its predictions
    are written to ``<model_name>_external_testing_results.tsv``.

    Parameters
    ----------
    y_col : str
        the name of the target column.
    training_set : pd.DataFrame
        DataFrame containing the training data.
    normalize : bool
        if True the dataset will be normalized before training.
    test_size : float
        Between [0.0-1.0]. The size of the split for test within the
        training set.
    folds : int
        number of folds for cross validation.
    metric : str
        the metric used for evaluating the best model. Falls back to 'AUC'
        when empty or None.
    model_name : str
        the name to save the model.
    testing_set : pd.DataFrame or None
        the external dataset for evaluating the best model. External
        evaluation is skipped when it is None or has no rows.
    imbalanced : bool
        if True the imbalance will be fixed before the training.
    seed : int
        random number to initialize the process.
    include_models : List
        a list of models to be included in the process.
    normalize_method : str
        The method used for normalizing the data.

    Returns
    -------
    Final classification model
    """
    if not metric:
        metric = 'AUC'

    # The pycl calls below take no experiment handle, so the return value of
    # setup() is not kept.
    pycl.setup(target=y_col,
               fix_imbalance=imbalanced,
               normalize=normalize,
               normalize_method=normalize_method,
               data=training_set,
               train_size=1 - test_size,
               silent=True,
               fold=folds,
               session_id=seed)

    best_model = pycl.compare_models(sort=metric, include=include_models)
    # pull() is called right after each step to export that step's table.
    pycl.pull().to_csv(model_name + '_compare_models.tsv',
                       sep='\t', index=False)

    cl_model = pycl.create_model(best_model)
    cl_tuned_model = pycl.tune_model(cl_model, optimize=metric)
    pycl.pull().to_csv(model_name + '_tuned_model.tsv',
                       sep='\t', index=False)

    final_model = pycl.finalize_model(cl_tuned_model)
    for plot_kind in ('pr', 'confusion_matrix', 'feature'):
        pycl.plot_model(final_model, plot=plot_kind, save=True)
    pycl.save_model(final_model, model_name)

    # Treat a missing testing set like an empty one instead of failing on
    # ``None.index``.
    if testing_set is not None and len(testing_set.index) != 0:
        unseen_predictions = test_classifier(
            model_path=model_name + '.pkl',
            x_set=testing_set.drop(columns=[y_col]),
            y_col=testing_set[y_col],
            output=model_name)
        unseen_predictions.to_csv(
            model_name + '_external_testing_results.tsv',
            sep='\t', index=True)

    return final_model
def do_modeling(self, dataFrame, pipeline_dict):
    """Drive the interactive Streamlit modelling workflow.

    Widgets are rendered top to bottom and every checkbox/button gates the
    step below it:

    1. split ``dataFrame`` into features ``X`` and target ``y``;
    2. optionally scale ``X`` and reduce its dimensionality (PCA / LDA),
       recording each applied step in ``pipeline_dict``;
    3. run ``cl.setup`` and ``cl.compare_models`` on the prepared data;
    4. tune a selected model and plot it;
    5. tune and finalize a selected model, replay ``pipeline_dict`` on test
       data loaded from a user supplied CSV URL, predict on it and write the
       predictions to ``result.csv``.

    :param dataFrame: training data, including the target column.
    :param pipeline_dict: preprocessing steps recorded so far; updated in
        place by this method and replayed on the test data.
    :return: None
    """
    prob_type = st.selectbox('Select type of problem',
                             ['Classification', 'Regression'])
    target_variable = st.selectbox('Select target variable',
                                   dataFrame.columns)

    classification_model_library = [
        'lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge',
        'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm',
        'catboost'
    ]
    # Models for which cl.interpret_model is called after tuning.
    tree_based_models = [
        'Random Forest Classifier', 'Decision Tree Classifier',
        'Extra Trees Classifier', 'Gradient Boosting Classifier',
        'Extreme Gradient Boosting', 'Light Gradient Boosting Machine',
        'CatBoost Classifier'
    ]
    classification_model_names = [
        'Logistic Regression', 'K Neighbors Classifier', 'Naive Bayes',
        'Decision Tree Classifier', 'SVM - Linear Kernel',
        'SVM - Radial Kernel', 'Gaussian Process Classifier',
        'MLP Classifier', 'Ridge Classifier', 'Random Forest Classifier',
        'Quadratic Discriminant Analysis', 'Ada Boost Classifier',
        'Gradient Boosting Classifier', 'Linear Discriminant Analysis',
        'Extra Trees Classifier', 'Extreme Gradient Boosting',
        'Light Gradient Boosting Machine', 'CatBoost Classifier'
    ]
    # Display name -> model id; the two lists are index-aligned.
    classification_models = dict(
        zip(classification_model_names, classification_model_library))

    if st.checkbox('X and y Split'):
        X = self.get_features(dataFrame, target_variable)
        y = dataFrame[target_variable]
        st.write('Done!')

        if st.checkbox('X,y Info'):
            st.write(X)
            st.write(y)

        if st.checkbox('Scaling of data'):
            scale_X = self.do_standardScale(X)
            pipeline_dict['Scaling'] = True
            for col in scale_X:
                X[col] = scale_X[col].values
            st.write(X)
            st.write('Done!')

        if st.checkbox('Dimensionality Reduction'):
            if st.checkbox('PCA'):
                information_loss = st.text_input(
                    'Enter Information loss in percentage(%)')
                if st.button('PCA'):
                    pipeline_dict['PCA_info_loss'] = information_loss
                    pca_X = self.dimred_PCA(X, information_loss)
                    # Remember the original columns, append the components
                    # as columns 0..k-1, then drop the originals.
                    columns = X.columns
                    for i, val in enumerate(pca_X.T):
                        X[i] = val
                    X.drop(columns, axis=1, inplace=True)
                    st.write('Done!')

            if st.checkbox('LDA'):
                number_components = st.text_input(
                    'Enter the number of components')
                if st.button('LDA'):
                    pipeline_dict['LDA_number_components'] = number_components
                    # st.text_input yields a string; n_components must be
                    # an integer.
                    lda = LDA(n_components=int(number_components))
                    lda_X = lda.fit_transform(X, y)
                    columns = X.columns
                    for i, val in enumerate(lda_X.T):
                        X[i] = val
                    X.drop(columns, axis=1, inplace=True)
                    st.write('Done!')

        if st.checkbox('Start Base-Line modeling Classification'):
            # py_data is X itself (not a copy) with the target appended.
            py_data = X
            py_data[target_variable] = y
            st.write('Name :' + str(target_variable))
            st.write('Type :' + str(prob_type))

            if st.checkbox('Start Modeling'):
                exp1 = cl.setup(data=py_data,
                                target=target_variable,
                                session_id=123,
                                silent=True)
                st.write('Compare Models...')
                models_info = cl.compare_models()
                st.write(models_info)

            if st.checkbox('Tuning Models'):
                tuning_model_name = st.selectbox('Select Model for Tuning',
                                                 classification_model_names)
                if st.button('Start'):
                    st.write(tuning_model_name)
                    tuned_model, result = cl.tune_model(
                        classification_models[tuning_model_name],
                        verbose=False)
                    st.write(result)
                    # The interpretation plot is only requested for
                    # tree-based models; the confusion matrix for all.
                    if tuning_model_name in tree_based_models:
                        cl.interpret_model(tuned_model)
                        st.pyplot()
                    cl.plot_model(tuned_model, plot='confusion_matrix')
                    st.pyplot()

            if st.checkbox('Finalize Model'):
                # Same label and options as the tuning selectbox above; an
                # explicit key keeps the two widgets distinct when both
                # sections are open at once.
                final_model_name = st.selectbox('Select Model for Tuning',
                                                classification_model_names,
                                                key='finalize_model_name')
                if st.checkbox('Finalize'):
                    tuned_model, result = cl.tune_model(
                        classification_models[final_model_name],
                        verbose=False)
                    st.write(result)
                    final_model = cl.finalize_model(tuned_model)
                    st.write(final_model_name)
                    st.write(final_model.get_params())
                    st.write('Done!')
                    st.write(pipeline_dict)

                    url = st.text_input(
                        "Enter Test Data Url(Must be csv file)")
                    if st.button('Click'):
                        test_dataFrame = self.get_test_data_csv(url)
                        st.write(test_dataFrame)
                        test_dataFrame = self._apply_pipeline_to_test_data(
                            test_dataFrame, pipeline_dict)
                        st.write(test_dataFrame)
                        unseen_predictions = cl.predict_model(
                            final_model, data=test_dataFrame)
                        st.write(unseen_predictions.head())
                        unseen_predictions.to_csv('result.csv')


def _apply_pipeline_to_test_data(self, test_dataFrame, pipeline_dict):
    """Replay the preprocessing steps recorded in ``pipeline_dict``.

    Steps are applied in the dict's iteration order; keys that are not
    handled below are ignored.

    :param test_dataFrame: the test data; modified in place by most steps.
    :param pipeline_dict: recorded preprocessing steps and their settings.
    :return: the transformed test data (some steps return a new frame).
    """
    for step in pipeline_dict:
        if step == 'Convert_Data_Type':
            st.write('Convert_Data_Type')
            settings = pipeline_dict['Convert_Data_Type']
            self.convert_type(test_dataFrame, settings['column_name'],
                              settings['data_type'])
        elif step == 'remove_columns':
            st.write('remove_columns')
            test_dataFrame.drop(pipeline_dict['remove_columns'],
                                axis=1, inplace=True)
        elif step == 'remove_columns_threshold':
            st.write('remove_columns_threshold..')
            for threshold in pipeline_dict['remove_columns_threshold']:
                remove_columns = self.remove_null_columns(
                    test_dataFrame, float(threshold))
                test_dataFrame.drop(remove_columns, axis=1, inplace=True)
        elif step == 'Fill_Median_Mode_Columns':
            st.write('Fill_Median_Mode_Columns..')
            test_dataFrame = self.replace_null_columns(
                test_dataFrame, pipeline_dict['Fill_Median_Mode_Columns'])
        elif step == 'Create_Bins':
            st.write('Create_Bins..')
            settings = pipeline_dict['Create_Bins']
            # Column names and bin counts are index-aligned lists.
            for c, n_bins in zip(settings['column_Name'],
                                 settings['Numbers_bin']):
                test_dataFrame[c] = self.do_bining(
                    test_dataFrame, c, int(n_bins))
        elif step == 'OneHotEncoding':
            st.write('OneHotEncoding..')
            list_columns = pipeline_dict['OneHotEncoding']
            for col in list_columns:
                tempdf = pd.get_dummies(data=test_dataFrame[col])
                for in_col in tempdf.columns:
                    colName = str(col) + '_' + str(in_col)
                    test_dataFrame[colName] = tempdf[in_col].values
            test_dataFrame.drop(list_columns, axis=1, inplace=True)
        elif step == 'LabelEncoding':
            st.write('LabelEncoding..')
            test_dataFrame = self.do_label_Encoding(
                test_dataFrame, pipeline_dict['LabelEncoding'])
        elif step == 'BinaryEncoding':
            st.write('BinaryEncoding..')
            binary_encoding_columns = pipeline_dict['BinaryEncoding']
            for col in binary_encoding_columns:
                encoder = ce.BinaryEncoder(cols=[col])
                # Encode the test column itself. The training frame was
                # encoded here before, so rows of the wrong data set (and
                # usually the wrong length) were written into the test data.
                # NOTE(review): the encoder is fitted on the test data, as
                # the one-hot step above does; confirm the resulting codes
                # match the ones used for training.
                dfbin = encoder.fit_transform(test_dataFrame[col])
                for bin_col in dfbin.columns:
                    test_dataFrame[bin_col] = dfbin[bin_col].values
            test_dataFrame.drop(binary_encoding_columns,
                                axis=1, inplace=True)
        elif step == 'Scaling':
            st.write('Scaling..')
            scale_X = self.do_standardScale(test_dataFrame)
            for col in scale_X:
                test_dataFrame[col] = scale_X[col].values
    return test_dataFrame
version()

import time

t0 = time.time()

# Load the dataset
from pycaret.datasets import get_data

data = get_data(dataset, verbose=False)

# Initialise the classification experiment
from pycaret.classification import setup

exp1 = setup(
    data,
    target=target,
    silent=True,
    html=False,
    verbose=False,
)

# RECIPE #1 - select the top 5 models among the whitelisted ones
from pycaret.classification import compare_models

top5 = compare_models(
    n_select=5,
    whitelist=['dt', 'lr', 'rf', 'lightgbm', 'xgboost'],
)

# RECIPE #2 - tune each of the top 5 models
from pycaret.classification import tune_model

tuned_top5 = list(map(tune_model, top5))
print(len(tuned_top5))

# RECIPE #3 - blend the (untuned) top 5 models
from pycaret.classification import blend_models

blender = blend_models(top5, verbose=False)
print(blender)

from pycaret.classification import pull

pull()

# FINALIZE BEST MODEL