def preprocessing(df, step_train):
    """
    Run the preprocessing step for either the training or the inference flow.

    In the training flow a new Preprocessing object is built and executed on
    ``df`` with ``val_size=0.2``; the object and the two resulting datasets
    are then saved. In the inference flow a previously saved Preprocessing
    object is loaded, executed on ``df``, and the result is saved.

    Parameters
    ----------
    df : pd.Dataframe
        Train or test dataset
    step_train : boolean
        True for the training flow, False for the inference flow
    """
    if not step_train:
        # Inference: reuse the object persisted under the input path.
        preprocessor = load(path_input + 'preprocessing/preprocessing.pkl')
        inference_df = preprocessor.execute(df, step_train=False)
        logging.info("Saving")
        inference_df.to_csv(
            path_output + 'processed/inference/inference.csv', index=False
        )
        return

    # Training: min-max normalize Age and one-hot encode Pclass and Sex.
    preprocessor = Preprocessing({'min-max': ['Age']}, ['Pclass', 'Sex'])
    train_df, val_df = preprocessor.execute(df, step_train=True, val_size=0.2)

    logging.info("Saving")
    dump(preprocessor, path_output + 'preprocessing/preprocessing.pkl')
    train_df.to_csv(path_output + 'processed/train/train.csv', index=False)
    val_df.to_csv(path_output + 'processed/val/val.csv', index=False)
def test_normalize(cleaned_data_train, cleaned_data_test):
    """
    Test if column Age is normalized.

    The same Preprocessing object normalizes the train data first
    (``step_train=True``) and the test data second (``step_train=False``);
    after each call Age must pass ``values_between(..., 'Age', 0, 1)``.
    """
    from ml.preprocessing.preprocessing import Preprocessing

    preprocessor = Preprocessing(norm_cols={'min-max': ['Age']})

    # Order matters: the train call has to run before the test call.
    cases = (
        (cleaned_data_train, True),
        (cleaned_data_test, False),
    )
    for data, is_train in cases:
        normalized = preprocessor.normalize(data, step_train=is_train)
        assert values_between(normalized, 'Age', 0, 1)
def test_categ_encoding(cleaned_data):
    """
    Test if the categorical columns are encoded into the expected columns.

    Runs ``Preprocessing.categ_encoding`` on ``cleaned_data`` and checks that
    every expected column name is present in the result.
    """
    from ml.preprocessing.preprocessing import Preprocessing

    names = [
        'Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3',
        'Sex_female', 'Sex_male'
    ]

    p = Preprocessing()
    df = p.categ_encoding(cleaned_data)

    # Asserting on a non-empty list is always true, so check each name
    # explicitly and report the ones that are absent.
    missing = [name for name in names if name not in df.columns]
    assert not missing, f"Missing encoded columns: {missing}"
def test_categ_encoding(cleaned_data_train, cleaned_data_test):
    """
    Test if columns Pclass and Sex are one-hot encoded.

    The same Preprocessing object encodes the train data first
    (``step_train=True``) and the test data second (``step_train=False``);
    each result is checked with ``all_columns`` against the expected names.
    """
    from ml.preprocessing.preprocessing import Preprocessing

    expected_columns = [
        'Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3',
        'Sex_1', 'Sex_2'
    ]
    encoder = Preprocessing(oneHot_cols=['Pclass', 'Sex'])

    # Order matters: the train call has to run before the test call.
    for data, is_train in ((cleaned_data_train, True), (cleaned_data_test, False)):
        encoded = encoder.categ_encoding_oneHot(data, step_train=is_train)
        assert all_columns(encoded, expected_columns)
def __init__(self):
    """
    Set up the page: title, MLflow experiment, raw data, algorithms and mode.

    The MLflow experiment name is read from the ``project_name`` key of
    ``config/config.json``. The selected mode comes from a sidebar radio
    with the options "Model Fitting" and "Predict".
    """
    st.title("Model Execution")

    with open('config/config.json', 'r') as config_file:
        config = json.load(config_file)
        experiment_name = config['project_name']
    mlflow.set_experiment(experiment_name)

    self.df = Spreadsheet().get_data('../data/raw/train.csv')

    # Short key -> estimator class offered for fitting.
    self.algos = dict(
        rf=RandomForestClassifier,
        gb=GradientBoostingClassifier,
        log=LogisticRegression,
    )

    self.p = Preprocessing()
    self.mode = st.sidebar.radio("", ["Model Fitting", "Predict"])
def test_execute_train(read_data_train, read_data_test):
    """
    Test if execute is correct.

    ``execute`` is run on the train data with ``step_train=True`` (yielding a
    train and a validation frame) and then on the test data with
    ``step_train=False``. Every resulting frame must pass ``all_columns`` for
    the expected names and ``values_between(..., 'Age', 0, 1)``.
    """
    from ml.preprocessing.preprocessing import Preprocessing

    expected_columns = [
        'Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3',
        'Sex_1', 'Sex_2'
    ]
    pipeline = Preprocessing({'min-max': ['Age']}, ['Pclass', 'Sex'])

    train_frame, val_frame = pipeline.execute(read_data_train, step_train=True)
    for frame in (train_frame, val_frame):
        assert all_columns(frame, expected_columns)
        assert values_between(frame, 'Age', 0, 1)

    test_frame = pipeline.execute(read_data_test, step_train=False)
    assert all_columns(test_frame, expected_columns)
    assert values_between(test_frame, 'Age', 0, 1)
from ml.model.trainer import TrainerSklearn from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.linear_model import LogisticRegression import mlflow import mlflow.sklearn with open('config/config.json', 'r') as file: project_name = json.load(file)['project_name'] mlflow.set_experiment(project_name) df = Spreadsheet().get_data('../data/raw/train.csv') p = Preprocessing() df = p.clean_data(df) df = p.categ_encoding(df) X = df.drop(columns=["Survived"]) y = df["Survived"] algos = { 'rf':RandomForestClassifier, 'gb':GradientBoostingClassifier, 'log':LogisticRegression } for algo in algos.keys(): with mlflow.start_run() as run: model = TrainerSklearn().train(X, y,
def cleaned_data_test(read_data_test):
    """
    Yield ``read_data_test`` after it has gone through ``Preprocessing.clean_data``.

    NOTE(review): presumably registered as a pytest fixture; the decorator is
    not visible in this excerpt.
    """
    from ml.preprocessing.preprocessing import Preprocessing

    preprocessor = Preprocessing()
    cleaned = preprocessor.clean_data(read_data_test)
    yield cleaned
class ModelPage:
    """
    Streamlit page with two modes: fitting models and predicting with a
    previously saved model.
    """

    def __init__(self):
        """
        Set up the page: title, MLflow experiment, raw data, algorithms and mode.

        The MLflow experiment name is read from the ``project_name`` key of
        ``config/config.json``.
        """
        st.title("Model Execution")
        with open('config/config.json', 'r') as file:
            project_name = json.load(file)['project_name']
        mlflow.set_experiment(project_name)
        self.df = Spreadsheet().get_data('../data/raw/train.csv')
        # Short key -> estimator class offered for fitting.
        self.algos = {
            'rf': RandomForestClassifier,
            'gb': GradientBoostingClassifier,
            'log': LogisticRegression,
        }
        self.p = Preprocessing()
        self.mode = st.sidebar.radio("", ["Model Fitting", "Predict"])

    def preprocess(self):
        """Return the raw data after ``clean_data`` and ``categ_encoding``."""
        df_clean = self.p.clean_data(self.df)
        df_encoded = self.p.categ_encoding(df_clean)
        return df_encoded

    def fit_model(self, df, algos):
        """
        Fit every algorithm in ``algos`` on ``df`` and save each model.

        Parameters
        ----------
        df : pd.DataFrame
            Preprocessed data containing the ``Survived`` target column
        algos : dict
            Mapping of algorithm key to estimator class

        Each fit runs inside its own MLflow run, which logs the algorithm
        key, the metrics and the model. The model is also saved to
        ``../output/titanic_model_<key>.pkl``.
        """
        X = df.drop(columns=["Survived"])
        y = df["Survived"]
        total = len(algos)
        my_bar = st.progress(0)
        for step, (algo, estimator) in enumerate(algos.items(), start=1):
            st.write('### Fitting {}'.format(ALGO_NAME[algo]))
            with mlflow.start_run():
                model = TrainerSklearn().train(
                    X, y,
                    classification=True,
                    algorithm=estimator,
                    data_split=('cv', {'cv': 8}),
                    preprocessing=self.p,
                )
                mlflow.log_params({'algorithm': algo})
                mlflow.log_metrics(model.get_metrics())
                mlflow.sklearn.log_model(model.get_model(), 'model')
                # Save the model to the output folder
                model.save_model(f'../output/titanic_model_{algo}.pkl')
            # step / total is exactly 1.0 on the last step, so the value can
            # never drift above the progress bar's upper bound.
            my_bar.progress(step / total)
        my_bar.empty()

    def model_page(self):
        """Render the fitting form and fit the chosen algorithm(s) on submit."""
        st.write('## Model Fitting')
        st.write("""#### Train the developed machine learning algorithm""")
        with st.form(key="model_form"):
            algo_options = ['All'] + list(self.algos.keys())
            chosen_algo = st.selectbox(
                'Choose algorithm',
                algo_options,
                format_func=lambda x: x if x == 'All' else ALGO_NAME[x],
            )
            if chosen_algo == 'All':
                model_algo = self.algos
            else:
                model_algo = {chosen_algo: self.algos[chosen_algo]}
            st.write('Execute model?')
            fit = st.form_submit_button('Fit')
        if fit:
            with st.spinner("Running algorithm ..."):
                df_model = self.preprocess()
                self.fit_model(df_model, model_algo)
            # NOTE(review): in the source as received this literal is split
            # by a line break after "Success! "; confirm the intended
            # separator character.
            st.write('### Success! \nModels fitted!')

    def predict(self, X, model, probs):
        """
        Predict with the saved model identified by ``model``.

        Parameters
        ----------
        X : pd.DataFrame
            Raw input rows
        model : str
            Algorithm key used in the saved file name
            ``../output/titanic_model_<key>.pkl``
        probs : bool
            If true, return column 1 of ``predict_proba``; otherwise return
            the output of ``predict``

        Returns
        -------
        The prediction, or None when the model file could not be loaded (an
        error message is shown on the page in that case).
        """
        try:
            model = load(f'../output/titanic_model_{model}.pkl')
        except Exception:
            # Without a loaded model nothing below can work; report and stop
            # instead of failing later on a string that is not a model.
            st.error('Model not loaded')
            return None
        X = self.p.clean_data(X)
        X = self.p.categ_encoding(X)
        # Add any column the model reports but the input lacks, filled with 0.
        for col in model.get_columns():
            if col not in X.columns:
                X[col] = 0
        if probs:
            return model.predict_proba(X)[:, 1]
        return model.predict(X)

    def predict_page(self):
        """Render the simulator form and show the prediction on submit."""
        st.write('## Simulator')
        st.write("""Predict the outcome or probability of survival by customizing the input data of individuals""")
        st.write('')
        model_list = os.listdir('../output')
        # Extract the algorithm key from names shaped like "..model_<key>.pkl";
        # files that do not match are skipped.
        pattern = re.compile(r'(?<=model_).+(?=\.pkl)')
        found = (pattern.search(name) for name in model_list)
        avail_algo = [m.group(0) for m in found if m]
        with st.form(key="predict_form"):
            # New Data
            col1, col2, col3 = st.columns(3)
            pclass = col1.radio("Passage Class", (1, 2, 3), format_func=lambda x: CLASS_NAME[x])
            sex = col2.selectbox("Sex", ('male', 'female'), format_func=lambda x: x.title())
            age = col3.number_input("Age", step=5)
            new_data = pd.DataFrame({
                'Pclass': [pclass],
                'Sex': [sex],
                'Age': [age]
            })
            # Prediction Options
            col4, col5 = st.columns([2, 1])
            algorithm = col4.selectbox('Algorithm', avail_algo, format_func=lambda x: ALGO_NAME[x])
            probs = col5.radio('Predict Probability?', (True, False), format_func=lambda x: 'Yes' if x else 'No')
            predict = st.form_submit_button(label='Predict')
        if predict:
            pred = self.predict(new_data, algorithm, probs)
            if pred is None:
                # predict() already reported the failure on the page.
                return
            if probs:
                velocimeter_chart(pred[0])
            else:
                outcome = 'SURVIVED' if pred == 1 else 'DIED'
                st.write(f"""
                The model predicted that this individual would have
                ### {outcome}
                in the Titanic tragic accident
                """)

    def write(self):
        """Render the page that matches the mode selected in the sidebar."""
        if self.mode == 'Model Fitting':
            self.model_page()
        elif self.mode == 'Predict':
            self.predict_page()