Python Preprocessing Examples, ml.preprocessing.preprocessing.Preprocessing Python Examples

Example #1

0

Show file

def preprocessing(df, step_train):
    """
    Run the preprocessing stage in training or inference mode.

    In training mode a new Preprocessing object is created, executed on the
    data with a 0.2 validation size, saved as a pickle and the resulting
    train/validation sets are written to CSV. Otherwise a previously saved
    Preprocessing object is loaded, executed on the data and the result is
    written to the inference CSV.

    Parameters
    ----------
    df          : pd.Dataframe
                  Train or test dataset
    step_train  : boolean
                  True to run the training branch, False to run inference

    """
    if not step_train:
        # NOTE(review): the object is loaded from path_input although the
        # training branch dumps it under path_output -- confirm both point
        # to the same location.
        fitted = load(path_input + 'preprocessing/preprocessing.pkl')
        inference = fitted.execute(df, step_train=False)
        logging.info("Saving")
        inference_csv = path_output + 'processed/inference/inference.csv'
        inference.to_csv(inference_csv, index=False)
        return

    fitted = Preprocessing({'min-max': ['Age']}, ['Pclass', 'Sex'])
    train_part, val_part = fitted.execute(df, step_train=True, val_size=0.2)
    logging.info("Saving")
    dump(fitted, path_output + 'preprocessing/preprocessing.pkl')
    train_part.to_csv(path_output + 'processed/train/train.csv', index=False)
    val_part.to_csv(path_output + 'processed/val/val.csv', index=False)

Example #2

0

Show file

def test_normalize(cleaned_data_train, cleaned_data_test):
    """
    Check that the Age column lies between 0 and 1 after min-max
    normalization, first on the train data and then on the test data
    """
    from ml.preprocessing.preprocessing import Preprocessing
    normalizer = Preprocessing(norm_cols={'min-max': ['Age']})
    # The train data goes first (step_train=True), then the test data.
    stages = ((cleaned_data_train, True), (cleaned_data_test, False))
    for data, is_train in stages:
        normalized = normalizer.normalize(data, step_train=is_train)
        assert values_between(normalized, 'Age', 0, 1)

Example #3

0

Show file

File: test_project.py Project: gutoturolla/hermione

def test_categ_encoding(cleaned_data):
    """
    Test if the encoded dataset contains the expected columns

    Runs ``Preprocessing.categ_encoding`` on the cleaned data and checks that
    every expected column name (target, Age and the Pclass/Sex indicator
    columns) is present in the result.
    """
    from ml.preprocessing.preprocessing import Preprocessing
    p = Preprocessing()
    df = p.categ_encoding(cleaned_data)
    names = [
        'Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
        'Sex_male'
    ]
    # Asserting on a list of booleans is always true for a non-empty list,
    # so collect the columns that are actually missing and assert on those.
    missing = [name for name in names if name not in df.columns]
    assert not missing, f"missing columns after encoding: {missing}"

Example #4

0

Show file

def test_categ_encoding(cleaned_data_train, cleaned_data_test):
    """
    Check that one-hot encoding Pclass and Sex yields all expected columns,
    first on the train data and then on the test data
    """
    from ml.preprocessing.preprocessing import Preprocessing
    expected = [
        'Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2'
    ]
    encoder = Preprocessing(oneHot_cols=['Pclass', 'Sex'])
    # The train data goes first (step_train=True), then the test data.
    stages = ((cleaned_data_train, True), (cleaned_data_test, False))
    for data, is_train in stages:
        encoded = encoder.categ_encoding_oneHot(data, step_train=is_train)
        assert all_columns(encoded, expected)

Example #5

0

Show file

 def __init__(self):
     """
     Set up the page: show the title, point MLflow at the experiment named
     in config/config.json, load the raw training data and create the
     algorithm table, the Preprocessing object and the sidebar mode switch.
     """
     st.title("Model Execution")
     with open('config/config.json', 'r') as config_file:
         config = json.load(config_file)
     mlflow.set_experiment(config['project_name'])
     self.df = Spreadsheet().get_data('../data/raw/train.csv')
     # Short algorithm key -> estimator class
     self.algos = dict(
         rf=RandomForestClassifier,
         gb=GradientBoostingClassifier,
         log=LogisticRegression,
     )
     self.p = Preprocessing()
     modes = ["Model Fitting", "Predict"]
     self.mode = st.sidebar.radio("", modes)

Example #6

0

Show file

def test_execute_train(read_data_train, read_data_test):
    """
    Check that ``execute`` returns train, validation and test sets that all
    contain the expected columns and an Age column between 0 and 1
    """
    from ml.preprocessing.preprocessing import Preprocessing
    expected = [
        'Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2'
    ]
    preprocessor = Preprocessing({'min-max': ['Age']}, ['Pclass', 'Sex'])

    # Training step: execute returns the train and validation splits.
    train_part, val_part = preprocessor.execute(read_data_train,
                                                step_train=True)
    for frame in (train_part, val_part):
        assert all_columns(frame, expected)
        assert values_between(frame, 'Age', 0, 1)

    # Inference step: execute returns a single processed dataset.
    test_part = preprocessor.execute(read_data_test, step_train=False)
    assert all_columns(test_part, expected)
    assert values_between(test_part, 'Age', 0, 1)

Example #7

0

Show file

from ml.model.trainer import TrainerSklearn

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

import mlflow
import mlflow.sklearn

# Read the project name from the JSON config file; it is used below as the
# name of the MLflow experiment.
with open('config/config.json', 'r') as file:
    project_name = json.load(file)['project_name']

mlflow.set_experiment(project_name)

# Load the raw training data, then clean it and encode it with Preprocessing.
df = Spreadsheet().get_data('../data/raw/train.csv')
p = Preprocessing()
df = p.clean_data(df)
df = p.categ_encoding(df)

# Split the data into the feature matrix and the "Survived" target column.
X = df.drop(columns=["Survived"])
y = df["Survived"]

# Short algorithm key -> scikit-learn estimator class
algos = {
    'rf':RandomForestClassifier,
    'gb':GradientBoostingClassifier,
    'log':LogisticRegression
}

for algo in algos.keys():
    with mlflow.start_run() as run:
        model = TrainerSklearn().train(X, y,

Example #8

0

Show file

def cleaned_data_test(read_data_test):
    """Yield the test dataset after ``Preprocessing.clean_data`` is applied."""
    from ml.preprocessing.preprocessing import Preprocessing
    cleaned = Preprocessing().clean_data(read_data_test)
    yield cleaned

Example #9

0

Show file

class ModelPage:
    """
    Streamlit page to fit the Titanic models and to simulate predictions.

    The page has two modes, chosen from a sidebar radio button:
    "Model Fitting" trains the selected algorithm(s) and logs them to MLflow,
    and "Predict" loads a saved model and predicts for user-supplied data.
    """

    def __init__(self):
        """
        Show the page title, select the MLflow experiment named in
        config/config.json, load the raw training data and create the
        algorithm table, the Preprocessing object and the mode switch.
        """
        st.title("Model Execution")
        with open('config/config.json', 'r') as file:
            project_name = json.load(file)['project_name']
        mlflow.set_experiment(project_name)
        self.df = Spreadsheet().get_data('../data/raw/train.csv')
        # Short algorithm key -> estimator class
        self.algos = {
            'rf': RandomForestClassifier,
            'gb': GradientBoostingClassifier,
            'log': LogisticRegression
        }
        self.p = Preprocessing()
        self.mode = st.sidebar.radio("", ["Model Fitting", "Predict"])

    def preprocess(self):
        """Return the training data after cleaning and categorical encoding."""
        df_clean = self.p.clean_data(self.df)
        df_encoded = self.p.categ_encoding(df_clean)
        return df_encoded

    def fit_model(self, df, algos):
        """
        Train every algorithm in ``algos`` and log each run to MLflow.

        Parameters
        ----------
        df    : pd.DataFrame
                Preprocessed data containing the "Survived" target column
        algos : dict
                Maps the short algorithm key to the estimator class to train
        """
        X = df.drop(columns=["Survived"])
        y = df["Survived"]
        my_bar = st.progress(0)
        total = len(algos)
        for index, (algo, estimator) in enumerate(algos.items()):
            st.write('### Fitting {}'.format(ALGO_NAME[algo]))
            with mlflow.start_run():
                model = TrainerSklearn().train(X, y,
                                               classification=True,
                                               algorithm=estimator,
                                               data_split=('cv', {'cv': 8}),
                                               preprocessing=self.p)
                mlflow.log_params({'algorithm': algo})
                mlflow.log_metrics(model.get_metrics())
                mlflow.sklearn.log_model(model.get_model(), 'model')
                # Save the model to the output folder
                model.save_model(f'../output/titanic_model_{algo}.pkl')
                # Dividing last guarantees exactly 1.0 on the final step.
                my_bar.progress((index + 1) / total)
        my_bar.empty()

    def model_page(self):
        """Render the model fitting form and fit the chosen algorithm(s)."""
        st.write('## Model Fitting')
        st.write("""#### Train the developed machine learning algorithm""")
        with st.form(key="model_form"):
            algo_options = ['All'] + list(self.algos.keys())
            chosen_algo = st.selectbox(
                'Choose algorithm', algo_options,
                format_func=lambda x: x if x == 'All' else ALGO_NAME[x])
            if chosen_algo == 'All':
                model_algo = self.algos
            else:
                model_algo = {key: value for key, value in self.algos.items()
                              if key == chosen_algo}
            st.write('Execute model?')
            fit = st.form_submit_button('Fit')
        if fit:
            with st.spinner("Running algorithm ..."):
                df_model = self.preprocess()
                self.fit_model(df_model, model_algo)
            st.write('### Success! Models fitted!')

    def predict(self, X, model, probs):
        """
        Predict with a saved model.

        Parameters
        ----------
        X     : pd.DataFrame
                Raw input data; it is cleaned and encoded before predicting
        model : str
                Algorithm key used to build the saved model's file name
        probs : bool
                If True return the second column of ``predict_proba``,
                otherwise return the output of ``predict``

        Returns
        -------
        The model output, or None when the model file could not be loaded
        (an error message is shown on the page in that case).
        """
        try:
            model = load(f'../output/titanic_model_{model}.pkl')
        except Exception:
            # Without a loaded model nothing below can work, so stop here
            # instead of failing later on the still-string ``model``.
            st.error('Model not loaded')
            return None
        X = self.p.clean_data(X)
        X = self.p.categ_encoding(X)
        columns = model.get_columns()
        # Add, filled with zeros, the model columns the new data lacks.
        for col in columns:
            if col not in X.columns:
                X[col] = 0
        if probs:
            return model.predict_proba(X)[:, 1]
        return model.predict(X)

    @staticmethod
    def _available_algorithms(filenames):
        """Return the algorithm keys of the model files among ``filenames``.

        The key is the text between "model_" and ".pkl"; names that do not
        follow that pattern are ignored.
        """
        pattern = re.compile(r'(?<=model_).+(?=\.pkl)')
        matches = (pattern.search(name) for name in filenames)
        return [found.group(0) for found in matches if found is not None]

    def predict_page(self):
        """Render the simulator form and show the prediction for its input."""
        st.write('## Simulator')
        st.write("""Predict the outcome or probability of survival by customizing the input data of individuals""")
        st.write('')
        try:
            model_list = os.listdir('../output')
        except FileNotFoundError:
            model_list = []
        avail_algo = self._available_algorithms(model_list)
        if not avail_algo:
            # Nothing to choose from: the form below would be unusable.
            st.warning('No fitted models found. Fit a model first.')
            return
        with st.form(key="predict_form"):
            # New Data
            col1, col2, col3 = st.columns(3)
            pclass = col1.radio("Passage Class", (1, 2, 3), format_func=lambda x: CLASS_NAME[x])
            sex = col2.selectbox("Sex", ('male', 'female'), format_func=lambda x: x.title())
            age = col3.number_input("Age", step = 5)
            new_data = pd.DataFrame({
                'Pclass': [pclass],
                'Sex': [sex],
                'Age': [age]
            })
            # Prediction Options
            col4, col5 = st.columns([2, 1])
            algorithm = col4.selectbox('Algorithm', avail_algo, format_func=lambda x: ALGO_NAME[x])
            probs = col5.radio('Predict Probability?',  (True, False), format_func=lambda x: 'Yes' if x else 'No')
            predict = st.form_submit_button(label = 'Predict')
        if predict:
            pred = self.predict(new_data, algorithm, probs)
            if pred is None:
                # The model could not be loaded; predict already reported it.
                return
            if probs:
                velocimeter_chart(pred[0])
            else:
                # A single individual was submitted, so read its prediction.
                outcome = 'SURVIVED' if pred[0] == 1 else 'DIED'
                st.write(f"""
                         The model predicted that this individual would have 
                         ### {outcome}

                         in the Titanic tragic accident
                         """)

    def write(self):
        """Render the page that matches the mode selected in the sidebar."""
        if self.mode == 'Model Fitting':
            self.model_page()
        elif self.mode == 'Predict':
            self.predict_page()