Ejemplo n.º 1
0
import preprocessing_functions as pf
import yaml
# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL
#
# Reads every setting from 'config.yaml', loads the data set, splits it and
# applies the first preprocessing steps to the training split only.

# Load data
config_file = pf.read_config_file('config.yaml')

# entry 0 of the config holds the 'Paths' settings used to locate the data
paths_cfg = config_file[0]['Paths']
path = paths_cfg.get('directory')
data_filename = paths_cfg.get('data_filename')
extension = paths_cfg.get('data_extension')

# entry 2 of the config holds the 'Feature_Groups' (column lists and target)
features_cfg = config_file[2]['Feature_Groups']
cols = features_cfg.get('data_columns')
df = pf.load_data(path, data_filename, extension, cols)

# divide data set
target = features_cfg.get('target')
X_train, X_test, y_train, y_test = pf.divide_train_test(df, target)

# keep only the first letter of the cabin variable
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')

# impute categorical variables with the literal label 'Missing'
cat_vars = features_cfg.get('categorical_vars')
num_vars = features_cfg.get('numerical_to_impute')
for var in cat_vars:
    X_train[var] = pf.impute_na(X_train, var, 'Missing')

# impute numerical variables; the replacement for each variable comes from
# the 'imputation_dict' stored under entry 1 ('Parameters') of the config
medians = config_file[1]['Parameters'].get('imputation_dict')
for var in num_vars:
    # flag the missing rows first, then overwrite them
    X_train = pf.add_missing_indicator(X_train, var)
    replacement = medians.get(var)
    X_train[var] = pf.impute_na(X_train, var, replacement)
Ejemplo n.º 2
0
    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions


# ======================================

# small test that scripts are working ok

if __name__ == '__main__':
    # Quick check when run as a script: split the data, call predict() on
    # the test split and print the resulting accuracy.
    import warnings
    from sklearn.metrics import accuracy_score

    warnings.simplefilter(action='ignore')

    # Load data
    data = pf.load_data(config.PATH_TO_DATASET)

    # only X_test / y_test are used below
    X_train, X_test, y_train, y_test = pf.divide_train_test(
        data, config.TARGET)

    pred = predict(X_test)

    # evaluate
    # if your code reproduces the notebook, your output should be:
    # test accuracy: 0.6832
    test_accuracy = accuracy_score(y_test, pred)
    print('test accuracy: {}'.format(test_accuracy))
    print()
import preprocessing_functions as pf
import config

import warnings
warnings.simplefilter(action='ignore')
# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL
#
# Loads the data set, splits it and starts preprocessing the training split.

# Load data
data = pf.load_data(config.PATH_TO_DATASET)

# divide data set ('survived' is the target column)
X_train, X_test, y_train, y_test = pf.divide_train_test(data, 'survived')

# keep only the first letter of the cabin variable
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')

# impute categorical variables; no replacement is passed here, so
# impute_na falls back to its own default
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.impute_na(X_train, var)

# impute numerical variables
for var in config.NUMERICAL_TO_IMPUTE:
    # flag the missing rows first, then overwrite them
    X_train = pf.add_missing_indicator(X_train, var)
    fill_value = config.IMPUTATION_DICT[var]
    X_train[var] = pf.impute_na(X_train, var, value=fill_value)
Ejemplo n.º 4
0
import preprocessing_functions as pf
import config

# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL
#
# Loads the data set, splits it and preprocesses the training split:
# cabin letter, imputation, rare-label grouping and categorical encoding.

# Load data
df = pf.load_data(config.PATH_TO_DATASET)

# divide data set
xtrain, xtest, ytrain, ytest = pf.divide_train_test(df, config.TARGET)

# keep only the first letter of the cabin variable
xtrain['cabin'] = pf.extract_cabin_letter(xtrain, 'cabin')

# impute categorical variables (all categorical columns in one call)
categorical_cols = config.CATEGORICAL_VARS
imputed_categorical = pf.impute_na(xtrain[categorical_cols], 'Missing')
xtrain[categorical_cols] = imputed_categorical

# impute numerical variables (all numerical columns in one call)
numerical_cols = config.NUMERICAL_TO_IMPUTE
imputed_numerical = pf.impute_na(xtrain[numerical_cols], 'Numerical')
xtrain[numerical_cols] = imputed_numerical

# group rare labels, one categorical variable at a time
for var in config.CATEGORICAL_VARS:
    frequent_labels = config.FREQUENT_LABELS[var]
    xtrain[var] = pf.remove_rare_labels(xtrain, var, frequent_labels)

# encode categorical variables
xtrain = pf.encode_categorical(xtrain, config.CATEGORICAL_VARS)
Ejemplo n.º 5
0
import numpy as np
import preprocessing_functions as pf
import config
import warnings
warnings.simplefilter(action='ignore')

# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL
#
# Loads the data set, splits it, encodes the categorical variables of the
# training split, then trains the model and saves it.

# Load data
data = pf.load_data(config.PATH_TO_DATASET)

# divide data set
split = pf.divide_train_test(df=data, target=config.TARGET)
X_train, X_test, y_train, y_test = split

# encode categorical variables
# NOTE(review): the return value of encode_categorical is discarded, so this
# only has an effect if the helper modifies X_train in place - confirm
# against preprocessing_functions.
for var in config.CATEGORICAL_ENCODE:
    pf.encode_categorical(X_train, var)

# train model and save it to the configured output path
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
import preprocessing_functions as pf
import config

# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL
#
# Loads 'titanic.csv', keeps only the training split and starts
# preprocessing it.

# Load data
data = pf.load_data('titanic.csv')

# divide data set (the test portions are discarded)
X_train, _, y_train, _ = pf.divide_train_test(data, config.TARGET)

# keep only the first letter of the cabin variable
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')

# impute categorical variables with the literal label 'Missing'
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.impute_na(X_train, var, replacement='Missing')

# impute numerical variables
for var in config.NUMERICAL_TO_IMPUTE:
    # the missing indicator is stored in a new '<var>_NA' column
    X_train[var + '_NA'] = pf.add_missing_indicator(X_train, var)

    # then the missing values themselves are replaced
    fill_value = config.IMPUTATION_DICT[var]
    X_train[var] = pf.impute_na(X_train, var, replacement=fill_value)

# Group rare labels
for var in config.CATEGORICAL_VARS:
Ejemplo n.º 7
0
    predictions = pf.predict(X_test,config.OUTPUT_MODEL_PATH)

    
    return predictions

# ======================================
    
# small test that scripts are working ok
    
if __name__ == '__main__':
    # Quick check when run as a script: split the data with the configured
    # seed, call predict() on the test split and print the accuracy.
    import warnings
    from sklearn.metrics import accuracy_score

    warnings.simplefilter(action='ignore')

    # Load data and separate the target column from the features
    data = pf.load_data(config.PATH_TO_DATASET)
    df_target = data[config.TARGET]
    data = data.drop([config.TARGET], axis=1)

    # only X_test / y_test are used below
    X_train, X_test, y_train, y_test = pf.divide_train_test(
        data, df_target, seed=config.GLOBAL_SEED)

    pred = predict(X_test)

    # evaluate
    # if your code reproduces the notebook, your output should be:
    # test accuracy: 0.6832
    test_accuracy = accuracy_score(y_test, pred)
    print('test accuracy: {}'.format(test_accuracy))
    print()