"""Training pipeline (YAML-configured variant): load the raw data, split it,
and impute missing values using settings read from config.yaml."""

import preprocessing_functions as pf
import yaml

# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# Read the YAML configuration once and keep a handle to each section
# instead of re-indexing config_file on every lookup.
config_file = pf.read_config_file('config.yaml')
paths_section = config_file[0]['Paths']
parameters_section = config_file[1]['Parameters']
features_section = config_file[2]['Feature_Groups']

# Load data
path = paths_section.get('directory')
data_filename = paths_section.get('data_filename')
extension = paths_section.get('data_extension')
cols = features_section.get('data_columns')
df = pf.load_data(path, data_filename, extension, cols)

# divide data set
target = features_section.get('target')
X_train, X_test, y_train, y_test = pf.divide_train_test(df, target)

# get first letter from cabin variable
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')

# impute categorical variables with the sentinel label 'Missing'
cat_vars = features_section.get('categorical_vars')
num_vars = features_section.get('numerical_to_impute')
for column in cat_vars:
    X_train[column] = pf.impute_na(X_train, column, 'Missing')

# impute numerical variables with the pre-computed values from the config,
# first adding an indicator column recording which rows were missing
medians = parameters_section.get('imputation_dict')
for column in num_vars:
    X_train = pf.add_missing_indicator(X_train, column)
    X_train[column] = pf.impute_na(X_train, column, medians.get(column))
# make predictions predictions = pf.predict(data, config.OUTPUT_MODEL_PATH) return predictions # ====================================== # small test that scripts are working ok if __name__ == '__main__': from sklearn.metrics import accuracy_score import warnings warnings.simplefilter(action='ignore') # Load data data = pf.load_data(config.PATH_TO_DATASET) X_train, X_test, y_train, y_test = pf.divide_train_test( data, config.TARGET) pred = predict(X_test) # evaluate # if your code reprodues the notebook, your output should be: # test accuracy: 0.6832 print('test accuracy: {}'.format(accuracy_score(y_test, pred))) print()
"""Training step: load the dataset, split it, and impute missing values.

Reads paths and feature lists from the project ``config`` module and delegates
all data transformations to ``preprocessing_functions``.
"""

import preprocessing_functions as pf
import config

import warnings
warnings.simplefilter(action='ignore')

# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# Target column name, hoisted out of the divide_train_test() call so the
# magic string lives in one obvious, documented place.
TARGET = 'survived'

# Load data
data = pf.load_data(config.PATH_TO_DATASET)

# divide data set
X_train, X_test, y_train, y_test = pf.divide_train_test(data, TARGET)

# get first letter from cabin variable
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')

# impute categorical variables
# NOTE(review): no replacement value is passed here, so this relies on
# pf.impute_na's default; sibling versions of this script fill with the
# literal 'Missing' — confirm the default matches.
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.impute_na(X_train, var)

# impute numerical variable
for var in config.NUMERICAL_TO_IMPUTE:
    # record which rows were originally missing, then fill with the
    # pre-computed training statistic from config
    X_train = pf.add_missing_indicator(X_train, var)
    X_train[var] = pf.impute_na(X_train, var,
                                value=config.IMPUTATION_DICT[var])
import preprocessing_functions as pf
import config

# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# Load data
df = pf.load_data(config.PATH_TO_DATASET)

# divide data set
xtrain, xtest, ytrain, ytest = pf.divide_train_test(df, config.TARGET)

# get first letter from cabin variable
xtrain['cabin'] = pf.extract_cabin_letter(xtrain, 'cabin')

# impute categorical variables with the sentinel label 'Missing'
xtrain[config.CATEGORICAL_VARS] = pf.impute_na(
    xtrain[config.CATEGORICAL_VARS], 'Missing')

# impute numerical variable
# NOTE(review): the literal string 'Numerical' is passed as the second
# argument here. If pf.impute_na inserts its argument verbatim, the numeric
# columns would be filled with a string — other versions of this script pass
# per-column medians instead. Confirm whether impute_na treats 'Numerical'
# as a mode flag rather than a fill value.
xtrain[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    xtrain[config.NUMERICAL_TO_IMPUTE], 'Numerical')

# Group rare labels: replace infrequent categories with a shared bucket,
# using the frequent-label lists recorded in config
for var in config.CATEGORICAL_VARS:
    xtrain[var] = pf.remove_rare_labels(xtrain, var,
                                        config.FREQUENT_LABELS[var])

# encode categorical variables
xtrain = pf.encode_categorical(xtrain, config.CATEGORICAL_VARS)
"""Train step: load the dataset, split it, encode the categorical features,
then fit the model and persist it to the configured output path."""

import numpy as np
import preprocessing_functions as pf
import config

import warnings
warnings.simplefilter(action='ignore')

# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# Load data
data = pf.load_data(config.PATH_TO_DATASET)

# divide data set into train and held-out portions
X_train, X_test, y_train, y_test = pf.divide_train_test(
    df=data, target=config.TARGET)

# encode categorical variables
# NOTE(review): the return value of encode_categorical is discarded, so this
# only takes effect if the helper mutates X_train in place — a sibling
# version of this script reassigns the result instead; confirm.
for feature in config.CATEGORICAL_ENCODE:
    pf.encode_categorical(X_train, feature)

# train model and save it to disk for the scoring script to load
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
import preprocessing_functions as pf
import config

# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# Load data
# NOTE(review): the dataset path is hard-coded here while everything else
# reads from config — presumably this should be config.PATH_TO_DATASET.
data = pf.load_data('titanic.csv')

# divide data set (the test split is discarded; only training data is used)
X_train, _, y_train, _ = pf.divide_train_test(data, config.TARGET)

# get first letter from cabin variable
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')

# impute categorical variables with the sentinel label 'Missing'
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.impute_na(X_train, var, replacement='Missing')

# impute numerical variable
for var in config.NUMERICAL_TO_IMPUTE:
    # add missing indicator column before overwriting the NAs
    X_train[var + '_NA'] = pf.add_missing_indicator(X_train, var)
    # impute NA with the pre-computed statistic from config
    X_train[var] = pf.impute_na(X_train, var,
                                replacement=config.IMPUTATION_DICT[var])

# Group rare labels
# NOTE(review): this chunk is truncated — the body of this loop continues
# past the visible source and cannot be reconstructed from here.
for var in config.CATEGORICAL_VARS:
    # NOTE(review): this chunk is the tail of a `def predict(X_test)`-style
    # function whose signature begins before the visible source — the
    # `return` below requires an enclosing function. Confirm the parameter
    # is named `X_test`.
    predictions = pf.predict(X_test, config.OUTPUT_MODEL_PATH)
    return predictions


# ======================================
# small test that scripts are working ok
if __name__ == '__main__':

    from sklearn.metrics import accuracy_score
    import warnings
    warnings.simplefilter(action='ignore')

    # Load data
    data = pf.load_data(config.PATH_TO_DATASET)

    # separate the target from the features before splitting
    df_target = data[config.TARGET]
    data = data.drop([config.TARGET], axis=1)

    X_train, X_test, y_train, y_test = pf.divide_train_test(
        data, df_target, seed=config.GLOBAL_SEED)

    pred = predict(X_test)

    # evaluate
    # if your code reproduces the notebook, your output should be:
    # test accuracy: 0.6832
    print('test accuracy: {}'.format(accuracy_score(y_test, pred)))
    print()