def predict(data):
    """Apply the full feature-engineering pipeline to `data` and score it.

    Mirrors the training pipeline: cabin-letter extraction, NA imputation,
    rare-label grouping, one-hot encoding, dummy check, scaling, prediction.
    """
    # keep only the first letter of the cabin variable
    data['cabin'] = pf.extract_cabin_letter(data, 'cabin')

    # fill missing values: categorical block, then numerical block
    data[config.CATEGORICAL_VARS] = pf.impute_na(
        data[config.CATEGORICAL_VARS], 'Missing')
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data[config.NUMERICAL_TO_IMPUTE], 'Numerical')

    # collapse categories that were infrequent in the train set
    for variable in config.CATEGORICAL_VARS:
        data[variable] = pf.remove_rare_labels(
            data, variable, config.FREQUENT_LABELS[variable])

    # one-hot encode the categoricals
    data = pf.encode_categorical(data, config.CATEGORICAL_VARS)
    print(data.shape)

    # guarantee every dummy column seen in training is present
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)
    print(data.shape)

    # scale the features, then score with the persisted model
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
def predict(data):
    """Apply the feature-engineering pipeline to `data` and return predictions.

    Steps mirror training: cabin-letter extraction, missing indicators,
    categorical imputation, rare-label grouping, encoding, scaling, scoring.
    """
    # extract first letter from cabin (mutates `data` in place)
    pf.extract_cabin_letter(data, 'cabin')

    # add NA indicator columns for the numericals imputed in training
    for var in ['age', 'fare']:
        pf.add_missing_indicator(data, var)

    # impute NA in the categorical variables (in place)
    for var in config.CATEGORICAL_VARS:
        pf.impute_na(data, var)

    # group rare labels (in place)
    for var in config.CATEGORICAL_VARS:
        pf.remove_rare_labels(data, var, config.FREQUENT_LABELS)

    # encode categorical variables.
    # BUG FIX: the original passed the undefined name `df` here; the
    # function's parameter is `data`, so this raised NameError at runtime.
    data = pf.encode_categorical(data, config.CATEGORICAL_VARS)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions, _ = pf.predict(data, config.OUTPUT_MODEL_PATH)
    return predictions
def predict(data):
    """Run the scoring pipeline on `data` and return model predictions."""
    # cabin -> its first letter only
    data['cabin'] = pf.extract_cabin_letter(data, 'cabin')

    # categorical NAs become an explicit 'Missing' category
    for col in config.CATEGORICAL_VARS:
        data[col] = pf.impute_na(data, col, replacement='Missing')

    # numerical NAs: indicator column first, then the training replacement
    for col in config.NUMERICAL_TO_IMPUTE:
        data[col + '_NA'] = pf.add_missing_indicator(data, col)
        data[col] = pf.impute_na(
            data, col, replacement=config.IMPUTATION_DICT[col])

    # collapse categories that were infrequent in the train set
    for col in config.CATEGORICAL_VARS:
        data[col] = pf.remove_rare_labels(
            data, col, config.FREQUENT_LABELS[col])

    # one-hot encode the categoricals
    for col in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, col)

    # make sure every dummy seen in training exists
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale and score
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
def predict(data):
    """Feature-engineer the house-price data and return model predictions."""
    # categorical NAs -> explicit 'Missing' label
    for col in config.CATEGORICAL_TO_IMPUTE:
        data[col] = pf.impute_na(data, col, replacement='Missing')

    # numerical NA -> mode learned from the train set
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

    # year variable -> elapsed years relative to the sale year
    data[config.YEAR_VARIABLE] = pf.elapsed_years(
        data, config.YEAR_VARIABLE, ref_var='YrSold')

    # log-transform the skewed numericals
    for col in config.NUMERICAL_LOG:
        data[col] = pf.log_transform(data, col)

    # collapse categories that were infrequent in the train set
    for col in config.CATEGORICAL_ENCODE:
        data[col] = pf.remove_rare_labels(
            data, col, config.FREQUENT_LABELS[col])

    # encode categoricals with the mappings learned in training
    for col in config.CATEGORICAL_ENCODE:
        data[col] = pf.encode_categorical(
            data, col, config.ENCODING_MAPPINGS[col])

    # scale the final feature set, then score
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
def predict(data):
    """Apply the titanic feature pipeline to `data` and score it."""
    # keep only the first letter of the cabin variable
    data[config.EXTRACT_VARIABLE] = pf.extract_cabin_letter(
        data, config.EXTRACT_VARIABLE)

    # categorical NAs -> explicit 'Missing' label
    for col in config.CATEGORICAL_TO_ENCODE:
        data[col] = pf.impute_na(data, col, replacement='Missing')

    # numerical NAs -> indicator plus the median learned in training
    for col in config.NUMERICAL_TO_IMPUTE:
        median = config.AGE_MEDIAN if col == 'age' else config.FARE_MEDIAN
        data[col] = pf.add_missing_indicator(data, col, median)

    # collapse infrequent categories into the rare value
    for col in config.CATEGORICAL_TO_ENCODE:
        data[col] = pf.remove_rare_labels(data, col, config.RARE_VALUE)

    # one-hot encode the categoricals
    for col in config.CATEGORICAL_TO_ENCODE:
        data = pf.encode_categorical(data, col)

    # confirm every dummy seen in training is present
    pf.check_dummy_variables(data, config.DUMMY_VARIABLE)

    # scale the feature set, then score
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
def predict(data):
    """Score `data` using the persisted scaler and model."""
    # cabin -> first letter only
    features = pf.extract_cabin_letter(
        data, config.IMPUTATION_DICT['cabin_variable'])

    # NA indicators for the categorical block
    features = pf.add_missing_indicator(features, config.CATEGORICAL_VARS)

    # numerical NAs -> training replacement values (adding NA columns)
    for col in config.NUMERICAL_TO_IMPUTE:
        features = pf.impute_na(features, col,
                                replace_by=config.IMPUTATION_DICT[col],
                                add_na_columns=True)

    # collapse categories that were infrequent in the train set
    features = pf.remove_rare_labels(features, config.FREQUENT_LABELS)

    # one-hot encode, then drop the raw categorical columns
    for col in config.CATEGORICAL_VARS:
        features = pf.encode_categorical(features, col)
    features.drop(labels=config.CATEGORICAL_VARS, axis=1, inplace=True)

    # make sure every dummy seen in training exists
    features = pf.check_dummy_variables(features, config.DUMMY_VARIABLES)

    # scale and score
    features = pf.scale_features(features, config.OUTPUT_SCALER_PATH)
    return pf.predict(features, config.OUTPUT_MODEL_PATH)
def predict(data):
    """Run the full preprocessing chain on `data` and return predictions."""
    # cabin -> first letter only
    data = pf.extract_cabin_letter(data, 'cabin')

    # impute NAs from the training dictionary: categorical, then numerical
    for col in config.CATEGORICAL_VARS:
        data = pf.impute_na(data, col, config.IMPUTATION_DICT)
    for col in ['age', 'fare']:
        data = pf.impute_na(data, col, config.IMPUTATION_DICT)

    # flag which rows were originally missing age / fare
    for col in ['age', 'fare']:
        data = pf.add_missing_indicator(data, col)

    # collapse categories that were infrequent in the train set
    for col in config.CATEGORICAL_VARS:
        data = pf.remove_rare_labels(data, config.FREQUENT_LABELS, col)

    # one-hot encode the categoricals
    for col in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, col)

    # make sure every dummy seen in training exists
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale, then score, keeping the training column order
    data = pf.scale_features(data, config.ORDERED_COLUMNS,
                             config.OUTPUT_SCALER_PATH)
    return pf.predict(data, config.ORDERED_COLUMNS, config.OUTPUT_MODEL_PATH)
def predict(data):
    """Feature-engineer the house-price data and return model predictions."""
    # impute missing values: categoricals -> 'Missing', numerical -> mode
    for col in config.CATEGORICAL_TO_IMPUTE:
        data[col] = pf.impute_na(data, col, replacement='Missing')
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

    # year variable -> elapsed years relative to the sale year
    data[config.YEAR_VARIABLE] = pf.elapsed_years(
        data, config.YEAR_VARIABLE, ref_var='YrSold')

    # log-transform the skewed numericals
    for col in config.NUMERICAL_LOG:
        data[col] = pf.log_transform(data, col)

    # group infrequent categories
    for col in config.CATEGORICAL_ENCODE:
        data[col] = pf.remove_rare_labels(
            data, col, config.FREQUENT_LABELS[col])

    # encode categoricals with the mappings learned in training
    for col in config.CATEGORICAL_ENCODE:
        data[col] = pf.encode_categorical(
            data, col, config.ENCODING_MAPPINGS[col])

    # scale the feature set and obtain predictions
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
def predict(data):
    """Apply the feature pipeline to `data` and return predicted classes.

    BUG FIX: the original ignored its `data` argument, re-loaded the
    training file and re-split it, scoring a fixed internal test split
    instead of the data handed in. A scoring function must operate on its
    input; the reload/split (and the debug print) have been removed.
    """
    # NA indicators for the categorical block
    data = pf.add_missing_indicator(data, config.CATEGORICAL_VARS)

    # cabin -> first letter only
    data = pf.extract_cabin_letter(data, 'cabin')

    # impute NA: categorical, then numerical (indicator + numeric fill)
    data = pf.impute_na(data, config.CATEGORICAL_VARS)
    data = pf.add_missing_indicator(data, config.NUMERICAL_TO_IMPUTE)
    data = pf.impute_num(data, config.NUMERICAL_TO_IMPUTE)

    # group rare labels
    data = pf.remove_rare_labels(data, config.CATEGORICAL_VARS)

    # encode categoricals (second return value — feature names — unused here)
    data, _ = pf.encode_categorical(data, config.CATEGORICAL_VARS)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions; return the predicted classes as before
    class_, _ = pf.predict(data, config.OUTPUT_MODEL_PATH)
    return class_
# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load the raw data set and split it
data = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)

# keep only the first letter of the cabin variable
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')

# categorical NAs -> explicit 'Missing' label
for variable in config.CATEGORICAL_VARS:
    X_train[variable] = pf.impute_na(X_train, variable, value='Missing')

# numerical NAs: indicator column first, then fill with the train median
for variable in config.NUMERICAL_TO_IMPUTE:
    pf.add_missing_indicator(X_train, variable)
    train_median = X_train[variable].median()
    X_train[variable] = pf.impute_na(X_train, variable, value=train_median)

# collapse categories that are infrequent in the train set
for variable in config.CATEGORICAL_VARS:
    X_train[variable] = pf.remove_rare_labels(
        X_train, variable, config.FREQUENT_LABELS[variable])
import warnings

warnings.simplefilter(action='ignore')

# ================================================
# TRAINING

# load the data
data = pf.load_data(config.PATH_TO_DATASET)

# train / test split
X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)

# impute categorical variables with an explicit 'Missing' label
for variable in config.CATEGORICAL_TO_IMPUTE:
    X_train[variable] = pf.impute_na(X_train, variable, replacement='Missing')

# impute numerical variables with the LotFrontage mode
X_train[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    X_train, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

# year variable -> elapsed years relative to the sale year
X_train[config.YEAR_VARIABLE] = pf.elapsed_years(
    X_train, config.YEAR_VARIABLE, ref_var='YrSold')

# log-transform the skewed numericals
for variable in config.NUMERICAL_LOG:
    X_train[variable] = pf.log_transform(X_train, variable)

# group infrequent categories
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load the data and split it on the survival target
data = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(data, 'survived')

# cabin -> first letter only
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')

# categorical NA imputation
for column in config.CATEGORICAL_VARS:
    X_train[column] = pf.impute_na(X_train, column)

# numerical NAs: indicator column plus the stored replacement value
for column in config.NUMERICAL_TO_IMPUTE:
    X_train = pf.add_missing_indicator(X_train, column)
    X_train[column] = pf.impute_na(
        X_train, column, value=config.IMPUTATION_DICT[column])

# collapse categories that are infrequent in the train set
for column in config.CATEGORICAL_VARS:
    X_train[column] = pf.remove_rare_labels(
        X_train, column, freq_labels=config.FREQUENT_LABELS[column])
extension = config_file[0]['Paths'].get('data_extension') cols = config_file[2]['Feature_Groups'].get('data_columns') df = pf.load_data(path, data_filename, extension, cols) # divide data set target = config_file[2]['Feature_Groups'].get('target') X_train, X_test, y_train, y_test = pf.divide_train_test(df, target) # get first letter from cabin variable X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin') # impute categorical variables cat_vars = config_file[2]['Feature_Groups'].get('categorical_vars') num_vars = config_file[2]['Feature_Groups'].get('numerical_to_impute') for var in cat_vars: X_train[var] = pf.impute_na(X_train, var, 'Missing') # impute numerical variables medians = config_file[1]['Parameters'].get('imputation_dict') for var in num_vars: X_train = pf.add_missing_indicator(X_train, var) X_train[var] = pf.impute_na(X_train, var, medians.get(var)) ## Group rare labels frequent_list = config_file[1]['Parameters'].get('frequent_labels') for var in cat_vars: X_train[var] = pf.remove_rare_labels(X_train, var, frequent_list) # encode categorical variables dummies = config_file[1]['Parameters'].get('dummy_variables') for var in cat_vars:
# split the data set
X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)

# cabin -> first letter (mutates X_train in place)
pf.extract_cabin_letter(X_train, 'cabin')

# NA indicator columns for age / fare (in place)
for column in ['age', 'fare']:
    pf.add_missing_indicator(X_train, column)

# impute the categorical variables (in place)
for column in config.CATEGORICAL_VARS:
    pf.impute_na(X_train, column)

# collapse infrequent categories (in place)
for column in config.CATEGORICAL_VARS:
    pf.remove_rare_labels(X_train, column, config.FREQUENT_LABELS)

# one-hot encode the categoricals
X_train = pf.encode_categorical(X_train, config.CATEGORICAL_VARS)

# fit and persist the scaler, scale the features, then fit the model
pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)
pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

# train scaler and save
# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load and split the data
data = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)

# cabin -> first letter only
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')

# categorical NAs -> explicit 'Missing' label
for column in config.CATEGORICAL_VARS:
    X_train[column] = pf.impute_na(X_train, column, 'Missing')

# numerical NAs: indicator column first, then the stored replacement
for column in config.NUMERICAL_TO_IMPUTE:
    X_train[column + '_NA'] = pf.add_missing_indicator(X_train, column)
    X_train[column] = pf.impute_na(
        X_train, column, replace=config.IMPUTATION_DICT[column])

# collapse categories that are infrequent in the train set
for column in config.CATEGORICAL_VARS:
    X_train[column] = pf.remove_rare_labels(
        X_train, column, config.FREQUENT_LABELS[column])
# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load and split the data
df = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(df, config.TARGET)

# cabin -> first letter (in place)
pf.extract_cabin_letter(X_train, 'cabin')

# numerical NAs: indicator, then the stored replacement (in place)
for column in config.NUMERICAL_TO_IMPUTE:
    pf.add_missing_indicator(X_train, column)
    pf.impute_na(X_train, column, config.IMPUTATION_DICT[column])

# categorical NA imputation (in place)
for column in config.CATEGORICAL_VARS:
    pf.impute_na(X_train, column)

# collapse infrequent categories (in place)
for column, labels in config.FREQUENT_LABELS.items():
    pf.remove_rare_labels(X_train, column, labels)

# one-hot encode the categoricals
for column in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, column)

# confirm every expected dummy column exists
pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load and split the data
data = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)

# cabin -> first letter, on both splits
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')
X_test['cabin'] = pf.extract_cabin_letter(X_test, 'cabin')

# categorical NAs -> explicit 'Missing' label, on both splits
for column in config.CATEGORICAL_VARS:
    X_train[column] = pf.impute_na(X_train, column, value='Missing')
    X_test[column] = pf.impute_na(X_test, column, value='Missing')

# numerical NAs: indicator column, then stored replacement, on both splits
for column in config.NUMERICAL_TO_IMPUTE:
    X_train[column + '_NA'] = pf.add_missing_indicator(X_train, column)
    X_test[column + '_NA'] = pf.add_missing_indicator(X_test, column)
    X_train[column] = pf.impute_na(
        X_train, column, value=config.IMPUTATION_DICT[column])
    X_test[column] = pf.impute_na(
        X_test, column, value=config.IMPUTATION_DICT[column])

# Group rare labels
# ================================================ # TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL # Load data df = pf.load_data(config.PATH_TO_DATASET) # divide data set X_train, X_test, y_train, y_test = pf.divide_train_test(df, config.TARGET) # get first letter from cabin variable X_train = pf.extract_cabin_letter(X_train, 'cabin') # impute categorical variables for var in config.CATEGORICAL_VARS: X_train = pf.impute_na(X_train, var, config.IMPUTATION_DICT) # impute numerical variable # since the notebook just uses age and fare, we will ignore the "NUMERICAL TO IMPUTE" for var in ['age', 'fare']: X_train = pf.impute_na(X_train, var, config.IMPUTATION_DICT) # add missing indicator #Note that I added this to conform train.py with notebook. for var in ['age', 'fare']: X_train = pf.add_missing_indicator(X_train, var) # Group rare labels for var in config.CATEGORICAL_VARS:
# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load and split the data
df = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(df, config.TARGET)

# cabin -> first letter only
X_train = pf.extract_cabin_letter(X_train, 'cabin')

# categorical variables: NA indicators, then imputation
X_train = pf.add_missing_indicator(X_train, config.CATEGORICAL_VARS)
X_train = pf.impute_na(X_train, config.CATEGORICAL_VARS)

# numerical variables: NA indicators, then numeric imputation
X_train = pf.add_missing_indicator(X_train, config.NUMERICAL_TO_IMPUTE)
X_train = pf.impute_num(X_train, config.NUMERICAL_TO_IMPUTE)

# collapse infrequent categories
X_train = pf.remove_rare_labels(X_train, config.CATEGORICAL_VARS)

# one-hot encode the categoricals
X_train, X_train_features = pf.encode_categorical(
    X_train, config.CATEGORICAL_VARS)

# check dummy variables
# NOTE(review): `X_check` below is an unfinished statement — the chunk is
# truncated at this point and the remainder lies beyond this view
X_check
# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# Load and split the data
data = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)

# Cabin -> first letter only
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')

# Impute categorical variables
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.impute_na(X_train, var)

# Add missing indicators for numerical variables.
# BUG FIX: the original wrote the indicator back onto the variable itself
# (X_train[var] = pf.add_missing_indicator(...)), destroying the numeric
# column; the indicator belongs in a separate '<var>_NA' column.
for var in config.NUMERICAL_TO_IMPUTE:
    X_train[var + '_NA'] = pf.add_missing_indicator(X_train, var)
    # NOTE(review): the numeric column itself is still never imputed in
    # this script — confirm whether a pf.impute_na call is also required
    # here, as in the categorical loop above.

# Group rare labels
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.remove_rare_labels(
        X_train, var, config.FREQUENT_LABELS[var])

# Encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)

# check all dummies were added
# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load and split the data
df = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(df, config.TARGET)

# cabin -> first letter, on both splits
X_train = pf.extract_cabin_letter(X_train, 'cabin')
X_test = pf.extract_cabin_letter(X_test, 'cabin')

# categorical NA imputation, on both splits
X_train = pf.impute_na(X_train, config.CATEGORICAL_VARS)
X_test = pf.impute_na(X_test, config.CATEGORICAL_VARS)

# numerical NAs: indicator columns, then the stored replacement values
for column in config.IMPUTATION_DICT.keys():
    X_train = pf.add_missing_indicator(X_train, column)
    X_test = pf.add_missing_indicator(X_test, column)
    X_train = pf.impute_na(X_train, column, config.IMPUTATION_DICT[column])
    X_test = pf.impute_na(X_test, column, config.IMPUTATION_DICT[column])

# collapse infrequent categories, on both splits
for column in config.FREQUENT_LABELS.keys():
    X_train = pf.remove_rare_labels(
        X_train, column, config.FREQUENT_LABELS[column])
    X_test = pf.remove_rare_labels(
        X_test, column, config.FREQUENT_LABELS[column])
# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load and split the data
data = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)

# cabin -> first letter; print the resulting categories as a sanity check
X_train["cabin"] = pf.extract_cabin_letter(X_train, "cabin")
print(X_train["cabin"].unique())

# categorical NAs -> explicit 'Missing' label
for column in config.CATEGORICAL_VARS:
    X_train[column] = pf.impute_na(X_train, column, replacement='Missing')

# numerical NAs: indicator column plus the train-set median from the config
for column in config.NUMERICAL_TO_IMPUTE:
    X_train[column + "_na"] = pf.add_missing_indicator(X_train, column)
    X_train[column] = pf.impute_na(
        X_train, column, replacement=config.IMPUTATION_DICT[column])

# collapse categories that were infrequent in the train set
for column in config.CATEGORICAL_VARS:
    X_train[column] = pf.remove_rare_labels(
        X_train, column, config.FREQUENT_LABELS[column])

# encode categorical variables
# ================================================ # TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL # Load data df = pf.load_data(config.PATH_TO_DATASET) # divide data set X_train, X_test, y_train, y_test = pf.divide_train_test(df, config.TARGET) # get first letter from cabin variable X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin') # impute categorical variables for var in config.CATEGORICAL_VARS: X_train[var] = pf.impute_na(X_train, var) # impute numerical variable for var in config.NUMERICAL_TO_IMPUTE: # add missing indicator X_train[var + '_NA'] = pf.add_missing_indicator(X_train, var) # replace NaN by median X_train[var] = pf.impute_na(X_train, var, replacement=config.IMPUTATION_DICT[var]) # Group rare labels for var in config.CATEGORICAL_VARS: X_train[var] = pf.remove_rare_labels(X_train, var,
# Load and split the data
data = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)

# cabin -> first letter only
X_train[config.EXTRACT_VARIABLE] = pf.extract_cabin_letter(
    X_train, config.EXTRACT_VARIABLE)

# categorical NAs -> explicit 'Missing' label
for column in config.CATEGORICAL_TO_ENCODE:
    X_train[column] = pf.impute_na(X_train, column, replacement='Missing')

# numerical NAs: indicator plus the per-variable median
for column in config.NUMERICAL_TO_IMPUTE:
    median = config.AGE_MEDIAN if column == 'age' else config.FARE_MEDIAN
    X_train[column] = pf.add_missing_indicator(X_train, column, median)

# collapse infrequent categories into the rare value
for column in config.CATEGORICAL_TO_ENCODE:
    X_train[column] = pf.remove_rare_labels(
        X_train, column, config.RARE_VALUE)
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load and split the data
data = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)

# cabin -> first letter, on both splits
X_train['cabin'] = pf.extract_cabin_letter(X_train, 'cabin')
X_test['cabin'] = pf.extract_cabin_letter(X_test, 'cabin')

# categorical variables: NA indicator column + imputation, on both splits
for column in config.CATEGORICAL_VARS:
    X_train[column + '_na'] = pf.add_missing_indicator(X_train, column)
    X_train[column] = pf.impute_na(X_train, column)
    X_test[column + '_na'] = pf.add_missing_indicator(X_test, column)
    X_test[column] = pf.impute_na(X_test, column)

# numerical variables: NA indicator + stored replacement, on both splits
for column in config.NUMERICAL_TO_IMPUTE:
    X_train[column + '_na'] = pf.add_missing_indicator(X_train, column)
    X_train[column] = pf.impute_na(
        X_train, column, config.IMPUTATION_DICT[column])
    X_test[column + '_na'] = pf.add_missing_indicator(X_test, column)
    X_test[column] = pf.impute_na(
        X_test, column, config.IMPUTATION_DICT[column])

# collapse categories that were infrequent in the train set
for column in config.CATEGORICAL_VARS:
    X_train[column] = pf.remove_rare_labels(
        X_train, column, config.FREQUENT_LABELS[column])
import config

# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load and split the data
df = pf.load_data(config.PATH_TO_DATASET)
xtrain, xtest, ytrain, ytest = pf.divide_train_test(df, config.TARGET)

# cabin -> first letter only
xtrain['cabin'] = pf.extract_cabin_letter(xtrain, 'cabin')

# impute the categorical block, then the numerical block
xtrain[config.CATEGORICAL_VARS] = pf.impute_na(
    xtrain[config.CATEGORICAL_VARS], 'Missing')
xtrain[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    xtrain[config.NUMERICAL_TO_IMPUTE], 'Numerical')

# collapse categories that are infrequent in the train set
for column in config.CATEGORICAL_VARS:
    xtrain[column] = pf.remove_rare_labels(
        xtrain, column, config.FREQUENT_LABELS[column])

# one-hot encode, then confirm every expected dummy exists
xtrain = pf.encode_categorical(xtrain, config.CATEGORICAL_VARS)
xtrain = pf.check_dummy_variables(xtrain, config.DUMMY_VARIABLES)
df_data, df_target, seed=config.GLOBAL_SEED) # get first letter from cabin variable X_train = pf.extract_cabin_letter(X_train, config.IMPUTATION_DICT['cabin_variable']) X_test = pf.extract_cabin_letter(X_test, config.IMPUTATION_DICT['cabin_variable']) # impute categorical variables X_train = pf.add_missing_indicator(X_train, config.CATEGORICAL_VARS) X_test = pf.add_missing_indicator(X_test, config.CATEGORICAL_VARS) # impute numerical variable for var in config.NUMERICAL_TO_IMPUTE: X_train = pf.impute_na(X_train, var, replace_by=config.IMPUTATION_DICT[var], add_na_columns=True) X_test = pf.impute_na(X_test, var, replace_by=config.IMPUTATION_DICT[var], add_na_columns=True) # Group rare labels X_train = pf.remove_rare_labels(X_train, config.FREQUENT_LABELS) X_test = pf.remove_rare_labels(X_test, config.FREQUENT_LABELS) # encode categorical variables for var in config.CATEGORICAL_VARS: X_train = pf.encode_categorical(X_train, var) X_test = pf.encode_categorical(X_test, var) X_train.drop(labels=config.CATEGORICAL_VARS, axis=1, inplace=True)
# ================================================
# TRAINING STEP - IMPORTANT TO PERPETUATE THE MODEL

# load and split the data
data = pf.load_data(config.PATH_TO_DATASET)
X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)

# cabin -> first letter only (assigned via .loc)
X_train.loc[:, "cabin"] = pf.extract_cabin_letter(X_train, "cabin")

# categorical NAs -> explicit 'Missing' label
for column in config.CATEGORICAL_VARS:
    X_train.loc[:, column] = pf.impute_na(
        X_train, column, replacement="Missing")

# numerical NAs -> the train-set median
for column in config.NUMERICAL_TO_IMPUTE:
    train_median = X_train[column].median()
    X_train.loc[:, column] = pf.impute_na(
        X_train, column, replacement=train_median)

# collapse categories that are infrequent in the train set
for column in config.CATEGORICAL_VARS:
    X_train.loc[:, column] = pf.remove_rare_labels(
        X_train, column, config.FREQUENT_LABELS[column])

# one-hot encode the categoricals
for column in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, column)