def predict(data):
    """Run the Titanic preprocessing pipeline on `data` and return model scores."""
    # derive the cabin letter from the raw cabin value
    data['cabin'] = pf.extract_cabin_letter(data, 'cabin')

    # fill categorical NAs with the explicit 'Missing' label
    for feature in config.CATEGORICAL_VARS:
        data[feature] = pf.impute_na(data, feature, replacement='Missing')

    # numerical NAs: record a 0/1 indicator column, then impute the stored value
    for feature in config.NUMERICAL_TO_IMPUTE:
        data[feature + 'NA'] = pf.add_missing_indicator(data, feature)
        data[feature] = pf.impute_na(
            data, feature, replacement=config.IMPUTATION_DICT[feature])

    # collapse infrequent categories into the shared rare bucket
    for feature in config.CATEGORICAL_VARS:
        data[feature] = pf.remove_rare_labels(
            data, feature, config.FREQUENT_LABELS[feature])

    # one-hot encode each categorical variable
    for feature in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, feature)

    # guarantee every dummy column seen at training time exists
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale with the persisted scaler, then score with the persisted model
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
def predict(data):
    """Run the house-price preprocessing pipeline on `data` and return predictions."""
    # label-impute missing categoricals
    for feature in config.CATEGORICAL_TO_IMPUTE:
        data[feature] = pf.impute_na(data, feature, replacement='Missing')

    # impute the numerical variable with its stored mode
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

    # convert the year variable into years elapsed relative to the sale year
    data[config.YEAR_VARIABLE] = pf.elapsed_years(
        data, config.YEAR_VARIABLE, ref_var='YrSold')

    # log-transform the skewed numerical variables
    for feature in config.NUMERICAL_LOG:
        data[feature] = pf.log_transform(data, feature)

    # collapse infrequent categories, then apply the stored encodings
    for feature in config.CATEGORICAL_ENCODE:
        data[feature] = pf.remove_rare_labels(
            data, feature, config.FREQUENT_LABELS[feature])
    for feature in config.CATEGORICAL_ENCODE:
        data[feature] = pf.encode_categorical(
            data, feature, config.ENCODING_MAPPINGS[feature])

    # restrict to the training feature set, scale, and score
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
# numerical NAs: add a 0/1 missing-indicator column, then impute the value
# captured at configuration time
for column in config.NUMERICAL_TO_IMPUTE:
    for frame in (X_train, X_test):
        frame[column + '_na'] = pf.add_missing_indicator(frame, column)
        frame[column] = pf.impute_na(frame, column, config.IMPUTATION_DICT[column])

# collapse infrequent categories into the shared rare bucket
for column in config.CATEGORICAL_VARS:
    for frame in (X_train, X_test):
        frame[column] = pf.remove_rare_labels(
            frame, column, config.FREQUENT_LABELS[column])

# one-hot encode categoricals (each call rebinds the frame, so no shared loop)
for column in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, column)
    X_test = pf.encode_categorical(X_test, column)

# make sure every expected dummy column is present in both sets
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(X_test, config.DUMMY_VARIABLES)

# fit and persist the scaler, then scale the training set
# NOTE(review): X_test is never scaled in this fragment — confirm it is
# handled elsewhere before any evaluation.
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)

# train the model and persist it
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)
# Continuation of the numerical-imputation loop: `var` is bound by a
# `for var in ...` header that precedes this fragment.
# add missing indicator (0/1 flag for rows that were originally NaN)
X_train[var + '_NA'] = pf.add_missing_indicator(X_train, var)
# replace NaN by the stored imputation value (median)
X_train[var] = pf.impute_na(X_train, var, replacement=config.IMPUTATION_DICT[var])

# collapse infrequent categories into the shared rare bucket
for column in config.CATEGORICAL_VARS:
    X_train[column] = pf.remove_rare_labels(
        X_train, column, config.FREQUENT_LABELS[column])

# one-hot encode each categorical variable
for column in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, column)

# make sure every expected dummy column exists
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)

# fit and persist the scaler, then scale the training set
# NOTE(review): scaler.transform returns a bare array (column names are
# lost); the sibling scripts call pf.scale_features here — confirm
# pf.train_model accepts an ndarray.
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)
X_train = scaler.transform(X_train)

# train the model and persist it
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)
print('Finished training')
# NOTE(review): this chunk opens mid-statement — the call below reconstructs
# the LotFrontage imputation whose opening precedes the fragment (the
# pattern is taken from the matching predict pipeline in this file);
# confirm the assignment target against the full script.
X_train[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    X_train, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

# elapsed-time intervals relative to the sale year
X_train[config.YEAR_VARIABLE] = pf.elapsed_years(
    X_train, config.YEAR_VARIABLE, ref_var='YrSold')

# log-transform the skewed numerical variables
for column in config.NUMERICAL_LOG:
    X_train[column] = pf.log_transform(X_train, column)

# collapse infrequent categories
for column in config.CATEGORICAL_ENCODE:
    X_train[column] = pf.remove_rare_labels(
        X_train, column, config.FREQUENT_LABELS[column])

# apply the stored categorical encodings
for column in config.CATEGORICAL_ENCODE:
    X_train[column] = pf.encode_categorical(
        X_train, column, config.ENCODING_MAPPINGS[column])

# fit and persist the scaler, then scale the selected feature set
scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH)
X_train = scaler.transform(X_train[config.FEATURES])

# train on the log-transformed target and persist the model
pf.train_model(X_train, np.log(y_train), config.OUTPUT_MODEL_PATH)
print('Enterenamiento terminado')
# Cube-root transform the configured predictors and the target
for column in config.CBRT_TRANSFORM:
    X_train[column] = pf.cbrt_transform(X_train, column)
y_train = pf.cbrt_transform(y_train)

# Fit a standard scaler on the numerical variables only, persist it,
# and scale that numerical slice in place
scaled = X_train[config.NUM_VARS].copy()
scaler = pf.train_scaler(scaled, config.SCALER_PATH)
scaled.iloc[:, :] = pf.scale_features(scaled, config.SCALER_PATH)

# One-hot encode each categorical variable
categoricals = [pf.encode_categorical(X_train, column)
                for column in config.CAT_VARS]

# Assemble the final design matrix and sanity-check its column order
X_train = pf.concat_dfs(scaled, categoricals)
assert X_train.columns.tolist() == config.FEATURES

# Train and persist the three models
pf.train_linreg_model(X_train, y_train, config.LINEAR_REG_MODEL_PATH)
# NOTE(review): y_train_aux is defined outside this fragment — confirm it
# carries the speed target for this second linear model.
pf.train_linreg_model(X_train, y_train_aux, config.LINEAR_REG_SPEED_MODEL_PATH)
pf.train_nn_model(X_train, y_train, config.NET_ARCHITECTURE_AND_PARAMETERS,
                  config.NEURAL_NET_MODEL_PATH)
# Continuation of the numerical-imputation loop: `var` is bound by a loop
# header that precedes this fragment.
X_train = pf.add_missing_indicator(X_train, var)
X_train[var] = pf.impute_na(X_train, var, value=config.IMPUTATION_DICT[var])

# collapse infrequent categories into the shared rare bucket
for column in config.CATEGORICAL_VARS:
    X_train[column] = pf.remove_rare_labels(
        X_train, column, freq_labels=config.FREQUENT_LABELS[column])

# fit and persist the one-hot encoder, then apply it
oh = pf.train_encoder(X_train, config.CATEGORICAL_VARS,
                      config.OUTPUT_ENCODER_PATH)
X_train = pf.encode_categorical(X_train, config.CATEGORICAL_VARS,
                                config.OUTPUT_ENCODER_PATH)

# debug output left in by the author
print(X_train.shape)
print(X_train.head())

# make sure every expected dummy column exists
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)

# fit and persist the scaler, then scale the training set
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)
X_train = scaler.transform(X_train)

# train model and save (the training call follows outside this fragment)
# derive the cabin letter from the raw cabin value
xtrain['cabin'] = pf.extract_cabin_letter(xtrain, 'cabin')

# impute categorical variables with an explicit 'Missing' label
xtrain[config.CATEGORICAL_VARS] = pf.impute_na(
    xtrain[config.CATEGORICAL_VARS], 'Missing')

# impute the numerical variable
# NOTE(review): the literal 'Numerical' is passed where other pipelines pass
# a replacement value — presumably pf.impute_na switches behaviour on this
# flag; confirm against the helper's definition.
xtrain[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    xtrain[config.NUMERICAL_TO_IMPUTE], 'Numerical')

# collapse infrequent categories into the shared rare bucket
for column in config.CATEGORICAL_VARS:
    xtrain[column] = pf.remove_rare_labels(
        xtrain, column, config.FREQUENT_LABELS[column])

# one-hot encode categoricals and ensure every expected dummy exists
xtrain = pf.encode_categorical(xtrain, config.CATEGORICAL_VARS)
xtrain = pf.check_dummy_variables(xtrain, config.DUMMY_VARIABLES)

# fit and persist the scaler, then scale the training set
scaler = pf.train_scaler(xtrain, config.OUTPUT_SCALER_PATH)
xtrain = pf.scale_features(xtrain, config.OUTPUT_SCALER_PATH)

# train the model and persist it
pf.train_model(xtrain, ytrain, config.OUTPUT_MODEL_PATH)
print('Finished training')
# get the first letter from the cabin variable
X_train = pf.extract_cabin_letter(X_train, 'cabin')

# impute categorical variables (indicator first, then fill)
X_train = pf.add_missing_indicator(X_train, config.CATEGORICAL_VARS)
X_train = pf.impute_na(X_train, config.CATEGORICAL_VARS)

# impute numerical variable
X_train = pf.add_missing_indicator(X_train, config.NUMERICAL_TO_IMPUTE)
X_train = pf.impute_num(X_train, config.NUMERICAL_TO_IMPUTE)

# group rare labels
X_train = pf.remove_rare_labels(X_train, config.CATEGORICAL_VARS)

# encode categorical variables
X_train, X_train_features = pf.encode_categorical(X_train, config.CATEGORICAL_VARS)

# BUG FIX: the original had a bare `X_check` name here, which raises
# NameError at runtime. Call the dummy-variable check that the sibling
# pipelines in this file use instead.
# NOTE(review): confirm this version of `pf` exposes check_dummy_variables
# and that config defines DUMMY_VARIABLES.
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)

# train scaler and save
pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)
print('Finished training')
# Continuation of the categorical-imputation loop (`var` is bound by a
# `for var in ...` header that precedes this fragment).
X_test[var] = pf.impute_na(X_test, var, replacement='Missing')

# impute numerical variables
for var in config.NUMERICAL_TO_IMPUTE:
    # BUG FIX: the original wrote the 0/1 missing indicator straight into
    # X_train[var]/X_test[var], clobbering the numeric column, and never
    # imputed the NaNs. Keep the indicator in a separate column and impute
    # the variable itself, as the sibling training script does.
    # NOTE(review): confirm the '_na' suffix and IMPUTATION_DICT keys match
    # config.FEATURES and the persisted artefacts.
    X_train[var + '_na'] = pf.add_missing_indicator(df=X_train, var=var)
    X_test[var + '_na'] = pf.add_missing_indicator(df=X_test, var=var)
    X_train[var] = pf.impute_na(X_train, var,
                                replacement=config.IMPUTATION_DICT[var])
    X_test[var] = pf.impute_na(X_test, var,
                               replacement=config.IMPUTATION_DICT[var])

# group rare labels
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])
    X_test[var] = pf.remove_rare_labels(X_test, var,
                                        config.FREQUENT_LABELS[var])

# encode categorical variables
X_train = pf.encode_categorical(df=X_train, var=config.CATEGORICAL_VARS)
X_test = pf.encode_categorical(df=X_test, var=config.CATEGORICAL_VARS)

# check all dummies were added
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(df=X_test, dummy_list=config.DUMMY_VARIABLES)

# train scaler and save, then scale both sets
scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH)
X_train = scaler.transform(X_train[config.FEATURES])
X_test = scaler.transform(X_test[config.FEATURES])

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)