# Training-script body: impute, group rare labels, encode, scale, and fit.
# NOTE(review): this fragment begins mid-loop -- the enclosing
# `for var in ...:` header (presumably iterating the numerical variables
# with missing values) lies before the visible span; confirm upstream.
X_train[var + '_na'] = pf.add_missing_indicator(X_train, var)
X_train[var] = pf.impute_na(X_train, var, config.IMPUTATION_DICT[var])
X_test[var + '_na'] = pf.add_missing_indicator(X_test, var)
X_test[var] = pf.impute_na(X_test, var, config.IMPUTATION_DICT[var])

# Group rare labels: replace infrequent categories using the
# per-variable frequent-label lists learned on the train set.
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.remove_rare_labels(X_train, var, config.FREQUENT_LABELS[var])
    X_test[var] = pf.remove_rare_labels(X_test, var, config.FREQUENT_LABELS[var])

# encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)
    X_test = pf.encode_categorical(X_test, var)

# check all dummies were added (train and test get the same columns)
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(X_test, config.DUMMY_VARIABLES)

# train scaler and save
# NOTE(review): the returned `scaler` is never used -- scale_features
# below reloads it from OUTPUT_SCALER_PATH; presumably intentional.
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set
# NOTE(review): X_test is not scaled in this span -- confirm it is
# handled further down the file (or in a separate scoring script).
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
# Training-script body (house-price style): temporal features, log
# transform, rare-label grouping, ordinal encoding, scaling, model fit.
# NOTE(review): fragment starts mid-statement -- these are the trailing
# arguments of a call (presumably pf.impute_na(...) or similar) whose
# opener lies before the visible span; confirm upstream.
    X_train, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

# elapsed-time intervals (years between YEAR_VARIABLE and YrSold)
X_train[config.YEAR_VARIABLE] = pf.elapsed_years(
    X_train, config.YEAR_VARIABLE, ref_var='YrSold')

# log transformation of skewed numerical variables
for var in config.NUMERICAL_LOG:
    X_train[var] = pf.log_transform(X_train, var)

# group infrequent categories into a rare label (per-variable lists)
for var in config.CATEGORICAL_ENCODE:
    X_train[var] = pf.remove_rare_labels(X_train, var, config.FREQUENT_LABELS[var])

# encode categorical variables with pre-learned ordinal mappings
for var in config.CATEGORICAL_ENCODE:
    X_train[var] = pf.encode_categorical(X_train, var, config.ENCODING_MAPPINGS[var])

# train the scaler on the final feature set and persist it
scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH)

# scale the features
# NOTE(review): scaler.transform returns a numpy array, so X_train is no
# longer a DataFrame past this point -- verify train_model expects that.
X_train = scaler.transform(X_train[config.FEATURES])

# train and persist the model; the target is log-transformed to match
# the log-scale on which the regression is fit
pf.train_model(X_train, np.log(y_train), config.OUTPUT_MODEL_PATH)

# NOTE(review): runtime string kept byte-identical (contains a typo of
# Spanish "Entrenamiento"); changing it would alter program output.
print('Enterenamiento terminado')
y_train_aux = X_train[config.AUX_TARGET] # Add time-based features and the speed columns X_train = pf.add_features(X_train) # Add the region column X_train = pf.add_region(X_train, config.REGION_BOUNDS) # Apply cube-root transformation for var in config.CBRT_TRANSFORM: X_train[var] = pf.cbrt_transform(X_train, var) y_train = pf.cbrt_transform(y_train) # Train standard scaler on numerical variables only scaled = X_train[config.NUM_VARS].copy() scaler = pf.train_scaler(scaled, config.SCALER_PATH) # Scale the numerical data scaled.iloc[:,:] = pf.scale_features(scaled, config.SCALER_PATH) # One-hot encode all the categorical variables categoricals = [] for var in config.CAT_VARS: categoricals.append(pf.encode_categorical(X_train, var)) # Final design matrix for training X_train = pf.concat_dfs(scaled, categoricals) # Assert we have the desired features assert X_train.columns.tolist() == config.FEATURES
# Training-script body (config-file driven): impute, group rare labels,
# one-hot encode, scale, and fit; all parameters come from config_file.

# impute numerical variables: add a missing indicator, then fill NAs
# with the pre-computed per-variable medians from the config
medians = config_file[1]['Parameters'].get('imputation_dict')
for var in num_vars:
    X_train = pf.add_missing_indicator(X_train, var)
    X_train[var] = pf.impute_na(X_train, var, medians.get(var))

# Group rare labels
frequent_list = config_file[1]['Parameters'].get('frequent_labels')
for var in cat_vars:
    # FIX: pass this variable's own frequent-label list, not the whole
    # mapping. Passing the full dict compares column values against the
    # dict (i.e. its keys -- the variable names), so every real label
    # would be treated as rare. Matches the per-variable pattern used by
    # the sibling pipelines (FREQUENT_LABELS[var]).
    X_train[var] = pf.remove_rare_labels(X_train, var, frequent_list.get(var))

# encode categorical variables as dummies
dummies = config_file[1]['Parameters'].get('dummy_variables')
for var in cat_vars:
    X_train = pf.encode_categorical(X_train, var)

# check all dummies were added (columns match the expected dummy set)
X_train = pf.check_dummy_variables(X_train, dummies)

# train scaler and save to the configured path
output_path = config_file[0]['Paths'].get('output_scaler_path')
output_model_path = config_file[0]['Paths'].get('output_model_path')
scaler = pf.train_scaler(X_train, output_path)

# scale train set
# NOTE(review): scaler.transform returns a numpy array -- X_train is no
# longer a DataFrame past this point; verify train_model expects that.
X_train = scaler.transform(X_train)

# target is cast to int for the classifier
y_train = y_train.astype(int)

# train model and save
pf.train_model(X_train, y_train, output_model_path)
print('Finished training')
# add missing indicator #Note that I added this to conform train.py with notebook. for var in ['age', 'fare']: X_train = pf.add_missing_indicator(X_train, var) # Group rare labels for var in config.CATEGORICAL_VARS: X_train = pf.remove_rare_labels(X_train, config.FREQUENT_LABELS, var) # encode categorical variables for var in config.CATEGORICAL_VARS: X_train = pf.encode_categorical(X_train, var) # check all dummies were added X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES) # train scaler and save pf.train_scaler(X_train, config.ORDERED_COLUMNS, config.OUTPUT_SCALER_PATH) # scale train set X_train = pf.scale_features(X_train, config.ORDERED_COLUMNS, config.OUTPUT_SCALER_PATH) # train model and save pf.train_model(X_train, config.ORDERED_COLUMNS, y_train, config.OUTPUT_MODEL_PATH) print('Finished training')