X_train, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE) # intervalos de tiempo X_train[config.YEAR_VARIABLE] = pf.elapsed_years(X_train, config.YEAR_VARIABLE, ref_var='YrSold') # transformación logarítmica for var in config.NUMERICAL_LOG: X_train[var] = pf.log_transform(X_train, var) # agrupación de categorías poco frecuentes for var in config.CATEGORICAL_ENCODE: X_train[var] = pf.remove_rare_labels(X_train, var, config.FREQUENT_LABELS[var]) # codificación de variables categóricas for var in config.CATEGORICAL_ENCODE: X_train[var] = pf.encode_categorical(X_train, var, config.ENCODING_MAPPINGS[var]) # entrenear y guardar el escalador scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH) # escalar variables X_train = scaler.transform(X_train[config.FEATURES]) # entrenar y guardar el modelo pf.train_model(X_train, np.log(y_train), config.OUTPUT_MODEL_PATH) print('Enterenamiento terminado')
# NOTE(review): ``var`` in the next four statements is bound by a loop whose
# header lies in an earlier chunk of this file (the numerical-imputation
# loop) — confirm against the full script.
X_train[var + '_na'] = pf.add_missing_indicator(X_train, var)
X_train[var] = pf.impute_na(X_train, var, config.IMPUTATION_DICT[var])
X_test[var + '_na'] = pf.add_missing_indicator(X_test, var)
X_test[var] = pf.impute_na(X_test, var, config.IMPUTATION_DICT[var])

# collapse infrequent categories into a shared rare label
for cat_var in config.CATEGORICAL_VARS:
    X_train[cat_var] = pf.remove_rare_labels(
        X_train, cat_var, config.FREQUENT_LABELS[cat_var])
    X_test[cat_var] = pf.remove_rare_labels(
        X_test, cat_var, config.FREQUENT_LABELS[cat_var])

# one-hot encode the categorical variables
for cat_var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, cat_var)
    X_test = pf.encode_categorical(X_test, cat_var)

# make sure every expected dummy column is present in both sets
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(X_test, config.DUMMY_VARIABLES)

# fit the scaler on the train set and persist it
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale the train set using the persisted scaler
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)

# fit and persist the model
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
# impute numerical variables with the medians persisted in the config file
medians = config_file[1]['Parameters'].get('imputation_dict')
for num_var in num_vars:
    X_train = pf.add_missing_indicator(X_train, num_var)
    X_train[num_var] = pf.impute_na(X_train, num_var, medians.get(num_var))

# collapse infrequent categories into a shared rare label
frequent_list = config_file[1]['Parameters'].get('frequent_labels')
for cat_var in cat_vars:
    # NOTE(review): the whole ``frequent_list`` mapping is passed here rather
    # than a per-variable entry — confirm pf.remove_rare_labels expects that.
    X_train[cat_var] = pf.remove_rare_labels(X_train, cat_var, frequent_list)

# one-hot encode the categorical variables
dummies = config_file[1]['Parameters'].get('dummy_variables')
for cat_var in cat_vars:
    X_train = pf.encode_categorical(X_train, cat_var)

# make sure every expected dummy column is present
X_train = pf.check_dummy_variables(X_train, dummies)

# fit the scaler and persist it to the configured path
output_path = config_file[0]['Paths'].get('output_scaler_path')
output_model_path = config_file[0]['Paths'].get('output_model_path')
scaler = pf.train_scaler(X_train, output_path)

# scale the train set; cast the target to int for the classifier
X_train = scaler.transform(X_train)
y_train = y_train.astype(int)

# fit and persist the model
pf.train_model(X_train, y_train, output_model_path)

print('Finished training')
# keep only the deck letter from the cabin code
xtrain['cabin'] = pf.extract_cabin_letter(xtrain, 'cabin')

# impute categorical variables with the 'Missing' label
xtrain[config.CATEGORICAL_VARS] = pf.impute_na(
    xtrain[config.CATEGORICAL_VARS], 'Missing')

# impute the numerical variable
# NOTE(review): the 'Numerical' flag is interpreted by pf.impute_na —
# confirm it selects the numerical imputation strategy.
xtrain[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    xtrain[config.NUMERICAL_TO_IMPUTE], 'Numerical')

# collapse infrequent categories into a shared rare label
for cat_var in config.CATEGORICAL_VARS:
    xtrain[cat_var] = pf.remove_rare_labels(
        xtrain, cat_var, config.FREQUENT_LABELS[cat_var])

# one-hot encode the categorical variables
xtrain = pf.encode_categorical(xtrain, config.CATEGORICAL_VARS)

# make sure every expected dummy column is present
xtrain = pf.check_dummy_variables(xtrain, config.DUMMY_VARIABLES)

# fit the scaler and persist it
scaler = pf.train_scaler(xtrain, config.OUTPUT_SCALER_PATH)

# scale the train set using the persisted scaler
xtrain = pf.scale_features(xtrain, config.OUTPUT_SCALER_PATH)

# fit and persist the model
pf.train_model(xtrain, ytrain, config.OUTPUT_MODEL_PATH)

print('Finished training')
# log-transform the skewed numerical variables
for num_var in config.NUMERICAL_LOG:
    X_train[num_var] = pf.log_transform(X_train, num_var)

# collapse infrequent categories into a shared rare label
for cat_var in config.CATEGORICAL_ENCODE:
    X_train[cat_var] = pf.remove_rare_labels(
        X_train, cat_var, config.FREQUENT_LABELS[cat_var])

# encode categorical variables with the persisted mappings
for cat_var in config.CATEGORICAL_ENCODE:
    X_train[cat_var] = pf.encode_categorical(
        X_train, cat_var, config.ENCODING_MAPPINGS[cat_var])

# fit the scaler on the selected features and persist it
scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH)

# scale the train set
X_train = scaler.transform(X_train[config.FEATURES])

# fit and persist the model; the target is log-transformed to match the
# log-transformed features
pf.train_model(X_train, np.log(y_train), config.OUTPUT_MODEL_PATH)

print('Finished training')
# add missing-value indicators for age and fare
# (added to keep train.py consistent with the research notebook)
for num_var in ['age', 'fare']:
    X_train = pf.add_missing_indicator(X_train, num_var)

# collapse infrequent categories into a shared rare label
# NOTE(review): argument order here (df, labels-mapping, var) differs from
# sibling scripts — confirm it matches this module's pf.remove_rare_labels.
for cat_var in config.CATEGORICAL_VARS:
    X_train = pf.remove_rare_labels(X_train, config.FREQUENT_LABELS, cat_var)

# one-hot encode the categorical variables
for cat_var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, cat_var)

# make sure every expected dummy column is present
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)

# fit the scaler on the ordered feature columns and persist it
pf.train_scaler(X_train, config.ORDERED_COLUMNS, config.OUTPUT_SCALER_PATH)

# scale the train set using the persisted scaler
X_train = pf.scale_features(
    X_train, config.ORDERED_COLUMNS, config.OUTPUT_SCALER_PATH)

# fit and persist the model
pf.train_model(
    X_train, config.ORDERED_COLUMNS, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
# collapse infrequent categories into a shared rare label
X_train = pf.remove_rare_labels(X_train, config.FREQUENT_LABELS)
X_test = pf.remove_rare_labels(X_test, config.FREQUENT_LABELS)

# one-hot encode the categorical variables, then drop the original columns
for cat_var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, cat_var)
    X_test = pf.encode_categorical(X_test, cat_var)
X_train.drop(labels=config.CATEGORICAL_VARS, axis=1, inplace=True)
X_test.drop(labels=config.CATEGORICAL_VARS, axis=1, inplace=True)

# make sure every expected dummy column is present in both sets
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(X_test, config.DUMMY_VARIABLES)

# fit the scaler on the train set and persist it
pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale both sets with the persisted scaler
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)
X_test = pf.scale_features(X_test, config.OUTPUT_SCALER_PATH)

# fit and persist the model with the configured seed and regularization
pf.train_model(
    X_train,
    y_train,
    config.OUTPUT_MODEL_PATH,
    seed=config.GLOBAL_SEED,
    C=config.NORM_CONSTANT,
)

print('Finished training')