def run_training():
    """Run the preprocessing steps over the Titanic training split.

    Reads ``titanic.csv``, holds out 20% of the rows with a fixed seed, then
    applies the four preprocessing transformers to the training features one
    after another, printing the head of the frame along the way.

    NOTE(review): despite the name, no estimator is fitted here and the
    held-out split and training target are never used.
    """
    data = pd.read_csv('titanic.csv')
    features = data.drop('survived', axis=1)
    target = data['survived']
    X_train, _X_test, _y_train, _y_test = train_test_split(
        features, target, test_size=0.2, random_state=0)
    print(X_train.head())

    # All transformers are instantiated up front, before any of them is fitted.
    imputers_and_extractor = [
        pp.CategoricalImputer(variables=config.CATEGORICAL_VARS),
        pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA),
        pp.ExtractFirstLetter(variables=config.CABIN),
    ]
    encoder = pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS)

    for transformer in imputers_and_extractor:
        X_train = transformer.fit_transform(X_train)

    # Snapshot of the frame just before categorical encoding.
    print()
    print(X_train.head())
    print()

    X_train = encoder.fit_transform(X_train)
    print(X_train.head())
def pipeline(config):
    """Build the unfitted preprocessing + logistic-regression pipeline.

    Args:
        config: Object exposing a ``variables`` attribute from which the
            feature groups are read: ``drop_features``,
            ``numerical_vars_from_numerical``, ``categorical_vars`` and
            ``categorical_label_extraction``.

    Returns:
        A ``Pipeline`` whose last two steps are a ``MinMaxScaler`` and a
        default-configured ``LogisticRegression``.
    """
    groups = config.variables

    steps = [
        ('drop_fatures',
         pp.DropUnecessaryFeatures(variables_to_drop=groups.drop_features)),
        ('categorical_to_numerical',
         pp.CategoricalToNumerical(
             variables=groups.numerical_vars_from_numerical)),
        ('numerical_imputer', pp.NumericalImputer()),
        ('categorical_imputer',
         pp.CategoricalImputer(variables=groups.categorical_vars)),
        # Disabled step:
        # ('temporal_variable',
        #  pp.TemporalVariableEstimator(
        #      variables=config.TEMPORAL_VARS,
        #      reference_variable=config.DROP_FEATURES)),
        ('label extraction',
         pp.LabelExtraction(variables=groups.categorical_label_extraction)),
        ('rare label encoder',
         pp.RareLabelCategoricalEncoder(
             tol=0.01, variables=groups.categorical_vars)),
        ('categorical_encoder',
         pp.CategoricalEncoder(variables=groups.categorical_vars)),
        # Disabled steps:
        # ('feature hashing',
        #  FeatureHasher(n_features=10, input_type='string')),
        # ('log_transformer',
        #  pp.LogTransformer()),
        ('scaler', MinMaxScaler()),
        ('classifier', LogisticRegression()),
    ]
    return Pipeline(steps)
# Numerical variables that are log-transformed.
NUMERICALS_LOG_VARS = ['LotFrontage', '1stFlrSF', 'GrLivArea']

# Numerical variables with NA in the train set.
NUMERICAL_VARS_WITH_NA = ['LotFrontage']

# Categorical variables to encode.
CATEGORICAL_VARS = [
    'MSZoning',
    'Neighborhood',
    'RoofStyle',
    'MasVnrType',
    'BsmtQual',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'PavedDrive',
]

# Preprocessing steps followed by a Lasso regressor.
price_pipe = Pipeline([
    # --- imputation ---
    ('categorical_imputer',
     pp.CategoricalImputer(variables=CATEGORICAL_VARS_WITH_NA)),
    ('numerical_inputer',
     pp.NumericalImputer(variables=NUMERICAL_VARS_WITH_NA)),

    # --- temporal feature ---
    # NOTE(review): `variables` and `reference_variable` receive the same
    # value here; confirm that is intended, since a column referenced against
    # itself may carry no information.
    ('temporal_variable',
     pp.TemporalVariableEstimator(
         variables=TEMPORAL_VARS,
         reference_variable=TEMPORAL_VARS)),

    # --- categorical handling ---
    ('rare_label_encoder',
     pp.RareLabelCategoricalEncoder(tol=0.01, variables=CATEGORICAL_VARS)),
    ('categorical_encoder',
     pp.CategoricalEncoder(variables=CATEGORICAL_VARS)),

    # --- numerical transforms and clean-up ---
    ('log_transformer',
     pp.LogTransformer(variables=NUMERICALS_LOG_VARS)),
    ('drop_features',
     pp.DropUnecessaryFeatures(variables_to_drop=DROP_FEATURES)),
    ('scaler', MinMaxScaler()),

    # --- estimator ---
    ('Linear_model', Lasso(alpha=0.005, random_state=0)),
])
# Titanic pipeline: steps come from the preprocessors module, variable lists
# from the config module; the final step is a logistic regression.
titanic_pipe = Pipeline([
    # --- missing-value handling ---
    ('missing_indicator',
     pp.MissingIndicator(variables=config.NUMERICAL_VARS)),
    ('categorical_imputer',
     pp.CategoricalImputer(variables=config.CATEGORICAL_VARS)),
    ('numerical_inputer',
     pp.NumericalImputer(variables=config.NUMERICAL_VARS)),

    # --- categorical handling ---
    ('extract_firstletter',
     pp.ExtractFirstLetter(variables=config.CABIN)),
    ('rare_label_encoder',
     pp.RareLabelCategoricalEncoder(
         tol=0.05, variables=config.CATEGORICAL_VARS)),
    ('categorical_encoder',
     pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS)),

    # --- scaling and estimator ---
    ('scaler', StandardScaler()),
    ('Linear_model', LogisticRegression(C=0.0005, random_state=0)),
])
# ( # "DropNaFeatures", # preprocessors.DropDuplicates(variables=config.DUPLICATE_VALS), # ), ( "Fill_NA_encoder", preprocessors.FillNAEncoder(variables=config.CATEGORICAL_VALS), ), ( "RareEncoder", preprocessors.RareEncoder(variables=config.CATEGORICAL_VALS[1:], threshold=0.01), ), ( "CategoricalEncoder", preprocessors.CategoricalEncoder( variables=config.CATEGORICAL_VALS[1:]), ), ( "EducationEncoder", preprocessors.EducationEncoder(variables='education'), ), ( "Skewed2Cat", preprocessors.Skewed2Cat(variables=config.SKEWED_NUMERIC_VARS), ), ( "MinMaxScalar", preprocessors.Min_Max_Scalar(variables=config.DISCRETE_NUMERIC_VARS + config.CONTINUOUS_NUMERIC_VARS), ), (
import preprocessors as pp
import utils as ut

config = ut.read_config_file('config.yaml')

# Every variable list used below is looked up under this entry of the config.
_feature_groups = config[2]['Feature_Groups']

# Titanic pipeline: steps come from the preprocessors module, variable lists
# from the config file; the final step is a logistic regression.
titanic_pipe = Pipeline([
    ('categorical_imputer',
     pp.CategoricalImputer(
         variables=_feature_groups.get('categorical_vars'))),
    ('missing_indicator',
     pp.MissingIndicator(
         variables=_feature_groups.get('numerical_to_impute'))),
    ('numerical_imputer',
     pp.NumericalImputer(
         variables=_feature_groups.get('numerical_to_impute'))),
    # Only the second entry of `categorical_vars` is passed here; presumably
    # the cabin column, given the step name (confirm against config.yaml).
    ('cabin_variable',
     pp.ExtractFirstLetter(
         variables=_feature_groups.get('categorical_vars')[1])),
    ('rare_label_encoder',
     pp.RareLabelCategoricalEncoder(
         tol=0.05,
         variables=_feature_groups.get('categorical_vars'))),
    ('categorical_encoder',
     pp.CategoricalEncoder(
         variables=_feature_groups.get('categorical_vars'))),
    ('scaler', StandardScaler()),
    ('linear_model', LogisticRegression(C=0.0005, random_state=0)),
])
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import preprocessors as pp
import config

# Titanic pipeline: steps come from the preprocessors module, variable lists
# from the config module; the final step is a logistic regression.
titanic_pipe = Pipeline([
    # --- missing-value handling ---
    ('categorical_imputer', pp.CategoricalImputer(config.CATEGORICAL_VARS)),
    ('missing_indicator', pp.MissingIndicator(config.NUMERICAL_VARS)),
    ('numerical_imputer', pp.NumericalImputer(config.NUMERICAL_VARS)),

    # --- categorical handling ---
    ('extract_first_letter', pp.ExtractFirstLetter(config.CABIN)),
    ('rare_label_categorical_encoder',
     pp.RareLabelCategoricalEncoder(0.05, config.CATEGORICAL_VARS)),
    ('categorical_encoder', pp.CategoricalEncoder(config.CATEGORICAL_VARS)),

    # --- scaling and estimator ---
    ('scaler', StandardScaler()),
    ('linear_model', LogisticRegression(C=0.0005, random_state=0)),
])
from xgboost.sklearn import XGBRegressor # %% discrete_vars_pipeline = Pipeline([ ('discrete_vars_select', pp.ItemsSelector(features=config.DISCRETE_VARS)) ]) cat_vars_pipeline = Pipeline([ ('cat_vars_select', pp.ItemsSelector(features=config.CATEGORICAL_VARS)), ('categorical_imputer', pp.CategoricalImputer(features=config.CATEGORICAL_VARS)), ('other_label_encoder', pp.OtherLabelCategoricalEncoder(features=config.CATEGORICAL_VARS)), ('categorical_encoder', pp.CategoricalEncoder(features=config.CATEGORICAL_VARS)) ]) numerical_vars_pipeline = Pipeline([ ('numerical_vars_select', pp.ItemsSelector(features=config.NUMERICAL_VARS)), ('numerical_imputer', pp.NumericalImputer(features=config.NUMERICAL_VARS)), ('power_transformer', pp.SelectedFeaturesPowerTransformer(features=config.NUMERICAL_VARS)) ]) temporal_vars_pipeline = Pipeline([ ('temporal_vars_select', pp.ItemsSelector(features=config.TEMPORAL_VARS)), ('temporal_processor', pp.TemporalVariableTransformer( features=config.TEMPORAL_VARS,