def pipeline(config):
    """Assemble the preprocessing and classification pipeline.

    Args:
        config: Configuration object. The variable lists are read from
            ``config.variables`` (``drop_features``,
            ``numerical_vars_from_numerical``, ``categorical_vars`` and
            ``categorical_label_extraction``).

    Returns:
        A new, unfitted ``Pipeline`` whose final step is a
        ``LogisticRegression`` built with its default arguments.
    """
    variables = config.variables

    # Step names are kept exactly as they were (including the
    # 'drop_fatures' spelling and the names containing spaces) because
    # callers may address steps by name.
    steps = [
        ('drop_fatures',
         pp.DropUnecessaryFeatures(
             variables_to_drop=variables.drop_features)),
        ('categorical_to_numerical',
         pp.CategoricalToNumerical(
             variables=variables.numerical_vars_from_numerical)),
        ('numerical_imputer',
         pp.NumericalImputer()),
        ('categorical_imputer',
         pp.CategoricalImputer(variables=variables.categorical_vars)),
        # Disabled: a 'temporal_variable' step
        # (pp.TemporalVariableEstimator) was commented out here.
        ('label extraction',
         pp.LabelExtraction(
             variables=variables.categorical_label_extraction)),
        ('rare label encoder',
         pp.RareLabelCategoricalEncoder(
             tol=0.01,
             variables=variables.categorical_vars)),
        ('categorical_encoder',
         pp.CategoricalEncoder(variables=variables.categorical_vars)),
        # Disabled: 'feature hashing' (FeatureHasher) and
        # 'log_transformer' (pp.LogTransformer) were commented out here.
        ('scaler', MinMaxScaler()),
        ('classifier', LogisticRegression()),
    ]
    return Pipeline(steps)
# Numerical variables that are log-transformed.
NUMERICALS_LOG_VARS = ['LotFrontage', '1stFlrSF', 'GrLivArea']

# Numerical variables with NA in the train set.
NUMERICAL_VARS_WITH_NA = ['LotFrontage']

# Categorical variables to encode.
CATEGORICAL_VARS = [
    'MSZoning',
    'Neighborhood',
    'RoofStyle',
    'MasVnrType',
    'BsmtQual',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'PavedDrive',
]

# House-price pipeline: imputation, temporal feature, categorical
# encoding, log transform, feature dropping, scaling, then a Lasso model.
# CATEGORICAL_VARS_WITH_NA, TEMPORAL_VARS and DROP_FEATURES are not defined
# in this section and must already be in scope.
price_pipe = Pipeline([
    ('categorical_imputer',
     pp.CategoricalImputer(variables=CATEGORICAL_VARS_WITH_NA)),
    ('numerical_inputer',
     pp.NumericalImputer(variables=NUMERICAL_VARS_WITH_NA)),
    # NOTE(review): reference_variable used to be TEMPORAL_VARS, i.e. the
    # very same value passed as `variables`. DROP_FEATURES is used instead,
    # on the assumption that the estimator relates each temporal variable
    # to a separate reference column that the 'drop_features' step removes
    # afterwards -- confirm against pp.TemporalVariableEstimator.
    ('temporal_variable',
     pp.TemporalVariableEstimator(
         variables=TEMPORAL_VARS,
         reference_variable=DROP_FEATURES)),
    ('rare_label_encoder',
     pp.RareLabelCategoricalEncoder(
         tol=0.01,
         variables=CATEGORICAL_VARS)),
    ('categorical_encoder',
     pp.CategoricalEncoder(variables=CATEGORICAL_VARS)),
    ('log_transformer',
     pp.LogTransformer(variables=NUMERICALS_LOG_VARS)),
    ('drop_features',
     pp.DropUnecessaryFeatures(variables_to_drop=DROP_FEATURES)),
    ('scaler', MinMaxScaler()),
    ('Linear_model', Lasso(alpha=0.005, random_state=0)),
])
# Titanic pipeline. Every variable list is read from the `config` object;
# the transformers come from the `pp` module.
titanic_pipe = Pipeline(
    steps=[
        # The missing-value indicator runs before the imputers.
        ('missing_indicator',
         pp.MissingIndicator(variables=config.NUMERICAL_VARS)),
        ('categorical_imputer',
         pp.CategoricalImputer(variables=config.CATEGORICAL_VARS)),
        ('numerical_inputer',
         pp.NumericalImputer(variables=config.NUMERICAL_VARS)),
        ('extract_firstletter',
         pp.ExtractFirstLetter(variables=config.CABIN)),
        ('rare_label_encoder',
         pp.RareLabelCategoricalEncoder(
             tol=0.05,
             variables=config.CATEGORICAL_VARS)),
        ('categorical_encoder',
         pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS)),
        ('scaler', StandardScaler()),
        ('Linear_model', LogisticRegression(C=0.0005, random_state=0)),
    ],
)
import preprocessors as pp
import utils as ut

config = ut.read_config_file('config.yaml')

# The variable lists live under the 'Feature_Groups' key of the third
# entry of the loaded configuration; look them up once and reuse them.
_feature_groups = config[2]['Feature_Groups']
_categorical_vars = _feature_groups.get('categorical_vars')
_numerical_to_impute = _feature_groups.get('numerical_to_impute')

# Titanic pipeline built from the `pp` transformers and the variable lists
# read from config.yaml.
titanic_pipe = Pipeline(
    steps=[
        ('categorical_imputer',
         pp.CategoricalImputer(variables=_categorical_vars)),
        ('missing_indicator',
         pp.MissingIndicator(variables=_numerical_to_impute)),
        ('numerical_imputer',
         pp.NumericalImputer(variables=_numerical_to_impute)),
        # Uses the second entry of categorical_vars; presumably the cabin
        # column, going by the step name -- verify against config.yaml.
        ('cabin_variable',
         pp.ExtractFirstLetter(variables=_categorical_vars[1])),
        ('rare_label_encoder',
         pp.RareLabelCategoricalEncoder(
             tol=0.05,
             variables=_categorical_vars)),
        ('categorical_encoder',
         pp.CategoricalEncoder(variables=_categorical_vars)),
        ('scaler', StandardScaler()),
        ('linear_model', LogisticRegression(C=0.0005, random_state=0)),
    ],
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import preprocessors as pp
import config

# Titanic pipeline: the `pp` transformers receive their variable lists
# from the `config` module. Constructor arguments are passed positionally,
# exactly as before.
titanic_pipe = Pipeline(
    steps=[
        ('categorical_imputer',
         pp.CategoricalImputer(config.CATEGORICAL_VARS)),
        ('missing_indicator',
         pp.MissingIndicator(config.NUMERICAL_VARS)),
        ('numerical_imputer',
         pp.NumericalImputer(config.NUMERICAL_VARS)),
        ('extract_first_letter',
         pp.ExtractFirstLetter(config.CABIN)),
        ('rare_label_categorical_encoder',
         pp.RareLabelCategoricalEncoder(0.05, config.CATEGORICAL_VARS)),
        ('categorical_encoder',
         pp.CategoricalEncoder(config.CATEGORICAL_VARS)),
        ('scaler', StandardScaler()),
        ('linear_model', LogisticRegression(C=0.0005, random_state=0)),
    ],
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import preprocessors as pp
# NOTE(review): CATEGORICAL_VARS, NUMERICAL_VARS and CABIN_VAR are not
# defined in this section; presumably they arrive through this star import.
from config import *

# Titanic pipeline assembled from the `pp` transformers.
titanic_pipe = Pipeline(
    steps=[
        ('categorical_imputer',
         pp.CategoricalImputer(variables=CATEGORICAL_VARS)),
        ('missing_indicator',
         pp.MissingIndicator(variables=NUMERICAL_VARS)),
        ('numerical_imputer',
         pp.NumericalImputer(variables=NUMERICAL_VARS)),
        ('extract_first_letter',
         pp.ExtractFirstLetter(variables=CABIN_VAR)),
        ('rare_label_encoding',
         pp.RareLabelCategoricalEncoder(
             tol=0.05,
             variables=CATEGORICAL_VARS)),
        ('categorical_encoding',
         pp.CategoricalEncoder(variables=CATEGORICAL_VARS)),
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(C=0.0005, random_state=0)),
    ],
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import preprocessors as pp
import config

# Titanic pipeline. The first-letter extraction runs first, on the
# hard-coded 'cabin' column; the remaining variable lists come from
# `config`.
titanic_pipe = Pipeline(
    steps=[
        ('Extract_First_Letter',
         pp.ExtractFirstLetter(variables=['cabin'])),
        ('Numerical_Imputer',
         pp.NumericalImputer(variables=config.NUMERICAL_VARS)),
        ('Categorical_Imputer',
         pp.CategoricalImputer(variables=config.CATEGORICAL_VARS)),
        # No `tol` is passed here, so the encoder's own default applies.
        ('Rare_Label_Categorical_Encoder',
         pp.RareLabelCategoricalEncoder(variables=config.CATEGORICAL_VARS)),
        ('Categorical_Encoder',
         pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS)),
        ('scaler', StandardScaler()),
        ('Linear_model', LogisticRegression(C=0.005, random_state=0)),
    ],
    # Per the scikit-learn docs, verbose=True prints the time elapsed
    # while fitting each step.
    verbose=True,
)