def pipeline(config):
    """Assemble the preprocessing and logistic-regression pipeline.

    Args:
        config: Configuration object. Its ``variables`` attribute must expose
            ``drop_features``, ``numerical_vars_from_numerical``,
            ``categorical_vars`` and ``categorical_label_extraction``.

    Returns:
        A newly constructed (not yet fitted) ``Pipeline`` whose last two
        steps are a ``MinMaxScaler`` and a ``LogisticRegression``.
    """
    feature_cfg = config.variables

    # Step names are runtime keys of the pipeline, so they are kept exactly
    # as they were (including the 'drop_fatures' spelling and the names that
    # contain spaces) in case callers look steps up by name.
    #
    # NOTE: the temporal-variable, feature-hashing and log-transformer stages
    # are currently disabled and therefore not part of the step list.
    steps = [
        (
            "drop_fatures",
            pp.DropUnecessaryFeatures(
                variables_to_drop=feature_cfg.drop_features,
            ),
        ),
        (
            "categorical_to_numerical",
            pp.CategoricalToNumerical(
                variables=feature_cfg.numerical_vars_from_numerical,
            ),
        ),
        ("numerical_imputer", pp.NumericalImputer()),
        (
            "categorical_imputer",
            pp.CategoricalImputer(variables=feature_cfg.categorical_vars),
        ),
        (
            "label extraction",
            pp.LabelExtraction(
                variables=feature_cfg.categorical_label_extraction,
            ),
        ),
        (
            "rare label encoder",
            pp.RareLabelCategoricalEncoder(
                tol=0.01,
                variables=feature_cfg.categorical_vars,
            ),
        ),
        (
            "categorical_encoder",
            pp.CategoricalEncoder(variables=feature_cfg.categorical_vars),
        ),
        ("scaler", MinMaxScaler()),
        ("classifier", LogisticRegression()),
    ]
    return Pipeline(steps)
# Numerical features handed to the log transformer.
NUMERICALS_LOG_VARS = ["LotFrontage", "1stFlrSF", "GrLivArea"]

# Numerical features that contain NA values in the training set.
NUMERICAL_VARS_WITH_NA = ["LotFrontage"]

# Categorical features handed to the rare-label and categorical encoders.
CATEGORICAL_VARS = [
    "MSZoning",
    "Neighborhood",
    "RoofStyle",
    "MasVnrType",
    "BsmtQual",
    "BsmtExposure",
    "HeatingQC",
    "CentralAir",
    "KitchenQual",
    "FireplaceQu",
    "GarageType",
    "GarageFinish",
    "PavedDrive",
]

# Preprocessing + Lasso regression pipeline.
# CATEGORICAL_VARS_WITH_NA, TEMPORAL_VARS and DROP_FEATURES are expected to be
# defined earlier in the module. Step names are runtime keys and are kept
# verbatim (including the 'numerical_inputer' spelling).
price_pipe = Pipeline(
    [
        # --- imputation ---
        (
            "categorical_imputer",
            pp.CategoricalImputer(variables=CATEGORICAL_VARS_WITH_NA),
        ),
        (
            "numerical_inputer",
            pp.NumericalImputer(variables=NUMERICAL_VARS_WITH_NA),
        ),
        # --- feature engineering ---
        # NOTE(review): `variables` and `reference_variable` receive the same
        # value here; confirm this is intended for TemporalVariableEstimator.
        (
            "temporal_variable",
            pp.TemporalVariableEstimator(
                variables=TEMPORAL_VARS,
                reference_variable=TEMPORAL_VARS,
            ),
        ),
        (
            "rare_label_encoder",
            pp.RareLabelCategoricalEncoder(
                tol=0.01,
                variables=CATEGORICAL_VARS,
            ),
        ),
        (
            "categorical_encoder",
            pp.CategoricalEncoder(variables=CATEGORICAL_VARS),
        ),
        (
            "log_transformer",
            pp.LogTransformer(variables=NUMERICALS_LOG_VARS),
        ),
        (
            "drop_features",
            pp.DropUnecessaryFeatures(variables_to_drop=DROP_FEATURES),
        ),
        # --- scaling and model ---
        ("scaler", MinMaxScaler()),
        ("Linear_model", Lasso(alpha=0.005, random_state=0)),
    ]
)
), ( "numerical_inputer", pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA), ), ( "temporal_variable", pp.TemporalVariableEstimator(variables=config.TEMPORAL_VARS, reference_variable=config.DROP_FEATURES), ), ( "rare_label_encoder", pp.RareLabelCategoricalEncoder(tol=0.01, variables=config.CATEGORICAL_VARS), ), ( "categorical_encoder", pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS), ), # ( # "log_transformer", # features.LogTransformer(variables=config.NUMERICALS_LOG_VARS), # ), ( "drop_features", pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES), ), ("scaler", MinMaxScaler()), ("Linear_model", Lasso(alpha=0.005, random_state=0)), ])
import config review_pipe = Pipeline([ ("f1", FeatureUnion([ ("p1", Pipeline([ ('text_cleaner_1', pp.TextCleaner(variables=config.TEXT_FEATURES)), ('text_lematizer_1', pp.TextLematizer(variables=config.TEXT_FEATURES)), ('create_length_feature_1', pp.CreateLengthFeature(variables=config.TEXT_FEATURES)), ('standrd_scaling_numeric_1', pp.StandardScalarNumeric(variables=config.NUMERIC_FEATURES)), ('drop_features_1', pp.DropUnecessaryFeatures()), ])), ("p2", Pipeline([ ('text_cleaner_2', pp.TextCleaner(variables=config.TEXT_FEATURES)), ('text_lematizer_2', pp.TextLematizer(variables=config.TEXT_FEATURES)), ('tfidf_converter_text_2', pp.TfidfConverterText()), ])), ("p3", Pipeline([ ('text_cleaner_3', pp.TextCleaner(variables=config.TEXT_FEATURES)), ('text_lematizer_3', pp.TextLematizer(variables=config.TEXT_FEATURES)),