Esempio n. 1
0
def pipeline(config):
    """Build the preprocessing-plus-classifier pipeline.

    Args:
        config: Object exposing a ``variables`` attribute. Its
            ``drop_features``, ``numerical_vars_from_numerical``,
            ``categorical_vars`` and ``categorical_label_extraction``
            fields are forwarded to the preprocessing steps below.

    Returns:
        A newly constructed ``Pipeline`` whose last step is a
        ``LogisticRegression`` classifier.
    """
    selected = config.variables

    # Each step is created in the same order it runs in the pipeline.
    feature_dropper = pp.DropUnecessaryFeatures(
        variables_to_drop=selected.drop_features)
    to_numerical = pp.CategoricalToNumerical(
        variables=selected.numerical_vars_from_numerical)
    numerical_imputer = pp.NumericalImputer()
    categorical_imputer = pp.CategoricalImputer(
        variables=selected.categorical_vars)
    label_extractor = pp.LabelExtraction(
        variables=selected.categorical_label_extraction)
    rare_label_encoder = pp.RareLabelCategoricalEncoder(
        tol=0.01, variables=selected.categorical_vars)
    categorical_encoder = pp.CategoricalEncoder(
        variables=selected.categorical_vars)
    scaler = MinMaxScaler()
    classifier = LogisticRegression()

    # Steps that were present but disabled (commented out) in this pipeline:
    #   - 'temporal_variable': pp.TemporalVariableEstimator(
    #         variables=config.TEMPORAL_VARS,
    #         reference_variable=config.DROP_FEATURES), after the imputers
    #   - 'feature hashing': FeatureHasher(n_features=10,
    #         input_type='string'), after the categorical encoder
    #   - 'log_transformer': pp.LogTransformer(), before the scaler
    steps = [
        ('drop_fatures', feature_dropper),
        ('categorical_to_numerical', to_numerical),
        ('numerical_imputer', numerical_imputer),
        ('categorical_imputer', categorical_imputer),
        ('label extraction', label_extractor),
        ('rare label encoder', rare_label_encoder),
        ('categorical_encoder', categorical_encoder),
        ('scaler', scaler),
        ('classifier', classifier),
    ]
    return Pipeline(steps)
Esempio n. 2
0
# Numerical variables that receive a log transform.
NUMERICALS_LOG_VARS = [
    "LotFrontage",
    "1stFlrSF",
    "GrLivArea",
]

# Numerical variables that contain NA values in the train set.
NUMERICAL_VARS_WITH_NA = ["LotFrontage"]

# Categorical variables that get encoded.
CATEGORICAL_VARS = [
    "MSZoning",
    "Neighborhood",
    "RoofStyle",
    "MasVnrType",
    "BsmtQual",
    "BsmtExposure",
    "HeatingQC",
    "CentralAir",
    "KitchenQual",
    "FireplaceQu",
    "GarageType",
    "GarageFinish",
    "PavedDrive",
]

# Price-prediction pipeline: project preprocessing steps from `pp`, then
# min-max scaling, then a Lasso model as the final step.
price_pipe = Pipeline([
    # Imputation steps.
    (
        "categorical_imputer",
        pp.CategoricalImputer(variables=CATEGORICAL_VARS_WITH_NA),
    ),
    (
        "numerical_inputer",
        pp.NumericalImputer(variables=NUMERICAL_VARS_WITH_NA),
    ),
    # NOTE(review): `variables` and `reference_variable` both receive
    # TEMPORAL_VARS here -- confirm this is intended.
    (
        "temporal_variable",
        pp.TemporalVariableEstimator(
            variables=TEMPORAL_VARS,
            reference_variable=TEMPORAL_VARS,
        ),
    ),
    # Categorical handling.
    (
        "rare_label_encoder",
        pp.RareLabelCategoricalEncoder(tol=0.01, variables=CATEGORICAL_VARS),
    ),
    (
        "categorical_encoder",
        pp.CategoricalEncoder(variables=CATEGORICAL_VARS),
    ),
    # Numerical transform, feature removal and scaling.
    (
        "log_transformer",
        pp.LogTransformer(variables=NUMERICALS_LOG_VARS),
    ),
    (
        "drop_features",
        pp.DropUnecessaryFeatures(variables_to_drop=DROP_FEATURES),
    ),
    ("scaler", MinMaxScaler()),
    # Final estimator.
    ("Linear_model", Lasso(alpha=0.005, random_state=0)),
])
Esempio n. 3
0
    ),
    (
        "numerical_inputer",
        pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA),
    ),
    (
        "temporal_variable",
        pp.TemporalVariableEstimator(variables=config.TEMPORAL_VARS,
                                     reference_variable=config.DROP_FEATURES),
    ),
    (
        "rare_label_encoder",
        pp.RareLabelCategoricalEncoder(tol=0.01,
                                       variables=config.CATEGORICAL_VARS),
    ),
    (
        "categorical_encoder",
        pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS),
    ),
    # (
    #     "log_transformer",
    #     features.LogTransformer(variables=config.NUMERICALS_LOG_VARS),
    # ),
    (
        "drop_features",
        pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES),
    ),
    ("scaler", MinMaxScaler()),
    ("Linear_model", Lasso(alpha=0.005, random_state=0)),
])
import config

review_pipe = Pipeline([
    ("f1",
     FeatureUnion([
         ("p1",
          Pipeline([
              ('text_cleaner_1',
               pp.TextCleaner(variables=config.TEXT_FEATURES)),
              ('text_lematizer_1',
               pp.TextLematizer(variables=config.TEXT_FEATURES)),
              ('create_length_feature_1',
               pp.CreateLengthFeature(variables=config.TEXT_FEATURES)),
              ('standrd_scaling_numeric_1',
               pp.StandardScalarNumeric(variables=config.NUMERIC_FEATURES)),
              ('drop_features_1', pp.DropUnecessaryFeatures()),
          ])),
         ("p2",
          Pipeline([
              ('text_cleaner_2',
               pp.TextCleaner(variables=config.TEXT_FEATURES)),
              ('text_lematizer_2',
               pp.TextLematizer(variables=config.TEXT_FEATURES)),
              ('tfidf_converter_text_2', pp.TfidfConverterText()),
          ])),
         ("p3",
          Pipeline([
              ('text_cleaner_3',
               pp.TextCleaner(variables=config.TEXT_FEATURES)),
              ('text_lematizer_3',
               pp.TextLematizer(variables=config.TEXT_FEATURES)),