Beispiel #1
0
def cleaning_and_feature_pipeline():
    """Build the cleaning and feature-engineering pipeline.

    Returns:
        A ``Pipeline`` whose stages, in order, add the 'Title' and
        'FamilyCount' columns, drop unused columns, convert categorical
        columns to numeric, median-impute 'Age', 'Fare' and 'Embarked',
        bin 'Age' into 3 groups and min-max scale the columns.
    """

    def extract_title(data):
        return parse_titles(data.Name)

    def count_family(data):
        return data.Parch + data.SibSp

    def as_single_feature(col):
        # The scikit-learn transformers used below expect 2-D input, so
        # each column is reshaped to (n_samples, 1).
        return col.values.reshape(-1, 1)

    def impute_median(col):
        imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
        return imputer.fit_transform(as_single_feature(col))

    def scale_min_max(col):
        scaler = MinMaxScaler()
        return scaler.fit_transform(as_single_feature(col))

    unused_columns = [
        'PassengerId', 'Name', 'Cabin', 'Parch', 'SibSp', 'Ticket'
    ]
    categorical_columns = ['Title', 'Sex', 'Embarked']
    imputed_columns = ['Age', 'Fare', 'Embarked']

    stages = [
        # Feature engineer new columns
        AddColumnPipe(extract_title, 'Title'),
        AddColumnPipe(count_family, 'FamilyCount'),

        # Drop columns that do not affect the model
        DropColumnPipe(unused_columns),

        # Turn categorical variables into numeric
        CategoryToNumericPipe(categorical_columns),

        # Deal with missing data
        MapColumnPipe(impute_median, columns=imputed_columns),

        # Group Age into 3 categories
        VariableToBinPipe(bins=3, columns=['Age']),

        # Normalize (no column list given -- presumably applies to every
        # column; confirm against MapColumnPipe)
        MapColumnPipe(scale_min_max),
    ]
    return Pipeline(stages)
Beispiel #2
0
    def test_flush_dataIsNone_error(self):
        """Flushing ``None`` through a pipeline raises ``AssertionError``."""
        single_stage = Pipeline([identity])

        # Callable form of assertRaises: invokes single_stage.flush(None)
        # and fails the test unless AssertionError is raised.
        self.assertRaises(AssertionError, single_stage.flush, None)
Beispiel #3
0
def train_pipeline():
    """Build the training pipeline.

    Returns:
        A ``Pipeline`` whose stages run in this order: feature engineering
        (summed 'ExtraArea', outlier removal, log transform), column
        dropping, missing-value handling, categorical-to-numeric
        conversion, mean imputation and min-max scaling.  ``PeekPipe``
        stages in between print or inspect intermediate data.
    """

    def extra_area(data):
        # Left-to-right sum of the area columns, same order as listed.
        total = data['GarageArea']
        for name in ('GrLivArea', '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF'):
            total = total + data[name]
        return total

    def peek_lot_frontage(data):
        return inspect_features(data, ['LotFrontage'])

    def mark_absent(col):
        return col.fillna(-1)

    def report_missing(data):
        # Print only the columns that still contain NaNs, with counts.
        null_counts = data.isnull().sum()
        print(null_counts[null_counts > 0])

    def as_single_feature(col):
        # The scikit-learn transformers used below expect 2-D input.
        return col.values.reshape(-1, 1)

    def impute_mean(col):
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        return imputer.fit_transform(as_single_feature(col))

    def scale_min_max(col):
        return MinMaxScaler().fit_transform(as_single_feature(col))

    def report_shape(data):
        print(data.shape)

    log_columns = ['LotArea', 'ExtraArea', 'LotFrontage']

    dropped_columns = [
        'Id',
        'MSSubClass',
        'ExterCond',
        'Exterior2nd',
        'Utilities',
        'GarageCond',
        'GarageQual',
        'GarageType',
        'RoofMatl',
        'RoofStyle',
        'Heating',
        'HeatingQC',
        'Street',
        'MiscFeature',
        'MiscVal',
        'BsmtFinType2',
        'BsmtHalfBath',
        'BsmtFullBath',
        'FullBath',
        'HalfBath',
        'BsmtExposure',
        'Fence',
        'SaleType',
        'Alley',
        'LandSlope',
        'LandContour',
        'Electrical',
        '3SsnPorch',
        'EnclosedPorch',
        'PavedDrive',
        'Condition2',
        'LowQualFinSF',
        'Foundation',
        'TotRmsAbvGrd',
        'PoolQC',
        'BedroomAbvGr',
        'LotConfig',
    ]

    # Columns where a NaN actually indicates absence of the feature.
    absent_when_nan = [
        'LotFrontage',
        'MasVnrType',
        'MasVnrArea',
        'BsmtFinType1',
        'FireplaceQu',
        'BsmtCond',
        'BsmtQual',
        'GarageFinish',
    ]

    stages = [
        # FEATURE ENGINEERING
        AddColumnPipe(extra_area, 'ExtraArea'),
        RemoveOutliersPipe(['LotArea', 'ExtraArea']),
        MapColumnPipe(np.log, columns=log_columns),
        PeekPipe(peek_lot_frontage),

        # FEATURE PICKING
        DropColumnPipe(dropped_columns),

        # MISSING VALUES
        MapColumnPipe(mark_absent, columns=absent_when_nan),
        # Garage build year can be filled with house build year.
        fill_garage_blt,

        # Transform features to numeric values
        CategoryToNumericPipe(['LotArea', 'LotFrontage'], excludes=True),

        # Impute the remaining NaNs with the mean
        PeekPipe(report_missing),
        MapColumnPipe(impute_mean),

        # NORMALIZE
        MapColumnPipe(scale_min_max),
        PeekPipe(report_shape),
    ]
    return Pipeline(stages)
Beispiel #4
0
    def test_flush_initial_value(self):
        """Flushing through ``identity`` then ``add_testpipe`` yields 1..10."""
        stages = [identity, add_testpipe]
        flushed = Pipeline(stages).flush(self.data)

        expected = list(range(1, 11))
        self.assertEqual(flushed, expected)
Beispiel #5
0
    def test_flush_pipes_and_functions(self):
        """A pipe object and a plain function can be mixed in one pipeline."""
        mixed_stages = [IdentityPipe(), identity]
        flushed = Pipeline(mixed_stages).flush(self.data)

        # Both stages are identities, so the data must come out unchanged.
        self.assertEqual(flushed, self.data)
Beispiel #6
0
    def test_flush_functions(self):
        """A pipeline made of a plain function returns the flushed data."""
        function_only = Pipeline([identity])
        flushed = function_only.flush(self.data)

        self.assertEqual(flushed, self.data)
Beispiel #7
0
    def test_flush_pipes(self):
        """A pipeline made of a single ``IdentityPipe`` returns the data."""
        pipe_only = Pipeline([IdentityPipe()])
        flushed = pipe_only.flush(self.data)

        self.assertEqual(flushed, self.data)
def fe_pipeline():
    """Build the feature-engineering pipeline.

    Returns:
        A ``Pipeline`` with a single stage that maps ``normalize`` over
        the columns via ``MapColumnPipe``.
    """
    stages = [
        MapColumnPipe(normalize),
        # Disabled stage, kept for reference:
        # lambda data: umap.UMAP(n_components=2, metric='correlation', verbose=1).fit_transform(data),
    ]
    return Pipeline(stages)