Example #1
0
def test_df_selector_returns_correct_dataframe(categorical, container):
    """Check the frame produced by a Select built from ``container``.

    The transformed output has to be a DataFrame with the same number of
    rows as ``categorical`` and exactly one column, ``category_a``.
    """
    selected = Select(container).fit_transform(categorical)

    assert isinstance(selected, pd.DataFrame)
    assert len(categorical) == len(selected)
    assert {'category_a'} == set(selected.columns)
    def test_df_selector_raise_missing_column(self, categorical: pd.DataFrame):
        """Expect TransformerError when selecting category_a/b/c.

        The error text must match the "does not include the columns" message.
        """
        expected_message = "The DataFrame does not include the columns:"
        selector = Select(["category_a", "category_b", "category_c"])

        with pytest.raises(TransformerError, match=expected_message):
            selector.fit_transform(categorical)
Example #3
0
def test_df_selector_with_multiple_columns(categorical):
    """Selecting two columns yields a DataFrame holding exactly those two.

    The row count of the output must equal that of ``categorical``.
    """
    selected = Select(['category_a', 'category_b']).fit_transform(categorical)

    assert isinstance(selected, pd.DataFrame)
    assert len(categorical) == len(selected)
    assert {'category_a', 'category_b'} == set(selected.columns)
    def test_df_selector_with_multiple_columns(self,
                                               categorical: pd.DataFrame):
        """Selecting two columns yields a DataFrame holding exactly those two.

        The row count of the output must equal that of ``categorical``.
        """
        selector = Select(["category_a", "category_b"])
        selected = selector.fit_transform(categorical)

        assert isinstance(selected, pd.DataFrame)
        assert len(categorical) == len(selected)
        assert {"category_a", "category_b"} == set(selected.columns)
Example #5
0
def test_df_selector_raise_missing_column(categorical):
    """Expect TransformerError when selecting category_a/b/c.

    The raised error has to match the "does not include the columns"
    message. Presumably ``category_c`` is the column absent from the
    ``categorical`` fixture -- confirm against the fixture definition.
    """
    select = Select(['category_a', 'category_b', 'category_c'])

    # NOTE: the ``message`` keyword of ``pytest.raises`` was deprecated in
    # pytest 4.1 and removed in 5.0, where passing it fails with a TypeError
    # before the code under test runs. Only ``match`` is passed here.
    with pytest.raises(
            TransformerError,
            match="The DataFrame does not include the columns:"):
        select.fit_transform(categorical)
Example #6
0
def test_featureunion_returns_concatenated_df(categorical, numerical):
    """DFFeatureUnion output is a DataFrame with 8 columns and every row.

    One branch selects the two category columns and applies ToCategorical;
    the other branch only selects the two number columns.
    """
    combined = pd.concat([categorical, numerical], axis=1)

    category_branch = make_pipeline(
        Select(['category_a', 'category_b']),
        ToCategorical(),
    )
    number_branch = Select(['number_a', 'number_b'])
    union = DFFeatureUnion([
        ('category', category_branch),
        ('number', number_branch),
    ])

    transformed = union.fit_transform(combined)

    assert isinstance(transformed, pd.DataFrame)
    assert 8 == len(transformed.columns)
    assert len(combined) == len(transformed)
Example #7
0
def feature_union_classifier() -> Pipeline:
    """Build a two-branch feature pipeline followed by logistic regression.

    Each branch selects a pair of columns (sepal or petal measurements) and
    passes them through DFStandardScaler. The branches are combined by a
    DFFeatureUnion and fed to ``LogisticRegression(solver="liblinear")``.

    Returns:
        The assembled, unfitted Pipeline.
    """
    def scaled_columns(columns):
        # Select the given columns, then run them through DFStandardScaler.
        return Pipeline([
            ("select", Select(columns)),
            ("scale", DFStandardScaler()),
        ])

    sepal_pipe = scaled_columns(["sepal length (cm)", "sepal width (cm)"])
    petal_pipe = scaled_columns(["petal length (cm)", "petal width (cm)"])
    features = DFFeatureUnion(
        transformer_list=[("pipe1", sepal_pipe), ("pipe2", petal_pipe)])
    classifier = LogisticRegression(solver="liblinear")
    return Pipeline([("features", features), ("estimator", classifier)])
    def test_featureunion_returns_concatenated_df(self,
                                                  categorical: pd.DataFrame,
                                                  numerical: pd.DataFrame):
        """DFFeatureUnion output is a DataFrame with 8 columns and every row.

        One branch selects the two category columns and applies
        ToCategorical; the other branch only selects the two number columns.
        """
        combined = pd.concat([categorical, numerical], axis=1)

        category_branch = make_pipeline(
            Select(["category_a", "category_b"]),
            ToCategorical(),
        )
        number_branch = Select(["number_a", "number_b"])
        union = DFFeatureUnion([
            ("category", category_branch),
            ("number", number_branch),
        ])

        transformed = union.fit_transform(combined)

        assert isinstance(transformed, pd.DataFrame)
        assert 8 == len(transformed.columns)
        assert len(combined) == len(transformed)
    def test_standard_scaler_works_in_pipeline_with_feature_union(
            self, numerical: pd.DataFrame):
        """DFStandardScaler placed after a DFFeatureUnion gives the expected frame.

        The expected frame is built by subtracting 2.5 (``number_a``) or 6.5
        (``number_b``) and dividing by 1.118033988749895 -- presumably the
        fixture's column means and standard deviation; confirm against the
        ``numerical`` fixture.
        """
        expected = numerical.copy()
        for column, offset in (("number_a", 2.5), ("number_b", 6.5)):
            expected[column] = (numerical[column] -
                                offset) / 1.118033988749895

        selectors = [(column, Select([column]))
                     for column in ("number_a", "number_b")]
        pipeline = make_pipeline(DFFeatureUnion(selectors),
                                 DFStandardScaler())
        result = pipeline.fit_transform(numerical)

        pd.testing.assert_frame_equal(result, expected)
Example #10
0
def test_DFStandardScaler_works_in_pipeline_with_DFFeatureUnion(
        categorical, numerical):
    """DFStandardScaler placed after a DFFeatureUnion gives the expected frame.

    The expected frame is built by subtracting 2.5 (``number_a``) or 6.5
    (``number_b``) and dividing by 1.118033988749895 -- presumably the
    fixture's column means and standard deviation; confirm against the
    ``numerical`` fixture. ``categorical`` is accepted but not used here.
    """
    expected = numerical.copy()
    for column, offset in (('number_a', 2.5), ('number_b', 6.5)):
        expected[column] = (numerical[column] - offset) / 1.118033988749895

    selectors = [(column, Select([column]))
                 for column in ('number_a', 'number_b')]
    pipeline = make_pipeline(DFFeatureUnion(selectors), DFStandardScaler())
    result = pipeline.fit_transform(numerical)

    pd.testing.assert_frame_equal(result, expected)
Example #11
0
"""
host_since
==========
When started hosting. Hypothesis that being a host for longer affects the price - they might be able to charge a different price.
For our solution, we can set it to 0 or ask.

Is a date - note that date is not a dtype, but we can set read_csv to parse it automatically as a date
dtype: datetime
"""
from ml_tooling.transformers import Select, DateEncoder
from sklearn.pipeline import Pipeline

# Pick the host_since column and pass it through DateEncoder.
host_since = Pipeline(steps=[
    ("select", Select("host_since")),
    ("date_encoder", DateEncoder()),
])
Example #12
0
"""
square_feet
===========
Size of rental. Should affect price

Area in whole feet. Potentially very large
dtype: Int64
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick the square_feet column and fill missing values with 0.
square_feet = Pipeline(steps=[
    ("select", Select("square_feet")),
    ("fill_na", FillNA(0)),
])
"""
guests_included
===============
How many guests are included in the price. Directly impacts price

A count of guests. Max value is 16
dtype: Int8

"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: only the guests_included column is selected.
guests_included = Pipeline(steps=[("select", Select("guests_included"))])
 def test_works_without_args(self):
     """A Select constructed with no arguments must be truthy."""
     selector = Select()
     assert selector
 def test_df_selector_works_gridsearch(self, train_iris_dataset):
     """Scoring a Model wrapping ``create_gridsearch(Select(...))`` gives a Result."""
     selector = Select("sepal length (cm)")
     model = Model(create_gridsearch(selector))
     outcome = model.score_estimator(train_iris_dataset)
     assert isinstance(outcome, Result)
"""
cleaning_fee
============
What's the cleaning fee. Affects price directly

Is a float, but is prepended with `$`. Read as string and preprocess
dtype: string
"""

from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick the cleaning_fee column, then fill missing values with 0
# (FillNA is called with indicate_nan=True).
cleaning_fee = Pipeline(steps=[
    ("select", Select("cleaning_fee")),
    ("fill_na", FillNA(0, indicate_nan=True)),
])
Example #17
0
"""
beds
====
How many beds. More should increase price

A count of beds. Max value is 25
dtype: Int8
"""

from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick the beds column, then fill missing values with 127
# (FillNA is called with indicate_nan=True).
# NOTE(review): 127 is presumably the Int8 maximum used as a sentinel --
# confirm this is intended.
beds = Pipeline(steps=[
    ("select", Select("beds")),
    ("fill_na", FillNA(127, indicate_nan=True)),
])
Example #18
0
"""
zipcode
=======
Zipcode (postnr) of the location

It's a common pitfall to convert to a number, but there is a possibility of a leading zero. This is very rare and only applies to old zipcodes and some military installations. [reference](https://da.wikipedia.org/wiki/Postnumre_i_Danmark)

To be certain, this should either be a categorical or a string, but an int should be OK for this use case.

In this case, we are getting errors, since some of the zipcodes are missing. In addition, some of the zipcodes are not numbers, such as '2400 Kbh NV' or '2100 ø'. These need to be cleaned up as well. We need to import it as str to begin with
dtype: string
"""

from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: only the zipcode column is selected.
zipcode = Pipeline(steps=[("select", Select("zipcode"))])

Example #19
0
"""
room_type
=========
What kind of room - Private room / shared room / entire apt etc. Should definitely affect price

Categorical
dtype: category
"""

from ml_tooling.transformers import Select, ToCategorical
from sklearn.pipeline import Pipeline

# Pick the room_type column and pass it through ToCategorical.
room_type = Pipeline(steps=[
    ("select", Select("room_type")),
    ("categorical", ToCategorical()),
])
"""
bed_type
========
What type of bed is available. Better bed should increase price

Type of bed
dtype: category
"""

from ml_tooling.transformers import Select, ToCategorical
from sklearn.pipeline import Pipeline

# Pick the bed_type column and pass it through ToCategorical.
bed_type = Pipeline(steps=[
    ("select", Select("bed_type")),
    ("categorical", ToCategorical()),
])
"""
security_deposit
================
What's the security deposit. Might be related to the price

Is a float, but is prepended with `$`. Read as string and preprocess
dtype: string
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick the security_deposit column, then fill missing values with 0
# (FillNA is called with indicate_nan=True).
security_deposit = Pipeline(steps=[
    ("select", Select("security_deposit")),
    ("fill_na", FillNA(0, indicate_nan=True)),
])
Example #22
0
"""
property_type
=============
What kind of property it is. House/Apartment/Room etc. Should definitely affect price

Categorical with a number of rare categories
dtype: category
"""

from ml_tooling.transformers import Select, RareFeatureEncoder, ToCategorical
from sklearn.pipeline import Pipeline

# Pick the property_type column, apply RareFeatureEncoder(threshold=50),
# then pass the result through ToCategorical.
property_type = Pipeline(steps=[
    ("select", Select("property_type")),
    ("rare_features", RareFeatureEncoder(threshold=50)),
    ("categorical", ToCategorical()),
])
Example #23
0
"""
host_response_time
==================
How long does it take for the host to accept/decline an offer. Hypothesis that this could be an indicator of "seriousness" which could affect the price

Categorical with 4 levels
dtype: category
"""
from ml_tooling.transformers import Select, FillNA, ToCategorical
from sklearn.pipeline import Pipeline

# Pick the host_response_time column, fill missing values with "unknown"
# (FillNA is called with indicate_nan=True), then apply ToCategorical.
host_response_time = Pipeline(steps=[
    ("select", Select("host_response_time")),
    ("fill_na", FillNA("unknown", indicate_nan=True)),
    ("categorical", ToCategorical()),
])
"""
host_identity_verified
======================
Is the host verified. Hypothesis that this increases confidence and thus price

Is a bool, but represented as 't'/'f'. Read as string and preprocess
dtype: string
"""

from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: only the host_identity_verified column is selected.
host_identity_verified = Pipeline(steps=[
    ("select", Select("host_identity_verified")),
])
"""
house_rules
===========
What rules the guest must follow. Would try to extract some simple rules such as smoking allowed
or similar

House rules will be used to extract features from, such as `is_no_smoking` or something
indicating a lot of rules
dtype: string
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick the house_rules_len column and fill missing values with 0.
house_rules_len = Pipeline(steps=[
    ("select", Select("house_rules_len")),
    ("fill_na", FillNA(0)),
])
 def test_df_selector_works_cross_validated(self, train_iris_dataset):
     """Scoring with ``cv=2`` on ``create_model(Select(...))`` gives a Result."""
     selector = Select("sepal length (cm)")
     model = create_model(selector)
     outcome = model.score_estimator(train_iris_dataset, cv=2)
     assert isinstance(outcome, Result)
"""
extra_people
============
How much more for extra people. Directly impacts price

Is a float, but is prepended with `$`. Read as string and preprocess
dtype: string
"""

from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: only the extra_people column is selected.
extra_people = Pipeline(steps=[("select", Select("extra_people"))])
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick the host_acceptance_rate column, then fill missing values with 127
# (FillNA is called with indicate_nan=True).
host_acceptance_rate = Pipeline(steps=[
    ("select", Select("host_acceptance_rate")),
    ("fill_na", FillNA(127, indicate_nan=True)),
])
Example #29
0
"""
host_has_profile_pic
====================
Does the host have a profile picture? Hypothesis that this could increase confidence and thus price

Is a bool, but represented as 't'/'f'. Read as string and preprocess
dtype: string
"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: only the host_has_profile_pic column is selected.
host_has_profile_pic = Pipeline(steps=[
    ("select", Select("host_has_profile_pic")),
])
Example #30
0
    'accessible-height_toilet',
    'wide_clearance_to_shower',
    '_toilet',
    'accessible-height_bed',
    'wide_entryway',
    'sun_loungers',
    'outdoor_parking',
    'tennis_court',
    'outdoor_kitchen',
    'air_purifier',
    'kitchenette',
    'roll-in_shower',
    'gas_oven',
    'steam_oven',
    'pillow-top_mattress',
    'fire_pit',
    'printer',
    'standing_valet',
    'ceiling_fan',
    'memory_foam_mattress',
    'amazon_echo',
    'projector_and_screen',
    'en_suite_bathroom',
    'central_air_conditioning',
    'mini_fridge',
    'beach_view',
    'double_oven'
]

# Select transformer constructed from the names in amenities_list.
amenities = Select(amenities_list)