Python Select Examples, ml_tooling.transformers.Select Python Examples

Example #1

0

Show file

def test_df_selector_returns_correct_dataframe(categorical, container):
    """Select built from ``container`` returns a DataFrame with only ``category_a``.

    Both arguments are pytest fixtures defined outside this snippet;
    presumably ``container`` holds the single column name ``category_a``
    -- confirm against the fixture definition.
    """
    selector = Select(container)
    selected = selector.fit_transform(categorical)

    assert isinstance(selected, pd.DataFrame)
    # The selection must not drop or add rows.
    assert len(selected) == len(categorical)
    # Only the requested column may remain.
    assert set(selected.columns) == {'category_a'}

Example #2

0

Show file

File: test_transformers.py Project: andersbogsnes/ml_tooling

    def test_df_selector_raise_missing_column(self, categorical: pd.DataFrame):
        """Select.fit_transform must raise TransformerError for this column list.

        Presumably ``category_c`` is the column missing from the
        ``categorical`` fixture -- confirm against the fixture definition.
        """
        requested = ["category_a", "category_b", "category_c"]
        expected_message = "The DataFrame does not include the columns:"
        selector = Select(requested)

        with pytest.raises(TransformerError, match=expected_message):
            selector.fit_transform(categorical)

Example #3

0

Show file

def test_df_selector_with_multiple_columns(categorical):
    """Selecting two columns returns a DataFrame holding exactly those two."""
    wanted = ['category_a', 'category_b']
    selected = Select(wanted).fit_transform(categorical)

    assert isinstance(selected, pd.DataFrame)
    # Row count is unchanged by column selection.
    assert len(selected) == len(categorical)
    assert set(selected.columns) == {'category_a', 'category_b'}

Example #4

0

Show file

File: test_transformers.py Project: andersbogsnes/ml_tooling

    def test_df_selector_with_multiple_columns(
            self, categorical: pd.DataFrame):
        """Selecting two columns returns a DataFrame holding exactly those two."""
        wanted = ["category_a", "category_b"]
        selected = Select(wanted).fit_transform(categorical)

        assert isinstance(selected, pd.DataFrame)
        # Row count is unchanged by column selection.
        assert len(selected) == len(categorical)
        assert set(selected.columns) == {"category_a", "category_b"}

Example #5

0

Show file

def test_df_selector_raise_missing_column(categorical):
    """Select.fit_transform must raise TransformerError for this column list.

    The ``message`` keyword that used to be passed to ``pytest.raises`` was
    deprecated in pytest 4.1 and removed in pytest 5.0, where supplying it
    makes the ``pytest.raises`` call itself fail with a ``TypeError``. It is
    dropped here: when the expected exception is not raised, pytest already
    fails the test with its own "DID NOT RAISE" report.
    """
    select = Select(['category_a', 'category_b', 'category_c'])

    # ``match`` is applied as a regex search against the exception text.
    with pytest.raises(
            TransformerError,
            match="The DataFrame does not include the columns:"):
        select.fit_transform(categorical)

Example #6

0

Show file

def test_featureunion_returns_concatenated_df(categorical, numerical):
    """DFFeatureUnion output is a DataFrame with every row and 8 columns."""
    combined = pd.concat([categorical, numerical], axis=1)

    # Branch 1: the two category columns passed through ToCategorical.
    category_branch = make_pipeline(
        Select(['category_a', 'category_b']),
        ToCategorical(),
    )
    # Branch 2: the two number columns, selected as-is.
    number_branch = Select(['number_a', 'number_b'])
    union = DFFeatureUnion([
        ('category', category_branch),
        ('number', number_branch),
    ])

    transformed = union.fit_transform(combined)

    assert isinstance(transformed, pd.DataFrame)
    assert len(transformed.columns) == 8
    assert len(transformed) == len(combined)

Example #7

0

Show file

def feature_union_classifier() -> Pipeline:
    """Build a classifier pipeline over two select-then-scale branches.

    The sepal columns and the petal columns are each selected and passed
    through ``DFStandardScaler``; the branches are joined by a
    ``DFFeatureUnion`` and fed to ``LogisticRegression(solver="liblinear")``.
    """

    def _select_and_scale(columns):
        # One branch: pick the given columns, then apply DFStandardScaler.
        return Pipeline([
            ("select", Select(columns)),
            ("scale", DFStandardScaler()),
        ])

    sepal_branch = _select_and_scale(["sepal length (cm)", "sepal width (cm)"])
    petal_branch = _select_and_scale(["petal length (cm)", "petal width (cm)"])

    features = DFFeatureUnion(
        transformer_list=[("pipe1", sepal_branch), ("pipe2", petal_branch)],
    )
    classifier = LogisticRegression(solver="liblinear")
    return Pipeline([("features", features), ("estimator", classifier)])

Example #8

0

Show file

File: test_transformers.py Project: andersbogsnes/ml_tooling

    def test_featureunion_returns_concatenated_df(
            self, categorical: pd.DataFrame, numerical: pd.DataFrame):
        """DFFeatureUnion output is a DataFrame with every row and 8 columns."""
        combined = pd.concat([categorical, numerical], axis=1)

        # Branch 1: the two category columns passed through ToCategorical.
        category_branch = make_pipeline(
            Select(["category_a", "category_b"]),
            ToCategorical(),
        )
        # Branch 2: the two number columns, selected as-is.
        number_branch = Select(["number_a", "number_b"])
        union = DFFeatureUnion([
            ("category", category_branch),
            ("number", number_branch),
        ])

        transformed = union.fit_transform(combined)

        assert isinstance(transformed, pd.DataFrame)
        assert len(transformed.columns) == 8
        assert len(transformed) == len(combined)

Example #9

0

Show file

File: test_transformers.py Project: andersbogsnes/ml_tooling

    def test_standard_scaler_works_in_pipeline_with_feature_union(
            self, numerical: pd.DataFrame):
        """DFStandardScaler after a DFFeatureUnion matches hand-scaled values."""
        # 1.118033988749895 == sqrt(1.25). The hard-coded means and standard
        # deviation presumably match the ``numerical`` fixture's contents --
        # confirm against the fixture definition.
        std = 1.118033988749895
        expected = numerical.copy()
        for column, mean in (("number_a", 2.5), ("number_b", 6.5)):
            expected[column] = (numerical[column] - mean) / std

        branches = [
            ("number_a", Select(["number_a"])),
            ("number_b", Select(["number_b"])),
        ]
        pipeline = make_pipeline(DFFeatureUnion(branches), DFStandardScaler())
        actual = pipeline.fit_transform(numerical)

        pd.testing.assert_frame_equal(actual, expected)

Example #10

0

Show file

def test_DFStandardScaler_works_in_pipeline_with_DFFeatureUnion(
        categorical, numerical):
    """DFStandardScaler after a DFFeatureUnion matches hand-scaled values.

    ``categorical`` is requested as a fixture but not used in the body.
    """
    # 1.118033988749895 == sqrt(1.25). The hard-coded means and standard
    # deviation presumably match the ``numerical`` fixture's contents --
    # confirm against the fixture definition.
    std = 1.118033988749895
    expected = numerical.copy()
    for column, mean in (('number_a', 2.5), ('number_b', 6.5)):
        expected[column] = (numerical[column] - mean) / std

    branches = [
        ('number_a', Select(['number_a'])),
        ('number_b', Select(['number_b'])),
    ]
    pipeline = make_pipeline(DFFeatureUnion(branches), DFStandardScaler())
    actual = pipeline.fit_transform(numerical)

    pd.testing.assert_frame_equal(actual, expected)

Example #11

0

Show file

"""
host_since
==========
When the host started hosting. Hypothesis: having been a host for longer affects the price - experienced hosts might be able to charge a different price.
For our solution, we can set it to 0 or ask.

Is a date - note that date is not a dtype, but we can set read_csv to parse it automatically as a date
dtype: datetime
"""
from ml_tooling.transformers import Select, DateEncoder
from sklearn.pipeline import Pipeline

# Pick the ``host_since`` column, then pass it through DateEncoder.
host_since = Pipeline([
    ("select", Select("host_since")),
    ("date_encoder", DateEncoder()),
])

Example #12

0

Show file

"""
square_feet
===========
Size of rental. Should affect price

Area in whole feet. Potentially very large
dtype: Int64
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick the ``square_feet`` column, then apply FillNA(0).
square_feet = Pipeline([
    ("select", Select("square_feet")),
    ("fill_na", FillNA(0)),
])

Example #13

0

Show file

File: guests_included.py Project: andersbogsnes/airbnb_priceforecaster

"""
guests_included
===============
How many guests are included in the price. Directly impacts price

A count of guests. Max value is 16
dtype: Int8

"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: only picks the ``guests_included`` column.
guests_included = Pipeline([("select", Select("guests_included"))])

Example #14

0

Show file

File: test_transformers.py Project: andersbogsnes/ml_tooling

 def test_works_without_args(self):
     """Select can be constructed with no arguments and the result is truthy."""
     selector = Select()
     assert selector

Example #15

0

Show file

File: test_transformers.py Project: andersbogsnes/ml_tooling

 def test_df_selector_works_gridsearch(self, train_iris_dataset):
     """Scoring a grid search built around Select yields a Result."""
     selector = Select("sepal length (cm)")
     model = Model(create_gridsearch(selector))
     scored = model.score_estimator(train_iris_dataset)
     assert isinstance(scored, Result)

Example #16

0

Show file

File: cleaning_fee.py Project: andersbogsnes/airbnb_priceforecaster

"""
cleaning_fee
============
What's the cleaning fee. Affects price directly

Is a float, but is prepended with `$`. Read as string and preprocess
dtype: string
"""

from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick ``cleaning_fee``, then apply FillNA with 0 and indicate_nan=True
# (presumably this also flags which rows were missing -- confirm in FillNA).
cleaning_fee = Pipeline([("select", Select("cleaning_fee")),
                         ("fill_na", FillNA(0, indicate_nan=True))])

Example #17

0

Show file

"""
beds
====
How many beds. More should increase price

A count of beds. Max value is 25
dtype: Int8
"""

from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick ``beds``, then apply FillNA with 127 and indicate_nan=True.
# NOTE(review): 127 is the Int8 maximum and looks like a sentinel value,
# given the documented maximum of 25 -- confirm the intent.
beds = Pipeline([
    ("select", Select("beds")),
    ("fill_na", FillNA(127, indicate_nan=True)),
])

Example #18

0

Show file

"""
zipcode
=======
Zipcode (postnr) of the location

It's a common pitfall to convert to a number, but there is a possibility of a leading zero. This is very rare and only applies to old zipcodes and some military installations. [reference](https://da.wikipedia.org/wiki/Postnumre_i_Danmark)

To be certain, this should either be a categorical or a string, but an int should be OK for this usecase.

In this case, we are getting errors, since some of the zipcodes are missing. In addition, some of the zipcodes are not numbers, such as '2400 Kbh NV' or '2100 ø'. These need to be cleaned up as well. We need to import it as str to begin with
dtype: string
"""

from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: only picks the ``zipcode`` column.
zipcode = Pipeline([("select", Select("zipcode"))])

Example #19

0

Show file

"""
room_type
=========
What kind of room - Private room / shared room / entire apt etc. Should definitely affect price

Categorical
dtype: category
"""

from ml_tooling.transformers import Select, ToCategorical
from sklearn.pipeline import Pipeline

# Pick the ``room_type`` column, then pass it through ToCategorical.
room_type = Pipeline([
    ("select", Select("room_type")),
    ("categorical", ToCategorical()),
])

Example #20

0

Show file

File: bed_type.py Project: andersbogsnes/airbnb_priceforecaster

"""
bed_type
========
What type of bed is available. Better bed should increase price

Type of bed
dtype: category
"""

from ml_tooling.transformers import Select, ToCategorical
from sklearn.pipeline import Pipeline

# Pick the ``bed_type`` column, then pass it through ToCategorical.
bed_type = Pipeline([
    ("select", Select("bed_type")),
    ("categorical", ToCategorical()),
])

Example #21

0

Show file

File: security_deposit.py Project: andersbogsnes/airbnb_priceforecaster

"""
security_deposit
================
What's the security deposit. Might be related to the price

Is a float, but is prepended with `$`. Read as string and preprocess
dtype: string
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick ``security_deposit``, then apply FillNA with 0 and indicate_nan=True
# (presumably this also flags which rows were missing -- confirm in FillNA).
security_deposit = Pipeline([
    ("select", Select("security_deposit")),
    ("fill_na", FillNA(0, indicate_nan=True)),
])

Example #22

0

Show file

"""
property_type
=============
What kind of property it is. House/Apartment/Room etc. Should definitely affect price

Categorical with a number of rare categories
dtype: category
"""

from ml_tooling.transformers import Select, RareFeatureEncoder, ToCategorical
from sklearn.pipeline import Pipeline

# Pick ``property_type``, apply RareFeatureEncoder(threshold=50), then
# pass the result through ToCategorical.
property_type = Pipeline([
    ("select", Select("property_type")),
    ("rare_features", RareFeatureEncoder(threshold=50)),
    ("categorical", ToCategorical()),
])

Example #23

0

Show file

"""
host_response_time
==================
How long does it take for the host to accept/decline an offer. Hypothesis that this could be an indicator of "seriousness" which could affect the price

Categorical with 4 levels
dtype: category
"""
from ml_tooling.transformers import Select, FillNA, ToCategorical
from sklearn.pipeline import Pipeline

# Pick ``host_response_time``, apply FillNA with "unknown" and
# indicate_nan=True, then pass the result through ToCategorical.
host_response_time = Pipeline([("select", Select("host_response_time")),
                               ("fill_na", FillNA("unknown",
                                                  indicate_nan=True)),
                               ("categorical", ToCategorical())])

Example #24

0

Show file

File: host_identity_verified.py Project: andersbogsnes/airbnb_priceforecaster

"""
host_identity_verified
======================
Is the host verified. Hypothesis that this increases confidence and thus price

Is a bool, but represented as 't'/'f'. Read as string and preprocess
dtype: string
"""

from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: only picks the ``host_identity_verified`` column.
host_identity_verified = Pipeline([
    ("select", Select("host_identity_verified")),
])

Example #25

0

Show file

File: house_rules_len.py Project: andersbogsnes/airbnb_priceforecaster

"""
house_rules
===========
What rules the guest must follow. Would try to extract some simple rules such as smoking allowed
or similar

House rules will be used to extract features, such as `is_no_smoking` or an indicator
that there are a lot of rules
dtype: string
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick the ``house_rules_len`` column, then apply FillNA(0).
house_rules_len = Pipeline([
    ("select", Select("house_rules_len")),
    ("fill_na", FillNA(0)),
])

Example #26

0

Show file

File: test_transformers.py Project: andersbogsnes/ml_tooling

 def test_df_selector_works_cross_validated(self, train_iris_dataset):
     """Scoring a model built around Select with cv=2 yields a Result."""
     selector = Select("sepal length (cm)")
     model = create_model(selector)
     scored = model.score_estimator(train_iris_dataset, cv=2)
     assert isinstance(scored, Result)

Example #27

0

Show file

File: extra_people.py Project: andersbogsnes/airbnb_priceforecaster

"""
extra_people
============
How much more for extra people. Directly impacts price

Is a float, but is prepended with `$`. Read as string and preprocess
dtype: string
"""

from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: only picks the ``extra_people`` column.
extra_people = Pipeline([("select", Select("extra_people"))])

Example #28

0

Show file

File: host_acceptance_rate.py Project: andersbogsnes/airbnb_priceforecaster

from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Pick ``host_acceptance_rate``, then apply FillNA with 127 and
# indicate_nan=True.
# NOTE(review): 127 looks like a sentinel fill value -- confirm the intent.
host_acceptance_rate = Pipeline([("select", Select("host_acceptance_rate")),
                                 ("fill_na", FillNA(127, indicate_nan=True))])

Example #29

0

Show file

"""
host_has_profile_pic
====================
Does the host have a profile picture? Hypothesis that this could increase confidence and thus price

Is a bool, but represented as 't'/'f'. Read as string and preprocess
dtype: string
"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: only picks the ``host_has_profile_pic`` column.
host_has_profile_pic = Pipeline([
    ("select", Select("host_has_profile_pic")),
])

Example #30

0

Show file

    'accessible-height_toilet',
    'wide_clearance_to_shower',
    '_toilet',
    'accessible-height_bed',
    'wide_entryway',
    'sun_loungers',
    'outdoor_parking',
    'tennis_court',
    'outdoor_kitchen',
    'air_purifier',
    'kitchenette',
    'roll-in_shower',
    'gas_oven',
    'steam_oven',
    'pillow-top_mattress',
    'fire_pit',
    'printer',
    'standing_valet',
    'ceiling_fan',
    'memory_foam_mattress',
    'amazon_echo',
    'projector_and_screen',
    'en_suite_bathroom',
    'central_air_conditioning',
    'mini_fridge',
    'beach_view',
    'double_oven'
]

# Selector built from every name collected in ``amenities_list`` above.
amenities = Select(amenities_list)