Exemples Python de Select : exemples d'utilisation de ml_tooling.transformers.Select

Exemple #1

0

Afficher le fichier

def test_df_selector_returns_correct_dataframe(categorical, container):
    """Select built from ``container`` returns a DataFrame holding only ``category_a``.

    ``categorical`` and ``container`` are pytest fixtures defined outside this
    snippet; ``container`` is presumably a column specification naming
    ``category_a`` -- confirm against the fixture.
    """
    selector = Select(container)
    selected = selector.fit_transform(categorical)

    assert isinstance(selected, pd.DataFrame)
    # Column selection must leave the number of rows untouched.
    assert len(selected) == len(categorical)
    assert set(selected.columns) == {'category_a'}

Exemple #2

0

Afficher le fichier

Fichier : test_transformers.py Projet : andersbogsnes/ml_tooling

    def test_df_selector_raise_missing_column(self, categorical: pd.DataFrame):
        """Fitting a Select on columns the frame lacks raises TransformerError."""
        # NOTE(review): at least one of these (likely 'category_c') is presumably
        # absent from the ``categorical`` fixture -- confirm against the fixture.
        requested_columns = ["category_a", "category_b", "category_c"]
        selector = Select(requested_columns)

        expected_message = "The DataFrame does not include the columns:"
        with pytest.raises(TransformerError, match=expected_message):
            selector.fit_transform(categorical)

Exemple #3

0

Afficher le fichier

def test_df_selector_with_multiple_columns(categorical):
    """Select with two column names returns a DataFrame with exactly those columns."""
    wanted = ['category_a', 'category_b']
    selected = Select(wanted).fit_transform(categorical)

    assert isinstance(selected, pd.DataFrame)
    # Column selection must leave the number of rows untouched.
    assert len(selected) == len(categorical)
    assert set(selected.columns) == {'category_a', 'category_b'}

Exemple #4

0

Afficher le fichier

Fichier : test_transformers.py Projet : andersbogsnes/ml_tooling

    def test_df_selector_with_multiple_columns(self,
                                               categorical: pd.DataFrame):
        """Select with two column names returns a DataFrame with exactly those columns."""
        wanted = ["category_a", "category_b"]
        selected = Select(wanted).fit_transform(categorical)

        assert isinstance(selected, pd.DataFrame)
        # Column selection must leave the number of rows untouched.
        assert len(selected) == len(categorical)
        assert set(selected.columns) == {"category_a", "category_b"}

Exemple #5

0

Afficher le fichier

def test_df_selector_raise_missing_column(categorical):
    """Fitting a Select on columns the frame lacks raises TransformerError.

    The former ``message=`` argument to ``pytest.raises`` was dropped: it was
    deprecated in pytest 4.1 and removed in pytest 5.0, where passing it makes
    the test error out with a ``TypeError`` before the code under test runs.
    ``pytest.raises`` already fails the test when no exception is raised.
    """
    # NOTE(review): at least one of these (likely 'category_c') is presumably
    # absent from the ``categorical`` fixture -- confirm against the fixture.
    select = Select(['category_a', 'category_b', 'category_c'])

    with pytest.raises(
            TransformerError,
            match="The DataFrame does not include the columns:"):
        select.fit_transform(categorical)

Exemple #6

0

Afficher le fichier

def test_featureunion_returns_concatenated_df(categorical, numerical):
    """DFFeatureUnion combines its branches into a single DataFrame."""
    combined = pd.concat([categorical, numerical], axis=1)

    category_branch = make_pipeline(
        Select(['category_a', 'category_b']),
        ToCategorical(),
    )
    number_branch = Select(['number_a', 'number_b'])
    union = DFFeatureUnion([
        ('category', category_branch),
        ('number', number_branch),
    ])

    transformed = union.fit_transform(combined)

    assert isinstance(transformed, pd.DataFrame)
    # NOTE(review): 8 columns from 4 selected ones -- presumably ToCategorical
    # expands each categorical column into several; confirm against fixtures.
    assert len(transformed.columns) == 8
    assert len(transformed) == len(combined)

Exemple #7

0

Afficher le fichier

def feature_union_classifier() -> Pipeline:
    """Build a classifier pipeline that feeds two scaled column groups into LogisticRegression.

    The sepal columns and the petal columns are each selected and passed
    through ``DFStandardScaler`` in their own sub-pipeline; the two
    sub-pipelines are joined with ``DFFeatureUnion`` and followed by a
    ``LogisticRegression(solver="liblinear")`` estimator.
    """

    def scaled_columns(columns):
        # One branch of the union: pick the columns, then scale them.
        return Pipeline([
            ("select", Select(columns)),
            ("scale", DFStandardScaler()),
        ])

    sepal_branch = scaled_columns(["sepal length (cm)", "sepal width (cm)"])
    petal_branch = scaled_columns(["petal length (cm)", "petal width (cm)"])
    features = DFFeatureUnion(
        transformer_list=[("pipe1", sepal_branch), ("pipe2", petal_branch)]
    )
    estimator = LogisticRegression(solver="liblinear")
    return Pipeline([("features", features), ("estimator", estimator)])

Exemple #8

0

Afficher le fichier

Fichier : test_transformers.py Projet : andersbogsnes/ml_tooling

    def test_featureunion_returns_concatenated_df(self,
                                                  categorical: pd.DataFrame,
                                                  numerical: pd.DataFrame):
        """DFFeatureUnion combines its branches into a single DataFrame."""
        combined = pd.concat([categorical, numerical], axis=1)

        category_branch = make_pipeline(
            Select(["category_a", "category_b"]),
            ToCategorical(),
        )
        number_branch = Select(["number_a", "number_b"])
        union = DFFeatureUnion([
            ("category", category_branch),
            ("number", number_branch),
        ])

        transformed = union.fit_transform(combined)

        assert isinstance(transformed, pd.DataFrame)
        # NOTE(review): 8 columns from 4 selected ones -- presumably ToCategorical
        # expands each categorical column into several; confirm against fixtures.
        assert len(transformed.columns) == 8
        assert len(transformed) == len(combined)

Exemple #9

0

Afficher le fichier

Fichier : test_transformers.py Projet : andersbogsnes/ml_tooling

    def test_standard_scaler_works_in_pipeline_with_feature_union(
            self, numerical: pd.DataFrame):
        """DFStandardScaler placed after a DFFeatureUnion scales both selected columns."""
        # Hard-coded centre/scale values for the fixture columns -- presumably
        # the mean and population standard deviation of the fixture data
        # (TODO confirm against the ``numerical`` fixture).
        scale = 1.118033988749895
        expected = numerical.copy()
        expected["number_a"] = (numerical["number_a"] - 2.5) / scale
        expected["number_b"] = (numerical["number_b"] - 6.5) / scale

        select_a = Select(["number_a"])
        select_b = Select(["number_b"])
        union = DFFeatureUnion([("number_a", select_a), ("number_b", select_b)])

        scaled = make_pipeline(union, DFStandardScaler()).fit_transform(numerical)

        pd.testing.assert_frame_equal(scaled, expected)

Exemple #10

0

Afficher le fichier

def test_DFStandardScaler_works_in_pipeline_with_DFFeatureUnion(
        categorical, numerical):
    """DFStandardScaler placed after a DFFeatureUnion scales both selected columns.

    The ``categorical`` fixture is requested but not used in the body.
    """
    # Hard-coded centre/scale values for the fixture columns -- presumably
    # the mean and population standard deviation of the fixture data
    # (TODO confirm against the ``numerical`` fixture).
    scale = 1.118033988749895
    expected = numerical.copy()
    expected['number_a'] = (numerical['number_a'] - 2.5) / scale
    expected['number_b'] = (numerical['number_b'] - 6.5) / scale

    select_a = Select(['number_a'])
    select_b = Select(['number_b'])
    union = DFFeatureUnion([('number_a', select_a), ('number_b', select_b)])

    scaled = make_pipeline(union, DFStandardScaler()).fit_transform(numerical)

    pd.testing.assert_frame_equal(scaled, expected)

Exemple #11

0

Afficher le fichier

"""
host_since
==========
When started hosting. Hypothesis that being a host for longer affects the price - they might be able to charge a different price.
For our solution, we can set it to 0 or ask.

Is a date - note that date is not a dtype, but we can set read_csv to parse it automatically as a date
dtype: datetime
"""
from ml_tooling.transformers import Select, DateEncoder
from sklearn.pipeline import Pipeline

# Feature pipeline for `host_since`: select the column, then run DateEncoder on it.
host_since = Pipeline(steps=[
    ("select", Select("host_since")),
    ("date_encoder", DateEncoder()),
])

Exemple #12

0

Afficher le fichier

"""
square_feet
===========
Size of rental. Should affect price

Area in whole feet. Potentially very large
dtype: Int64
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Feature pipeline for `square_feet`: select the column, then apply FillNA(0).
square_feet = Pipeline(steps=[
    ("select", Select("square_feet")),
    ("fill_na", FillNA(0)),
])

Exemple #13

0

Afficher le fichier

Fichier : guests_included.py Projet : andersbogsnes/airbnb_priceforecaster

"""
guests_included
===============
How many guests are included in the price. Directly impacts price

A count of guests. Max value is 16
dtype: Int8

"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Feature pipeline for `guests_included`: the column is selected and passed on as-is.
guests_included = Pipeline(steps=[("select", Select("guests_included"))])

Exemple #14

0

Afficher le fichier

Fichier : test_transformers.py Projet : andersbogsnes/ml_tooling

 def test_works_without_args(self):
     """Select can be constructed without arguments and the instance is truthy."""
     selector = Select()
     assert selector

Exemple #15

0

Afficher le fichier

Fichier : test_transformers.py Projet : andersbogsnes/ml_tooling

 def test_df_selector_works_gridsearch(self, train_iris_dataset):
     """Scoring a Model wrapping a grid search built around Select yields a Result."""
     # ``create_gridsearch`` is a helper defined outside this snippet.
     search = create_gridsearch(Select("sepal length (cm)"))
     outcome = Model(search).score_estimator(train_iris_dataset)
     assert isinstance(outcome, Result)

Exemple #16

0

Afficher le fichier

Fichier : cleaning_fee.py Projet : andersbogsnes/airbnb_priceforecaster

"""
cleaning_fee
============
What's the cleaning fee. Affects price directly

Is a float, but is prepended with `$`. Read as string and preprocess
dtype: string
"""

from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Feature pipeline for `cleaning_fee`: select the column, then apply
# FillNA(0, indicate_nan=True).
# NOTE(review): indicate_nan presumably flags which rows were filled -- confirm.
cleaning_fee = Pipeline(steps=[("select", Select("cleaning_fee")),
                               ("fill_na", FillNA(0, indicate_nan=True))])

Exemple #17

0

Afficher le fichier

"""
beds
====
How many beds. More should increase price

A count of beds. Max value is 25
dtype: Int8
"""

from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Feature pipeline for `beds`: select the column, then apply
# FillNA(127, indicate_nan=True).
# NOTE(review): 127 is presumably an out-of-range sentinel (the Int8 maximum,
# well above the documented max of 25) -- confirm.
beds = Pipeline(steps=[
    ("select", Select("beds")),
    ("fill_na", FillNA(127, indicate_nan=True)),
])

Exemple #18

0

Afficher le fichier

"""
zipcode
=======
Zipcode (postnr) of the location

It's a common pitfall to convert to a number, but there is a possibility of a leading zero. This is very rare and only applies to old zipcodes and some military installations. [reference](https://da.wikipedia.org/wiki/Postnumre_i_Danmark)

To be certain, this should either be a categorical or a string, but an int should be OK for this usecase.

In this case, we are getting errors, since some of the zipcodes are missing. In addition, some of the zipcodes are not numbers, such as '2400 Kbh NV' or '2100 ø'. These need to be cleaned up as well. We need to import it as str to begin with
dtype: string
"""

from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Feature pipeline for `zipcode`: the column is selected and passed on as-is.
zipcode = Pipeline(steps=[("select", Select("zipcode"))])

Exemple #19

0

Afficher le fichier

"""
room_type
=========
What kind of room - Private room / shared room / entire apt etc. Should definitely affect price

Categorical
dtype: category
"""

from ml_tooling.transformers import Select, ToCategorical
from sklearn.pipeline import Pipeline

# Feature pipeline for `room_type`: select the column, then run ToCategorical on it.
room_type = Pipeline(steps=[
    ("select", Select("room_type")),
    ("categorical", ToCategorical()),
])

Exemple #20

0

Afficher le fichier

Fichier : bed_type.py Projet : andersbogsnes/airbnb_priceforecaster

"""
bed_type
========
What type of bed is available. Better bed should increase price

Type of bed
dtype: category
"""

from ml_tooling.transformers import Select, ToCategorical
from sklearn.pipeline import Pipeline

# Feature pipeline for `bed_type`: select the column, then run ToCategorical on it.
bed_type = Pipeline(steps=[
    ("select", Select("bed_type")),
    ("categorical", ToCategorical()),
])

Exemple #21

0

Afficher le fichier

Fichier : security_deposit.py Projet : andersbogsnes/airbnb_priceforecaster

"""
security_deposit
================
What's the security deposit. Might be related to the price

Is a float, but is prepended with `$`. Read as string and preprocess
dtype: string
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Feature pipeline for `security_deposit`: select the column, then apply
# FillNA(0, indicate_nan=True).
# NOTE(review): indicate_nan presumably flags which rows were filled -- confirm.
security_deposit = Pipeline(steps=[
    ("select", Select("security_deposit")),
    ("fill_na", FillNA(0, indicate_nan=True)),
])

Exemple #22

0

Afficher le fichier

"""
property_type
=============
What kind of property it is. House/Apartment/Room etc. Should definitely affect price

Categorical with a number of rare categories
dtype: category
"""

from ml_tooling.transformers import Select, RareFeatureEncoder, ToCategorical
from sklearn.pipeline import Pipeline

# Feature pipeline for `property_type`: select the column, run
# RareFeatureEncoder(threshold=50), then ToCategorical.
# NOTE(review): the threshold presumably controls which categories count as
# rare -- confirm its exact meaning in ml_tooling.
property_type = Pipeline(steps=[
    ("select", Select("property_type")),
    ("rare_features", RareFeatureEncoder(threshold=50)),
    ("categorical", ToCategorical()),
])

Exemple #23

0

Afficher le fichier

"""
host_response_time
==================
How long does it take for the host to accept/decline an offer. Hypothesis that this could be an indicator of "seriousness" which could affect the price

Categorical with 4 levels
dtype: category
"""
from ml_tooling.transformers import Select, FillNA, ToCategorical
from sklearn.pipeline import Pipeline

# Feature pipeline for `host_response_time`: select the column, fill missing
# values with the string "unknown" (indicate_nan=True), then run ToCategorical.
host_response_time = Pipeline(steps=[("select", Select("host_response_time")),
                                     ("fill_na", FillNA("unknown", indicate_nan=True)),
                                     ("categorical", ToCategorical())])

Exemple #24

0

Afficher le fichier

Fichier : host_identity_verified.py Projet : andersbogsnes/airbnb_priceforecaster

"""
host_identity_verified
======================
Is the host verified. Hypothesis that this increases confidence and thus price

Is a bool, but represented as 't'/'f'. Read as string and preprocess
dtype: string
"""

from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Feature pipeline for `host_identity_verified`: the column is selected and
# passed on as-is.
host_identity_verified = Pipeline(steps=[
    ("select", Select("host_identity_verified")),
])

Exemple #25

0

Afficher le fichier

Fichier : house_rules_len.py Projet : andersbogsnes/airbnb_priceforecaster

"""
house_rules
===========
What rules the guest must follow. Would try to extract some simple rules such as smoking allowed
or similar

House rules will be used to extract features from, such as `is_no_smoking` or something
indicating a lot of rules
dtype: string
"""
from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Feature pipeline for `house_rules_len`: select the column, then apply FillNA(0).
# NOTE(review): the module docstring describes `house_rules`, but the column
# selected here is `house_rules_len` -- presumably a length feature derived
# from the rules text upstream; confirm and align the docstring.
house_rules_len = Pipeline(steps=[
    ("select", Select("house_rules_len")),
    ("fill_na", FillNA(0)),
])

Exemple #26

0

Afficher le fichier

Fichier : test_transformers.py Projet : andersbogsnes/ml_tooling

 def test_df_selector_works_cross_validated(self, train_iris_dataset):
     """Scoring a model built around Select with cv=2 yields a Result."""
     # ``create_model`` is a helper defined outside this snippet.
     selector = Select("sepal length (cm)")
     outcome = create_model(selector).score_estimator(train_iris_dataset, cv=2)
     assert isinstance(outcome, Result)

Exemple #27

0

Afficher le fichier

Fichier : extra_people.py Projet : andersbogsnes/airbnb_priceforecaster

"""
extra_people
============
How much more for extra people. Directly impacts price

Is a float, but is prepended with `$`. Read as string and preprocess
dtype: string
"""

from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Feature pipeline for `extra_people`: the column is selected and passed on as-is.
extra_people = Pipeline(steps=[("select", Select("extra_people"))])

Exemple #28

0

Afficher le fichier

Fichier : host_acceptance_rate.py Projet : andersbogsnes/airbnb_priceforecaster

from ml_tooling.transformers import Select, FillNA
from sklearn.pipeline import Pipeline

# Feature pipeline for `host_acceptance_rate`: select the column, then apply
# FillNA(127, indicate_nan=True).
# NOTE(review): 127 is presumably an out-of-range sentinel for missing
# values -- confirm against the column's dtype and value range.
host_acceptance_rate = Pipeline(steps=[("select", Select("host_acceptance_rate")),
                                       ("fill_na", FillNA(127, indicate_nan=True))])

Exemple #29

0

Afficher le fichier

"""
host_has_profile_pic
====================
Does the host have a profile picture? Hypothesis that this could increase confidence and thus price

Is a bool, but represented as 't'/'f'. Read as string and preprocess
dtype: string
"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Feature pipeline for `host_has_profile_pic`: the column is selected and
# passed on as-is.
host_has_profile_pic = Pipeline(steps=[
    ("select", Select("host_has_profile_pic")),
])

Exemple #30

0

Afficher le fichier

    'accessible-height_toilet',
    'wide_clearance_to_shower',
    '_toilet',
    'accessible-height_bed',
    'wide_entryway',
    'sun_loungers',
    'outdoor_parking',
    'tennis_court',
    'outdoor_kitchen',
    'air_purifier',
    'kitchenette',
    'roll-in_shower',
    'gas_oven',
    'steam_oven',
    'pillow-top_mattress',
    'fire_pit',
    'printer',
    'standing_valet',
    'ceiling_fan',
    'memory_foam_mattress',
    'amazon_echo',
    'projector_and_screen',
    'en_suite_bathroom',
    'central_air_conditioning',
    'mini_fridge',
    'beach_view',
    'double_oven'
]

# Selector over every name in `amenities_list` (presumably one DataFrame
# column per amenity -- confirm against the data-loading code).
amenities = Select(amenities_list)