def test_df_selector_returns_correct_dataframe(categorical, container):
    """Check that ``Select(container)`` returns a frame with only ``category_a``.

    The output must be a ``pd.DataFrame`` with as many rows as the input and
    exactly the column ``category_a``.
    """
    selected = Select(container).fit_transform(categorical)

    assert isinstance(selected, pd.DataFrame)
    assert len(selected) == len(categorical)
    assert set(selected.columns) == {'category_a'}
def test_df_selector_raise_missing_column(self, categorical: pd.DataFrame):
    """Expect ``TransformerError`` when the requested columns cannot all be selected.

    Presumably ``category_c`` is the column absent from the fixture - confirm
    against the fixture definition.
    """
    requested_columns = ["category_a", "category_b", "category_c"]
    expected_message = "The DataFrame does not include the columns:"

    # Build the transformer outside the ``raises`` block so that only
    # ``fit_transform`` is allowed to raise.
    selector = Select(requested_columns)
    with pytest.raises(TransformerError, match=expected_message):
        selector.fit_transform(categorical)
def test_df_selector_with_multiple_columns(categorical):
    """Check that selecting two columns returns a frame with exactly those two."""
    wanted = ['category_a', 'category_b']
    selected = Select(wanted).fit_transform(categorical)

    assert isinstance(selected, pd.DataFrame)
    assert len(selected) == len(categorical)
    assert set(selected.columns) == {'category_a', 'category_b'}
def test_df_selector_with_multiple_columns(self, categorical: pd.DataFrame):
    """Check that selecting two columns returns a frame with exactly those two."""
    wanted = ["category_a", "category_b"]
    selected = Select(wanted).fit_transform(categorical)

    assert isinstance(selected, pd.DataFrame)
    assert len(selected) == len(categorical)
    assert set(selected.columns) == {"category_a", "category_b"}
def test_df_selector_raise_missing_column(categorical):
    """Expect ``TransformerError`` when the requested columns cannot all be selected.

    ``category_a``, ``category_b`` and ``category_c`` are requested, and
    ``fit_transform`` must fail with the missing-columns message.
    """
    select = Select(['category_a', 'category_b', 'category_c'])

    # ``pytest.raises`` no longer accepts ``message=`` (deprecated in pytest 4.1,
    # removed in 5.0). If nothing is raised, pytest already fails the test with
    # its own "DID NOT RAISE" report, so only ``match`` is needed here.
    with pytest.raises(
            TransformerError,
            match="The DataFrame does not include the columns:"):
        select.fit_transform(categorical)
def test_featureunion_returns_concatenated_df(categorical, numerical):
    """Check that ``DFFeatureUnion`` joins both branches into one DataFrame.

    The categorical branch selects two columns and applies ``ToCategorical``;
    the numeric branch only selects. The result must have 8 columns and the
    same number of rows as the input.
    """
    combined = pd.concat([categorical, numerical], axis=1)

    category_branch = make_pipeline(
        Select(['category_a', 'category_b']),
        ToCategorical(),
    )
    number_branch = Select(['number_a', 'number_b'])
    union = DFFeatureUnion([
        ('category', category_branch),
        ('number', number_branch),
    ])

    output = union.fit_transform(combined)

    assert isinstance(output, pd.DataFrame)
    assert len(output.columns) == 8
    assert len(output) == len(combined)
def feature_union_classifier() -> Pipeline:
    """Build a two-branch select-and-scale feature union feeding a classifier.

    Each branch selects a pair of columns and applies ``DFStandardScaler``.
    The branches are joined by ``DFFeatureUnion`` and passed to a
    ``LogisticRegression(solver="liblinear")`` estimator.
    """
    branch_columns = [
        ("pipe1", ["sepal length (cm)", "sepal width (cm)"]),
        ("pipe2", ["petal length (cm)", "petal width (cm)"]),
    ]

    # Both branches have the same shape: select the columns, then scale them.
    branches = [
        (
            branch_name,
            Pipeline([
                ("select", Select(columns)),
                ("scale", DFStandardScaler()),
            ]),
        )
        for branch_name, columns in branch_columns
    ]

    features = DFFeatureUnion(transformer_list=branches)
    estimator = LogisticRegression(solver="liblinear")
    return Pipeline([("features", features), ("estimator", estimator)])
def test_featureunion_returns_concatenated_df(self,
                                              categorical: pd.DataFrame,
                                              numerical: pd.DataFrame):
    """Check that ``DFFeatureUnion`` joins both branches into one DataFrame.

    The categorical branch selects two columns and applies ``ToCategorical``;
    the numeric branch only selects. The result must have 8 columns and the
    same number of rows as the input.
    """
    combined = pd.concat([categorical, numerical], axis=1)

    category_branch = make_pipeline(
        Select(["category_a", "category_b"]),
        ToCategorical(),
    )
    number_branch = Select(["number_a", "number_b"])
    union = DFFeatureUnion([
        ("category", category_branch),
        ("number", number_branch),
    ])

    output = union.fit_transform(combined)

    assert isinstance(output, pd.DataFrame)
    assert len(output.columns) == 8
    assert len(output) == len(combined)
def test_standard_scaler_works_in_pipeline_with_feature_union(
        self, numerical: pd.DataFrame):
    """Check that ``DFStandardScaler`` after a ``DFFeatureUnion`` scales both columns.

    The expected frame is built by hand from hard-coded centre and scale
    values - presumably the mean and population standard deviation of the
    ``numerical`` fixture; confirm against the fixture.
    """
    scale = 1.118033988749895
    centres = {"number_a": 2.5, "number_b": 6.5}

    expected = numerical.copy()
    for column, centre in centres.items():
        expected[column] = (numerical[column] - centre) / scale

    # One single-column selector per branch, each named after its column.
    union = DFFeatureUnion([
        (column, Select([column])) for column in ("number_a", "number_b")
    ])
    scaled = make_pipeline(union, DFStandardScaler()).fit_transform(numerical)

    pd.testing.assert_frame_equal(scaled, expected)
def test_DFStandardScaler_works_in_pipeline_with_DFFeatureUnion(
        categorical, numerical):
    """Check that ``DFStandardScaler`` after a ``DFFeatureUnion`` scales both columns.

    The expected frame is built by hand from hard-coded centre and scale
    values - presumably the mean and population standard deviation of the
    ``numerical`` fixture; confirm against the fixture. ``categorical`` is
    accepted but not used by the body.
    """
    scale = 1.118033988749895
    centres = {'number_a': 2.5, 'number_b': 6.5}

    expected = numerical.copy()
    for column, centre in centres.items():
        expected[column] = (numerical[column] - centre) / scale

    # One single-column selector per branch, each named after its column.
    union = DFFeatureUnion([
        (column, Select([column])) for column in ('number_a', 'number_b')
    ])
    scaled = make_pipeline(union, DFStandardScaler()).fit_transform(numerical)

    pd.testing.assert_frame_equal(scaled, expected)
"""
host_since
==========
The date the host started hosting. Hypothesis: having been a host for longer
affects the price - such a host might be able to charge a different price.
For our solution, we can set it to 0 or ask.

This is a date. Note that date is not a dtype, but read_csv can be set to
parse it automatically as a date.

dtype: datetime
"""
from ml_tooling.transformers import DateEncoder, Select
from sklearn.pipeline import Pipeline

# Steps run in order: pick the column, then hand it to DateEncoder.
host_since = Pipeline(
    steps=[
        ("select", Select("host_since")),
        ("date_encoder", DateEncoder()),
    ]
)
"""
square_feet
===========
Size of the rental. Should affect the price.

Area in whole feet. Potentially very large.

dtype: Int64
"""
from ml_tooling.transformers import FillNA, Select
from sklearn.pipeline import Pipeline

# Steps run in order: pick the column, then apply FillNA(0).
square_feet = Pipeline(
    steps=[
        ("select", Select("square_feet")),
        ("fill_na", FillNA(0)),
    ]
)
"""
guests_included
===============
How many guests are included in the price. Directly impacts the price.

A count of guests. The maximum value is 16.

dtype: Int8
"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: the column is selected and passed through unchanged.
guests_included = Pipeline(steps=[("select", Select("guests_included"))])
def test_works_without_args(self):
    """Check that ``Select`` can be built with no arguments and is truthy."""
    selector = Select()
    assert selector
def test_df_selector_works_gridsearch(self, train_iris_dataset):
    """Check that scoring a grid search built around ``Select`` returns a ``Result``."""
    selector = Select("sepal length (cm)")
    model = Model(create_gridsearch(selector))

    outcome = model.score_estimator(train_iris_dataset)

    assert isinstance(outcome, Result)
"""
cleaning_fee
============
What the cleaning fee is. Affects the price directly.

It is a float, but it is prepended with `$`. Read as string and preprocess.

dtype: string
"""
from ml_tooling.transformers import FillNA, Select
from sklearn.pipeline import Pipeline

# Steps run in order: pick the column, then apply FillNA(0, indicate_nan=True).
cleaning_fee = Pipeline(
    steps=[
        ("select", Select("cleaning_fee")),
        ("fill_na", FillNA(0, indicate_nan=True)),
    ]
)
"""
beds
====
How many beds there are. More beds should increase the price.

A count of beds. The maximum value is 25.

dtype: Int8
"""
from ml_tooling.transformers import FillNA, Select
from sklearn.pipeline import Pipeline

# Steps run in order: pick the column, then apply FillNA.
# NOTE(review): 127 lies above the documented maximum of 25, so it is
# presumably a sentinel (it is also the largest Int8 value) - confirm.
beds = Pipeline(
    steps=[
        ("select", Select("beds")),
        ("fill_na", FillNA(127, indicate_nan=True)),
    ]
)
"""
zipcode
=======
Zipcode (postnr) of the location.

A common pitfall is to convert zipcodes to a number, even though there is a
possibility of a leading zero. This is very rare and only applies to old
zipcodes and some military installations.
[reference](https://da.wikipedia.org/wiki/Postnumre_i_Danmark)

To be certain, this should be either a categorical or a string, but an int
should be OK for this use case.

In this case we are getting errors, since some of the zipcodes are missing.
In addition, some of the zipcodes are not numbers, such as '2400 Kbh NV' or
'2100 ø'. These need to be cleaned up as well. We need to import it as str
to begin with.

dtype: string
"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: the column is selected and passed through unchanged.
zipcode = Pipeline(steps=[("select", Select("zipcode"))])
"""
room_type
=========
What kind of room it is - private room / shared room / entire apartment etc.
Should definitely affect the price.

Categorical.

dtype: category
"""
from ml_tooling.transformers import Select, ToCategorical
from sklearn.pipeline import Pipeline

# Steps run in order: pick the column, then apply ToCategorical.
room_type = Pipeline(
    steps=[
        ("select", Select("room_type")),
        ("categorical", ToCategorical()),
    ]
)
"""
bed_type
========
What type of bed is available. A better bed should increase the price.

Type of bed.

dtype: category
"""
from ml_tooling.transformers import Select, ToCategorical
from sklearn.pipeline import Pipeline

# Steps run in order: pick the column, then apply ToCategorical.
bed_type = Pipeline(
    steps=[
        ("select", Select("bed_type")),
        ("categorical", ToCategorical()),
    ]
)
"""
security_deposit
================
What the security deposit is. Might be related to the price.

It is a float, but it is prepended with `$`. Read as string and preprocess.

dtype: string
"""
from ml_tooling.transformers import FillNA, Select
from sklearn.pipeline import Pipeline

# Steps run in order: pick the column, then apply FillNA(0, indicate_nan=True).
security_deposit = Pipeline(
    steps=[
        ("select", Select("security_deposit")),
        ("fill_na", FillNA(0, indicate_nan=True)),
    ]
)
"""
property_type
=============
What kind of property it is - house / apartment / room etc.
Should definitely affect the price.

Categorical with a number of rare categories.

dtype: category
"""
from ml_tooling.transformers import RareFeatureEncoder, Select, ToCategorical
from sklearn.pipeline import Pipeline

# Steps run in order: pick the column, apply RareFeatureEncoder with a
# threshold of 50, then apply ToCategorical.
property_type = Pipeline(
    steps=[
        ("select", Select("property_type")),
        ("rare_features", RareFeatureEncoder(threshold=50)),
        ("categorical", ToCategorical()),
    ]
)
"""
host_response_time
==================
How long it takes for the host to accept/decline an offer. Hypothesis: this
could be an indicator of "seriousness", which could affect the price.

Categorical with 4 levels.

dtype: category
"""
from ml_tooling.transformers import FillNA, Select, ToCategorical
from sklearn.pipeline import Pipeline

# Steps run in order: pick the column, apply FillNA with "unknown", then
# apply ToCategorical.
host_response_time = Pipeline(
    steps=[
        ("select", Select("host_response_time")),
        ("fill_na", FillNA("unknown", indicate_nan=True)),
        ("categorical", ToCategorical()),
    ]
)
"""
host_identity_verified
======================
Whether the host is verified. Hypothesis: this increases confidence and thus
the price.

It is a bool, but it is represented as 't'/'f'. Read as string and preprocess.

dtype: string
"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: the column is selected and passed through unchanged.
host_identity_verified = Pipeline(
    steps=[("select", Select("host_identity_verified"))]
)
"""
house_rules
===========
What rules the guest must follow. We would try to extract some simple rules,
such as whether smoking is allowed or similar.

House rules will be used to extract features from, such as `is_no_smoking`
or something indicating a lot of rules.

dtype: string
"""
from ml_tooling.transformers import FillNA, Select
from sklearn.pipeline import Pipeline

# Steps run in order: pick the column, then apply FillNA(0).
# NOTE(review): the selected column is `house_rules_len`, not `house_rules` -
# presumably a derived column created upstream; confirm.
house_rules_len = Pipeline(
    steps=[
        ("select", Select("house_rules_len")),
        ("fill_na", FillNA(0)),
    ]
)
def test_df_selector_works_cross_validated(self, train_iris_dataset):
    """Check that scoring a ``Select``-based model with ``cv=2`` returns a ``Result``."""
    selector = Select("sepal length (cm)")
    model = create_model(selector)

    outcome = model.score_estimator(train_iris_dataset, cv=2)

    assert isinstance(outcome, Result)
"""
extra_people
============
How much more is charged for extra people. Directly impacts the price.

It is a float, but it is prepended with `$`. Read as string and preprocess.

dtype: string
"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: the column is selected and passed through unchanged.
extra_people = Pipeline(steps=[("select", Select("extra_people"))])
from ml_tooling.transformers import FillNA, Select
from sklearn.pipeline import Pipeline

# Feature pipeline for the `host_acceptance_rate` column: pick the column,
# then apply FillNA.
# NOTE(review): 127 is presumably a sentinel value for missing entries - confirm.
host_acceptance_rate = Pipeline(
    steps=[
        ("select", Select("host_acceptance_rate")),
        ("fill_na", FillNA(127, indicate_nan=True)),
    ]
)
"""
host_has_profile_pic
====================
Whether the host has a profile picture. Hypothesis: this could increase
confidence and thus the price.

It is a bool, but it is represented as 't'/'f'. Read as string and preprocess.

dtype: string
"""
from ml_tooling.transformers import Select
from sklearn.pipeline import Pipeline

# Single-step pipeline: the column is selected and passed through unchanged.
host_has_profile_pic = Pipeline(
    steps=[("select", Select("host_has_profile_pic"))]
)
'accessible-height_toilet', 'wide_clearance_to_shower', '_toilet', 'accessible-height_bed', 'wide_entryway', 'sun_loungers', 'outdoor_parking', 'tennis_court', 'outdoor_kitchen', 'air_purifier', 'kitchenette', 'roll-in_shower', 'gas_oven', 'steam_oven', 'pillow-top_mattress', 'fire_pit', 'printer', 'standing_valet', 'ceiling_fan', 'memory_foam_mattress', 'amazon_echo', 'projector_and_screen', 'en_suite_bathroom', 'central_air_conditioning', 'mini_fridge', 'beach_view', 'double_oven' ] amenities = Select(amenities_list)