def test_init(self):
    """Check that a ``Map`` can be constructed from a list of ``replace`` expressions."""
    gender_lookup = {"m": "Male", "f": "Female"}
    state_lookup = {"NY": "New York", "CA": "California"}
    column_exprs = [
        replace(it.gender, gender_lookup),
        replace(it.state, state_lookup),
    ]
    # Only construction is exercised here; the operator is never fit.
    _ = Map(columns=column_exprs)
def test_transform_replace_list(self):
    """Fit and apply a ``Map`` of two ``replace`` expressions to a pandas frame.

    The input has three columns and five rows; the test asserts that the
    output keeps that shape and that the first ``gender`` and ``state``
    cells hold the mapped values.
    """
    df = pd.DataFrame(
        data={
            "gender": ["m", "f", "m", "m", "f"],
            "state": ["NY", "NY", "CA", "NY", "CA"],
            "status": [0, 1, 1, 0, 1],
        }
    )
    gender_lookup = {"m": "Male", "f": "Female"}
    state_lookup = {"NY": "New York", "CA": "California"}
    column_exprs = [
        replace(it.gender, gender_lookup),
        replace(it.state, state_lookup),
    ]

    fitted = Map(columns=column_exprs).fit(df)
    transformed_df = fitted.transform(df)

    self.assertEqual(transformed_df.shape, (5, 3))
    # Row 0 of the input is ("m", "NY", 0).
    self.assertEqual(transformed_df["gender"][0], "Male")
    self.assertEqual(transformed_df["state"][0], "New York")
def test_with_hyperopt(self):
    """Run ``Hyperopt`` over a pipeline that embeds a ``Map`` of ``replace`` expressions.

    The test makes no assertions on the result; it only requires that
    ``fit`` completes without raising.
    """
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)

    gender_lookup = {"m": "Male", "f": "Female"}
    state_lookup = {"NY": "New York", "CA": "California"}
    map_replace = Map(
        columns=[
            replace(it.gender, gender_lookup),
            replace(it.state, state_lookup),
        ],
        remainder="drop",
    )

    # Assemble the pipeline stage by stage, in the same order as the
    # original single expression.
    scans = Scan(table=it.main) & Scan(table=it.delay)
    relational = Relational(operator=scans >> map_replace)
    pipeline = relational >> LogisticRegression()

    optimizer = Hyperopt(estimator=pipeline, cv=3, max_evals=5)
    _ = optimizer.fit(X, y)
def test_transform_spark_replace_list(self):
    """Fit and apply a ``Map`` of two ``replace`` expressions to a Spark frame.

    Mirrors the pandas variant: five rows, three columns, and the first
    row's first two fields must hold the mapped values.
    """
    # Without Spark the body is not executed and the test passes silently.
    if not spark_installed:
        return

    pandas_df = pd.DataFrame(
        data={
            "gender": ["m", "f", "m", "m", "f"],
            "state": ["NY", "NY", "CA", "NY", "CA"],
            "status": [0, 1, 1, 0, 1],
        }
    )
    sdf = self.sqlCtx.createDataFrame(pandas_df)

    gender_lookup = {"m": "Male", "f": "Female"}
    state_lookup = {"NY": "New York", "CA": "California"}
    column_exprs = [
        replace(it.gender, gender_lookup),
        replace(it.state, state_lookup),
    ]

    fitted = Map(columns=column_exprs).fit(sdf)
    transformed_df = fitted.transform(sdf)

    observed_shape = (transformed_df.count(), len(transformed_df.columns))
    self.assertEqual(observed_shape, (5, 3))
    # head() is fetched separately for each check, as in the original.
    self.assertEqual(transformed_df.head()[0], "Male")
    self.assertEqual(transformed_df.head()[1], "New York")
def _build_transformer(self):
    """Build a ``Map`` with one output column per (feature, category) pair.

    For every name in ``self.feature_names_in_`` and every value in the
    matching entry of ``self.categories_``, an output column named
    ``"<feature>_<category>"`` is defined by a ``replace`` expression that
    maps that category to ``1`` and is configured with
    ``handle_unknown="use_encoded_value"`` and ``unknown_value=0``.

    Returns:
        The constructed ``Map`` operator.
    """
    indicator_columns = {}
    for col_idx, col_name in enumerate(self.feature_names_in_):
        for cat_value in self.categories_[col_idx]:
            out_name = f"{col_name}_{cat_value}"
            # Presumably every value other than cat_value becomes 0 via
            # unknown_value; confirm against replace's contract.
            indicator_columns[out_name] = replace(
                it[col_name],
                {cat_value: 1},
                handle_unknown="use_encoded_value",
                unknown_value=0,
            )
    return Map(columns=indicator_columns)
def _build_transformer(self):
    """Build a ``Map`` that substitutes each column's missing-value marker.

    For every name in ``self.feature_names_in_``, the output column of the
    same name is a ``replace`` expression mapping the ``"missing_values"``
    hyperparameter to the entry of ``self.statistics_`` at that column's
    index.

    Returns:
        The constructed ``Map`` operator.
    """
    per_column = {}
    for col_idx, col_name in enumerate(self.feature_names_in_):
        source = it[col_name]
        # NOTE(review): if missing_values is NaN, confirm that replace
        # matches it, since NaN does not compare equal to itself.
        substitution = {
            self._hyperparams["missing_values"]: self.statistics_[col_idx]
        }
        per_column[col_name] = replace(source, substitution)
    return Map(columns=per_column)
def _build_transformer(self):
    """Build a ``Map`` that encodes each column's categories as their indices.

    For every name in ``self.feature_names_in_``, the output column of the
    same name is a ``replace`` expression mapping each value in the
    matching entry of ``self.categories_`` to its position in that entry.
    The expression is configured with
    ``handle_unknown="use_encoded_value"`` and the ``"unknown_value"``
    hyperparameter.

    Returns:
        The constructed ``Map`` operator.
    """
    encoded_columns = {}
    for col_idx, col_name in enumerate(self.feature_names_in_):
        source = it[col_name]
        # category value -> ordinal code (its position in categories_).
        codes = {}
        for code, category in enumerate(self.categories_[col_idx]):
            codes[category] = code
        encoded_columns[col_name] = replace(
            source,
            codes,
            handle_unknown="use_encoded_value",
            unknown_value=self._hyperparams["unknown_value"],
        )
    return Map(columns=encoded_columns)