def auto_prep(X):
    """Build a preprocessing pipeline suited to the columns of X.

    Numeric columns get mean imputation; categorical columns get
    most-frequent imputation followed by one-hot encoding. If X mixes
    both kinds, the two branches are applied to their respective column
    projections and the results are concatenated.
    """
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    total_cols = X.shape[1]
    cat_count = len(categorical()(X))
    numeric_branch = SimpleImputer(strategy="mean")
    categorical_branch = SimpleImputer(strategy="most_frequent") >> OneHotEncoder(
        handle_unknown="ignore"
    )
    # All-numeric and all-categorical inputs need no column projection.
    if cat_count == 0:
        return numeric_branch
    if cat_count == total_cols:
        return categorical_branch
    numeric_path = (
        Project(columns={"type": "number"}, drop_columns=categorical())
        >> numeric_branch
    )
    categorical_path = Project(columns=categorical()) >> categorical_branch
    return (numeric_path & categorical_path) >> ConcatFeatures
def test_fit(self):
    """Check that fitting RaslOneHotEncoder on each dataset backend yields
    the same trained state as fitting sklearn's OneHotEncoder on pandas.
    """
    (train_X_pd, _), (_, _) = self.tgt2creditg["pandas"]
    cat_columns = categorical()(train_X_pd)
    # Restrict both pipelines to the categorical columns only.
    prefix = Map(columns={c: it[c] for c in cat_columns})
    sk_trained = (prefix >> SkOneHotEncoder()).fit(train_X_pd)
    rasl_trainable = prefix >> RaslOneHotEncoder()
    for tgt, dataset in self.tgt2creditg.items():
        (train_X, _train_y), (_test_X, _test_y) = dataset
        rasl_trained = rasl_trainable.fit(train_X)
        self._check_last_trained(sk_trained, rasl_trained, tgt)
def auto_prep(X):
    """Return a preprocessing pipeline tailored to X's column types.

    Numeric data is mean-imputed; categorical data is most-frequent-imputed
    and one-hot encoded. Mixed data routes each column group through the
    matching branch and concatenates the outputs.
    """
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    cats = categorical()(X)
    impute_num = SimpleImputer(strategy='mean')
    impute_cat = SimpleImputer(strategy='most_frequent') >> OneHotEncoder(
        handle_unknown='ignore'
    )
    if len(cats) == 0:
        # Purely numeric input.
        prep = impute_num
    elif len(cats) == X.shape[1]:
        # Purely categorical input.
        prep = impute_cat
    else:
        num_part = (
            Project(columns={'type': 'number'}, drop_columns=categorical())
            >> impute_num
        )
        cat_part = Project(columns=categorical()) >> impute_cat
        prep = (num_part & cat_part) >> ConcatFeatures
    return prep
def test_predict(self):
    """Check that a RaslOneHotEncoder-based classifier pipeline predicts the
    same labels as the equivalent sklearn OneHotEncoder pipeline, on every
    dataset backend.
    """
    (train_X_pd, train_y_pd), (test_X_pd, _test_y_pd) = self.tgt2creditg["pandas"]
    cat_columns = categorical()(train_X_pd)
    prefix = Map(columns={c: it[c] for c in cat_columns})
    # Downstream LogisticRegression needs pandas, so convert Spark frames.
    to_pd = FunctionTransformer(
        func=lambda X: X if isinstance(X, pd.DataFrame) else X.toPandas()
    )
    lr = LogisticRegression()
    # Reference predictions from the pure scikit-learn pipeline.
    sk_trained = (prefix >> SkOneHotEncoder(sparse=False) >> lr).fit(
        train_X_pd, train_y_pd
    )
    sk_predicted = sk_trained.predict(test_X_pd)
    rasl_trainable = prefix >> RaslOneHotEncoder(sparse=False) >> to_pd >> lr
    for tgt, dataset in self.tgt2creditg.items():
        (train_X, train_y), (test_X, _test_y) = dataset
        rasl_predicted = rasl_trainable.fit(train_X, train_y).predict(test_X)
        self.assertEqual(sk_predicted.shape, rasl_predicted.shape, tgt)
        self.assertEqual(sk_predicted.tolist(), rasl_predicted.tolist(), tgt)
def test_partial_fit(self):
    """Incrementally fit RaslOneHotEncoder with partial_fit and check that,
    after each batch, it matches an sklearn OneHotEncoder fit from scratch
    on all data seen so far, for every dataset backend.
    """
    (train_X_pd, _), (_, _) = self.tgt2creditg["pandas"]
    cat_columns = categorical()(train_X_pd)
    prefix = Map(columns={c: it[c] for c in cat_columns})
    for tgt in self.tgt2creditg.keys():
        rasl_pipe = prefix >> RaslOneHotEncoder()
        for lower, upper in [[0, 10], [10, 100], [100, train_X_pd.shape[0]]]:
            data_so_far = train_X_pd[0:upper]
            # BUG FIX: the reference was SkOrdinalEncoder, but this test
            # exercises RaslOneHotEncoder — compare against SkOneHotEncoder,
            # consistent with the sibling tests (test_fit, test_transform).
            sk_pipe = prefix >> SkOneHotEncoder()
            sk_pipe = sk_pipe.fit(data_so_far)
            data_delta = train_X_pd[lower:upper]
            if tgt == "spark":
                data_delta = lale.datasets.pandas2spark(data_delta)
            rasl_pipe = rasl_pipe.partial_fit(data_delta)
            self._check_last_trained(
                sk_pipe,
                rasl_pipe,
                (tgt, lower, upper),
            )
def test_transform(self):
    """Check that RaslOneHotEncoder's transform output matches sklearn's
    OneHotEncoder cell-for-cell, on every dataset backend.
    """
    (train_X_pd, _train_y_pd), (test_X_pd, _test_y_pd) = self.tgt2creditg["pandas"]
    cat_columns = categorical()(train_X_pd)
    prefix = Map(columns={c: it[c] for c in cat_columns})
    # Reference: dense sklearn one-hot encoding of the test split.
    sk_trained = (prefix >> SkOneHotEncoder(sparse=False)).fit(train_X_pd)
    sk_transformed = sk_trained.transform(test_X_pd)
    rasl_trainable = prefix >> RaslOneHotEncoder(sparse=False)
    for tgt, dataset in self.tgt2creditg.items():
        (train_X, _train_y), (test_X, _test_y) = dataset
        rasl_trained = rasl_trainable.fit(train_X)
        self._check_last_trained(sk_trained, rasl_trained, tgt)
        rasl_transformed = rasl_trained.transform(test_X)
        if tgt == "spark":
            rasl_transformed = rasl_transformed.toPandas()
        self.assertEqual(sk_transformed.shape, rasl_transformed.shape, tgt)
        n_rows, n_cols = sk_transformed.shape
        for i in range(n_rows):
            for j in range(n_cols):
                self.assertEqual(
                    sk_transformed[i, j],
                    rasl_transformed.iloc[i, j],
                    (i, j, tgt),
                )
# loads and downsamples and pickles forest covertype data for user study import pandas as pd from lale.datasets import covtype_df from sklearn.model_selection import train_test_split from lale.lib.lale import categorical import pickle TRAIN_SIZE = 5000 (train_X_all, train_y_all), (test_X, test_y) = covtype_df(test_size=5000) train_X, other_X, train_y, other_y = train_test_split(train_X_all, train_y_all, train_size=TRAIN_SIZE, stratify=train_y_all) constant_columns = categorical(max_values=1)(train_X) train_X = train_X.drop(constant_columns, axis=1) test_X = test_X.drop(constant_columns, axis=1) #pd.options.display.max_columns = None #pd.concat([train_y, train_X], axis=1) with open('train_x.pickle', 'wb') as f: pickle.dump(train_X, f) with open('test_x.pickle', 'wb') as f: pickle.dump(test_X, f) with open('train_y.pickle', 'wb') as f: pickle.dump(train_y, f) with open('test_y.pickle', 'wb') as f: