Esempio n. 1
0
def auto_prep(X):
    """Assemble a preprocessing pipeline matched to the column kinds in X.

    The number of columns picked by ``categorical()`` is compared with the
    total column count ``X.shape[1]``:

    * no categorical columns: mean imputation only;
    * only categorical columns: most-frequent imputation followed by
      one-hot encoding (unknown categories are ignored);
    * a mix: both sub-pipelines are applied to their own projection of the
      data and the results are joined with ``ConcatFeatures``.

    Args:
        X: Two-dimensional dataset exposing ``shape``; it is also passed to
            the ``categorical()`` column selector.

    Returns:
        The lale operator (single step or composed pipeline) chosen above.
    """
    # Imported lazily so that lale is only required when the helper is used.
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    total_columns = X.shape[1]
    categorical_count = len(categorical()(X))

    numeric_steps = SimpleImputer(strategy="mean")
    most_frequent = SimpleImputer(strategy="most_frequent")
    categorical_steps = most_frequent >> OneHotEncoder(handle_unknown="ignore")

    if categorical_count == 0:
        return numeric_steps
    if categorical_count == total_columns:
        return categorical_steps

    # Mixed data: route each column kind through its own branch, then merge.
    numeric_branch = (
        Project(columns={"type": "number"}, drop_columns=categorical())
        >> numeric_steps
    )
    categorical_branch = Project(columns=categorical()) >> categorical_steps
    return (numeric_branch & categorical_branch) >> ConcatFeatures
Esempio n. 2
0
 def test_fit(self):
     """Fit the RASL one-hot encoder per backend and compare with sklearn.

     The scikit-learn pipeline is fitted once on the pandas training data;
     the RASL pipeline is re-fitted on the training data of every entry in
     ``self.tgt2creditg`` and checked with ``self._check_last_trained``.
     """
     (pandas_train_X, _), (_, _) = self.tgt2creditg["pandas"]
     # Restrict both pipelines to the categorical columns of the pandas data.
     select_categoricals = Map(
         columns={name: it[name] for name in categorical()(pandas_train_X)})
     reference = (select_categoricals >> SkOneHotEncoder()).fit(pandas_train_X)
     candidate = select_categoricals >> RaslOneHotEncoder()
     for tgt, ((train_X, _train_y), (_test_X, _test_y)) in (
             self.tgt2creditg.items()):
         self._check_last_trained(reference, candidate.fit(train_X), tgt)
Esempio n. 3
0
def auto_prep(X):
    """Pick a lale preprocessing operator based on the column types of X.

    Numeric data is mean-imputed; categorical data is imputed with the most
    frequent value and then one-hot encoded (ignoring unknown categories).
    When X contains both kinds of columns, each kind is projected out,
    preprocessed separately, and the two results are concatenated.

    Args:
        X: Two-dimensional dataset; ``X.shape[1]`` is read as the column
            count and X is handed to the ``categorical()`` selector.

    Returns:
        A lale operator: the numeric step alone, the categorical pipeline
        alone, or the combined two-branch pipeline.
    """
    from lale.lib.lale import ConcatFeatures
    from lale.lib.lale import Project
    from lale.lib.lale import categorical
    from lale.lib.sklearn import OneHotEncoder
    from lale.lib.sklearn import SimpleImputer

    column_count = X.shape[1]
    cat_count = len(categorical()(X))
    for_numbers = SimpleImputer(strategy='mean')
    for_categories = (SimpleImputer(strategy='most_frequent')
                      >> OneHotEncoder(handle_unknown='ignore'))

    # All-or-nothing cases need no projection at all.
    if cat_count in (0, column_count):
        return for_numbers if cat_count == 0 else for_categories

    numbers_only = Project(columns={'type': 'number'},
                           drop_columns=categorical())
    left = numbers_only >> for_numbers
    right = Project(columns=categorical()) >> for_categories
    return (left & right) >> ConcatFeatures
Esempio n. 4
0
 def test_predict(self):
     """Check that RASL and sklearn encoders yield identical predictions.

     A scikit-learn pipeline (categorical projection, dense one-hot
     encoding, logistic regression) is fitted on the pandas data to obtain
     the reference predictions. For every entry in ``self.tgt2creditg`` the
     equivalent RASL pipeline is fitted and its predictions must match the
     reference in both shape and values.
     """
     (pd_train_X, pd_train_y), (pd_test_X, _pd_test_y) = (
         self.tgt2creditg["pandas"])
     select_categoricals = Map(
         columns={name: it[name] for name in categorical()(pd_train_X)})
     # Anything that is not already a pandas DataFrame is converted through
     # its toPandas() method before it reaches the classifier.
     as_pandas = FunctionTransformer(
         func=lambda X: X.toPandas() if not isinstance(X, pd.DataFrame) else X)
     classifier = LogisticRegression()

     reference_pipeline = (
         select_categoricals >> SkOneHotEncoder(sparse=False) >> classifier)
     reference_model = reference_pipeline.fit(pd_train_X, pd_train_y)
     expected = reference_model.predict(pd_test_X)

     candidate_pipeline = (
         select_categoricals
         >> RaslOneHotEncoder(sparse=False)
         >> as_pandas
         >> classifier)
     for tgt, ((train_X, train_y), (test_X, _test_y)) in (
             self.tgt2creditg.items()):
         actual = candidate_pipeline.fit(train_X, train_y).predict(test_X)
         self.assertEqual(expected.shape, actual.shape, tgt)
         self.assertEqual(expected.tolist(), actual.tolist(), tgt)
Esempio n. 5
0
 def test_partial_fit(self):
     """Incrementally fit the RASL one-hot encoder and compare per batch.

     For every key of ``self.tgt2creditg`` the pandas training data is fed
     to ``partial_fit`` in three consecutive row ranges. After each range, a
     scikit-learn pipeline freshly fitted on all rows seen so far serves as
     the reference passed to ``self._check_last_trained``.
     """
     (pandas_train_X, _), (_, _) = self.tgt2creditg["pandas"]
     select_categoricals = Map(
         columns={name: it[name] for name in categorical()(pandas_train_X)})
     row_ranges = ((0, 10), (10, 100), (100, pandas_train_X.shape[0]))
     for tgt in self.tgt2creditg:
         incremental = select_categoricals >> RaslOneHotEncoder()
         for lower, upper in row_ranges:
             seen_rows = pandas_train_X[0:upper]
             # NOTE(review): the reference is SkOrdinalEncoder although the
             # pipeline under test is a one-hot encoder; this may be a
             # copy-paste slip for SkOneHotEncoder -- confirm against what
             # _check_last_trained actually compares.
             reference = (select_categoricals >> SkOrdinalEncoder()).fit(
                 seen_rows)
             batch = pandas_train_X[lower:upper]
             if tgt == "spark":
                 batch = lale.datasets.pandas2spark(batch)
             incremental = incremental.partial_fit(batch)
             self._check_last_trained(
                 reference, incremental, (tgt, lower, upper))
Esempio n. 6
0
 def test_transform(self):
     """Check that RASL and sklearn one-hot encoders transform identically.

     The scikit-learn pipeline is fitted on the pandas training data and
     applied to the pandas test data to obtain the reference output. For
     each entry in ``self.tgt2creditg`` the RASL pipeline is fitted, checked
     with ``self._check_last_trained``, applied to that entry's test data,
     and compared with the reference shape and cell by cell.
     """
     (pd_train_X, _pd_train_y), (pd_test_X, _pd_test_y) = (
         self.tgt2creditg["pandas"])
     select_categoricals = Map(
         columns={name: it[name] for name in categorical()(pd_train_X)})
     candidate = select_categoricals >> RaslOneHotEncoder(sparse=False)
     reference = (
         select_categoricals >> SkOneHotEncoder(sparse=False)
     ).fit(pd_train_X)
     expected = reference.transform(pd_test_X)
     for tgt, ((train_X, _train_y), (test_X, _test_y)) in (
             self.tgt2creditg.items()):
         fitted = candidate.fit(train_X)
         self._check_last_trained(reference, fitted, tgt)
         actual = fitted.transform(test_X)
         if tgt == "spark":
             # Convert the Spark result so it supports .shape and .iloc.
             actual = actual.toPandas()
         self.assertEqual(expected.shape, actual.shape, tgt)
         row_total, col_total = expected.shape[0], expected.shape[1]
         for row_idx in range(row_total):
             for col_idx in range(col_total):
                 self.assertEqual(
                     expected[row_idx, col_idx],
                     actual.iloc[row_idx, col_idx],
                     (row_idx, col_idx, tgt),
                 )
Esempio n. 7
0
# Loads, downsamples, and pickles the forest covertype data for the user study.

import pandas as pd
from lale.datasets import covtype_df
from sklearn.model_selection import train_test_split
from lale.lib.lale import categorical
import pickle

# Number of training rows kept after downsampling.
TRAIN_SIZE = 5000

# Fetch the dataset with 5000 rows held out for testing.
(train_X_all, train_y_all), (test_X, test_y) = covtype_df(test_size=5000)

# Stratified downsampling of the training portion; the remainder
# (other_X / other_y) is not used in the lines that follow.
train_X, other_X, train_y, other_y = train_test_split(
    train_X_all,
    train_y_all,
    train_size=TRAIN_SIZE,
    stratify=train_y_all,
)

# Columns selected by categorical(max_values=1) on the downsampled training
# data are removed from both the training and the test features.
select_constant_columns = categorical(max_values=1)
constant_columns = select_constant_columns(train_X)
train_X = train_X.drop(columns=constant_columns)
test_X = test_X.drop(columns=constant_columns)

# Debugging aid, left disabled:
# pd.options.display.max_columns = None
# pd.concat([train_y, train_X], axis=1)

# Persist the prepared frames, one pickle file per object.
for pickle_name, payload in (
    ('train_x.pickle', train_X),
    ('test_x.pickle', test_X),
    ('train_y.pickle', train_y),
):
    with open(pickle_name, 'wb') as f:
        pickle.dump(payload, f)

with open('test_y.pickle', 'wb') as f: