Example #1
    def test_compose4(self):
        import sklearn.datasets

        from lale.lib.lale import NoOp
        from lale.lib.sklearn import (
            PCA,
            KNeighborsClassifier,
            LogisticRegression,
            Nystroem,
            OneHotEncoder,
        )
        from lale.operators import make_choice

        digits = sklearn.datasets.load_digits()
        # hyperparameter values can be given via the schema-derived enum
        ohe = OneHotEncoder(handle_unknown=OneHotEncoder.handle_unknown.ignore)
        ohe.get_params()
        no_op = NoOp()
        pca = PCA()
        nys = Nystroem()
        lr = LogisticRegression()
        knn = KNeighborsClassifier()
        # `|` builds an algorithmic choice, `>>` composes steps in sequence
        step1 = ohe | no_op
        step2 = pca | nys
        step3 = lr | knn
        model_plan = step1 >> step2 >> step3
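For reference, the combinators in this plan are shorthand for functions in lale.operators: `|` corresponds to make_choice (imported above) and `>>` to make_pipeline. A minimal sketch of the same plan written that way, reusing the operator instances from the example:

from lale.operators import make_choice, make_pipeline

model_plan = make_pipeline(
    make_choice(ohe, no_op),  # step1 = ohe | no_op
    make_choice(pca, nys),    # step2 = pca | nys
    make_choice(lr, knn),     # step3 = lr | knn
)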
Example #2
    def test_inverse_transform(self):
        from lale.lib.sklearn import OneHotEncoder, OrdinalEncoder

        fproc_ohe = OneHotEncoder(handle_unknown="ignore")
        # fit on the training data, transform the test data
        trained_ohe = fproc_ohe.fit(self.X_train, self.y_train)
        transformed_X = trained_ohe.transform(self.X_test)
        # round-trip through the wrapped sklearn model's inverse_transform
        orig_X_ohe = trained_ohe._impl._wrapped_model.inverse_transform(transformed_X)

        fproc_oe = OrdinalEncoder(handle_unknown="ignore")
        trained_oe = fproc_oe.fit(self.X_train, self.y_train)
        transformed_X = trained_oe.transform(self.X_test)
        orig_X_oe = trained_oe._impl.inverse_transform(transformed_X)
        # both encoders should recover the same original values
        self.assertEqual(orig_X_ohe.all(), orig_X_oe.all())
Example #3
    def test_scorers_np_cat(self):
        import numpy as np

        from lale.lib.lale import ConcatFeatures, Project
        from lale.lib.sklearn import (
            FunctionTransformer,
            LogisticRegression,
            OneHotEncoder,
        )

        fairness_info = self.creditg_np_cat["fairness_info"]
        train_X = self.creditg_np_cat["train_X"]
        train_y = self.creditg_np_cat["train_y"]
        # split column indices into numeric vs. categorical by attempting a float cast
        cat_columns, num_columns = [], []
        for i in range(train_X.shape[1]):
            try:
                _ = train_X[:, i].astype(np.float64)
                num_columns.append(i)
            except ValueError:
                cat_columns.append(i)
        # one-hot encode the categorical columns, cast the numeric ones to float,
        # then concatenate both branches and train a classifier
        trainable = (
            (
                (Project(columns=cat_columns) >> OneHotEncoder(handle_unknown="ignore"))
                & (
                    Project(columns=num_columns)
                    >> FunctionTransformer(func=lambda x: x.astype(np.float64))
                )
            )
            >> ConcatFeatures
            >> LogisticRegression(max_iter=1000)
        )
        trained = trainable.fit(train_X, train_y)
        test_X = self.creditg_np_cat["test_X"]
        test_y = self.creditg_np_cat["test_y"]
        self._attempt_scorers(fairness_info, trained, test_X, test_y)
Example #4
    def test_scorers_warn(self):
        import numpy as np

        import lale.lib.aif360
        from lale.lib.lale import ConcatFeatures, Project
        from lale.lib.sklearn import LogisticRegression, OneHotEncoder

        fairness_info = {
            "favorable_labels": ["good"],
            "protected_attributes": [{"feature": "age", "privileged_groups": [1]}],
        }
        trainable = (
            (
                (
                    Project(columns={"type": "string"})
                    >> OneHotEncoder(handle_unknown="ignore")
                )
                & Project(columns={"type": "number"})
            )
            >> ConcatFeatures
            >> LogisticRegression(max_iter=1000)
        )
        train_X = self.creditg_pd_cat["train_X"]
        train_y = self.creditg_pd_cat["train_y"]
        trained = trainable.fit(train_X, train_y)
        test_X = self.creditg_pd_cat["test_X"]
        test_y = self.creditg_pd_cat["test_y"]
        # the scorer warns and returns NaN when disparate impact is ill-defined
        disparate_impact_scorer = lale.lib.aif360.disparate_impact(**fairness_info)
        with self.assertWarnsRegex(UserWarning, "disparate_impact is ill-defined"):
            impact = disparate_impact_scorer(trained, test_X, test_y)
        self.assertTrue(np.isnan(impact))
Example #5
    def test_compose5(self):
        import sklearn.datasets

        from lale.lib.sklearn import LogisticRegression, OneHotEncoder

        ohe = OneHotEncoder(handle_unknown=OneHotEncoder.handle_unknown.ignore)
        digits = sklearn.datasets.load_digits()
        lr = LogisticRegression()
        # an individual operator can be trained and applied on its own ...
        lr_trained = lr.fit(digits.data, digits.target)
        lr_trained.predict(digits.data)
        # ... or composed with `>>` into a pipeline trained end to end
        pipeline1 = ohe >> lr
        pipeline1_trained = pipeline1.fit(digits.data, digits.target)
        pipeline1_trained.predict(digits.data)
Example #6
    def _prep_pd_cat(cls):
        from lale.lib.lale import ConcatFeatures, Project
        from lale.lib.sklearn import OneHotEncoder

        # one-hot encode string columns, pass numeric columns through,
        # then concatenate the two branches
        result = (
            (
                Project(columns={"type": "string"})
                >> OneHotEncoder(handle_unknown="ignore")
            )
            & Project(columns={"type": "number"})
        ) >> ConcatFeatures
        return result
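The helper above returns only a preprocessing prefix; a hypothetical use (the class name is assumed, since the snippet's class is not shown) appends an estimator to complete the pipeline:

from lale.lib.sklearn import LogisticRegression

prep = SomeTestCase._prep_pd_cat()  # hypothetical class name
pipeline = prep >> LogisticRegression(max_iter=1000)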
Example #7
    def test_bool_label(self):
        import pandas as pd
        data_records = [
            {'IS_TENT':False, 'GENDER':'M', 'AGE':20, 'MARITAL_STATUS':'Single',  'PROFESSION':'Sales'},
            {'IS_TENT':False, 'GENDER':'M', 'AGE':20, 'MARITAL_STATUS':'Single',  'PROFESSION':'Sales'},
            {'IS_TENT':False, 'GENDER':'F', 'AGE':37, 'MARITAL_STATUS':'Single',  'PROFESSION':'Other'},
            {'IS_TENT':False, 'GENDER':'M', 'AGE':42, 'MARITAL_STATUS':'Married', 'PROFESSION':'Other'},
            {'IS_TENT':True,  'GENDER':'F', 'AGE':24, 'MARITAL_STATUS':'Married', 'PROFESSION':'Retail'},
            {'IS_TENT':False, 'GENDER':'F', 'AGE':24, 'MARITAL_STATUS':'Married', 'PROFESSION':'Retail'},
            {'IS_TENT':False, 'GENDER':'M', 'AGE':29, 'MARITAL_STATUS':'Single',  'PROFESSION':'Retail'},
            {'IS_TENT':False, 'GENDER':'M', 'AGE':29, 'MARITAL_STATUS':'Single',  'PROFESSION':'Retail'},
            {'IS_TENT':True,  'GENDER':'M', 'AGE':43, 'MARITAL_STATUS':'Married', 'PROFESSION':'Trades'},
            {'IS_TENT':False, 'GENDER':'M', 'AGE':43, 'MARITAL_STATUS':'Married', 'PROFESSION':'Trades'}]
        df = pd.DataFrame.from_records(data_records)
        X = df.drop(['IS_TENT'], axis=1).values
        y = df['IS_TENT'].values
        from lale.lib.sklearn import OneHotEncoder as Enc
        from lale.lib.sklearn import GradientBoostingClassifier as Clf
        trainable = Enc() >> Clf()
        trained = trainable.fit(X, y)
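Once fitted, the trained operator predicts like a scikit-learn classifier; a minimal follow-on sketch using the variables above:

predictions = trained.predict(X)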
Example #8
    def test_preprocessing_union(self):
        from lale.datasets import openml
        (train_X, train_y), (test_X, test_y) = openml.fetch(
            'credit-g', 'classification', preprocess=False)
        from lale.lib.lale import Project
        from lale.lib.sklearn import Normalizer, OneHotEncoder
        from lale.lib.lale import ConcatFeatures as Concat
        from lale.lib.sklearn import RandomForestClassifier as Forest
        # normalize the numeric columns and one-hot encode the rest, in parallel
        prep_num = Project(columns={'type': 'number'}) >> Normalizer
        prep_cat = Project(columns={'not': {'type': 'number'}}) >> OneHotEncoder(sparse=False)
        planned = (prep_num & prep_cat) >> Concat >> Forest
        # jointly search the hyperparameter space of the planned pipeline
        from lale.lib.lale import Hyperopt
        hyperopt_classifier = Hyperopt(estimator=planned, max_evals=1)
        best_found = hyperopt_classifier.fit(train_X, train_y)
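The fitted optimizer behaves like a trained model; a short sketch of using it, assuming lale Hyperopt's get_pipeline accessor to retrieve the best pipeline found:

predictions = best_found.predict(test_X)
best_pipeline = best_found.get_pipeline()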
Example #9
    def dont_test_smac_choice(self):

        import numpy as np
        from sklearn import svm, datasets
        from sklearn.model_selection import cross_val_score

        # Import ConfigSpace and different types of parameters
        from smac.configspace import ConfigurationSpace

        # Import SMAC-utilities
        from smac.tae.execute_func import ExecuteTAFuncDict
        from smac.scenario.scenario import Scenario
        from smac.facade.smac_facade import SMAC

        tfm = PCA() | Nystroem() | NoOp()
        planned_pipeline1 = (
            OneHotEncoder(handle_unknown='ignore', sparse=False)
            | NoOp()) >> tfm >> (LogisticRegression() | KNeighborsClassifier())

        # get_smac_space (from lale's SMAC search integration) derives a
        # configuration space from the planned pipeline
        cs: ConfigurationSpace = get_smac_space(planned_pipeline1,
                                                lale_num_grids=5)

        # Scenario object
        scenario = Scenario({
            "run_obj":
            "quality",  # we optimize quality (alternatively runtime)
            "runcount-limit": 1,  # maximum function evaluations
            "cs": cs,  # configuration space
            "deterministic": "true"
        })

        # Optimize, using a SMAC-object; test_iris_fmin_tae is a helper
        # defined elsewhere in this test suite
        tae = test_iris_fmin_tae(planned_pipeline1, num_folds=2)
        print(
            "Optimizing! Depending on your machine, this might take a few minutes."
        )
        smac = SMAC(scenario=scenario,
                    rng=np.random.RandomState(42),
                    tae_runner=tae)

        incumbent = smac.optimize()

        inc_value = tae(incumbent)

        print("Optimized Value: %.2f" % (inc_value))
Example #10
def auto_prep(X):
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    # count how many of the columns look categorical
    n_cols = X.shape[1]
    n_cats = len(categorical()(X))
    # numeric branch: mean imputation; categorical branch: most-frequent
    # imputation followed by one-hot encoding
    prep_num = SimpleImputer(strategy="mean")
    prep_cat = SimpleImputer(strategy="most_frequent") >> OneHotEncoder(
        handle_unknown="ignore")
    if n_cats == 0:
        result = prep_num
    elif n_cats == n_cols:
        result = prep_cat
    else:
        result = (
            (Project(columns={"type": "number"}, drop_columns=categorical()) >>
             prep_num)
            & (Project(columns=categorical()) >> prep_cat)) >> ConcatFeatures
    return result
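A hypothetical call site for auto_prep, assuming a pandas DataFrame where lale's categorical() heuristic flags PROFESSION (few distinct values) but not AGE:

import pandas as pd

train_X = pd.DataFrame({
    "AGE": [20, 24, 29, 37, 42, 43],
    "PROFESSION": ["Sales", "Retail", "Retail", "Other", "Other", "Trades"],
})
prep = auto_prep(train_X)
transformed = prep.fit(train_X).transform(train_X)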
Example #11
    def test_scorers_pd_cat(self):
        from lale.lib.lale import ConcatFeatures, Project
        from lale.lib.sklearn import LogisticRegression, OneHotEncoder

        fairness_info = self.creditg_pd_cat["fairness_info"]
        # one-hot encode string columns, pass numeric columns through,
        # then concatenate and classify
        trainable = (
            (
                (
                    Project(columns={"type": "string"})
                    >> OneHotEncoder(handle_unknown="ignore")
                )
                & Project(columns={"type": "number"})
            )
            >> ConcatFeatures
            >> LogisticRegression(max_iter=1000)
        )
        train_X = self.creditg_pd_cat["train_X"]
        train_y = self.creditg_pd_cat["train_y"]
        trained = trainable.fit(train_X, train_y)
        test_X = self.creditg_pd_cat["test_X"]
        test_y = self.creditg_pd_cat["test_y"]
        self._attempt_scorers(fairness_info, trained, test_X, test_y)
Example #12
def auto_prep(X):
    from lale.lib.lale import ConcatFeatures
    from lale.lib.lale import Project
    from lale.lib.lale import categorical
    from lale.lib.sklearn import OneHotEncoder
    from lale.lib.sklearn import SimpleImputer
    n_cols = X.shape[1]
    n_cats = len(categorical()(X))
    prep_num = SimpleImputer(strategy='mean')
    prep_cat = (SimpleImputer(strategy='most_frequent') >>
                OneHotEncoder(handle_unknown='ignore'))
    if n_cats == 0:
        result = prep_num
    elif n_cats == n_cols:
        result = prep_cat
    else:
        result = (
            (Project(columns={'type': 'number'}, drop_columns=categorical()) >>
             prep_num)
            & (Project(columns=categorical()) >> prep_cat)) >> ConcatFeatures
    return result
Example #13
    def test_bool_label(self):
        import pandas as pd

        data_records = [
            {
                "IS_TENT": False,
                "GENDER": "M",
                "AGE": 20,
                "MARITAL_STATUS": "Single",
                "PROFESSION": "Sales",
            },
            {
                "IS_TENT": False,
                "GENDER": "M",
                "AGE": 20,
                "MARITAL_STATUS": "Single",
                "PROFESSION": "Sales",
            },
            {
                "IS_TENT": False,
                "GENDER": "F",
                "AGE": 37,
                "MARITAL_STATUS": "Single",
                "PROFESSION": "Other",
            },
            {
                "IS_TENT": False,
                "GENDER": "M",
                "AGE": 42,
                "MARITAL_STATUS": "Married",
                "PROFESSION": "Other",
            },
            {
                "IS_TENT": True,
                "GENDER": "F",
                "AGE": 24,
                "MARITAL_STATUS": "Married",
                "PROFESSION": "Retail",
            },
            {
                "IS_TENT": False,
                "GENDER": "F",
                "AGE": 24,
                "MARITAL_STATUS": "Married",
                "PROFESSION": "Retail",
            },
            {
                "IS_TENT": False,
                "GENDER": "M",
                "AGE": 29,
                "MARITAL_STATUS": "Single",
                "PROFESSION": "Retail",
            },
            {
                "IS_TENT": False,
                "GENDER": "M",
                "AGE": 29,
                "MARITAL_STATUS": "Single",
                "PROFESSION": "Retail",
            },
            {
                "IS_TENT": True,
                "GENDER": "M",
                "AGE": 43,
                "MARITAL_STATUS": "Married",
                "PROFESSION": "Trades",
            },
            {
                "IS_TENT": False,
                "GENDER": "M",
                "AGE": 43,
                "MARITAL_STATUS": "Married",
                "PROFESSION": "Trades",
            },
        ]
        df = pd.DataFrame.from_records(data_records)
        X = df.drop(["IS_TENT"], axis=1).values
        y = df["IS_TENT"].values
        from lale.lib.sklearn import GradientBoostingClassifier as Clf
        from lale.lib.sklearn import OneHotEncoder as Enc

        trainable = Enc() >> Clf()
        _ = trainable.fit(X, y)
Example #14
    def test_shallow_impl(self):
        import lale.lib.sklearn.one_hot_encoder as lohe

        ohe = OneHotEncoder()
        self.assertIsInstance(ohe.shallow_impl, lohe._OneHotEncoderImpl)
Example #15
    def test_impl(self):
        import sklearn.preprocessing._encoders as skohe

        ohe = OneHotEncoder()
        self.assertIsInstance(ohe.impl, skohe.OneHotEncoder)
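Read together with the previous example: shallow_impl is lale's own wrapper class, while impl unwraps further to the underlying scikit-learn estimator. A minimal sketch:

ohe = OneHotEncoder()
print(type(ohe.shallow_impl))  # lale wrapper: _OneHotEncoderImpl
print(type(ohe.impl))          # wrapped scikit-learn estimator: OneHotEncoder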
Example #16
def fetch(dataset_name, task_type, verbose=False, preprocess=True):
    if verbose:
        print('Loading dataset:', dataset_name)
    # Check that the dataset name exists in experiments_dict
    try:
        if experiments_dict[dataset_name]['task_type'] != task_type.lower():
            raise ValueError("The task type {} does not match the dataset's task type {}"\
                .format(task_type, experiments_dict[dataset_name]['task_type']))

    except KeyError:
        raise KeyError("Dataset name {} not found in the supported datasets".format(dataset_name))
    data_file_name = os.path.join(download_data_dir, dataset_name+".arff")
    if verbose:
        print(data_file_name)
    if not os.path.exists(data_file_name):
        # Download the data on first use
        if not os.path.exists(download_data_dir):
            os.makedirs(download_data_dir)
            if verbose:
                print('created directory {}'.format(download_data_dir))
        urllib.request.urlretrieve(experiments_dict[dataset_name]['download_arff_url'], data_file_name)

    assert os.path.exists(data_file_name)
    with open(data_file_name) as f:
        dataDictionary = arff.load(f)

    from lale.datasets.data_schemas import liac_arff_to_schema
    schema_orig = liac_arff_to_schema(dataDictionary)
    target_col = experiments_dict[dataset_name]['target']
    if preprocess:
        arffData = pd.DataFrame(dataDictionary['data'])
        #arffData = arffData.fillna(0)
        attributes = dataDictionary['attributes']

        if verbose:
            print(attributes)
        categorical_cols = []
        numeric_cols = []
        X_columns = []
        for i, item in enumerate(attributes):
            if item[0].lower() == target_col:
                target_indx = i
                #remove it from attributes so that the next loop indices are adjusted accordingly.
                del attributes[i]
                y = arffData.iloc[:,target_indx]
                arffData = arffData.drop(i, axis = 1)

        for i, item in enumerate(attributes):
            X_columns.append(i)
            if (((isinstance(item[1], str) and item[1].lower() not in numeric_data_types_list) \
                or isinstance(item[1], list)) and (item[0].lower() != 'class')):
                categorical_cols.append(i)
            elif (isinstance(item[1], str) and item[1].lower() in numeric_data_types_list) and (item[0].lower() != 'class'):
                numeric_cols.append(i)
        if verbose:
            print(f'categorical columns: {categorical_cols}')
            print(f'numeric columns:     {numeric_cols}')
        X = arffData.iloc[:,X_columns]

        # sanity check: count the distinct class labels
        num_classes_from_last_row = len(list(set(y)))

        if verbose:
            print('num_classes_from_last_row', num_classes_from_last_row)

        transformers1 = [
            ( 'imputer_str',
              SimpleImputer(missing_values=None, strategy='most_frequent'),
              categorical_cols),
            ( 'imputer_num',
              SimpleImputer(strategy='mean'), numeric_cols)]
        txm1 = ColumnTransformer(transformers1, sparse_threshold=0.0)

        transformers2 = [
            ( 'ohe', OneHotEncoder(sparse=False),
              list(range(len(categorical_cols)))),
            ( 'no_op', 'passthrough',
              list(range(len(categorical_cols),
                         len(categorical_cols) + len(numeric_cols))))]
        txm2 = ColumnTransformer(transformers2, sparse_threshold=0.0)
        if verbose:
            print("Shape of X before preprocessing", X.shape)
        from lale.operators import make_pipeline
        preprocessing = make_pipeline(txm1, txm2)

        X = preprocessing.fit(X).transform(X)
        if verbose:
            print("Shape of X after preprocessing", X.shape)

    else:
        col_names = [attr[0] for attr in dataDictionary['attributes']]
        df_all = pd.DataFrame(dataDictionary['data'], columns=col_names)
        y = df_all[target_col]
        y = y.squeeze()
        cols_X = [col for col in col_names if col != target_col]
        X = df_all[cols_X]

    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(y)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size = 0.33, random_state = 0)
    if verbose:
        print(f'training set shapes: X {X_train.shape}, y {y_train.shape}')
        print(f'test set shapes:     X {X_test.shape}, y {y_test.shape}')
    X_train, X_test, y_train, y_test = add_schemas( \
        schema_orig, target_col, X_train, X_test, y_train, y_test)
    return (X_train, y_train), (X_test, y_test)
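Example #8 above shows the typical call site for this function; a minimal sketch:

from lale.datasets import openml

(train_X, train_y), (test_X, test_y) = openml.fetch(
    'credit-g', 'classification', preprocess=True, verbose=False)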