def test_scorers_warn(self):
    """The disparate-impact scorer must warn and return NaN for this fairness setup.

    A string/number preprocessing pipeline ending in logistic regression is
    fitted on the ``creditg_pd_cat`` training split; scoring the test split is
    expected to raise a ``UserWarning`` matching "disparate_impact is
    ill-defined" and to yield NaN.
    """
    fairness_info = {
        "favorable_labels": ["good"],
        "protected_attributes": [{"feature": "age", "privileged_groups": [1]}],
    }

    # String columns are one-hot encoded; numeric columns pass through as-is.
    encoded_strings = Project(columns={"type": "string"}) >> OneHotEncoder(
        handle_unknown="ignore"
    )
    plain_numbers = Project(columns={"type": "number"})
    features = (encoded_strings & plain_numbers) >> ConcatFeatures
    pipeline = features >> LogisticRegression(max_iter=1000)

    dataset = self.creditg_pd_cat
    fitted = pipeline.fit(dataset["train_X"], dataset["train_y"])
    held_out_X = dataset["test_X"]
    held_out_y = dataset["test_y"]

    scorer = lale.lib.aif360.disparate_impact(**fairness_info)
    with self.assertWarnsRegex(UserWarning, "disparate_impact is ill-defined"):
        score = scorer(fitted, held_out_X, held_out_y)
    self.assertTrue(np.isnan(score))
def test_using_pipeline(self):
    """Optimize a planned pipeline with Hyperopt, then refine its last step with NSGA2.

    Checks that the multi-objective summary is non-empty, that both loss
    columns lie in [0, 1] for every row, and that the pipeline returned by
    ``OptimizeLast`` has the same type as the one returned by Hyperopt.
    """
    import lale.datasets.openml

    (X_train, y_train), (X_test, y_test) = lale.datasets.openml.fetch(
        'credit-g', 'classification', preprocess=False)

    project_nums = Project(columns={'type': 'number'})
    project_cats = Project(columns={'type': 'string'})
    # `>>` binds tighter than `&`; the parentheses only make that grouping explicit.
    planned_pipeline = (
        ((project_nums >> (Normalizer | NoOp)) & (project_cats >> OneHotEncoder))
        >> ConcatFeatures
        >> (LGBMClassifier | GradientBoostingClassifier))

    # Let's first use Hyperopt to find the best pipeline
    opt = Hyperopt(estimator=planned_pipeline, max_evals=3)
    # run optimizer
    res = opt.fit(X_train, y_train)
    best_pipeline = res.get_pipeline()

    # Now let's use NSGA2 to perform multi-objective
    # optimization on the last step (i.e., classifier)
    # in the best pipeline returned by Hyperopt
    fpr_scorer = make_scorer(compute_fpr, greater_is_better=False)
    nsga2_args = {
        'scoring': ['roc_auc', fpr_scorer],
        'best_score': [1, 0],
        'cv': 3,
        'max_evals': 20,
        'population_size': 10,
    }
    opt_last = OptimizeLast(estimator=best_pipeline, last_optimizer=NSGA2,
                            optimizer_args=nsga2_args)

    res_last = opt_last.fit(X_train, y_train)
    df_summary = res_last.summary()
    print(df_summary)
    self.assertGreater(df_summary.shape[0], 0)

    # check if summary contains valid loss values
    valid_objs = all(
        0 <= record[column] <= 1
        for _, record in df_summary.iterrows()
        for column in ('loss1', 'loss2'))
    self.assertTrue(valid_objs, msg="Invalid loss values in summary")

    _ = res_last.predict(X_test)
    best_pipeline2 = res_last.get_pipeline()
    self.assertEqual(type(best_pipeline), type(best_pipeline2))

    auc_scorer = get_scorer('roc_auc')
    auc_value = auc_scorer(best_pipeline2, X_test, y_test)
    fpr_value = fpr_scorer(best_pipeline2, X_test, y_test)
    print(f'test_using_pipeline: \n'
          f'AUC, FPR scorer values on test split - {auc_value:.3f} {fpr_value:.3f}')
def test_decision_function_binary(self):
    """Fit numeric projection >> logistic regression and call ``decision_function``."""
    from lale.lib.lale import Project

    features = self._creditG["X"]
    labels = self._creditG["y"]
    numeric_only = Project(columns={"type": "number"})
    fitted = (numeric_only >> LogisticRegression()).fit(features, labels)
    # The returned scores are not inspected; the test only requires the call to succeed.
    fitted.decision_function(features)
def test_keep_numbers(self):
    """Project with ``{"type": "number"}`` must keep exactly the numeric columns.

    The schema of the transformed data is compared with an expected schema of
    670 rows whose items are the numeric columns, in order.
    """
    from lale.datasets.data_schemas import to_schema
    from lale.lib.lale import Project

    # Only the features are needed; Project is fitted without labels.
    train_X = self._creditG["X"]
    trainable = Project(columns={"type": "number"})
    trained = trainable.fit(train_X)
    transformed = trained.transform(train_X)
    transformed_schema = to_schema(transformed)

    numeric_columns = [
        "duration",
        "credit_amount",
        "installment_commitment",
        "residence_since",
        "age",
        "existing_credits",
        "num_dependents",
    ]
    transformed_expected = {
        "type": "array",
        "minItems": 670,
        "maxItems": 670,
        "items": {
            "type": "array",
            # Derived from the column list so the counts cannot drift from it.
            "minItems": len(numeric_columns),
            "maxItems": len(numeric_columns),
            "items": [
                {"description": name, "type": "number"}
                for name in numeric_columns
            ],
        },
    }
    self.maxDiff = None  # show the full diff when the schemas differ
    self.assertEqual(transformed_schema, transformed_expected)
def test_multimodal(self):
    """Pretty-print a two-branch pipeline and check it against the expected source text.

    The pipeline normalizes the numeric projection, one-hot encodes the string
    projection, concatenates both and feeds a configured ``LinearSVC``. The
    expected text and ``lale.pretty_print.to_string(pipeline)`` are handed to
    ``self._roundtrip``.
    """
    from lale.lib.lale import ConcatFeatures as Cat
    from lale.lib.lale import Project
    from lale.lib.sklearn import LinearSVC
    from lale.lib.sklearn import Normalizer as Norm
    from lale.lib.sklearn import OneHotEncoder as OneHot

    project_0 = Project(columns={"type": "number"})
    project_1 = Project(columns={"type": "string"})
    linear_svc = LinearSVC(C=29617.4, dual=False, tol=0.005266)
    pipeline = ( ((project_0 >> Norm()) & (project_1 >> OneHot())) >> Cat >> linear_svc )
    # NOTE(review): whitespace inside this literal is presumably significant to
    # the comparison done by _roundtrip -- confirm its line breaks match what
    # the pretty-printer emits before touching it.
    expected = """from lale.lib.lale import Project from sklearn.preprocessing import Normalizer as Norm from sklearn.preprocessing import OneHotEncoder as OneHot from lale.lib.lale import ConcatFeatures as Cat from sklearn.svm import LinearSVC import lale lale.wrap_imported_operators() project_0 = Project(columns={"type": "number"}) project_1 = Project(columns={"type": "string"}) linear_svc = LinearSVC(C=29617.4, dual=False, tol=0.005266) pipeline = ( ((project_0 >> Norm()) & (project_1 >> OneHot())) >> Cat >> linear_svc )"""
    self._roundtrip(expected, lale.pretty_print.to_string(pipeline))
def test_scorers_np_cat(self):
    """Run the scorer checks on the ``creditg_np_cat`` fixture.

    Columns of the training matrix are split by whether they can be cast to
    ``float64``: castable columns are cast and used as-is, the others are
    one-hot encoded. The fitted pipeline is passed to ``_attempt_scorers``.
    """
    fairness_info = self.creditg_np_cat["fairness_info"]
    train_X = self.creditg_np_cat["train_X"]
    train_y = self.creditg_np_cat["train_y"]

    def casts_to_float(column_index):
        # A column counts as numeric when the float64 cast does not raise ValueError.
        try:
            train_X[:, column_index].astype(np.float64)
        except ValueError:
            return False
        return True

    numeric_idx, categorical_idx = [], []
    for column_index in range(train_X.shape[1]):
        bucket = numeric_idx if casts_to_float(column_index) else categorical_idx
        bucket.append(column_index)

    encoded_categories = Project(columns=categorical_idx) >> OneHotEncoder(
        handle_unknown="ignore"
    )
    float_numbers = Project(columns=numeric_idx) >> FunctionTransformer(
        func=lambda x: x.astype(np.float64)
    )
    features = (encoded_categories & float_numbers) >> ConcatFeatures
    pipeline = features >> LogisticRegression(max_iter=1000)

    fitted = pipeline.fit(train_X, train_y)
    held_out_X = self.creditg_np_cat["test_X"]
    held_out_y = self.creditg_np_cat["test_y"]
    self._attempt_scorers(fairness_info, fitted, held_out_X, held_out_y)
def _prep_pd_cat(cls):
    """Return the preprocessing prefix: one-hot encoded string columns joined with numeric columns."""
    encoded_strings = Project(columns={"type": "string"}) >> OneHotEncoder(
        handle_unknown="ignore"
    )
    plain_numbers = Project(columns={"type": "number"})
    return (encoded_strings & plain_numbers) >> ConcatFeatures
def test_preprocessing_union(self):
    """Fit Hyperopt (one evaluation) on a pipeline that unions numeric and non-numeric preprocessing."""
    from lale.datasets import openml
    from lale.lib.lale import ConcatFeatures as Concat
    from lale.lib.lale import Hyperopt
    from lale.lib.lale import Project
    from lale.lib.sklearn import Normalizer, OneHotEncoder
    from lale.lib.sklearn import RandomForestClassifier as Forest

    # The test split is fetched but not used by this test.
    (train_X, train_y), (_test_X, _test_y) = openml.fetch(
        'credit-g', 'classification', preprocess=False)

    numeric_branch = Project(columns={'type': 'number'}) >> Normalizer
    other_branch = (Project(columns={'not': {'type': 'number'}})
                    >> OneHotEncoder(sparse=False))
    planned = (numeric_branch & other_branch) >> Concat >> Forest

    optimizer = Hyperopt(estimator=planned, max_evals=1)
    # Passing means fit() completes; the fitted result is not inspected.
    optimizer.fit(train_X, train_y)
def test_keep_non_numbers(self):
    """Project with ``{'not': {'type': 'number'}}`` must keep exactly the non-numeric columns.

    The schema of the transformed data is compared with an expected schema of
    670 rows whose items are the enum-typed columns listed below, in order.
    """
    from lale.datasets.data_schemas import to_schema
    from lale.lib.lale import Project

    # Only the features are needed; Project is fitted without labels.
    train_X = self._creditG['X']
    trainable = Project(columns={'not': {'type': 'number'}})
    trained = trainable.fit(train_X)
    transformed = trained.transform(train_X)
    transformed_schema = to_schema(transformed)

    # (column description, allowed values) in the expected column order.
    enum_columns = [
        ('checking_status', ['<0', '0<=X<200', '>=200', 'no checking']),
        ('credit_history', [
            'no credits/all paid', 'all paid', 'existing paid',
            'delayed previously', 'critical/other existing credit']),
        ('purpose', [
            'new car', 'used car', 'furniture/equipment', 'radio/tv',
            'domestic appliance', 'repairs', 'education', 'vacation',
            'retraining', 'business', 'other']),
        ('savings_status', [
            '<100', '100<=X<500', '500<=X<1000', '>=1000',
            'no known savings']),
        ('employment', ['unemployed', '<1', '1<=X<4', '4<=X<7', '>=7']),
        ('personal_status', [
            'male div/sep', 'female div/dep/mar', 'male single',
            'male mar/wid', 'female single']),
        ('other_parties', ['none', 'co applicant', 'guarantor']),
        ('property_magnitude', [
            'real estate', 'life insurance', 'car', 'no known property']),
        ('other_payment_plans', ['bank', 'stores', 'none']),
        ('housing', ['rent', 'own', 'for free']),
        ('job', [
            'unemp/unskilled non res', 'unskilled resident', 'skilled',
            'high qualif/self emp/mgmt']),
        ('own_telephone', ['none', 'yes']),
        ('foreign_worker', ['yes', 'no']),
    ]
    transformed_expected = {
        'type': 'array',
        'minItems': 670,
        'maxItems': 670,
        'items': {
            'type': 'array',
            # Derived from the table so the counts cannot drift from it.
            'minItems': len(enum_columns),
            'maxItems': len(enum_columns),
            'items': [
                {'description': description, 'enum': values}
                for description, values in enum_columns
            ],
        },
    }
    self.maxDiff = None  # show the full diff when the schemas differ
    self.assertEqual(transformed_schema, transformed_expected)
def _fit_gbt_num(self, X, y):
    """Register the 'gbt_num' candidate: numeric columns >> mean imputation >> boosted model.

    The final estimator is whatever ``auto_gbt`` returns for
    ``self.prediction_type``, instantiated with no arguments. The pipeline is
    handed to ``self._try_and_add`` together with ``X`` and ``y``.
    """
    from lale.lib.lale import Project
    from lale.lib.sklearn import SimpleImputer

    booster_factory = auto_gbt(self.prediction_type)
    numeric_only = Project(columns={'type': 'number'})
    mean_imputer = SimpleImputer(strategy='mean')
    candidate = numeric_only >> mean_imputer >> booster_factory()
    self._try_and_add('gbt_num', candidate, X, y)
def _fit_gbt_num(self, X, y):
    """Build the "gbt_num" pipeline and pass it to ``self._try_and_add``.

    The pipeline keeps only numeric columns, imputes them with the mean, and
    ends in an instance of the operator chosen by
    ``auto_gbt(self.prediction_type)``.
    """
    from lale.lib.lale import Project
    from lale.lib.sklearn import SimpleImputer

    gbt_operator = auto_gbt(self.prediction_type)
    # Numeric preprocessing prefix shared by the whole candidate.
    prefix = Project(columns={"type": "number"}) >> SimpleImputer(strategy="mean")
    self._try_and_add("gbt_num", prefix >> gbt_operator(), X, y)
def test_keep_numbers(self):
    """Numeric projection of the credit-g fixture must yield the seven numeric columns."""
    from lale.datasets.data_schemas import to_schema
    from lale.lib.lale import Project

    def numeric(column_name):
        # Expected schema entry for one numeric column.
        return {'description': column_name, 'type': 'number'}

    features, _labels = self._creditG['X'], self._creditG['y']
    fitted = Project(columns={'type': 'number'}).fit(features)
    actual_schema = to_schema(fitted.transform(features))

    row_schema = {
        'type': 'array',
        'minItems': 7,
        'maxItems': 7,
        'items': [
            numeric('duration'),
            numeric('credit_amount'),
            numeric('installment_commitment'),
            numeric('residence_since'),
            numeric('age'),
            numeric('existing_credits'),
            numeric('num_dependents'),
        ],
    }
    expected_schema = {
        'type': 'array',
        'minItems': 670,
        'maxItems': 670,
        'items': row_schema,
    }
    self.maxDiff = None  # show the full diff when the schemas differ
    self.assertEqual(actual_schema, expected_schema)
def test_text_and_structured(self):
    """Fit Hyperopt (one evaluation, r2 scoring) on a pipeline mixing text and numeric features.

    The 'review' column goes through TF-IDF, numeric columns are projected
    as-is, and both are concatenated ahead of a choice between linear
    regression and a random-forest regressor.
    """
    from lale.datasets.uci.uci_datasets import fetch_drugscom
    from lale.lib.lale import ConcatFeatures as Cat
    from lale.lib.lale import Hyperopt
    from lale.lib.lale import Project
    from lale.lib.sklearn import LinearRegression as LinReg
    from lale.lib.sklearn import RandomForestRegressor as Forest
    from lale.lib.sklearn import TfidfVectorizer as Tfidf
    from sklearn.model_selection import train_test_split

    full_X, full_y, _test_X, _test_y = fetch_drugscom()
    # subset to speed up debugging
    train_X, _rest_X, train_y, _rest_y = train_test_split(
        full_X, full_y, train_size=0.01, random_state=42)

    text_branch = Project(columns=['review']) >> Tfidf(max_features=100)
    numeric_branch = Project(columns={'type': 'number'})
    planned = (text_branch & numeric_branch) >> Cat >> (LinReg | Forest)

    optimizer = Hyperopt(estimator=planned, max_evals=1, scoring='r2')
    # Passing means fit() completes; the fitted result is not inspected.
    optimizer.fit(train_X, train_y)
def auto_prep(X):
    """Choose a preprocessing operator based on how many columns of ``X`` are categorical.

    Returns mean imputation when ``categorical()`` reports no columns, the
    most-frequent-imputation >> one-hot pipeline when it reports every column,
    and otherwise both applied to their respective column projections and
    concatenated.
    """
    from lale.lib.lale import ConcatFeatures
    from lale.lib.lale import Project
    from lale.lib.lale import categorical
    from lale.lib.sklearn import OneHotEncoder
    from lale.lib.sklearn import SimpleImputer

    total_columns = X.shape[1]
    categorical_count = len(categorical()(X))
    numeric_prep = SimpleImputer(strategy="mean")
    categorical_prep = SimpleImputer(strategy="most_frequent") >> OneHotEncoder(
        handle_unknown="ignore"
    )

    if categorical_count == 0:
        return numeric_prep
    if categorical_count == total_columns:
        return categorical_prep

    # Mixed data: route each kind of column through its own preprocessing.
    numeric_branch = (
        Project(columns={"type": "number"}, drop_columns=categorical())
        >> numeric_prep
    )
    categorical_branch = Project(columns=categorical()) >> categorical_prep
    return (numeric_branch & categorical_branch) >> ConcatFeatures
def test_scorers_pd_cat(self):
    """Run the scorer checks on the ``creditg_pd_cat`` fixture.

    String columns are one-hot encoded, numeric columns pass through, and the
    fitted logistic-regression pipeline is handed to ``_attempt_scorers``.
    """
    fairness_info = self.creditg_pd_cat["fairness_info"]

    string_branch = Project(columns={"type": "string"}) >> OneHotEncoder(
        handle_unknown="ignore"
    )
    number_branch = Project(columns={"type": "number"})
    preprocessing = (string_branch & number_branch) >> ConcatFeatures
    classifier_pipeline = preprocessing >> LogisticRegression(max_iter=1000)

    train_X = self.creditg_pd_cat["train_X"]
    train_y = self.creditg_pd_cat["train_y"]
    fitted = classifier_pipeline.fit(train_X, train_y)

    self._attempt_scorers(
        fairness_info,
        fitted,
        self.creditg_pd_cat["test_X"],
        self.creditg_pd_cat["test_y"],
    )
def auto_prep(X):
    """Return a preprocessing operator suited to the mix of column kinds in ``X``.

    All-numeric input gets mean imputation; all-categorical input gets
    most-frequent imputation followed by one-hot encoding; mixed input gets
    both, each on its own projection, joined by ``ConcatFeatures``.
    """
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    n_cols = X.shape[1]
    n_cats = len(categorical()(X))
    impute_mean = SimpleImputer(strategy='mean')
    impute_and_encode = (SimpleImputer(strategy='most_frequent')
                         >> OneHotEncoder(handle_unknown='ignore'))

    no_categoricals = n_cats == 0
    only_categoricals = n_cats == n_cols
    if not (no_categoricals or only_categoricals):
        numbers = (Project(columns={'type': 'number'},
                           drop_columns=categorical())
                   >> impute_mean)
        categories = Project(columns=categorical()) >> impute_and_encode
        return (numbers & categories) >> ConcatFeatures

    # The numeric case takes precedence, including when X has zero columns.
    return impute_mean if no_categoricals else impute_and_encode
def test_keep_non_numbers(self):
    """Projecting away numeric columns must leave the thirteen enum-typed columns."""
    from lale.datasets.data_schemas import to_schema
    from lale.lib.lale import Project

    def enum_column(description, *choices):
        # Expected schema entry for one categorical column.
        return {"description": description, "enum": list(choices)}

    features = self._creditG["X"]
    fitted = Project(columns={"not": {"type": "number"}}).fit(features)
    actual_schema = to_schema(fitted.transform(features))

    column_schemas = [
        enum_column("checking_status", "<0", "0<=X<200", ">=200", "no checking"),
        enum_column(
            "credit_history",
            "no credits/all paid",
            "all paid",
            "existing paid",
            "delayed previously",
            "critical/other existing credit",
        ),
        enum_column(
            "purpose",
            "new car",
            "used car",
            "furniture/equipment",
            "radio/tv",
            "domestic appliance",
            "repairs",
            "education",
            "vacation",
            "retraining",
            "business",
            "other",
        ),
        enum_column(
            "savings_status",
            "<100",
            "100<=X<500",
            "500<=X<1000",
            ">=1000",
            "no known savings",
        ),
        enum_column("employment", "unemployed", "<1", "1<=X<4", "4<=X<7", ">=7"),
        enum_column(
            "personal_status",
            "male div/sep",
            "female div/dep/mar",
            "male single",
            "male mar/wid",
            "female single",
        ),
        enum_column("other_parties", "none", "co applicant", "guarantor"),
        enum_column(
            "property_magnitude",
            "real estate",
            "life insurance",
            "car",
            "no known property",
        ),
        enum_column("other_payment_plans", "bank", "stores", "none"),
        enum_column("housing", "rent", "own", "for free"),
        enum_column(
            "job",
            "unemp/unskilled non res",
            "unskilled resident",
            "skilled",
            "high qualif/self emp/mgmt",
        ),
        enum_column("own_telephone", "none", "yes"),
        enum_column("foreign_worker", "yes", "no"),
    ]
    expected_schema = {
        "type": "array",
        "minItems": 670,
        "maxItems": 670,
        "items": {
            "type": "array",
            "minItems": 13,
            "maxItems": 13,
            "items": column_schemas,
        },
    }
    self.maxDiff = None  # show the full diff when the schemas differ
    self.assertEqual(actual_schema, expected_schema)
def test_decision_function_binary(self):
    """Check that ``decision_function`` can be called on a fitted numeric >> logistic pipeline."""
    from lale.lib.lale import Project

    train_X, train_y = self._creditG['X'], self._creditG['y']
    pipeline = Project(columns={'type': 'number'}) >> LogisticRegression()
    fitted = pipeline.fit(train_X, train_y)
    # The scores are intentionally discarded; only a successful call is required.
    _ = fitted.decision_function(train_X)