def test_duplicate_instances(self):
    tfm = PCA()
    clf = LogisticRegression(
        LogisticRegression.solver.lbfgs, LogisticRegression.multi_class.auto
    )
    # make_pipeline rejects using the same operator instance more than once
    with self.assertRaises(ValueError):
        _ = lale.operators.make_pipeline(tfm, tfm, clf)
def test_redacting_pd_cat(self):
    fairness_info = self.creditg_pd_cat["fairness_info"]
    estim = self.prep_pd_cat >> LogisticRegression(max_iter=1000)
    trainable_remi = Redacting(**fairness_info) >> estim
    self._attempt_remi_creditg_pd_cat(fairness_info, trainable_remi, 0.81, 0.91)
def test_reweighing_pd_cat(self):
    fairness_info = self.creditg_pd_cat["fairness_info"]
    estim = self.prep_pd_cat >> LogisticRegression(max_iter=1000)
    trainable_remi = Reweighing(estimator=estim, **fairness_info)
    self._attempt_remi_creditg_pd_cat(fairness_info, trainable_remi, 0.85, 1.00)
def test_reweighing_pd_num(self):
    fairness_info = self.creditg_pd_num["fairness_info"]
    estim = LogisticRegression(max_iter=1000)
    trainable_remi = Reweighing(estimator=estim, **fairness_info)
    self._attempt_remi_creditg_pd_num(fairness_info, trainable_remi, 0.82, 0.92)
def test_eq_odds_postprocessing_pd_cat(self):
    fairness_info = self.creditg_pd_cat["fairness_info"]
    estim = self.prep_pd_cat >> LogisticRegression(max_iter=1000)
    trainable_remi = EqOddsPostprocessing(**fairness_info, estimator=estim)
    self._attempt_remi_creditg_pd_cat(fairness_info, trainable_remi, 0.88, 0.98)
def dont_test_smac_choice(self):
    import numpy as np
    from sklearn import svm, datasets
    from sklearn.model_selection import cross_val_score

    # Import ConfigSpace and different types of parameters
    from smac.configspace import ConfigurationSpace

    # Import SMAC-utilities
    from smac.tae.execute_func import ExecuteTAFuncDict
    from smac.scenario.scenario import Scenario
    from smac.facade.smac_facade import SMAC as orig_SMAC

    tfm = PCA() | Nystroem() | NoOp()
    planned_pipeline1 = (
        (OneHotEncoder(handle_unknown='ignore', sparse=False) | NoOp())
        >> tfm
        >> (LogisticRegression() | KNeighborsClassifier())
    )

    cs: ConfigurationSpace = get_smac_space(planned_pipeline1, lale_num_grids=1)

    # Scenario object
    scenario = Scenario(
        {
            "run_obj": "quality",  # we optimize quality (alternatively runtime)
            "runcount-limit": 1,  # maximum function evaluations
            "cs": cs,  # configuration space
            "deterministic": "true",
        }
    )

    # Optimize, using a SMAC object
    tae = iris_fmin_tae(planned_pipeline1, num_folds=2)
    print("Optimizing! Depending on your machine, this might take a few minutes.")
    smac = orig_SMAC(
        scenario=scenario, rng=np.random.RandomState(42), tae_runner=tae
    )
    incumbent = smac.optimize()
    inc_value = tae(incumbent)
    print("Optimized Value: %.2f" % (inc_value))
def test_disparate_impact_remover_pd_num(self):
    fairness_info = self.creditg_pd_num["fairness_info"]
    trainable_remi = DisparateImpactRemover(**fairness_info) >> LogisticRegression(
        max_iter=1000
    )
    self._attempt_remi_creditg_pd_num(fairness_info, trainable_remi, 0.78, 0.88)
def test_unknown_arg(self):
    with self.assertRaises(jsonschema.ValidationError) as cm:
        LogisticRegression(activation='relu')
    summary = cm.exception.message.split('\n')[0]
    self.assertEqual(
        summary,
        "Invalid configuration for LogisticRegression(activation='relu') "
        "due to argument 'activation' was unexpected.",
    )
def test_constraint(self):
    with self.assertRaises(jsonschema.ValidationError) as cm:
        LogisticRegression(solver='sag', penalty='l1')
    summary = cm.exception.message.split('\n')[0]
    self.assertEqual(
        summary,
        "Invalid configuration for LogisticRegression(solver='sag', penalty='l1') "
        "due to constraint the newton-cg, sag, and lbfgs solvers support only l2 penalties.",
    )
def test_decision_function_binary(self):
    from lale.lib.lale import Project

    train_X, train_y = self._creditG['X'], self._creditG['y']
    trainable = Project(columns={'type': 'number'}) >> LogisticRegression()
    trained = trainable.fit(train_X, train_y)
    decisions = trained.decision_function(train_X)
def test_wrong_cat(self):
    with self.assertRaises(jsonschema.ValidationError) as cm:
        LogisticRegression(solver='adam')
    summary = cm.exception.message.split('\n')[0]
    self.assertEqual(
        summary,
        "Invalid configuration for LogisticRegression(solver='adam') due to invalid value solver=adam.",
    )
class _HyperoptImpl:
    def __init__(
        self,
        estimator=None,
        max_evals=50,
        frac_evals_with_defaults=0,
        algo="tpe",
        cv=5,
        handle_cv_failure=False,
        scoring=None,
        best_score=0.0,
        max_opt_time=None,
        max_eval_time=None,
        pgo: Optional[PGO] = None,
        show_progressbar=True,
        args_to_scorer=None,
        verbose=False,
    ):
        self.max_evals = max_evals
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        if frac_evals_with_defaults > 0:
            self.evals_with_defaults = int(frac_evals_with_defaults * max_evals)
        else:
            self.evals_with_defaults = 0
        self.algo = algo
        self.scoring = scoring
        if self.scoring is None:
            is_clf = self.estimator.is_classifier()
            if is_clf:
                self.scoring = "accuracy"
            else:
                self.scoring = "r2"
        self.best_score = best_score
        self.handle_cv_failure = handle_cv_failure
        self.cv = cv
        self._trials = hyperopt.Trials()
        self._default_trials = hyperopt.Trials()
        self.max_opt_time = max_opt_time
        self.max_eval_time = max_eval_time
        self.pgo = pgo
        self.show_progressbar = show_progressbar
        if args_to_scorer is not None:
            self.args_to_scorer = args_to_scorer
        else:
            self.args_to_scorer = {}
        self.verbose = verbose

    def _summarize_statuses(self):
        status_list = self._trials.statuses()
        status_hist = {}
        for status in status_list:
            status_hist[status] = 1 + status_hist.get(status, 0)
        if hyperopt.STATUS_FAIL in status_hist:
            print(
                f"{status_hist[hyperopt.STATUS_FAIL]} out of {len(status_list)} trials failed, call summary() for details."
            )
            if not self.verbose:
                print("Run with verbose=True to see per-trial exceptions.")

    def fit(self, X_train, y_train):
        opt_start_time = time.time()
        is_clf = self.estimator.is_classifier()
        self.cv = check_cv(self.cv, y=y_train, classifier=is_clf)
        data_schema = lale.helpers.fold_schema(X_train, y_train, self.cv, is_clf)
        self.search_space = hyperopt.hp.choice(
            "meta_model",
            [
                hyperopt_search_space(
                    self.estimator, pgo=self.pgo, data_schema=data_schema
                )
            ],
        )
        # Create a search space with default hyperparameters for all trainable parts of the pipeline.
        # This search space is used for `frac_evals_with_defaults` fraction of the total trials.
        try:
            self.search_space_with_defaults = hyperopt.hp.choice(
                "meta_model",
                [
                    hyperopt_search_space(
                        self.estimator.freeze_trainable(),
                        pgo=self.pgo,
                        data_schema=data_schema,
                    )
                ],
            )
        except Exception:
            logger.warning(
                "Exception caught during generation of default search space, setting frac_evals_with_defaults to zero."
            )
            self.evals_with_defaults = 0

        def hyperopt_train_test(params, X_train, y_train):
            warnings.filterwarnings("ignore")
            trainable = create_instance_from_hyperopt_search_space(
                self.estimator, params
            )
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(
                    trainable,
                    X_train,
                    y_train,
                    cv=self.cv,
                    scoring=self.scoring,
                    args_to_scorer=self.args_to_scorer,
                )
                logger.debug(
                    "Successful trial of hyperopt with hyperparameters:{}".format(
                        params
                    )
                )
            except BaseException as e:
                # If there is any error in cross validation, use the score based on a random train-test split as the evaluation criterion
                if self.handle_cv_failure:
                    (
                        X_train_part,
                        X_validation,
                        y_train_part,
                        y_validation,
                    ) = train_test_split(X_train, y_train, test_size=0.20)
                    start = time.time()
                    trained = trainable.fit(X_train_part, y_train_part)
                    scorer = check_scoring(trainable, scoring=self.scoring)
                    cv_score = scorer(
                        trained, X_validation, y_validation, **self.args_to_scorer
                    )
                    execution_time = time.time() - start
                    y_pred_proba = trained.predict_proba(X_validation)
                    try:
                        logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
                    except BaseException:
                        logloss = 0
                        logger.debug("Warning, log loss cannot be computed")
                else:
                    logger.debug(e)
                    logger.debug(
                        "Error {} with pipeline:{}".format(e, trainable.to_json())
                    )
                    raise e
            return cv_score, logloss, execution_time

        def merge_trials(trials1, trials2):
            max_tid = max([trial["tid"] for trial in trials1.trials])

            for trial in trials2:
                tid = trial["tid"] + max_tid + 1
                hyperopt_trial = hyperopt.Trials().new_trial_docs(
                    tids=[None], specs=[None], results=[None], miscs=[None]
                )
                hyperopt_trial[0] = trial
                hyperopt_trial[0]["tid"] = tid
                hyperopt_trial[0]["misc"]["tid"] = tid
                for key in hyperopt_trial[0]["misc"]["idxs"].keys():
                    hyperopt_trial[0]["misc"]["idxs"][key] = [tid]
                trials1.insert_trial_docs(hyperopt_trial)
                trials1.refresh()
            return trials1

        def proc_train_test(params, X_train, y_train, return_dict):
            return_dict["params"] = copy.deepcopy(params)
            try:
                score, logloss, execution_time = hyperopt_train_test(
                    params, X_train=X_train, y_train=y_train
                )
                return_dict["loss"] = self.best_score - score
                return_dict["time"] = execution_time
                return_dict["log_loss"] = logloss
                return_dict["status"] = hyperopt.STATUS_OK
            except BaseException as e:
                exception_type = f"{type(e).__module__}.{type(e).__name__}"
                try:
                    trainable = create_instance_from_hyperopt_search_space(
                        self.estimator, params
                    )
                    trial_info = (
                        f'pipeline: """{trainable.pretty_print(show_imports=False)}"""'
                    )
                except BaseException:
                    trial_info = f"hyperparams: {params}"
                error_msg = f"Exception caught in Hyperopt: {exception_type}, {traceback.format_exc()}with {trial_info}"
                logger.warning(error_msg + ", setting status to FAIL")
                return_dict["status"] = hyperopt.STATUS_FAIL
                return_dict["error_msg"] = error_msg
                if self.verbose:
                    print(return_dict["error_msg"])

        def get_final_trained_estimator(params, X_train, y_train):
            warnings.filterwarnings("ignore")
            trainable = create_instance_from_hyperopt_search_space(
                self.estimator, params
            )
            trained = trainable.fit(X_train, y_train)
            return trained

        def f(params):
            current_time = time.time()
            if (self.max_opt_time is not None) and (
                (current_time - opt_start_time) > self.max_opt_time
            ):
                # if max optimization time set, and we have crossed it, exit optimization completely
                sys.exit(0)
            if self.max_eval_time:
                # Run hyperopt in a subprocess that can be interrupted
                manager = multiprocessing.Manager()
                proc_dict = manager.dict()
                p = multiprocessing.Process(
                    target=proc_train_test,
                    args=(params, X_train, y_train, proc_dict),
                )
                p.start()
                p.join(self.max_eval_time)
                if p.is_alive():
                    p.terminate()
                    p.join()
                    logger.warning(
                        f"Maximum allotted evaluation time exceeded with hyperparams: {params}, setting status to FAIL"
                    )
                    proc_dict["status"] = hyperopt.STATUS_FAIL
                if "status" not in proc_dict:
                    logger.warning("Corrupted results, setting status to FAIL")
                    proc_dict["status"] = hyperopt.STATUS_FAIL
            else:
                proc_dict = {}
                proc_train_test(params, X_train, y_train, proc_dict)
            return proc_dict

        algo = getattr(hyperopt, self.algo)
        # Search in the search space with defaults
        if self.evals_with_defaults > 0:
            try:
                hyperopt.fmin(
                    f,
                    self.search_space_with_defaults,
                    algo=algo.suggest,
                    max_evals=self.evals_with_defaults,
                    trials=self._default_trials,
                    rstate=np.random.RandomState(SEED),
                    show_progressbar=self.show_progressbar,
                )
            except SystemExit:
                logger.warning(
                    "Maximum allotted optimization time exceeded. Optimization exited prematurely"
                )
            except AllTrialsFailed:
                self._best_estimator = None
                if hyperopt.STATUS_OK not in self._trials.statuses():
                    raise ValueError(
                        "Error from hyperopt, none of the trials succeeded."
                    )

        try:
            hyperopt.fmin(
                f,
                self.search_space,
                algo=algo.suggest,
                max_evals=self.max_evals - self.evals_with_defaults,
                trials=self._trials,
                rstate=np.random.RandomState(SEED),
                show_progressbar=self.show_progressbar,
            )
        except SystemExit:
            logger.warning(
                "Maximum allotted optimization time exceeded. Optimization exited prematurely"
            )
        except AllTrialsFailed:
            self._best_estimator = None
            if hyperopt.STATUS_OK not in self._trials.statuses():
                self._summarize_statuses()
                raise ValueError("Error from hyperopt, none of the trials succeeded.")

        self._trials = merge_trials(self._trials, self._default_trials)
        if self.show_progressbar:
            self._summarize_statuses()
        try:
            best_trial = self._trials.best_trial
            val_loss = self._trials.best_trial["result"]["loss"]
            if len(self._default_trials) > 0:
                default_val_loss = self._default_trials.best_trial["result"]["loss"]
                if default_val_loss < val_loss:
                    best_trial = self._default_trials.best_trial
            best_params = best_trial["result"]["params"]
            logger.info(
                "best score: {:.1%}\nbest hyperparams found using {} hyperopt trials: {}".format(
                    self.best_score - self._trials.average_best_error(),
                    self.max_evals,
                    best_params,
                )
            )
            trained = get_final_trained_estimator(best_params, X_train, y_train)
            self._best_estimator = trained
        except BaseException as e:
            logger.warning(
                "Unable to extract the best parameters from optimization, the error: {}".format(
                    e
                )
            )
            self._best_estimator = None

        return self

    def predict(self, X_eval):
        import warnings

        warnings.filterwarnings("ignore")
        if self._best_estimator is None:
            raise ValueError(
                "Can not predict as the best estimator is None. Either an attempt to call `predict` "
                "before calling `fit` or all the trials during `fit` failed."
            )
        trained = self._best_estimator
        try:
            predictions = trained.predict(X_eval)
        except ValueError as e:
            logger.warning(
                "ValueError in predicting using Hyperopt:{}, the error is:{}".format(
                    trained, e
                )
            )
            predictions = None
        return predictions

    def summary(self):
        """Table summarizing the trial results (ID, loss, time, log_loss, status).

        Returns
        -------
        result : DataFrame"""

        def make_record(trial_dict):
            return {
                "name": f'p{trial_dict["tid"]}',
                "tid": trial_dict["tid"],
                "loss": trial_dict["result"].get("loss", float("nan")),
                "time": trial_dict["result"].get("time", float("nan")),
                "log_loss": trial_dict["result"].get("log_loss", float("nan")),
                "status": trial_dict["result"]["status"],
            }

        records = [make_record(td) for td in self._trials.trials]
        result = pd.DataFrame.from_records(records, index="name")
        return result

    def get_pipeline(self, pipeline_name=None, astype="lale"):
        """Retrieve one of the trials.

        Parameters
        ----------
        pipeline_name : union type, default None

            - string
                Key for table returned by summary(), return a trainable pipeline.

            - None
                When not specified, return the best trained pipeline found.

        astype : 'lale' or 'sklearn', default 'lale'
            Type of resulting pipeline.

        Returns
        -------
        result : Trained operator if best, trainable operator otherwise.
        """
        best_name = None
        if self._best_estimator is not None:
            best_name = f'p{self._trials.best_trial["tid"]}'
        if pipeline_name is None:
            pipeline_name = best_name
        if pipeline_name == best_name:
            result = getattr(self, "_best_estimator", None)
        else:
            tid = int(pipeline_name[1:])
            params = self._trials.trials[tid]["result"]["params"]
            result = create_instance_from_hyperopt_search_space(self.estimator, params)
        if result is None or astype == "lale":
            return result
        assert astype == "sklearn", astype
        return result.export_to_sklearn_pipeline()
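# Hedged usage sketch (not part of the implementation above): how the fit / summary /
# get_pipeline API documented in the docstrings is typically exercised through the
# lale Hyperopt operator. Assumes lale, hyperopt, and scikit-learn are installed;
# the Iris dataset and max_evals=3 are illustrative choices only.
from sklearn.datasets import load_iris

from lale.lib.lale import Hyperopt
from lale.lib.sklearn import PCA, LogisticRegression

X, y = load_iris(return_X_y=True)
planned = PCA >> LogisticRegression  # planned pipeline with free hyperparameters
opt = Hyperopt(estimator=planned, max_evals=3, show_progressbar=False)
trained = opt.fit(X, y)
print(trained.summary())  # one row per trial: loss, time, log_loss, status
best_pipeline = trained.get_pipeline()  # best trained pipeline found during search
_ = best_pipeline.predict(X)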
        if i > max_evals:
            assert False
        try:
            X, y = data_loader()
            clf = Hyperopt(estimator=pipeline, max_evals=i, scoring=scoring)
            trained_pipeline = clf.fit(X, y)
            trained_pipeline.predict(X)
            return True
        except Exception:
            test(3 * i)

    test(1)


kls = inspect.getmembers(autogen, lambda m: isinstance(m, Operator))
LR = LogisticRegression.customize_schema(relevantToOptimizer=[])

classifiers = [
    "AdaBoostClassifier",
    "BernoulliNB",
    "CalibratedClassifierCV",
    "ComplementNB",
    "DecisionTreeClassifier",
    "ExtraTreesClassifier",
    "GaussianNB",
    "GaussianProcessClassifier",
    "GradientBoostingClassifier",
    "KNeighborsClassifier",
    "LGBMClassifier",
    "LabelPropagation",
    "LabelSpreading",
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )
    if verbose:
        print(f'training set shapes: X {X_train.shape}, y {y_train.shape}')
        print(f'test set shapes: X {X_test.shape}, y {y_test.shape}')
    if preprocess:
        from lale.datasets.data_schemas import add_schema

        # note: use the builtin int rather than the removed np.int alias
        X_train = add_schema(X_train.astype(np.number), recalc=True)
        y_train = add_schema(y_train.astype(int), recalc=True)
        X_test = add_schema(X_test.astype(np.number), recalc=True)
        y_test = add_schema(y_test.astype(int), recalc=True)
    else:
        X_train, X_test, y_train, y_test = add_schemas(
            schema_orig, target_col, X_train, X_test, y_train, y_test
        )
    return (X_train, y_train), (X_test, y_test)


if __name__ == "__main__":
    datasets = ['spectf', 'diabetes', 'breast-cancer', 'hill-valley']
    from lale.lib.sklearn import LogisticRegression

    for dataset_name in datasets:
        try:
            (X_train, y_train), (X_test, y_test) = fetch(
                dataset_name, 'classification'
            )
            trained = LogisticRegression().fit(X_train, y_train)
            trained.predict(X_test)
        except BaseException:
            import traceback

            traceback.print_exc()
def test_hyperparam_exclusive_min(self):
    with EnableSchemaValidation():
        with self.assertRaises(jsonschema.ValidationError):
            _ = LogisticRegression(LogisticRegression.penalty.l1, C=0.0)
def test_no_partial_fit(self):
    pipeline = Batching(operator=NoOp() >> LogisticRegression())
    with self.assertRaises(AttributeError):
        _ = pipeline.fit(self.X_train, self.y_train)
def test_hyperparam_penalty_solver_dependence(self):
    with EnableSchemaValidation():
        with self.assertRaises(jsonschema.ValidationError):
            _ = LogisticRegression(
                LogisticRegression.penalty.l1, LogisticRegression.solver.newton_cg
            )
def create_pipeline(self):
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline

    pipeline = make_pipeline(PCA(), LogisticRegression())
    return pipeline
def test_input_schema_fit(self):
    self.maxDiff = None
    self.assertEqual(
        LogisticRegression.input_schema_fit(),
        LogisticRegression.get_schema("input_fit"),
    )
    self.assertEqual(
        (NMF >> LogisticRegression).input_schema_fit(),
        NMF.get_schema("input_fit"),
    )
    self.assertEqual(
        IdentityWrapper(op=LogisticRegression).input_schema_fit(),
        LogisticRegression.get_schema("input_fit"),
    )
    actual = (TfidfVectorizer | NMF).input_schema_fit()
    expected = {
        "anyOf": [
            {
                "type": "object",
                "required": ["X"],
                "additionalProperties": False,
                "properties": {
                    "X": {
                        "anyOf": [
                            {"type": "array", "items": {"type": "string"}},
                            {
                                "type": "array",
                                "items": {
                                    "type": "array",
                                    "minItems": 1,
                                    "maxItems": 1,
                                    "items": {"type": "string"},
                                },
                            },
                        ]
                    },
                    "y": {},
                },
            },
            {
                "type": "object",
                "required": ["X"],
                "additionalProperties": False,
                "properties": {
                    "X": {
                        "type": "array",
                        "items": {
                            "type": "array",
                            "items": {"type": "number", "minimum": 0.0},
                        },
                    },
                    "y": {},
                },
            },
        ]
    }
    self.assertEqual(actual, expected)
def test_resampler(self):
    from lale.lib.lale import ConcatFeatures, NoOp
    from lale.lib.sklearn import (
        PCA,
        LogisticRegression,
        Nystroem,
        RandomForestClassifier,
    )

    X_train, y_train = self.X_train, self.y_train
    X_test, y_test = self.X_test, self.y_test
    import importlib

    module_name = ".".join(res_name.split(".")[0:-1])
    class_name = res_name.split(".")[-1]
    module = importlib.import_module(module_name)
    class_ = getattr(module, class_name)

    with self.assertRaises(ValidationError):
        res = class_()

    # test_schemas_are_schemas
    lale.type_checking.validate_is_schema(class_.input_schema_fit())
    lale.type_checking.validate_is_schema(class_.input_schema_predict())
    lale.type_checking.validate_is_schema(class_.output_schema_predict())
    lale.type_checking.validate_is_schema(class_.hyperparam_schema())

    # test_init_fit_predict
    from lale.operators import make_pipeline

    pipeline1 = PCA() >> class_(operator=make_pipeline(LogisticRegression()))
    trained = pipeline1.fit(X_train, y_train)
    predictions = trained.predict(X_test)

    pipeline2 = class_(operator=make_pipeline(PCA(), LogisticRegression()))
    trained = pipeline2.fit(X_train, y_train)
    predictions = trained.predict(X_test)

    # test_with_hyperopt
    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(
        estimator=PCA >> class_(operator=make_pipeline(LogisticRegression())),
        max_evals=1,
        show_progressbar=False,
    )
    trained_optimizer = optimizer.fit(X_train, y_train)
    predictions = trained_optimizer.predict(X_test)

    pipeline3 = class_(
        operator=PCA()
        >> (Nystroem & NoOp)
        >> ConcatFeatures
        >> LogisticRegression()
    )
    optimizer = Hyperopt(estimator=pipeline3, max_evals=1, show_progressbar=False)
    trained_optimizer = optimizer.fit(X_train, y_train)
    predictions = trained_optimizer.predict(X_test)

    pipeline4 = (
        (
            PCA >> class_(operator=make_pipeline(Nystroem()))
            & class_(operator=make_pipeline(Nystroem()))
        )
        >> ConcatFeatures
        >> LogisticRegression()
    )
    optimizer = Hyperopt(
        estimator=pipeline4, max_evals=1, scoring="roc_auc", show_progressbar=False
    )
    trained_optimizer = optimizer.fit(X_train, y_train)
    predictions = trained_optimizer.predict(X_test)

    # test_cross_validation
    from lale.helpers import cross_val_score

    cv_results = cross_val_score(pipeline1, X_train, y_train, cv=2)
    self.assertEqual(len(cv_results), 2)

    # test_to_json
    pipeline1.to_json()
def test_lfr_pd_num(self):
    fairness_info = self.creditg_pd_num["fairness_info"]
    trainable_remi = LFR(**fairness_info) >> LogisticRegression(max_iter=1000)
    self._attempt_remi_creditg_pd_num(fairness_info, trainable_remi, 0.95, 1.05)
def test_hyperparam_dual_penalty_solver_dependence(self):
    with self.assertRaises(jsonschema.ValidationError):
        lr = LogisticRegression(
            LogisticRegression.penalty.l2, LogisticRegression.solver.sag, dual=True
        )
def test_sans_mitigation_pd_num(self):
    fairness_info = self.creditg_pd_num["fairness_info"]
    trainable_remi = LogisticRegression(max_iter=1000)
    self._attempt_remi_creditg_pd_num(fairness_info, trainable_remi, 0.5, 1.0)
def test_with_lale_classifiers(self):
    from lale.lib.sklearn import BaggingClassifier
    from lale.sklearn_compat import make_sklearn_compat

    clf = BaggingClassifier(base_estimator=LogisticRegression())
    trained = clf.fit(self.X_train, self.y_train)
    trained.predict(self.X_test)
def test_optim_preproc_pd_cat(self):
    # TODO: set the optimizer options as shown in the example
    # https://github.com/Trusted-AI/AIF360/blob/master/examples/demo_optim_data_preproc.ipynb
    fairness_info = self.creditg_pd_cat["fairness_info"]
    _ = OptimPreproc(**fairness_info, optim_options={}) >> LogisticRegression(
        max_iter=1000
    )
def test_with_lale_pipeline(self):
    from lale.lib.sklearn import BaggingClassifier

    clf = BaggingClassifier(base_estimator=PCA() >> LogisticRegression())
    trained = clf.fit(self.X_train, self.y_train)
    trained.predict(self.X_test)
def test_sans_mitigation_pd_cat(self):
    fairness_info = self.creditg_pd_cat["fairness_info"]
    trainable_remi = self.prep_pd_cat >> LogisticRegression(max_iter=1000)
    self._attempt_remi_creditg_pd_cat(fairness_info, trainable_remi, 0.66, 0.76)
def test_hyperparam_keyword_enum(self):
    _ = LogisticRegression(
        LogisticRegression.penalty.l1, C=0.1, solver=LogisticRegression.solver.saga
    )
def test_lr_parameters(self):
    pgo = PGO.load_pgo_file(example_pgo_fp)
    lr = LogisticRegression()
    parameters: SearchSpace = hyperopt_search_space(lr, pgo=pgo)
def test_two_estimators_predict_proba(self):
    pipeline = (
        StandardScaler()
        >> (PCA() & Nystroem() & LogisticRegression())
        >> ConcatFeatures()
        >> NoOp()
        >> LogisticRegression()
    )
    trained = pipeline.fit(self.X_train, self.y_train)
    trained.predict_proba(self.X_test)