Example 1
 def __init__(
         self,
         search_method: str = "smac",
         run_limit: int = 100,
         initial_runs: int = 20,
         search_method_params: dict = frozendict(),
         n_jobs: int = 1,
         exit_processes: Optional[int] = None
 ):
     self.search_method_params = search_method_params
     assert search_method in ("smac", "grid", "random")
     if search_method in ("grid", "random"):
         initial_runs = 0  # initial (random) runs only apply to SMAC; grid/random search don't use them
     self.initial_runs = initial_runs
     self.run_limit = run_limit
     self.evaluator = TrainEvaluator()
     self.search_method = search_method
     self.evaluator.set_shp2model(self.shp2model)
     self.random_state = 0
     self.addition_info = {}
     self.resource_manager = None
     self.ml_task = None
     self.data_manager = None
     self.n_jobs = parse_n_jobs(n_jobs)
     if exit_processes is None:
         exit_processes = max(self.n_jobs // 3, 1)
     self.exit_processes = exit_processes
     self.logger = get_logger(__name__)
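
A hedged usage sketch, not taken from the source: it assumes this constructor belongs to a class named Tuner (Example 6 refers to one), that it is importable from the project package, and that parse_n_jobs returns a positive n_jobs unchanged. What it demonstrates is taken from the constructor above: "grid"/"random" force initial_runs to 0, and exit_processes defaults to max(n_jobs // 3, 1).

    # Hypothetical import; the real module path is not shown in the snippets.
    # from hyperflow.tuner import Tuner

    tuner = Tuner(search_method="random", run_limit=50, initial_runs=20, n_jobs=6)
    assert tuner.initial_runs == 0     # "grid"/"random" ignore initial_runs
    assert tuner.exit_processes == 2   # max(n_jobs // 3, 1) with n_jobs == 6
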
Example 2
 def __init__(self, threshold, n_jobs=1, max_delete=1):
     self.max_delete = max_delete
     self.to_delete = []
     self.threshold = threshold
     self.n_jobs = n_jobs
     self._type = "DataFrame"
     self.logger = get_logger(__name__)
Example 3
    def init_data(
        self,
        data_manager: XYDataManager,
        metric: Scorer,
        all_scoring_functions: bool,
        splitter=None,
    ):
        self.splitter = splitter
        self.data_manager = data_manager
        self.X_train = self.data_manager.X_train
        self.y_train = self.data_manager.y_train
        self.X_test = self.data_manager.X_test
        self.y_test = self.data_manager.y_test

        self.metric = metric
        self.ml_task: MLTask = self.data_manager.ml_task

        self.all_scoring_functions = all_scoring_functions

        # pick the prediction routine once, based on the task type
        if self.ml_task.mainTask == "regression":
            self.predict_function = self._predict_regression
        else:
            self.predict_function = self._predict_proba

        logger_name = self.__class__.__name__
        self.logger = get_logger(logger_name)

        self.Y_optimization = None
        self.Y_actual_train = None
Example 4
 def __init__(self):
     self.resource_manager = None
     self.estimator = None
     self.hyperparams = deepcopy(self.cls_hyperparams)
     self.set_params(**self.hyperparams)
     self.in_feat_grp = None
     self.out_feat_grp = None
     self.logger = get_logger(__name__)
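
The deepcopy above is load-bearing: cls_hyperparams is evidently a class-level dict, and assigning it to self.hyperparams without copying would make every instance share (and mutate) the same object. A minimal, self-contained illustration of that pitfall, using throwaway names unrelated to the project's classes:

    from copy import deepcopy

    class Component:
        cls_hyperparams = {"C": 1.0}  # class-level dict, shared by all instances

        def __init__(self, safe=True):
            # With the copy, instance-level edits stay local; without it,
            # self.hyperparams aliases the shared class dict.
            self.hyperparams = deepcopy(self.cls_hyperparams) if safe else self.cls_hyperparams

    unsafe = Component(safe=False)
    unsafe.hyperparams["C"] = 99.0
    print(Component.cls_hyperparams)  # {'C': 99.0} -- the class dict was mutated

    safe = Component()
    safe.hyperparams["C"] = 0.5
    print(Component.cls_hyperparams)  # still {'C': 99.0} -- unaffected by `safe`
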
Example 5
 def __init__(
         self,
         DAG_descriptions=None,
         hdl_bank_path=None,
         hdl_bank=None,
 ):
     self.logger = get_logger(__name__)
     if DAG_descriptions is None:
         DAG_descriptions = {
             "nan->{highR=highR_nan,lowR=lowR_nan}": "operate.split.nan",
             "highR_nan->lowR_nan": [
                 "operate.drop",
                 {"_name": "operate.merge", "__rely_model": "boost_model"}
             ],
             "lowR_nan->{cat_name=cat_nan,num_name=num_nan}": "operate.split.cat_num",
             "num_nan->num": [
                 "impute.fill_num",
                 {"_name": "impute.fill_abnormal", "__rely_model": "boost_model"}
             ],
             "cat_nan->cat": [
                 "impute.fill_cat",
                 {"_name": "impute.fill_abnormal", "__rely_model": "boost_model"}
             ],
             "cat->{highR=highR_cat,lowR=lowR_cat}": "operate.split.cat",
             "highR_cat->num": [
                 "operate.drop",
                 "encode.label"
             ],
             "lowR_cat->num": [
                 "encode.one_hot",
                 "encode.label"
             ],
             "num->target": [
                 "decision_tree", "libsvm_svc",
                 "k_nearest_neighbors",
                 "catboost",
                 "lightgbm"
             ]
         }
     self.hdl_bank_path = hdl_bank_path
     self.DAG_describe = DAG_descriptions
     if hdl_bank is None:
         if hdl_bank_path:
             hdl_bank = get_hdl_bank(hdl_bank_path)
         else:
             hdl_bank = get_default_hdl_bank()
     if hdl_bank is None:
         hdl_bank = {}
         self.logger.warning("No hdl_bank, will use DAG_descriptions only.")
     self.hdl_bank = hdl_bank
     self.random_state = 42
     self.ml_task = None
     self.data_manager = None
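
The DAG description above reads as a mapping from feature-group transitions to candidate processors: a key such as "num_nan->num" names the source and target feature groups, a string value is a single choice, a list offers alternatives, and a dict entry carries extra options such as __rely_model. A hedged sketch of supplying a custom DAG; HDL_Constructor is the class name used in Example 6, but the import path and the validity of this particular edge set are assumptions:

    # Hypothetical import path.
    # from hyperflow.hdl import HDL_Constructor

    minimal_dag = {
        "num_nan->num": "impute.fill_num",          # single candidate
        "num->target": ["lightgbm", "catboost"],    # two alternatives to search over
    }
    constructor = HDL_Constructor(DAG_descriptions=minimal_dag)
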
Example 6
 def __init__(
         self,
         tuner: Union[Tuner, List[Tuner], None, dict] = None,
         hdl_constructor: Union[HDL_Constructor, List[HDL_Constructor], None, dict] = None,
         resource_manager: Union[ResourceManager, str] = None,
         ensemble_builder: Union[StackEnsembleBuilder, None, bool, int] = None,
         random_state=42
 ):
     # ---logger------------------------------------
     self.logger = get_logger(__name__)
     # ---random_state-----------------------------------
     self.random_state = random_state
     # ---ensemble_builder-----------------------------------
     if ensemble_builder is None:
         self.logger.info("Using default StackEnsembleBuilder.")
         ensemble_builder = StackEnsembleBuilder()
     elif ensemble_builder == False:  # `==`, not `is`: an int 0 also disables ensembling
         self.logger.info("Not using EnsembleBuilder, will select the best estimator.")
     else:
         ensemble_builder = StackEnsembleBuilder(set_model=ensemble_builder)
     self.ensemble_builder = ensemble_builder
     # ---tuners-----------------------------------
     if not tuner:
         tuner = Tuner()
     if not isinstance(tuner, (list, tuple)):
         tuner = [tuner]
     self.tuners: List[Tuner] = tuner
     # ---hdl_constructors-----------------------------------
     if not hdl_constructor:
         hdl_constructor = HDL_Constructor()
     if not isinstance(hdl_constructor, (list, tuple)):
         hdl_constructor = [hdl_constructor]
     self.hdl_constructors = hdl_constructor
     # ---resource_manager-----------------------------------
     if resource_manager is None:
         resource_manager = ResourceManager()
     self.resource_manager = resource_manager
     # ---member_variable------------------------------------
     self.estimator = None
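
A hedged construction sketch based only on the normalization above: scalar arguments are wrapped into one-element lists, ensemble_builder=False skips stacking in favor of the single best estimator, and omitted arguments fall back to defaults. The facade's own name is not shown in the snippet, so HyperFlowEstimator below is an assumption, as are the imports:

    # Hypothetical names and import paths.
    # from hyperflow import HyperFlowEstimator, Tuner

    est = HyperFlowEstimator(
        tuner=[Tuner(search_method="random"), Tuner(search_method="smac")],  # two search stages
        ensemble_builder=False,  # logs "Not using EnsembleBuilder, will select the best estimator."
        random_state=0,
    )
    assert len(est.tuners) == 2
    assert len(est.hdl_constructors) == 1  # a default HDL_Constructor was created and wrapped
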
Example 7
    def test_pipeline(self):
        self.logger = get_logger(__name__)
        df = pd.read_csv("../examples/classification/train_classification.csv")
        y = df.pop("Survived").values
        df = df.loc[:, ["Sex", "Cabin", "Age"]]
        feat_grp = ["cat_nan", "cat_nan", "num_nan"]
        df_train, df_test, y_train, y_test = train_test_split(df,
                                                              y,
                                                              test_size=0.2,
                                                              random_state=10)
        df_train = GenericDataFrame(df_train, feat_grp=feat_grp)
        df_test = GenericDataFrame(df_test, feat_grp=feat_grp)
        cv = KFold(n_splits=5, random_state=10, shuffle=True)
        train_ix, valid_ix = next(cv.split(df_train))

        df_train, df_valid = df_train.split([train_ix, valid_ix])
        y_valid = y_train[valid_ix]
        y_train = y_train[train_ix]

        fill_cat = FillCat()
        fill_cat.in_feat_grp = "cat_nan"
        fill_cat.out_feat_grp = "cat"
        fill_cat.update_hyperparams({"strategy": "<NULL>"})

        fill_num = FillNum()
        fill_num.in_feat_grp = "num_nan"
        fill_num.out_feat_grp = "num"
        fill_num.update_hyperparams({"strategy": "median"})

        ohe = OneHotEncoder()
        ohe.in_feat_grp = "cat"
        ohe.out_feat_grp = "num"

        sgd = SGD()
        sgd.in_feat_grp = "num"
        sgd.update_hyperparams({"loss": "log", "random_state": 10})

        pipeline = GenericPipeline([
            ("fill_cat", fill_cat),
            ("fill_num", fill_num),
            ("ohe", ohe),
            ("sgd", sgd),
        ])

        pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
        pred_train = pipeline.predict(df_train)
        pred_test = pipeline.predict(df_test)
        pred_valid = pipeline.predict(df_valid)
        score_valid = pipeline.predict_proba(df_valid)
        self.logger.info(accuracy_score(y_train, pred_train))
        self.logger.info(accuracy_score(y_valid, pred_valid))
        self.logger.info(accuracy_score(y_test, pred_test))
        result = pipeline.procedure(constants.binary_classification_task,
                                    df_train, y_train, df_valid, y_valid,
                                    df_test, y_test)
        pred_test = result["pred_test"]
        pred_valid = result["pred_valid"]
        self.logger.info(
            accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
        self.logger.info(
            accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))

        pipeline = GenericPipeline([
            ("fill_cat", fill_cat),
            ("fill_num", fill_num),
            ("ohe", ohe),
        ])

        pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
        ret1 = pipeline.transform(df_train, df_valid, df_test)
        ret2 = pipeline.fit_transform(df_train, y_train, df_valid, y_valid,
                                      df_test, y_test)
        for key in ["X_train", "X_valid", "X_test"]:
            assert np.all(ret1[key] == ret2[key])

        pipeline = GenericPipeline([
            ("sgd", sgd),
        ])

        result = pipeline.procedure(constants.binary_classification_task,
                                    ret1["X_train"], y_train, ret1["X_valid"],
                                    y_valid, ret1["X_test"], y_test)
        pred_test = result["pred_test"]
        pred_valid = result["pred_valid"]
        self.logger.info(
            accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
        self.logger.info(
            accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))
Example 8
 def __init__(self,
              store_path="~/hyperflow",
              file_system="local",
              file_system_params=frozendict(),
              db_type="sqlite",
              db_params=frozendict(),
              redis_params=frozendict(),
              max_persistent_estimators=50,
              persistent_mode="fs",
              store_intermediate=True,
              compress_suffix="bz2"):
     # --logger-------------------
     self.logger = get_logger(__name__)
     # --preprocessing------------
     file_system_params = dict(file_system_params)
     db_params = dict(db_params)
     redis_params = dict(redis_params)
     # ---file_system------------
     directory = os.path.split(generic_fs.__file__)[0]
     file_system2cls = find_components(generic_fs.__package__, directory,
                                       FileSystem)
     self.file_system_type = file_system
     if file_system not in file_system2cls:
         raise Exception(f"Invalid file_system {file_system}")
     self.file_system = file_system2cls[file_system](**file_system_params)
     if self.file_system_type == "local":
         store_path = os.path.expandvars(os.path.expanduser(store_path))
     self.store_path = store_path
     # ---data_base------------
     assert db_type in ("sqlite", "postgresql", "mysql")
     self.db_type = db_type
     self.db_params = dict(db_params)
     if db_type == "sqlite":
         assert self.file_system_type == "local"
     # ---redis----------------
     self.redis_params = redis_params
     # ---max_persistent_model---
     self.max_persistent_estimators = max_persistent_estimators
     # ---persistent_mode-------
     self.persistent_mode = persistent_mode
     assert self.persistent_mode in ("fs", "db")
     # ---store_intermediate-------
     self.store_intermediate = store_intermediate
     # ---compress_suffix------------
     self.compress_suffix = compress_suffix
     # ---post_process------------
     self.file_system.mkdir(self.store_path)
     self.is_init_experiments_db = False
     self.is_init_tasks_db = False
     self.is_init_hdls_db = False
     self.is_init_trials_db = False
     self.is_init_redis = False
     self.is_master = False
     # --some specific path based on file_system---
     self.datasets_dir = self.file_system.join(self.store_path, "datasets")
     self.databases_dir = self.file_system.join(self.store_path,
                                                "databases")
     self.parent_trials_dir = self.file_system.join(self.store_path,
                                                    "trials")
     self.parent_experiments_dir = self.file_system.join(
         self.store_path, "experiments")
     for dir_path in [
             self.datasets_dir, self.databases_dir,
             self.parent_experiments_dir, self.parent_trials_dir
     ]:
         self.file_system.mkdir(dir_path)
     # --db-----------------------------------------
     self.Database = get_db_class_by_db_type(self.db_type)
     # --JSONField-----------------------------------------
     if self.db_type == "sqlite":
         from playhouse.sqlite_ext import JSONField
         self.JSONField = JSONField
     elif self.db_type == "postgresql":
         from playhouse.postgres_ext import JSONField
         self.JSONField = JSONField
     elif self.db_type == "mysql":
         from playhouse.mysql_ext import JSONField
         self.JSONField = JSONField
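
A sketch of what construction implies for the on-disk layout, assuming this is the ResourceManager referenced in Example 6 and the default local file system backend: store_path has "~" and environment variables expanded, and datasets/, databases/, trials/, and experiments/ directories are created beneath it.

    # Hypothetical import path.
    # from hyperflow import ResourceManager

    rm = ResourceManager(store_path="~/hyperflow", db_type="sqlite")
    print(rm.store_path)         # expanded, e.g. /home/<user>/hyperflow
    print(rm.datasets_dir)       # <store_path>/datasets
    print(rm.parent_trials_dir)  # <store_path>/trials
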
Example 9
 def __init__(self):
     self.ml_task = None
     self.logger = get_logger(__name__)