def __init__(self, estimator=None, max_depth=7, n_estimators="auto", perc=100, alpha=0.05,
             two_step=True, max_iter=10, random_state=42, verbose=1, budget=10, weak=True,
             n_jobs=-1, imp_mask=None):
    self.imp_mask = imp_mask
    self.n_jobs = n_jobs
    self.max_depth = max_depth
    self.weak = weak
    self.budget = budget
    self.estimator = estimator
    self.n_estimators = n_estimators
    self.perc = perc
    self.alpha = alpha
    self.two_step = two_step
    self.max_iter = max_iter
    self.random_state = random_state
    self.verbose = verbose
    self.__version__ = '0.3'
    self._is_lightgbm = 'lightgbm' in str(type(self.estimator))
    self.logger = get_logger(self)
    self.logging_level = 20 if self.verbose > 0 else 10
def __init__(self):
    self.resource_manager = None
    self.estimator = None
    self.in_feature_groups = None
    self.out_feature_groups = None
    self.hyperparams = {}
    self.logger = get_logger(self)
def __init__(self, threshold, n_jobs=1, max_delete=1):
    self.max_delete = max_delete
    self.to_delete = []
    self.threshold = threshold
    self.n_jobs = n_jobs
    self._type = "DataFrame"
    self.logger = get_logger(self)
def init_data(
        self,
        random_state,
        data_manager: DataManager,
        metric: Scorer,
        should_calc_all_metric: bool,
        splitter,
        should_store_intermediate_result: bool,
        resource_manager: ResourceManager,
        should_finally_fit: bool
):
    self.random_state = random_state
    if hasattr(splitter, "random_state"):
        setattr(splitter, "random_state", self.random_state)
    self.splitter = splitter
    self.data_manager = data_manager
    self.X_train = self.data_manager.X_train
    self.y_train = self.data_manager.y_train
    self.X_test = self.data_manager.X_test
    self.y_test = self.data_manager.y_test
    self.should_store_intermediate_result = should_store_intermediate_result
    self.metric = metric
    self.ml_task: MLTask = self.data_manager.ml_task
    self.should_calc_all_metric = should_calc_all_metric
    if self.ml_task.mainTask == "regression":
        self.predict_function = self._predict_regression
    else:
        self.predict_function = self._predict_proba
    self.logger = get_logger(self)
    self.resource_manager = resource_manager
    self.should_finally_fit = should_finally_fit
def __init__(self, **kwargs):
    self.resource_manager = None
    self.component = None
    self.in_feature_groups = None
    self.out_feature_groups = None
    self.hyperparams = kwargs
    self.set_inside_dict(kwargs)
    self.logger = get_logger(self)
def __init__(self, categorical_feature=None, numerical_feature=None, copy=True, missing_rate=0.4):
    self.missing_rate = missing_rate
    self.numerical_feature = numerical_feature
    self.copy = copy
    self.categorical_feature = categorical_feature
    self.logger = get_logger(self)
def __init__(self, run_id, nameserver=None, nameserver_port=None, host=None, worker_id=None,
             timeout=None, debug=False):
    """
    Parameters
    ----------
    run_id: anything with a __str__ method
        unique id to identify individual HpBandSter run
    nameserver: str
        hostname or IP of the nameserver
    nameserver_port: int
        port of the nameserver
    host: str
        hostname for this worker process
    worker_id: anything with a __str__ method
        if multiple workers are started in the same process, you MUST provide a unique id for
        each one of them using the `worker_id` argument.
    timeout: int or float or None
        specifies the timeout a worker will wait for a new job after finishing a computation
        before shutting down. Towards the end of a long run with multiple workers, this helps
        to shut down idling workers. We recommend a timeout that is roughly half the time it
        would take for the second largest budget to finish. The default (None) means that the
        worker will wait indefinitely and never shut down on its own.
    """
    self.debug = debug
    self.run_id = run_id
    self.host = host
    self.nameserver = nameserver
    self.nameserver_port = nameserver_port
    self.worker_id = "opt.run_%s.worker.%s.%i" % (self.run_id, socket.gethostname(), os.getpid())
    self.manifest_id = uuid4().hex[-8:]
    self.timeout = timeout
    self.timer = None
    if worker_id is not None:
        worker_id = str(worker_id)
        self.worker_id += f".{worker_id}"
        self.manifest_id = worker_id
    self.thread = None
    # naming matters in a distributed environment
    self.logger = get_logger(f"Worker[{self.manifest_id}]")
    self.busy = False
    self.thread_cond = threading.Condition(threading.Lock())
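# A minimal construction sketch (illustrative only: `MyWorker` is a hypothetical subclass of this
# worker base class, and the nameserver address, run_id and timeout are placeholders). As the
# docstring above notes, workers started in the same process each need a unique worker_id.
workers = [
    MyWorker(run_id="example_run", nameserver="127.0.0.1", nameserver_port=9090,
             host="127.0.0.1", worker_id=i, timeout=120)
    for i in range(4)
]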
def __init__(self, new_result_callback, run_id='0', ping_interval=10, nameserver='localhost',
             nameserver_port=None, host=None, queue_callback=None):
    """
    Parameters
    ----------
    new_result_callback: function
        function that will be called with a `Job instance <opt.core.dispatcher.Job>`_ as
        argument. From the `Job` the result can be read and e.g. logged.
    run_id: str
        unique run_id associated with the HPB run
    ping_interval: int
        how often to ping for workers (in seconds)
    nameserver: str
        address of the Pyro4 nameserver
    nameserver_port: int
        port of Pyro4 nameserver
    host: str
        ip (or name that resolves to that) of the network interface to use
    queue_callback: function
        gets called with the number of workers in the pool on every update-cycle
    """
    self.new_result_callback = new_result_callback
    self.queue_callback = queue_callback
    self.run_id = run_id
    self.nameserver = nameserver
    self.nameserver_port = nameserver_port
    self.host = host
    self.ping_interval = int(ping_interval)
    self.shutdown_all_threads = False
    self.logger = get_logger(self)
    self.worker_pool = {}
    self.waiting_jobs = queue.Queue()
    self.running_jobs = {}
    self.idle_workers = set()
    self.thread_lock = threading.Lock()
    self.runner_cond = threading.Condition(self.thread_lock)
    self.discover_cond = threading.Condition(self.thread_lock)
    self.pyro_id = "opt.run_%s.dispatcher" % self.run_id
def __init__(self, method="tsvd", n_components="auto", problem_type=None, random_state=0,
             budget=10, n_jobs=-1):
    self.budget = budget
    self.n_components = n_components
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.problem_type = problem_type
    self.method = method
    self.logger = get_logger(self)
def __init__(
        self,
        top_n_percent=15,
        bandwidth_factor=3,
        min_bandwidth=1e-3,
        bw_estimation="normal_reference",
        min_points_in_kde=2,
):
    self.min_points_in_kde = min_points_in_kde
    self.bw_estimation = bw_estimation
    self.min_bandwidth = min_bandwidth
    self.bandwidth_factor = bandwidth_factor
    self.top_n_percent = top_n_percent
    self.config_transformer: Optional[ConfigurationTransformer] = None
    self.logger = get_logger(self)
def __init__(
        self,
        n_estimators=2048,
        objective=None,
        boosting_type="gbdt",
        # objective="binary",
        learning_rate=0.01,
        max_depth=31,
        num_leaves=31,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        random_state=0,
        # cat_smooth=35,
        lambda_l1=0.1,
        lambda_l2=0.2,
        subsample_for_bin=40000,
        # min_data_in_leaf=4,
        min_child_weight=0.01,
        early_stopping_rounds=256,
        verbose=-1,
        n_jobs=1,
        warm_start=True
):
    self.warm_start = warm_start
    assert self.is_classification is not None, NotImplementedError
    self.n_jobs = n_jobs
    self.objective = objective
    self.verbose = verbose
    self.early_stopping_rounds = early_stopping_rounds
    self.min_child_weight = min_child_weight
    self.subsample_for_bin = subsample_for_bin
    self.lambda_l2 = lambda_l2
    self.lambda_l1 = lambda_l1
    self.random_state = random_state
    self.bagging_freq = bagging_freq
    self.feature_fraction = feature_fraction
    self.bagging_fraction = bagging_fraction
    self.num_leaves = num_leaves
    self.max_depth = max_depth
    self.learning_rate = learning_rate
    self.boosting_type = boosting_type
    self.n_estimators = n_estimators
    self.model = None
    self.current_iterations = 0
    self.early_stopped = False
    self.logger = get_logger(self)
def __init__(self):
    self.logger = get_logger(self)
def __init__(self, budget2epm, budget, acq_func="EI", acq_func_params=frozendict()):
    self.acq_func_params = dict(acq_func_params)
    # todo: import acquisition functions in the form of a package
    if acq_func == "EI":
        acq_func_cls = EI
    elif acq_func == "LogEI":
        acq_func_cls = LogEI
    else:
        raise NotImplementedError
    self.acq_func = acq_func_cls(**self.acq_func_params)
    self.budget2weight = None
    self.budget = budget
    self.budget2epm = budget2epm
    self.logger = get_logger(self)
def __init__(self, steps, should_store_intermediate_result=False, resource_manager=None):
    self.config_id = None
    self.config = None
    self.logger = get_logger(self)
    if resource_manager is None:
        from autoflow import ResourceManager
        self.logger.warning(
            "In ML_Workflow __init__, resource_manager is None, create a default local resource_manager.")
        resource_manager = ResourceManager()
    self.resource_manager = resource_manager
    self.should_store_intermediate_result = should_store_intermediate_result
    self.steps = steps
    self.memory = None
    self.verbose = False
    self._validate_steps()
    self.intermediate_result = {}
    self.fitted = False
    self.budget = 0
def __init__(
        self,
        meta_learner=None,
        use_features_in_secondary=False,
):
    self.use_features_in_secondary = use_features_in_secondary
    assert self.mainTask in ("classification", "regression")
    if not meta_learner:
        if self.mainTask == "classification":
            meta_learner = LogisticRegression(
                penalty='elasticnet', solver="saga", l1_ratio=0.5, C=1.0, fit_intercept=False)
        elif self.mainTask == "regression":
            meta_learner = ElasticNet(fit_intercept=False, random_state=10)
    self.meta_learner = meta_learner
    self.logger = get_logger(self)
def __init__(self, n_uniques: np.ndarray, A=10, B=5, dropout1=0.1, dropout2=0.1, dropout3=0.1,
             n_class=2):
    super(EntityEmbeddingNN, self).__init__()
    self.dropout3 = dropout3
    self.logger = get_logger(self)
    self.epoch = 0
    self.n_class = n_class
    self.dropout2 = dropout2
    self.dropout1 = dropout1
    self.n_uniques = n_uniques
    self.A = A
    self.B = B
    self.embed_dims = self.get_embed_dims(n_uniques)
    sum_ = np.log(self.embed_dims).sum()
    self.n_layer1 = min(1000, int(A * (n_uniques.size ** 0.5) * sum_ + 1))
    self.n_layer2 = int(self.n_layer1 / B) + 2
    self.embedding_blocks = nn.ModuleList([
        nn.Embedding(int(n_unique), int(embed_dim))
        for n_unique, embed_dim in zip(self.n_uniques, self.embed_dims)
    ])
    embed_dims_size = self.embed_dims.sum()
    layer1 = self.get_block(embed_dims_size, self.n_layer1, False, dropout1, "leaky_relu")
    layer2 = self.get_block(self.n_layer1, self.n_layer2, False, dropout2, "leaky_relu")
    layer3 = self.get_block(self.n_layer2, self.n_class, False, dropout3, "leaky_relu")
    self.deep_net = nn.Sequential(layer1, layer2, layer3)
    self.wide_net = self.get_block(embed_dims_size, self.n_class, False, dropout3, "leaky_relu")
    output_modules = []
    if self.n_class > 1:
        output_modules.append(nn.Softmax(dim=1))
    self.output_layer = nn.Sequential(*output_modules)
    self.initializing_modules(chain(
        self.deep_net.modules(), self.wide_net.modules(),
        self.output_layer.modules(), self.embedding_blocks.modules()
    ))
def init_variables(self):
    self.scaler = StandardScaler(copy=True)
    self.rng = np.random.RandomState(self.random_state)
    self.logger = get_logger(self)
    self.model = None
    self.learning_curve = [
        [],  # train_sizes_abs [0]
        [],  # train_scores    [1]
        [],  # test_scores     [2]
    ]
    self.performance_history = np.full(self.early_stopping_rounds, -np.inf)
    self.iteration_history = np.full(self.early_stopping_rounds, 0, dtype="int32")
    N = len(self.performance_history)
    self.best_estimators = np.zeros([N], dtype="object")
    if self.is_classification:
        self.score_func = accuracy_score
    else:
        self.score_func = r2_score
    self.early_stopped = False
    self.best_iteration = 0
def __init__(self, lr=1e-2, max_epoch=25, n_class=None, nn_params=frozendict(), random_state=1000,
             batch_size=1024, optimizer="adam", n_jobs=-1, class_weight=None):
    self.class_weight = class_weight
    self.n_jobs = check_n_jobs(n_jobs)
    self.optimizer = optimizer
    self.batch_size = batch_size
    self.random_state = random_state
    self.nn_params = nn_params
    self.n_class = n_class
    self.max_epoch = max_epoch
    self.lr = lr
    self.rng = check_random_state(random_state)
    self.logger = get_logger(self)
def __init__(self, percentage=20, feats_must_less_than_rows=True, lgbm_w=0.5, et_iters=100,
             lgbm_iters=100, et_budget=1.5, lgbm_budget=1.5, step=10, n_jobs=-1, random_state=42):
    self.feats_must_less_than_rows = feats_must_less_than_rows
    self.random_state = random_state
    self.lgbm_w = lgbm_w
    self.lgbm_budget = lgbm_budget
    self.et_budget = et_budget
    self.n_jobs = n_jobs
    self.step = step
    self.lgbm_iters = lgbm_iters
    self.et_iters = et_iters
    self.percentage = float(np.clip(percentage, 0, 100))
    self.logger = get_logger(self)
def __init__(self, dataset_source="", dataset_path=None, dataset_instance=None, dataset_id=None,
             resource_manager=None, dataset_metadata=frozendict(), upload_type="fs"):
    self.upload_type = upload_type
    self.dataset_id = None
    self.dataset_source = dataset_source
    self.dataset_metadata = dict(dataset_metadata)
    self.dataset_metadata.update(dataset_source=dataset_source)
    self.uploaded_hash = None
    from autoflow.resource_manager.base import ResourceManager
    self.logger = get_logger(self)
    if resource_manager is None:
        self.logger.warning(
            "In DataContainer __init__, resource_manager is None, create a default local resource_manager."
        )
        resource_manager = ResourceManager()
    self.resource_manager: ResourceManager = resource_manager
    data_indicators = [dataset_path, dataset_instance, dataset_id]
    data_indicators = np.array(list(map(lambda x: x is not None, data_indicators)), dtype='int32')
    # exactly one of dataset_path / dataset_instance / dataset_id must be provided
    assert data_indicators.sum() == 1
    if dataset_path is not None:
        data = self.read_local(dataset_path)
        self.data = self.process_dataset_instance(data)
    elif dataset_instance is not None:
        assert isinstance(dataset_instance, self.VALID_INSTANCE)
        self.data = self.process_dataset_instance(dataset_instance)
    elif dataset_id is not None:
        self.download(dataset_id)
    else:
        raise NotImplementedError
def __init__(
        self,
        budget_per_trial=1,
        budget=10,
        # n_jobs=-1,
        verbose=0,
        random_state=42,
        cv=3,
        lr_iter_step=10,
        lr_max_iter=100,
        lr_es_round=4,
        problem_type=None
):
    self.lr_es_round = lr_es_round
    self.lr_max_iter = lr_max_iter
    self.lr_iter_step = lr_iter_step
    self.problem_type = problem_type
    self.cv = cv
    self.random_state = random_state
    self.verbose = verbose
    # self.n_jobs = check_n_jobs(n_jobs)
    self.budget = budget
    self.budget_per_trial = budget_per_trial
    self.logging_level = 20 if verbose > 0 else 10
    self.logger = get_logger(self)
def __init__(
        self,
        base_model="lgbm",  # ["lgbm", "et", "ridge"] are recommended
        n_jobs=-1,
        random_state=42,
        max_dichotomy=None,
        cv=3,
        cv_budget=2,
        test_size=0.33,
        model_params=frozendict()
):
    if max_dichotomy is None:
        if base_model in ("et", "lgbm"):
            max_dichotomy = 10
        else:
            max_dichotomy = 5
    self.model_params = model_params
    self.cv_budget = cv_budget
    self.test_size = test_size
    self.cv = cv
    self.max_dichotomy = max_dichotomy
    self.random_state = random_state
    self.n_jobs = n_jobs
    self.base_model = base_model
    self.logger = get_logger(self)
def __init__(
        self,
        X_train: Union[pd.DataFrame, GenericDataFrame, np.ndarray, None] = None,
        y_train: Union[pd.Series, np.ndarray, str, None] = None,
        X_test: Union[pd.DataFrame, GenericDataFrame, np.ndarray, None] = None,
        y_test: Union[pd.Series, np.ndarray, str, None] = None,
        dataset_metadata: Dict[str, Any] = frozendict(),
        column_descriptions: Dict[str, Union[List[str], str]] = None,
        highR_nan_threshold: float = 0.5,
):
    '''
    Parameters
    ----------
    X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
    y_train: :class:`numpy.ndarray`
    X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
    y_test: :class:`numpy.ndarray`
    dataset_metadata: dict
    column_descriptions: dict
        ``column_descriptions`` is a dict whose keys are ``feature_group`` names and whose values
        are a column (column name) or columns (list of column names).

        This is a list of some frequently-used built-in ``feature_group`` names:

        * ``id``        - id of this table.
        * ``ignore``    - columns which contain irrelevant information.
        * ``target``    - the column your model will learn to predict.
        * ``nan``       - Not a Number, a column containing missing values.
        * ``num``       - numerical features, such as [1, 2, 3].
        * ``cat``       - categorical features, such as ["a", "b", "c"].
        * ``num_nan``   - numerical features containing missing values, such as [1, 2, NaN].
        * ``cat_nan``   - categorical features containing missing values, such as ["a", "b", NaN].
        * ``highR_nan`` - high NaN-ratio columns. You can find an explanation in
          :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
        * ``lowR_nan``  - low NaN-ratio columns. You can find an explanation in
          :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
        * ``highR_cat`` - high cardinality-ratio categorical columns. You can find an explanation in
          :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
        * ``lowR_cat``  - low cardinality-ratio categorical columns. You can find an explanation in
          :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
    highR_nan_threshold: float
        high NaN-ratio threshold; you can find examples and practice in
        :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
    '''
    self.logger = get_logger(self)
    dataset_metadata = dict(dataset_metadata)
    self.highR_nan_threshold = highR_nan_threshold
    self.dataset_metadata = dataset_metadata
    X_train = deepcopy(X_train)
    y_train = deepcopy(y_train)
    X_test = deepcopy(X_test)
    y_test = deepcopy(y_test)
    X_train, y_train, X_test, y_test, feature_groups, column2feature_groups = \
        self.parse_column_descriptions(column_descriptions, X_train, y_train, X_test, y_test)
    self.feature_groups = feature_groups
    self.column2feature_groups = column2feature_groups
    self.ml_task: MLTask = get_ml_task_from_y(y_train)
    self.X_train = GenericDataFrame(X_train, feature_groups=feature_groups)
    self.y_train = y_train
    self.X_test = GenericDataFrame(X_test, feature_groups=feature_groups) if X_test is not None else None
    self.y_test = y_test if y_test is not None else None
    # todo: user-defined validation sets could be specified via RandomShuffle or mlxtend
    # fixme: multilabel is not supported
    if len(y_train.shape) > 2:
        raise ValueError('y must not have more than two dimensions, '
                         'but has %d.' % len(y_train.shape))
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError('X and y must have the same number of '
                         'datapoints, but have %d and %d.' % (X_train.shape[0], y_train.shape[0]))
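# A minimal sketch (illustrative only, not from the original source) of the column_descriptions
# mapping described in the docstring above. The column names ("PassengerId", "Survived", ...) are
# placeholders; any column not listed is typically inferred as "num" or "cat" automatically.
example_column_descriptions = {
    "id": "PassengerId",           # unique row identifier
    "ignore": ["Name", "Ticket"],  # columns carrying irrelevant information
    "target": "Survived",          # the column the model will learn to predict
    "num": ["Age", "Fare"],        # numerical features
    "cat": ["Sex", "Embarked"],    # categorical features
}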
def __init__(self,
             tuner: Union[Tuner, List[Tuner], None, dict] = None,
             hdl_constructor: Union[HDL_Constructor, List[HDL_Constructor], None, dict] = None,
             resource_manager: Union[ResourceManager, str] = None,
             random_state=42,
             log_file: str = None,
             log_config: Optional[dict] = None,
             highR_nan_threshold=0.5,
             highR_cat_threshold=0.5,
             should_store_intermediate_result=False,
             should_finally_fit=False,
             should_calc_all_metrics=True,
             **kwargs):
    '''
    Parameters
    ----------
    tuner: :class:`autoflow.tuner.tuner.Tuner` or None
        ``Tuner`` is a class that acts as an agent for an abstract search process.
    hdl_constructor: :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or None
        ``HDL`` is an abbreviation of Hyper-parameter Descriptions Language.
        It describes an abstract hyper-parameter space that is independent of any concrete implementation.

        ``HDL_Constructor`` is a class responsible for translating a dict-type ``DAG-workflow`` into ``HDL``.
    resource_manager: :class:`autoflow.manager.resource_manager.ResourceManager` or None
        ``ResourceManager`` is a class that manages computer resources such as the ``file_system`` and the ``database``.
    random_state: int
        random state
    log_file: path
        the file in which to store the log; if None, ``autoflow.log`` will be used.
    log_config: dict
        logging configuration
    highR_nan_threshold: float
        high NaN-ratio threshold, you can find example and practice in
        :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
    highR_cat_threshold: float
        high categorical-cardinality-ratio threshold, you can find example and practice in
        :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
    kwargs
        if parameters like ``tuner``, ``hdl_constructor`` or ``resource_manager`` are passed as None,
        you can pass kwargs to make those parameters work. See the following example.

    Examples
    ---------
    In this example, you can see a trick to seed kwargs parameters without explicitly initializing
    :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or other classes.

    In the following example, the user passes ``DAG_workflow`` and ``hdl_bank`` as keyword arguments,
    and we can see that ``hdl_constructor`` is instantiated from kwargs implicitly.

    >>> from autoflow import AutoFlowClassifier
    >>> classifier = AutoFlowClassifier(DAG_workflow={"num->target":["lightgbm"]},
    ...   hdl_bank={"classification":{"lightgbm":{"boosting_type":  {"_type": "choice", "_value":["gbdt","dart","goss"]}}}})
    AutoFlowClassifier(hdl_constructor=HDL_Constructor(
        DAG_workflow={'num->target': ['lightgbm']}
        hdl_bank_path=None
        hdl_bank={'classification': {'lightgbm': {'boosting_type': {'_type': 'choice', '_value': ['gbdt', 'dart', 'goss']}}}}
        included_classifiers=('adaboost', 'catboost', 'decision_tree', 'extra_trees', 'gaussian_nb', 'k_nearest_neighbors', 'liblinear_svc', 'lib...
    '''
    self.should_finally_fit = should_finally_fit
    self.should_store_intermediate_result = should_store_intermediate_result
    self.should_calc_all_metrics = should_calc_all_metrics
    self.log_config = log_config
    self.highR_nan_threshold = highR_nan_threshold
    self.highR_cat_threshold = highR_cat_threshold
    # ---logger------------------------------------
    self.log_file = log_file
    setup_logger(self.log_file, self.log_config)
    self.logger = get_logger(self)
    # ---random_state-----------------------------------
    self.random_state = random_state
    # ---tuner-----------------------------------
    tuner = instancing(tuner, Tuner, kwargs)
    # ---tuners-----------------------------------
    self.tuners = sequencing(tuner, Tuner)
    self.tuner = self.tuners[0]
    # ---hdl_constructor--------------------------
    hdl_constructor = instancing(hdl_constructor, HDL_Constructor, kwargs)
    # ---hdl_constructors-------------------------
    self.hdl_constructors = sequencing(hdl_constructor, HDL_Constructor)
    self.hdl_constructor = self.hdl_constructors[0]
    # ---resource_manager-----------------------------------
    self.resource_manager = instancing(resource_manager, ResourceManager, kwargs)
    # ---member_variable------------------------------------
    self.estimator = None
    self.ensemble_estimator = None
from copy import deepcopy
from fractions import Fraction
from typing import Dict, Optional, Union, List

import numpy as np
from ConfigSpace import ConfigurationSpace, Constant, CategoricalHyperparameter, Configuration
from ConfigSpace.util import deactivate_inactive_hyperparameters
from scipy.spatial.distance import euclidean
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder, StandardScaler

from autoflow.constants import ERR_LOSS
from autoflow.utils.logging_ import get_logger

inc_logger = get_logger("incumbent trajectory")
logger = get_logger(__name__)


def is_top_level_activated(config_space, config, hp_name, hp_value=None):
    parent_conditions = config_space.get_parent_conditions_of(hp_name)
    if len(parent_conditions):
        parent_condition = parent_conditions[0]
        parent_value = parent_condition.value
        parent_name = parent_condition.parent.name
        return is_top_level_activated(config_space, config, parent_name, parent_value)
    # no conditional dependency: this hyperparameter is already the top-level parent
    if hp_value is None:
        return True
    return config[hp_name] == hp_value
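# A minimal usage sketch (illustrative, not from the original source): "classifier:C" is only
# active when the top-level hyperparameter "classifier" takes the value "svm", so
# is_top_level_activated walks the parent condition upward and checks that value on the config.
from ConfigSpace import UniformFloatHyperparameter
from ConfigSpace.conditions import EqualsCondition

_cs = ConfigurationSpace(seed=42)
_clf = CategoricalHyperparameter("classifier", ["svm", "rf"])
_C = UniformFloatHyperparameter("classifier:C", 0.01, 10.0)
_cs.add_hyperparameters([_clf, _C])
_cs.add_condition(EqualsCondition(_C, _clf, "svm"))
_config = _cs.sample_configuration()
# True only when _config["classifier"] == "svm"
print(is_top_level_activated(_cs, _config, "classifier:C"))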
def __init__(self, store_path="~/autoflow", file_system="local", file_system_params=frozendict(),
             db_type="sqlite", db_params=frozendict(), redis_params=frozendict(),
             max_persistent_estimators=50, compress_suffix="bz2"):
    '''
    Parameters
    ----------
    store_path: str
        A path to store files that belong to AutoFlow, such as metadata, model files and database files.
    file_system: str
        Indicator-string about which file system or storage system will be used.

        Available options list below:
        * ``local``
        * ``hdfs``
        * ``s3``

        ``local`` is the default value.
    file_system_params: dict
        Specific file_system configuration.
    db_type: str
        Indicator-string about which database will be used.

        Available options list below:
        * ``sqlite``
        * ``postgresql``
        * ``mysql``

        ``sqlite`` is the default value.
    db_params: dict
        Specific database configuration.
    redis_params: dict
        Redis configuration.
    max_persistent_estimators: int
        Maximum number of models that can persist in a single task.
        If this number is exceeded, the worst performing model file will be deleted,
        and the corresponding database record will also be deleted.
    compress_suffix: str
        Suffix of compressed files; default is ``bz2``.
    '''
    # --logger-------------------
    self.logger = get_logger(self)
    # --preprocessing------------
    file_system_params = dict(file_system_params)
    db_params = dict(db_params)
    redis_params = dict(redis_params)
    # ---file_system------------
    self.file_system_type = file_system
    self.file_system: FileSystem = get_file_system(file_system)(**file_system_params)
    if self.file_system_type == "local":
        store_path = os.path.expandvars(os.path.expanduser(store_path))
    self.store_path = store_path
    # ---data_base------------
    assert db_type in ("sqlite", "postgresql", "mysql")
    self.db_type = db_type
    self.db_params = dict(db_params)
    if db_type == "sqlite":
        assert self.file_system_type == "local"
    # ---redis----------------
    self.redis_params = dict(redis_params)
    # ---max_persistent_model---
    self.max_persistent_estimators = max_persistent_estimators
    # ---compress_suffix------------
    self.compress_suffix = compress_suffix
    # ---post_process------------
    self.file_system.mkdir(self.store_path)
    self.is_init_experiments_db = False
    self.is_init_tasks_db = False
    self.is_init_hdls_db = False
    self.is_init_trials_db = False
    self.is_init_redis = False
    self.is_master = False
    # --some specific path based on file_system---
    self.datasets_dir = self.file_system.join(self.store_path, "datasets")
    self.databases_dir = self.file_system.join(self.store_path, "databases")
    self.parent_trials_dir = self.file_system.join(self.store_path, "trials")
    self.parent_experiments_dir = self.file_system.join(self.store_path, "experiments")
    for dir_path in [self.datasets_dir, self.databases_dir,
                     self.parent_experiments_dir, self.parent_trials_dir]:
        self.file_system.mkdir(dir_path)
    # --db-----------------------------------------
    self.Datebase = get_db_class_by_db_type(self.db_type)
    # --JSONField-----------------------------------------
    self.JSONField = get_JSONField(self.db_type)
    # --database_name---------------------------------
    # None means the database hasn't been created yet
    self._meta_records_db_name = None  # meta records database
    self._tasks_db_name = None
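# A minimal usage sketch (assumptions: the surrounding class is ResourceManager, and the connection
# values below are placeholders rather than real defaults): keep files on the local file system but
# store metadata in PostgreSQL instead of the default sqlite.
example_resource_manager = ResourceManager(
    store_path="~/autoflow",
    file_system="local",
    db_type="postgresql",
    db_params={"host": "127.0.0.1", "port": 5432, "user": "autoflow", "password": "***"},
)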
def __init__(
        self,
        DAG_workflow: Union[str, Dict[str, Any]] = "generic_recommend",
        hdl_bank_path=None,
        hdl_bank=None,
        hdl_metadata=frozendict(),
        balance_strategies=("weight", "None"),
        included_classifiers=("extra_trees", "lightgbm", "logistic_regression", "random_forest",
                              "gbt_lr", "tabular_nn"),
        included_regressors=("extra_trees", "lightgbm", "elasticnet", "random_forest",
                             "gbt_lr", "tabular_nn"),
        included_imputers=("impute.simple", "impute.gbt"),
        included_highC_cat_encoders=("encode.entity", "encode.ordinal", "encode.cat_boost"),
        combine_rare=True,
        included_cat_encoders=("encode.one_hot", "encode.ordinal"),
        num2normed_workflow=frozendict({
            "num->normed": ["scale.standard", "operate.keep_going"],  # "scale.adaptive",
        }),
        text2normed_workflow=frozendict({
            "text->tokenized": "text.tokenize.simple",
            "tokenized->normed": [
                "text.topic.tsvd",
                "text.topic.lsi",
                "text.topic.nmf",
            ]
        }),
        date2normed_workflow=frozendict({}),
        normed2final_workflow=frozendict({
            "normed->final": ["operate.keep_going", "select.boruta", "generate.autofeat"]
        })
):
    self.combine_rare = combine_rare
    self.balance_strategies = balance_strategies
    self.date2normed_workflow = date2normed_workflow
    self.text2normed_workflow = text2normed_workflow
    self.normed2final_workflow = normed2final_workflow
    self.num2normed_workflow = num2normed_workflow
    self.hdl_metadata = dict(hdl_metadata)
    self.included_cat_encoders = included_cat_encoders
    self.included_highC_cat_encoders = included_highC_cat_encoders
    self.included_imputers = included_imputers
    self.included_regressors = included_regressors
    self.included_classifiers = included_classifiers
    self.logger = get_logger(self)
    self.hdl_bank_path = hdl_bank_path
    self.DAG_workflow = DAG_workflow
    if hdl_bank is None:
        if hdl_bank_path:
            hdl_bank = get_hdl_bank(hdl_bank_path)
        else:
            hdl_bank = get_default_hdl_bank()
    if hdl_bank is None:
        hdl_bank = {}
        self.logger.warning("No hdl_bank, will use DAG_descriptions only.")
    self.hdl_bank = hdl_bank
    self.random_state = 42
    self.ml_task = None
    self.data_manager = None
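# A minimal sketch of a dict-type DAG_workflow that could be passed instead of the default
# "generic_recommend" string: each key is an "input_group->output_group" edge and each value lists
# candidate workflow components. The single edge below mirrors the AutoFlowClassifier doctest;
# multi-stage workflows chain feature groups the same way as the defaults above (e.g. "num->normed").
example_DAG_workflow = {"num->target": ["lightgbm"]}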
def __init__(self):
    self.ml_task = None
    self.logger = get_logger(__name__)
def __init__(self,
             resource_manager=None,
             X_train: Union[pd.DataFrame, DataFrameContainer, np.ndarray, None, str] = None,
             y_train: Union[pd.Series, np.ndarray, None] = None,
             X_test: Union[pd.DataFrame, DataFrameContainer, np.ndarray, None, str] = None,
             y_test: Union[pd.Series, np.ndarray, None] = None,
             dataset_metadata: Dict[str, Any] = frozendict(),
             column_descriptions: Dict[str, Union[List[str], str]] = frozendict(),
             highR_nan_threshold: float = 0.5,
             highC_cat_threshold: int = 4,
             consider_ordinal_as_cat=False,
             upload_type="fs"):
    '''
    Parameters
    ----------
    X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
    y_train: :class:`numpy.ndarray`
    X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
    y_test: :class:`numpy.ndarray`
    dataset_metadata: dict
    column_descriptions: dict
        ``column_descriptions`` is a dict whose keys are ``feature_group`` names and whose values
        are a column (column name) or columns (list of column names).

        This is a list of some frequently-used built-in ``feature_group`` names:

        * ``id``        - id of this table.
        * ``ignore``    - columns which contain irrelevant information.
        * ``target``    - the column your model will learn to predict.
        * ``nan``       - Not a Number, a column containing missing values.
        * ``num``       - numerical features, such as [1, 2, 3].
        * ``cat``       - categorical features, such as ["a", "b", "c"].
        * ``num_nan``   - numerical features containing missing values, such as [1, 2, NaN].
        * ``cat_nan``   - categorical features containing missing values, such as ["a", "b", NaN].
        * ``highR_nan`` - high NaN-ratio columns. You can find an explanation in
          :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
        * ``lowR_nan``  - low NaN-ratio columns. You can find an explanation in
          :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
        * ``highC_cat`` - high-cardinality categorical columns. You can find an explanation in
          :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
        * ``lowR_cat``  - low cardinality-ratio categorical columns. You can find an explanation in
          :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
    highR_nan_threshold: float
        high NaN-ratio threshold; you can find examples and practice in
        :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
    '''
    self.upload_type = upload_type
    from autoflow.resource_manager.base import ResourceManager
    self.logger = get_logger(self)
    if resource_manager is None:
        self.logger.warning(
            "In DataManager __init__, resource_manager is None, create a default local resource_manager."
        )
        resource_manager = ResourceManager()
    self.resource_manager: ResourceManager = resource_manager
    self.highC_cat_threshold = highC_cat_threshold
    self.consider_ordinal_as_cat = consider_ordinal_as_cat
    dataset_metadata = dict(dataset_metadata)
    self.highR_nan_threshold = highR_nan_threshold
    self.dataset_metadata = dataset_metadata
    self.column_descriptions = dict(column_descriptions)
    # --load data to container---------------------------------
    self.X_test, self.input_test_hash = self.parse_data_container("TestSet", X_test, y_test)
    # parse the train set last, so its column_descriptions take precedence
    self.X_train, self.input_train_hash = self.parse_data_container("TrainSet", X_train, y_train)
    # --migrate column descriptions------------------------------
    # if X is a dataset_id, the remote data_container's column_descriptions will be assigned to final_column_descriptions
    if self.final_column_descriptions is not None:
        self.column_descriptions = deepcopy(self.final_column_descriptions)
    # --column descriptions------------------------------
    self.parse_column_descriptions()
    # note: at this point feature_groups and columns are not in one-to-one correspondence,
    # because auxiliary feature groups have been removed
    # ---check target-----------------------------------------------------
    assert "target" in self.column_descriptions
    self.target_col_name = self.column_descriptions["target"]
    # todo: handle the case of predicting on a test set
    # --final column descriptions------------------------------
    # neither user-defined column descriptions nor remotely downloaded ones should contain nan-related entries
    # update `column2essential_feature_groups` to `final_column_descriptions`
    if self.final_column_descriptions is None:
        final_column_descriptions = defaultdict(list)
        final_column_descriptions.update(self.column_descriptions)
        # first, normalize non-unique feature groups to lists
        for feat_grp, cols in final_column_descriptions.items():
            if feat_grp not in UNIQUE_FEATURE_GROUPS:
                if isinstance(cols, str):
                    final_column_descriptions[feat_grp] = [cols]
        # then start updating
        for column, essential_feature_group in self.column2feature_groups.items():
            if column not in final_column_descriptions[essential_feature_group]:
                final_column_descriptions[essential_feature_group].append(column)
        self.final_column_descriptions = final_column_descriptions
    self.final_column_descriptions = dict(self.final_column_descriptions)
    # ---set column descriptions, upload to dataset-----------------------------------------------------
    if self.X_train is not None:
        self.X_train.set_column_descriptions(self.final_column_descriptions)
        self.X_train.upload(self.upload_type)
        self.logger.info(f"TrainSet's DataSet ID = {self.X_train.dataset_id}")
    if self.X_test is not None:
        self.X_test.set_column_descriptions(self.final_column_descriptions)
        self.X_test.upload(self.upload_type)
        self.logger.info(f"TestSet's DataSet ID = {self.X_test.dataset_id}")
    # ---origin hash-----------------------------------------------------
    self.train_set_id = self.X_train.get_hash() if self.X_train is not None else ""
    self.test_set_id = self.X_test.get_hash() if self.X_test is not None else ""
    if self.input_train_hash:
        assert self.input_train_hash == self.train_set_id
    if self.input_test_hash:
        assert self.input_test_hash == self.test_set_id
    # ---pop auxiliary columns-----------------------------------------------------
    y_train, y_test = self.pop_auxiliary_feature_groups()
    # --check that X_train and X_test have the same columns--
    if self.X_test is not None and self.X_train is not None:
        assert self.X_train.shape[1] == self.X_test.shape[1]
        assert np.all(self.X_train.columns == self.X_test.columns)
    # --set feature_groups--
    if self.X_train is not None:
        self.X_train.set_feature_groups(self.feature_groups)
    if self.X_test is not None:
        self.X_test.set_feature_groups(self.feature_groups)
    # --set parameters--
    y_train = to_array(y_train)
    y_test = to_array(y_test)
    # encode label
    assert y_train is not None, ValueError(f"{self.target_col_name} does not exist!")
    self.label_encoder = None
    if is_target_need_label_encode(y_train):
        self.label_encoder = LabelEncoder()
        y_train = self.label_encoder.fit_transform(y_train)
        y_test = self.encode_label(y_test)
    if y_train is not None:
        y_train = NdArrayContainer("TrainLabel", dataset_instance=y_train,
                                   resource_manager=self.resource_manager)
        y_train.upload()
    if y_test is not None:
        y_test = NdArrayContainer("TestLabel", dataset_instance=y_test,
                                  resource_manager=self.resource_manager)
        y_test.upload()
    self.ml_task: MLTask = get_ml_task_from_y(y_train.data)
    self.y_train = y_train
    self.y_test = y_test
    self.train_label_id = self.y_train.get_hash() if self.y_train is not None else ""
    self.test_label_id = self.y_test.get_hash() if self.y_test is not None else ""
    if self.X_train is not None:
        self.columns = self.X_train.columns
    else:
        self.columns = self.X_test.columns
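# A minimal usage sketch (illustrative only: the toy DataFrame and column names are made up; with
# resource_manager=None a default local ResourceManager is created, as warned above).
import pandas as pd

toy_df = pd.DataFrame({
    "uid": [1, 2, 3, 4],
    "age": [23.0, 31.0, None, 45.0],
    "city": ["NY", "SF", "NY", "LA"],
    "label": [0, 1, 0, 1],
})
example_data_manager = DataManager(
    X_train=toy_df,
    column_descriptions={"id": "uid", "target": "label"},
    highR_nan_threshold=0.5,
    highC_cat_threshold=4,
)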
def __init__(
        self,
        url=None,
        email=None,
        password=None,
        user_id=None,
        user_token=None,
):
    if url is None:
        # url = "http://192.168.1.182:9901"
        url = os.getenv("XENON_URL", "https://xacs.nitrogen.fun:9090")
    # todo: add an encrypt field
    self.url = url
    self.user_token = user_token
    self.user_id = user_id
    self.password = password
    self.email = email
    self.db_params = {
        "http_client": True,
        "url": url,
        "headers": {
            'Content-Type': 'application/json',
            'accept': 'application/json',
        }
    }
    token_dir = f"{os.getenv('HOME')}/autoflow/auth"
    token_file = f"{token_dir}/config.json"
    self.login_logger = get_logger("Login")
    if email is None or password is None:
        self.login_logger.info(
            "'email' or 'password' is None, try to "
            "verify User Authentication by 'user_id' and 'user_token'.")
        if user_id is None or user_token is None:
            self.login_logger.info(
                "'user_id' and 'user_token' is None, "
                f"try to load token file '{token_file}'")
            if not Path(token_file).exists():
                self.login_logger.error(
                    f"user_token file '{token_file}' does not exist! AutoFlow-SDK will exit...")
                sys.exit(-1)
            config_data = json.loads(Path(token_file).read_text())
            if "user_token" not in config_data or "user_id" not in config_data:
                self.login_logger.error(
                    f"'user_token' and 'user_id' did not exist in '{token_file}'! AutoFlow-SDK will exit...")
                sys.exit(-1)
            self.user_token = config_data["user_token"]
            self.user_id = config_data["user_id"]
        self.db_params["headers"].update({
            "user_id": str(self.user_id),
            "user_token": self.user_token
        })
    else:
        self.db_params["user"] = self.email
        self.db_params["password"] = self.password
        self.user_id, self.user_token = self.login()
        Path(token_dir).mkdir(parents=True, exist_ok=True)
        Path(token_file).write_text(
            json.dumps({"user_id": self.user_id, "user_token": self.user_token}))
    super(HttpResourceManager, self).__init__(
        store_path="xenon",
        db_params=self.db_params,
        user_id=self.user_id,
        file_system="nitrogen",
        file_system_params={"db_params": self.db_params},
        del_local_log_path=False
    )