Example #1
def __init__(self,
                 fixed_config=None,
                 scorer=None,
                 data_node=None,
                 task_type=REGRESSION,
                 resampling_strategy='cv',
                 resampling_params=None,
                 timestamp=None,
                 output_dir=None,
                 seed=1):
        self.resampling_strategy = resampling_strategy
        self.resampling_params = resampling_params

        self.fixed_config = fixed_config
        self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
        self.task_type = task_type
        self.data_node = data_node
        self.output_dir = output_dir
        self.seed = seed
        self.onehot_encoder = None
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
        self.continue_training = False

        self.train_node = data_node.copy_()
        self.val_node = data_node.copy_()

        self.timestamp = timestamp
Example #2
    def __init__(self,
                 evaluator: _BaseEvaluator,
                 config_space,
                 name,
                 timestamp,
                 eval_type,
                 output_dir=None,
                 seed=None):
        self.evaluator = evaluator
        self.config_space = config_space

        assert name in ['hpo', 'fe']
        self.name = name
        # NB: np.random.random_integers was removed in NumPy 1.17;
        # randint is the modern equivalent.
        self.seed = np.random.randint(
            MAX_INT) if seed is None else seed
        self.start_time = time.time()
        self.timing_list = list()
        self.incumbent = None
        self.eval_type = eval_type
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
        self.init_hpo_iter_num = None
        self.early_stopped_flag = False
        self.timestamp = timestamp
        self.output_dir = output_dir
        self.topk_saver = CombinedTopKModelSaver(k=50,
                                                 model_dir=self.output_dir,
                                                 identifier=self.timestamp)
Example #3
 def _get_logger(self, name):
     import os
     logger_name = 'mindware-%s-%d:%s' % (self.task_id, self._seed, name)
     setup_logger(
         os.path.join(self.tmp_directory, '%s.log' % str(logger_name)),
         self.logging_config,
     )
     return get_logger(logger_name)
Example #4
 def __init__(self, estimator, master_ip, master_port, authkey,
              worker_port):
     self.logger = get_logger(self.__module__ + "." +
                              self.__class__.__name__)
     self.estimator = estimator
     self.evaluator = estimator.get_evaluator()
     self.master_ip = master_ip
     self.master_port = master_port
     self.worker_port = worker_port
     self.worker_messager = WorkerMessager(master_ip, master_port, authkey)
     self.receiver_messager = ReceiverMessager(ip='127.0.0.1',
                                               port=worker_port)
Example #5
    def __init__(self,
                 eval_func,
                 config_space,
                 per_run_time_limit=600,
                 seed=1,
                 R=81,
                 eta=3,
                 n_jobs=1,
                 output_dir='./'):
        self.eval_func = eval_func
        self.config_space = config_space
        self.n_workers = n_jobs
        self.per_run_time_limit = per_run_time_limit

        self.trial_cnt = 0
        self.configs = list()
        self.perfs = list()
        self.incumbent_perf = float("-INF")
        self.incumbent_config = self.config_space.get_default_configuration()
        self.incumbent_configs = list()
        self.incumbent_perfs = list()
        self.evaluation_stats = dict()
        self.evaluation_stats['timestamps'] = list()
        self.evaluation_stats['val_scores'] = list()
        self.global_start_time = time.time()
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)

        # Parameters in Hyperband framework.
        self.restart_needed = True
        self.R = R
        self.eta = eta
        self.seed = seed
        self.logeta = lambda x: log(x) / log(self.eta)
        self.s_max = int(self.logeta(self.R))
        self.B = (self.s_max + 1) * self.R
        self.s_values = list(reversed(range(self.s_max + 1)))
        self.inner_iter_id = 0

        # Parameters in MFSE-HB.
        self.iterate_r = []
        self.target_x = dict()
        self.target_y = dict()
        self.exp_output = dict()
        for index, item in enumerate(
                np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
            r = int(item)
            self.iterate_r.append(r)
            self.target_x[r] = list()
            self.target_y[r] = list()

        self.mf_advisor = MFBatchAdvisor(config_space, output_dir=output_dir)
        self.eval_dict = dict()
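
The Hyperband bookkeeping above (s_max, B, s_values) fixes the bracket schedule. Below is a minimal, self-contained sketch of the bracket sizes those parameters imply for R=81 and eta=3, following the standard Hyperband formulas; the n and r computations are illustrative and not part of the snippet itself:

from math import ceil, log

R, eta = 81, 3
s_max = int(log(R) / log(eta))                  # 4
B = (s_max + 1) * R                             # 405 units of resource per outer loop
for s in reversed(range(s_max + 1)):            # brackets s = 4, 3, 2, 1, 0
    n = int(ceil(B / R * eta ** s / (s + 1)))   # initial number of configs
    r = R * eta ** (-s)                         # initial resource per config
    print('bracket s=%d: start with %d configs at resource %g' % (s, n, r))
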
Example #6
 def __init__(self,
              rep=3,
              metric='acc',
              n_algorithm=3,
              task_type=None,
              total_resource=1200,
              exclude_datasets=None,
              meta_dir=None):
     self.logger = get_logger(self.__module__ + "." +
                              self.__class__.__name__)
     super().__init__(n_algorithm, task_type, metric, rep, total_resource,
                      'ranknet', exclude_datasets, meta_dir)
     self.model = None
Example #7
 def __init__(self,
              n_algorithm=3,
              task_type=None,
              metric='acc',
              exclude_datasets=None):
     self.logger = get_logger(self.__module__ + "." +
                              self.__class__.__name__)
     super().__init__(n_algorithm,
                      task_type,
                      metric=metric,
                      meta_algorithm='lightgbm',
                      exclude_datasets=exclude_datasets)
     self.model = None
Example #8
    def __init__(self,
                 task_type,
                 architectures,
                 time_limit,
                 sampling_strategy='uniform',
                 R=27,
                 eta=3,
                 N=9,
                 n_jobs=1):
        self.architectures = architectures
        self.time_limit = time_limit
        self.task_type = task_type
        self.n_jobs = n_jobs
        self.R = R
        self.eta = eta
        self.N = N
        self.logeta = lambda x: log(x) / log(self.eta)
        self.s_max = int(self.logeta(self.R))
        self.sampling_strategy = sampling_strategy
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)

        from mindware.components.models.img_classification import _classifiers as _img_estimators, _addons as _img_addons
        from mindware.components.models.text_classification import _classifiers as _text_estimators, \
            _addons as _text_addons
        from mindware.components.models.object_detection import _classifiers as _od_estimators, _addons as _od_addons

        self.elimination_strategy = 'bandit'
        # Runtime stats.
        self.evaluation_stats = dict()

        self.update_cs = dict()

        if task_type == IMG_CLS:
            self._estimators = _img_estimators
            self._addons = _img_addons
        elif task_type == TEXT_CLS:
            self._estimators = _text_estimators
            self._addons = _text_addons
        elif task_type == OBJECT_DET:
            self._estimators = _od_estimators
            self._addons = _od_addons
        else:
            raise ValueError("Unknown task type %s" % task_type)
        self.eval_hist_configs = dict()
        self.eval_hist_perfs = dict()

        self.tpe_config_gen = dict()
        self.mfse_config_gen = dict()
Example #9
    def __init__(self, eval_func, config_space,
                 seed=1, R=81, eta=3, n_jobs=1):
        self.eval_func = eval_func
        self.config_space = config_space
        self.n_workers = n_jobs

        self.trial_cnt = 0
        self.configs = list()
        self.perfs = list()
        self.incumbent_perf = float("-INF")
        self.incumbent_config = self.config_space.get_default_configuration()
        self.incumbent_configs = []
        self.incumbent_perfs = []
        self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

        # Parameters in Hyperband framework.
        self.restart_needed = True
        self.R = R
        self.eta = eta
        self.seed = seed
        self.logeta = lambda x: log(x) / log(self.eta)
        self.s_max = int(self.logeta(self.R))
        self.B = (self.s_max + 1) * self.R
        self.s_values = list(reversed(range(self.s_max + 1)))
        self.inner_iter_id = 0

        # Parameters in Hyperband.
        self.iterate_r = list()
        self.target_x = dict()
        self.target_y = dict()
        self.exp_output = dict()
        for index, item in enumerate(np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
            r = int(item)
            self.iterate_r.append(r)
            self.target_x[r] = list()
            self.target_y[r] = list()

        self.eval_dict = dict()
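
Within each bracket, Hyperband runs successive halving: evaluate every surviving configuration at the current resource, keep the top 1/eta fraction, multiply the resource by eta, and repeat until R is reached. A minimal sketch, assuming evaluate(config, resource) returns a score where higher is better (a stand-in for eval_func above, not mindware's implementation):

def successive_halving(configs, r, R, eta, evaluate):
    while configs and r <= R:
        # Score every surviving config at the current resource level.
        perfs = [evaluate(c, r) for c in configs]
        # Keep the top 1/eta fraction (at least one survivor).
        k = max(1, len(configs) // eta)
        order = sorted(range(len(configs)), key=lambda i: perfs[i], reverse=True)
        configs = [configs[i] for i in order[:k]]
        r *= eta
    return configs
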
Example #10
    def __init__(self,
                 eval_func,
                 config_space,
                 config_generator='tpe',
                 seed=1,
                 R=27,
                 eta=3,
                 n_jobs=1):
        self.eval_func = eval_func
        self.config_space = config_space
        self.config_generator = config_generator
        self.n_workers = n_jobs

        self.trial_cnt = 0
        self.configs = list()
        self.perfs = list()
        self.incumbent_perf = float("-INF")
        self.incumbent_config = self.config_space.get_default_configuration()
        self.incumbent_configs = list()
        self.incumbent_perfs = list()
        self.global_start_time = time.time()
        self.time_ticks = list()
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)

        # Parameters in Hyperband framework.
        self.restart_needed = True
        self.R = R
        self.eta = eta
        self.seed = seed
        self.logeta = lambda x: log(x) / log(self.eta)
        self.s_max = int(self.logeta(self.R))
        self.B = (self.s_max + 1) * self.R
        self.s_values = list(reversed(range(self.s_max + 1)))
        self.inner_iter_id = 0

        # Parameters in BOHB.
        self.iterate_r = list()
        self.target_x = dict()
        self.target_y = dict()
        self.exp_output = dict()
        for index, item in enumerate(
                np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
            r = int(item)
            self.iterate_r.append(r)
            self.target_x[r] = list()
            self.target_y[r] = list()

        types, bounds = get_types(self.config_space)
        self.num_config = len(bounds)
        self.surrogate = RandomForestWithInstances(types, bounds)

        # self.executor = ParallelEvaluator(self.eval_func, n_worker=n_jobs)
        # self.executor = ParallelProcessEvaluator(self.eval_func, n_worker=n_jobs)
        self.acquisition_func = EI(model=self.surrogate)
        self.acq_optimizer = RandomSampling(self.acquisition_func,
                                            self.config_space,
                                            n_samples=2000,
                                            rng=np.random.RandomState(seed))

        self.config_gen = TPE(config_space)

        self.eval_dict = dict()
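
The snippet pairs a random-forest surrogate with EI and a random-sampling acquisition optimizer: candidates are drawn at random from the space, scored by expected improvement under the surrogate, and the best one is proposed. A self-contained sketch of that scoring step, assuming a surrogate whose predict returns per-point posterior mean and variance and that lower loss is better (the names here are illustrative, not mindware's API):

import numpy as np
from scipy.stats import norm

def expected_improvement(mu, var, best_loss):
    # EI for minimization: expected amount by which a point beats the incumbent.
    sigma = np.sqrt(np.maximum(var, 1e-12))
    z = (best_loss - mu) / sigma
    return (best_loss - mu) * norm.cdf(z) + sigma * norm.pdf(z)

def propose(surrogate, candidates, best_loss):
    mu, var = surrogate.predict(np.asarray(candidates))
    ei = expected_improvement(np.ravel(mu), np.ravel(var), best_loss)
    return candidates[int(np.argmax(ei))]
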
Example #11
    def __init__(self,
                 node_list,
                 node_index,
                 task_type,
                 timestamp,
                 fe_config_space: ConfigurationSpace,
                 cash_config_space: ConfigurationSpace,
                 data: DataNode,
                 fixed_config=None,
                 trial_num=0,
                 time_limit=None,
                 metric='acc',
                 optimizer='smac',
                 ensemble_method='ensemble_selection',
                 ensemble_size=50,
                 per_run_time_limit=300,
                 output_dir="logs",
                 dataset_name='default_dataset',
                 eval_type='holdout',
                 resampling_params=None,
                 n_jobs=1,
                 seed=1):
        # Tree setting
        self.node_list = node_list
        self.node_index = node_index

        # Set up backend.
        self.dataset_name = dataset_name
        self.trial_num = trial_num
        self.time_limit = time_limit
        self.per_run_time_limit = per_run_time_limit
        self.start_time = time.time()
        self.logger = get_logger('Soln-ml: %s' % dataset_name)

        # Basic settings.
        self.eval_type = eval_type
        self.resampling_params = resampling_params
        self.task_type = task_type
        self.timestamp = timestamp
        self.fe_config_space = fe_config_space
        self.cash_config_space = cash_config_space
        self.fixed_config = fixed_config
        self.original_data = data.copy_()
        self.metric = get_metric(metric)
        self.optimizer = optimizer
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.n_jobs = n_jobs
        self.seed = seed
        self.output_dir = output_dir

        self.early_stop_flag = False
        self.timeout_flag = False
        self.incumbent_perf = -float("INF")
        self.incumbent = None
        self.eval_dict = dict()

        if self.task_type in CLS_TASKS:
            self.if_imbal = is_imbalanced_dataset(self.original_data)
        else:
            self.if_imbal = False

        self.es = None
Example #12
    def __init__(self,
                 stats,
                 ensemble_method: str,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 data_node,
                 output_dir=None):
        self.stats = stats
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.task_type = task_type
        self.metric = metric
        self.output_dir = output_dir
        self.node = data_node

        self.predictions = []
        self.train_labels = None
        self.timestamp = str(time.time())
        logger_name = 'EnsembleBuilder'
        self.logger = get_logger(logger_name)

        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (_, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model, _ = pkl.load(f)
                _node = self.node.copy_()
                _node = construct_node(_node, op_list)

                # TODO: Test size
                test_size = 0.33
                X, y = _node.data

                if self.task_type in CLS_TASKS:
                    ss = StratifiedShuffleSplit(n_splits=1,
                                                test_size=test_size,
                                                random_state=1)
                else:
                    ss = ShuffleSplit(n_splits=1,
                                      test_size=test_size,
                                      random_state=1)

                # n_splits=1 with a fixed random_state, so this loop binds the
                # single held-out split, identical for every model.
                for train_index, val_index in ss.split(X, y):
                    X_valid = X[val_index]
                    y_valid = y[val_index]

                # Every model must be scored against the same validation labels.
                if self.train_labels is not None:
                    assert (self.train_labels == y_valid).all()
                else:
                    self.train_labels = y_valid

                if self.task_type in CLS_TASKS:
                    y_valid_pred = model.predict_proba(X_valid)
                else:
                    y_valid_pred = model.predict(X_valid)
                self.predictions.append(y_valid_pred)

        if len(self.predictions) < self.ensemble_size:
            self.ensemble_size = len(self.predictions)

        if ensemble_method == 'ensemble_selection':
            return

        if task_type in CLS_TASKS:
            self.base_model_mask = choose_base_models_classification(
                np.array(self.predictions), self.ensemble_size)
        else:
            self.base_model_mask = choose_base_models_regression(
                np.array(self.predictions), np.array(y_valid),
                self.ensemble_size)
        self.ensemble_size = sum(self.base_model_mask)
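
When ensemble_method is 'ensemble_selection', the constructor returns early and the stacked validation predictions collected above are consumed elsewhere, presumably by a greedy loop in the style of Caruana et al. (2004): repeatedly add, with replacement, whichever base model most improves the ensemble's validation score. A minimal illustrative sketch, not mindware's EnsembleSelection class; score is any metric where higher is better:

import numpy as np

def greedy_ensemble_selection(predictions, y_valid, score, size):
    ensemble, chosen = [], []
    for _ in range(size):
        best_i, best_s = None, -np.inf
        for i, pred in enumerate(predictions):
            s = score(y_valid, np.mean(ensemble + [pred], axis=0))
            if s > best_s:
                best_i, best_s = i, s
        ensemble.append(predictions[best_i])
        chosen.append(best_i)       # selection with replacement
    return chosen
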
Example #13
 def __init__(self):
     self.logger = get_logger(__name__)
Example #14
 def _get_logger(self, name):
     logger_name = 'MindWare-%s(%d)' % (name, self.seed)
     setup_logger(os.path.join(self.output_dir, '%s.log' % str(logger_name)),
                  self.logging_config)
     return get_logger(logger_name)
Example #15
    def __init__(self,
                 configspace,
                 min_points_in_model=None,
                 top_n_percent=15,
                 num_samples=64,
                 random_fraction=1 / 3,
                 bandwidth_factor=3,
                 min_bandwidth=1e-3,
                 **kwargs):
        """
            Fits, for each given budget, a kernel density estimator on the best
            N percent of the configurations evaluated at that budget.


            Parameters:
            -----------
            configspace: ConfigSpace
                Configuration space object
            top_n_percent: int
                Percentile of configurations used as training data for the kernel
                density estimator, e.g. if set to 10, the best 10% of configurations
                are used for training.
            min_points_in_model: int
                minimum number of datapoints needed to fit a model
            num_samples: int
                number of samples drawn to optimize EI via sampling
            random_fraction: float
                fraction of random configurations returned
            bandwidth_factor: float
                widens the bandwidth for continuous parameters for proposed points to optimize EI
            min_bandwidth: float
                to keep diversity, even when all (good) samples have the same value for one of the parameters,
                a minimum bandwidth (Default: 1e-3) is used instead of zero.

        """
        super().__init__(**kwargs)
        self.top_n_percent = top_n_percent
        self.configspace = configspace
        self.bw_factor = bandwidth_factor
        self.min_bandwidth = min_bandwidth

        # At least dim + 1 observations are needed before a KDE model can be fit.
        n_min = len(self.configspace.get_hyperparameters()) + 1
        if min_points_in_model is None or min_points_in_model < n_min:
            self.min_points_in_model = n_min
        else:
            self.min_points_in_model = min_points_in_model

        self.num_samples = num_samples
        self.random_fraction = random_fraction

        hps = self.configspace.get_hyperparameters()

        self.kde_vartypes = ""
        self.vartypes = []

        for h in hps:
            if hasattr(h, 'sequence'):
                raise RuntimeError(
                    'This version of BOHB does not support ordinal hyperparameters. Please encode %s as an integer parameter!'
                    % (h.name))

            if hasattr(h, 'choices'):
                self.kde_vartypes += 'u'
                self.vartypes += [len(h.choices)]
            else:
                self.kde_vartypes += 'c'
                self.vartypes += [0]

        self.vartypes = np.array(self.vartypes, dtype=int)

        # store precomputed probs for the categorical parameters
        self.cat_probs = []

        self.configs = dict()
        self.losses = dict()
        self.good_config_rankings = dict()
        self.kde_models = dict()
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
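
The 'u'/'c' string built above marks each hyperparameter as unordered-categorical or continuous (the var_type convention of statsmodels' multivariate KDE). A hypothetical usage sketch showing what the encoding comes out to for a small mixed space (requires the ConfigSpace package; the hyperparameter names are illustrative):

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import (CategoricalHyperparameter,
                                         UniformFloatHyperparameter)

cs = ConfigurationSpace()
cs.add_hyperparameters([
    UniformFloatHyperparameter('lr', 1e-4, 1e-1, log=True),
    CategoricalHyperparameter('optimizer', ['sgd', 'adam', 'rmsprop']),
])

kde_vartypes, vartypes = '', []
for h in cs.get_hyperparameters():
    if hasattr(h, 'choices'):
        kde_vartypes += 'u'             # unordered categorical
        vartypes.append(len(h.choices))
    else:
        kde_vartypes += 'c'             # continuous
        vartypes.append(0)

print(kde_vartypes, vartypes)           # 'cu' [0, 3]
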
Example #16
def calculate_all_metafeatures(X, y, categorical, dataset_name, task_type,
                               calculate=None, dont_calculate=None, densify_threshold=1000):
    """Calculate all metafeatures."""
    logger = get_logger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    func_cls = ['NumberOfClasses', 'LogNumberOfFeatures',
                'ClassProbabilityMin', 'ClassProbabilityMax',
                'ClassProbabilityMean', "ClassProbabilitySTD",
                'ClassEntropy', 'LandmarkLDA',
                'LandmarkNaiveBayes', 'LandmarkDecisionTree',
                'LandmarkDecisionNodeLearner', 'LandmarkRandomNodeLearner',
                'LandmarkWorstNodeLearner', 'Landmark1NN']

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue
        if name in func_cls and task_type not in CLS_TASKS:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)

                imputer = SimpleImputer(strategy='most_frequent', copy=False)
                X_transformed = imputer.fit_transform(X.copy())
                if any(categorical):
                    categorical_idx = [idx for idx, i in enumerate(categorical) if i]
                    ohe = ColumnTransformer([('one-hot', OneHotEncoder(), categorical_idx)], remainder="passthrough")
                    X_transformed = ohe.fit_transform(X_transformed)

                center = not scipy.sparse.isspmatrix(X_transformed)
                standard_scaler = StandardScaler(copy=False, with_mean=center)
                X_transformed = standard_scaler.fit_transform(X_transformed)
                categorical_transformed = [False] * X_transformed.shape[1]

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.debug("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.debug("%s: Going to calculate: %s", dataset_name,
                    name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_, task_type=task_type)
    return mf_
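
A hypothetical invocation sketch on a small sklearn dataset; CLASSIFICATION stands in for one of mindware's task-type constants, whose exact name and import path are an assumption here:

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
categorical = [False] * X.shape[1]      # iris: all numerical columns
mf = calculate_all_metafeatures(X, y, categorical,
                                dataset_name='iris',
                                task_type=CLASSIFICATION)
print(mf)                               # DatasetMetafeatures for 'iris'
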
Example #17
    def __init__(self,
                 n_algorithm=3,
                 task_type=None,
                 metric='bal_acc',
                 rep=3,
                 total_resource=1200,
                 meta_algorithm='lightgbm',
                 exclude_datasets=None,
                 meta_dir=None):
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
        self.n_algorithm = n_algorithm
        self.n_algo_candidates = len(_cls_builtin_algorithms)
        self.task_type = task_type
        self.meta_algo = meta_algorithm
        self.rep = rep
        self.metric = metric
        if task_type in CLS_TASKS:
            self.algorithms = _cls_builtin_algorithms
            self.n_algo_candidates = len(_cls_builtin_algorithms)
            if metric not in ['acc', 'bal_acc']:
                self.logger.info(
                    'Meta information for metric %s does not exist; using accuracy instead.'
                    % str(metric))
                metric = 'acc'
        elif task_type in RGS_TASKS:
            self.algorithms = _rgs_builtin_algorithms
            self.n_algo_candidates = len(_rgs_builtin_algorithms)
            if metric not in ['mse']:
                self.logger.info(
                    'Meta information for metric %s does not exist; using mse instead.'
                    % str(metric))
                metric = 'mse'
        else:
            raise ValueError('Invalid task type: %s.' % str(task_type))

        self.total_resource = total_resource
        self.exclude_datasets = exclude_datasets

        builtin_loc = os.path.dirname(__file__)
        builtin_loc = os.path.join(builtin_loc, '..')
        builtin_loc = os.path.join(builtin_loc, 'meta_resource')
        self.meta_dir = meta_dir if meta_dir is not None else builtin_loc

        if self.exclude_datasets is None:
            self.hash_id = 'none'
        else:
            self.exclude_datasets = list(set(exclude_datasets))
            exclude_str = ','.join(sorted(self.exclude_datasets))
            md5 = hashlib.md5()
            md5.update(exclude_str.encode('utf-8'))
            self.hash_id = md5.hexdigest()
        meta_datasets = set()
        _folder = os.path.join(self.meta_dir, 'meta_dataset_vec')

        if task_type in CLS_TASKS:
            task_prefix = 'cls'
        else:
            task_prefix = 'rgs'

        embedding_path = os.path.join(
            _folder, '%s_meta_dataset_embedding.pkl' % task_prefix)
        with open(embedding_path, 'rb') as f:
            d = pkl.load(f)
            meta_datasets = d['task_ids']

        self._builtin_datasets = sorted(list(meta_datasets))

        self.metadata_manager = MetaDataManager(self.meta_dir,
                                                self.algorithms,
                                                self._builtin_datasets,
                                                metric,
                                                total_resource,
                                                task_type=task_type,
                                                rep=rep)
        self.meta_learner = None