Example #1
    def predict(self, data):
        model_pred_list = []
        final_pred = []
        # Get predictions from each model
        model_cnt = 0
        for algo_id in self.stats:
            model_to_eval = self.stats[algo_id]
            for idx, (_, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model = pkl.load(f)
                _node = data.copy_()

                _node = construct_node(_node, op_list)

                if self.base_model_mask[model_cnt] == 1:
                    if self.task_type in CLS_TASKS:
                        model_pred_list.append(
                            model.predict_proba(_node.data[0]))
                    else:
                        model_pred_list.append(model.predict(_node.data[0]))
                model_cnt += 1

        # Calculate the average of predictions
        for i in range(len(data.data[0])):
            sample_pred_list = [
                model_pred[i] for model_pred in model_pred_list
            ]
            pred_average = reduce(lambda x, y: x + y,
                                  sample_pred_list) / len(sample_pred_list)
            final_pred.append(pred_average)

        return np.array(final_pred)
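The reduce-then-divide loop above (note that `reduce` comes from `functools` in Python 3) computes a plain arithmetic mean of the per-model predictions. A minimal vectorized sketch of the same computation, with a hypothetical `preds` list standing in for `model_pred_list`:

import numpy as np

# Hypothetical probability outputs of three base models on five samples.
preds = [np.full((5, 3), p) for p in (0.2, 0.3, 0.4)]

# Vectorized equivalent of the reduce-then-divide loop above.
final_pred = np.mean(np.stack(preds, axis=0), axis=0)  # shape (5, 3)
assert np.allclose(final_pred, 0.3)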
Example #2
    def fit(self, data):
        # Split training data for phase 1 and phase 2
        test_size = 0.2

        # Train basic models using a part of training data
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (config, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model = pkl.load(f)
                _node = data.copy_()

                _node = construct_node(_node, op_list, mode='train')

                X, y = _node.data
                if self.task_type in CLS_TASKS:
                    x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=test_size,
                                                              stratify=data.data[1], random_state=1)
                else:
                    x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=test_size,
                                                              random_state=1)

                if self.base_model_mask[model_cnt] == 1:
                    estimator = fetch_predict_estimator(self.task_type, algo_id, config[0], x_p1, y_p1,
                                                        weight_balance=_node.enable_balance,
                                                        data_balance=_node.data_balance)
                    with open(os.path.join(self.output_dir, '%s-blending-model%d' % (self.timestamp, model_cnt)),
                              'wb') as f:
                        pkl.dump(estimator, f)
                    if self.task_type in CLS_TASKS:
                        pred = estimator.predict_proba(x_p2)
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification
                            n_dim = 1
                        # Initialize training matrix for phase 2
                        if feature_p2 is None:
                            num_samples = len(x_p2)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        if n_dim == 1:
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
                        else:
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    else:
                        pred = estimator.predict(x_p2).reshape(-1, 1)
                        n_dim = 1
                        # Initialize training matrix for phase 2
                        if feature_p2 is None:
                            num_samples = len(x_p2)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
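        # Every split above uses random_state=1, so y_p2 from the last
        # iteration matches the phase-2 rows produced for every model.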
        self.meta_learner.fit(feature_p2, y_p2)

        return self
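This `fit` implements classic two-phase blending: base models are trained on the first part of the data, their held-out predictions become the phase-2 feature matrix, and a meta-learner is fit on it. A self-contained sketch of the same scheme with stock scikit-learn estimators (the names `base_models` and `meta` are illustrative, not part of this codebase):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, random_state=1)
# Phase-1 / phase-2 split, mirroring test_size = 0.2 above.
x_p1, x_p2, y_p1, y_p2 = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)

base_models = [DecisionTreeClassifier(random_state=1), GaussianNB()]
# One probability column per base model (binary task, n_dim == 1).
feature_p2 = np.zeros((len(x_p2), len(base_models)))
for i, est in enumerate(base_models):
    est.fit(x_p1, y_p1)
    feature_p2[:, i] = est.predict_proba(x_p2)[:, 1]

meta = LogisticRegression().fit(feature_p2, y_p2)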
Example #3
    def get_feature(self, data):
        # Predict the labels via stacking
        feature_p2 = None
        model_cnt = 0
        suc_cnt = 0
        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (config, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model = pkl.load(f)
                _node = data.copy_()

                _node = construct_node(_node, op_list)

                if self.base_model_mask[model_cnt] == 1:
                    for j in range(self.kfold):
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-model%d_part%d' %
                                    (self.timestamp, model_cnt, j)),
                                'rb') as f:
                            estimator = pkl.load(f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(_node.data[0])
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                n_dim = 1
                            if feature_p2 is None:
                                num_samples = len(_node.data[0])
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            # Accumulate the mean prediction across folds
                            if n_dim == 1:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += pred[:, 1:2] / self.kfold
                            else:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += pred / self.kfold
                        else:
                            pred = estimator.predict(_node.data[0]).reshape(
                                -1, 1)
                            n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(_node.data[0])
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            # Accumulate the mean prediction across folds
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += pred / self.kfold
                    suc_cnt += 1
                model_cnt += 1
        return feature_p2
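Each fitted per-fold estimator contributes `pred / self.kfold` to its column, so the column ends up holding the mean prediction across folds. A small sketch of that accumulation for the binary case (`fold_estimators` is a hypothetical list of fitted classifiers):

import numpy as np

def stacking_feature(fold_estimators, X_test):
    # Mean positive-class probability over the per-fold estimators,
    # i.e. the n_dim == 1 column that get_feature() builds.
    col = np.zeros((len(X_test), 1))
    for est in fold_estimators:
        col += est.predict_proba(X_test)[:, 1:2] / len(fold_estimators)
    return col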
Example #4
    def predict(self, data):
        predictions = []
        cur_idx = 0
        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (_, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, estimator = pkl.load(f)
                _node = data.copy_()

                _node = construct_node(_node, op_list)

                X_test = _node.data[0]
                if cur_idx in self.model_idx:
                    if self.task_type in CLS_TASKS:
                        predictions.append(estimator.predict_proba(X_test))
                    else:
                        predictions.append(estimator.predict(X_test))
                else:
                    if len(self.shape) == 1:
                        predictions.append(np.zeros(len(_node.data[0])))
                    else:
                        predictions.append(
                            np.zeros((len(_node.data[0]), self.shape[1])))
                cur_idx += 1
        predictions = np.asarray(predictions)

        # If predictions.shape[0] == len(self.weights_),
        # the predictions include those of zero-weight models.
        if predictions.shape[0] == len(self.weights_):
            return np.average(predictions, axis=0, weights=self.weights_)

        # If predictions.shape[0] == len(non_null_weights),
        # the predictions do not include those of zero-weight models.
        elif predictions.shape[0] == np.count_nonzero(self.weights_):
            non_null_weights = [w for w in self.weights_ if w > 0]
            return np.average(predictions, axis=0, weights=non_null_weights)

        # If none of the above applies, then something must have gone wrong.
        else:
            raise ValueError("The dimensions of ensemble predictions"
                             " and ensemble weights do not match!")
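The two branches at the end only reconcile the number of stacked predictions with the number of weights; the arithmetic itself is an ordinary weighted mean. A short sketch with made-up predictions showing that skipping zero-weight models and keeping them (with zero weight) agree:

import numpy as np

preds = np.asarray([np.full((4, 2), v) for v in (0.2, 0.5, 0.8)])
weights = np.array([0.5, 0.0, 0.5])

# All models predicted: use the full weight vector.
avg_all = np.average(preds, axis=0, weights=weights)

# Zero-weight models skipped upstream: keep only the non-null weights.
avg_nonnull = np.average(preds[[0, 2]], axis=0, weights=weights[weights > 0])

assert np.allclose(avg_all, avg_nonnull)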
Example #5
    def _predict(self, test_data: DataNode):
        if self.ensemble_method is not None:
            if self.es is None:
                raise AttributeError("AutoML is not fitted!")
            return self.es.predict(test_data)
        else:
            if self.inner_opt_algorithm == 'combined':
                best_op_list, estimator = load_combined_transformer_estimator(
                    self.output_dir, self.best_config, self.timestamp)
            else:
                best_op_list, estimator = load_transformer_estimator(
                    self.output_dir, self.optimal_algo_id,
                    self.best_hpo_config, self.best_fe_config, self.timestamp)

            test_data_node = test_data.copy_()
            test_data_node = construct_node(test_data_node, best_op_list)

            if self.task_type in CLS_TASKS:
                return estimator.predict_proba(test_data_node.data[0])
            else:
                return estimator.predict(test_data_node.data[0])
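Both branches of `_predict` follow the same contract: replay the recorded feature-engineering operators on the test data, then call the fitted estimator. A minimal sketch of that contract, with a hypothetical `apply_ops` standing in for `construct_node`:

def transform_then_predict(estimator, op_list, X, is_classification, apply_ops):
    # Replay the recorded feature-engineering operators on the raw test
    # data, then predict with the estimator fitted on transformed data.
    X_trans = apply_ops(X, op_list)
    if is_classification:
        return estimator.predict_proba(X_trans)
    return estimator.predict(X_trans)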
Example #6
    def __call__(self, config, **kwargs):
        start_time = time.time()
        return_dict = dict()
        self.seed = 1
        downsample_ratio = kwargs.get('resource_ratio', 1.0)

        if 'holdout' in self.resampling_strategy:
            try:
                # Prepare data node.
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")

                    if self.resampling_params is None or 'test_size' not in self.resampling_params:
                        test_size = 0.33
                    else:
                        test_size = self.resampling_params['test_size']

                    from sklearn.model_selection import ShuffleSplit
                    ss = ShuffleSplit(n_splits=1,
                                      test_size=test_size,
                                      random_state=self.seed)
                    for train_index, test_index in ss.split(
                            self.data_node.data[0], self.data_node.data[1]):
                        _x_train, _x_val = self.data_node.data[0][
                            train_index], self.data_node.data[0][test_index]
                        _y_train, _y_val = self.data_node.data[1][
                            train_index], self.data_node.data[1][test_index]
                    self.train_node.data = [_x_train, _y_train]
                    self.val_node.data = [_x_val, _y_val]

                    data_node, op_list = parse_config(self.train_node,
                                                      config,
                                                      record=True)
                    _val_node = self.val_node.copy_()
                    _val_node = construct_node(_val_node, op_list)

                _x_train, _y_train = data_node.data
                _x_val, _y_val = _val_node.data

                config_dict = config.get_dictionary().copy()
                # Build the regressor from the configuration.
                regressor_id, clf = get_estimator(config_dict,
                                                  self.estimator_id)

                score = validation(clf,
                                   self.scorer,
                                   _x_train,
                                   _y_train,
                                   _x_val,
                                   _y_val,
                                   random_state=self.seed)

                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info(
                        'rw_lock not defined! Read-write conflicts may occur!'
                    )
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score):
                    save_flag, model_path, delete_flag, model_path_deleted = self.topk_model_saver.add(
                        config, score, regressor_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)

                    self.topk_model_saver.save_topk_config()

                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" %
                                             model_path_deleted)
                    except:
                        pass

                lock.release()

            except Exception as e:
                import traceback
                traceback.print_exc()
                self.logger.info('Evaluator: %s' % (str(e)))
                score = -np.inf

        elif 'cv' in self.resampling_strategy:
            # Prepare data node.
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")

                    if 'cv' in self.resampling_strategy:
                        if self.resampling_params is None or 'folds' not in self.resampling_params:
                            folds = 5
                        else:
                            folds = self.resampling_params['folds']

                    from sklearn.model_selection import KFold
                    # random_state has no effect when shuffle=False (and
                    # newer scikit-learn raises), so it is omitted here.
                    kfold = KFold(n_splits=folds, shuffle=False)
                    scores = list()

                    for train_index, test_index in kfold.split(
                            self.data_node.data[0], self.data_node.data[1]):
                        _x_train, _x_val = self.data_node.data[0][
                            train_index], self.data_node.data[0][test_index]
                        _y_train, _y_val = self.data_node.data[1][
                            train_index], self.data_node.data[1][test_index]
                        self.train_node.data = [_x_train, _y_train]
                        self.val_node.data = [_x_val, _y_val]

                        data_node, op_list = parse_config(self.train_node,
                                                          config,
                                                          record=True)
                        _val_node = self.val_node.copy_()
                        _val_node = construct_node(_val_node, op_list)

                        _x_train, _y_train = data_node.data
                        _x_val, _y_val = _val_node.data

                        config_dict = config.get_dictionary().copy()
                        # Build the regressor from the configuration.
                        regressor_id, clf = get_estimator(
                            config_dict, self.estimator_id)

                        _score = validation(clf,
                                            self.scorer,
                                            _x_train,
                                            _y_train,
                                            _x_val,
                                            _y_val,
                                            random_state=self.seed)
                        scores.append(_score)
                    score = np.mean(scores)

                # TODO: Don't save models for cv
                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info(
                        'rw_lock not defined! Read-write conflicts may occur!'
                    )
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score):
                    _ = self.topk_model_saver.add(config, score, regressor_id)
                    self.topk_model_saver.save_topk_config()
                lock.release()

            except Exception as e:
                import traceback
                traceback.print_exc()
                self.logger.info('Evaluator: %s' % (str(e)))
                score = -np.inf

        elif 'partial' in self.resampling_strategy:
            try:
                # Prepare data node.
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")

                    if self.resampling_params is None or 'test_size' not in self.resampling_params:
                        test_size = 0.33
                    else:
                        test_size = self.resampling_params['test_size']

                    from sklearn.model_selection import ShuffleSplit
                    ss = ShuffleSplit(n_splits=1,
                                      test_size=test_size,
                                      random_state=self.seed)
                    for train_index, test_index in ss.split(
                            self.data_node.data[0], self.data_node.data[1]):
                        _x_train, _x_val = self.data_node.data[0][
                            train_index], self.data_node.data[0][test_index]
                        _y_train, _y_val = self.data_node.data[1][
                            train_index], self.data_node.data[1][test_index]
                    self.train_node.data = [_x_train, _y_train]
                    self.val_node.data = [_x_val, _y_val]

                    data_node, op_list = parse_config(self.train_node,
                                                      config,
                                                      record=True)
                    _val_node = self.val_node.copy_()
                    _val_node = construct_node(_val_node, op_list)

                _x_train, _y_train = data_node.data

                if downsample_ratio != 1:
                    down_ss = ShuffleSplit(n_splits=1,
                                           test_size=downsample_ratio,
                                           random_state=self.seed)
                    for _, _val_index in down_ss.split(_x_train, _y_train):
                        _act_x_train, _act_y_train = _x_train[
                            _val_index], _y_train[_val_index]
                else:
                    _act_x_train, _act_y_train = _x_train, _y_train
                    _val_index = list(range(len(_x_train)))

                _x_val, _y_val = _val_node.data

                config_dict = config.get_dictionary().copy()
                # Build the regressor from the configuration.
                regressor_id, clf = get_estimator(config_dict,
                                                  self.estimator_id)

                score = validation(clf,
                                   self.scorer,
                                   _act_x_train,
                                   _act_y_train,
                                   _x_val,
                                   _y_val,
                                   random_state=self.seed)

                # TODO: Only save models with maximum resources
                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info(
                        'rw_lock not defined! Read-write conflicts may occur!'
                    )
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score) and downsample_ratio == 1:
                    save_flag, model_path, delete_flag, model_path_deleted = self.topk_model_saver.add(
                        config, score, regressor_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)

                    self.topk_model_saver.save_topk_config()

                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" %
                                             model_path_deleted)
                    except:
                        pass

                lock.release()
            except Exception as e:
                import traceback
                traceback.print_exc()
                self.logger.info('Evaluator: %s' % (str(e)))
                score = -np.inf

        else:
            raise ValueError('Invalid resampling strategy: %s!' %
                             self.resampling_strategy)

        try:
            self.logger.info(
                'Evaluation<%s> | Score: %.4f | Time cost: %.2f seconds | Shape: %s'
                % (regressor_id, self.scorer._sign * score,
                   time.time() - start_time, _x_train.shape))
        except:
            pass

        # Turn it into a minimization problem.
        return_dict['objective_value'] = -score
        return -score
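Stripped of locking and model persistence, the holdout branch reduces to: split once, fit on the training part, score on the held-out part, and negate the score for the minimizer. A compact sketch under those assumptions, with an explicit fit/score in place of the `validation` helper:

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import ShuffleSplit

X, y = make_regression(n_samples=200, random_state=1)
ss = ShuffleSplit(n_splits=1, test_size=0.33, random_state=1)
train_index, test_index = next(ss.split(X, y))

clf = Ridge().fit(X[train_index], y[train_index])
score = r2_score(y[test_index], clf.predict(X[test_index]))

# The surrounding optimizer minimizes, hence the negated objective.
objective_value = -score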
Example #7
    def __call__(self, config, **kwargs):
        start_time = time.time()
        return_dict = dict()

        if self.name is None:
            raise ValueError('This evaluator has no name/type!')
        assert self.name in ['hpo', 'fe']

        if self.name == 'hpo':
            hpo_config = config if config is not None else self.hpo_config
            fe_config = kwargs.get('ano_config', self.fe_config)
        else:
            fe_config = config if config is not None else self.fe_config
            hpo_config = kwargs.get('ano_config', self.hpo_config)

        # Prepare configuration.
        self.seed = 1
        downsample_ratio = kwargs.get('resource_ratio', 1.0)

        if 'holdout' in self.resampling_strategy:
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")
                    if self.resampling_params is None or 'test_size' not in self.resampling_params:
                        test_size = 0.33
                    else:
                        test_size = self.resampling_params['test_size']

                    from sklearn.model_selection import StratifiedShuffleSplit
                    ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=self.seed)
                    for train_index, test_index in ss.split(self.data_node.data[0], self.data_node.data[1]):
                        _X_train, _X_val = self.data_node.data[0][train_index], self.data_node.data[0][test_index]
                        _y_train, _y_val = self.data_node.data[1][train_index], self.data_node.data[1][test_index]
                    self.train_node.data = [_X_train, _y_train]
                    self.val_node.data = [_X_val, _y_val]

                    data_node, op_list = parse_config(self.train_node, fe_config, record=True, if_imbal=self.if_imbal)
                    _val_node = self.val_node.copy_()
                    _val_node = construct_node(_val_node, op_list)

                _X_train, _y_train = data_node.data
                _X_val, _y_val = _val_node.data

                config_dict = hpo_config.get_dictionary().copy()
                # Prepare training and initial params for classifier.
                init_params, fit_params = {}, {}
                if data_node.enable_balance == 1:
                    init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
                    for key, val in init_params.items():
                        config_dict[key] = val

                if data_node.data_balance == 1:
                    fit_params['data_balance'] = True

                classifier_id, clf = get_estimator(config_dict, self.estimator_id)

                if self.onehot_encoder is None:
                    self.onehot_encoder = OneHotEncoder(categories='auto')
                    y = np.reshape(_y_train, (len(_y_train), 1))
                    self.onehot_encoder.fit(y)

                score = validation(clf, self.scorer, _X_train, _y_train, _X_val, _y_val,
                                   random_state=self.seed,
                                   onehot=self.onehot_encoder if isinstance(self.scorer,
                                                                            _ThresholdScorer) else None,
                                   fit_params=fit_params)

                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info('rw_lock not defined! Read-write conflicts may occur!')
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score):
                    save_flag, model_path, delete_flag, model_path_deleted = self.topk_model_saver.add(hpo_config,
                                                                                                       fe_config,
                                                                                                       score,
                                                                                                       classifier_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)

                    self.topk_model_saver.save_topk_config()

                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" % model_path_deleted)
                    except:
                        pass

                lock.release()
            except Exception as e:
                import traceback
                self.logger.info('%s-evaluator: %s' % (self.name, str(e)))
                score = -np.inf
                traceback.print_exc(file=sys.stdout)

        elif 'cv' in self.resampling_strategy:
            # Prepare data node.
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")

                    if 'cv' in self.resampling_strategy:
                        if self.resampling_params is None or 'folds' not in self.resampling_params:
                            folds = 5
                        else:
                            folds = self.resampling_params['folds']

                    from sklearn.model_selection import StratifiedKFold
                    # random_state has no effect when shuffle=False (and
                    # newer scikit-learn raises), so it is omitted here.
                    skfold = StratifiedKFold(n_splits=folds, shuffle=False)
                    scores = list()

                    for train_index, test_index in skfold.split(self.data_node.data[0], self.data_node.data[1]):
                        _X_train, _X_val = self.data_node.data[0][train_index], self.data_node.data[0][test_index]
                        _y_train, _y_val = self.data_node.data[1][train_index], self.data_node.data[1][test_index]
                        self.train_node.data = [_X_train, _y_train]
                        self.val_node.data = [_X_val, _y_val]

                        data_node, op_list = parse_config(self.train_node, fe_config, record=True,
                                                          if_imbal=self.if_imbal)
                        _val_node = self.val_node.copy_()
                        _val_node = construct_node(_val_node, op_list)

                        _X_train, _y_train = data_node.data
                        _X_val, _y_val = _val_node.data

                        config_dict = hpo_config.get_dictionary().copy()
                        # Prepare training and initial params for classifier.
                        init_params, fit_params = {}, {}
                        if data_node.enable_balance == 1:
                            init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
                            for key, val in init_params.items():
                                config_dict[key] = val

                        if data_node.data_balance == 1:
                            fit_params['data_balance'] = True

                        classifier_id, clf = get_estimator(config_dict, self.estimator_id)

                        if self.onehot_encoder is None:
                            self.onehot_encoder = OneHotEncoder(categories='auto')
                            y = np.reshape(_y_train, (len(_y_train), 1))
                            self.onehot_encoder.fit(y)

                        _score = validation(clf, self.scorer, _X_train, _y_train, _X_val, _y_val,
                                            random_state=self.seed,
                                            onehot=self.onehot_encoder if isinstance(self.scorer,
                                                                                     _ThresholdScorer) else None,
                                            fit_params=fit_params)
                        scores.append(_score)
                    score = np.mean(scores)

                # TODO: Don't save models for cv
                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info('rw_lock not defined! Read-write conflicts may occur!')
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score):
                    _ = self.topk_model_saver.add(hpo_config, fe_config, score, classifier_id)
                    self.topk_model_saver.save_topk_config()
                lock.release()

            except Exception as e:
                import traceback
                traceback.print_exc()
                self.logger.info('Evaluator: %s' % (str(e)))
                score = -np.inf

        elif 'partial' in self.resampling_strategy:
            try:
                # Prepare data node.
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")

                    if self.resampling_params is None or 'test_size' not in self.resampling_params:
                        test_size = 0.33
                    else:
                        test_size = self.resampling_params['test_size']

                    from sklearn.model_selection import StratifiedShuffleSplit
                    ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=self.seed)
                    for train_index, test_index in ss.split(self.data_node.data[0], self.data_node.data[1]):
                        _X_train, _X_val = self.data_node.data[0][train_index], self.data_node.data[0][test_index]
                        _y_train, _y_val = self.data_node.data[1][train_index], self.data_node.data[1][test_index]
                    self.train_node.data = [_X_train, _y_train]
                    self.val_node.data = [_X_val, _y_val]

                    data_node, op_list = parse_config(self.train_node, fe_config, record=True, if_imbal=self.if_imbal)
                    _val_node = self.val_node.copy_()
                    _val_node = construct_node(_val_node, op_list)

                _X_train, _y_train = data_node.data

                if downsample_ratio != 1:
                    down_ss = StratifiedShuffleSplit(n_splits=1, test_size=downsample_ratio,
                                                     random_state=self.seed)
                    for _, _val_index in down_ss.split(_X_train, _y_train):
                        _act_x_train, _act_y_train = _X_train[_val_index], _y_train[_val_index]
                else:
                    _act_x_train, _act_y_train = _X_train, _y_train
                    _val_index = list(range(len(_X_train)))

                _X_val, _y_val = _val_node.data

                config_dict = hpo_config.get_dictionary().copy()
                # Prepare training and initial params for classifier.
                init_params, fit_params = {}, {}
                if data_node.enable_balance == 1:
                    init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
                    for key, val in init_params.items():
                        config_dict[key] = val
                if 'sample_weight' in fit_params:
                    fit_params['sample_weight'] = fit_params['sample_weight'][_val_index]
                if data_node.data_balance == 1:
                    fit_params['data_balance'] = True

                classifier_id, clf = get_estimator(config_dict, self.estimator_id)

                if self.onehot_encoder is None:
                    self.onehot_encoder = OneHotEncoder(categories='auto')
                    y = np.reshape(_y_train, (len(_y_train), 1))
                    self.onehot_encoder.fit(y)

                score = validation(clf, self.scorer, _act_x_train, _act_y_train, _X_val, _y_val,
                                   random_state=self.seed,
                                   onehot=self.onehot_encoder if isinstance(self.scorer,
                                                                            _ThresholdScorer) else None,
                                   fit_params=fit_params)

                # TODO: Only save models with maximum resources
                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info('rw_lock not defined! Read-write conflicts may occur!')
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score) and downsample_ratio == 1:
                    save_flag, model_path, delete_flag, model_path_deleted = self.topk_model_saver.add(hpo_config,
                                                                                                       fe_config, score,
                                                                                                       classifier_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)

                    self.topk_model_saver.save_topk_config()

                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" % model_path_deleted)
                    except:
                        pass

                lock.release()
            except Exception as e:
                import traceback
                traceback.print_exc()
                self.logger.info('Evaluator: %s' % (str(e)))
                score = -np.inf

        else:
            raise ValueError('Invalid resampling strategy: %s!' % self.resampling_strategy)
        try:
            self.logger.info('Evaluation<%s> | Score: %.4f | Time cost: %.2f seconds | Shape: %s' %
                             (classifier_id,
                              self.scorer._sign * score,
                              time.time() - start_time, _X_train.shape))
        except:
            pass

        # Turn it into a minimization problem.
        return_dict['objective_value'] = -score
        return -score
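Both evaluators guard the top-k model store with an optional `rw_lock` from `kwargs`, falling back to a private lock (and logging a warning) when none is supplied. A minimal sketch of that pattern with a hypothetical `save_model` callback; unlike the originals, it also tolerates an explicit `rw_lock=None` and releases the lock in a `finally` block so an exception cannot leave it held:

import math
from threading import Lock

def guarded_save(save_model, score, **kwargs):
    # Fall back to a private lock if the caller passed no usable rw_lock;
    # as the log message above notes, a private lock cannot prevent
    # conflicts with other workers.
    lock = kwargs.get('rw_lock') or Lock()
    lock.acquire()
    try:
        if math.isfinite(score):
            save_model(score)
    finally:
        lock.release()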
Example #8
    def __init__(self,
                 stats,
                 ensemble_method: str,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 data_node,
                 output_dir=None):
        self.stats = stats
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.task_type = task_type
        self.metric = metric
        self.output_dir = output_dir
        self.node = data_node

        self.predictions = []
        self.train_labels = None
        self.timestamp = str(time.time())
        logger_name = 'EnsembleBuilder'
        self.logger = get_logger(logger_name)

        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (_, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model = pkl.load(f)
                _node = self.node.copy_()
                _node = construct_node(_node, op_list)

                # TODO: Test size
                test_size = 0.33
                X, y = _node.data

                if self.task_type in CLS_TASKS:
                    ss = StratifiedShuffleSplit(n_splits=1,
                                                test_size=test_size,
                                                random_state=1)
                else:
                    ss = ShuffleSplit(n_splits=1,
                                      test_size=test_size,
                                      random_state=1)

                for train_index, val_index in ss.split(X, y):
                    X_valid = X[val_index]
                    y_valid = y[val_index]

                if self.train_labels is not None:
                    assert (self.train_labels == y_valid).all()
                else:
                    self.train_labels = y_valid

                if self.task_type in CLS_TASKS:
                    y_valid_pred = model.predict_proba(X_valid)
                else:
                    y_valid_pred = model.predict(X_valid)
                self.predictions.append(y_valid_pred)

        if len(self.predictions) < self.ensemble_size:
            self.ensemble_size = len(self.predictions)

        if ensemble_method == 'ensemble_selection':
            return

        if task_type in CLS_TASKS:
            self.base_model_mask = choose_base_models_classification(
                np.array(self.predictions), self.ensemble_size)
        else:
            self.base_model_mask = choose_base_models_regression(
                np.array(self.predictions), np.array(y_valid),
                self.ensemble_size)
        self.ensemble_size = sum(self.base_model_mask)
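The equality assertion on `self.train_labels` works because a splitter built with a fixed `random_state` reproduces exactly the same indices on every call, so every model is evaluated on an identical validation set. A quick sketch of that property:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.arange(20).reshape(10, 2)
y = np.array([0, 1] * 5)

def val_indices():
    # A fresh splitter with the same random_state yields the same split.
    ss = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=1)
    _, val_index = next(ss.split(X, y))
    return val_index

assert (val_indices() == val_indices()).all()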
Example #9
    def fit(self, data):
        # Split training data for phase 1 and phase 2
        if self.task_type in CLS_TASKS:
            kf = StratifiedKFold(n_splits=self.kfold)
        else:
            kf = KFold(n_splits=self.kfold)

        # Train basic models using a part of training data
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (config, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model = pkl.load(f)
                _node = data.copy_()

                _node = construct_node(_node, op_list, mode='train')

                X, y = _node.data
                if self.base_model_mask[model_cnt] == 1:
                    for j, (train, test) in enumerate(kf.split(X, y)):
                        x_p1, x_p2, y_p1 = X[train], X[test], y[train]
                        estimator = fetch_predict_estimator(
                            self.task_type,
                            algo_id,
                            config,
                            x_p1,
                            y_p1,
                            weight_balance=data.enable_balance,
                            data_balance=data.data_balance,
                            combined=True)
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-model%d_part%d' %
                                    (self.timestamp, model_cnt, j)),
                                'wb') as f:
                            pkl.dump(estimator, f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(x_p2)
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary classification
                                n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(train) + len(test)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            if n_dim == 1:
                                feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
                            else:
                                feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                        else:
                            pred = estimator.predict(x_p2).reshape(-1, 1)
                            n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(train) + len(test)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
        # Train the meta-learner on the out-of-fold phase-2 predictions
        self.meta_learner.fit(feature_p2, y)
        return self
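Unlike the blending variant in Example #2, this `fit` fills the phase-2 matrix with out-of-fold predictions: each row comes from an estimator that never saw that sample during training, so the whole label vector `y` can supervise the meta-learner. A self-contained sketch of that construction for a single base model (binary task, so `n_dim == 1`):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=1)
kf = StratifiedKFold(n_splits=5)

# One column per base model; a single model here for brevity.
feature_p2 = np.zeros((len(X), 1))
for train, test in kf.split(X, y):
    est = DecisionTreeClassifier(random_state=1).fit(X[train], y[train])
    # Out-of-fold positive-class probabilities fill this fold's rows.
    feature_p2[test, 0:1] = est.predict_proba(X[test])[:, 1:2]

meta_learner = LogisticRegression().fit(feature_p2, y)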