Code example #1
File: automl.py Project: fossabot/mljar-supervised
 def _check_imbalanced(self, y):
     v = y.value_counts()
     # at least 10 samples of each class
     ii = v < 10
     if np.sum(ii):
         raise AutoMLException(
             f"There need to be at least 10 samples of each class, for class {list(v[ii].index)} there is {v[ii].values} samples"
         )
     # at least 1% of all samples for each class
     v = y.value_counts(normalize=True) * 100.0
     ii = v < 1.0
     if np.sum(ii):
         raise AutoMLException(
             f"There need to be at least 1% of samples of each class, for class {list(v[ii].index)} there is {v[ii].values} % of samples"
         )
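A minimal, self-contained sketch of the same check, runnable outside the class. The `AutoMLException` stand-in, the helper name `check_imbalanced`, and the sample data are assumptions for illustration; the 10-sample and 1% thresholds come from the example above.

    import numpy as np
    import pandas as pd

    class AutoMLException(Exception):
        """Stand-in for the library's AutoMLException."""

    def check_imbalanced(y, min_samples=10, min_percent=1.0):
        y = pd.Series(y)
        counts = y.value_counts()
        too_few = counts < min_samples
        if np.sum(too_few):
            raise AutoMLException(
                f"Need at least {min_samples} samples per class; "
                f"classes {list(counts[too_few].index)} have {counts[too_few].values} samples"
            )
        percents = y.value_counts(normalize=True) * 100.0
        too_rare = percents < min_percent
        if np.sum(too_rare):
            raise AutoMLException(
                f"Need at least {min_percent}% of samples per class; "
                f"classes {list(percents[too_rare].index)} have {percents[too_rare].values}% of samples"
            )

    # 95 zeros and 5 ones: class 1 has fewer than 10 samples, so this raises
    try:
        check_imbalanced([0] * 95 + [1] * 5)
    except AutoMLException as e:
        print(e)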
Code example #2
File: automl.py Project: ptesan777/mljar-supervised
    def _initial_prep(self,
                      X_train,
                      y_train,
                      X_validation=None,
                      y_validation=None):

        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)

        if not isinstance(X_train.columns[0], str):
            X_train.columns = [str(c) for c in X_train.columns]

        X_train.reset_index(drop=True, inplace=True)

        if isinstance(y_train, pd.DataFrame):
            if "target" not in y_train.columns:
                raise AutoMLException(
                    "y_train should be Numpy array, Pandas Series or DataFrame with column 'target' "
                )
            else:
                y_train = y_train["target"]
        y_train = pd.Series(np.array(y_train), name="target")

        X_train, y_train = ExcludeRowsMissingTarget.transform(X_train,
                                                              y_train,
                                                              warn=True)

        return X_train, y_train, X_validation, y_validation
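A quick illustration of the target handling above: whether y_train arrives as a list, a NumPy array, or a DataFrame with a 'target' column, it ends up as a pandas Series named "target". The inputs below are made up.

    import numpy as np
    import pandas as pd

    for y_train in (
        [0, 1, 0, 1],                            # plain list
        np.array([0, 1, 0, 1]),                  # NumPy array
        pd.DataFrame({"target": [0, 1, 0, 1]}),  # DataFrame with a 'target' column
    ):
        if isinstance(y_train, pd.DataFrame):
            y_train = y_train["target"]
        y_train = pd.Series(np.array(y_train), name="target")
        print(type(y_train).__name__, y_train.name)  # Series target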
Code example #3
 def create_dir(self, model_path):
     if not os.path.exists(model_path):
         try:
             os.mkdir(model_path)
         except Exception as e:
             raise AutoMLException(
                 f"Cannot create directory {model_path}. {str(e)}")
Code example #4
File: automl.py Project: amoonhappy/mljar-supervised
    def train_model(self, params):

        model_path = os.path.join(self._results_path, params["name"])

        early_stop = EarlyStopping(
            {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
        )
        time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
        mf = ModelFramework(params, callbacks=[early_stop, time_constraint])

        if self._enough_time_to_train(mf.get_type()):

            logger.info(
                f"Train model #{len(self._models)+1} / Model name: {params['name']}"
            )

            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}")

            mf.train()  # {"train": {"X": X, "y": y}})

            mf.save(model_path)
            self._model_paths += [model_path]

            self.keep_model(mf)

        else:
            logger.info(
                f"Cannot check more models of {mf.get_type()} because of time constraint"
            )
Code example #5
File: automl.py Project: amoonhappy/mljar-supervised
    def load(self):
        logger.info("Loading AutoML models ...")
        try:
            params = json.load(open(os.path.join(self._results_path, "params.json")))

            self._model_paths = params["saved"]
            self._ml_task = params["ml_task"]
            self._optimize_metric = params["optimize_metric"]

            models_map = {}
            for model_path in self._model_paths:
                if model_path.endswith("ensemble"):
                    ens = Ensemble.load(model_path, models_map)
                    models_map[ens.get_name()] = ens
                else:
                    m = ModelFramework.load(model_path)
                    self._models += [m]
                    models_map[m.get_name()] = m

            best_model_name = None
            with open(os.path.join(self._results_path, "best_model.txt"), "r") as fin:
                best_model_name = fin.read()

            self._best_model = models_map[best_model_name]

            data_info_path = os.path.join(self._results_path, "data_info.json")
            self._data_info = json.load(open(data_info_path))
        except Exception as e:
            raise AutoMLException(f"Cannot load AutoML directory. {str(e)}")
Code example #6
File: automl.py Project: ptesan777/mljar-supervised
    def predict(self, X):
        """
        Computes predictions from AutoML best model.

        :param X: The Pandas DataFrame with input data. The input data should have the same columns as data used for training, otherwise the `AutoMLException` will be raised.
        """
        if self._best_model is None:
            return None

        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]

        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict")
        X = X[self._data_info["columns"]]

        # is stacked model
        if self._best_model._is_stacked:
            self.stack_models()
            X_stacked = self.get_stacked_data(X, mode="predict")

            if self._best_model.get_type() == "Ensemble":
                # Ensemble is using both original and stacked data
                predictions = self._best_model.predict(X, X_stacked)
            else:
                predictions = self._best_model.predict(X_stacked)
        else:
            predictions = self._best_model.predict(X)

        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )

            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                neg_label = int(neg_label)
                pos_label = int(pos_label)
            # assume that it is binary classification
            predictions[
                "label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map({
                True: pos_label,
                False: neg_label
            })
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                predictions["label"] = predictions["label"].astype(int)
            return predictions
        else:
            return predictions
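The `[11:]` slices strip the 11-character `prediction_` prefix from the probability column names to recover the class labels. A small sketch of the thresholding step on a toy predictions frame; the column names, probabilities, and the 0.5 threshold are assumptions standing in for the best model's output and `_threshold`.

    import pandas as pd

    # toy probabilities, shaped like the best model's output
    predictions = pd.DataFrame({
        "prediction_no":  [0.9, 0.3, 0.6],
        "prediction_yes": [0.1, 0.7, 0.4],
    })
    threshold = 0.5  # assumed; the real value comes from self._best_model._threshold

    neg_label = predictions.columns[0][11:]  # "no"
    pos_label = predictions.columns[1][11:]  # "yes"

    predictions["label"] = predictions.iloc[:, 1] > threshold
    predictions["label"] = predictions["label"].map({True: pos_label, False: neg_label})
    print(predictions["label"].tolist())  # ['no', 'yes', 'no']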
Code example #7
    def get_split(self, k, repeat=0):
        try:
            train_index_file = os.path.join(self._results_path, "folds",
                                            f"fold_{k}_train_indices.npy")
            validation_index_file = os.path.join(
                self._results_path, "folds",
                f"fold_{k}_validation_indices.npy")

            train_index = np.load(train_index_file)
            validation_index = np.load(validation_index_file)

            X = load_data(self._X_path)
            y = load_data(self._y_path)
            y = y["target"]

            sample_weight = None
            if self._sample_weight_path is not None:
                sample_weight = load_data(self._sample_weight_path)
                sample_weight = sample_weight["sample_weight"]

            train_data = {"X": X.iloc[train_index], "y": y.iloc[train_index]}
            validation_data = {
                "X": X.iloc[validation_index],
                "y": y.iloc[validation_index],
            }
            if sample_weight is not None:
                train_data["sample_weight"] = sample_weight.iloc[train_index]
                validation_data["sample_weight"] = sample_weight.iloc[
                    validation_index]
        except Exception as e:
            import traceback

            print(traceback.format_exc())
            raise AutoMLException("Problem with custom validation. " + str(e))
        return (train_data, validation_data)
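A self-contained sketch of the folds/ file convention this method reads: one .npy index file per fold for the train and validation rows. The temporary directory and toy data are assumptions; the file names follow the example above.

    import os
    import tempfile
    import numpy as np
    import pandas as pd

    results_path = tempfile.mkdtemp()              # stand-in for self._results_path
    folds_dir = os.path.join(results_path, "folds")
    os.makedirs(folds_dir)

    # fold 0 of a 10-row dataset: first 8 rows train, last 2 validate
    np.save(os.path.join(folds_dir, "fold_0_train_indices.npy"), np.arange(8))
    np.save(os.path.join(folds_dir, "fold_0_validation_indices.npy"), np.arange(8, 10))

    X = pd.DataFrame({"feature": range(10)})
    y = pd.Series(range(10), name="target")

    train_index = np.load(os.path.join(folds_dir, "fold_0_train_indices.npy"))
    validation_index = np.load(os.path.join(folds_dir, "fold_0_validation_indices.npy"))

    train_data = {"X": X.iloc[train_index], "y": y.iloc[train_index]}
    validation_data = {"X": X.iloc[validation_index], "y": y.iloc[validation_index]}
    print(len(train_data["X"]), len(validation_data["X"]))  # 8 2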
Code example #8
 def _check_is_fitted(self):
     # First check if model can be loaded
     self._check_can_load()
     # Check if fitted
     if self._fit_level != "finished":
         raise AutoMLException(
             "This model has not been fitted yet. Please call `fit()` first."
         )
Code example #9
    def _base_predict(self, X):
        self._check_is_fitted()

        X = self._build_dataframe(X)
        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]

        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict")

        X = X[self._data_info["columns"]]
        self._validate_X_predict(X)

        # is stacked model
        if self._best_model._is_stacked:
            self._perform_model_stacking()
            X_stacked = self.get_stacked_data(X, mode="predict")

            if self._best_model.get_type() == "Ensemble":
                # Ensemble is using both original and stacked data
                predictions = self._best_model.predict(X, X_stacked)
            else:
                predictions = self._best_model.predict(X_stacked)
        else:
            predictions = self._best_model.predict(X)

        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )

            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                neg_label = int(neg_label)
                pos_label = int(pos_label)
            # assume that it is binary classification
            predictions[
                "label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map({
                True: pos_label,
                False: neg_label
            })
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                predictions["label"] = predictions["label"].astype(np.int32)
            return predictions
        # Regression
        else:
            return predictions
Code example #10
    def get_algorithm(cls, params):
        alg_type = params.get("model_type", "Xgboost")
        ml_task = params.get("ml_task", BINARY_CLASSIFICATION)

        try:
            Algorithm = AlgorithmsRegistry.get_algorithm_class(ml_task, alg_type)
            return Algorithm(params)
        except Exception as e:
            raise AutoMLException(f"Cannot get algorithm class. {str(e)}")
Code example #11
    def _predict_all(self, X):
        # Check if the task type is correct
        if self._ml_task == REGRESSION:
            raise AutoMLException(
                f"Method `predict_all()` can only be used when in classification tasks. Current task: '{self._ml_task}'."
            )

        # Make and return predictions
        return self._base_predict(X)
Code example #12
File: automl.py Project: ptesan777/mljar-supervised
    def train_model(self, params):

        model_path = os.path.join(self._results_path, params["name"])
        early_stop = EarlyStopping({
            "metric": {
                "name": self._optimize_metric
            },
            "log_to_dir": model_path
        })

        learner_time_constraint = LearnerTimeConstraint({
            "learner_time_limit":
            self._get_learner_time_limit(
                params["learner"]["model_type"]),  # self._time_limit,
            "min_steps":
            params["additional"].get("min_steps"),
        })

        total_time_constraint = TotalTimeConstraint({
            "total_time_limit":
            self._total_time_limit if self._model_time_limit is None else None,
            "total_time_start":
            self._start_time,
        })

        mf = ModelFramework(
            params,
            callbacks=[
                early_stop, learner_time_constraint, total_time_constraint
            ],
        )

        if self._enough_time_to_train(mf.get_type()):

            # self.verbose_print(params["name"] + " training start ...")
            logger.info(
                f"Train model #{len(self._models)+1} / Model name: {params['name']}"
            )

            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}")

            mf.train(model_path)

            mf.save(model_path)
            self._model_paths += [model_path]

            self.keep_model(mf)

            # save the best one in the case the training will be interrupted
            self.select_and_save_best()
        else:
            logger.info(
                f"Cannot train {mf.get_type()} because of time constraint")
Code example #13
    def _score(self, X, y=None):
        # y default must be None for scikit-learn compatibility

        # Check if y is None
        if y is None:
            raise AutoMLException("y must be specified.")

        predictions = self._predict(X)
        return (r2_score(y, predictions) if self._ml_task == REGRESSION else
                accuracy_score(y, predictions))
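The same scoring rule as a standalone function: R² for regression, plain accuracy otherwise. The string task constants and the toy data below are assumptions for illustration.

    from sklearn.metrics import accuracy_score, r2_score

    def score(y_true, y_pred, ml_task):
        # regression -> R^2, classification -> accuracy, mirroring _score above
        if ml_task == "regression":
            return r2_score(y_true, y_pred)
        return accuracy_score(y_true, y_pred)

    print(score([1.0, 2.0, 3.0], [1.1, 1.9, 3.2], "regression"))       # R^2 close to 1
    print(score([0, 1, 1, 0], [0, 1, 0, 0], "binary_classification"))  # 0.75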
Code example #14
    def _predict_proba(self, X):
        # Check if the task type is correct
        if self._ml_task == REGRESSION:
            raise AutoMLException(
                f"Method `predict_proba()` can only be used when in classification tasks. Current task: '{self._ml_task}'."
            )

        # Make and return predictions
        # If classification task the result is in column 'label'
        # Need to drop `label` column.
        return self._base_predict(X).drop(["label"], axis=1).to_numpy()
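For classification, `_base_predict()` returns probability columns plus a `label` column; dropping `label` and converting to NumPy yields the usual (n_samples, n_classes) probability matrix. A toy frame makes the shape explicit (the values are made up).

    import pandas as pd

    predictions = pd.DataFrame({
        "prediction_0": [0.8, 0.3],
        "prediction_1": [0.2, 0.7],
        "label": [0, 1],
    })
    proba = predictions.drop(["label"], axis=1).to_numpy()
    print(proba.shape)  # (2, 2)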
Code example #15
    def __init__(
        self,
        results_path,
        ml_task,
        eval_metric,
        time_budget=3600,
        init_params={},
        verbose=True,
        n_jobs=-1,
        random_state=42,
    ):
        if eval_metric.name not in [
                "auc",
                "logloss",
                "rmse",
                "mse",
                "mae",
                "mape",
                "r2",
                "spearman",
                "pearson",
                "f1",
                "average_precision",
                "accuracy",
                "user_defined_metric",
        ]:
            raise AutoMLException(
                f"Metric {eval_metric.name} is not supported")

        self.study_dir = os.path.join(results_path, "optuna")
        if not os.path.exists(self.study_dir):
            try:
                os.mkdir(self.study_dir)
            except Exception as e:
                print("Problem while creating directory for optuna studies.",
                      str(e))
        self.tuning_fname = os.path.join(self.study_dir, "optuna.json")
        self.tuning = init_params
        self.eval_metric = eval_metric

        self.direction = ("maximize" if Metric.optimize_negative(
            eval_metric.name) else "minimize")
        self.n_warmup_steps = (
            500  # set large enough to give small learning rates a chance
        )
        self.time_budget = time_budget
        self.verbose = verbose
        self.ml_task = ml_task
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.cat_features_indices = []
        self.load()
        if not self.verbose:
            optuna.logging.set_verbosity(optuna.logging.CRITICAL)
Code example #16
File: automl.py Project: shiji203/mljar-supervised
    def _set_results_dir(self):
        if self._results_path is None:
            found = False
            for i in range(1, 101):
                self._results_path = f"AutoML_{i}"
                if not os.path.exists(self._results_path):
                    found = True
                    break
            if not found:
                raise AutoMLException("Cannot create directory for AutoML results")

        if os.path.exists(self._results_path):
            print(f"Directory {self._results_path} already exists")
            self.load()
        elif self._results_path is not None:
            print(f"Create directory {self._results_path}")
            try:
                os.mkdir(self._results_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {self._results_path}")
Code example #17
    def on_learner_train_end(self, logs):
        if (
            self.total_time_limit is not None
            and len(self.learners) == 1
            and self.expected_learners_cnt > 1
            # just check for the first learner
            # need to have more than 1 learner
            # otherwise the training is finishing anyway
        ):
            one_fold_time = time.time() - self.train_start_time
            estimate_all_folds = one_fold_time * self.expected_learners_cnt

            total_elapsed_time = np.round(time.time() - self.total_time_start, 2)

            # we need to add time for the rest of learners (assuming that all folds training time is the same)
            estimate_elapsed_time = total_elapsed_time + one_fold_time * (
                self.expected_learners_cnt - 1
            )

            if estimate_elapsed_time >= self.total_time_limit:
                raise AutoMLException(
                    "Stop training after the first fold. "
                    f"Time needed to train on the first fold {np.round(one_fold_time)} seconds. "
                    "The time estimate for training on all folds is larger than total_time_limit."
                )
        if (
            self.total_time_limit is not None
            and len(self.learners) < self.expected_learners_cnt
            # don't stop for the last learner, we are finishing anyway
        ):
            total_elapsed_time = np.round(time.time() - self.total_time_start, 2)

            if total_elapsed_time > self.total_time_limit + 600:
                # add 10 minutes of margin
                # margin is added because of unexpected time changes
                # if training time on each fold is the same,
                # the training will already be stopped after the first fold (condition above)
                raise AutoMLException(
                    "Force to stop the training. "
                    "Total time for AutoML training already exceeded."
                )
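The arithmetic behind the first-fold early stop, with made-up numbers: if the first fold's training time, projected over the remaining folds, already exceeds the total time limit, the callback raises and training stops after fold one.

    import numpy as np

    total_time_limit = 600        # seconds, assumed budget
    expected_learners_cnt = 5     # e.g. 5-fold cross-validation
    one_fold_time = 150.0         # measured duration of the first fold
    total_elapsed_time = 155.0    # seconds since AutoML training started

    # assume the remaining folds take as long as the first one
    estimate_elapsed_time = total_elapsed_time + one_fold_time * (expected_learners_cnt - 1)
    print(np.round(estimate_elapsed_time), ">", total_time_limit)  # 755.0 > 600 -> stop after fold 1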
Code example #18
    def _get_results_path(self):
        """ Gets the current results_path"""
        # if we already have the results path set, please return it
        if self._results_path is not None:
            return self._results_path

        self._validate_results_path()

        path = self.results_path

        if path is None:
            for i in range(1, 10001):
                name = f"AutoML_{i}"
                if not os.path.exists(name):
                    self.create_dir(name)
                    self._results_path = name
                    return name
            # If it got here, the directory could not be created; raise an exception
            raise AutoMLException("Cannot create directory for AutoML results")
        elif os.path.exists(self.results_path) and os.path.exists(
                os.path.join(
                    self.results_path,
                    "params.json")):  # AutoML already loaded, return path
            self._results_path = path
            return path
        # Dir does not exist, create it
        elif not os.path.exists(path):
            self.create_dir(path)
            self._results_path = path
            return path
        # Dir exists and is empty, use it
        elif os.path.exists(path) and not len(os.listdir(path)):
            self._results_path = path
            return path
        elif os.path.exists(path) and len(os.listdir(path)):
            raise AutoMLException(
                f"Cannot set directory for AutoML. Directory '{path}' is not empty."
            )

        raise AutoMLException("Cannot set directory for AutoML results")
Code example #19
 def _set_metric(self):
     """ Set and validate the metric to be optimized. """
     if self._ml_task == BINARY_CLASSIFICATION:
         if self._user_set_optimize_metric is None:
             self._optimize_metric = "logloss"
         elif self._user_set_optimize_metric not in ["logloss", "auc"]:
             raise AutoMLException(
                 "Metric {} is not allowed in ML task: {}".format(
                     self._user_set_optimize_metric, self._ml_task
                 )
             )
         else:
             self._optimize_metric = self._user_set_optimize_metric
     elif self._ml_task == MULTICLASS_CLASSIFICATION:
         if self._user_set_optimize_metric is None:
             self._optimize_metric = "logloss"
         elif self._user_set_optimize_metric not in ["logloss"]:
             raise AutoMLException(
                 "Metric {} is not allowed in ML task: {}".format(
                     self._user_set_optimize_metric, self._ml_task
                 )
             )
         else:
             self._optimize_metric = self._user_set_optimize_metric
     elif self._ml_task == REGRESSION:
         if self._user_set_optimize_metric is None:
             self._optimize_metric = "rmse"
         elif self._user_set_optimize_metric not in ["rmse"]:
             raise AutoMLException(
                 "Metric {} is not allowed in ML task: {}".format(
                     self._user_set_optimize_metric, self._ml_task
                 )
             )
         else:
             self._optimize_metric = self._user_set_optimize_metric
     logger.info(
         "AutoML will optimize for metric: {0}".format(self._optimize_metric)
     )
     print(f"AutoML will optimize for metric: {self._optimize_metric}")
Code example #20
File: automl.py Project: amoonhappy/mljar-supervised
    def ensemble_step(self):
        if self._train_ensemble:
            self.ensemble = Ensemble(self._optimize_metric, self._ml_task)
            oofs, target = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(oofs, target)
            self.keep_model(self.ensemble)

            ensemble_path = os.path.join(self._results_path, "ensemble")
            try:
                os.mkdir(ensemble_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {ensemble_path}")
            self.ensemble.save(ensemble_path)
            self._model_paths += [ensemble_path]
Code example #21
    def __init__(self, params):

        # kfold is default validation technique
        self.validation_type = params.get("validation_type", "kfold")

        if self.validation_type == "kfold":
            self.validator = KFoldValidator(params)
        elif self.validation_type == "split":
            self.validator = SplitValidator(params)
        else:
            raise AutoMLException(
                f"The validation type ({self.validation_type}) is not implemented."
            )
        """
Code example #22
    def __init__(self, params):
        BaseValidator.__init__(self, params)

        self.train_ratio = self.params.get("train_ratio", 0.8)
        self.shuffle = self.params.get("shuffle", True)
        self.stratify = self.params.get("stratify", False)
        self.random_seed = self.params.get("random_seed", 1234)
        log.debug("SplitValidator, train_ratio: {0}".format(self.train_ratio))

        self._results_path = self.params.get("results_path")
        self._X_path = self.params.get("X_path")
        self._y_path = self.params.get("y_path")

        if self._X_path is None or self._y_path is None:
            raise AutoMLException("No data path set in SplitValidator params")
Code example #23
    def transform(self, X):
        if self._kmeans is None:
            raise AutoMLException("KMeans not fitted")

        # scale
        X_scaled = self._scale.transform(X[self._input_columns])

        # kmeans
        distances = self._kmeans.transform(X_scaled)
        clusters = self._kmeans.predict(X_scaled)

        X[self._new_features[:-1]] = distances
        X[self._new_features[-1]] = clusters

        return X
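A self-contained sketch of the fit side that this transform() assumes: a StandardScaler plus a KMeans model, with one new column per cluster distance and a final column for the cluster id. The column names, cluster count, and random data are assumptions.

    import numpy as np
    import pandas as pd
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    rng = np.random.RandomState(42)
    X = pd.DataFrame(rng.rand(100, 3), columns=["f1", "f2", "f3"])

    input_columns = ["f1", "f2", "f3"]
    n_clusters = 4
    new_features = [f"Dist_Cluster_{i}" for i in range(n_clusters)] + ["Cluster"]

    scale = StandardScaler().fit(X[input_columns])
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(scale.transform(X[input_columns]))

    # transform: distance to each centroid, then the assigned cluster id
    X_scaled = scale.transform(X[input_columns])
    distances = kmeans.transform(X_scaled)
    for i, name in enumerate(new_features[:-1]):
        X[name] = distances[:, i]
    X[new_features[-1]] = kmeans.predict(X_scaled)
    print(X.columns.tolist())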
Code example #24
    def ensemble_step(self):
        if self._train_ensemble and len(self._models) > 1:
            self.ensemble = Ensemble(self._optimize_metric, self._ml_task)
            oofs, target = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(oofs, target)
            self.keep_model(self.ensemble)

            ensemble_path = os.path.join(self._results_path, "ensemble")
            try:
                os.mkdir(ensemble_path)
            except Exception as e:
                raise AutoMLException(
                    f"Cannot create directory {ensemble_path}")
            self.ensemble.save(ensemble_path)
            self._model_paths += [ensemble_path]
            # save the best one in the case the training will be interrupted
            self.select_and_save_best()
Code example #25
    def __init__(
        self,
        results_path,
        ml_task,
        eval_metric,
        time_budget=3600,
        init_params={},
        verbose=True,
        n_jobs=-1,
        random_state=42,
    ):
        if eval_metric.name not in ["auc", "logloss", "rmse", "mae", "mape"]:
            raise AutoMLException(
                f"Metric {eval_metric.name} is not supported")

        self.study_dir = os.path.join(results_path, "optuna")
        if not os.path.exists(self.study_dir):
            try:
                os.mkdir(self.study_dir)
            except Exception as e:
                print("Problem while creating directory for optuna studies.",
                      str(e))
        self.tuning_fname = os.path.join(self.study_dir, "optuna.json")
        self.tuning = init_params
        self.eval_metric = eval_metric

        self.direction = ("maximize" if Metric.optimize_negative(
            eval_metric.name) else "minimize")
        self.n_warmup_steps = 500  # set large enough to give small learning rates a chance
        self.time_budget = time_budget
        self.verbose = verbose
        self.ml_task = ml_task
        self.n_jobs = n_jobs
        self.random_state = random_state

        self.cat_features_indices = []
        data_info_fname = os.path.join(results_path, "data_info.json")
        if os.path.exists(data_info_fname):
            data_info = json.loads(open(data_info_fname).read())
            for i, (k, v) in enumerate(data_info["columns_info"].items()):
                if "categorical" in v:
                    self.cat_features_indices += [i]

        self.load()
        if not self.verbose:
            optuna.logging.set_verbosity(optuna.logging.CRITICAL)
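A sketch of how the categorical feature indices are collected from data_info.json in the loop above; the file contents below are made up to show the shape the code expects (a "columns_info" mapping whose values mention "categorical" for categorical columns).

    import json

    data_info = json.loads("""
    {
      "columns_info": {
        "age":    ["scale"],
        "city":   ["categorical"],
        "income": ["scale"],
        "gender": ["categorical"]
      }
    }
    """)

    cat_features_indices = []
    for i, (column, info) in enumerate(data_info["columns_info"].items()):
        if "categorical" in info:
            cat_features_indices.append(i)

    print(cat_features_indices)  # [1, 3]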
Code example #26
File: automl.py Project: shiji203/mljar-supervised
    def _set_algorithms(self):
        """ Set and validate available algorithms.

        If algorithms are not set, all algorithms from registry are used.
        Then perform validation of algorithms.
        """
        if len(self._algorithms) == 0:
            self._algorithms = list(AlgorithmsRegistry.registry[self._ml_task].keys())

        for a in self._algorithms:
            if a not in list(AlgorithmsRegistry.registry[self._ml_task].keys()):
                raise AutoMLException(
                    "The algorithm {} is not allowed to use for ML task: {}. Allowed algorithms: {}".format(
                        a, self._ml_task, list(AlgorithmsRegistry.registry[self._ml_task].keys())
                    )
                )
        logger.info("AutoML will use algorithms: {}".format(self._algorithms))
        print(f"AutoML will use algorithms: {self._algorithms}")
Code example #27
File: automl.py Project: shiji203/mljar-supervised
    def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None):

        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)

        if not isinstance(X_train.columns[0], str):
            X_train.columns = [str(c) for c in X_train.columns]

        X_train.reset_index(drop=True, inplace=True)

        if not isinstance(y_train, pd.DataFrame):
            y_train = pd.DataFrame({"target": np.array(y_train)})
        else:
            if "target" not in y_train.columns:
                raise AutoMLException("There should be target column in y_train")
        y_train.reset_index(drop=True, inplace=True)

        return X_train, y_train["target"], X_validation, y_validation
Code example #28
    def load(self, path):
        logger.info("Loading AutoML models ...")
        try:
            params = json.load(open(os.path.join(path, "params.json")))

            self._model_paths = params["saved"]
            self._ml_task = params["ml_task"]
            self._eval_metric = params["eval_metric"]
            stacked_models = params.get("stacked")

            models_map = {}
            for model_path in self._model_paths:
                if model_path.endswith("Ensemble") or model_path.endswith(
                    "Ensemble_Stacked"
                ):
                    ens = Ensemble.load(model_path, models_map)
                    self._models += [ens]
                    models_map[ens.get_name()] = ens
                else:
                    m = ModelFramework.load(model_path)
                    self._models += [m]
                    models_map[m.get_name()] = m

            if stacked_models is not None:
                self._stacked_models = []
                for stacked_model_name in stacked_models:
                    self._stacked_models += [models_map[stacked_model_name]]

            best_model_name = None
            with open(os.path.join(path, "best_model.txt"), "r") as fin:
                best_model_name = fin.read()

            self._best_model = models_map[best_model_name]

            data_info_path = os.path.join(path, "data_info.json")
            self._data_info = json.load(open(data_info_path))
            self.n_features_in_ = self._data_info["n_features"]

            if "n_classes" in self._data_info:
                self.n_classes = self._data_info["n_classes"]

            self._fit_level = "finished"
        except Exception as e:
            raise AutoMLException(f"Cannot load AutoML directory. {str(e)}")
Code example #29
    def __init__(self, params):
        BaseValidator.__init__(self, params)

        self.train_ratio = self.params.get("train_ratio", 0.8)
        self.shuffle = self.params.get("shuffle", True)
        self.stratify = self.params.get("stratify", False)
        self.random_seed = self.params.get("random_seed", 1234)
        self.repeats = self.params.get("repeats", 1)

        if not self.shuffle and self.repeats > 1:
            warnings.warn(
                "Disable repeats in validation because shuffle is disabled")
            self.repeats = 1

        self._results_path = self.params.get("results_path")
        self._X_path = self.params.get("X_path")
        self._y_path = self.params.get("y_path")

        if self._X_path is None or self._y_path is None:
            raise AutoMLException("No data path set in SplitValidator params")
Code example #30
File: automl.py Project: shiji203/mljar-supervised
    def predict(self, X):
        if self._best_model is None:
            return None

        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]

        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict"
                )
        X = X[self._data_info["columns"]]

        predictions = self._best_model.predict(X)

        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )
            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            # assume that it is binary classification
            predictions["label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map(
                {True: pos_label, False: neg_label}
            )
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            return predictions
        else:
            return predictions