Example #1
    def load(self):
        logger.info("Loading AutoML models ...")
        try:
            params_path = os.path.join(self._results_path, "params.json")
            with open(params_path) as fin:
                params = json.load(fin)

            self._model_paths = params["saved"]
            self._ml_task = params["ml_task"]
            self._optimize_metric = params["optimize_metric"]
            stacked_models = params.get("stacked")

            models_map = {}
            for model_path in self._model_paths:
                if model_path.endswith("Ensemble") or model_path.endswith(
                        "Ensemble_Stacked"):
                    ens = Ensemble.load(model_path, models_map)
                    self._models += [ens]
                    models_map[ens.get_name()] = ens
                else:
                    m = ModelFramework.load(model_path)
                    self._models += [m]
                    models_map[m.get_name()] = m

            if stacked_models is not None:
                self._stacked_models = []
                for stacked_model_name in stacked_models:
                    self._stacked_models += [models_map[stacked_model_name]]

            with open(os.path.join(self._results_path, "best_model.txt"), "r") as fin:
                best_model_name = fin.read()

            self._best_model = models_map[best_model_name]

            data_info_path = os.path.join(self._results_path, "data_info.json")
            with open(data_info_path) as fin:
                self._data_info = json.load(fin)
        except Exception as e:
            raise AutoMLException(f"Cannot load AutoML directory. {str(e)}")
Example #2
    def _set_algorithms(self):
        """ Set and validate available algorithms.

        If algorithms are not set, all algorithms from registry are used.
        Then perform vadlidation of algorithms.
        """
        if len(self._algorithms) == 0:
            self._algorithms = list(
                AlgorithmsRegistry.registry[self._ml_task].keys())

        for a in self._algorithms:
            if a not in list(
                    AlgorithmsRegistry.registry[self._ml_task].keys()):
                raise AutoMLException(
                    "The algorithm {} is not allowed to use for ML task: {}. Allowed algorithms: {}"
                    .format(
                        a,
                        self._ml_task,
                        list(
                            AlgorithmsRegistry.registry[self._ml_task].keys()),
                    ))
        logger.info("AutoML will use algorithms: {}".format(self._algorithms))
        print(f"AutoML will use algorithms: {self._algorithms}")
Example #3
    def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None):

        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)

        if not isinstance(X_train.columns[0], str):
            X_train.columns = [str(c) for c in X_train.columns]

        X_train.reset_index(drop=True, inplace=True)

        if isinstance(y_train, pd.DataFrame):
            if "target" not in y_train.columns:
                raise AutoMLException(
                    "y_train should be Numpy array, Pandas Series or DataFrame with column 'target' "
                )
            else:
                y_train = y_train["target"]
        y_train = pd.Series(np.array(y_train), name="target")

        X_train, y_train = ExcludeRowsMissingTarget.transform(
            X_train, y_train, warn=True
        )

        return X_train, y_train, X_validation, y_validation
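
The helper accepts the target in several forms and always normalizes it to a pandas Series named "target"; rows with a missing target are dropped afterwards. A short illustration of the accepted inputs (values are made up):

    import numpy as np
    import pandas as pd

    y_numpy = np.array([0, 1, 0, 1])                   # NumPy array
    y_series = pd.Series([0, 1, 0, 1])                 # pandas Series
    y_frame = pd.DataFrame({"target": [0, 1, 0, 1]})   # DataFrame with a 'target' column
    # a DataFrame without a 'target' column raises AutoMLException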
Example #4
    def predict(self, X):
        if self._best_model is None:
            return None

        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]

        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict"
                )
        X = X[self._data_info["columns"]]

        predictions = self._best_model.predict(X)

        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )
            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            # assume that it is binary classification
            predictions["label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map(
                {True: pos_label, False: neg_label}
            )
            return predictions
        # multiclass classification and regression: return predictions as-is
        return predictions
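
For binary classification the returned DataFrame carries the class probability columns plus a derived "label" column; the [11:] slicing implies the probability columns are prefixed with "prediction_" (11 characters). A hedged usage sketch, assuming a fitted AutoML instance named automl:

    predictions = automl.predict(X_test)        # X_test must contain every column from data_info
    print(predictions.columns.tolist())         # e.g. ["prediction_0", "prediction_1", "label"]
    labels = predictions["label"]               # labels thresholded at the model's _threshold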
Example #5
    def train_model(self, params):

        model_path = os.path.join(self._results_path, params["name"])

        early_stop = EarlyStopping({
            "metric": {
                "name": self._optimize_metric
            },
            "log_to_dir": model_path
        })
        time_constraint = TimeConstraint(
            {"train_seconds_time_limit": self._time_limit})
        mf = ModelFramework(params, callbacks=[early_stop, time_constraint])

        if self._enough_time_to_train(mf.get_type()):

            logger.info(
                f"Train model #{len(self._models)+1} / Model name: {params['name']}"
            )

            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}")

            mf.train(model_path)

            mf.save(model_path)
            self._model_paths += [model_path]

            self.keep_model(mf)

        else:
            logger.info(
                f"Cannot check more models of {mf.get_type()} because of time constraint"
            )
Example #6
    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML
        
        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        
        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        
        """
        try:

            if self._best_model is not None:
                print(
                    "Best model is already set, no need to run fit. Skipping ..."
                )
                return

            self._start_time = time.time()

            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame")

            self._set_ml_task(y_train)

            if X_train is not None:
                X_train = X_train.copy(deep=False)

            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation)
            self._save_data(X_train, y_train, X_validation, y_validation)

            self._set_algorithms()
            self._set_metric()
            # self._estimate_training_times()

            if self._ml_task in [
                    BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION
            ]:
                self._check_imbalanced(y_train)

            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._explain_level,
                self._data_info,
                self._seed,
            )
            self.tuner = tuner
            self._time_spend = {}
            self._time_start = {}

            # 1. Check simple algorithms
            self._fit_level = "simple_algorithms"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.simple_algorithms_params():
                self.train_model(params)
            self._time_spend["simple_algorithms"] = np.round(
                time.time() - start, 2)

            # 2. Default parameters
            self._fit_level = "default_algorithms"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.default_params(len(self._models)):
                self.train_model(params)
            self._time_spend["default_algorithms"] = np.round(
                time.time() - start, 2)

            # 3. The not-so-random step
            self._fit_level = "not_so_random"
            start = time.time()
            self._time_start[self._fit_level] = start
            generated_params = tuner.get_not_so_random_params(len(
                self._models))
            for params in generated_params:
                self.train_model(params)
            self._time_spend["not_so_random"] = np.round(
                time.time() - start, 2)

            # 4. The hill-climbing step
            self._fit_level = "hill_climbing"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.get_hill_climbing_params(self._models):
                self.train_model(params)
            self._time_spend["hill_climbing"] = np.round(
                time.time() - start, 2)

            # 5. Ensemble unstacked models
            self._fit_level = "ensemble_unstacked"
            start = time.time()
            self._time_start[self._fit_level] = start
            self.ensemble_step()
            self._time_spend["ensemble_unstacked"] = np.round(
                time.time() - start, 2)

            if self._stack:
                # 6. Stack best models
                self._fit_level = "stack"
                start = time.time()
                self._time_start[self._fit_level] = start
                self.stacked_ensemble_step()
                self._time_spend["stack"] = np.round(time.time() - start, 2)

                # 7. Ensemble all models (original and stacked)
                any_stacked = False
                for m in self._models:
                    if m._is_stacked:
                        any_stacked = True
                        break
                if any_stacked:
                    self._fit_level = "ensemble_all"
                    start = time.time()
                    self.ensemble_step(is_stacked=True)
                    self._time_spend["ensemble_all"] = np.round(
                        time.time() - start, 2)

            self._fit_time = time.time() - self._start_time

            logger.info(f"AutoML fit time: {self._fit_time}")

        except Exception as e:
            raise e
        finally:
            if self._X_train_path is not None:
                self._load_data_variables(X_train)
Example #7
    def __init__(self, params):
        BaseValidator.__init__(self, params)

        self.k_folds = self.params.get("k_folds", 5)
        self.shuffle = self.params.get("shuffle", True)
        self.stratify = self.params.get("stratify", False)
        self.random_seed = self.params.get("random_seed", 1906)

        if self.stratify:
            if self.shuffle:
                self.skf = StratifiedKFold(
                    n_splits=self.k_folds,
                    shuffle=self.shuffle,
                    random_state=self.random_seed if self.shuffle else None,
                )
            else:
                self.skf = StratifiedKFold(n_splits=self.k_folds,
                                           shuffle=self.shuffle)
        else:
            self.skf = KFold(
                n_splits=self.k_folds,
                shuffle=self.shuffle,
                random_state=self.random_seed if self.shuffle else None,
            )

        self._results_path = self.params.get("results_path")
        self._X_path = self.params.get("X_path")
        self._y_path = self.params.get("y_path")

        if self._X_path is None or self._y_path is None:
            raise AutoMLException("No data path set in KFoldValidator params")

        folds_path = os.path.join(self._results_path, "folds")

        if not os.path.exists(folds_path):

            os.mkdir(folds_path)

            X = pd.read_parquet(self._X_path)
            y = pd.read_parquet(self._y_path)
            y = y["target"]

            if isinstance(y[0], bytes):
                # see https://github.com/scikit-learn/scikit-learn/issues/16980
                y = y.astype(str)

            for fold_cnt, (train_index,
                           validation_index) in enumerate(self.skf.split(X,
                                                                         y)):

                train_index_file = os.path.join(
                    self._results_path, "folds",
                    f"fold_{fold_cnt}_train_indices.npy")
                validation_index_file = os.path.join(
                    self._results_path,
                    "folds",
                    f"fold_{fold_cnt}_validation_indices.npy",
                )

                np.save(train_index_file, train_index)
                np.save(validation_index_file, validation_index)

            del X
            del y
            gc.collect()

        else:
            log.debug("Folds split already done, reuse it")
Example #8
    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML
        
        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        
        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        """

        try:

            self.load_progress()

            if self._fit_level == "finished":
                print("AutoML is trained. Skipping fit step ...")
                return

            # if self._best_model is not None:
            #    print("Best model is already set, no need to run fit. Skipping ...")
            #    return

            self._start_time = time.time()
            if self._time_ctrl is not None:
                self._start_time -= self._time_ctrl.already_spend()

            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame"
                )

            # Automatic Exploratory Data Analysis
            if self._explain_level == 2:
                EDA.compute(X_train, y_train, os.path.join(self._results_path, "EDA"))

            self._set_ml_task(y_train)

            if X_train is not None:
                X_train = X_train.copy(deep=False)

            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation
            )
            self._save_data(X_train, y_train, X_validation, y_validation)
            self._set_algorithms()
            self._set_metric()

            if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
                self._check_imbalanced(y_train)

            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._explain_level,
                self._data_info,
                self._golden_features,
                self._feature_selection,
                self._train_ensemble,
                self._stack_models,
                self._seed,
            )
            self.tuner = tuner

            steps = tuner.steps()

            if self._time_ctrl is None:
                self._time_ctrl = TimeController(
                    self._start_time,
                    self._total_time_limit,
                    self._model_time_limit,
                    steps,
                    self._algorithms,
                )

            self._time_ctrl.log_time(
                "prepare_data", "prepare_data", "prepare_data", time.time() - self._start_time
            )

            for step in steps:
                self._fit_level = step
                start = time.time()
                # self._time_start[step] = start

                if step == "stack":
                    self.prepare_for_stacking()

                generated_params = []
                if step in self._all_params:
                    generated_params = self._all_params[step]
                else:
                    generated_params = tuner.generate_params(
                        step, self._models, self._results_path, self._stacked_models
                    )

                if generated_params is None:
                    continue
                if generated_params:
                    print("-" * 72)
                    print(f"{step} with {len(generated_params)} models to train ...")

                for params in generated_params:
                    if params.get("status", "") == "trained":
                        print(f"Skipping {params['name']}, already trained.")
                        continue
                    if params.get("status", "") == "skipped":
                        print(f"Skipped {params['name']}.")
                        continue

                    trained = False
                    if "ensemble" in step:
                        trained = self.ensemble_step(is_stacked=params["is_stacked"])
                    else:
                        trained = self.train_model(params)

                    params["status"] = "trained" if trained else "skipped"
                    params["final_loss"] = self._models[-1].get_final_loss()
                    params["train_time"] = self._models[-1].get_train_time()
                    self.save_progress(step, generated_params)

            self._fit_level = "finished"
            self.save_progress()

            print(f"AutoML fit time: {time.time() - self._start_time}")

        except Exception as e:
            raise e
        finally:
            if self._X_train_path is not None:
                self._load_data_variables(X_train)
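
Resumability hinges on the status written back into each generated params dict and persisted by save_progress. A sketch of the fields touched in the loop above (the model name is a placeholder):

    params = {
        "name": "model_1",            # placeholder; directory name under results_path
        "status": "trained",          # set to "trained" or "skipped" after each attempt
        "final_loss": 0.2345,         # copied from the last trained model
        "train_time": 12.3,           # seconds, copied from the last trained model
        "is_stacked": False,          # read by the ensemble steps
    }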
Example #9
    def create_dir(self, model_path):
        if not os.path.exists(model_path):
            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}. {str(e)}")
Example #10
    def fit_and_transform(self, X_train, y_train, sample_weight=None):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must run first, because rows with missing target values may be dropped
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug(
                "target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train, sample_weight = ExcludeRowsMissingTarget.transform(
                X_train, y_train, sample_weight)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder(try_to_fit_numeric=True)
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}),
                                        "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target")

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_LOG_AND_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        numeric_cols = []  # get numeric cols before text transformations
        # needed for golden features
        if X_train is not None and ("golden_features" in self._params
                                    or "kmeans_features" in self._params):
            numeric_cols = X_train.select_dtypes(
                include="number").columns.tolist()

        # there can be missing values in the text data,
        # but we don't want to handle it by fill missing methods
        # zeros will be imputed by text_transform method
        cols_to_process = list(
            filter(
                lambda k: "text_transform" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        new_text_columns = []
        for col in cols_to_process:
            t = TextTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._text_transforms += [t]
            new_text_columns += t._new_columns
        # end of text transform

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            missing = PreprocessingMissingValues(cols_to_process,
                                                 missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        # golden features
        golden_columns = []
        if "golden_features" in self._params:
            results_path = self._params["golden_features"]["results_path"]
            ml_task = self._params["golden_features"]["ml_task"]
            self._golden_features = GoldenFeaturesTransformer(
                results_path, ml_task)
            self._golden_features.fit(X_train[numeric_cols], y_train)
            X_train = self._golden_features.transform(X_train)
            golden_columns = self._golden_features._new_columns

        kmeans_columns = []
        if "kmeans_features" in self._params:
            results_path = self._params["kmeans_features"]["results_path"]
            self._kmeans = KMeansTransformer(results_path, self._model_name,
                                             self._k_fold)
            self._kmeans.fit(X_train[numeric_cols], y_train)
            X_train = self._kmeans.transform(X_train)
            kmeans_columns = self._kmeans._new_features

        for convert_method in [
                PreprocessingCategorical.CONVERT_INTEGER,
                PreprocessingCategorical.CONVERT_ONE_HOT,
                PreprocessingCategorical.CONVERT_LOO,
        ]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train, y_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # datetime transform
        cols_to_process = list(
            filter(
                lambda k: "datetime_transform" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        new_datetime_columns = []
        for col in cols_to_process:

            t = DateTimeTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._datetime_transforms += [t]
            new_datetime_columns += t._new_columns

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            if (len(cols_to_process) and len(new_datetime_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += new_datetime_columns
            if (len(cols_to_process) and len(new_text_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += new_text_columns

            if (len(cols_to_process) and len(golden_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += golden_columns

            if (len(cols_to_process) and len(kmeans_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += kmeans_columns

            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        if self._add_random_feature:
            # -1, 1, with 0 mean
            X_train["random_feature"] = np.random.rand(
                X_train.shape[0]) * 2.0 - 1.0

        if self._drop_features:
            available_cols = X_train.columns.tolist()
            drop_cols = [c for c in self._drop_features if c in available_cols]
            if len(drop_cols) == X_train.shape[1]:
                raise AutoMLException(
                    "All features are droppped! Your data looks like random data."
                )
            if drop_cols:
                X_train.drop(drop_cols, axis=1, inplace=True)
            self._drop_features = drop_cols

        if X_train is not None:
            # there can be categorical columns (in CatBoost) which can't be clipped
            numeric_cols = X_train.select_dtypes(
                include="number").columns.tolist()
            X_train[numeric_cols] = X_train[numeric_cols].clip(
                lower=np.finfo(np.float32).min + 1000,
                upper=np.finfo(np.float32).max - 1000,
            )

        return X_train, y_train, sample_weight
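
Everything in this method is driven by self._params: "columns_preprocessing" maps each column name to a list of transform identifiers, while optional top-level keys ("target_preprocessing", "golden_features", "kmeans_features") switch on the extra steps. An illustrative, not exhaustive, layout using the constants and literal strings from the snippet above and placeholder column names:

    params = {
        "target_preprocessing": [PreprocessingCategorical.CONVERT_INTEGER],
        "columns_preprocessing": {
            "age": [PreprocessingMissingValues.FILL_NA_MEDIAN, Scale.SCALE_NORMAL],
            "city": [PreprocessingCategorical.CONVERT_INTEGER],
            "notes": ["text_transform"],
            "signup_date": ["datetime_transform"],
            "empty_column": ["remove_column"],
        },
    }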
Example #11
    def extensive_eda(X, y, save_path):

        # validate input type and matching number of samples
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X should be a dataframe")
        if X.shape[0] != len(y):
            raise ValueError("X and y should have the same number of samples")

        if X.shape[1] > MAXCOL:
            X = X.iloc[:, :MAXCOL]
            warnings.warn(
                f"AutoML EDA column limit exceeded! running for first {MAXCOL} columns"
            )

        if save_path:
            if not os.path.exists(save_path):
                os.mkdir(save_path)
        else:
            raise ValueError("Please provide a valid path to save the Extensive EDA")

        plt.style.use("ggplot")
        try:

            if PreprocessingUtils.get_type(y) in ("categorical", "discrete"):

                for col in X.columns:

                    if PreprocessingUtils.get_type(X[col]) == "continous":

                        plt.figure(figsize=(5, 5))
                        for i in np.unique(y):
                            sns.kdeplot(
                                x=X.iloc[np.where(y == i)[0]][col],
                                label=f"class {i}",
                                shade=True,
                            )
                        plt.legend()
                        plt.gca().set_title(
                            f"Distribution of {col} for each class",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

                    elif PreprocessingUtils.get_type(X[col]) in (
                        "categorical",
                        "discrete",
                    ):

                        if X[col].nunique() > 7:
                            warnings.warn("Considering 7 the most frequent values")

                        values = X[col].value_counts().index[:7]
                        plt.figure(figsize=(5, 5))
                        sns.countplot(
                            x=X[X[col].isin(values)][col], hue=y[X[col].isin(values)]
                        )
                        plt.gca().set_title(
                            f"Count plot of each {col}",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

            elif PreprocessingUtils.get_type(y) == "continous":
                for col in X.columns:

                    if PreprocessingUtils.get_type(X[col]) == "continous":

                        plt.figure(figsize=(5, 5))
                        plt.scatter(X[col].values, y)
                        plt.gca().set_xlabel(f"{col}")
                        plt.gca().set_ylabel("target")
                        plt.gca().set_title(
                            f"Scatter plot of {col} vs target",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )

                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

                    elif PreprocessingUtils.get_type(X[col]) in (
                        "categorical",
                        "discrete",
                    ):
                        if X[col].nunique() > 7:
                            warnings.warn("Considering 7 the most frequent values")

                        plt.figure(figsize=(5, 5))
                        for i in X[col].value_counts().index[:7]:
                            sns.kdeplot(
                                x=y[X[X[col] == i].index],
                                shade=True,
                                label=f"{col}_{i}",
                            )
                        plt.gca().set_title(
                            f"Distribution of target for each {col}",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.legend()

                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

                    elif PreprocessingUtils.get_type(X[col]) == "datetime":

                        plt.figure(figsize=(5, 5))
                        plt.plot(X[col], y)
                        plt.gca().set_xticklabels(X[col].dt.date, rotation="45")
                        plt.gca().set_title(
                            f"Distribution of target over time",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

            cols = [
                col
                for col in X.columns
                if PreprocessingUtils.get_type(X[col]) == "continous"
            ][:COLS]

            if len(cols) > 0:

                plt.figure(figsize=(10, 10))
                sns.heatmap(X[cols].corr())
                plt.gca().set_title("Heatmap", fontsize=11, weight="bold", alpha=0.75)

            plt.savefig(os.path.join(save_path, "heatmap"))

            with open(os.path.join(save_path, "Extensive_EDA.md"), "w") as fout:

                for col in X.columns:

                    fout.write(f"## Bivariate analysis of {col} feature with target\n")
                    fout.write("\n![]({})\n".format(EDA.plot_fname(col + "_target")))
                    fout.write("\n")
                    fout.write(
                        "------------------------------------------------------\n"
                    )

                if len(cols) > 0:
                    fout.write("## Heatmap\n")
                    fout.write("![](heatmap.png)\n")
                    fout.write("\n")
                    fout.write(
                        "------------------------------------------------------\n"
                    )

        except Exception as e:
            raise AutoMLException(str(e))
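
A hedged usage sketch; it assumes extensive_eda is exposed on the EDA class (for example as a static method, the decorator is not shown above) and that MAXCOL and COLS are module-level limits:

    import pandas as pd

    X = pd.DataFrame({"feature_1": [1.2, 3.4, 5.6, 7.8], "feature_2": ["a", "b", "a", "b"]})
    y = pd.Series([0, 1, 0, 1])
    EDA.extensive_eda(X, y, save_path="EDA_report")   # writes per-column plots and Extensive_EDA.md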
Example #12
    def __init__(
        self,
        results_path,
        ml_task,
        eval_metric,
        time_budget=3600,
        init_params=None,
        verbose=True,
        n_jobs=-1,
        random_state=42,
    ):
        if eval_metric.name not in [
                "auc",
                "logloss",
                "rmse",
                "mse",
                "mae",
                "mape",
                "r2",
                "spearman",
                "pearson",
                "f1",
                "average_precision",
                "accuracy",
        ]:
            raise AutoMLException(
                f"Metric {eval_metric.name} is not supported")

        self.study_dir = os.path.join(results_path, "optuna")
        if not os.path.exists(self.study_dir):
            try:
                os.mkdir(self.study_dir)
            except Exception as e:
                print("Problem while creating directory for optuna studies.",
                      str(e))
        self.tuning_fname = os.path.join(self.study_dir, "optuna.json")
        self.tuning = init_params if init_params is not None else {}
        self.eval_metric = eval_metric

        self.direction = ("maximize" if Metric.optimize_negative(
            eval_metric.name) else "minimize")
        self.n_warmup_steps = (
            500  # set large enough to give small learning rates a chance
        )
        self.time_budget = time_budget
        self.verbose = verbose
        self.ml_task = ml_task
        self.n_jobs = n_jobs
        self.random_state = random_state

        self.cat_features_indices = []
        data_info_fname = os.path.join(results_path, "data_info.json")
        if os.path.exists(data_info_fname):
            with open(data_info_fname) as fin:
                data_info = json.load(fin)
            for i, (k, v) in enumerate(data_info["columns_info"].items()):
                if "categorical" in v:
                    self.cat_features_indices += [i]

        self.load()
        if not self.verbose:
            optuna.logging.set_verbosity(optuna.logging.CRITICAL)
Example #13
    def fit(self, X, y):
        if self._new_features:
            return
        if self._error is not None and self._error:
            raise AutoMLException(
                "Golden Features not created due to error (please check errors.md)."
            )
        if X.shape[1] == 0:
            self._error = f"Golden Features not created. No continous features. Input data shape: {X.shape}, {y.shape}"
            self.save(self._result_file)
            raise AutoMLException(
                "Golden Features not created. No continous features.")

        start_time = time.time()
        combinations = itertools.combinations(X.columns, r=2)
        items = [i for i in combinations]
        if len(items) > 250000:
            si = np.random.choice(len(items), 250000, replace=False)
            items = [items[i] for i in si]

        X_train, X_test, y_train, y_test = self._subsample(X, y)

        for i in range(len(items)):
            items[i] += (X_train, y_train, X_test, y_test, self._scorer)

        scores = []
        # parallel version
        with Pool() as p:
            scores = p.map(get_score, items)
        # single process version
        # for item in items:
        #    scores += [get_score(item)]

        if not scores:
            self._error = f"Golden Features not created. Empty scores. Input data shape: {X.shape}, {y.shape}"
            self.save(self._result_file)
            raise AutoMLException("Golden Features not created. Empty scores.")

        result = []
        for i in range(len(items)):
            if scores[i][0] is not None:
                result += [(items[i][0], items[i][1], "diff", scores[i][0])]
            if scores[i][1] is not None:
                result += [(items[i][0], items[i][1], "ratio", scores[i][1])]
            if scores[i][2] is not None:
                result += [(items[i][1], items[i][0], "ratio", scores[i][2])]

        df = pd.DataFrame(
            result, columns=["feature1", "feature2", "operation", "score"])
        df.sort_values(by="score", inplace=True)

        new_cols_cnt = np.min([50, np.max([5, int(0.05 * X.shape[1])])])

        self._new_features = json.loads(
            df.head(new_cols_cnt).to_json(orient="records"))

        for new_feature in self._new_features:
            new_col = "_".join([
                new_feature["feature1"],
                new_feature["operation"],
                new_feature["feature2"],
            ])
            self._new_columns += [new_col]
            print(f"Add Golden Feature: {new_col}")

        self.save(self._result_file)

        print(
            f"Created {len(self._new_features)} Golden Features in {np.round(time.time() - start_time,2)} seconds."
        )
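
Each selected golden feature is kept as a small record, and the new column name is built as feature1_operation_feature2. An illustrative record (feature names are placeholders):

    new_feature = {"feature1": "age", "feature2": "income", "operation": "ratio", "score": 0.123}
    new_col = "_".join([new_feature["feature1"], new_feature["operation"], new_feature["feature2"]])
    # new_col == "age_ratio_income"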
Example #14
    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML
        
        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        
        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        
        """
        try:
            if self._best_model is not None:
                print("Best model is already set, no need to run fit. Skipping ...")
                return

            start_time = time.time()
            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame"
                )

            if X_train is not None:
                X_train = X_train.copy(deep=False)

            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation
            )
            self._save_data(X_train, y_train, X_validation, y_validation)

            self._set_ml_task(y_train)
            self._set_algorithms()
            self._set_metric()
            self._estimate_training_times()

            if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
                self._check_imbalanced(y_train)

            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._seed,
            )

            # not so random step
            generated_params = tuner.get_not_so_random_params(X_train, y_train)
            self._del_data_variables(X_train, y_train)

            for params in generated_params:
                self.train_model(params)
            # hill climbing
            for params in tuner.get_hill_climbing_params(self._models):
                self.train_model(params)

            self.ensemble_step()

            min_loss = 10e12
            for m in self._models:
                if m.get_final_loss() < min_loss:
                    self._best_model = m
                    min_loss = m.get_final_loss()

            self.get_additional_metrics()
            self._fit_time = time.time() - start_time
            # self._progress_bar.close()

            with open(os.path.join(self._results_path, "best_model.txt"), "w") as fout:
                fout.write(f"{self._best_model.get_name()}")

            with open(os.path.join(self._results_path, "params.json"), "w") as fout:
                params = {
                    "ml_task": self._ml_task,
                    "optimize_metric": self._optimize_metric,
                    "saved": self._model_paths,
                }
                fout.write(json.dumps(params, indent=4))

            ldb = self.get_leaderboard()
            ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False)

            # save report
            ldb["Link"] = [f"[Results link]({m}/README.md)" for m in ldb["name"].values]
            ldb.insert(loc=0, column="Best model", value="")
            ldb.loc[
                ldb.name == self._best_model.get_name(), "Best model"
            ] = "*** the best ***"
            with open(os.path.join(self._results_path, "README.md"), "w") as fout:
                fout.write(f"# AutoML Leaderboard\n\n")
                fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
        except Exception as e:
            raise e
        finally:
            if self._X_train_path is not None:
                self._load_data_variables(X_train)
Example #15
    def __init__(self, params):
        BaseValidator.__init__(self, params)

        self.k_folds = self.params.get("k_folds", 5)
        self.shuffle = self.params.get("shuffle", True)
        self.stratify = self.params.get("stratify", False)
        self.random_seed = self.params.get("random_seed", 1906)
        self.repeats = self.params.get("repeats", 1)

        if not self.shuffle and self.repeats > 1:
            warnings.warn(
                "Disable repeats in validation because shuffle is disabled")
            self.repeats = 1

        self.skf = []

        for r in range(self.repeats):
            random_seed = self.random_seed + r if self.shuffle else None
            if self.stratify:
                if self.shuffle:
                    self.skf += [
                        StratifiedKFold(
                            n_splits=self.k_folds,
                            shuffle=self.shuffle,
                            random_state=random_seed,
                        )
                    ]
                else:
                    self.skf += [
                        StratifiedKFold(
                            n_splits=self.k_folds,
                            #                             shuffle=self.shuffle,
                            random_state=random_seed,
                        )
                    ]
            else:
                self.skf += [
                    KFold(
                        n_splits=self.k_folds,
                        shuffle=self.shuffle,
                        random_state=random_seed,
                    )
                ]

        self._results_path = self.params.get("results_path")
        self._X_path = self.params.get("X_path")
        self._y_path = self.params.get("y_path")
        self._sample_weight_path = self.params.get("sample_weight_path")

        if self._X_path is None or self._y_path is None:
            raise AutoMLException("No data path set in KFoldValidator params")

        folds_path = os.path.join(self._results_path, "folds")

        if not os.path.exists(folds_path):

            os.mkdir(folds_path)

            X = pd.read_parquet(self._X_path)
            y = pd.read_parquet(self._y_path)
            y = y["target"]

            if isinstance(y[0], bytes):
                # see https://github.com/scikit-learn/scikit-learn/issues/16980
                y = y.astype(str)

            for repeat_cnt, skf in enumerate(self.skf):
                for fold_cnt, (train_index,
                               validation_index) in enumerate(skf.split(X, y)):
                    repeat_str = f"_repeat_{repeat_cnt}" if len(
                        self.skf) > 1 else ""
                    train_index_file = os.path.join(
                        self._results_path,
                        "folds",
                        f"fold_{fold_cnt}{repeat_str}_train_indices.npy",
                    )
                    validation_index_file = os.path.join(
                        self._results_path,
                        "folds",
                        f"fold_{fold_cnt}{repeat_str}_validation_indices.npy",
                    )

                    np.save(train_index_file, train_index)
                    np.save(validation_index_file, validation_index)

            del X
            del y
            gc.collect()

        else:
            log.debug("Folds split already done, reuse it")
Example #16
    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML
        
        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        
        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        
        """
        try:

            if self._best_model is not None:
                print(
                    "Best model is already set, no need to run fit. Skipping ..."
                )
                return

            self._start_time = time.time()
            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame")

            if X_train is not None:
                X_train = X_train.copy(deep=False)

            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation)
            self._save_data(X_train, y_train, X_validation, y_validation)

            self._set_ml_task(y_train)
            self._set_algorithms()
            self._set_metric()
            self._estimate_training_times()

            if self._ml_task in [
                    BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION
            ]:
                self._check_imbalanced(y_train)

            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._explain_level,
                self._seed,
            )

            # not so random step
            generated_params = tuner.get_not_so_random_params(X_train, y_train)
            self._del_data_variables(X_train, y_train)

            # Shuffle generated params
            # do not shuffle Baseline, Linear and Decision Trees
            dont_shuffle = []
            to_shuffle = []
            for p in generated_params:
                if p["learner"]["model_type"] in [
                        "Baseline",
                        "Linear",
                        "Decision Tree",
                ]:
                    dont_shuffle += [p]
                else:
                    to_shuffle += [p]

            np.random.shuffle(to_shuffle)
            generated_params = dont_shuffle + to_shuffle

            for params in generated_params:
                self.train_model(params)
            # hill climbing
            for params in tuner.get_hill_climbing_params(self._models):
                self.train_model(params)

            self.ensemble_step()

            self._fit_time = time.time() - self._start_time

        except Exception as e:
            raise e
        finally:
            if self._X_train_path is not None:
                self._load_data_variables(X_train)