Code example #1
 def test_get_type_pandas(self):
     d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
     df = pd.DataFrame(data=d)
     col1_type = PreprocessingUtils.get_type(df["col1"])
     self.assertNotEqual(col1_type, PreprocessingUtils.CATEGORICAL)
     col2_type = PreprocessingUtils.get_type(df["col2"])
     self.assertEqual(col2_type, PreprocessingUtils.CATEGORICAL)
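
These tests only pin down the contract of PreprocessingUtils.get_type: numeric columns must not be reported as CATEGORICAL, string columns must be. As a rough, hypothetical sketch of that contract (not the library's actual implementation), the check can be thought of as a pandas dtype inspection:

    import pandas as pd

    def get_type_sketch(values):
        # Hypothetical stand-in for PreprocessingUtils.get_type:
        # numeric dtypes are not categorical, object/string dtypes are.
        s = pd.Series(values)
        if pd.api.types.is_numeric_dtype(s):
            return "numeric"  # placeholder label, not the library's constant
        return "categorical"

    assert get_type_sketch([1, 2, 3]) == "numeric"
    assert get_type_sketch(["a", "b", "c"]) == "categorical"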
Code example #2
 def _get_fill_value(self, x):
     # categorical type
     if PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL:
         if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
             # add new categorical value
             return PreprocessingMissingValues.MISSING_VALUE
         return PreprocessingUtils.get_most_frequent(x)
     # numerical type
     if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
         return PreprocessingUtils.get_min(x) - 1.0
     if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MEAN:
         return PreprocessingUtils.get_mean(x)
     return PreprocessingUtils.get_median(x)
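
To make the numerical branches concrete, here is a small standalone check (plain pandas, not the PreprocessingMissingValues class itself) of the value each fill strategy would produce for a numeric column with missing entries:

    import numpy as np
    import pandas as pd

    x = pd.Series([2.0, np.nan, 4.0, 4.0, 10.0])

    # FILL_NA_MIN for a numeric column: one below the observed minimum
    assert x.min() - 1.0 == 1.0
    # FILL_NA_MEAN: mean of the non-missing values
    assert x.mean() == 5.0
    # default (median) fill: median of the non-missing values
    assert x.median() == 4.0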
Code example #3
    def compute(X, y, machinelearning_task):

        columns_info = {}
        for col in X.columns:
            columns_info[col] = []
            # mark columns that contain only missing values
            empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
            if empty_column:
                columns_info[col] += ["empty_column"]
                continue
            # mark columns with a single constant value
            constant_column = len(np.unique(X.loc[~pd.isnull(X[col]),
                                                  col])) == 1
            if constant_column:
                columns_info[col] += ["constant_column"]
                continue
            # mark columns that contain missing values
            if PreprocessingUtils.is_na(X[col]):
                columns_info[col] += ["missing_values"]
            # choose transforms based on the detected column type
            if PreprocessingUtils.is_categorical(X[col]):
                columns_info[col] += ["categorical"]
                columns_info[col] += [EncodingSelector.get(X, y, col)]
            elif PreprocessingUtils.is_datetime(X[col]):
                columns_info[col] += ["datetime_transform"]
            elif PreprocessingUtils.is_text(X[col]):
                columns_info[col] = ["text_transform"
                                     ]  # override other transforms
            else:
                # numeric type, check if scale needed
                if PreprocessingUtils.is_scale_needed(X[col]):
                    columns_info[col] += ["scale"]

        target_info = []
        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_info += ["convert_0_1"]

        if machinelearning_task == REGRESSION:
            if PreprocessingUtils.is_log_scale_needed(y):
                target_info += ["scale_log"]
            elif PreprocessingUtils.is_scale_needed(y):
                target_info += ["scale"]

        num_class = None
        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            num_class = PreprocessingUtils.num_class(y)

        return {
            "columns_info": columns_info,
            "target_info": target_info,
            "num_class": num_class,
        }
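
For a small frame with one numeric and one string column on a binary classification task, the returned dictionary would look roughly like the sketch below. The labels are the ones assigned in the code above; the column names are made up, and the encoding entry (normally produced by EncodingSelector.get) is shown only as a placeholder:

    # Illustrative shape only -- actual entries depend on the data.
    example_output = {
        "columns_info": {
            "age": ["scale"],  # numeric column that needs scaling
            "city": ["categorical", "<encoding chosen by EncodingSelector>"],
        },
        "target_info": ["convert_0_1"],  # binary target that is not already 0/1
        "num_class": None,               # set only for multiclass classification
    }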
Code example #4
    def fit(self, X, y, X_validation=None, y_validation=None, log_to_file=None):
        if self.cat_features is None:
            self.cat_features = []
            for i in range(X.shape[1]):
                if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                    self.cat_features += [i]

        eval_set = None 
        if X_validation is not None and y_validation is not None:
            eval_set = (X_validation, y_validation)

        self.model.fit(
            X,
            y,
            cat_features=self.cat_features,
            init_model=None if self.model.tree_count_ is None else self.model,
            eval_set=eval_set,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=False,
        )
        if log_to_file is not None:
            
            metric_name = list(self.model.evals_result_["learn"].keys())[0]
            result = pd.DataFrame(
                {
                    "iteration": range(len(self.model.evals_result_["learn"][metric_name])),
                    "train": self.model.evals_result_["learn"][metric_name],
                    "validation": self.model.evals_result_["validation"][metric_name],
                }
            )
            result.to_csv(log_to_file, index=False, header=False)
Code example #5
 def _fit_na_fill(self, X):
     for column in self._columns:
         if np.sum(pd.isnull(X[column]) == True) == 0:
             continue
         self._na_fill_params[column] = self._get_fill_value(X[column])
         if PreprocessingUtils.get_type(X[column]) == PreprocessingUtils.DATETIME:
             self._datetime_columns += [column]
Code example #6
    def test_get_stats(self):
        tmp = np.array([1, np.nan, 2, 3, np.nan, np.nan])
        self.assertEqual(1, PreprocessingUtils.get_min(tmp))
        self.assertEqual(2, PreprocessingUtils.get_mean(tmp))
        self.assertEqual(2, PreprocessingUtils.get_median(tmp))
        d = {"col1": [1, 2, 1, 3, 1, np.nan], "col2": ["a", np.nan, "b", "a", "c", "a"]}
        df = pd.DataFrame(data=d)
        self.assertEqual(1, PreprocessingUtils.get_min(df["col1"]))
        self.assertEqual(8.0 / 5.0, PreprocessingUtils.get_mean(df["col1"]))
        self.assertEqual(1, PreprocessingUtils.get_median(df["col1"]))

        self.assertEqual(1, PreprocessingUtils.get_most_frequent(df["col1"]))
        self.assertEqual("a", PreprocessingUtils.get_most_frequent(df["col2"]))
Code example #7
    def fit(self, X, y):
        if self.cat_features is None:
            self.cat_features = []
            for i in range(X.shape[1]):
                if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                    self.cat_features += [i]

        self.model.fit(
            X,
            y,
            cat_features=self.cat_features,
            init_model=None if self.model.tree_count_ is None else self.model,
        )
Code example #8
    def _fit_categorical_convert(self, X):
        for column in self._columns:

            if PreprocessingUtils.get_type(
                    X[column]) != PreprocessingUtils.CATEGORICAL:
                # no need to convert, already a number
                continue
            # limit categories - it is needed when doing one hot encoding
            # this code is also used in predict.py file
            # and transform_utils.py
            # TODO it needs refactoring !!!
            too_much_categories = len(np.unique(list(X[column].values))) > 200
            lbl = None
            if (self._convert_method
                    == PreprocessingCategorical.CONVERT_ONE_HOT
                    and not too_much_categories):
                lbl = LabelBinarizer()
                lbl.fit(X, column)
            else:
                lbl = LabelEncoder()
                lbl.fit(X[column])

            if lbl is not None:
                self._convert_params[column] = lbl.to_json()
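
The 200-category cutoff above is what decides between one-hot and integer encoding (the LabelBinarizer and LabelEncoder used here appear to be the project's own wrappers rather than scikit-learn's, given the to_json calls and the fit(X, column) signature). The cardinality check itself is plain pandas/NumPy and can be exercised in isolation:

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({"color": ["red", "green", "red", "blue"]})
    column = "color"

    # same check as above: one-hot is only considered for modest cardinality
    too_much_categories = len(np.unique(list(X[column].values))) > 200
    print(too_much_categories)  # False -> one-hot encoding stays an option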
Code example #9
    def fit(
        self,
        X,
        y,
        sample_weight=None,
        X_validation=None,
        y_validation=None,
        sample_weight_validation=None,
        log_to_file=None,
        max_time=None,
    ):
        if self.is_fitted():
            print("CatBoost model already fitted. Skip fit().")
            return

        if self.cat_features is None:
            self.cat_features = []
            for i in range(X.shape[1]):
                if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                    self.cat_features += [i]

        eval_set = None
        if X_validation is not None and y_validation is not None:
            eval_set = Pool(
                data=X_validation,
                label=y_validation,
                cat_features=self.cat_features,
                weight=sample_weight_validation,
            )

        if self.params.get("num_boost_round") is None:
            model_init, new_iterations = self._assess_iterations(
                X, y, sample_weight, eval_set, max_time)
            self.model.set_params(iterations=new_iterations)
        else:
            model_init = None
            self.model.set_params(
                iterations=self.params.get("num_boost_round"))
            self.early_stopping_rounds = self.params.get(
                "early_stopping_rounds", 50)

        self.model.fit(
            X,
            y,
            sample_weight=sample_weight,
            cat_features=self.cat_features,
            init_model=model_init,
            eval_set=eval_set,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=False,
        )

        if self.model.best_iteration_ is not None:
            if model_init is not None:
                self.best_ntree_limit = (self.model.best_iteration_ +
                                         model_init.tree_count_ + 1)
            else:
                self.best_ntree_limit = self.model.best_iteration_ + 1

        else:
            # just take all the trees
            # the warm-up trees are already included
            # dont need to add +1
            self.best_ntree_limit = self.model.tree_count_

        if log_to_file is not None:
            train_scores = self.model.evals_result_["learn"].get(
                self.log_metric_name)
            validation_scores = self.model.evals_result_["validation"].get(
                self.log_metric_name)
            if model_init is not None:
                if train_scores is not None:
                    train_scores = (model_init.evals_result_["learn"].get(
                        self.log_metric_name) + train_scores)
                if validation_scores is not None:
                    validation_scores = (
                        model_init.evals_result_["validation"].get(
                            self.log_metric_name) + validation_scores)
            iteration = None
            if train_scores is not None:
                iteration = range(len(train_scores))
            elif validation_scores is not None:
                iteration = range(len(validation_scores))

            result = pd.DataFrame({
                "iteration": iteration,
                "train": train_scores,
                "validation": validation_scores,
            })
            result.to_csv(log_to_file, index=False, header=False)
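
The best_ntree_limit bookkeeping above compensates for warm-started training: the code treats best_iteration_ as relative to the continued run, so the trees already present in model_init have to be added back, plus one because the iteration index is zero-based. A small worked example with hypothetical numbers:

    # Hypothetical numbers to illustrate the offset arithmetic above.
    warmup_trees = 30      # model_init.tree_count_
    best_iteration = 12    # self.model.best_iteration_ (zero-based)

    # continued training: offset by the warm-up trees, +1 for the zero-based index
    best_ntree_limit = best_iteration + warmup_trees + 1  # -> 43

    # training from scratch (model_init is None): just +1
    best_ntree_limit = best_iteration + 1                 # -> 13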
Code example #10
    def extensive_eda(X, y, save_path):

        # Check for empty dataframes in params
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X should be a dataframe")
        if X.shape[0] != len(y):
            raise ValueError("X and y should have the same number of samples")

        if X.shape[1] > MAXCOL:
            X = X.iloc[:, :MAXCOL]
            warnings.warn(
                f"AutoML EDA column limit exceeded! running for first {MAXCOL} columns"
            )

        if save_path:
            if not os.path.exists(save_path):
                os.mkdir(save_path)
        else:
            raise ValueError("Please provide a valid path to save the Extensive EDA")

        plt.style.use("ggplot")
        try:

            if PreprocessingUtils.get_type(y) in ("categorical", "discrete"):

                for col in X.columns:

                    if PreprocessingUtils.get_type(X[col]) == "continous":

                        plt.figure(figsize=(5, 5))
                        for i in np.unique(y):
                            sns.kdeplot(
                                x=X.iloc[np.where(y == i)[0]][col],
                                label=f"class {i}",
                                shade=True,
                            )
                        plt.legend()
                        plt.gca().set_title(
                            f"Distribution of {col} for each class",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

                    elif PreprocessingUtils.get_type(X[col]) in (
                        "categorical",
                        "discrete",
                    ):

                        if X[col].nunique() > 7:
                            warnings.warn("Considering 7 the most frequent values")

                        values = X[col].value_counts().index[:7]
                        plt.figure(figsize=(5, 5))
                        sns.countplot(
                            x=X[X[col].isin(values)][col], hue=y[X[col].isin(values)]
                        )
                        plt.gca().set_title(
                            f"Count plot of each {col}",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

            elif PreprocessingUtils.get_type(y) == "continous":
                for col in X.columns:

                    if PreprocessingUtils.get_type(X[col]) == "continous":

                        plt.figure(figsize=(5, 5))
                        plt.scatter(X[col].values, y)
                        plt.gca().set_xlabel(f"{col}")
                        plt.gca().set_ylabel("target")
                        plt.gca().set_title(
                            f"Scatter plot of {col} vs target",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )

                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

                    elif PreprocessingUtils.get_type(X[col]) in (
                        "categorical",
                        "discrete",
                    ):
                        if X[col].nunique() > 7:
                            warnings.warn("Considering 7 the most frequent values")

                        plt.figure(figsize=(5, 5))
                        for i in X[col].value_counts().index[:7]:
                            sns.kdeplot(
                                x=y[X[X[col] == i].index],
                                shade=True,
                                label=f"{col}_{i}",
                            )
                        plt.gca().set_title(
                            f"Distribution of target for each {col}",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.legend()

                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

                    elif PreprocessingUtils.get_type(X[col]) == "datetime":

                        plt.figure(figsize=(5, 5))
                        plt.plot(X[col], y)
                        plt.gca().set_xticklabels(X[col].dt.date, rotation="45")
                        plt.gca().set_title(
                            f"Distribution of target over time",
                            fontsize=11,
                            weight="bold",
                            alpha=0.75,
                        )
                        plt.savefig(EDA.plot_path(save_path, col + "_target"))

            cols = [
                col
                for col in X.columns
                if PreprocessingUtils.get_type(X[col]) == "continous"
            ][:COLS]

            if len(cols) > 0:

                plt.figure(figsize=(10, 10))
                sns.heatmap(X[cols].corr())
                plt.gca().set_title("Heatmap", fontsize=11, weight="bold", alpha=0.75)

            plt.savefig(os.path.join(save_path, "heatmap"))

            with open(os.path.join(save_path, "Extensive_EDA.md"), "w") as fout:

                for col in X.columns:

                    fout.write(f"## Bivariate analysis of {col} feature with target\n")
                    fout.write("\n![]({})\n".format(EDA.plot_fname(col + "_target")))
                    fout.write("\n")
                    fout.write(
                        "------------------------------------------------------\n"
                    )

                if len(cols) > 0:
                    fout.write("## Heatmap\n")
                    fout.write("![](heatmap.png)\n")
                    fout.write("\n")
                    fout.write(
                        "------------------------------------------------------\n"
                    )

        except Exception as e:
            raise AutoMLException(e)
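
A minimal usage sketch, assuming extensive_eda is exposed as a static method of the EDA class (as the EDA.plot_path calls above suggest); the data here is synthetic:

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({
        "f1": np.random.randn(200),
        "color": np.random.choice(["red", "green", "blue"], 200),
    })
    y = pd.Series(np.random.randint(0, 2, 200))

    # writes the per-column plots plus Extensive_EDA.md into ./eda_report
    EDA.extensive_eda(X, y, save_path="eda_report")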
Code example #11
 def test_get_type_numpy_number(self):
     tmp = np.array([1, 2, 3])
     tmp_type = PreprocessingUtils.get_type(tmp)
     self.assertNotEqual(tmp_type, PreprocessingUtils.CATEGORICAL)
Code example #12
    def get(required_preprocessing, data, machinelearning_task):

        X = data["train"]["X"]
        y = data["train"]["y"]

        columns_preprocessing = {}
        for col in X.columns:
            preprocessing_to_apply = []

            # remove empty columns and columns with only one variable
            empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
            constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
            if empty_column or constant_column:
                preprocessing_to_apply += ["remove_column"]
                columns_preprocessing[col] = preprocessing_to_apply
                continue

            # always check for missing values
            if (
                "missing_values_inputation" in required_preprocessing
                and PreprocessingUtils.is_na(X[col])
            ):
                preprocessing_to_apply += [PreprocessingMissingValues.FILL_NA_MEDIAN]
            # convert to categorical only for categorical types
            convert_to_integer_will_be_applied = False
            if (
                "convert_categorical" in required_preprocessing
                and PreprocessingUtils.is_categorical(X[col])
            ):
                preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
                convert_to_integer_will_be_applied = True

            if "scale" in required_preprocessing:
                if convert_to_integer_will_be_applied:
                    preprocessing_to_apply += [PreprocessingScale.SCALE_NORMAL]
                # elif PreprocessingUtils.is_log_scale_needed(X[col]):
                #    preprocessing_to_apply += [PreprocessingScale.SCALE_LOG_AND_NORMAL]
                elif PreprocessingUtils.is_scale_needed(X[col]):
                    preprocessing_to_apply += [PreprocessingScale.SCALE_NORMAL]

            # remember which preprocessing we need to apply
            if preprocessing_to_apply:
                columns_preprocessing[col] = preprocessing_to_apply

        target_preprocessing = []
        # always remove missing values from target,
        # missing values might be in train and in validation datasets
        target_preprocessing += [PreprocessingMissingValues.NA_EXCLUDE]

        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            if PreprocessingUtils.is_categorical(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        if machinelearning_task == REGRESSION:
            if PreprocessingUtils.is_log_scale_needed(y):
                target_preprocessing += [PreprocessingScale.SCALE_LOG_AND_NORMAL]
            elif PreprocessingUtils.is_scale_needed(y):
                target_preprocessing += [PreprocessingScale.SCALE_NORMAL]

        return {
            "columns_preprocessing": columns_preprocessing,
            "target_preprocessing": target_preprocessing,
        }
Code example #13
    def fit(self,
            X,
            y,
            sample_weight=None,
            X_validation=None,
            y_validation=None,
            sample_weight_validation=None,
            log_to_file=None,
            max_time=None):
        if self.model.tree_count_ is not None:
            print("CatBoost model already fitted. Skip fit().")
            return

        if self.cat_features is None:
            self.cat_features = []
            for i in range(X.shape[1]):
                if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                    self.cat_features += [i]

        eval_set = None
        if X_validation is not None and y_validation is not None:
            eval_set = Pool(
                data=X_validation,
                label=y_validation,
                cat_features=self.cat_features,
                weight=sample_weight_validation,
            )

        # disable for now ...
        model_init, new_iterations = self._assess_iterations(
            X, y, eval_set, max_time)
        self.model.set_params(iterations=new_iterations)

        self.model.fit(X,
                       y,
                       sample_weight=sample_weight,
                       cat_features=self.cat_features,
                       init_model=model_init,
                       eval_set=eval_set,
                       early_stopping_rounds=self.early_stopping_rounds,
                       verbose_eval=False)
        if self.model.best_iteration_ is not None:
            self.best_ntree_limit = self.model.best_iteration_ + self.warmup_iterations + 1
        else:
            # just take all the trees
            # the warm-up trees are already included
            # dont need to add +1
            self.best_ntree_limit = self.model.tree_count_

        if log_to_file is not None:

            metric_name = list(self.model.evals_result_["learn"].keys())[0]
            train_scores = self.model.evals_result_["learn"][metric_name]
            validation_scores = self.model.evals_result_["validation"][
                metric_name]
            if model_init is not None:
                train_scores = model_init.evals_result_["learn"][
                    metric_name] + train_scores
                validation_scores = model_init.evals_result_["validation"][
                    metric_name] + validation_scores

            result = pd.DataFrame({
                "iteration": range(len(train_scores)),
                "train": train_scores,
                "validation": validation_scores,
            })
            result.to_csv(log_to_file, index=False, header=False)
Code example #14
    def get(required_preprocessing, data, machinelearning_task):

        X = data["train"]["X"]
        y = data["train"]["y"]

        columns_preprocessing = {}
        for col in X.columns:
            preprocessing_to_apply = []

            # remove empty columns and columns with only one variable
            empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
            constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
            if empty_column or constant_column:
                preprocessing_to_apply += ["remove_column"]
                columns_preprocessing[col] = preprocessing_to_apply
                continue

            # always check for missing values
            if (
                "missing_values_inputation" in required_preprocessing
                and PreprocessingUtils.is_na(X[col])
            ):
                preprocessing_to_apply += [PreprocessingMissingValues.FILL_NA_MEDIAN]
            # convert to categorical only for categorical types
            convert_to_integer_will_be_applied = False
            if (
                "convert_categorical" in required_preprocessing
                and PreprocessingUtils.is_categorical(X[col])
            ):
                preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
                convert_to_integer_will_be_applied = True

            if "scale" in required_preprocessing:
                if convert_to_integer_will_be_applied:
                    preprocessing_to_apply += [Scale.SCALE_NORMAL]
                # elif PreprocessingUtils.is_log_scale_needed(X[col]):
                #    preprocessing_to_apply += [Scale.SCALE_LOG_AND_NORMAL]
                elif PreprocessingUtils.is_scale_needed(X[col]):
                    preprocessing_to_apply += [Scale.SCALE_NORMAL]

            # remember which preprocessing we need to apply
            if preprocessing_to_apply:
                columns_preprocessing[col] = preprocessing_to_apply

        target_preprocessing = []
        # always remove missing values from target,
        # target with missing values might be in the train and in the validation datasets
        target_preprocessing += [PreprocessingMissingValues.NA_EXCLUDE]

        if "target_as_integer" in required_preprocessing:
            if machinelearning_task == BINARY_CLASSIFICATION:
                if not PreprocessingUtils.is_0_1(y):
                    target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

            if machinelearning_task == MULTICLASS_CLASSIFICATION:
                # if PreprocessingUtils.is_categorical(y):
                # always convert to integer, there can be many situations that can break
                # for example, classes starting from 1, ...
                # or classes not for every number, for example 0,2,3,4
                # just always convert
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        elif "target_as_one_hot" in required_preprocessing:
            target_preprocessing += [PreprocessingCategorical.CONVERT_ONE_HOT]

        if (
            machinelearning_task == REGRESSION
            and "target_scale" in required_preprocessing
        ):
            if PreprocessingUtils.is_log_scale_needed(y):
                target_preprocessing += [Scale.SCALE_LOG_AND_NORMAL]
            elif PreprocessingUtils.is_scale_needed(y):
                target_preprocessing += [Scale.SCALE_NORMAL]

        """    
        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            if PreprocessingUtils.is_categorical(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        """
        return {
            "columns_preprocessing": columns_preprocessing,
            "target_preprocessing": target_preprocessing,
            "ml_task": machinelearning_task,
        }
Code example #15
    def optimize(
        self,
        algorithm,
        data_type,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        learner_params,
    ):
        # only tune models with original data type
        if data_type != "original":
            return learner_params

        key = f"{data_type}_{algorithm}"
        if key in self.tuning:
            return self.update_learner_params(learner_params, self.tuning[key])

        if self.verbose:
            print(
                f"Optuna optimizes {algorithm} with time budget {self.time_budget} seconds "
                f"eval_metric {self.eval_metric.name} ({self.direction})")

        self.cat_features_indices = []
        for i in range(X_train.shape[1]):
            if PreprocessingUtils.is_categorical(X_train.iloc[:, i]):
                self.cat_features_indices += [i]

        study = optuna.create_study(
            direction=self.direction,
            sampler=optuna.samplers.TPESampler(seed=self.random_state),
            pruner=optuna.pruners.MedianPruner(
                n_warmup_steps=self.n_warmup_steps),
        )
        objective = None
        if algorithm == "LightGBM":
            objective = LightgbmObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.cat_features_indices,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "Xgboost":
            objective = XgboostObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "CatBoost":
            objective = CatBoostObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.cat_features_indices,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "Random Forest":
            objective = RandomForestObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "Extra Trees":
            objective = ExtraTreesObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "Nearest Neighbors":
            objective = KNNObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "Neural Network":
            objective = NeuralNetworkObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.n_jobs,
                self.random_state,
            )

        study.optimize(objective, n_trials=5000, timeout=self.time_budget)

        self.plot_study(algorithm, data_type, study)

        joblib.dump(study, os.path.join(self.study_dir, key + ".joblib"))

        best = study.best_params

        if algorithm == "LightGBM":
            best["metric"] = objective.eval_metric_name
            best["custom_eval_metric_name"] = objective.custom_eval_metric_name
            best["num_boost_round"] = objective.rounds
            best["early_stopping_rounds"] = objective.early_stopping_rounds
            # best["learning_rate"] = objective.learning_rate
            best["cat_feature"] = self.cat_features_indices
            best["feature_pre_filter"] = False
            best["seed"] = objective.seed
        elif algorithm == "CatBoost":
            best["eval_metric"] = objective.eval_metric_name
            best["num_boost_round"] = objective.rounds
            best["early_stopping_rounds"] = objective.early_stopping_rounds
            # best["bootstrap_type"] = "Bernoulli"
            # best["learning_rate"] = objective.learning_rate
            best["seed"] = objective.seed
        elif algorithm == "Xgboost":
            best["objective"] = objective.objective
            best["eval_metric"] = objective.eval_metric_name
            # best["eta"] = objective.learning_rate
            best["max_rounds"] = objective.rounds
            best["early_stopping_rounds"] = objective.early_stopping_rounds
            best["seed"] = objective.seed
        elif algorithm == "Extra Trees":
            # Extra Trees are not using early stopping
            best["max_steps"] = objective.max_steps  # each step has 100 trees
            best["seed"] = objective.seed
            best["eval_metric_name"] = self.eval_metric.name
        elif algorithm == "Random Forest":
            # Random Forest is not using early stopping
            best["max_steps"] = objective.max_steps  # each step has 100 trees
            best["seed"] = objective.seed
            best["eval_metric_name"] = self.eval_metric.name
        elif algorithm == "Nearest Neighbors":
            best["rows_limit"] = 100000
        elif algorithm == "Neural Network":
            pass

        self.tuning[key] = best
        self.save()

        return self.update_learner_params(learner_params, best)
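
The Optuna wiring above (TPE sampler, median pruner, one objective per algorithm, then study.best_params) does not depend on the specific model. A self-contained sketch of the same pattern with a toy objective, using only the public Optuna API:

    import optuna

    def toy_objective(trial):
        # stand-in for the per-algorithm objectives dispatched above
        x = trial.suggest_float("x", -10.0, 10.0)
        return (x - 2.0) ** 2

    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    )
    study.optimize(toy_objective, n_trials=50, timeout=60)
    print(study.best_params)  # e.g. {"x": 1.98}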
Code example #16
    def compute(X_train, y_train, eda_path):

        try:
            # check if exists
            if os.path.exists(eda_path):
                # probably the EDA analysis is already done
                # skip from here
                return
            else:
                # need to create directory for EDA analysis
                os.mkdir(eda_path)

            inform = defaultdict(list)

            if isinstance(y_train, pd.Series):

                if PreprocessingUtils.get_type(y_train) == "categorical":

                    plt.figure(figsize=(5, 5))
                    sns.countplot(y_train, color=BLUE)
                    plt.title("Target class distribution")
                    plt.tight_layout(pad=2.0)
                    plot_path = os.path.join(eda_path, "target.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                else:

                    plt.figure(figsize=(5, 5))
                    sns.distplot(y_train, color=BLUE)
                    plt.title("Target class distribution")
                    plt.tight_layout(pad=2.0)
                    plot_path = os.path.join(eda_path, "target.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                inform["missing"].append(
                    pd.isnull(y_train).sum() / y_train.shape[0])
                inform["unique"].append(y_train.nunique())
                inform["feature_type"].append(
                    PreprocessingUtils.get_type(y_train))
                inform["plot"].append("![](target.png)")
                inform["feature"].append("target")
                inform["desc"].append(y_train.describe().to_dict())

            for col in X_train.columns:

                inform["feature_type"].append(
                    PreprocessingUtils.get_type(X_train[col]))

                if PreprocessingUtils.get_type(X_train[col]) in (
                        "categorical",
                        "discrete",
                ):

                    plt.figure(figsize=(5, 5))
                    chart = sns.countplot(
                        X_train[col],
                        order=X_train[col].value_counts().iloc[:10].index,
                        color=BLUE,
                    )
                    chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
                    plt.title(f"{col} class distribution")
                    plt.tight_layout(pad=2.0)
                    plot_path = os.path.join(eda_path, f"{col}.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                elif PreprocessingUtils.get_type(X_train[col]) == "continous":

                    plt.figure(figsize=(5, 5))
                    sns.distplot(X_train[col], color=BLUE)
                    plt.title(f"{col} value distribution")
                    plt.tight_layout(pad=2.0)
                    plot_path = os.path.join(eda_path, f"{col}.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                elif PreprocessingUtils.get_type(X_train[col]) in ("text"):

                    plt.figure(figsize=(10, 10), dpi=70)
                    word_string = " ".join(X_train[col].str.lower())
                    wordcloud = WordCloud(
                        width=500,
                        height=500,
                        stopwords=STOPWORDS,
                        background_color="white",
                        max_words=400,
                        max_font_size=None,
                    ).generate(word_string)

                    plt.imshow(wordcloud,
                               aspect="auto",
                               interpolation="nearest")
                    plt.axis("off")
                    plot_path = os.path.join(eda_path, f"{col}.png")
                    plt.savefig(plot_path)

                elif PreprocessingUtils.get_type(X_train[col]) in ("datetime"):

                    plt.figure(figsize=(5, 5))
                    pd.to_datetime(X_train[col]).plot(grid="True", color=BLUE)
                    plt.tight_layout(pad=2.0)
                    plot_path = os.path.join(eda_path, f"{col}.png")
                    plt.savefig(plot_path)
                    plt.close("all")

                inform["missing"].append(
                    pd.isnull(X_train[col]).sum() * 100 / X_train.shape[0])

                inform["unique"].append(int(X_train[col].nunique()))
                inform["plot"].append(f"![]({col}.png)")
                inform["feature"].append(str(col))
                inform["desc"].append(X_train[col].describe().to_dict())

            df = pd.DataFrame(inform)

            with open(os.path.join(eda_path, "README.md"), "w") as fout:

                for i, row in df.iterrows():

                    fout.write(f"## Feature : {row['feature']}\n")
                    fout.write(f"- **Feature type** : {row['feature_type']}\n")
                    fout.write(f"- **Missing** : {row['missing']}%\n")
                    fout.write(f"- **Unique** : {row['unique']}\n")

                    for key in row["desc"].keys():

                        if key in ("25%", "50%", "75%"):

                            fout.write(
                                f"- **{key.capitalize()}th Percentile** : {row['desc'][key]}\n"
                            )
                        else:

                            fout.write(
                                f"- **{key.capitalize()}** :{row['desc'][key]}\n"
                            )

                    fout.write(f"- {row['plot']}\n")

        except Exception as e:
            logger.error(f"There was an issue when running EDA. {str(e)}")
Code example #17
 def test_get_type_numpy_categorical(self):
     tmp = np.array(["a", "b", "c"])
     tmp_type = PreprocessingUtils.get_type(tmp)
     self.assertEqual(tmp_type, PreprocessingUtils.CATEGORICAL)