Example #1
    def as_data_frame(self):
        """Convert to a python 'data frame'."""
        if can_use_pandas():
            import pandas
            pandas.options.display.max_colwidth = 70
            return pandas.DataFrame(self._cell_values, columns=self._col_header)
        return self
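The snippets on this page gate the optional pandas dependency behind can_use_pandas(). For reference, a minimal sketch of such an availability check (an assumption about the helper's behavior, not the actual h2o implementation):

import importlib.util

def can_use_pandas():
    # Sketch only: report whether pandas can be imported in this environment.
    # The real h2o helper may cache the result or enforce a minimum version.
    return importlib.util.find_spec("pandas") is not None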
Example #2
    def show(self, header=True):
        """Print the contents of this table."""
        # if h2o.can_use_pandas():
        #  import pandas
        #  pandas.options.display.max_rows = 20
        #  print pandas.DataFrame(self._cell_values,columns=self._col_header)
        #  return
        if header and self._table_header:
            print(self._table_header + ":", end=' ')
            if self._table_description: print(self._table_description)
        print()
        table = copy.deepcopy(self._cell_values)
        nr = 0
        if _is_list_of_lists(table):
            nr = len(table)  # only set if we truly have multiple rows... not just one long row :)
        if nr > 20:  # create a truncated view of the table, first/last 5 rows
            trunc_table = []
            trunc_table += [v for v in table[:5]]
            trunc_table.append(["---"] * len(table[0]))
            trunc_table += [v for v in table[(nr - 5):]]
            table = trunc_table

        H2ODisplay(table, self._col_header, numalign="left", stralign="left")
        if nr > 20 and can_use_pandas():
            print('\nSee the whole table with table.as_data_frame()')
Example #3
    def as_data_frame(self):
        """Convert to a python 'data frame'."""
        if can_use_pandas():
            import pandas
            pandas.options.display.max_colwidth = 70
            return pandas.DataFrame(self._cell_values, columns=self._col_header)
        return self
Example #4
    def varsplits(self, use_pandas=False):
        """
        Retrieve per-variable split information for a given Isolation Forest model. Output will include:
        - count - The number of times a variable was used to make a split.
        - aggregated_split_ratios - The split ratio is defined as "abs(#left_observations - #right_observations) / #before_split".
                                    Even splits (#left_observations approx the same as #right_observations) contribute
                                    less to the total aggregated split ratio value for the given feature;
                                    highly imbalanced splits (e.g. #left_observations >> #right_observations) contribute more.
        - aggregated_split_depths - The sum of all depths of a variable used to make a split. (If a variable is used
                                    on level N of a tree, then it contributes with N to the total aggregate.)

        :param use_pandas: If True, then the variable splits will be returned as a Pandas data frame.

        :returns: A list or Pandas DataFrame.
        """
        model = self._model_json["output"]
        if "variable_splits" in list(model.keys()) and model["variable_splits"]:
            vals = model["variable_splits"].cell_values
            header = model["variable_splits"].col_header

            if use_pandas and can_use_pandas():
                import pandas
                return pandas.DataFrame(vals, columns=header)
            else:
                return vals
        else:
            print(
                "Warning: This model doesn't provide variable split information"
            )
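A hypothetical usage sketch for the method above; the trained Isolation Forest model is assumed, not taken from the source:

# Hypothetical usage: `model` stands for a trained H2O Isolation Forest model.
splits_df = model.varsplits(use_pandas=True)   # pandas.DataFrame when pandas is installed
splits_raw = model.varsplits()                 # plain list of table rows otherwise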
Example #5
    def scoring_history_plot(self,
                             timestep,
                             metric,
                             server=False,
                             save_plot_path=None):
        plt = get_matplotlib_pyplot(server)
        if plt is None:
            return decorate_plot_result(figure=RAISE_ON_FIGURE_ACCESS)

        scoring_history = self._get_scoring_history_to_plot()
        timestep = self._validate_timestep(timestep)
        training_metric = "training_{}".format(metric)
        validation_metric = "validation_{}".format(metric)
        if timestep == "duration":
            dur_colname = "duration_{}".format(
                scoring_history["duration"][1].split()[1])
            scoring_history[dur_colname] = [
                str(x).split()[0] for x in scoring_history["duration"]
            ]
            timestep = dur_colname

        if can_use_pandas():
            valid = validation_metric in list(scoring_history)
            ylim = (scoring_history[[training_metric, validation_metric]].min().min(),
                    scoring_history[[training_metric, validation_metric]].max().max()) if valid \
                else (scoring_history[training_metric].min(), scoring_history[training_metric].max())
        else:
            valid = validation_metric in scoring_history.col_header
            ylim = (min(min(scoring_history[[training_metric, validation_metric]])),
                    max(max(scoring_history[[training_metric, validation_metric]]))) if valid \
                else (min(scoring_history[training_metric]), max(scoring_history[training_metric]))
        if ylim[0] == ylim[1]: ylim = (0, 1)

        fig = plt.figure()
        if valid:  # Training and validation scoring history
            plt.xlabel(timestep)
            plt.ylabel(metric)
            plt.title("Scoring History")
            plt.ylim(ylim)
            plt.plot(scoring_history[timestep],
                     scoring_history[training_metric],
                     label="Training")
            plt.plot(scoring_history[timestep],
                     scoring_history[validation_metric],
                     color="orange",
                     label="Validation")
            plt.legend()
        else:  # Training scoring history only
            plt.xlabel(timestep)
            plt.ylabel(training_metric)
            plt.title("Training Scoring History")
            plt.ylim(ylim)
            plt.plot(scoring_history[timestep],
                     scoring_history[training_metric])
        if save_plot_path is not None:
            plt.savefig(fname=save_plot_path)
        if not server:
            plt.show()
        return decorate_plot_result(figure=fig)
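A hypothetical call into the plotting helper above, following the signature it defines; the trained model object and the output path are assumptions:

# Hypothetical usage: `model` is assumed to expose scoring_history_plot as defined above.
result = model.scoring_history_plot(timestep="number_of_trees", metric="rmse",
                                    server=True, save_plot_path="scoring_history.png")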
Example #6
def test_external_libraries_detection():
    assert can_use_pandas(), "pandas should be detected in test environment"
    assert can_use_numpy(), "numpy should be detected in test environment"
    assert is_module_available('matplotlib'), "matplotlib should be detected in test environment"
    assert is_module_available('sklearn'), "sklearn should be detected in test environment"
    assert not is_module_available('foobar'), "please don't"
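The is_module_available check asserted here can be approximated the same way as can_use_pandas; a minimal sketch, not the actual h2o implementation:

import importlib.util

def is_module_available(name):
    # Sketch only: True when the named module is importable.
    return importlib.util.find_spec(name) is not None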
Example #7
    def show(self, header=True):
        """Print the contents of this table."""
        print()
        if header and self._table_header:
            print(self._table_header + ":", end=' ')
            if self._table_description: print(self._table_description)

        (table, nr, is_pandas) = self._as_show_table()

        H2ODisplay(table, is_pandas=is_pandas, header=self._col_header, numalign="left", stralign="left")
        if nr > 20 and can_use_pandas(): print('\nSee the whole table with table.as_data_frame()')
Example #8
    def _as_show_table(self):
        if H2ODisplay.prefer_pandas() and can_use_pandas():
            pd = self.as_data_frame()
            return pd.head(20), pd.shape[0], True
        else:
            table = copy.deepcopy(self._cell_values)
            nr = 0
            if _is_list_of_lists(table):
                nr = len(table)  # only set if we truly have multiple rows... not just one long row :)
            if nr > 20:  # create a truncated view of the table, first/last 5 rows
                trunc_table = []
                trunc_table += [v for v in table[:5]]
                trunc_table.append(["---"] * len(table[0]))
                trunc_table += [v for v in table[(nr - 5):]]
                table = trunc_table
            return table, nr, False
Example #9
    def varimp(self, use_pandas=False):
        """
        Return the importance of components associated with a PCA model.

        :param bool use_pandas: If True, then the variable importances will be returned as a pandas data frame. (Default is False.)
        """
        model = self._model_json["output"]
        if "importance" in list(model.keys()) and model["importance"]:
            vals = model["importance"].cell_values
            header = model["importance"].col_header
            if use_pandas and can_use_pandas():
                import pandas
                return pandas.DataFrame(vals, columns=header)
            else:
                return vals
        else:
            print("Warning: This model doesn't have importances of components.")
Example #10
    def varimp(self, use_pandas=False):
        """
        Return the importance of components associated with a PCA model.

        use_pandas: ``bool``  (default: ``False``).
        """
        model = self._model_json["output"]
        if "importance" in list(model.keys()) and model["importance"]:
            vals = model["importance"].cell_values
            header = model["importance"].col_header
            if use_pandas and can_use_pandas():
                import pandas
                return pandas.DataFrame(vals, columns=header)
            else:
                return vals
        else:
            print("Warning: This model doesn't have importances of components.")
Example #11
    def varimp(self, use_pandas=False):
        """
        Return the importance of components associated with a PCA model.

        use_pandas: ``bool``  (default: ``False``).
        """
        model = self._model_json["output"]
        if "importance" in list(model.keys()) and model["importance"]:
            vals = model["importance"].cell_values
            header = model["importance"].col_header
            if use_pandas and can_use_pandas():
                import pandas
                return pandas.DataFrame(vals, columns=header)
            else:
                return vals
        else:
            print(
                "Warning: This model doesn't have importances of components.")
Example #12
    def varimp(self, use_pandas=False):
        """
        Pretty print the variable importances, or return them in a list.

        :param use_pandas: If True, then the variable importances will be returned as a pandas data frame.

        :returns: A list or Pandas DataFrame.
        """
        model = self._model_json["output"]
        if "variable_importances" in list(model.keys()) and model["variable_importances"]:
            vals = model["variable_importances"].cell_values
            header = model["variable_importances"].col_header
            if use_pandas and can_use_pandas():
                import pandas
                return pandas.DataFrame(vals, columns=header)
            else:
                return vals
        else:
            print("Warning: This model doesn't have variable importances")
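A hypothetical usage sketch for varimp with and without the pandas path; the trained model is assumed:

# Hypothetical usage: `model` stands for any trained H2O model exposing variable importances.
vi_df = model.varimp(use_pandas=True)   # pandas.DataFrame when pandas is available
vi_rows = model.varimp()                # plain list of table rows otherwise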
Example #13
    def varimp(self, use_pandas=False):
        """
        Pretty print the variable importances, or return them in a list.

        :param use_pandas: If True, then the variable importances will be returned as a pandas data frame.

        :returns: A list or Pandas DataFrame.
        """
        model = self._model_json["output"]
        if "variable_importances" in list(model.keys()) and model["variable_importances"]:
            vals = model["variable_importances"].cell_values
            header = model["variable_importances"].col_header
            if use_pandas and can_use_pandas():
                import pandas
                return pandas.DataFrame(vals, columns=header)
            else:
                return vals
        else:
            print("Warning: This model doesn't have variable importances")
Example #14
    def varsplits(self, use_pandas=False):
        """
        Retrieve per-variable split information for a given Isolation Forest model.

        :param use_pandas: If True, then the variable splits will be returned as a pandas data frame.

        :returns: A list or Pandas DataFrame.
        """
        model = self._model_json["output"]
        if "variable_splits" in list(model.keys()) and model["variable_splits"]:
            vals = model["variable_splits"].cell_values
            header = model["variable_splits"].col_header

            if use_pandas and can_use_pandas():
                import pandas
                return pandas.DataFrame(vals, columns=header)
            else:
                return vals
        else:
            print(
                "Warning: This model doesn't provide variable split information"
            )
Example #15
    def show(self, header=True):
        # if h2o.can_use_pandas():
        #  import pandas
        #  pandas.options.display.max_rows = 20
        #  print pandas.DataFrame(self.cell_values,columns=self.col_header)
        #  return
        print()
        if header:
            print(self.table_header + ":", end=' ')
            if self.table_description: print(self.table_description)
        print()
        table = copy.deepcopy(self.cell_values)
        nr = 0
        if _is_list_of_lists(table):
            nr = len(table)  # only set if we truly have multiple rows... not just one long row :)
        if nr > 20:  # create a truncated view of the table, first/last 5 rows
            trunc_table = []
            trunc_table += [v for v in table[:5]]
            trunc_table.append(["---"] * len(table[0]))
            trunc_table += [v for v in table[(nr - 5):]]
            table = trunc_table

        H2ODisplay(table, self.col_header, numalign="left", stralign="left")
        if nr > 20 and can_use_pandas(): print('\nSee the whole table with table.as_data_frame()')
Example #16
    def as_data_frame(self):
        if can_use_pandas():
            import pandas
            pandas.options.display.max_colwidth = 70
            return pandas.DataFrame(self.cell_values, columns=self.col_header)
        return self
Example #17
    def _plot(self, timestep, metric, **kwargs):

        # check for matplotlib. exit if absent
        try:
            imp.find_module('matplotlib')
            import matplotlib
            if 'server' in kwargs and kwargs['server']: matplotlib.use('Agg', warn=False)
            import matplotlib.pyplot as plt
        except ImportError:
            print("matplotlib is required for this function!")
            return

        scoring_history = self.scoring_history()
        # Separate functionality for GLM since its output is different from other algos
        if self._model_json["algo"] == "glm":
            # GLM has only one timestep option, which is `iteration`
            timestep = "iteration"
            if metric == "AUTO":
                metric = "log_likelihood"
            elif metric not in ("log_likelihood", "objective"):
                raise ValueError("for GLM, metric must be one of: log_likelihood, objective")
            plt.xlabel(timestep)
            plt.ylabel(metric)
            plt.title("Validation Scoring History")
            plt.plot(scoring_history[timestep], scoring_history[metric])

        elif self._model_json["algo"] in ("deeplearning", "drf", "gbm"):
            # Set timestep
            if self._model_json["algo"] in ("gbm", "drf"):
                if timestep == "AUTO":
                    timestep = "number_of_trees"
                elif timestep not in ("duration", "number_of_trees"):
                    raise ValueError("timestep for gbm or drf must be one of: duration, number_of_trees")
            else:  # self._model_json["algo"] == "deeplearning":
                # Delete first row of DL scoring history since it contains NAs & NaNs
                if scoring_history["samples"][0] == 0:
                    scoring_history = scoring_history[1:]
                if timestep == "AUTO":
                    timestep = "epochs"
                elif timestep not in ("epochs", "samples", "duration"):
                    raise ValueError("timestep for deeplearning must be one of: epochs, samples, duration")

            training_metric = "training_{}".format(metric)
            validation_metric = "validation_{}".format(metric)
            if timestep == "duration":
                dur_colname = "duration_{}".format(scoring_history["duration"][1].split()[1])
                scoring_history[dur_colname] = [str(x).split()[0] for x in scoring_history["duration"]]
                timestep = dur_colname

            if can_use_pandas():
                valid = validation_metric in list(scoring_history)
                ylim = (scoring_history[[training_metric, validation_metric]].min().min(),
                        scoring_history[[training_metric, validation_metric]].max().max()) if valid \
                    else (scoring_history[training_metric].min(), scoring_history[training_metric].max())
            else:
                valid = validation_metric in scoring_history.col_header
                ylim = (min(min(scoring_history[[training_metric, validation_metric]])),
                        max(max(scoring_history[[training_metric, validation_metric]]))) if valid \
                    else (min(scoring_history[training_metric]), max(scoring_history[training_metric]))
            if ylim[0] == ylim[1]: ylim = (0, 1)

            if valid:  # Training and validation scoring history
                plt.xlabel(timestep)
                plt.ylabel(metric)
                plt.title("Scoring History")
                plt.ylim(ylim)
                plt.plot(scoring_history[timestep], scoring_history[training_metric], label="Training")
                plt.plot(scoring_history[timestep], scoring_history[validation_metric], color="orange",
                         label="Validation")
                plt.legend()
            else:  # Training scoring history only
                plt.xlabel(timestep)
                plt.ylabel(training_metric)
                plt.title("Training Scoring History")
                plt.ylim(ylim)
                plt.plot(scoring_history[timestep], scoring_history[training_metric])

        else:  # algo is not glm, deeplearning, drf, gbm
            raise ValueError("Plotting not implemented for this type of model")
        if "server" not in list(kwargs.keys()) or not kwargs["server"]: plt.show()
Example #18
    def _plot(self, timestep, metric, **kwargs):

        # check for matplotlib. exit if absent
        try:
            imp.find_module('matplotlib')
            import matplotlib
            if 'server' in kwargs and kwargs['server']:
                matplotlib.use('Agg', warn=False)
            import matplotlib.pyplot as plt
        except ImportError:
            print("matplotlib is required for this function!")
            return

        scoring_history = self.scoring_history()
        # Separate functionality for GLM since its output is different from other algos
        if self._model_json["algo"] == "glm":
            # GLM has only one timestep option, which is `iteration`
            timestep = "iteration"
            if metric == "AUTO":
                metric = "log_likelihood"
            elif metric not in ("log_likelihood", "objective"):
                raise ValueError(
                    "for GLM, metric must be one of: log_likelihood, objective"
                )
            plt.xlabel(timestep)
            plt.ylabel(metric)
            plt.title("Validation Scoring History")
            plt.plot(scoring_history[timestep], scoring_history[metric])

        elif self._model_json["algo"] in ("deeplearning", "drf", "gbm"):
            # Set timestep
            if self._model_json["algo"] in ("gbm", "drf"):
                if timestep == "AUTO":
                    timestep = "number_of_trees"
                elif timestep not in ("duration", "number_of_trees"):
                    raise ValueError(
                        "timestep for gbm or drf must be one of: duration, number_of_trees"
                    )
            else:  # self._model_json["algo"] == "deeplearning":
                # Delete first row of DL scoring history since it contains NAs & NaNs
                if scoring_history["samples"][0] == 0:
                    scoring_history = scoring_history[1:]
                if timestep == "AUTO":
                    timestep = "epochs"
                elif timestep not in ("epochs", "samples", "duration"):
                    raise ValueError(
                        "timestep for deeplearning must be one of: epochs, samples, duration"
                    )

            training_metric = "training_{}".format(metric)
            validation_metric = "validation_{}".format(metric)
            if timestep == "duration":
                dur_colname = "duration_{}".format(
                    scoring_history["duration"][1].split()[1])
                scoring_history[dur_colname] = [
                    str(x).split()[0] for x in scoring_history["duration"]
                ]
                timestep = dur_colname

            if can_use_pandas():
                valid = validation_metric in list(scoring_history)
                ylim = (scoring_history[[training_metric, validation_metric]].min().min(),
                        scoring_history[[training_metric, validation_metric]].max().max()) if valid \
                    else (scoring_history[training_metric].min(), scoring_history[training_metric].max())
            else:
                valid = validation_metric in scoring_history.col_header
                ylim = (min(min(scoring_history[[training_metric, validation_metric]])),
                        max(max(scoring_history[[training_metric, validation_metric]]))) if valid \
                    else (min(scoring_history[training_metric]), max(scoring_history[training_metric]))
            if ylim[0] == ylim[1]: ylim = (0, 1)

            if valid:  # Training and validation scoring history
                plt.xlabel(timestep)
                plt.ylabel(metric)
                plt.title("Scoring History")
                plt.ylim(ylim)
                plt.plot(scoring_history[timestep],
                         scoring_history[training_metric],
                         label="Training")
                plt.plot(scoring_history[timestep],
                         scoring_history[validation_metric],
                         color="orange",
                         label="Validation")
                plt.legend()
            else:  # Training scoring history only
                plt.xlabel(timestep)
                plt.ylabel(training_metric)
                plt.title("Training Scoring History")
                plt.ylim(ylim)
                plt.plot(scoring_history[timestep],
                         scoring_history[training_metric])

        else:  # algo is not glm, deeplearning, drf, gbm
            raise ValueError("Plotting not implemented for this type of model")
        if "server" not in list(kwargs.keys()) or not kwargs["server"]:
            plt.show()
Example #19
    def _plot(self, timestep, metric, server=False):
        plt = _get_matplotlib_pyplot(server)
        if not plt: return

        scoring_history = self.scoring_history()
        # Separate functionality for GLM since its output is different from other algos
        if self._model_json["algo"] == "glm":
            # GLM has only one timestep option, which is `iteration`
            timestep = "iteration"
            if metric == "AUTO":
                metric = "log_likelihood"
            elif metric not in ("log_likelihood", "objective"):
                raise H2OValueError("for GLM, metric must be one of: log_likelihood, objective")
            plt.xlabel(timestep)
            plt.ylabel(metric)
            plt.title("Validation Scoring History")
            plt.plot(scoring_history[timestep], scoring_history[metric])

        elif self._model_json["algo"] in ("deeplearning", "deepwater", "drf", "gbm"):
            # Set timestep
            if self._model_json["algo"] in ("gbm", "drf"):
                assert_is_type(timestep, "AUTO", "duration", "number_of_trees")
                if timestep == "AUTO":
                    timestep = "number_of_trees"
            else:  # self._model_json["algo"] == "deeplearning":
                # Delete first row of DL scoring history since it contains NAs & NaNs
                if scoring_history["samples"][0] == 0:
                    scoring_history = scoring_history[1:]
                assert_is_type(timestep, "AUTO", "epochs",  "samples", "duration")
                if timestep == "AUTO":
                    timestep = "epochs"

            training_metric = "training_{}".format(metric)
            validation_metric = "validation_{}".format(metric)
            if timestep == "duration":
                dur_colname = "duration_{}".format(scoring_history["duration"][1].split()[1])
                scoring_history[dur_colname] = [str(x).split()[0] for x in scoring_history["duration"]]
                timestep = dur_colname

            if can_use_pandas():
                valid = validation_metric in list(scoring_history)
                ylim = (scoring_history[[training_metric, validation_metric]].min().min(),
                        scoring_history[[training_metric, validation_metric]].max().max()) if valid \
                    else (scoring_history[training_metric].min(), scoring_history[training_metric].max())
            else:
                valid = validation_metric in scoring_history.col_header
                ylim = (min(min(scoring_history[[training_metric, validation_metric]])),
                        max(max(scoring_history[[training_metric, validation_metric]]))) if valid \
                    else (min(scoring_history[training_metric]), max(scoring_history[training_metric]))
            if ylim[0] == ylim[1]: ylim = (0, 1)

            if valid:  # Training and validation scoring history
                plt.xlabel(timestep)
                plt.ylabel(metric)
                plt.title("Scoring History")
                plt.ylim(ylim)
                plt.plot(scoring_history[timestep], scoring_history[training_metric], label="Training")
                plt.plot(scoring_history[timestep], scoring_history[validation_metric], color="orange",
                         label="Validation")
                plt.legend()
            else:  # Training scoring history only
                plt.xlabel(timestep)
                plt.ylabel(training_metric)
                plt.title("Training Scoring History")
                plt.ylim(ylim)
                plt.plot(scoring_history[timestep], scoring_history[training_metric])

        else:  # algo is not glm, deeplearning, drf, gbm
            raise H2OValueError("Plotting not implemented for this type of model")
        if not server: plt.show()
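One detail shared by the plotting examples: when the tracked metric never changes, the computed y-range collapses to a single value, so the code falls back to (0, 1). A standalone illustration with made-up values:

# Illustrative only: a constant metric would produce a zero-height y-range.
train = [0.42, 0.42, 0.42]
ylim = (min(train), max(train))
if ylim[0] == ylim[1]:
    ylim = (0, 1)  # same fallback used in the _plot examples above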