Esempio n. 1
0
    def screeplot(self, type="barplot", server=False):
        """
        Draw a scree plot of the component variances.

        Library ``matplotlib`` is required for this function; nothing is drawn when
        ``get_matplotlib_pyplot`` returns ``None``.

        :param str type: ``"barplot"`` draws bars, ``"lines"`` draws a dashed blue line; any other
            value leaves the axes labelled but without data.
        :param bool server: forwarded to ``get_matplotlib_pyplot``; when true the graph is not shown.
        """
        pyplot = get_matplotlib_pyplot(server)
        if pyplot is None:
            return

        # Square every value after the first cell of the first row of the importance table
        # (presumably the per-component standard deviations -- confirm against the model output).
        importance_row = self._model_json['output']['importance'].cell_values[0]
        variances = [value ** 2 for value in importance_row[1:]]
        # Components are numbered starting at 1.
        component_ids = list(range(1, len(variances) + 1))

        pyplot.xlabel('Components')
        pyplot.ylabel('Variances')
        pyplot.title('Scree Plot')
        pyplot.xticks(component_ids)

        if type == "barplot":
            pyplot.bar(component_ids, variances)
        elif type == "lines":
            pyplot.plot(component_ids, variances, 'b--')

        if server:
            return
        pyplot.show()
Esempio n. 2
0
    def screeplot(self, type="barplot", **kwargs):
        """
        Produce the scree plot.

        Library ``matplotlib`` is required for this function; nothing is drawn when
        ``get_matplotlib_pyplot`` returns ``None``.

        :param str type: either ``"barplot"`` or ``"lines"``.
        :param bool server: optional keyword argument (default ``False``); it is forwarded to
            ``get_matplotlib_pyplot`` and, when true, the graph is not shown.

        :raises ValueError: if any keyword argument other than ``server`` is supplied.
        """
        # ``server`` is optional: fall back to False instead of raising KeyError when it is omitted.
        is_server = kwargs.pop("server", False)
        if kwargs:
            raise ValueError("Unknown arguments %s to screeplot()" %
                             ", ".join(kwargs.keys()))

        # check for matplotlib. exit if absent.
        plt = get_matplotlib_pyplot(is_server)
        if plt is None:
            return

        # Square every value after the first cell of the first row of the importance table.
        variances = [
            s**2 for s in self._model_json['output']
            ['importance'].cell_values[0][1:]
        ]
        # Components are numbered starting at 1.
        components = list(range(1, len(variances) + 1))

        plt.xlabel('Components')
        plt.ylabel('Variances')
        plt.title('Scree Plot')
        plt.xticks(components)
        if type == "barplot":
            plt.bar(components, variances)
        elif type == "lines":
            plt.plot(components, variances, 'b--')
        if not is_server:
            plt.show()
Esempio n. 3
0
    def scoring_history_plot(self, timestep, metric, server=False):
        """
        Plot a metric from the scoring history against a timestep column.

        The columns ``training_<metric>`` and, when present, ``validation_<metric>`` are plotted.
        The y-axis limits span the plotted values and fall back to ``(0, 1)`` when the minimum
        equals the maximum.

        :param timestep: x-axis column; it is first passed through ``self._validate_timestep``.
        :param metric: metric name used to build the training/validation column names.
        :param server: forwarded to ``get_matplotlib_pyplot``; when true the graph is not shown.
        """
        pyplot = get_matplotlib_pyplot(server)
        if pyplot is None:
            return

        history = self._get_scoring_history_to_plot()
        timestep = self._validate_timestep(timestep)
        train_col = "training_{}".format(metric)
        valid_col = "validation_{}".format(metric)

        if timestep == "duration":
            # The second whitespace-separated token of a duration entry becomes the column suffix
            # (presumably the time unit -- confirm against the scoring history format).
            suffix = history["duration"][1].split()[1]
            duration_col = "duration_{}".format(suffix)
            # Keep only the first token of every duration entry.
            history[duration_col] = [str(entry).split()[0] for entry in history["duration"]]
            timestep = duration_col

        if can_use_pandas():
            valid = valid_col in list(history)
            if valid:
                ylim = (history[[train_col, valid_col]].min().min(),
                        history[[train_col, valid_col]].max().max())
            else:
                ylim = (history[train_col].min(), history[train_col].max())
        else:
            valid = valid_col in history.col_header
            if valid:
                ylim = (min(min(history[[train_col, valid_col]])),
                        max(max(history[[train_col, valid_col]])))
            else:
                ylim = (min(history[train_col]), max(history[train_col]))
        # A degenerate range would give an empty axis; use the unit interval instead.
        if ylim[0] == ylim[1]:
            ylim = (0, 1)

        # Labels and title depend on whether a validation curve is available.
        if valid:
            y_label, title = metric, "Scoring History"
        else:
            y_label, title = train_col, "Training Scoring History"

        pyplot.xlabel(timestep)
        pyplot.ylabel(y_label)
        pyplot.title(title)
        pyplot.ylim(ylim)

        if valid:
            # Training and validation curves, distinguished by a legend.
            pyplot.plot(history[timestep], history[train_col], label="Training")
            pyplot.plot(history[timestep], history[valid_col], color="orange", label="Validation")
            pyplot.legend()
        else:
            # Training curve only.
            pyplot.plot(history[timestep], history[train_col])

        if server:
            return
        pyplot.show()
Esempio n. 4
0
    def scoring_history_plot(self, timestep, metric, server=False):
        """
        Plot one scoring-history metric against a timestep column.

        The permitted timesteps and metrics depend on the model's actual parameters
        (``lambda_search``, ``HGLM``, or neither). ``"AUTO"`` selects the first permitted value.

        :param timestep: x-axis column name, or ``"AUTO"``.
        :param metric: y-axis column name, or ``"AUTO"``.
        :param server: forwarded to ``get_matplotlib_pyplot``; when true the graph is not shown.

        :raises H2OValueError: if ``metric`` or ``timestep`` is not permitted for this model.
        """
        pyplot = get_matplotlib_pyplot(server)
        if pyplot is None:
            return

        history = self.scoring_history()

        if self.actual_params.get("lambda_search"):
            timestep_choices = ["iteration", "duration"]
            metric_choices = ["deviance_train", "deviance_test", "deviance_xval"]
            # With several alpha values the history covers all of them; keep only the rows
            # whose alpha equals the model's ``alpha_best``.
            best_alpha_rows = history["alpha"] == self._model_json["output"]["alpha_best"]
            history = history[best_alpha_rows]
        elif self.actual_params.get("HGLM"):
            timestep_choices = ["iterations", "duration"]
            metric_choices = ["convergence", "sumetaieta02"]
        else:
            timestep_choices = ["iterations", "duration"]
            metric_choices = ["objective", "negative_log_likelihood"]

        # Resolve "AUTO" first; the resolved value always passes the membership check below.
        if metric == "AUTO":
            metric = metric_choices[0]
        if metric not in metric_choices:
            raise H2OValueError("for {}, metric must be one of: {}".format(
                self.algo.upper(), ", ".join(metric_choices)))

        if timestep == "AUTO":
            timestep = timestep_choices[0]
        if timestep not in timestep_choices:
            raise H2OValueError("for {}, timestep must be one of: {}".format(
                self.algo.upper(), ", ".join(timestep_choices)))

        pyplot.xlabel(timestep)
        pyplot.ylabel(metric)
        pyplot.title("Validation Scoring History")
        # A single point cannot form a line, so it is drawn as a marker instead.
        if len(history[timestep]) > 1:
            line_style = "b-"
        else:
            line_style = "bx"
        pyplot.plot(history[timestep], history[metric], line_style)

        if server:
            return
        pyplot.show()
Esempio n. 5
0
    def _std_coef_plot(self, num_of_features=None, server=False):
        """
        Plot a GLM model's standardized coefficient magnitudes.

        The coefficients returned by ``self.coef_norm()`` (the ``"Intercept"`` entry excluded) are
        sorted by decreasing absolute value and drawn as horizontal bars, largest at the top.
        Bars of non-negative coefficients are blue, bars of negative coefficients are orange.

        :param num_of_features: the number of features shown in the plot (a positive int);
            all features are shown when it is ``None``.
        :param server: if true set server settings to matplotlib and do not show the graph.

        :returns: None.
        """
        assert_is_type(num_of_features, None, I(int, lambda x: x > 0))

        plt = get_matplotlib_pyplot(server)
        if not plt:
            return

        # Same colors as Flow.
        positive_color = "#1F77B4"  # blue
        negative_color = "#FF7F0E"  # dark orange

        # Drop the intercept, then sort (label, coefficient) pairs by the coefficient's absolute value.
        norm_coef = sorted(
            (tup for tup in self.coef_norm().items() if tup[0] != "Intercept"),
            key=lambda x: abs(x[1]),
            reverse=True)

        feature_labels = [label for label, _ in norm_coef]
        # Bar lengths are magnitudes; the sign is conveyed by the bar color only.
        val = [abs(coef) for _, coef in norm_coef]
        # Zero counts as positive.
        signage = [positive_color if coef >= 0 else negative_color for _, coef in norm_coef]
        # Bar centers on the y axis, flipped so that the largest bar appears at the top.
        pos = list(range(len(feature_labels)))[::-1]

        # Default is all the features.
        if num_of_features is None:
            num_of_features = len(val)

        shown = slice(0, num_of_features)
        shown_colors = signage[shown]

        _, ax = plt.subplots(1, 1, figsize=(14, 10))
        plt.barh(pos[shown],
                 val[shown],
                 align="center",
                 height=0.8,
                 color=shown_colors,
                 edgecolor="none")

        # Hide the right and top spines, color others grey
        ax.spines["right"].set_visible(False)
        ax.spines["top"].set_visible(False)
        ax.spines["bottom"].set_color("#7B7B7B")
        ax.spines["left"].set_color("#7B7B7B")
        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position("left")
        ax.xaxis.set_ticks_position("bottom")
        plt.yticks(pos[shown], feature_labels[shown])
        # A single bar gets a larger vertical margin so that it does not fill the whole axis.
        ax.margins(None, 0.5 if num_of_features == 1 else 0.05)

        # Legend entries: only the sign that is actually displayed, or both when the displayed
        # bars contain both signs (or when there are no bars at all).
        has_positive = positive_color in shown_colors
        has_negative = negative_color in shown_colors
        if has_positive and not has_negative:
            legend_entries = [("Positive", positive_color)]
        elif has_negative and not has_positive:
            legend_entries = [("Negative", negative_color)]
        else:
            # blue should always be positive, orange negative
            legend_entries = [("Positive", positive_color), ("Negative", negative_color)]

        # Custom fake lines used as legend handles. The marker size is set on the handle itself
        # rather than by patching the legend afterwards through private attributes, which are not
        # available in every matplotlib release.
        markers = [
            plt.Line2D([0, 0], [0, 0],
                       color=color,
                       marker="s",
                       linestyle="",
                       markersize=10)
            for _, color in legend_entries
        ]
        plt.legend(markers,
                   [label for label, _ in legend_entries],
                   numpoints=1,
                   loc="best",
                   frameon=False,
                   fontsize=13)

        # Hide minor ticks on the x axis; these options are booleans (the string "off" is truthy).
        plt.tick_params(axis="x",
                        which="minor",
                        bottom=False,
                        top=False,
                        labelbottom=False)
        plt.title("Standardized Coef. Magnitudes: H2O GLM", fontsize=20)
        # Show the plot only in interactive (non-server) mode, like the other plotting methods.
        if not server:
            plt.show()
Esempio n. 6
0
    def _varimp_plot(self, num_of_features=None, server=False):
        """
        Draw a horizontal bar chart of the variable importances of a trained model.

        :param num_of_features: how many features to plot; when ``None``, 10 or all of them if
            there are fewer than 10.
        :param server: forwarded to ``get_matplotlib_pyplot``; when true the graph is not shown.

        :returns: None.
        """
        assert_is_type(num_of_features, None, int)
        assert_is_type(server, bool)

        pyplot = get_matplotlib_pyplot(server)
        if pyplot is None:
            return

        # Variable importances as a list of tuples rather than a pandas dataframe.
        varimp_rows = self.varimp(use_pandas=False)
        # The first entry of each tuple is the feature label, the third one gives the bar length.
        labels = [row[0] for row in varimp_rows]
        bar_lengths = [row[2] for row in varimp_rows]
        # Bar centers on the y axis, reversed so that the first row ends up at the top.
        positions = range(len(labels))[::-1]

        if num_of_features is None:
            num_of_features = min(len(bar_lengths), 10)

        top_positions = positions[0:num_of_features]
        top_lengths = bar_lengths[0:num_of_features]
        top_labels = labels[0:num_of_features]

        _, axes = pyplot.subplots(1, 1, figsize=(14, 10))
        pyplot.barh(top_positions, top_lengths, align="center",
                    height=0.8, color="#1F77B4", edgecolor="none")
        # Hide the right and top spines, color the others grey.
        for side in ("right", "top"):
            axes.spines[side].set_visible(False)
        for side in ("bottom", "left"):
            axes.spines[side].set_color("#7B7B7B")
        # Only show ticks on the left and bottom spines.
        axes.yaxis.set_ticks_position("left")
        axes.xaxis.set_ticks_position("bottom")
        pyplot.yticks(top_positions, top_labels)

        if num_of_features == 1:
            # A single bar: widen the vertical margin around it.
            axes.margins(None, 0.5)
        else:
            # Leave one unit of space below the lowest and above the highest bar.
            pyplot.ylim([min(top_positions) - 1, max(top_positions) + 1])

        # The title names the algorithm that produced the model.
        pyplot.title("Variable Importance: H2O %s" % self._model_json["algo_full_name"], fontsize=20)
        if server:
            return
        pyplot.show()