Example 1
    def _decode_labels(self, y):
        """
        An internal helper function that uses either the classes or encoder
        properties to correctly decode y as user-readable string labels.

        If both classes and encoder are set, a warning is issued and encoder is
        used instead of classes. If neither encoder nor classes is set then the
        original array is returned unmodified.
        """
        if self.classes is not None and self.encoder is not None:
            warnings.warn(
                "both classes and encoder specified, using encoder", YellowbrickWarning
            )

        if self.encoder is not None:
            # Use the label encoder or other transformer
            if hasattr(self.encoder, "inverse_transform"):
                try:
                    return self.encoder.inverse_transform(y)
                except ValueError:
                    y_labels = np.unique(y)
                    raise ModelError(
                        "could not decode {} y values to {} labels".format(
                            y_labels, self._labels()
                        )
                    )

            # Otherwise, treat as a dictionary
            try:
                return np.asarray([self.encoder[yi] for yi in y])
            except KeyError as e:
                raise ModelError(
                    (
                        "cannot decode class {} to label, "
                        "key not specified by encoder"
                    ).format(e)
                )

        if self.classes is not None:
            # Determine indices to perform class mappings on
            yp = np.asarray(y)
            if yp.dtype.kind in {"i", "u"}:
                idx = yp
            else:
                # Use label encoder to get indices by sorted class names
                idx = LabelEncoder().fit_transform(yp)

            # Use index mapping for classes
            try:
                return np.asarray(self.classes)[idx]
            except IndexError:
                y_labels = np.unique(yp)
                raise ModelError(
                    "could not decode {} y values to {} labels".format(
                        y_labels, self._labels()
                    )
                )

        # Could not decode y without an encoder or classes; return it unmodified
        return y
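
To make the three decoding paths concrete, here is a minimal standalone sketch (plain NumPy and scikit-learn, not part of Yellowbrick; the fruit labels are invented for illustration) of an inverse_transform encoder, a dict encoder, and direct indexing into classes:

import numpy as np
from sklearn.preprocessing import LabelEncoder

y = np.array([0, 2, 1, 0])

# Path 1: an encoder with inverse_transform, e.g. a fitted LabelEncoder
encoder = LabelEncoder().fit(["apple", "banana", "cherry"])
print(encoder.inverse_transform(y))  # ['apple' 'cherry' 'banana' 'apple']

# Path 2: a plain dict as the encoder; a missing key raises KeyError,
# which _decode_labels converts into a ModelError
mapping = {0: "apple", 1: "banana", 2: "cherry"}
print(np.asarray([mapping[yi] for yi in y]))

# Path 3: a classes array indexed directly by integer y values
classes = np.asarray(["apple", "banana", "cherry"])
print(classes[y])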
Example 2
    def _get_y_scores(self, X):
        """
        The ``precision_recall_curve`` metric requires target scores that
        can either be the probability estimates of the positive class,
        confidence values, or non-thresholded measures of decisions (as
        returned by a "decision function").
        """
        # TODO refactor shared method with ROCAUC

        # Resolution order of scoring functions
        attrs = ("decision_function", "predict_proba")

        # Return the first resolved function
        for attr in attrs:
            try:
                method = getattr(self.estimator, attr, None)
                if method:
                    # Compute the scores from the decision function
                    y_scores = method(X)

                    # Return only the positive class for binary predict_proba
                    if self.target_type_ == BINARY and y_scores.ndim == 2:
                        return y_scores[:, 1]
                    return y_scores

            except AttributeError:
                # Some Scikit-Learn estimators have both probability and
                # decision functions but override __getattr__ and raise an
                # AttributeError on access.
                continue

        # If we've gotten this far, we can't do anything
        raise ModelError((
            "{} requires an estimator with predict_proba or decision_function."
        ).format(self.__class__.__name__))
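
A hedged usage sketch of the resolution order, assuming scikit-learn's SVC (which exposes decision_function but hides predict_proba unless probability=True); the loop mirrors the method's fallback logic outside the class:

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
clf = SVC(probability=False).fit(X, y)

# Mirror the resolution loop: decision_function is tried before predict_proba
for attr in ("decision_function", "predict_proba"):
    method = getattr(clf, attr, None)
    if method is not None:
        y_scores = method(X)
        break

print(y_scores.shape)  # (100,) -- 1D confidence values in the binary case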
Example 3
    def fit(self, X, y=None, **kwargs):
        """
        Fits the manifold on X so that the data can later be transformed and
        plotted on the axes. See fit_transform() for more details.

        Parameters
        ----------
        X : array-like of shape (n, m)
            A matrix or data frame with n instances and m features

        y : array-like of shape (n,), optional
            A vector or series with target values for each instance in X. This
            vector is used to determine the color of the points in X.

        Returns
        -------
        self : Manifold
            Returns the visualizer object.

        """
        if not hasattr(self.manifold, "transform"):
            name = self.manifold.__class__.__name__
            raise ModelError(
                ("{} requires data to be simultaneously fit and transformed, "
                 "use fit_transform instead").format(name))

        # Call super to compute features, classes, colors, etc.
        super(Manifold, self).fit(X, y)
        with Timer() as self.fit_time_:
            self.manifold.fit(X)
        return self
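
The hasattr guard can be checked directly against scikit-learn's manifold estimators; a quick sketch of which ones would pass this fit():

from sklearn.manifold import Isomap, TSNE

print(hasattr(Isomap(), "transform"))  # True  -- fit() then transform() works
print(hasattr(TSNE(), "transform"))    # False -- this fit() raises ModelError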
Example 4
    def score(self, X, y):
        """
        Generates a 2D array of prediction counts whose rows are the true
        classes and whose columns are the predicted classes

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        Returns
        -------
        score_ : float
            Global accuracy score
        """
        # Must be computed before calling super
        # We're relying on predict to raise NotFitted
        y_pred = self.predict(X)
        y_type, y_true, y_pred = _check_targets(y, y_pred)
        if y_type not in ("binary", "multiclass"):
            raise YellowbrickValueError("{} is not supported".format(y_type))

        # Get the indices of the unique labels
        indices = unique_labels(y_true, y_pred)
        labels = self._labels()

        # Call super to compute self.score_ and verify classes
        try:
            super(ClassPredictionError, self).score(X, y)
        except ModelError as e:
            # raise visualizer-specific errors
            if labels is not None and len(labels) < len(indices):
                raise NotImplementedError(
                    "filtering classes is currently not supported"
                )
            else:
                raise e

        # Ensure all labels are used
        if labels is not None and len(labels) > len(indices):
            raise ModelError(
                "y and y_pred contain zero values for one of the specified classes"
            )

        # Create a table of predictions whose rows are the true classes
        # and whose columns are the predicted classes; each element
        # counts how many instances of the row's true class were
        # predicted as the column's class.
        self.predictions_ = np.array(
            [
                [(y_pred[y == label_t] == label_p).sum() for label_p in indices]
                for label_t in indices
            ]
        )

        self.draw()
        return self.score_
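
A self-contained sketch of how the predictions_ table is assembled, using made-up label arrays: rows index the true class and columns the predicted class, so element [t, p] counts instances of true class t predicted as class p:

import numpy as np
from sklearn.utils.multiclass import unique_labels

y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([0, 1, 1, 1, 2, 0])
indices = unique_labels(y_true, y_pred)

# Same nested comprehension as above, outside the class
predictions = np.array([
    [(y_pred[y_true == label_t] == label_p).sum() for label_p in indices]
    for label_t in indices
])
print(predictions)
# [[1 1 0]
#  [0 2 0]
#  [1 0 1]]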
Example 5
    def _get_y_scores(self, X):
        """
        The ``roc_curve`` metric requires target scores that can either be the
        probability estimates of the positive class, confidence values, or
        non-thresholded measures of decisions (as returned by
        "decision_function").

        This method computes the scores by resolving the estimator methods
        that retrieve these values.

        .. todo:: implement confidence values metric.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features -- generally the test data
            that is associated with y_true values.
        """
        # The resolution order of scoring functions
        attrs = ("predict_proba", "decision_function")

        # Return the first resolved function
        for attr in attrs:
            try:
                method = getattr(self.estimator, attr, None)
                if method:
                    return method(X)
            except AttributeError:
                # Some Scikit-Learn estimators have both probability and
                # decision functions but override __getattr__ and raise an
                # AttributeError on access.
                # Note that because of the ordering of our attrs above,
                # estimators with both will *only* ever use probability.
                continue

        # If we've gotten this far, raise an error
        raise ModelError("ROCAUC requires estimators with predict_proba or "
                         "decision_function methods.")
Example 6
    def transform(self, X, y=None, **kwargs):
        """
        Returns the transformed data points from the manifold embedding.

        Parameters
        ----------
        X : array-like of shape (n, m)
            A matrix or data frame with n instances and m features

        y : array-like of shape (n,), optional
            The target, used to specify the colors of the points.

        Returns
        -------
        Xprime : array-like of shape (n, 2)
            Returns the 2-dimensional embedding of the instances.

        Note
        ----
        This method does not work with MDS, TSNE, or SpectralEmbedding because
        those estimators do not yet implement a ``transform`` method in sklearn.
        """
        # Because some manifolds do not have transform we cannot call super
        try:
            Xp = self.manifold.transform(X)
            self.draw(Xp, y)
            return Xp
        except NotFittedError:
            raise NotFitted.from_estimator(self, "transform")
        except AttributeError:
            name = self.manifold.__class__.__name__
            raise ModelError(
                ("{} requires data to be simultaneously fit and transformed, "
                 "use fit_transform instead").format(name))

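A minimal sklearn-only sketch of the fit/transform split this method relies on, using Isomap, one of the manifolds that does implement transform:

import numpy as np
from sklearn.manifold import Isomap

X = np.random.RandomState(0).rand(100, 10)

manifold = Isomap(n_components=2).fit(X)  # separate fit step, as in fit()
Xp = manifold.transform(X)                # embeds into two dimensions
print(Xp.shape)  # (100, 2)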
Example 7
    def score(self, X, y=None):
        """
        Computes the ROC curves and AUC score using the target scores
        generated by the Scikit-Learn estimator.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        Returns
        -------
        score_ : float
            Global accuracy unless micro or macro scores are requested.
        """
        # Call super to check if fitted and to compute self.score_
        # NOTE: this sets score to the base score if neither macro nor micro
        super(ROCAUC, self).score(X, y)

        # Compute the predictions for the test data
        y_pred = self._get_y_scores(X)

        if self.target_type_ == BINARY:
            # For binary, per_class must be True to draw micro/macro curves
            if (self.micro or self.macro) and not self.per_class:
                raise ModelError(
                    "no curves will be drawn; "
                    "set per_class=True or micro=False and macro=False."
                )

            # For binary, if predictions are returned in shape (n,), micro and macro
            # curves are not defined
            if (self.micro or self.macro) and len(y_pred.shape) == 1:
                raise ModelError("no curves will be drawn; set binary=True.", )

        if self.target_type_ == MULTICLASS:
            # If it's multiclass classification, at least one of micro, macro, or
            # per_class must be True
            if not self.micro and not self.macro and not self.per_class:
                raise YellowbrickValueError(
                    "no curves will be drawn; specify micro, macro, or per_class"
                )

        # Classes may be label encoded so only use what's in y to compute.
        # The self.classes_ attribute will be used as names for labels.
        classes = np.unique(y)
        n_classes = len(classes)

        # Store the false positive rate, true positive rate and curve info.
        self.fpr = dict()
        self.tpr = dict()
        self.roc_auc = dict()

        # If the decision is binary, draw only the ROC curve for the positive class
        if self.target_type_ == BINARY and not self.per_class:
            # In this case predict_proba returns an array of shape (n, 2) which
            # specifies the probabilities of both the negative and positive classes.
            if len(y_pred.shape) == 2 and y_pred.shape[1] == 2:
                self.fpr[BINARY], self.tpr[BINARY], _ = roc_curve(
                    y, y_pred[:, 1])
            else:
                # decision_function returns array of shape (n,), so plot it directly
                self.fpr[BINARY], self.tpr[BINARY], _ = roc_curve(y, y_pred)
            self.roc_auc[BINARY] = auc(self.fpr[BINARY], self.tpr[BINARY])

        # For per-class binary decisions, the negative class curve may also be computed
        elif self.target_type_ == BINARY and self.per_class:
            # draw a curve for class 1 (the positive class)
            if len(y_pred.shape) == 2 and y_pred.shape[1] == 2:
                # predict_proba returns array of shape (n, 2), so use
                # probability of class 1 to compute ROC
                self.fpr[1], self.tpr[1], _ = roc_curve(y, y_pred[:, 1])
            else:
                # decision_function returns array of shape (n,)
                self.fpr[1], self.tpr[1], _ = roc_curve(y, y_pred)
            self.roc_auc[1] = auc(self.fpr[1], self.tpr[1])

            # draw a curve for class 0 (the negative class)
            if len(y_pred.shape) == 2 and y_pred.shape[1] == 2:
                # predict_proba returns array of shape (n, 2), so use
                # probability of class 0 to compute ROC
                self.fpr[0], self.tpr[0], _ = roc_curve(1 - y, y_pred[:, 0])
            else:
                # decision_function returns an array of shape (n,).
                # To draw a ROC curve for class 0 we swap classes 0 and 1 in y
                # and negate the classifier's predictions y_pred.
                self.fpr[0], self.tpr[0], _ = roc_curve(1 - y, -y_pred)
            self.roc_auc[0] = auc(self.fpr[0], self.tpr[0])

        else:
            # Otherwise compute the ROC curve and ROC area for each class
            for i, c in enumerate(classes):
                self.fpr[i], self.tpr[i], _ = roc_curve(y,
                                                        y_pred[:, i],
                                                        pos_label=c)
                self.roc_auc[i] = auc(self.fpr[i], self.tpr[i])

        # Compute micro average
        if self.micro:
            self._score_micro_average(y, y_pred, classes, n_classes)

        # Compute macro average
        if self.macro:
            self._score_macro_average(n_classes)

        # Draw the Curves
        self.draw()

        # Set score to micro average if specified
        if self.micro:
            self.score_ = self.roc_auc[MICRO]

        # Set score to macro average if specified (overrides micro)
        if self.macro:
            self.score_ = self.roc_auc[MACRO]

        return self.score_
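
The negative-class trick in the decision_function branch can be verified in isolation; in this toy sketch with invented scores, flipping the labels (1 - y) and negating the scores yields the class-0 curve:

import numpy as np
from sklearn.metrics import roc_curve, auc

y = np.array([0, 0, 1, 1])
scores = np.array([-0.8, -0.2, 0.3, 0.9])  # decision_function-style output

fpr1, tpr1, _ = roc_curve(y, scores)       # curve for the positive class
fpr0, tpr0, _ = roc_curve(1 - y, -scores)  # curve for the negative class
print(auc(fpr1, tpr1), auc(fpr0, tpr0))    # 1.0 1.0 -- symmetric here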
Example 8
    def score(self, X, y=None):
        """
        Computes the ROC curves and AUC score using the target scores
        generated by the Scikit-Learn estimator.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        Returns
        -------
        score_ : float
            Global accuracy unless micro or macro scores are requested.
        """
        # Call super to check if fitted and to compute self.score_
        # NOTE: this sets score to the base score if neither macro nor micro
        super(ROCAUC, self).score(X, y)

        # Compute the predictions for the test data
        y_pred = self._get_y_scores(X)

        # Note: _get_y_scores resolves either decision_function or
        # predict_proba, which should return a 2D array. But in binary
        # classification with an estimator that only has a decision_function,
        # y_pred will instead be 1D, meaning only one curve can be plotted. In
        # this case, we set the _binary_decision attribute to True to ensure
        # only one curve is computed and plotted later on.
        if y_pred.ndim == 1:
            self._binary_decision = True

            # Raise an error if it's a binary decision and user has set micro,
            # macro, or per_class to True
            if self.micro or self.macro or self.per_class:
                raise ModelError(
                    "Micro, macro, and per-class scores are not defined for "
                    "binary classification for estimators with only "
                    "decision_function methods; set micro, macro, and "
                    "per-class params to False.")
        else:
            self._binary_decision = False
            # If it's not a binary decision, at least one of micro, macro, or
            # per_class must be True
            if not self.micro and not self.macro and not self.per_class:
                raise YellowbrickValueError(
                    "no curves will be drawn; specify micro, macro, or per_class"
                )

        # Classes may be label encoded so only use what's in y to compute.
        # The self.classes_ attribute will be used as names for labels.
        classes = np.unique(y)
        n_classes = len(classes)

        # Store the false positive rate, true positive rate and curve info.
        self.fpr = dict()
        self.tpr = dict()
        self.roc_auc = dict()

        # If the decision is binary, compute the ROC curve and ROC area
        if self._binary_decision:
            self.fpr[0], self.tpr[0], _ = roc_curve(y, y_pred)
            self.roc_auc[0] = auc(self.fpr[0], self.tpr[0])
        else:
            # Otherwise compute the ROC curve and ROC area for each class
            for i, c in enumerate(classes):
                self.fpr[i], self.tpr[i], _ = roc_curve(y,
                                                        y_pred[:, i],
                                                        pos_label=c)
                self.roc_auc[i] = auc(self.fpr[i], self.tpr[i])

        # Compute micro average
        if self.micro:
            self._score_micro_average(y, y_pred, classes, n_classes)

        # Compute macro average
        if self.macro:
            self._score_macro_average(n_classes)

        # Draw the Curves
        self.draw()

        # Set score to micro average if specified
        if self.micro:
            self.score_ = self.roc_auc[MICRO]

        # Set score to macro average if specified (overrides micro)
        if self.macro:
            self.score_ = self.roc_auc[MACRO]

        return self.score_
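
A brief sketch of the shape check that drives _binary_decision, using two stock scikit-learn estimators: 1D decision scores permit only a single curve, while 2D probabilities allow per-class curves:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X, y = make_classification(n_samples=60, random_state=0)
print(SVC().fit(X, y).decision_function(X).ndim)             # 1 -> binary decision
print(LogisticRegression().fit(X, y).predict_proba(X).ndim)  # 2 -> per-class curves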