Beispiel #1
0
    def __init__(self,
                 ax=None,
                 x=None,
                 y=None,
                 features=None,
                 classes=None,
                 color=None,
                 colormap=None,
                 markers=None,
                 **kwargs):
        """
        Initialize the base scatter with many of the options required in order
        to make the visualization work.

        Parameters
        ----------
        ax, features, classes, color, colormap :
            Forwarded positionally to the parent initializer.

        x, y : optional
            When both are given and no features are set, they become the
            two plotted features.

        features : sequence of length 2, optional
            The two features to plot; mutually exclusive with ``x``/``y``.

        markers : iterable, optional
            Marker styles to cycle through. When None a default set of
            markers is used.

        kwargs : dict
            Additional keyword arguments forwarded to the parent initializer.

        Raises
        ------
        YellowbrickValueError
            If both ``x``/``y`` and features are specified, or if
            ``features`` does not contain exactly two entries.
        """
        super(ScatterVisualizer, self).__init__(ax, features, classes, color,
                                                colormap, **kwargs)

        self.x = x
        self.y = y

        # ``markers`` is a named parameter, so it can never appear in
        # ``kwargs``; use the argument directly instead of popping it from
        # ``kwargs`` (which silently discarded the caller's value).
        if markers is None:
            markers = (',', '+', 'o', '*', 'v', 'h', 'd')
        self.markers = itertools.cycle(markers)

        # NOTE(review): ``self.features_`` is not assigned here; presumably
        # the parent initializer sets it from ``features`` -- confirm.
        if self.x is not None and self.y is not None and self.features_ is not None:
            raise YellowbrickValueError(
                'Please specify x,y or features, not both.')

        if self.x is not None and self.y is not None and self.features_ is None:
            self.features_ = [self.x, self.y]

        # Ensure with init that features doesn't have more than two features
        if features is not None:
            if len(features) != 2:
                raise YellowbrickValueError(
                    'ScatterVisualizer only accepts two features.')
Beispiel #2
0
    def _select_features_to_plot(self, X):
        """
        Validate or compute the feature filter.

        If ``feature_index`` is set it is checked against the number of
        columns of ``X``; combining it with ``feature_names`` is rejected.
        Otherwise, if ``feature_names`` is set, ``feature_index`` is rebuilt
        by looking each name up in ``features_``.

        Raises
        ------
        YellowbrickWarning
            If both ``feature_index`` and ``feature_names`` are specified.

        YellowbrickValueError
            If an index is out of range or a name is not found.
        """
        if self.feature_index:
            # Explicit indices and explicit names cannot be combined.
            if self.feature_names:
                raise YellowbrickWarning(
                    "Both feature_index and feature_names "
                    "are specified. feature_names is ignored")

            # The upper bound is only inspected when the lower bound is fine.
            below_range = min(self.feature_index) < 0
            if below_range or max(self.feature_index) >= X.shape[1]:
                raise YellowbrickValueError("Feature index is out of range")
        elif self.feature_names:
            # Translate each requested name into its position in features_.
            self.feature_index = []
            known_names = self.features_.tolist()
            for wanted in self.feature_names:
                try:
                    position = known_names.index(wanted)
                except ValueError:
                    raise YellowbrickValueError(
                        "{} not in labels".format(wanted))
                self.feature_index.append(position)
Beispiel #3
0
    def __init__(
        self,
        model,
        ax=None,
        hist=True,
        qqplot=False,
        train_color="b",
        test_color="g",
        line_color=LINE_COLOR,
        train_alpha=0.75,
        test_alpha=0.75,
        is_fitted="auto",
        **kwargs
    ):
        """
        Initialize the residuals plot and validate its display options.

        Parameters
        ----------
        model : object
            Forwarded to the parent initializer.

        ax : optional
            Forwarded to the parent initializer.

        hist : {True, False, None, 'density', 'frequency'}, default: True
            Histogram option; any other value is rejected.

        qqplot : bool, default: False
            Q-Q plot option; cannot be enabled together with ``hist``.

        train_color, test_color, line_color :
            Stored in ``self.colors`` under the keys ``train_point``,
            ``test_point`` and ``line``.

        train_alpha, test_alpha : float, default: 0.75
            Stored in ``self.alphas`` under ``train_point``/``test_point``.

        is_fitted : default: "auto"
            Whether or not to check if the model is already fitted.

        kwargs : dict
            Additional keyword arguments forwarded to the parent initializer.

        Raises
        ------
        YellowbrickValueError
            If ``hist`` or ``qqplot`` is invalid, or both are enabled.
        """
        # Whether or not to check if the model is already fitted
        self.is_fitted = is_fitted

        # Initialize the visualizer base
        super(ResidualsPlot, self).__init__(model, ax=ax, **kwargs)

        # TODO: allow more scatter plot arguments for train and test points
        # See #475 (RE: ScatterPlotMixin)
        self.colors = {
            "train_point": train_color,
            "test_point": test_color,
            "line": line_color,
        }

        self.hist = hist
        if self.hist not in {True, "density", "frequency", None, False}:
            raise YellowbrickValueError(
                "'{}' is an invalid argument for hist, use None, True, "
                "False, 'density', or 'frequency'".format(hist)
            )

        self.qqplot = qqplot
        if self.qqplot not in {True, False}:
            # Report the offending qqplot value (previously ``hist`` was
            # formatted into this message by mistake).
            raise YellowbrickValueError(
                "'{}' is an invalid argument for qqplot, use True "
                "or False".format(qqplot)
            )

        if self.hist in {True, "density", "frequency"} and self.qqplot in {True}:
            raise YellowbrickValueError(
                "Set either hist or qqplot to False, can not plot "
                "both of them simultaneously."
            )

        if self.hist in {True, "density", "frequency"}:
            self.hax  # If hist is True, test the version availability

        if self.qqplot in {True}:
            self.qqax  # If qqplot is True, test the version availability

        # Store labels and colors for the legend ordered by call
        self._labels, self._colors = [], []

        self.alphas = {"train_point": train_alpha, "test_point": test_alpha}
Beispiel #4
0
    def fit(self, X, y, **kwargs):
        """
        Validate the inputs for the joint plot, fill in default feature and
        target names, and then call ``draw``.

        Parameters
        ----------

        X : ndarray or DataFrame of shape n x 1
            A matrix of n instances with 1 feature

        y : ndarray or Series of length n
            An array or series of the target value

        kwargs: dict
            keyword arguments forwarded to ``draw``.

        Returns
        -------
        self
            The instance itself.

        Raises
        ------
        YellowbrickValueError
            If X is a DataFrame with more than one column or y is None.
        """
        frame_input = is_dataframe(X)

        # A DataFrame may only carry a single feature column.
        if frame_input:
            _, n_columns = X.shape
            if n_columns > 1:
                raise YellowbrickValueError((
                    "X needs to be an ndarray or DataFrame with one feature, "
                    "please select one feature from the DataFrame"
                ))

        # A target is mandatory for this plot.
        if y is None:
            raise YellowbrickValueError((
                "Joint plots are useful for classification and regression "
                "problems, which require a target variable"
            ))

        # Default the feature name: DataFrame columns when available.
        if self.feature is None:
            self.feature = X.columns if frame_input else ['x']

        # Default the target name.
        if self.target is None:
            self.target = ['y']

        self.draw(X, y, **kwargs)
        return self
Beispiel #5
0
    def __init__(self,
                 model,
                 ax=None,
                 x=None,
                 y=None,
                 features=None,
                 classes=None,
                 show_scatter=True,
                 step_size=0.0025,
                 markers=None,
                 pcolormesh_alpha=0.8,
                 scatter_alpha=1.0,
                 encoder=None,
                 is_fitted="auto",
                 force_model=False,
                 **kwargs):
        """
        Initialize the decision boundaries visualizer.

        Parameters
        ----------
        model : object
            Forwarded to the parent initializer and stored as ``estimator``.

        ax, classes, encoder, is_fitted, force_model :
            Forwarded to the parent initializer.

        x, y : optional
            When both are given and ``features`` is None, they become the
            two plotted features.

        features : sequence of length 2, optional
            The two features to plot; mutually exclusive with ``x``/``y``.

        show_scatter, step_size, pcolormesh_alpha, scatter_alpha :
            Stored on the instance under the same names.

        markers : iterable, optional
            Marker styles to cycle through. When None a default set of
            markers is used.

        kwargs : dict
            Accepted for compatibility; not forwarded by this initializer.

        Raises
        ------
        YellowbrickValueError
            If both ``x``/``y`` and ``features`` are specified, or if
            ``features`` does not contain exactly two entries.
        """
        super(DecisionBoundariesVisualizer, self).__init__(
            model,
            ax=ax,
            classes=classes,
            encoder=encoder,
            is_fitted=is_fitted,
            force_model=force_model,
        )

        self.x = x
        self.y = y
        self.features_ = features
        self.estimator = model
        self.show_scatter = show_scatter
        self.step_size = step_size

        # ``markers`` is a named parameter, so it can never appear in
        # ``kwargs``; use the argument directly instead of popping it from
        # ``kwargs`` (which silently discarded the caller's value).
        if markers is None:
            markers = (",", "o", "d", "*", "v", "h", "+")
        self.markers = itertools.cycle(markers)

        self.pcolormesh_alpha = pcolormesh_alpha
        self.scatter_alpha = scatter_alpha

        # these are set later
        self.Z = None
        self.Z_shape = None
        self.xx = None
        self.yy = None
        self.class_labels = None

        if self.x is not None and self.y is not None and self.features_ is not None:
            raise YellowbrickValueError(
                "Please specify x,y or features, not both.")

        if self.x is not None and self.y is not None and self.features_ is None:
            self.features_ = [self.x, self.y]

        # Ensure with init that features doesn't have more than two features
        if features is not None:
            if len(features) != 2:
                raise YellowbrickValueError(
                    "DecisionBoundariesVisualizer only accepts two features.")
Beispiel #6
0
    def __init__(
            self,
            model,
            x=None,
            y=None,
            features=None,
            show_scatter=True,
            step_size=0.0025,
            markers=None,
            pcolormesh_alpha=0.8,
            scatter_alpha=1.0,
            #  title=None,
            *args,
            **kwargs):
        """
        Pass in a unfitted model to generate a decision boundaries
        visualization.

        Parameters
        ----------
        model : object
            Forwarded to the parent initializer and stored as ``estimator``.

        x, y : optional
            When both are given and ``features`` is None, they become the
            two plotted features.

        features : sequence of length 2, optional
            The two features to plot; mutually exclusive with ``x``/``y``.

        show_scatter, step_size, pcolormesh_alpha, scatter_alpha :
            Stored on the instance under the same names.

        markers : iterable, optional
            Marker styles to cycle through. When None a default set of
            markers is used.

        args, kwargs :
            Forwarded to the parent initializer.

        Raises
        ------
        YellowbrickValueError
            If both ``x``/``y`` and ``features`` are specified, or if
            ``features`` does not contain exactly two entries.
        """
        super(DecisionBoundariesVisualizer,
              self).__init__(model, *args, **kwargs)

        self.x = x
        self.y = y
        self.features_ = features
        self.estimator = model
        self.show_scatter = show_scatter
        self.step_size = step_size

        # ``markers`` is a named parameter, so it can never appear in
        # ``kwargs``; use the argument directly instead of popping it from
        # ``kwargs`` (which silently discarded the caller's value).
        if markers is None:
            markers = (',', 'o', 'd', '*', 'v', 'h', '+')
        self.markers = itertools.cycle(markers)

        self.pcolormesh_alpha = pcolormesh_alpha
        self.scatter_alpha = scatter_alpha

        # these are set later
        self.Z = None
        self.Z_shape = None
        self.xx = None
        self.yy = None
        self.class_labels = None

        if self.x is not None and self.y is not None and self.features_ is not None:
            raise YellowbrickValueError(
                'Please specify x,y or features, not both.')

        if self.x is not None and self.y is not None and self.features_ is None:
            self.features_ = [self.x, self.y]

        # Ensure with init that features doesn't have more than two features
        if features is not None:
            if len(features) != 2:
                raise YellowbrickValueError(
                    'DecisionBoundariesVisualizer only accepts two features.')
Beispiel #7
0
    def draw(self, X, y=None):
        """
        Scatter the first two columns of ``X`` on the axes, coloring the
        points according to the target color type of the visualizer.

        Parameters
        ----------
        X : array-like of shape (n, 2)
            The points to draw; columns 0 and 1 are used as coordinates.

        y : array-like of shape (n,), optional
            The target, used to specify the colors of the points. Required
            for discrete and continuous target color types.

        Returns
        -------
        self.ax : matplotlib Axes object
            Returns the axes that the scatter plot was drawn on.

        Raises
        ------
        YellowbrickValueError
            If ``y`` is missing for a discrete or continuous target.

        NotFitted
            If the target color type is not one of the known types.
        """
        options = {"alpha": self.alpha}
        color_type = self._target_color_type

        if color_type == SINGLE:
            # One fixed color for every point.
            options["c"] = "b"

        elif color_type == DISCRETE:
            if y is None:
                raise YellowbrickValueError(
                    "y is required for discrete target")

            # Look up each target's position among the classes to pick
            # its color.
            point_colors = []
            for yi in y:
                class_position = np.searchsorted(self.classes_, (yi))
                point_colors.append(self._colors[class_position])
            options["c"] = point_colors

        elif color_type == CONTINUOUS:
            if y is None:
                raise YellowbrickValueError(
                    "y is required for continuous target")

            # TODO manually make colorbar so we can draw it in finalize
            options["c"] = y
            options["cmap"] = self.colors or palettes.DEFAULT_SEQUENCE

        else:
            # Technically this should never be raised
            raise NotFitted("could not determine target color type")

        # Keep a handle on the scatter artist and hand back the axes.
        self._scatter = self.ax.scatter(X[:, 0], X[:, 1], **options)
        return self.ax
Beispiel #8
0
    def __init__(
        self,
        model,
        ax=None,
        k=10,
        metric="distortion",
        timings=True,
        locate_elbow=True,
        **kwargs
    ):
        """
        Initialize the elbow visualizer and expand ``k`` into a list of
        values to evaluate.

        Parameters
        ----------
        model : object
            Forwarded to the parent initializer.

        ax : optional
            Forwarded to the parent initializer.

        k : int, tuple of two ints, or iterable of ints, default: 10
            An integer yields the values ``2..k`` inclusive; a 2-tuple of
            integers is expanded with ``range(*k)``; any other iterable of
            integers is used as given.

        metric : str, default: "distortion"
            Must be a key of ``KELBOW_SCOREMAP``.

        timings : bool, default: True
            Stored on the instance as ``timings``.

        locate_elbow : bool, default: True
            Stored on the instance as ``locate_elbow``.

        kwargs : dict
            Additional keyword arguments forwarded to the parent initializer.

        Raises
        ------
        YellowbrickValueError
            If ``metric`` is unknown or ``k`` is not a valid specification.
        """
        super(KElbowVisualizer, self).__init__(model, ax=ax, **kwargs)

        # Get the scoring method
        if metric not in KELBOW_SCOREMAP:
            # Interpolate the offending metric into the message (the
            # placeholder was previously left unformatted).
            raise YellowbrickValueError(
                "'{}' is not a defined metric "
                "use one of distortion, silhouette, or calinski_harabasz".format(
                    metric
                )
            )

        # Store the arguments
        self.scoring_metric = KELBOW_SCOREMAP[metric]
        self.metric = metric
        self.timings = timings
        self.locate_elbow = locate_elbow

        # Convert K into a tuple argument if an integer
        if isinstance(k, int):
            self.k_values_ = list(range(2, k + 1))
        elif (
            isinstance(k, tuple)
            and len(k) == 2
            and all(isinstance(x, (int, np.integer)) for x in k)
        ):
            self.k_values_ = list(range(*k))
        elif isinstance(k, Iterable) and all(
            isinstance(x, (int, np.integer)) for x in k
        ):
            self.k_values_ = list(k)
        else:
            raise YellowbrickValueError(
                (
                    "Specify an iterable of integers, a range, or maximal K value,"
                    " the value '{}' is not a valid argument for K.".format(k)
                )
            )

        # Holds the scores computed for each value of k
        self.k_scores_ = None
        # Set Default Elbow Value
        self.elbow_value_ = None
Beispiel #9
0
    def score(self, X, y):
        """
        Build the table of prediction counts per class, draw it, and return
        the score computed by the parent class.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        Returns
        -------
        score_ : float
            The value of ``self.score_`` after the parent ``score`` call.

        Raises
        ------
        YellowbrickValueError
            If the target type is neither binary nor multiclass.

        NotImplementedError
            If the parent raises ``ModelError`` and fewer labels than
            observed classes were specified.

        ModelError
            If more labels were specified than classes were observed, or
            re-raised from the parent ``score``.
        """
        # Predict before delegating to the parent; predict is relied on
        # to raise NotFitted.
        raw_predictions = self.predict(X)
        y_type, y_true, y_pred = _check_targets(y, raw_predictions)
        if y_type not in ("binary", "multiclass"):
            raise YellowbrickValueError("{} is not supported".format(y_type))

        # Unique labels observed across truth and predictions
        indices = unique_labels(y_true, y_pred)
        labels = self._labels()

        # The parent computes self.score_ and verifies the classes
        try:
            super(ClassPredictionError, self).score(X, y)
        except ModelError as e:
            # Translate into a visualizer-specific error when classes
            # were filtered; otherwise propagate the original error.
            fewer_labels = labels is not None and len(labels) < len(indices)
            if fewer_labels:
                raise NotImplementedError(
                    "filtering classes is currently not supported"
                )
            raise e

        # Every specified label has to be observed
        if labels is not None and len(labels) > len(indices):
            raise ModelError(
                "y and y_pred contain zero values for one of the specified classes"
            )

        # Rows are the true classes, columns the predicted classes; each
        # cell counts the predictions of the column class among the
        # instances whose true value is the row class.
        counts = []
        for label_t in indices:
            predicted_for_true = y_pred[y == label_t]
            counts.append(
                [(predicted_for_true == label_p).sum() for label_p in indices]
            )
        self.predictions_ = np.array(counts)

        self.draw()
        return self.score_
Beispiel #10
0
    def lax(self):
        """
        Build and return an inset axes for the legend on ``self.ax``.

        The inset is sized by ``self.legend_size``, placed at
        ``self.legend_loc``, and stripped of frame, face color, grid,
        ticks and spines so that only what is drawn on it is visible.

        Raises
        ------
        YellowbrickValueError
            If ``inset_locator`` is not available.
        """
        if inset_locator is None:
            raise YellowbrickValueError((
                "intercluster distance map legend requires matplotlib 2.0.2 or "
                "later please upgrade matplotlib or set legend=False "))

        legend_axes = inset_locator.inset_axes(
            self.ax,
            width=self.legend_size,
            height=self.legend_size,
            loc=self.legend_loc,
        )

        # Remove every visual decoration from the inset.
        legend_axes.set_frame_on(False)
        legend_axes.set_facecolor("none")
        legend_axes.grid(False)

        # Fixed symmetric limits and no tick marks.
        legend_axes.set_xlim(-1.4, 1.4)
        legend_axes.set_ylim(-1.4, 1.4)
        legend_axes.set_xticks([])
        legend_axes.set_yticks([])

        for spine_name in legend_axes.spines:
            legend_axes.spines[spine_name].set_visible(False)

        return legend_axes
Beispiel #11
0
    def make_transformer(self,
                         decompose='svd',
                         decompose_by=50,
                         tsne_kwargs=None):
        """
        Creates an internal transformer pipeline to project the data set into
        2D space using TSNE, applying an pre-decomposition technique ahead of
        embedding if necessary. This method will reset the transformer on the
        class, and can be used to explore different decompositions.

        Parameters
        ----------

        decompose : string or None, default: ``'svd'``
            A preliminary decomposition is often used prior to TSNE to make
            the projection faster. Specify ``"svd"`` for sparse data or ``"pca"``
            for dense data. The name is matched case-insensitively. If
            decompose is None, the original data set will be used.

        decompose_by : int, default: 50
            Specify the number of components for preliminary decomposition, by
            default this is 50; the more components, the slower TSNE will be.

        tsne_kwargs : dict or None, default: None
            Additional keyword arguments passed to the ``TSNE`` constructor.

        Returns
        -------

        transformer : Pipeline
            Pipelined transformer for TSNE projections

        Raises
        ------
        YellowbrickValueError
            If ``decompose`` does not name a known decomposition.
        """

        # TODO: detect decompose by inferring from sparse matrix or dense or
        # If number of features > 50 etc.
        decompositions = {
            'svd': TruncatedSVD,
            'pca': PCA,
        }

        # Avoid a shared mutable default for the extra TSNE arguments.
        if tsne_kwargs is None:
            tsne_kwargs = {}

        # Create the pipeline steps
        steps = []

        # Add the pre-decomposition
        if decompose:
            # Normalize once so validation and lookup agree; previously a
            # value such as 'SVD' passed validation but failed the lookup.
            name = decompose.lower()
            if name not in decompositions:
                raise YellowbrickValueError(
                    "'{}' is not a valid decomposition, use {}, or None".format(
                        decompose, ", ".join(decompositions.keys())))

            klass = decompositions[name]
            steps.append((name,
                          klass(n_components=decompose_by,
                                random_state=self.random_state)))

        # Add the TSNE manifold
        steps.append(('tsne',
                      TSNE(n_components=2,
                           random_state=self.random_state,
                           **tsne_kwargs)))

        # return the pipeline
        return Pipeline(steps)
Beispiel #12
0
 def ClassPredictionErrorViz(self):
     """
     Draw a stacked bar chart of predicted-class counts per actual class
     and save it as a PDF under ``self.path_to_save``.

     Returns
     -------
     ax : matplotlib Axes object
         The axes the chart was drawn on.

     Raises
     ------
     YellowbrickValueError
         If the target type is neither binary nor multiclass.
     """
     y_type, _, _ = _check_targets(self.y_true, self.y_pred)
     if y_type not in ("binary", "multiclass"):
         raise YellowbrickValueError("{} is not supported".format(y_type))

     # Unique labels observed across truth and predictions
     indices = unique_labels(self.y_true, self.y_pred)
     class_names = self.classes

     # One row per actual class, one column per predicted class.
     count_rows = []
     for label_t in indices:
         predicted_for_true = self.y_pred[self.y_true == label_t]
         count_rows.append(
             [(predicted_for_true == label_p).sum() for label_p in indices])
     predictions_ = np.array(count_rows)

     fig, ax = plt.subplots(ncols=1, nrows=1)
     bar_stack(
         predictions_,
         ax,
         labels=list(class_names),
         ticks=class_names,
         legend_kws={"bbox_to_anchor": (1.04, 0.5), "loc": "center left"},
     )

     # Title and axis labels
     ax.set_title("Class Prediction Error for {}".format(self.name))
     ax.set_xlabel("Actual Class")
     ax.set_ylabel("Number of Predicted Class")

     # Leave 10% headroom above the tallest stack
     cmax = max(sum(row) for row in predictions_)
     ax.set_ylim(0, cmax + cmax * 0.1)

     # Reserve room on the right so the legend fits on the figure
     fig.tight_layout(rect=[0, 0, 0.90, 1])
     pdf_path = (self.path_to_save + "/ClassPredictionError_" + self.name +
                 ".pdf")
     fig.savefig(pdf_path)
     return ax
    def __init__(self,
                 estimator,
                 ax=None,
                 classes=None,
                 cmap="YlOrRd",
                 support=None,
                 encoder=None,
                 is_fitted="auto",
                 force_model=False,
                 **kwargs):
        """
        Initialize the classification report, configure its color map and
        decide which scores are displayed.

        Parameters
        ----------
        estimator, ax, classes, encoder, is_fitted, force_model, kwargs :
            Forwarded to the parent initializer.

        cmap : default: "YlOrRd"
            Passed to ``color_sequence`` to build the color map.

        support : {None, True, False, 'percent', 'count'}, default: None
            When falsy, the "support" score is not displayed.

        Raises
        ------
        YellowbrickValueError
            If ``support`` is not one of the accepted values.
        """
        super(ClassificationReport, self).__init__(
            estimator,
            ax=ax,
            classes=classes,
            encoder=encoder,
            is_fitted=is_fitted,
            force_model=force_model,
            **kwargs
        )

        self.support = support

        # Build the color map and set its out-of-range colors.
        self.cmap = color_sequence(cmap)
        self.cmap.set_over(color=CMAP_OVERCOLOR)
        self.cmap.set_under(color=CMAP_UNDERCOLOR)

        # Start from a copy of every known score key.
        self._displayed_scores = list(SCORES_KEYS)

        allowed_support = {None, True, False, "percent", "count"}
        if support not in allowed_support:
            raise YellowbrickValueError(
                "'{}' is an invalid argument for support, use None, True, "
                "False, 'percent', or 'count'".format(support))

        # Hide the support column unless it was requested.
        if not support:
            self._displayed_scores.remove("support")
Beispiel #14
0
    def __init__(
        self,
        ax=None,
        tagset="penn_treebank",
        colormap=None,
        colors=None,
        frequency=False,
        stack=False,
        parser=None,
        **kwargs,
    ):
        """
        Initialize the part-of-speech tag visualizer.

        Parameters
        ----------
        ax : optional
            Forwarded to the parent initializer.

        tagset : str, default: "penn_treebank"
            Must be a key of ``TAGSET_NAMES``.

        colormap, colors, frequency, stack, parser :
            Stored on the instance under the same names.

        kwargs : dict
            Additional keyword arguments forwarded to the parent initializer.

        Raises
        ------
        YellowbrickValueError
            If ``tagset`` is not a known tagset.
        """
        super(PosTagVisualizer, self).__init__(ax=ax, **kwargs)

        self.tagset_names = TAGSET_NAMES

        # Reject unknown tagsets before storing anything else.
        if tagset not in self.tagset_names:
            known_tagsets = ", ".join(self.tagset_names.keys())
            raise YellowbrickValueError(
                "'{}' is an invalid tagset. Please choose one of {}.".format(
                    tagset, known_tagsets))
        self.tagset = tagset

        self.punct_tags = frozenset(PUNCT_TAGS)

        # Remaining options are stored as given.
        self.frequency = frequency
        self.colormap = colormap
        self.colors = colors
        self.stack = stack
        self.parser = parser
Beispiel #15
0
    def manifold(self, transformer):
        """
        Creates the manifold estimator if a string value is passed in,
        validates other objects passed in.

        Parameters
        ----------
        transformer : str or estimator
            Either a key of ``self.ALGORITHMS`` or an object accepted by
            ``is_estimator``. Estimators are stored as given.

        Raises
        ------
        YellowbrickValueError
            If ``transformer`` is neither an estimator nor a known
            algorithm name.
        """
        if not is_estimator(transformer):
            if transformer not in self.ALGORITHMS:
                # Use a ``{}`` placeholder so the offending value actually
                # appears in the message (``%s`` is ignored by ``format``).
                raise YellowbrickValueError(
                    "could not create manifold for '{}'".format(
                        str(transformer)))

            # Create a new transformer with the specified params
            self._name = MANIFOLD_NAMES[transformer]
            transformer = clone(self.ALGORITHMS[transformer])
            params = {
                "n_components": 2,
                "n_neighbors": self.n_neighbors,
                "random_state": self.random_state,
            }

            # Only pass the parameters this transformer actually supports.
            for param in list(params.keys()):
                if param not in transformer.get_params():
                    del params[param]

            transformer.set_params(**params)

        self._manifold = transformer
        # Fall back to the class name when no display name has been set.
        if self._name is None:
            self._name = self._manifold.__class__.__name__
Beispiel #16
0
    def fit(self, X, y=None):
        """
        Fit the classification model according to the type of ``y``.

        A multiclass target wraps the estimator in a ``OneVsRestClassifier``
        and binarizes the labels; a binary target is passed through
        unchanged. Any other target type is rejected.

        Returns
        -------
        The result of the parent ``fit`` call.

        Raises
        ------
        YellowbrickValueError
            If the target is neither binary nor multiclass.
        """
        # The kind of target decides how the estimator is fit
        target_kind = type_of_target(y)
        self._target_labels = np.unique(y)

        if target_kind.startswith(MULTICLASS):
            self.target_type_ = MULTICLASS
            self.estimator = OneVsRestClassifier(self.estimator)

            # OneVsRestClassifier is given a binarized multi-label target
            fit_target = label_binarize(y, classes=self._target_labels)
        elif target_kind.startswith(BINARY):
            # Binary targets are used as given, without transformation
            fit_target = y
            self.target_type_ = BINARY
        else:
            message = (
                "{} does not support target type '{}', "
                "please provide a binary or multiclass single-output target"
            ).format(self.__class__.__name__, target_kind)
            raise YellowbrickValueError(message)

        # Delegate the actual fitting to the parent
        return super(PrecisionRecallCurve, self).fit(X, fit_target)
Beispiel #17
0
    def __init__(self,
                 model,
                 param_name,
                 param_range,
                 ax=None,
                 logx=False,
                 groups=None,
                 cv=None,
                 scoring=None,
                 n_jobs=1,
                 pre_dispatch="all",
                 **kwargs):
        """
        Initialize the validation curve visualizer.

        Parameters
        ----------
        model, ax, kwargs :
            Forwarded to the parent initializer.

        param_name :
            Stored via ``set_params``.

        param_range : array-like
            Converted with ``np.asarray``; must be one-dimensional.

        logx, groups, cv, scoring, n_jobs, pre_dispatch :
            Stored via ``set_params``.

        Raises
        ------
        YellowbrickValueError
            If ``param_range`` is not one-dimensional.
        """

        # Set up the parent visualizer first
        super(ValidationCurve, self).__init__(model, ax=ax, **kwargs)

        # Only a flat array of parameter values is accepted
        param_range = np.asarray(param_range)
        if param_range.ndim != 1:
            described = repr(param_range)
            raise YellowbrickValueError(
                "must specify array of param values, '{}' is not valid".format(
                    described))

        # Record every visual and validation curve parameter in one call
        settings = {
            "param_name": param_name,
            "param_range": param_range,
            "logx": logx,
            "groups": groups,
            "cv": cv,
            "scoring": scoring,
            "n_jobs": n_jobs,
            "pre_dispatch": pre_dispatch,
        }
        self.set_params(**settings)
Beispiel #18
0
def resolve_colors(n_colors=None, colormap=None, colors=None):
    """
    Build a list of colors from an explicit color collection, a colormap,
    or the current color cycle, optionally resized to ``n_colors`` entries.

    Parameters
    ----------
    n_colors : int, default: None
        The length of the returned list; the available colors are
        truncated or repeated cyclically to reach it. If None, the length
        is not modified (a colormap is then sampled as many times as the
        color cycle has entries).

    colormap : str or callable, default: None
        The colormap to sample; a string is resolved with ``cm.get_cmap``.
        Ignored (with a warning) when ``colors`` is also given.

    colors : iterable, default: None
        A collection of colors to use specifically with the plot.

    Returns
    -------
    colors : list
        A list of colors that can be used in matplotlib plots.

    Raises
    ------
    YellowbrickValueError
        If a colormap name cannot be resolved.

    Notes
    -----
    This function was originally based on a similar function in the pandas
    plotting library that has been removed in the new version of the library.
    """

    if colors is not None:
        # Explicit colors take precedence over a colormap.
        if colormap is not None:
            warnings.warn("both colormap and colors specified; using colors")

        palette = list(colors)

    elif colormap is not None:
        # Resolve a colormap name into a colormap object.
        if isinstance(colormap, string_types):
            try:
                colormap = cm.get_cmap(colormap)
            except ValueError as e:
                raise YellowbrickValueError(e)

        # Sample the colormap evenly across its range.
        n_colors = n_colors or len(get_color_cycle())
        palette = [colormap(value) for value in np.linspace(0, 1, num=n_colors)]

    else:
        # Nothing specified: fall back to the default color cycle.
        palette = get_color_cycle()

    # Shrink or cyclically extend the list to the requested length.
    if n_colors is not None and len(palette) != n_colors:
        available = len(palette)
        palette = [palette[idx % available] for idx in np.arange(n_colors)]

    return palette
Beispiel #19
0
    def layout(self, divider=None):
        """
        Append a colorbar axes to the right of ``self.ax`` when the target
        color type is continuous, the projection is 2, the colorbar is
        enabled and no colorbar axes exists yet. Otherwise do nothing.

        Subclasses can override this method to add other axes or layouts.

        Parameters
        ----------
        divider: AxesDivider
            An AxesDivider to be passed among all layout calls. When None,
            one is created from ``self.ax``.

        Raises
        ------
        YellowbrickValueError
            If ``make_axes_locatable`` is not available.
        """
        wants_colorbar = (self._target_color_type == TargetType.CONTINUOUS
                          and self.projection == 2 and self.colorbar
                          and self._cax is None)
        if not wants_colorbar:
            return

        # The divider helper is required to carve out the new axes
        if make_axes_locatable is None:
            raise YellowbrickValueError(
                ("Colorbar requires matplotlib 2.0.2 or greater "
                 "please upgrade matplotlib"))

        if divider is None:
            divider = make_axes_locatable(self.ax)

        # New tick-free axes on the right for the colorbar
        colorbar_axes = divider.append_axes("right", size="5%", pad=0.3)
        self._cax = colorbar_axes
        self._cax.set_yticks([])
        self._cax.set_xticks([])
Beispiel #20
0
    def __init__(self,
                 ax=None,
                 method="pearson",
                 labels=None,
                 sort=False,
                 feature_index=None,
                 feature_names=None,
                 color=None,
                 **kwargs):
        """
        Initialize the feature correlation visualizer.

        Parameters
        ----------
        ax : optional
            Forwarded to the parent initializer.

        method : str, default: "pearson"
            Must be a key of ``CORRELATION_LABELS``.

        labels, sort, feature_index, feature_names, color :
            Stored via ``set_params``.

        kwargs : dict
            Additional keyword arguments forwarded to the parent initializer.

        Raises
        ------
        YellowbrickValueError
            If ``method`` is not a known correlation method.
        """
        # Forward the caller's axes; previously ``ax=None`` was hard-coded,
        # which silently discarded a user-supplied axes.
        super(FeatureCorrelation, self).__init__(ax=ax, **kwargs)

        self.correlation_labels = CORRELATION_LABELS
        self.correlation_methods = CORRELATION_METHODS

        if method not in self.correlation_labels:
            raise YellowbrickValueError(
                "Method {} not implement; choose from {}".format(
                    method, ", ".join(self.correlation_labels)))

        # Parameters
        self.set_params(
            sort=sort,
            color=color,
            method=method,
            labels=labels,
            feature_index=feature_index,
            feature_names=feature_names,
        )
Beispiel #21
0
    def draw(self, points, target=None, **kwargs):
        """
        Called from the fit method, this method creates the canvas and
        draws the plot on it.

        Parameters
        ----------
        points : iterable of pairs
            The (x, y) coordinates to scatter.

        target : iterable, optional
            One class per point, used to group the points into labeled
            series. When None every point is assigned to the first class.

        kwargs: generic keyword arguments.

        Returns
        -------
        self.ax : matplotlib Axes object
            The axes the plot was drawn on.

        Raises
        ------
        YellowbrickValueError
            If the number of labels does not match the number of classes.
        """

        # Resolve the labels with the classes
        labels = self.labels if self.labels is not None else self.classes_
        if len(labels) != len(self.classes_):
            raise YellowbrickValueError(
                ("number of supplied labels ({}) does not "
                 "match the number of classes ({})").format(
                     len(labels), len(self.classes_)))

        # Create the color mapping for the labels.
        color_values = resolve_colors(n_colors=len(labels),
                                      colormap=self.colormap,
                                      colors=self.color)
        colors = dict(zip(labels, color_values))

        # Transform labels into a map of class to label
        labels = dict(zip(self.classes_, labels))

        # Define boundaries with a vertical line
        if self.annotate_docs:
            for xcoords in self.boundaries_:
                self.ax.axvline(x=xcoords,
                                color="lightgray",
                                linestyle="dashed")

        series = defaultdict(lambda: {"x": [], "y": []})

        if target is not None:
            for point, t in zip(points, target):
                label = labels[t]
                series[label]["x"].append(point[0])
                series[label]["y"].append(point[1])
        else:
            # Map the class to its display label, as the branch above does;
            # ``colors`` is keyed by display label, so using the raw class
            # failed whenever custom labels were supplied.
            label = labels[self.classes_[0]]
            for x, y in points:
                series[label]["x"].append(x)
                series[label]["y"].append(y)

        # Use a distinct name here to avoid shadowing the ``points`` argument.
        for label, coords in series.items():
            self.ax.scatter(
                coords["x"],
                coords["y"],
                marker="|",
                c=colors[label],
                zorder=100,
                label=label,
            )

        self.ax.set_yticks(list(range(len(self.indexed_words_))))
        self.ax.set_yticklabels(self.indexed_words_)

        return self.ax
Beispiel #22
0
    def rank(self, X, algorithm=None):
        """
        Apply the selected ranking method to ``X`` and return its result.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        algorithm : str or None
            The ranking mechanism to use (matched case-insensitively), or
            None to fall back to ``self.ranking_``

        Returns
        -------
        R : ndarray
            The mxm ranking matrix of the variables

        Raises
        ------
        YellowbrickValueError
            If the method name is not a key of ``self.ranking_methods``.
        """
        # Fall back to the configured default and normalize the case.
        method_name = (algorithm or self.ranking_).lower()

        if method_name not in self.ranking_methods:
            raise YellowbrickValueError(
                "'{}' is unrecognized ranking method".format(method_name))

        ranker = self.ranking_methods[method_name]
        return ranker(X)
Beispiel #23
0
    def fit(self, X, **kwargs):
        """
        Validate X for the histogram, fill in a default feature name and
        draw the data.

        Only DataFrame inputs are checked for their number of columns;
        other inputs are handed to ``draw`` unchanged.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x 1
            A matrix of n instances with 1 feature
        kwargs: dict
            keyword arguments passed to Scikit-Learn API. They are accepted
            but not forwarded to ``draw`` by this method.

        Returns
        -------
        self : instance
            The visualizer itself.

        Raises
        ------
        YellowbrickValueError
            If X is a DataFrame with more than one column.
        """
        frame_input = is_dataframe(X)

        # A DataFrame may carry only a single column.
        if frame_input and X.shape[1] > 1:
            raise YellowbrickValueError((
                "X needs to be an ndarray or DataFrame with one feature, "
                "please select one feature from the DataFrame"
            ))

        # Fall back to the frame's columns, or to a placeholder name, when
        # no feature name was supplied.
        if self.feature is None:
            self.feature = X.columns if frame_input else ['x']

        self.draw(X)
        return self
Beispiel #24
0
    def draw(self, **kwargs):
        """
        Draw the frequencies of the first ``self.N`` entries of
        ``self.sorted_`` as a bar chart on ``self.ax``.

        When ``self.conditional_freqdist_`` is truthy one bar series is drawn
        per label (in sorted label order), all at the same positions;
        otherwise a single series labelled "corpus" is drawn from
        ``self.freqdist_``.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        Returns
        -------
        ax : the axes stored on ``self.ax``.

        Raises
        ------
        YellowbrickValueError
            If ``self.orient`` is neither 'h' nor 'v'.
        """
        # Bar positions and the indices of the entries being plotted
        positions = np.arange(self.N)
        top = self.sorted_[: self.N]
        tick_words = [self.features[idx] for idx in top]

        # Collect one list of frequencies per series label
        if self.conditional_freqdist_:
            by_label = sorted(self.conditional_freqdist_.items(), key=itemgetter(0))
            series = {label: [dist[idx] for idx in top] for label, dist in by_label}
        else:
            series = {"corpus": [self.freqdist_[idx] for idx in top]}

        # Reject unknown orientations before anything is drawn
        horizontal = self.orient == "h"
        if not horizontal and self.orient != "v":
            raise YellowbrickValueError("Orientation must be 'h' or 'v'")

        if horizontal:
            for label, counts in series.items():
                self.ax.barh(
                    positions, counts, label=label, color=self.color, align="center"
                )

            # Words label the y axis, ordered from top to bottom
            self.ax.set_yticks(positions)
            self.ax.set_yticklabels(tick_words)
            self.ax.invert_yaxis()
        else:
            for label, counts in series.items():
                self.ax.bar(
                    positions, counts, label=label, color=self.color, align="edge"
                )

            # Words label the x axis, rotated to stay readable
            self.ax.set_xticks(positions)
            self.ax.set_xticklabels(tick_words, rotation=90)

        # Grid lines are shown only along the value axis
        self.ax.yaxis.grid(not horizontal)
        self.ax.xaxis.grid(horizontal)

        return self.ax
Beispiel #25
0
    def _draw_projection_features(self, Xp, y):
        """
        Draw the projection of features in the transformed space.

        For every column of ``self.pca_components_`` a red vector is drawn
        from the origin, scaled per axis by the maximum value of the
        matching column of ``Xp``, and labelled with the corresponding entry
        of ``self.features_``.

        Parameters
        ----------
        Xp : array-like of shape (n, 2) or (n, 3)
            The matrix produced by the ``transform()`` method. Must support
            two-dimensional ``Xp[:, k]`` indexing.

        y : array-like of shape (n,), optional
            The target. Not used by this method.

        Returns
        -------
        self.ax : matplotlib Axes object
            Returns the axes that the scatter plot was drawn on.

        Raises
        ------
        YellowbrickValueError
            If ``self.projection`` is neither 2 nor 3.
        """

        x_vector = self.pca_components_[0]
        y_vector = self.pca_components_[1]
        max_x = max(Xp[:, 0])
        max_y = max(Xp[:, 1])
        n_features = self.pca_components_.shape[1]

        if self.projection == 2:
            for i in range(n_features):
                self.ax.arrow(
                    x=0,
                    y=0,
                    dx=x_vector[i] * max_x,
                    dy=y_vector[i] * max_y,
                    color="r",
                    head_width=0.05,
                    width=0.005,
                )
                # Place the label slightly beyond the arrow head
                self.ax.text(
                    x_vector[i] * max_x * 1.05,
                    y_vector[i] * max_y * 1.05,
                    self.features_[i],
                    color="r",
                )
        elif self.projection == 3:
            z_vector = self.pca_components_[2]
            # Scale the z extent by the third projected column; using the
            # second column here would tie the z scale to the y axis.
            max_z = max(Xp[:, 2])
            for i in range(n_features):
                self.ax.plot(
                    [0, x_vector[i] * max_x],
                    [0, y_vector[i] * max_y],
                    [0, z_vector[i] * max_z],
                    color="r",
                )
                # Place the label slightly beyond the end of the line
                self.ax.text(
                    x_vector[i] * max_x * 1.05,
                    y_vector[i] * max_y * 1.05,
                    z_vector[i] * max_z * 1.05,
                    self.features_[i],
                    color="r",
                )
        else:
            raise YellowbrickValueError("Projection dimensions must be either 2 or 3")

        return self.ax
Beispiel #26
0
    def rank(self, X, algorithm=None):
        """
        Returns the feature ranking.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        algorithm : str or None
            The ranking mechanism to use (matched case-insensitively), or
            None for the default stored in ``self.ranking_``

        Returns
        -------
        ranks : ndarray
            An n-dimensional, symmetric array of rank scores, where n is the
            number of features. E.g. for 1D ranking, it is (n,), for a
            2D ranking it is (n,n) and so forth.

        Raises
        ------
        YellowbrickValueError
            If the lower-cased name is not a key of ``self.ranking_methods``.
        """
        algorithm = algorithm or self.ranking_
        algorithm = algorithm.lower()

        if algorithm not in self.ranking_methods:
            raise YellowbrickValueError(
                "'{}' is unrecognized ranking method".format(algorithm))

        # Extract matrix from dataframe if necessary. ``DataFrame.as_matrix``
        # was removed in pandas 1.0; ``.values`` yields the same array and is
        # available across pandas versions.
        if is_dataframe(X):
            X = X.values

        return self.ranking_methods[algorithm](X)
Beispiel #27
0
    def fit(self, y, **kwargs):
        """
        Check that ``y`` is one dimensional, default the target name and
        draw the histogram of ``y``.

        Parameters
        ----------
        y : an array of one dimension or a pandas Series
            Must expose an ``ndim`` attribute.

        kwargs : dict
            keyword arguments passed to scikit-learn API. They are accepted
            but not forwarded to ``draw`` by this method.

        Returns
        -------
        self : instance
            The visualizer itself.

        Raises
        ------
        YellowbrickValueError
            If ``y`` has more than one dimension.
        """

        # Reject anything that is not a flat vector of values
        if y.ndim > 1:
            message = "y needs to be an array or Series with one dimension"
            raise YellowbrickValueError(message)

        # Use a generic name when the caller supplied none
        target_missing = self.target is None
        if target_missing:
            self.target = "y"

        self.draw(y)
        return self
Beispiel #28
0
    def _determine_target_color_type(self, y):
        """
        Store on ``self._target_color_type`` how the target vector y should
        be mapped to colors:

            - y is None: SINGLE
            - ``self.target`` is "auto": DISCRETE when y has fewer than 10
              unique values, CONTINUOUS otherwise
            - anything else: the value of ``self.target`` itself

        The attribute is assigned before it is validated, so it keeps the
        rejected value when a YellowbrickValueError is raised because the
        result is not one of SINGLE, DISCRETE or CONTINUOUS.
        """
        if y is None:
            color_type = SINGLE
        elif self.target != "auto":
            # An explicit target type was supplied; take it as given.
            color_type = self.target
        elif len(np.unique(y)) < 10:
            # NOTE: See #73 for a generalization to use when implemented
            color_type = DISCRETE
        else:
            color_type = CONTINUOUS

        self._target_color_type = color_type

        if color_type in {SINGLE, DISCRETE, CONTINUOUS}:
            return

        raise YellowbrickValueError(
            ("could not determine target color type "
             "from target='{}' to '{}'").format(self.target,
                                                self._target_color_type))
Beispiel #29
0
    def __init__(self,
                 ax=None,
                 labels=None,
                 classes=None,
                 colors=None,
                 colormap=None,
                 random_state=None,
                 alpha=0.7,
                 **kwargs):
        """
        Store the visual parameters, build the UMAP transformer and
        initialize the parent visualizer.

        Keyword arguments whose names match a UMAP parameter are removed
        from ``kwargs`` and handed to ``make_transformer``; the remaining
        ones are forwarded to the parent constructor together with ``ax``.

        Raises
        ------
        YellowbrickValueError
            If ``UMAP`` is None (the umap package could not be used).
        """
        # NOTE(review): ``classes`` is accepted but never referenced in this
        # method -- confirm whether that is intentional.

        if UMAP is None:
            # The two literals are concatenated, so the first one needs a
            # trailing space to keep the sentences apart.
            raise YellowbrickValueError(
                ("umap package doesn't seem to be installed. "
                 "Please install UMAP via: pip install umap-learn"))

        # Visual Parameters
        self.alpha = alpha
        self.labels = labels
        self.colors = colors
        self.colormap = colormap
        self.random_state = random_state

        # Fetch UMAP kwargs from kwargs by popping only keys belonging to UMAP params
        umap_kwargs = {
            key: kwargs.pop(key)
            for key in UMAP().get_params() if key in kwargs
        }

        # UMAP doesn't require any pre-processing before embedding and thus doesn't
        # require a pipeline.
        self.transformer_ = self.make_transformer(umap_kwargs)

        # Call super at the end so that size and title are set correctly
        super(UMAPVisualizer, self).__init__(ax=ax, **kwargs)
Beispiel #30
0
    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the scatter
        visualization since it has both the X and y data required for the
        viz and the transform method does not.

        A two-column X is plotted as is. Otherwise the two columns named in
        ``self.features_`` are selected from a DataFrame, a structured
        array, or an ndarray with integer column identifiers.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with 2 features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer

        Raises
        ------
        YellowbrickValueError
            If X has more than two columns and no two features can be
            selected from it.
        """
        _, ncols = X.shape

        if ncols == 2:
            X_two_cols = X
            if self.features_ is None:
                self.features_ = ["Feature One", "Feature Two"]

        # Select the named columns from a DataFrame. ``as_matrix`` was
        # removed in pandas 1.0; ``.values`` yields the same array and is
        # available across pandas versions.
        elif self.features_ is not None and is_dataframe(X):
            X_two_cols = X[self.features_].values

        # handle numpy named/ structured array
        elif self.features_ is not None and is_structured_array(X):
            X_selected = X[self.features_]
            X_two_cols = X_selected.view((np.float64, len(X_selected.dtype.names)))

        # handle features that are numeric columns in ndarray matrix
        elif self.features_ is not None and has_ndarray_int_columns(self.features_, X):
            f_one, f_two = self.features_
            X_two_cols = X[:, [int(f_one), int(f_two)]]

        else:
            raise YellowbrickValueError("""
                ScatterVisualizer only accepts two features, please
                explicitly set these two features in the init kwargs or
                pass a matrix/ dataframe in with only two columns.""")

        # Store the classes for the legend if they're None.
        if self.classes_ is None:
            # TODO: Is this the most efficient method?
            self.classes_ = [str(label) for label in np.unique(y)]

        # Draw the instances
        self.draw(X_two_cols, y, **kwargs)

        # Fit always returns self.
        return self