def __init__(self, ax=None, x=None, y=None, features=None, classes=None,
             color=None, colormap=None, markers=None, **kwargs):
    """
    Initialize the base scatter with many of the options required in order
    to make the visualization work.

    Parameters
    ----------
    ax, features, classes, color, colormap :
        Forwarded positionally to the parent initializer.

    x, y : optional
        The two features to plot; mutually exclusive with ``features``.

    markers : iterable, optional
        Marker styles that are cycled through. When None, a default
        sequence of seven markers is used.

    kwargs : dict
        Additional keyword arguments forwarded to the parent initializer.

    Raises
    ------
    YellowbrickValueError
        If both ``x``/``y`` and features are specified, or if ``features``
        does not contain exactly two entries.
    """
    super(ScatterVisualizer, self).__init__(ax, features, classes, color,
                                            colormap, **kwargs)

    self.x = x
    self.y = y

    # ``markers`` is a named parameter and therefore can never be found in
    # ``kwargs``; use the argument directly so a caller's choice is honored.
    if markers is None:
        markers = (',', '+', 'o', '*', 'v', 'h', 'd')
    self.markers = itertools.cycle(markers)

    # NOTE(review): ``features_`` is read here without being assigned in
    # this method; presumably the parent initializer sets it - confirm.
    if self.x is not None and self.y is not None and self.features_ is not None:
        raise YellowbrickValueError(
            'Please specify x,y or features, not both.')

    if self.x is not None and self.y is not None and self.features_ is None:
        self.features_ = [self.x, self.y]

    # Ensure with init that features doesn't have more than two features
    if features is not None:
        if len(features) != 2:
            raise YellowbrickValueError(
                'ScatterVisualizer only accepts two features.')
def _select_features_to_plot(self, X): """ Select features to plot. feature_index is always used as the filter and if filter_names is supplied, a new feature_index is computed from those names. """ if self.feature_index: if self.feature_names: raise YellowbrickWarning( "Both feature_index and feature_names " "are specified. feature_names is ignored") if min(self.feature_index) < 0 or max( self.feature_index) >= X.shape[1]: raise YellowbrickValueError("Feature index is out of range") elif self.feature_names: self.feature_index = [] features_list = self.features_.tolist() for feature_name in self.feature_names: try: self.feature_index.append( features_list.index(feature_name)) except ValueError: raise YellowbrickValueError( "{} not in labels".format(feature_name))
def __init__(
    self,
    model,
    ax=None,
    hist=True,
    qqplot=False,
    train_color="b",
    test_color="g",
    line_color=LINE_COLOR,
    train_alpha=0.75,
    test_alpha=0.75,
    is_fitted="auto",
    **kwargs
):
    """
    Initialize the residuals plot and validate the ``hist`` and ``qqplot``
    options.

    Parameters
    ----------
    model : object
        Forwarded to the parent initializer.

    ax : optional
        Forwarded to the parent initializer.

    hist : {True, False, None, 'density', 'frequency'}, default: True
        Histogram option; any other value is rejected.

    qqplot : bool, default: False
        Q-Q plot option; cannot be enabled together with ``hist``.

    train_color, test_color, line_color :
        Stored in ``self.colors`` under "train_point", "test_point" and
        "line" respectively.

    train_alpha, test_alpha : float, default: 0.75
        Stored in ``self.alphas``.

    is_fitted : default: "auto"
        Stored on the instance before the parent initializer is called.

    kwargs : dict
        Forwarded to the parent initializer.

    Raises
    ------
    YellowbrickValueError
        If ``hist`` or ``qqplot`` is invalid, or if both are enabled.
    """
    # Whether or not to check if the model is already fitted
    self.is_fitted = is_fitted

    # Initialize the visualizer base
    super(ResidualsPlot, self).__init__(model, ax=ax, **kwargs)

    # TODO: allow more scatter plot arguments for train and test points
    # See #475 (RE: ScatterPlotMixin)
    self.colors = {
        "train_point": train_color,
        "test_point": test_color,
        "line": line_color,
    }

    self.hist = hist
    if self.hist not in {True, "density", "frequency", None, False}:
        raise YellowbrickValueError(
            "'{}' is an invalid argument for hist, use None, True, "
            "False, 'density', or 'frequency'".format(hist)
        )

    self.qqplot = qqplot
    if self.qqplot not in {True, False}:
        # Report the offending qqplot value (previously reported ``hist``)
        raise YellowbrickValueError(
            "'{}' is an invalid argument for qqplot, use True, "
            " or False".format(qqplot)
        )

    if self.hist in {True, "density", "frequency"} and self.qqplot in {True}:
        raise YellowbrickValueError(
            "Set either hist or qqplot to False, can not plot "
            "both of them simultaneously."
        )

    if self.hist in {True, "density", "frequency"}:
        self.hax  # If hist is True, test the version availability
    if self.qqplot in {True}:
        self.qqax  # If qqplot is True, test the version availability

    # Store labels and colors for the legend ordered by call
    self._labels, self._colors = [], []

    self.alphas = {"train_point": train_alpha, "test_point": test_alpha}
def fit(self, X, y, **kwargs):
    """
    Validate X and y for the joint plot, fill in default feature and
    target names, then draw.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x 1
        A matrix of n instances with 1 feature

    y : ndarray or Series of length n
        An array or series of the target value

    kwargs: dict
        keyword arguments forwarded to ``draw``.

    Returns
    -------
    self : instance
        Returns the visualizer itself.

    Raises
    ------
    YellowbrickValueError
        If X is a DataFrame with more than one column or y is None.
    """
    # A DataFrame input may only carry a single column
    if is_dataframe(X):
        _, n_columns = X.shape
        if n_columns > 1:
            raise YellowbrickValueError((
                "X needs to be an ndarray or DataFrame with one feature, "
                "please select one feature from the DataFrame"
            ))

    # A target is mandatory for this plot
    if y is None:
        raise YellowbrickValueError((
            "Joint plots are useful for classification and regression "
            "problems, which require a target variable"
        ))

    # Default feature name: the DataFrame columns when available
    if self.feature is None:
        self.feature = X.columns if is_dataframe(X) else ['x']

    # Default target name
    if self.target is None:
        self.target = ['y']

    self.draw(X, y, **kwargs)
    return self
def __init__(self, model, ax=None, x=None, y=None, features=None,
             classes=None, show_scatter=True, step_size=0.0025, markers=None,
             pcolormesh_alpha=0.8, scatter_alpha=1.0, encoder=None,
             is_fitted="auto", force_model=False, **kwargs):
    """
    Initialize the decision boundaries visualizer.

    Parameters
    ----------
    model, ax, classes, encoder, is_fitted, force_model :
        Forwarded to the parent initializer.

    x, y : optional
        The two features to plot; mutually exclusive with ``features``.

    features : sequence of length 2, optional
        The two features to plot.

    show_scatter, step_size, pcolormesh_alpha, scatter_alpha :
        Stored on the instance unchanged.

    markers : iterable, optional
        Marker styles that are cycled through. When None, a default
        sequence of seven markers is used.

    Raises
    ------
    YellowbrickValueError
        If both ``x``/``y`` and ``features`` are given, or if ``features``
        does not contain exactly two entries.
    """
    # NOTE(review): ``kwargs`` is accepted but not forwarded to the parent
    # initializer here; left unchanged - confirm whether that is intended.
    super(DecisionBoundariesVisualizer, self).__init__(
        model,
        ax=ax,
        classes=classes,
        encoder=encoder,
        is_fitted=is_fitted,
        force_model=force_model,
    )

    self.x = x
    self.y = y
    self.features_ = features
    self.estimator = model
    self.show_scatter = show_scatter
    self.step_size = step_size

    # ``markers`` is a named parameter and therefore can never be found in
    # ``kwargs``; use the argument directly so a caller's choice is honored.
    if markers is None:
        markers = (",", "o", "d", "*", "v", "h", "+")
    self.markers = itertools.cycle(markers)

    self.pcolormesh_alpha = pcolormesh_alpha
    self.scatter_alpha = scatter_alpha

    # these are set later
    self.Z = None
    self.Z_shape = None
    self.xx = None
    self.yy = None
    self.class_labels = None

    if self.x is not None and self.y is not None and self.features_ is not None:
        raise YellowbrickValueError(
            "Please specify x,y or features, not both.")

    if self.x is not None and self.y is not None and self.features_ is None:
        self.features_ = [self.x, self.y]

    # Ensure with init that features doesn't have more than two features
    if features is not None:
        if len(features) != 2:
            raise YellowbrickValueError(
                "DecisionBoundariesVisualizer only accepts two features.")
def __init__(
        self,
        model,
        x=None,
        y=None,
        features=None,
        show_scatter=True,
        step_size=0.0025,
        markers=None,
        pcolormesh_alpha=0.8,
        scatter_alpha=1.0,
        *args,
        **kwargs):
    """
    Pass in a unfitted model to generate a decision boundaries
    visualization.

    Parameters
    ----------
    model : object
        Forwarded to the parent initializer and stored as ``estimator``.

    x, y : optional
        The two features to plot; mutually exclusive with ``features``.

    features : sequence of length 2, optional
        The two features to plot.

    show_scatter, step_size, pcolormesh_alpha, scatter_alpha :
        Stored on the instance unchanged.

    markers : iterable, optional
        Marker styles that are cycled through. When None, a default
        sequence of seven markers is used.

    args, kwargs :
        Forwarded to the parent initializer.

    Raises
    ------
    YellowbrickValueError
        If both ``x``/``y`` and ``features`` are given, or if ``features``
        does not contain exactly two entries.
    """
    super(DecisionBoundariesVisualizer, self).__init__(model, *args, **kwargs)

    self.x = x
    self.y = y
    self.features_ = features
    self.estimator = model
    self.show_scatter = show_scatter
    self.step_size = step_size

    # ``markers`` is a named parameter and therefore can never be found in
    # ``kwargs``; use the argument directly so a caller's choice is honored.
    if markers is None:
        markers = (',', 'o', 'd', '*', 'v', 'h', '+')
    self.markers = itertools.cycle(markers)

    self.pcolormesh_alpha = pcolormesh_alpha
    self.scatter_alpha = scatter_alpha

    # these are set later
    self.Z = None
    self.Z_shape = None
    self.xx = None
    self.yy = None
    self.class_labels = None

    if self.x is not None and self.y is not None and self.features_ is not None:
        raise YellowbrickValueError(
            'Please specify x,y or features, not both.')

    if self.x is not None and self.y is not None and self.features_ is None:
        self.features_ = [self.x, self.y]

    # Ensure with init that features doesn't have more than two features
    if features is not None:
        if len(features) != 2:
            raise YellowbrickValueError(
                'DecisionBoundariesVisualizer only accepts two features.')
def draw(self, X, y=None):
    """
    Scatter the first two columns of X, colored according to the target
    color type stored on the visualizer.

    Parameters
    ----------
    X : array-like of shape (n, 2)
        The matrix produced by the ``transform()`` method.

    y : array-like of shape (n,), optional
        The target, used to specify the colors of the points. Required
        when the target color type is discrete or continuous.

    Returns
    -------
    self.ax : matplotlib Axes object
        Returns the axes that the scatter plot was drawn on.

    Raises
    ------
    YellowbrickValueError
        If y is missing for a discrete or continuous target.

    NotFitted
        If the target color type is not one of the known values.
    """
    options = {"alpha": self.alpha}
    color_type = self._target_color_type

    if color_type == SINGLE:
        # One fixed color for every point
        options["c"] = "b"

    elif color_type == DISCRETE:
        if y is None:
            raise YellowbrickValueError(
                "y is required for discrete target")

        # Look up one color per point by locating its class in classes_
        point_colors = []
        for label in y:
            position = np.searchsorted(self.classes_, (label))
            point_colors.append(self._colors[position])
        options["c"] = point_colors

    elif color_type == CONTINUOUS:
        if y is None:
            raise YellowbrickValueError(
                "y is required for continuous target")

        # TODO manually make colorbar so we can draw it in finalize
        options["c"] = y
        options["cmap"] = self.colors or palettes.DEFAULT_SEQUENCE

    else:
        # Technically this should never be raised
        raise NotFitted("could not determine target color type")

    # Keep a handle on the drawn collection
    self._scatter = self.ax.scatter(X[:, 0], X[:, 1], **options)
    return self.ax
def __init__(
    self,
    model,
    ax=None,
    k=10,
    metric="distortion",
    timings=True,
    locate_elbow=True,
    **kwargs
):
    """
    Initialize the elbow visualizer, resolving the scoring metric and the
    list of K values to evaluate.

    Parameters
    ----------
    model, ax :
        Forwarded to the parent initializer.

    k : int, tuple of two ints, or iterable of ints, default: 10
        An integer yields ``range(2, k + 1)``; a 2-tuple of integers is
        expanded with ``range(*k)``; any other iterable of integers is
        used as given.

    metric : str, default: "distortion"
        Key into ``KELBOW_SCOREMAP``.

    timings, locate_elbow : bool, default: True
        Stored on the instance unchanged.

    kwargs : dict
        Forwarded to the parent initializer.

    Raises
    ------
    YellowbrickValueError
        If ``metric`` is unknown or ``k`` cannot be interpreted.
    """
    super(KElbowVisualizer, self).__init__(model, ax=ax, **kwargs)

    # Get the scoring method
    if metric not in KELBOW_SCOREMAP:
        # The placeholder was previously never filled in; include the
        # rejected metric name in the message.
        raise YellowbrickValueError(
            "'{}' is not a defined metric "
            "use one of distortion, silhouette, or calinski_harabasz".format(
                metric
            )
        )

    # Store the arguments
    self.scoring_metric = KELBOW_SCOREMAP[metric]
    self.metric = metric
    self.timings = timings
    self.locate_elbow = locate_elbow

    # Convert K into a tuple argument if an integer
    if isinstance(k, int):
        self.k_values_ = list(range(2, k + 1))
    elif (
        isinstance(k, tuple)
        and len(k) == 2
        and all(isinstance(x, (int, np.integer)) for x in k)
    ):
        self.k_values_ = list(range(*k))
    elif isinstance(k, Iterable) and all(
        isinstance(x, (int, np.integer)) for x in k
    ):
        self.k_values_ = list(k)
    else:
        raise YellowbrickValueError(
            (
                "Specify an iterable of integers, a range, or maximal K value,"
                " the value '{}' is not a valid argument for K.".format(k)
            )
        )

    # Holds the values of the silhouette scores
    self.k_scores_ = None

    # Set Default Elbow Value
    self.elbow_value_ = None
def score(self, X, y):
    """
    Generates a 2D array of prediction counts where each row is a true
    class and each column is a predicted class, then draws the plot.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    Returns
    -------
    score_ : float
        Global accuracy score

    Raises
    ------
    YellowbrickValueError
        If the target type is neither binary nor multiclass.

    NotImplementedError
        If the parent ``score`` raises ``ModelError`` and fewer labels
        were specified than appear in the data.

    ModelError
        Re-raised from the parent ``score``, or raised when more labels
        were specified than appear in the data.
    """
    # Must be computed before calling super
    # We're relying on predict to raise NotFitted
    y_pred = self.predict(X)

    y_type, y_true, y_pred = _check_targets(y, y_pred)
    if y_type not in ("binary", "multiclass"):
        raise YellowbrickValueError("{} is not supported".format(y_type))

    # Get the indices of the unique labels
    indices = unique_labels(y_true, y_pred)
    labels = self._labels()

    # Call super to compute self.score_ and verify classes
    try:
        super(ClassPredictionError, self).score(X, y)
    except ModelError:
        # raise visualizer-specific errors
        if labels is not None and len(labels) < len(indices):
            raise NotImplementedError(
                "filtering classes is currently not supported"
            )
        raise

    # Ensure all labels are used
    if labels is not None and len(labels) > len(indices):
        raise ModelError(
            "y and y_pred contain zero values for one of the specified classes"
        )

    # Create a table of predictions whose rows are the true classes
    # and whose columns are the predicted classes; each element
    # is the count of predictions for that class that match the true
    # value of that class. The validated ``y_true`` is used for the mask
    # so that plain lists (for which ``y == label`` is a scalar) work too.
    self.predictions_ = np.array(
        [
            [(y_pred[y_true == label_t] == label_p).sum() for label_p in indices]
            for label_t in indices
        ]
    )

    self.draw()
    return self.score_
def lax(self):
    """
    Build and return the legend axes: an inset axes on ``self.ax`` sized
    by ``legend_size`` and placed at ``legend_loc``, with the frame, face
    color, grid, ticks and spines all turned off so that a legend can be
    drawn on an otherwise invisible axes.

    Raises
    ------
    YellowbrickValueError
        If ``inset_locator`` is not available.
    """
    if inset_locator is None:
        raise YellowbrickValueError((
            "intercluster distance map legend requires matplotlib 2.0.2 or "
            "later please upgrade matplotlib or set legend=False "))

    size = self.legend_size
    legend_axes = inset_locator.inset_axes(
        self.ax, width=size, height=size, loc=self.legend_loc,
    )

    # Strip every visible decoration from the inset
    legend_axes.set_frame_on(False)
    legend_axes.set_facecolor("none")
    legend_axes.grid(False)

    # Fixed data limits with no tick marks on either axis
    legend_axes.set_xlim(-1.4, 1.4)
    legend_axes.set_ylim(-1.4, 1.4)
    legend_axes.set_xticks([])
    legend_axes.set_yticks([])

    for side in legend_axes.spines:
        legend_axes.spines[side].set_visible(False)

    return legend_axes
def make_transformer(self, decompose='svd', decompose_by=50, tsne_kwargs=None):
    """
    Creates an internal transformer pipeline to project the data set into
    2D space using TSNE, applying an pre-decomposition technique ahead of
    embedding if necessary. This method will reset the transformer on the
    class, and can be used to explore different decompositions.

    Parameters
    ----------
    decompose : string or None, default: ``'svd'``
        A preliminary decomposition is often used prior to TSNE to make
        the projection faster. Specify ``"svd"`` for sparse data or ``"pca"``
        for dense data. If decompose is None, the original data set will
        be used. The name is matched case-insensitively.

    decompose_by : int, default: 50
        Specify the number of components for preliminary decomposition,
        by default this is 50; the more components, the slower TSNE will be.

    tsne_kwargs : dict, optional
        Extra keyword arguments passed to the TSNE constructor.

    Returns
    -------
    transformer : Pipeline
        Pipelined transformer for TSNE projections

    Raises
    ------
    YellowbrickValueError
        If ``decompose`` is not a known decomposition name.
    """
    # TODO: detect decompose by inferring from sparse matrix or dense or
    # If number of features > 50 etc.
    decompositions = {
        'svd': TruncatedSVD,
        'pca': PCA,
    }

    # Avoid a shared mutable default argument
    if tsne_kwargs is None:
        tsne_kwargs = {}

    # Normalize once so that validation and lookup agree; previously the
    # check lower-cased the name but the lookup did not, so e.g. 'SVD'
    # passed validation and then failed with a KeyError.
    name = decompose.lower() if decompose else None

    if name is not None and name not in decompositions:
        raise YellowbrickValueError(
            "'{}' is not a valid decomposition, use {}, or None".format(
                decompose, ", ".join(decompositions.keys())))

    # Create the pipeline steps
    steps = []

    # Add the pre-decomposition
    if name is not None:
        klass = decompositions[name]
        steps.append((name, klass(n_components=decompose_by,
                                  random_state=self.random_state)))

    # Add the TSNE manifold
    steps.append(('tsne', TSNE(n_components=2,
                               random_state=self.random_state,
                               **tsne_kwargs)))

    # return the pipeline
    return Pipeline(steps)
def ClassPredictionErrorViz(self):
    """
    Draw a stacked bar chart of predicted-class counts per actual class
    and save it as a PDF.

    Reads ``self.y_true``, ``self.y_pred``, ``self.classes``, ``self.name``
    and ``self.path_to_save``. The figure is written to
    ``<path_to_save>/ClassPredictionError_<name>.pdf``.

    Returns
    -------
    ax : matplotlib Axes object
        The axes the chart was drawn on.

    Raises
    ------
    YellowbrickValueError
        If the target type is neither binary nor multiclass.
    """
    import os

    y_type, y_true, y_pred = _check_targets(self.y_true, self.y_pred)
    if y_type not in ("binary", "multiclass"):
        raise YellowbrickValueError("{} is not supported".format(y_type))

    # Get the indices of the unique labels
    indices = unique_labels(y_true, y_pred)

    # Rows are the true classes, columns the predicted classes. The
    # validated arrays are used so that plain list inputs work too.
    predictions_ = np.array([[
        (y_pred[y_true == label_t] == label_p).sum()
        for label_p in indices
    ] for label_t in indices])

    fig, ax = plt.subplots(ncols=1, nrows=1)
    legend_kws = {"bbox_to_anchor": (1.04, 0.5), "loc": "center left"}
    bar_stack(
        predictions_,
        ax,
        labels=list(self.classes),
        ticks=self.classes,
        legend_kws=legend_kws,
    )

    # Set the title
    ax.set_title("Class Prediction Error for {}".format(self.name))

    # Set the axes labels
    ax.set_xlabel("Actual Class")
    ax.set_ylabel("Number of Predicted Class")

    # Compute the ceiling for the y limit
    cmax = max(sum(predictions) for predictions in predictions_)
    ax.set_ylim(0, cmax + cmax * 0.1)

    # Ensure the legend fits on the figure
    fig.tight_layout(rect=[0, 0, 0.90, 1])

    # Join path components portably instead of concatenating with "/"
    filename = "ClassPredictionError_" + self.name + ".pdf"
    fig.savefig(os.path.join(self.path_to_save, filename))

    return ax
def __init__(self, estimator, ax=None, classes=None, cmap="YlOrRd",
             support=None, encoder=None, is_fitted="auto",
             force_model=False, **kwargs):
    """
    Initialize the classification report: set up the color map, decide
    which scores are displayed, and validate the ``support`` option.

    Parameters
    ----------
    estimator, ax, classes, encoder, is_fitted, force_model, kwargs :
        Forwarded to the parent initializer.

    cmap : default: "YlOrRd"
        Passed to ``color_sequence`` to build the color map.

    support : {None, True, False, 'percent', 'count'}, default: None
        When falsy, "support" is removed from the displayed scores.

    Raises
    ------
    YellowbrickValueError
        If ``support`` is not one of the accepted values.
    """
    parent_options = dict(
        ax=ax,
        classes=classes,
        encoder=encoder,
        is_fitted=is_fitted,
        force_model=force_model,
    )
    parent_options.update(kwargs)
    super(ClassificationReport, self).__init__(estimator, **parent_options)

    self.support = support

    # Build the color map and set its out-of-range colors
    self.cmap = heatmap = color_sequence(cmap)
    heatmap.set_over(color=CMAP_OVERCOLOR)
    heatmap.set_under(color=CMAP_UNDERCOLOR)

    # Start from every known score key
    self._displayed_scores = list(SCORES_KEYS)

    accepted = {None, True, False, "percent", "count"}
    if support not in accepted:
        raise YellowbrickValueError(
            "'{}' is an invalid argument for support, use None, True, "
            "False, 'percent', or 'count'".format(support))

    if not support:
        self._displayed_scores.remove("support")
def __init__(
    self,
    ax=None,
    tagset="penn_treebank",
    colormap=None,
    colors=None,
    frequency=False,
    stack=False,
    parser=None,
    **kwargs,
):
    """
    Initialize the part-of-speech tag visualizer.

    Parameters
    ----------
    ax, kwargs :
        Forwarded to the parent initializer.

    tagset : str, default: "penn_treebank"
        Must be a member of ``TAGSET_NAMES``.

    colormap, colors, frequency, stack, parser :
        Stored on the instance unchanged.

    Raises
    ------
    YellowbrickValueError
        If ``tagset`` is not a known tagset.
    """
    super(PosTagVisualizer, self).__init__(ax=ax, **kwargs)

    self.tagset_names = TAGSET_NAMES

    # Reject unknown tagsets up front, listing the valid choices
    if tagset not in self.tagset_names:
        choices = ", ".join(self.tagset_names.keys())
        raise YellowbrickValueError(
            "'{}' is an invalid tagset. Please choose one of {}.".format(
                tagset, choices))
    self.tagset = tagset

    self.punct_tags = frozenset(PUNCT_TAGS)

    # Remaining options are kept verbatim
    self.frequency = frequency
    self.colormap = colormap
    self.colors = colors
    self.stack = stack
    self.parser = parser
def manifold(self, transformer):
    """
    Creates the manifold estimator if a string value is passed in,
    validates other objects passed in.

    Parameters
    ----------
    transformer : str or estimator
        Either a key of ``self.ALGORITHMS`` or an object for which
        ``is_estimator`` returns True; estimators are stored as given.

    Raises
    ------
    YellowbrickValueError
        If ``transformer`` is not an estimator and not a known algorithm.
    """
    if not is_estimator(transformer):
        if transformer not in self.ALGORITHMS:
            # Use a ``{}`` placeholder: the previous ``%s`` was never
            # substituted by ``str.format``.
            raise YellowbrickValueError(
                "could not create manifold for '{}'".format(
                    str(transformer)))

        # Create a new transformer with the specified params
        self._name = MANIFOLD_NAMES[transformer]
        transformer = clone(self.ALGORITHMS[transformer])

        # Only pass along the parameters this transformer supports
        supported = transformer.get_params()
        candidates = {
            "n_components": 2,
            "n_neighbors": self.n_neighbors,
            "random_state": self.random_state,
        }
        params = {
            key: value for key, value in candidates.items()
            if key in supported
        }
        transformer.set_params(**params)

    self._manifold = transformer
    if self._name is None:
        self._name = self._manifold.__class__.__name__
def fit(self, X, y=None):
    """
    Fit the classification model. For a multiclass ``y`` the estimator is
    wrapped in a ``OneVsRestClassifier`` and fit against a binarized
    target; for a binary ``y`` the estimator is fit on ``y`` directly.

    Returns
    -------
    The result of the parent ``fit`` call.

    Raises
    ------
    YellowbrickValueError
        If the target type is neither binary nor multiclass.
    """
    # The target determines what kind of estimator is fit
    target_kind = type_of_target(y)
    self._target_labels = np.unique(y)

    if not (target_kind.startswith(MULTICLASS)
            or target_kind.startswith(BINARY)):
        raise YellowbrickValueError(
            ("{} does not support target type '{}', "
             "please provide a binary or multiclass single-output target"
             ).format(self.__class__.__name__, target_kind))

    if target_kind.startswith(MULTICLASS):
        self.target_type_ = MULTICLASS
        self.estimator = OneVsRestClassifier(self.estimator)

        # Use label_binarize to create multi-label output for
        # OneVsRestClassifier
        fit_target = label_binarize(y, classes=self._target_labels)
    else:
        # Binary targets are passed through untransformed
        fit_target = y
        self.target_type_ = BINARY

    # Fit the model and return self
    return super(PrecisionRecallCurve, self).fit(X, fit_target)
def __init__(self, model, param_name, param_range, ax=None, logx=False,
             groups=None, cv=None, scoring=None, n_jobs=1,
             pre_dispatch="all", **kwargs):
    """
    Initialize the validation curve visualizer.

    Parameters
    ----------
    model, ax, kwargs :
        Forwarded to the parent initializer.

    param_range : array-like
        Converted with ``np.asarray``; must be one-dimensional.

    param_name, logx, groups, cv, scoring, n_jobs, pre_dispatch :
        Stored on the visualizer through ``set_params``.

    Raises
    ------
    YellowbrickValueError
        If ``param_range`` is not one-dimensional.
    """
    # Initialize the model visualizer
    super(ValidationCurve, self).__init__(model, ax=ax, **kwargs)

    # Validate the param_range
    param_range = np.asarray(param_range)
    if param_range.ndim != 1:
        message = "must specify array of param values, '{}' is not valid"
        raise YellowbrickValueError(message.format(repr(param_range)))

    # Set the visual and validation curve parameters on the estimator
    settings = dict(
        param_name=param_name,
        param_range=param_range,
        logx=logx,
        groups=groups,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        pre_dispatch=pre_dispatch,
    )
    self.set_params(**settings)
def resolve_colors(n_colors=None, colormap=None, colors=None):
    """
    Build a list of colors from the common color arguments and fit it to
    the requested length.

    Precedence: an explicit ``colors`` iterable wins (with a warning if a
    ``colormap`` was also given); otherwise ``colormap`` is sampled evenly
    on [0, 1]; otherwise the current color cycle is used.

    Parameters
    ----------
    n_colors : int, default: None
        Specify the length of the list of returned colors, which will
        either truncate or repeat the colors available. If None the
        length of the colors will not be modified.

    colormap : str, default: None
        The name of the matplotlib color map with which to generate colors.

    colors : iterable, default: None
        A collection of colors to use specifically with the plot.

    Returns
    -------
    colors : list
        A list of colors that can be used in matplotlib plots.

    Raises
    ------
    YellowbrickValueError
        If the named colormap lookup raises ``ValueError``.

    Notes
    -----
    This function was originally based on a similar function in the pandas
    plotting library that has been removed in the new version of the
    library.
    """
    if colors is not None:
        # Explicit colors take priority over any colormap
        if colormap is not None:
            warnings.warn("both colormap and colors specified; using colors")
        palette = list(colors)

    elif colormap is not None:
        # Resolve a colormap name into a colormap object
        if isinstance(colormap, string_types):
            try:
                colormap = cm.get_cmap(colormap)
            except ValueError as e:
                raise YellowbrickValueError(e)

        n_colors = n_colors or len(get_color_cycle())
        palette = [colormap(stop) for stop in np.linspace(0, 1, num=n_colors)]

    else:
        # Nothing specified: fall back on the default cycle
        palette = get_color_cycle()

    # Already the right length, or no particular length requested
    if n_colors is None or len(palette) == n_colors:
        return palette

    # Truncate or wrap around to reach the requested length
    return [palette[idx % len(palette)] for idx in np.arange(n_colors)]
def layout(self, divider=None):
    """
    Create the colorbar axes to the right of the scatterplot. This only
    happens for a continuous target drawn in a 2D projection with the
    colorbar enabled and no colorbar axes created yet. Subclasses can
    override this method to add other axes or layouts.

    Parameters
    ----------
    divider: AxesDivider
        An AxesDivider to be passed among all layout calls. When None,
        one is created from ``self.ax``.

    Raises
    ------
    YellowbrickValueError
        If ``make_axes_locatable`` is not available.
    """
    needs_colorbar_axes = (
        self._target_color_type == TargetType.CONTINUOUS
        and self.projection == 2
        and self.colorbar
        and self._cax is None
    )
    if not needs_colorbar_axes:
        return

    # Ensure matplotlib version compatibility
    if make_axes_locatable is None:
        raise YellowbrickValueError(
            ("Colorbar requires matplotlib 2.0.2 or greater "
             "please upgrade matplotlib"))

    # Create the new axes for the colorbar
    if divider is None:
        divider = make_axes_locatable(self.ax)

    colorbar_axes = divider.append_axes("right", size="5%", pad=0.3)
    self._cax = colorbar_axes
    colorbar_axes.set_yticks([])
    colorbar_axes.set_xticks([])
def __init__(self, ax=None, method="pearson", labels=None, sort=False,
             feature_index=None, feature_names=None, color=None, **kwargs):
    """
    Initialize the feature correlation visualizer.

    Parameters
    ----------
    ax : optional
        Forwarded to the parent initializer.

    method : str, default: "pearson"
        Must be a member of ``CORRELATION_LABELS``.

    labels, sort, feature_index, feature_names, color :
        Stored on the visualizer through ``set_params``.

    kwargs : dict
        Forwarded to the parent initializer.

    Raises
    ------
    YellowbrickValueError
        If ``method`` is not a known correlation method.
    """
    # Forward the caller's axes; previously ``ax=None`` was hard-coded,
    # which silently discarded any axes passed in.
    super(FeatureCorrelation, self).__init__(ax=ax, **kwargs)

    self.correlation_labels = CORRELATION_LABELS
    self.correlation_methods = CORRELATION_METHODS

    if method not in self.correlation_labels:
        raise YellowbrickValueError(
            "Method {} not implement; choose from {}".format(
                method, ", ".join(self.correlation_labels)))

    # Parameters
    self.set_params(
        sort=sort,
        color=color,
        method=method,
        labels=labels,
        feature_index=feature_index,
        feature_names=feature_names,
    )
def draw(self, points, target=None, **kwargs):
    """
    Called from the fit method, this method creates the canvas and
    draws the plot on it.

    Parameters
    ----------
    points : iterable of pairs
        The (x, y) coordinates to scatter.

    target : iterable, optional
        One class value per point, used to group the points by label.
        When None, every point is assigned to the first class.

    kwargs: generic keyword arguments.

    Returns
    -------
    self.ax : the axes the plot was drawn on.

    Raises
    ------
    YellowbrickValueError
        If the number of labels differs from the number of classes.
    """
    # Resolve the labels with the classes
    labels = self.labels if self.labels is not None else self.classes_
    if len(labels) != len(self.classes_):
        raise YellowbrickValueError(
            ("number of supplied labels ({}) does not "
             "match the number of classes ({})").format(
                len(labels), len(self.classes_)))

    # Create the color mapping for the labels.
    color_values = resolve_colors(n_colors=len(labels),
                                  colormap=self.colormap,
                                  colors=self.color)
    colors = dict(zip(labels, color_values))

    # Transform labels into a map of class to label
    labels = dict(zip(self.classes_, labels))

    # Define boundaries with a vertical line
    if self.annotate_docs:
        for xcoords in self.boundaries_:
            self.ax.axvline(x=xcoords, color="lightgray", linestyle="dashed")

    series = defaultdict(lambda: {"x": [], "y": []})

    if target is not None:
        for point, t in zip(points, target):
            label = labels[t]
            series[label]["x"].append(point[0])
            series[label]["y"].append(point[1])
    else:
        # Map the class to its display label so the key matches the
        # ``colors`` mapping even when custom labels were supplied.
        label = labels[self.classes_[0]]
        for x, y in points:
            series[label]["x"].append(x)
            series[label]["y"].append(y)

    # ``coords`` avoids shadowing the ``points`` parameter
    for label, coords in series.items():
        self.ax.scatter(
            coords["x"],
            coords["y"],
            marker="|",
            c=colors[label],
            zorder=100,
            label=label,
        )

    self.ax.set_yticks(list(range(len(self.indexed_words_))))
    self.ax.set_yticklabels(self.indexed_words_)

    return self.ax
def rank(self, X, algorithm=None):
    """
    Apply the selected ranking method to X and return its result.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    algorithm : str or None
        The ranking mechanism to use, or None for the default stored in
        ``ranking_``. Matched case-insensitively.

    Returns
    -------
    R : ndarray
        The mxm ranking matrix of the variables

    Raises
    ------
    YellowbrickValueError
        If the algorithm is not a key of ``ranking_methods``.
    """
    method_name = (algorithm or self.ranking_).lower()

    if method_name in self.ranking_methods:
        ranker = self.ranking_methods[method_name]
        return ranker(X)

    raise YellowbrickValueError(
        "'{}' is unrecognized ranking method".format(method_name))
def fit(self, X, **kwargs):
    """
    Validate X for the histogram, fill in a default feature name, then
    draw.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x 1
        A matrix of n instances with 1 feature

    kwargs: dict
        keyword arguments passed to Scikit-Learn API.

    Returns
    -------
    self : instance
        Returns the visualizer itself.

    Raises
    ------
    YellowbrickValueError
        If X is a DataFrame with more than one column.
    """
    # A DataFrame input may only carry a single column
    if is_dataframe(X):
        _, n_columns = X.shape
        if n_columns > 1:
            raise YellowbrickValueError((
                "X needs to be an ndarray or DataFrame with one feature, "
                "please select one feature from the DataFrame"
            ))

    # Default feature name: the DataFrame columns when available
    if self.feature is None:
        self.feature = X.columns if is_dataframe(X) else ['x']

    self.draw(X)
    return self
def draw(self, **kwargs):
    """
    Called from the fit method, this method draws the frequency
    distribution of the top ``N`` features as a bar plot, horizontally or
    vertically depending on ``orient``.

    Parameters
    ----------
    kwargs: generic keyword arguments.

    Returns
    -------
    self.ax : the axes the plot was drawn on.

    Raises
    ------
    YellowbrickValueError
        If ``orient`` is neither 'h' nor 'v'.
    """
    # Prepare the data
    positions = np.arange(self.N)
    top = self.sorted_[: self.N]
    words = [self.features[idx] for idx in top]

    # One frequency series per label, or a single "corpus" series
    if self.conditional_freqdist_:
        by_label = sorted(self.conditional_freqdist_.items(), key=itemgetter(0))
        freqs = {
            label: [values[idx] for idx in top] for label, values in by_label
        }
    else:
        freqs = {"corpus": [self.freqdist_[idx] for idx in top]}

    horizontal = self.orient == "h"
    if not horizontal and self.orient != "v":
        # Unknown state
        raise YellowbrickValueError("Orientation must be 'h' or 'v'")

    if horizontal:
        # Add one bar series per label
        for label, counts in freqs.items():
            self.ax.barh(
                positions, counts, label=label, color=self.color, align="center"
            )

        # Set the y ticks to the words
        self.ax.set_yticks(positions)
        self.ax.set_yticklabels(words)

        # Order the features from top to bottom on the y axis
        self.ax.invert_yaxis()

        # Turn off y grid lines and turn on x grid lines
        self.ax.yaxis.grid(False)
        self.ax.xaxis.grid(True)
    else:
        # Add one bar series per label
        for label, counts in freqs.items():
            self.ax.bar(
                positions, counts, label=label, color=self.color, align="edge"
            )

        # Set the x ticks to the words
        self.ax.set_xticks(positions)
        self.ax.set_xticklabels(words, rotation=90)

        # Turn off x grid lines and turn on y grid lines
        self.ax.yaxis.grid(True)
        self.ax.xaxis.grid(False)

    return self.ax
def _draw_projection_features(self, Xp, y): """ Draw the projection of features in the transformed space. Parameters ---------- Xp : array-like of shape (n, 2) or (n, 3) The matrix produced by the ``transform()`` method. y : array-like of shape (n,), optional The target, used to specify the colors of the points. Returns ------- self.ax : matplotlib Axes object Returns the axes that the scatter plot was drawn on. """ x_vector = self.pca_components_[0] y_vector = self.pca_components_[1] max_x = max(Xp[:, 0]) max_y = max(Xp[:, 1]) if self.projection == 2: for i in range(self.pca_components_.shape[1]): self.ax.arrow( x=0, y=0, dx=x_vector[i] * max_x, dy=y_vector[i] * max_y, color="r", head_width=0.05, width=0.005, ) self.ax.text( x_vector[i] * max_x * 1.05, y_vector[i] * max_y * 1.05, self.features_[i], color="r", ) elif self.projection == 3: z_vector = self.pca_components_[2] max_z = max(Xp[:, 1]) for i in range(self.pca_components_.shape[1]): self.ax.plot( [0, x_vector[i] * max_x], [0, y_vector[i] * max_y], [0, z_vector[i] * max_z], color="r", ) self.ax.text( x_vector[i] * max_x * 1.05, y_vector[i] * max_y * 1.05, z_vector[i] * max_z * 1.05, self.features_[i], color="r", ) else: raise YellowbrickValueError("Projection dimensions must be either 2 or 3") return self.ax
def rank(self, X, algorithm=None):
    """
    Returns the feature ranking.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    algorithm : str or None
        The ranking mechanism to use, or None for the default stored in
        ``ranking_``. Matched case-insensitively.

    Returns
    -------
    ranks : ndarray
        An n-dimensional, symmetric array of rank scores, where n is the
        number of features. E.g. for 1D ranking, it is (n,), for a
        2D ranking it is (n,n) and so forth.

    Raises
    ------
    YellowbrickValueError
        If the algorithm is not a key of ``ranking_methods``.
    """
    algorithm = algorithm or self.ranking_
    algorithm = algorithm.lower()

    if algorithm not in self.ranking_methods:
        raise YellowbrickValueError(
            "'{}' is unrecognized ranking method".format(algorithm))

    # Extract matrix from dataframe if necessary. ``DataFrame.as_matrix``
    # was removed in pandas 1.0; ``.values`` is the equivalent accessor.
    if is_dataframe(X):
        X = X.values

    return self.ranking_methods[algorithm](X)
def fit(self, y, **kwargs):
    """
    Validate that ``y`` is one-dimensional, fill in a default target
    name, then draw.

    Parameters
    ----------
    y : an array of one dimension or a pandas Series

    kwargs : dict
        keyword arguments passed to scikit-learn API.

    Returns
    -------
    self : instance
        Returns the visualizer itself.

    Raises
    ------
    YellowbrickValueError
        If ``y`` has more than one dimension.
    """
    one_dimensional = not y.ndim > 1
    if not one_dimensional:
        raise YellowbrickValueError(
            "y needs to be an array or Series with one dimension"
        )

    # Fall back on a generic target name
    self.target = "y" if self.target is None else self.target

    self.draw(y)
    return self
def _determine_target_color_type(self, y):
    """
    Decide how points are colored and store the result in
    ``_target_color_type``:

    - if y is None: a single color is used
    - if target is "auto": discrete when y has fewer than 10 unique
      values, continuous otherwise
    - otherwise: the supplied target type is used as is

    Raises
    ------
    YellowbrickValueError
        If the resulting type is not one of the known target types.
    """
    if y is None:
        color_type = SINGLE
    elif self.target == "auto":
        # NOTE: See #73 for a generalization to use when implemented
        few_values = len(np.unique(y)) < 10
        color_type = DISCRETE if few_values else CONTINUOUS
    else:
        color_type = self.target

    self._target_color_type = color_type

    known_types = {SINGLE, DISCRETE, CONTINUOUS}
    if self._target_color_type not in known_types:
        raise YellowbrickValueError(
            ("could not determine target color type "
             "from target='{}' to '{}'").format(self.target,
                                                self._target_color_type))
def __init__(self, ax=None, labels=None, classes=None, colors=None,
             colormap=None, random_state=None, alpha=0.7, **kwargs):
    """
    Initialize the UMAP visualizer and build its transformer.

    Parameters
    ----------
    ax : optional
        Forwarded to the parent initializer.

    labels, colors, colormap, random_state, alpha :
        Stored on the instance unchanged.

    classes : optional
        Accepted for interface compatibility; not used in this method.

    kwargs : dict
        Keys that match a UMAP parameter are removed and passed to
        ``make_transformer``; the remainder is forwarded to the parent
        initializer.

    Raises
    ------
    YellowbrickValueError
        If the ``umap`` package is not available.
    """
    if UMAP is None:
        # A space separates the two sentences (the adjacent literals were
        # previously concatenated as "installed.Please").
        raise YellowbrickValueError(
            ("umap package doesn't seem to be installed. "
             "Please install UMAP via: pip install umap-learn"))

    # Visual Parameters
    self.alpha = alpha
    self.labels = labels
    self.colors = colors
    self.colormap = colormap
    self.random_state = random_state

    # Fetch UMAP kwargs from kwargs by popping only keys belonging to UMAP params
    umap_kwargs = {
        key: kwargs.pop(key)
        for key in UMAP().get_params()
        if key in kwargs
    }

    # UMAP doesn't require any pre-processing before embedding and thus doesn't
    # require a pipeline.
    self.transformer_ = self.make_transformer(umap_kwargs)

    # Call super at the end so that size and title are set correctly
    super(UMAPVisualizer, self).__init__(ax=ax, **kwargs)
def fit(self, X, y=None, **kwargs):
    """
    The fit method is the primary drawing input for the scatter
    visualization since it has both the X and y data required for the
    viz and the transform method does not.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances. Either it has exactly two columns, or
        the two columns named by ``features_`` are selected from it.

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    -------
    self : instance
        Returns the instance of the transformer/visualizer

    Raises
    ------
    YellowbrickValueError
        If X has more than two columns and the two features to plot
        cannot be selected from it.
    """
    # NOTE(review): this unpacking requires a two-dimensional shape, so a
    # one-dimensional structured array would fail here before reaching
    # its branch below - confirm against is_structured_array callers.
    _, ncols = X.shape

    if ncols == 2:
        X_two_cols = X
        if self.features_ is None:
            self.features_ = ["Feature One", "Feature Two"]

    # Select the named columns from a DataFrame. ``DataFrame.as_matrix``
    # was removed in pandas 1.0; ``.values`` is the equivalent accessor.
    elif self.features_ is not None and is_dataframe(X):
        X_two_cols = X[self.features_].values

    # handle numpy named/ structured array
    elif self.features_ is not None and is_structured_array(X):
        X_selected = X[self.features_]
        X_two_cols = X_selected.view(
            (np.float64, len(X_selected.dtype.names)))

    # handle features that are numeric columns in ndarray matrix
    elif self.features_ is not None and has_ndarray_int_columns(
            self.features_, X):
        f_one, f_two = self.features_
        X_two_cols = X[:, [int(f_one), int(f_two)]]

    else:
        raise YellowbrickValueError(
            "ScatterVisualizer only accepts two features, please "
            "explicitly set these two features in the init kwargs or "
            "pass a matrix/ dataframe in with only two columns.")

    # Store the classes for the legend if they're None.
    if self.classes_ is None:
        # TODO: Is this the most efficient method?
        self.classes_ = [str(label) for label in np.unique(y)]

    # Draw the instances
    self.draw(X_two_cols, y, **kwargs)

    # Fit always returns self.
    return self