def fit(self, X, **kwargs):
    """
    Validates the single-feature input for the histogram, resolves the
    feature label and then calls draw.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x 1
        A matrix of n instances with 1 feature

    kwargs: dict
        Accepted for API compatibility; not forwarded to ``draw`` here.

    Returns
    -------
    self : instance
        The visualizer itself.

    Raises
    ------
    YellowbrickValueError
        If X is a DataFrame with more than one column.
    """
    # A DataFrame is only acceptable when it carries one feature column.
    if is_dataframe(X):
        _, n_features = X.shape
        if n_features > 1:
            raise YellowbrickValueError((
                "X needs to be an ndarray or DataFrame with one feature, "
                "please select one feature from the DataFrame"
            ))

    # Without a user supplied name, use the frame's columns or a
    # generic placeholder label.
    if self.feature is None:
        self.feature = X.columns if is_dataframe(X) else ['x']

    self.draw(X)
    return self
def fit(self, X, y, **kwargs):
    """
    Validates X and y for the jointplot, resolves the feature and target
    labels and then calls draw.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x 1
        A matrix of n instances with 1 feature

    y : ndarray or Series of length n
        An array or series of the target value

    kwargs: dict
        Keyword arguments forwarded to ``draw``.

    Returns
    -------
    self : instance
        The visualizer itself.

    Raises
    ------
    YellowbrickValueError
        If X is a DataFrame with more than one column, or if y is None.
    """
    # Only a single feature column is allowed in a DataFrame.
    if is_dataframe(X):
        _, n_features = X.shape
        if n_features > 1:
            raise YellowbrickValueError((
                "X needs to be an ndarray or DataFrame with one feature, "
                "please select one feature from the DataFrame"
            ))

    # The target is mandatory for a joint plot.
    if y is None:
        raise YellowbrickValueError((
            "Joint plots are useful for classification and regression "
            "problems, which require a target variable"
        ))

    # Default labels: frame columns (or 'x') for the feature, 'y' for
    # the target.
    if self.feature is None:
        self.feature = X.columns if is_dataframe(X) else ['x']

    if self.target is None:
        self.target = ['y']

    self.draw(X, y, **kwargs)
    return self
def fit(self, X, y=None, **kwargs):
    """
    The fit method is the primary drawing input for the visualization
    since it has both the X and y data required for the viz and the
    transform method does not.

    Stores the raw data on the instance (``self.X`` as an array,
    ``self.y`` as given), takes feature names from a DataFrame's columns
    when none are set, and then delegates to the parent ``fit``.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    -------
    self : instance
        Returns the instance of the transformer/visualizer
    """
    if is_dataframe(X):
        self.X = X.values
        if self.features_ is None:
            self.features_ = X.columns
    else:
        self.X = X

    self.y = y

    super(MissingDataVisualizer, self).fit(X, y, **kwargs)

    # Previously the method fell off the end and returned None, which
    # contradicted the documented contract and broke call chaining.
    return self
def fit(self, X, y=None, **kwargs):
    """
    The fit method is the primary drawing input for the visualization
    since it has both the X and y data required for the viz and the
    transform method does not.

    Stores the raw data on the instance (``self.X`` as an array,
    ``self.y`` as given), takes feature names from a DataFrame's columns
    when none are set, and then delegates to the parent ``fit``.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    -------
    self : instance
        Returns the instance of the transformer/visualizer
    """
    if is_dataframe(X):
        self.X = X.values
        if self.features_ is None:
            self.features_ = X.columns
    else:
        self.X = X

    self.y = y

    super(MissingDataVisualizer, self).fit(X, y, **kwargs)

    # Previously the method fell off the end and returned None, which
    # contradicted the documented contract and broke call chaining.
    return self
def rank(self, X, algorithm=None):
    """
    Returns the feature ranking.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    algorithm : str or None
        The ranking mechanism to use, or None for the default
        (``self.ranking_``). Matching is case-insensitive.

    Returns
    -------
    ranks : ndarray
        An n-dimensional, symmetric array of rank scores, where n is the
        number of features. E.g. for 1D ranking, it is (n,), for a
        2D ranking it is (n,n) and so forth.

    Raises
    ------
    YellowbrickValueError
        If the algorithm name is not a key of ``self.ranking_methods``.
    """
    algorithm = (algorithm or self.ranking_).lower()

    if algorithm not in self.ranking_methods:
        raise YellowbrickValueError(
            "'{}' is unrecognized ranking method".format(algorithm))

    # Extract matrix from dataframe if necessary. ``DataFrame.as_matrix``
    # was deprecated and then removed from pandas; ``.values`` is the
    # equivalent that works across versions.
    if is_dataframe(X):
        X = X.values

    return self.ranking_methods[algorithm](X)
def rank(self, X, algorithm=None):
    """
    Returns the feature ranking.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    algorithm : str or None
        The ranking mechanism to use, or None for the default
        (``self.ranking_``). Matching is case-insensitive.

    Returns
    -------
    ranks : ndarray
        An n-dimensional, symmetric array of rank scores, where n is the
        number of features. E.g. for 1D ranking, it is (n,), for a
        2D ranking it is (n,n) and so forth.

    Raises
    ------
    YellowbrickValueError
        If the algorithm name is not a key of ``self.ranking_methods``.
    """
    algorithm = (algorithm or self.ranking_).lower()

    if algorithm not in self.ranking_methods:
        raise YellowbrickValueError(
            "'{}' is unrecognized ranking method".format(algorithm)
        )

    # Extract matrix from dataframe if necessary. ``DataFrame.as_matrix``
    # was deprecated and then removed from pandas; ``.values`` is the
    # equivalent that works across versions.
    if is_dataframe(X):
        X = X.values

    return self.ranking_methods[algorithm](X)
def fit(self, X, y=None, **kwargs):
    """
    The fit method is the primary drawing input for the scatter
    visualization since it has both the X and y data required for the
    viz and the transform method does not.

    Selects the two columns to plot, records the class labels for the
    legend and calls draw.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with 2 features, or a wider input from
        which ``self.features_`` names the two columns to use.

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    -------
    self : instance
        Returns the instance of the transformer/visualizer

    Raises
    ------
    YellowbrickValueError
        If X has more than two columns and the two features to plot
        cannot be resolved from ``self.features_``.
    """
    # A structured array is 1-d (shape ``(n,)``), so unpacking its shape
    # as two values would raise before the structured branch below could
    # run. Count its fields instead.
    if len(X.shape) == 1 and getattr(X.dtype, "names", None) is not None:
        ncols = len(X.dtype.names)
    else:
        _, ncols = X.shape

    if ncols == 2:
        X_two_cols = X
        if self.features_ is None:
            self.features_ = ["Feature One", "Feature Two"]

    # Select the named columns from a dataframe. ``as_matrix`` has been
    # removed from pandas, ``.values`` is the portable equivalent.
    elif self.features_ is not None and is_dataframe(X):
        X_two_cols = X[self.features_].values

    # handle numpy named/ structured array
    elif self.features_ is not None and is_structured_array(X):
        X_selected = X[self.features_]
        # Copy before reinterpreting the bytes: NumPy warns about taking
        # a view directly on a multi-field selection.
        # NOTE(review): assumes every selected field is float64 - confirm.
        X_two_cols = X_selected.copy().view(
            (np.float64, len(X_selected.dtype.names)))

    # handle features that are numeric columns in ndarray matrix
    elif self.features_ is not None and has_ndarray_int_columns(
            self.features_, X):
        f_one, f_two = self.features_
        X_two_cols = X[:, [int(f_one), int(f_two)]]

    else:
        raise YellowbrickValueError(""" ScatterVisualizer only accepts two features, please explicitly set these two features in the init kwargs or pass a matrix/ dataframe in with only two columns.""")

    # Store the classes for the legend if they're None.
    if self.classes_ is None:
        # TODO: Is this the most efficient method?
        self.classes_ = [str(label) for label in np.unique(y)]

    # Draw the instances
    self.draw(X_two_cols, y, **kwargs)

    # Fit always returns self.
    return self
def fit(self, X, y=None, **kwargs):
    """
    The fit method is the primary drawing input for the scatter
    visualization since it has both the X and y data required for the
    viz and the transform method does not.

    Selects the two columns to plot, records the class labels for the
    legend and calls draw.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with 2 features, or a wider input from
        which ``self.features_`` names the two columns to use.

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    -------
    self : instance
        Returns the instance of the transformer/visualizer

    Raises
    ------
    YellowbrickValueError
        If X has more than two columns and the two features to plot
        cannot be resolved from ``self.features_``.
    """
    # A structured array is 1-d (shape ``(n,)``), so unpacking its shape
    # as two values would raise before the structured branch below could
    # run. Count its fields instead.
    if len(X.shape) == 1 and getattr(X.dtype, "names", None) is not None:
        ncols = len(X.dtype.names)
    else:
        _, ncols = X.shape

    if ncols == 2:
        X_two_cols = X
        if self.features_ is None:
            self.features_ = ["Feature One", "Feature Two"]

    # Select the named columns from a dataframe. ``as_matrix`` has been
    # removed from pandas, ``.values`` is the portable equivalent.
    elif self.features_ is not None and is_dataframe(X):
        X_two_cols = X[self.features_].values

    # handle numpy named/ structured array
    elif self.features_ is not None and is_structured_array(X):
        X_selected = X[self.features_]
        # The copy detaches the selection before its bytes are
        # reinterpreted as a plain float matrix.
        # NOTE(review): assumes every selected field is float64 - confirm.
        X_two_cols = X_selected.copy().view(
            (np.float64, len(X_selected.dtype.names)))

    # handle features that are numeric columns in ndarray matrix
    elif self.features_ is not None and has_ndarray_int_columns(
            self.features_, X):
        f_one, f_two = self.features_
        X_two_cols = X[:, [int(f_one), int(f_two)]]

    else:
        raise YellowbrickValueError(""" ScatterVisualizer only accepts two features, please explicitly set these two features in the init kwargs or pass a matrix/ dataframe in with only two columns.""")

    # Store the classes for the legend if they're None.
    if self.classes_ is None:
        # TODO: Is this the most efficient method?
        self.classes_ = [str(label) for label in np.unique(y)]

    # Draw the instances
    self.draw(X_two_cols, y, **kwargs)

    # Fit always returns self.
    return self
def fit(self, X, y=None, **kwargs):
    """
    Fits the estimator to discover the feature importances described by
    the data, then draws those importances as a bar plot.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Keyword arguments passed to the fit method of the estimator.

    Returns
    -------
    self : visualizer
        The fit method must always return self to support pipelines.
    """
    super(FeatureImportances, self).fit(X, y, **kwargs)

    # Get the feature importances from the model
    self.feature_importances_ = self._find_importances_param()

    # Apply absolute value filter before normalization
    if self.absolute:
        self.feature_importances_ = np.abs(self.feature_importances_)

    # Normalize features relative to the maximum
    if self.relative:
        # Scale by the largest magnitude: with signed values (e.g.
        # negative coefficients) the plain maximum can be negative or
        # smaller than the largest magnitude, which flips signs or
        # yields values beyond +/-100.
        maxv = np.abs(self.feature_importances_).max()
        # Build a new array instead of dividing in place: the array may
        # be the very object owned by the estimator (NOTE(review):
        # depends on ``_find_importances_param`` - confirm), and in-place
        # true division fails on integer arrays.
        self.feature_importances_ = self.feature_importances_ / maxv * 100.0

    # Create labels for the feature importances
    # NOTE: this code is duplicated from MultiFeatureVisualizer
    if self.labels is None:
        # Use column names if a dataframe
        if is_dataframe(X):
            self.features_ = np.array(X.columns)
        # Otherwise use the column index as the labels
        else:
            _, ncols = X.shape
            self.features_ = np.arange(0, ncols)
    else:
        self.features_ = np.array(self.labels)

    # Sort the features and their importances (ascending importance)
    sort_idx = np.argsort(self.feature_importances_)
    self.features_ = self.features_[sort_idx]
    self.feature_importances_ = self.feature_importances_[sort_idx]

    # Draw the feature importances
    self.draw()
    return self
def fit(self, X, y=None, **kwargs):
    """
    The fit method is the primary drawing input for the visualization
    since it has both the X and y data required for the viz and the
    transform method does not.

    Records the class labels and feature names when they were not
    supplied, then calls draw.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    ------
    self : instance
        Returns the instance of the transformer/visualizer
    """
    # TODO: This class is identical to the Parallel Coordinates version,
    # so hoist this functionality to a higher level class that is extended
    # by both RadViz and ParallelCoordinates.

    # Get the shape of the data
    _, ncols = X.shape

    # Store the classes for the legend if they're None.
    if self.classes_ is None:
        labels = set(y)
        try:
            # Set iteration order is arbitrary (and differs between runs
            # for strings), so sort to give a stable class order.
            labels = sorted(labels)
        except TypeError:
            # Labels of mixed, non-comparable types: keep the set order.
            labels = list(labels)
        self.classes_ = [str(label) for label in labels]

    # Handle the feature names if they're None.
    if self.features_ is None:
        # If X is a data frame, get the columns off it.
        if is_dataframe(X):
            self.features_ = X.columns
        # Otherwise create numeric labels for each column.
        else:
            self.features_ = [str(cdx) for cdx in range(ncols)]

    # Draw the instances
    self.draw(X, y, **kwargs)

    # Fit always returns self.
    return self
def fit(self, X, y=None, **kwargs):
    """
    The fit method is the primary drawing input for the visualization
    since it has both the X and y data required for the viz and the
    transform method does not.

    Records the class labels and feature names when they were not
    supplied, then calls draw.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    -------
    self : instance
        Returns the instance of the transformer/visualizer
    """
    # Get the shape of the data
    _, ncols = X.shape

    # Store the classes for the legend if they're None.
    if self.classes_ is None:
        labels = set(y)
        try:
            # Set iteration order is arbitrary (and differs between runs
            # for strings), so sort to give a stable class order.
            labels = sorted(labels)
        except TypeError:
            # Labels of mixed, non-comparable types: keep the set order.
            labels = list(labels)
        self.classes_ = [str(label) for label in labels]

    # Handle the feature names if they're None.
    if self.features_ is None:
        # If X is a data frame, get the columns off it.
        if is_dataframe(X):
            self.features_ = X.columns
        # Otherwise create numeric labels for each column.
        else:
            self.features_ = [str(cdx) for cdx in range(ncols)]

    # Draw the instances
    self.draw(X, y, **kwargs)

    # Fit always returns self.
    return self
def _create_labels_for_features(self, X): """ Create labels for the features NOTE: this code is duplicated from MultiFeatureVisualizer """ if self.labels is None: # Use column names if a dataframe if is_dataframe(X): self.features_ = np.array(X.columns) # Otherwise use the column index as the labels else: _, ncols = X.shape self.features_ = np.arange(0, ncols) else: self.features_ = np.array(self.labels)
def _create_labels_for_features(self, X): """ Create labels for the features NOTE: this code is duplicated from MultiFeatureVisualizer """ if self.labels is None: # Use column names if a dataframe if is_dataframe(X): self.features_ = np.array(X.columns) # Otherwise use the column index as the labels else: _, ncols = X.shape self.features_ = np.arange(0, ncols) else: self.features_ = np.array(self.labels)
def fit(self, X, y=None, **kwargs):
    """
    Prepare the data for the parallel coordinates plot and draw it.

    The parent ``fit`` runs first; afterwards pandas inputs are
    converted to arrays, the instances are subsampled and optionally
    normalized, and the result is handed to ``draw``.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    -------
    self : instance
        Returns the instance of the transformer/visualizer
    """
    # The parent determines the features, classes, and colors.
    super(ParallelCoordinates, self).fit(X, y)

    # Work on plain arrays from here on.
    X = X.values if is_dataframe(X) else X
    y = y.values if is_series(y) else y

    # One tick position per feature.
    self._increments = np.arange(len(self.features_))

    # Reduce the number of instances.
    X, y = self._subsample(X, y)

    # Rescale the instances when a normalizer was requested.
    normalizer_key = self.normalize
    if normalizer_key is not None:
        X = self.NORMALIZERS[normalizer_key].fit_transform(X)

    self.draw(X, y, **kwargs)
    return self
def fit(self, X, y=None):
    """
    Resolve ``self.features_`` from the user supplied names or from the
    input data, then call the parent ``fit``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature dataset to be transformed.

    y : array-like, shape (n_samples,)
        Optional dependent target data associated with X.

    Returns
    -------
    self : MultiFeatureVisualizer
        Returns the visualizer/transformer for use in Pipelines and chaining.

    Raises
    ------
    YellowbrickValueError
        If the number of user supplied feature names differs from the
        number of columns in X.
    """
    n_features = X.shape[1]

    if self.features is None:
        # No names given: take them from the dataframe columns or fall
        # back to the numeric column positions.
        if is_dataframe(X):
            self.features_ = np.array(X.columns)
        else:
            self.features_ = np.arange(0, n_features)
    else:
        # Names were given: they must line up with the columns.
        # TODO: allow the user specified features to filter the dataset
        if len(self.features) != n_features:
            raise YellowbrickValueError((
                "number of supplied feature names does not match the number "
                "of columns in the training data."))
        self.features_ = np.array(self.features)

    # Make sure the parent implementation runs, then hand back self.
    super(MultiFeatureVisualizer, self).fit(X, y)
    return self
def fit(self, X, y=None, **kwargs):
    """
    Gathers information about the state of the visualizer: feature names
    are filled in from the data when they have not been set.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    -------
    self : instance
        Returns the instance of the transformer/visualizer
    """
    # TODO: This class is identical to the Parallel Coordinates version,
    # so hoist this functionality to a higher level class that is extended
    # by both RadViz and ParallelCoordinates.

    # X must be two dimensional; only the column count is needed.
    _, n_features = X.shape

    # Names already present are left untouched.
    if self.features_ is not None:
        return self

    # Prefer dataframe column names, else stringified column positions.
    if is_dataframe(X):
        self.features_ = X.columns
    else:
        self.features_ = [str(position) for position in range(n_features)]

    return self
def fit(self, X, y=None, **kwargs):
    """
    Store the input data and feature names on the instance, then draw.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    -------
    self : instance
        Returns the instance of the transformer/visualizer
    """
    # Do not call super here - the data visualizer has been refactored
    # to provide increased functionality that is not yet compatible with
    # the current implementation. This mimics the previous functionality.
    # TODO: Refactor MissingDataVisualizer to make use of new features.
    self.features_ = self.features

    if not is_dataframe(X):
        self.X = X
    else:
        # Keep the raw array and borrow the column names when the user
        # did not provide feature names.
        self.X = X.values
        if self.features_ is None:
            self.features_ = X.columns

    self.y = y

    # Note that draw receives the original X, not the stored array.
    self.draw(X, y, **kwargs)
    return self
def fit(self, X, y=None, **fit_params):
    """
    Fill in ``self.features_`` from the data when it has not been set.

    Column names are taken from a DataFrame; for any other input the
    numeric column positions are used. This method must return self.
    """
    # Nothing to do when the feature names are already known.
    if self.features_ is not None:
        return self

    if is_dataframe(X):
        # A data frame carries its own column names.
        self.features_ = np.array(X.columns)
    else:
        # Otherwise label each column by its position.
        _, n_features = X.shape
        self.features_ = np.arange(0, n_features)

    return self
def fit(self, X, y=None, **fit_params):
    """
    Fill in ``self.features_`` from the data when it has not been set.

    Column names are taken from a DataFrame; for any other input the
    numeric column positions are used. This method must return self.
    """
    # Nothing to do when the feature names are already known.
    if self.features_ is not None:
        return self

    if is_dataframe(X):
        # A data frame carries its own column names.
        self.features_ = np.array(X.columns)
    else:
        # Otherwise label each column by its position.
        _, n_features = X.shape
        self.features_ = np.arange(0, n_features)

    return self
def _select_feature_columns(self, X): """ """ if len(X.shape) == 1: X_flat = X.view(np.float64).reshape(len(X), -1) else: X_flat = X _, ncols = X_flat.shape if ncols == 2: X_two_cols = X if self.features_ is None: self.features_ = ["Feature One", "Feature Two"] # Handle the feature names if they're None. elif self.features_ is not None and is_dataframe(X): X_two_cols = X[self.features_].as_matrix() # handle numpy named/ structured array elif self.features_ is not None and is_structured_array(X): X_selected = X[self.features_] X_two_cols = X_selected.view(np.float64).reshape( len(X_selected), -1) # handle features that are numeric columns in ndarray matrix elif self.features_ is not None and has_ndarray_int_columns( self.features_, X): f_one, f_two = self.features_ X_two_cols = X[:, [int(f_one), int(f_two)]] else: raise YellowbrickValueError(""" ScatterVisualizer only accepts two features, please explicitly set these two features in the init kwargs or pass a matrix/ dataframe in with only two columns.""") return X_two_cols
def _select_feature_columns(self, X): """ """ if len(X.shape) == 1: X_flat = X.copy().view(np.float64).reshape(len(X), -1) else: X_flat = X _, ncols = X_flat.shape if ncols == 2: X_two_cols = X if self.features_ is None: self.features_ = ["Feature One", "Feature Two"] # Handle the feature names if they're None. elif self.features_ is not None and is_dataframe(X): X_two_cols = X[self.features_].as_matrix() # handle numpy named/ structured array elif self.features_ is not None and is_structured_array(X): X_selected = X[self.features_] X_two_cols = X_selected.copy().view(np.float64).reshape(len(X_selected), -1) # handle features that are numeric columns in ndarray matrix elif self.features_ is not None and has_ndarray_int_columns(self.features_, X): f_one, f_two = self.features_ X_two_cols = X[:, [int(f_one), int(f_two)]] else: raise YellowbrickValueError(""" ScatterVisualizer only accepts two features, please explicitly set these two features in the init kwargs or pass a matrix/ dataframe in with only two columns.""") return X_two_cols
def draw(self, X, y, **kwargs):
    """
    Called from the fit method, this method creates the parallel
    coordinates canvas and draws each instance and vertical lines on it.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : sequence of length n
        Per-instance values used to index ``self.classes_``.

    kwargs : dict
        Extra keyword arguments forwarded to every ``ax.plot`` call.
    """
    # Convert from dataframe. ``as_matrix`` has been removed from pandas,
    # ``.values`` is the portable equivalent.
    if is_dataframe(X):
        X = X.values

    # Choose a subset of samples: an int is an absolute count, a float a
    # fraction of the instances.
    # TODO: allow selection of a random subset of samples instead of head
    if isinstance(self.sample, int):
        self.n_samples = min([self.sample, len(X)])
    elif isinstance(self.sample, float):
        self.n_samples = int(len(X) * self.sample)
    X = X[:self.n_samples, :]

    # Normalize
    if self.normalize is not None:
        X = self.normalizers[self.normalize].fit_transform(X)

    # Get the shape of the data
    _, ncols = X.shape

    # Create the xticks for each column
    # TODO: Allow the user to specify this feature
    x = list(range(ncols))

    # Create the colors
    # TODO: Allow both colormap, listed colors, and palette definition
    # TODO: Make this an independent function or property for override!
    color_values = resolve_colors(
        n_colors=len(self.classes_),
        colormap=self.colormap,
        colors=self.color
    )
    colors = dict(zip(self.classes_, color_values))

    # Track which labels are already in the legend
    used_legends = set()

    # TODO: Make this function compatible with DataFrames!
    # TODO: Make an independent function to allow addition of instances!
    for idx, row in enumerate(X):
        # TODO: How to map classmap to labels?
        # The target value is used as an index into the class names.
        label = self.classes_[y[idx]]

        # Only the first line of each class carries a legend label so
        # that every class appears in the legend exactly once.
        if label not in used_legends:
            used_legends.add(label)
            self.ax.plot(x, row, color=colors[label], alpha=0.25,
                         label=label, **kwargs)
        else:
            self.ax.plot(x, row, color=colors[label], alpha=0.25,
                         **kwargs)

    # Add the vertical lines
    # TODO: Make an independent function for override!
    if self.show_vlines:
        for idx in x:
            self.ax.axvline(idx, **self.vlines_kwds)

    # Set the limits
    self.ax.set_xticks(x)
    self.ax.set_xticklabels(self.features_)
    self.ax.set_xlim(x[0], x[-1])
def draw(self, X, y, **kwargs):
    """
    Called from the fit method, this method creates the radviz canvas
    and draws each instance as a class or target colored point, whose
    location is determined by the feature data set.
    """
    # Work on a plain array.
    if is_dataframe(X):
        X = X.values

    # Warn about missing values and drop them before plotting.
    nan_warnings.warn_if_nans_exist(X)
    X, y = nan_warnings.filter_missing(X, y)

    # Only the number of features is needed from the shape.
    _, n_features = X.shape

    # Fix the axes limits.
    self.ax.set_xlim([-1,1])
    self.ax.set_ylim([-1,1])

    # Map every class to a color.
    # TODO: Allow both colormap, listed colors, and palette definition
    # TODO: Make this an independent function or property for override!
    palette = resolve_colors(
        n_colors=len(self.classes_),
        colormap=self.colormap,
        colors=self.color
    )
    self._colors = dict(zip(self.classes_, palette))

    # Per class, collect the x and y coordinates to scatter.
    to_plot = {kls: [[], []] for kls in self.classes_}

    # Place one anchor per feature, evenly spaced around the unit circle.
    # TODO: make this an independent function for override
    angles = [
        2.0 * np.pi * (i / float(n_features)) for i in range(n_features)
    ]
    s = np.array([(np.cos(t), np.sin(t)) for t in angles])

    # Each normalized instance lands at the weighted mean of the anchors.
    for idx, row in enumerate(self.normalize(X)):
        weights = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
        point = (s * weights).sum(axis=0) / row.sum()
        xs, ys = to_plot[self.classes_[y[idx]]]
        xs.append(point[0])
        ys.append(point[1])

    # One scatter call per class.
    # TODO: store these plots to add more instances to later
    # TODO: make this a separate function
    for kls in self.classes_:
        xs, ys = to_plot[kls]
        self.ax.scatter(
            xs, ys, color=self._colors[kls], label=str(kls),
            alpha=self.alpha, **kwargs
        )

    # Outline the unit circle.
    # TODO: Make this a separate function (along with labeling)
    outline = patches.Circle(
        (0.0, 0.0), radius=1.0, facecolor='none', edgecolor='grey',
        linewidth=.5
    )
    self.ax.add_patch(outline)

    # Mark and label every feature anchor.
    for xy, name in zip(s, self.features_):
        self.ax.add_patch(
            patches.Circle(xy, radius=0.025, facecolor='#777777'))

        # Push the label away from the circle centre, per quadrant.
        left = xy[0] < 0.0
        below = xy[1] < 0.0
        text_x = xy[0] - 0.025 if left else xy[0] + 0.025
        text_y = xy[1] - 0.025 if below else xy[1] + 0.025
        self.ax.text(
            text_x, text_y, name,
            ha='right' if left else 'left',
            va='top' if below else 'bottom',
            size='small'
        )

    self.ax.axis('equal')
def fit(self, X, y=None, **kwargs):
    """
    Fits the estimator to discover the feature importances described by
    the data, then draws those importances as a bar plot.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Keyword arguments passed to the fit method of the estimator.

    Returns
    -------
    self : visualizer
        The fit method must always return self to support pipelines.
    """
    super(FeatureImportances, self).fit(X, y, **kwargs)

    # Get the feature importances from the model
    self.feature_importances_ = self._find_importances_param()

    # Get the classes from the model; stacking is only kept for
    # classifiers.
    if is_classifier(self):
        self.classes_ = self._find_classes_param()
    else:
        self.classes_ = None
        self.stack = False

    # If self.stack = True and feature importances is a multidim array,
    # we're expecting a shape of (n_classes, n_features)
    # therefore we flatten by taking the average by
    # column to get shape (n_features,)  (see LogisticRegression)
    if not self.stack and self.feature_importances_.ndim > 1:
        self.feature_importances_ = np.mean(self.feature_importances_, axis=0)

    # Apply absolute value filter before normalization
    if self.absolute:
        self.feature_importances_ = np.abs(self.feature_importances_)

    # Normalize features relative to the maximum magnitude
    if self.relative:
        maxv = np.abs(self.feature_importances_).max()
        # Build a new array instead of dividing in place: the array may
        # be the very object owned by the estimator (NOTE(review):
        # depends on ``_find_importances_param`` - confirm), and in-place
        # true division fails on integer arrays.
        self.feature_importances_ = self.feature_importances_ / maxv * 100.0

    # Create labels for the feature importances
    # NOTE: this code is duplicated from MultiFeatureVisualizer
    if self.labels is None:
        # Use column names if a dataframe
        if is_dataframe(X):
            self.features_ = np.array(X.columns)
        # Otherwise use the column index as the labels
        else:
            _, ncols = X.shape
            self.features_ = np.arange(0, ncols)
    else:
        self.features_ = np.array(self.labels)

    # Sort the features and their importances
    if self.stack:
        # Stacked: order the feature columns by their mean importance
        # across the first axis.
        sort_idx = np.argsort(np.mean(self.feature_importances_, 0))
        self.features_ = self.features_[sort_idx]
        self.feature_importances_ = self.feature_importances_[:, sort_idx]
    else:
        sort_idx = np.argsort(self.feature_importances_)
        self.features_ = self.features_[sort_idx]
        self.feature_importances_ = self.feature_importances_[sort_idx]

    # Draw the feature importances
    self.draw()
    return self
def draw(self, X, y, **kwargs):
    """
    Called from the fit method, this method creates the radviz canvas
    and draws each instance as a class or target colored point, whose
    location is determined by the feature data set.

    Returns the axes that were drawn on.
    """
    # Work on a plain array.
    if is_dataframe(X):
        X = X.values

    # Warn about missing values and drop them before plotting.
    nan_warnings.warn_if_nans_exist(X)
    X, y = nan_warnings.filter_missing(X, y)

    # Only the number of features is needed from the shape.
    _, n_features = X.shape

    # Fix the axes limits.
    self.ax.set_xlim([-1, 1])
    self.ax.set_ylim([-1, 1])

    # Per class label, collect the x and y coordinates to scatter.
    to_plot = {}
    for label in self.classes_:
        to_plot[label] = [[], []]

    # Place one anchor per feature, evenly spaced around the unit circle.
    # TODO: make this an independent function for override
    angles = [
        2.0 * np.pi * (i / float(n_features)) for i in range(n_features)
    ]
    s = np.array([(np.cos(t), np.sin(t)) for t in angles])

    # Each normalized instance lands at the weighted mean of the anchors.
    for idx, row in enumerate(self.normalize(X)):
        weights = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
        point = (s * weights).sum(axis=0) / row.sum()
        xs, ys = to_plot[self._label_encoder[y[idx]]]
        xs.append(point[0])
        ys.append(point[1])

    # One scatter call per class, colored through get_colors.
    # TODO: store these plots to add more instances to later
    # TODO: make this a separate function
    for label in self.classes_:
        color = self.get_colors([label])[0]
        xs, ys = to_plot[label]
        self.ax.scatter(
            xs, ys, color=color, label=label, alpha=self.alpha, **kwargs
        )

    # Outline the unit circle.
    # TODO: Make this a separate function (along with labeling)
    outline = patches.Circle(
        (0.0, 0.0),
        radius=1.0,
        facecolor="none",
        edgecolor="grey",
        linewidth=0.5,
    )
    self.ax.add_patch(outline)

    # Mark and label every feature anchor.
    for xy, name in zip(s, self.features_):
        self.ax.add_patch(
            patches.Circle(xy, radius=0.025, facecolor="#777777"))

        # Push the label away from the circle centre, per quadrant.
        left = xy[0] < 0.0
        below = xy[1] < 0.0
        text_x = xy[0] - 0.025 if left else xy[0] + 0.025
        text_y = xy[1] - 0.025 if below else xy[1] + 0.025
        self.ax.text(
            text_x,
            text_y,
            name,
            ha="right" if left else "left",
            va="top" if below else "bottom",
            size="small",
        )

    self.ax.axis("equal")
    return self.ax
def draw(self, X, y, **kwargs):
    """
    Called from the fit method, this method creates the radviz canvas
    and draws each instance as a class or target colored point, whose
    location is determined by the feature data set.
    """
    # Work on a plain array.
    if is_dataframe(X):
        X = X.values

    # Warn about missing values and drop them before plotting.
    nan_warnings.warn_if_nans_exist(X)
    X, y = nan_warnings.filter_missing(X, y)

    # Only the number of features is needed from the shape.
    _, n_features = X.shape

    # Fix the axes limits.
    self.ax.set_xlim([-1, 1])
    self.ax.set_ylim([-1, 1])

    # Map every class to a color.
    # TODO: Allow both colormap, listed colors, and palette definition
    # TODO: Make this an independent function or property for override!
    palette = resolve_colors(
        n_colors=len(self.classes_),
        colormap=self.colormap,
        colors=self.color,
    )
    self._colors = dict(zip(self.classes_, palette))

    # Per class, collect the x and y coordinates to scatter.
    to_plot = {kls: [[], []] for kls in self.classes_}

    # Place one anchor per feature, evenly spaced around the unit circle.
    # TODO: make this an independent function for override
    angles = [
        2.0 * np.pi * (i / float(n_features)) for i in range(n_features)
    ]
    s = np.array([(np.cos(t), np.sin(t)) for t in angles])

    # Each normalized instance lands at the weighted mean of the anchors.
    for idx, row in enumerate(self.normalize(X)):
        weights = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
        point = (s * weights).sum(axis=0) / row.sum()
        xs, ys = to_plot[self.classes_[y[idx]]]
        xs.append(point[0])
        ys.append(point[1])

    # One scatter call per class.
    # TODO: store these plots to add more instances to later
    # TODO: make this a separate function
    for kls in self.classes_:
        xs, ys = to_plot[kls]
        self.ax.scatter(
            xs, ys, color=self._colors[kls], label=str(kls),
            alpha=self.alpha, **kwargs
        )

    # Outline the unit circle.
    # TODO: Make this a separate function (along with labeling)
    outline = patches.Circle(
        (0.0, 0.0), radius=1.0, facecolor='none', edgecolor='grey',
        linewidth=.5
    )
    self.ax.add_patch(outline)

    # Mark and label every feature anchor.
    for xy, name in zip(s, self.features_):
        self.ax.add_patch(
            patches.Circle(xy, radius=0.025, facecolor='#777777'))

        # Push the label away from the circle centre, per quadrant.
        left = xy[0] < 0.0
        below = xy[1] < 0.0
        text_x = xy[0] - 0.025 if left else xy[0] + 0.025
        text_y = xy[1] - 0.025 if below else xy[1] + 0.025
        self.ax.text(
            text_x, text_y, name,
            ha='right' if left else 'left',
            va='top' if below else 'bottom',
            size='small'
        )

    self.ax.axis('equal')
def fit(self, X, y=None, **kwargs):
    """
    Fits the estimator to discover the feature importances described by
    the data, then draws those importances as a bar plot.

    Multi-dimensional importances are averaged by column into a single
    value per feature. If ``self.absolute`` is set the absolute value is
    taken, and if ``self.relative`` is set the importances are rescaled
    to a percentage of the largest magnitude. Features and importances
    are stored sorted by ascending importance.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Keyword arguments passed to the fit method of the estimator.

    Returns
    -------
    self : visualizer
        The fit method must always return self to support pipelines.
    """
    super(FeatureImportances, self).fit(X, y, **kwargs)

    # Get the feature importances from the model
    self.feature_importances_ = self._find_importances_param()

    # If feature importances is a multidim array, we're expecting a shape of
    # (n_classes, n_features) therefore we flatten by taking the average by
    # column to get shape (n_features,)  (see LogisticRegression)
    if self.feature_importances_.ndim > 1:
        self.feature_importances_ = np.mean(self.feature_importances_, axis=0)

    # TODO - as an alternative to the above flattening approach, explore an
    # alternative visualize that uses the array shape to create a stacked bar
    # chart of feature importances for each class/feature combination

    # Apply absolute value filter before normalization
    if self.absolute:
        self.feature_importances_ = np.abs(self.feature_importances_)

    # Normalize features relative to the largest magnitude. Using the signed
    # maximum would divide by a negative (or too small) value when negative
    # coefficients are kept, flipping signs or exceeding +/-100.
    if self.relative:
        maxv = np.abs(self.feature_importances_).max()
        # Build a new array instead of dividing in place: the array may be
        # the very object held by the estimator (NOTE(review): depends on
        # _find_importances_param, confirm), and in-place true division is
        # rejected by numpy for integer arrays.
        self.feature_importances_ = self.feature_importances_ / maxv * 100.0

    # Create labels for the feature importances
    # NOTE: this code is duplicated from MultiFeatureVisualizer
    if self.labels is None:
        # Use column names if a dataframe
        if is_dataframe(X):
            self.features_ = np.array(X.columns)

        # Otherwise use the column index as the labels
        else:
            _, ncols = X.shape
            self.features_ = np.arange(0, ncols)
    else:
        self.features_ = np.array(self.labels)

    # Sort the features and their importances (ascending importance)
    sort_idx = np.argsort(self.feature_importances_)
    self.features_ = self.features_[sort_idx]
    self.feature_importances_ = self.feature_importances_[sort_idx]

    # Draw the feature importances
    self.draw()
    return self
def fit(self, X, y=None, **kwargs):
    """
    Fits the estimator to discover the feature importances described by
    the data, then draws those importances as a bar plot.

    For classifiers the classes are recorded on ``self.classes_``; for any
    other estimator ``self.stack`` is switched off. Stacking is also
    switched off when the importances are one-dimensional, because there
    is nothing to stack per class. When stacking is off, multi-dimensional
    importances are averaged by column (with a warning). If
    ``self.absolute`` is set the absolute value is taken, and if
    ``self.relative`` is set the importances are rescaled to a percentage
    of the largest magnitude. Features and importances are stored sorted
    by ascending (mean) importance.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Keyword arguments passed to the fit method of the estimator.

    Returns
    -------
    self : visualizer
        The fit method must always return self to support pipelines.
    """
    # Super call fits the underlying estimator if it's not already fitted
    super(FeatureImportances, self).fit(X, y, **kwargs)

    # Get the feature importances from the model
    self.feature_importances_ = self._find_importances_param()

    # Get the classes from the model
    if is_classifier(self):
        self.classes_ = self._find_classes_param()
    else:
        self.classes_ = None
        self.stack = False

    # Stacking needs one row of importances per class. With a 1-D array the
    # stacked sort below would index a second axis that does not exist, so
    # fall back to the unstacked layout, as is done for non-classifiers.
    if self.stack and self.feature_importances_.ndim < 2:
        self.stack = False

    # If self.stack = False and feature importances is a multidim array,
    # we're expecting a shape of (n_classes, n_features)
    # therefore we flatten by taking the average by
    # column to get shape (n_features,)  (see LogisticRegression)
    if not self.stack and self.feature_importances_.ndim > 1:
        self.feature_importances_ = np.mean(self.feature_importances_, axis=0)
        warnings.warn(
            (
                "detected multi-dimensional feature importances but stack=False, "
                "using mean to aggregate them."
            ),
            YellowbrickWarning,
        )

    # Apply absolute value filter before normalization
    if self.absolute:
        self.feature_importances_ = np.abs(self.feature_importances_)

    # Normalize features relative to the largest magnitude
    if self.relative:
        maxv = np.abs(self.feature_importances_).max()
        # Build a new array instead of dividing in place: the array may be
        # the very object held by the estimator (NOTE(review): depends on
        # _find_importances_param, confirm), and in-place true division is
        # rejected by numpy for integer arrays.
        self.feature_importances_ = self.feature_importances_ / maxv * 100.0

    # Create labels for the feature importances
    # NOTE: this code is duplicated from MultiFeatureVisualizer
    if self.labels is None:
        # Use column names if a dataframe
        if is_dataframe(X):
            self.features_ = np.array(X.columns)

        # Otherwise use the column index as the labels
        else:
            _, ncols = X.shape
            self.features_ = np.arange(0, ncols)
    else:
        self.features_ = np.array(self.labels)

    # Sort the features and their importances
    if self.stack:
        # Order the columns by their mean importance across classes
        sort_idx = np.argsort(np.mean(self.feature_importances_, 0))
        self.features_ = self.features_[sort_idx]
        self.feature_importances_ = self.feature_importances_[:, sort_idx]
    else:
        sort_idx = np.argsort(self.feature_importances_)
        self.features_ = self.features_[sort_idx]
        self.feature_importances_ = self.feature_importances_[sort_idx]

    # Draw the feature importances
    self.draw()
    return self
def fit(self, X, y=None, **kwargs):
    """
    Prepare the data and the visualizer state, then hand off to the
    parent ``fit``, which is the primary drawing entry point since it has
    both the X and y data required for the viz.

    Pandas inputs are converted to arrays, feature and class labels are
    filled in when they were not supplied, the class color mapping and the
    per-feature tick positions are computed, and the instances are
    subsampled and optionally normalized before being passed on.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    kwargs : dict
        Pass generic arguments to the drawing method

    Returns
    -------
    self : instance
        The value returned by the parent class's ``fit``, which is
        expected to be this transformer/visualizer instance
    """
    # Unwrap pandas containers; column names must be captured before the
    # DataFrame is reduced to an np.ndarray
    if is_dataframe(X):
        if self.features_ is None:
            self.features_ = np.array(X.columns)
        X = X.values

    if is_series(y):
        y = y.values

    # Without names from the input, label the features by column position
    if self.features_ is None:
        n_columns = X.shape[1]
        self.features_ = np.arange(0, n_columns)

    # Collect the classes before sampling so that every class is represented
    # in the color mapping
    # NOTE: np.unique also specifies the ordering of the classes
    if self.classes_ is None:
        self.classes_ = list(map(str, np.unique(y)))

    # Build the class -> color lookup
    # TODO: Allow both colormap, listed colors, and palette definition
    # TODO: Make this an independent function or property for override!
    palette = resolve_colors(
        n_colors=len(self.classes_),
        colormap=self.colormap,
        colors=self.color,
    )
    self._colors = dict(zip(self.classes_, palette))

    # One tick position per feature
    self._increments = np.arange(len(self.features_))

    # Subsample instances
    X, y = self._subsample(X, y)

    # Normalize instances with the configured normalizer, if any
    if self.normalize is not None:
        normalizer = self.NORMALIZERS[self.normalize]
        X = normalizer.fit_transform(X)

    # the super method calls draw and returns self
    return super(ParallelCoordinates, self).fit(X, y, **kwargs)