def get_box_data(boxPlotter, boxName):
    """Return the nan-free observations behind a single box.

    `boxName` is a plain category name ("cat") when the plotter has no hue
    grouping (`boxPlotter.plot_hues is None`), otherwise a
    ("cat", "hue") pair.

    The selection logic mirrors what the plotter does internally when it
    draws the boxes, because the plotter object does not expose the
    per-box data directly.
    """
    has_hue = boxPlotter.plot_hues is not None

    if has_hue:
        category, hue_level = boxName[0], boxName[1]
    else:
        category = boxName

    position = boxPlotter.group_names.index(category)
    values = boxPlotter.plot_data[position]

    if not has_hue:
        # Single level of grouping: the whole group is the box
        return remove_na(values)

    # Nested grouping: keep only the rows belonging to the requested hue
    selected = boxPlotter.plot_hues[position] == hue_level
    return remove_na(values[selected])
def get_box_data(boxPlotter, boxName):
    """Return the nan-free observations behind a single box.

    `boxName` is a plain category name ("cat") when the plotter has no hue
    grouping (`boxPlotter.plot_hues is None`), otherwise a
    ("cat", "hue") pair.

    The selection logic mirrors what the plotter does internally when it
    draws the boxes, because the plotter object does not expose the
    per-box data directly.
    """
    has_hue = boxPlotter.plot_hues is not None

    if has_hue:
        category, hue_level = boxName[0], boxName[1]
    else:
        category = boxName

    position = boxPlotter.group_names.index(category)
    values = boxPlotter.plot_data[position]

    if not has_hue:
        # Single level of grouping: the whole group is the box
        return remove_na(values)

    # Nested grouping: keep only the rows belonging to the requested hue
    selected = boxPlotter.plot_hues[position] == hue_level
    return remove_na(values[selected])
def get_box_data(box_plotter, boxName):
    """Return the nan-free observations behind a single box.

    boxName can be either a name "cat" or a tuple ("cat", "hue").

    Here we really have to duplicate seaborn code, because there is not
    direct access to the box_data in the BoxPlotter class.

    Parameters
    ----------
    box_plotter
        Plotter object exposing `plot_hues`, `group_names` and `plot_data`.
    boxName
        The category name when `box_plotter.plot_hues` is None, otherwise
        an indexable whose first item is the category and whose second
        item is the hue level.

    Returns
    -------
    The result of `remove_na` applied to the selected observations.

    Raises
    ------
    ValueError
        If the category is not in `box_plotter.group_names` (raised by
        `list.index`).
    IndexError
        If a hue is in use and `boxName` has fewer than two items.
    """
    has_hue = box_plotter.plot_hues is not None

    # Use an explicit branch rather than `cond and a or b`: that idiom falls
    # through to `boxName[0]` whenever the category itself is falsy (0, 0.0,
    # False), which fails for perfectly valid numeric categories.
    cat = boxName[0] if has_hue else boxName

    index = box_plotter.group_names.index(cat)
    group_data = box_plotter.plot_data[index]

    if not has_hue:
        # Draw a single box or a set of boxes
        # with a single level of grouping
        return remove_na(group_data)

    hue_level = boxName[1]
    hue_mask = box_plotter.plot_hues[index] == hue_level
    return remove_na(group_data[hue_mask])
def test_remove_na():
    """Check that `remove_na` drops NaN entries from arrays and Series."""
    # NumPy input: the NaN entry is removed, the remaining order is kept
    cleaned_array = remove_na(np.array([1, 2, np.nan, 3]))
    assert_array_equal(cleaned_array, np.array([1, 2, 3]))

    # pandas input: the surviving rows keep their original index labels
    cleaned_series = remove_na(pd.Series([1, 2, np.nan, 3]))
    expected_series = pd.Series([1., 2, 3], [0, 1, 3])
    assert_series_equal(cleaned_series, expected_series)
def draw_boxplot(self, ax, kws):
    '''
    Draw nested boxes with the hue levels of each group ordered by median.

    Without a hue variable the call is delegated unchanged to the parent
    class. With a hue variable, the hue level with the largest median in
    a group takes the first hue offset, the next largest the second, and
    so on.

    The "<part>props" entries for box, whisker, cap, median and flier are
    popped from `kws` (the caller's dict is modified); the remaining
    entries are forwarded to `ax.boxplot`.

    Below code has been copied partly from seaborn.categorical.py
    and is reproduced only for educational purposes.
    '''
    if self.plot_hues is None:
        # Sorting by hue doesn't apply here. Just
        return super(SortedBoxPlotter, self).draw_boxplot(ax, kws)

    is_vertical = self.orient == "v"

    artist_props = {
        part: kws.pop(part + "props", {})
        for part in ("box", "whisker", "cap", "median", "flier")
    }

    for group_idx, group_values in enumerate(self.plot_data):
        group_hues = self.plot_hues[group_idx]

        # ==> Sort offsets by median
        level_medians = [
            np.nanmedian(group_values[group_hues == level])
            for level in self.hue_names
        ]
        # Rank of every hue level when medians are sorted descending
        descending = np.argsort(level_medians)[::-1]
        rank_of_level = descending.argsort()
        ranked_offsets = self.hue_offsets[rank_of_level]

        # Draw nested groups of boxes
        for level_idx, level in enumerate(self.hue_names):
            colour = self.colors[level_idx]

            # Add a legend for this hue level (first group only)
            if group_idx == 0:
                self.add_legend_data(ax, colour, level)

            # Nothing to draw when the whole group is empty
            if group_values.size == 0:
                continue

            level_values = remove_na(group_values[group_hues == level])

            # Nothing to draw when only null data is left
            if level_values.size == 0:
                continue

            # ==> Fix ordering
            position = group_idx + ranked_offsets[level_idx]
            artists = ax.boxplot(level_values,
                                 vert=is_vertical,
                                 patch_artist=True,
                                 positions=[position],
                                 widths=self.nested_width,
                                 **kws)
            self.restyle_boxplot(artists, colour, artist_props)
def draw_significance(self, ax, test='mann_whitney'):
    """Annotate every group with a significance symbol for its two hue levels.

    For each entry of `self.plot_data` the nan-free samples of the hue
    levels are compared with the selected test, and the p-value is mapped
    to a symbol: "****" (< 0.0001), "***" (< 0.001), "**" (< 0.01),
    "*" (< 0.05) or "ns". All symbols are written at the same height:
    1.05 times the largest per-group maximum of the first two samples.

    Parameters
    ----------
    ax
        Axes to write the symbols on. When None, the current pyplot axes
        are used.
    test
        'mann_whitney' (two-sided `scipy.stats.mannwhitneyu`) or
        'permutation_resampling' (`permutation_resampling_test` with
        `np.median` as statistic).

    Raises
    ------
    KeyError
        If `test` is not one of the recognised names (raised while
        processing the first group).
    """
    # (upper p-value bound, symbol), checked in order
    thresholds = (
        (0.0001, "****"),
        (0.001, "***"),
        (0.01, "**"),
        (0.05, "*"),
    )

    significance = {'top': -np.inf, 'val': {}}

    for i, group_data in enumerate(self.plot_data):
        tmp_data = []
        for hue_level in self.hue_names:
            hue_mask = self.plot_hues[i] == hue_level
            violin_data = remove_na(group_data[hue_mask])
            tmp_data.append(violin_data)

        if test == 'mann_whitney':
            _, pvalue = scipy.stats.mannwhitneyu(
                *tmp_data, alternative='two-sided')
        elif test == 'permutation_resampling':
            pvalue, _, _ = permutation_resampling_test(
                *tmp_data, statistic=np.median)
        else:
            raise KeyError('Unable to recognize {}'.format(test))

        # significance
        symbol = "ns"
        for bound, candidate in thresholds:
            if pvalue < bound:
                symbol = candidate
                break
        significance['val'][i] = symbol

        # Keep track of the highest point any label has to clear
        data_max = np.max([max(tmp_data[0]), max(tmp_data[1])]) * 1.05
        if data_max > significance['top']:
            significance['top'] = data_max

    if significance['val']:
        # Write on the axes the caller handed in rather than on whatever
        # axes happens to be current; fall back to the current axes only
        # when no axes was supplied.
        target = ax if ax is not None else plt.gca()
        for i, s in significance['val'].items():
            target.text(i, significance['top'], s, ha='center', va='bottom')
def draw_violins(self, ax):
    """Draw the violins onto `ax`.

    Uses the supports and densities stored in `self.support` and
    `self.density`. Every violin is filled between
    ``-self.offset + position - density * self.dwidth`` and
    ``-self.offset + position``, i.e. on one side of its (shifted)
    position only. Groups whose support has a single value are drawn
    through `self.draw_single_observation`; empty supports are skipped.
    The interior representation is chosen by the prefix of `self.inner`
    ("box", "quart", "stick" or "point"); nothing is drawn inside when
    `self.inner` is None.
    """
    fill_func = ax.fill_betweenx if self.orient == "v" else ax.fill_between
    for i, group_data in enumerate(self.plot_data):

        kws = dict(edgecolor=self.gray, linewidth=self.linewidth)

        # Option 1: we have a single level of grouping
        # --------------------------------------------

        if self.plot_hues is None:

            support, density = self.support[i], self.density[i]

            # Handle special case of no observations in this bin
            if support.size == 0:
                continue

            # Handle special case of a single observation
            elif support.size == 1:
                # ndarray.item() replaces np.asscalar, which is no longer
                # available in current NumPy releases
                val = support.item()
                d = density.item()
                self.draw_single_observation(ax, i, val, d)
                continue

            # Draw the violin for this group
            grid = np.ones(self.gridsize) * i
            fill_func(support,
                      -self.offset + grid - density * self.dwidth,
                      -self.offset + grid,
                      facecolor=self.colors[i],
                      **kws)

            # Draw the interior representation of the data
            if self.inner is None:
                continue

            # Get a nan-free vector of datapoints
            violin_data = remove_na(group_data)

            # Draw box and whisker information
            if self.inner.startswith("box"):
                self.draw_box_lines(ax, violin_data, support, density, i)

            # Draw quartile lines
            elif self.inner.startswith("quart"):
                self.draw_quartiles(ax, violin_data, support, density, i)

            # Draw stick observations
            elif self.inner.startswith("stick"):
                self.draw_stick_lines(ax, violin_data, support, density, i)

            # Draw point observations
            elif self.inner.startswith("point"):
                self.draw_points(ax, violin_data, i)

        # Option 2: we have nested grouping by a hue variable
        # ---------------------------------------------------

        else:
            offsets = self.hue_offsets
            for j, hue_level in enumerate(self.hue_names):

                support, density = self.support[i][j], self.density[i][j]
                kws["facecolor"] = self.colors[j]

                # Add legend data, but just for one set of violins
                if not i:
                    self.add_legend_data(ax, self.colors[j], hue_level)

                # Handle the special case where we have no observations
                if support.size == 0:
                    continue

                # Handle the special case where we have one observation
                elif support.size == 1:
                    val = support.item()
                    d = density.item()
                    if self.split:
                        d = d / 2
                    at_group = i + offsets[j]
                    self.draw_single_observation(ax, at_group, val, d)
                    continue

                # Option 2a: we are drawing a single split violin
                # -----------------------------------------------

                if self.split:

                    grid = np.ones(self.gridsize) * i
                    # NOTE(review): both hue levels are filled on the same
                    # side of the group position (the original had two
                    # identical branches for j == 0 and j != 0) - confirm
                    # this is the intended layout for split violins.
                    fill_func(support,
                              -self.offset + grid - density * self.dwidth,
                              -self.offset + grid,
                              **kws)

                    # Draw the interior representation of the data
                    if self.inner is None:
                        continue

                    # Get a nan-free vector of datapoints
                    hue_mask = self.plot_hues[i] == hue_level
                    violin_data = remove_na(group_data[hue_mask])

                    # Draw quartile lines
                    if self.inner.startswith("quart"):
                        self.draw_quartiles(ax, violin_data,
                                            support, density, i,
                                            ["left", "right"][j])

                    # Draw stick observations
                    elif self.inner.startswith("stick"):
                        self.draw_stick_lines(ax, violin_data,
                                              support, density, i,
                                              ["left", "right"][j])

                    # The box and point interior plots are drawn for
                    # all data at the group level, so we just do that once
                    if not j:
                        continue

                    # Get the whole vector for this group level
                    violin_data = remove_na(group_data)

                    # Draw box and whisker information
                    if self.inner.startswith("box"):
                        self.draw_box_lines(ax, violin_data,
                                            support, density, i)

                    # Draw point observations
                    elif self.inner.startswith("point"):
                        self.draw_points(ax, violin_data, i)

                # Option 2b: we are drawing full nested violins
                # -----------------------------------------------

                else:
                    grid = np.ones(self.gridsize) * (i + offsets[j])
                    fill_func(support,
                              -self.offset + grid - density * self.dwidth,
                              -self.offset + grid,
                              **kws)

                    # Draw the interior representation
                    if self.inner is None:
                        continue

                    # Get a nan-free vector of datapoints
                    hue_mask = self.plot_hues[i] == hue_level
                    violin_data = remove_na(group_data[hue_mask])

                    # Draw box and whisker information
                    if self.inner.startswith("box"):
                        self.draw_box_lines(ax, violin_data,
                                            support, density,
                                            i + offsets[j])

                    # Draw quartile lines
                    elif self.inner.startswith("quart"):
                        self.draw_quartiles(ax, violin_data,
                                            support, density,
                                            i + offsets[j])

                    # Draw stick observations
                    elif self.inner.startswith("stick"):
                        self.draw_stick_lines(ax, violin_data,
                                              support, density,
                                              i + offsets[j])

                    # Draw point observations
                    elif self.inner.startswith("point"):
                        self.draw_points(ax, violin_data, i + offsets[j])
def estimate_densities(self, bw, cut, scale, scale_hue, gridsize):
    """Compute the support grid and density curve for every violin.

    The results are stored on `self.support` and `self.density`: flat
    lists (one entry per group) when there is no hue grouping, lists of
    lists (group, then hue level) otherwise. Before they are stored, the
    densities are handed to the scaling method selected by `scale`
    ("area", "width" or "count"); any other value raises ValueError.
    """

    def no_data():
        # Placeholder entry: empty support, unit density, no observations
        return np.array([]), np.array([1.]), 0, 0

    def summarise(sample):
        """Return (support, density, count, peak density) for one sample."""
        if sample.size == 0:
            return no_data()

        # A single unique value cannot be smoothed by a KDE
        distinct = np.unique(sample)
        if distinct.size == 1:
            return distinct, np.array([1.]), 1, 0

        # Fit the KDE, then evaluate it over its support grid
        kde, bw_used = self.fit_kde(sample, bw)
        grid = self.kde_support(sample, bw_used, cut, gridsize)
        curve = kde.evaluate(grid)
        return grid, curve, sample.size, curve.max()

    # Containers for the per-violin results
    if self.hue_names is None:
        support, density = [], []
        counts = np.zeros(len(self.plot_data))
        max_density = np.zeros(len(self.plot_data))
    else:
        support = [[] for _ in self.plot_data]
        density = [[] for _ in self.plot_data]
        shape = len(self.group_names), len(self.hue_names)
        counts = np.zeros(shape)
        max_density = np.zeros(shape)

    for i, group_data in enumerate(self.plot_data):

        # Single level of grouping: one violin per group
        if self.plot_hues is None:
            grid, curve, n_obs, peak = summarise(remove_na(group_data))
            support.append(grid)
            density.append(curve)
            counts[i] = n_obs
            max_density[i] = peak
            continue

        # Nested grouping: one violin per (group, hue level)
        for j, hue_level in enumerate(self.hue_names):
            if not group_data.size:
                # The whole category is empty, no need to look at hues
                found = no_data()
            else:
                hue_mask = self.plot_hues[i] == hue_level
                found = summarise(remove_na(group_data[hue_mask]))

            grid, curve, n_obs, peak = found
            support[i].append(grid)
            density[i].append(curve)
            counts[i, j] = n_obs
            max_density[i, j] = peak

    # The density of a violin is non-quantitative, so the curves are
    # scaled relative to 1 and multiplied by the width while plotting.
    if scale == "area":
        self.scale_area(density, max_density, scale_hue)
    elif scale == "width":
        self.scale_width(density)
    elif scale == "count":
        self.scale_count(density, counts, scale_hue)
    else:
        raise ValueError("scale method '{}' not recognized".format(scale))

    # Keep the results around for the drawing step
    self.support = support
    self.density = density
def estimate_statistic(self, estimator, ci, n_boot):
    """Compute the point estimate and its interval for every plotted cell.

    `estimator` is applied to the non-null observations of each group (or
    of each group/hue combination). When `ci` is not None an interval is
    added: estimate +/- standard deviation for "sd", (min, max) for
    "range", otherwise a bootstrap interval built from `n_boot`
    resamples. Cells without data get NaN as estimate and cells with
    fewer than two observations get a NaN interval.

    The results are stored as arrays on `self.statistic` and
    `self.confint`.
    """

    def point_estimate(values):
        # An empty cell has no estimate
        return estimator(values) if values.size else np.nan

    def interval(values, units):
        # At least two observations are needed for any kind of spread
        if values.size < 2:
            return [np.nan, np.nan]

        if ci == "sd":
            centre = estimator(values)
            spread = np.std(values)
            return (centre - spread, centre + spread)

        if ci == "range":
            return (np.min(values), np.max(values))

        resamples = bootstrap(values, func=estimator,
                              n_boot=n_boot,
                              units=units)
        return utils.ci(resamples, ci)

    if self.hue_names is None:
        statistic = []
        confint = []
    else:
        statistic = [[] for _ in self.plot_data]
        confint = [[] for _ in self.plot_data]

    for i, group_data in enumerate(self.plot_data):

        # Single layer of grouping
        if self.plot_hues is None:

            if self.plot_units is None:
                stat_data = remove_na(group_data)
                unit_data = None
            else:
                # Keep rows where both the value and its unit are present
                unit_data = self.plot_units[i]
                have = pd.notnull(np.c_[group_data, unit_data]).all(axis=1)
                stat_data = group_data[have]
                unit_data = unit_data[have]

            statistic.append(point_estimate(stat_data))
            if ci is not None:
                confint.append(interval(stat_data, unit_data))
            continue

        # Grouping by a hue layer
        for hue_level in self.hue_names:

            # No hue information at all for this group
            if not self.plot_hues[i].size:
                statistic[i].append(np.nan)
                if ci is not None:
                    confint[i].append((np.nan, np.nan))
                continue

            hue_mask = self.plot_hues[i] == hue_level

            if self.plot_units is None:
                stat_data = remove_na(group_data[hue_mask])
                unit_data = None
            else:
                group_units = self.plot_units[i]
                have = pd.notnull(
                    np.c_[group_data, group_units]
                ).all(axis=1)
                keep = hue_mask & have
                stat_data = group_data[keep]
                unit_data = group_units[keep]

            statistic[i].append(point_estimate(stat_data))
            if ci is not None:
                confint[i].append(interval(stat_data, unit_data))

    # Save the resulting values for plotting
    self.statistic = np.array(statistic)
    self.confint = np.array(confint)
Here we really have to duplicate seaborn code, because there is not direct access to the box_data in the BoxPlotter class. """ if boxPlotter.plot_hues is None: cat = boxName else: cat = boxName[0] hue = boxName[1] i = boxPlotter.group_names.index(cat) group_data = boxPlotter.plot_data[i] if boxPlotter.plot_hues is None: # Draw a single box or a set of boxes # with a single level of grouping box_data = remove_na(group_data) else: hue_level = hue hue_mask = boxPlotter.plot_hues[i] == hue_level box_data = remove_na(group_data[hue_mask]) return box_data fig = plt.gcf() validList = ['inside', 'outside'] if loc not in validList: raise ValueError("loc value should be one of the following: {}.".format(', '.join(validList))) validList = ['t-test_ind', 't-test_paired', 'Mann-Whitney'] if test not in validList: raise ValueError("test value should be one of the following: {}.".format(', '.join(validList)))