def get_vpstats(dataset, points=500, bw_method=None):
    """Return violin-plot statistics for ``dataset`` using a Gaussian KDE.

    Parameters
    ----------
    dataset
        Data handed unchanged to ``cbook.violin_stats``.
    points
        Number of evaluation points, forwarded to ``cbook.violin_stats``.
    bw_method
        Bandwidth selector forwarded to ``mlab.GaussianKDE``.

    Returns
    -------
    Whatever ``cbook.violin_stats`` returns for the given density estimator.
    """

    def _density(sample, grid):
        # A constant sample has no spread for the KDE to work with, so the
        # density is an indicator of the grid points equal to that value.
        first = sample[0]
        if np.all(first == sample):
            return (first == grid).astype(float)
        return mlab.GaussianKDE(sample, bw_method).evaluate(grid)

    return cbook.violin_stats(dataset, _density, points=points)
def get_violinstats(dataset, points=100, bw_method=None):
    """Compute violin statistics for ``dataset`` with a Gaussian KDE.

    ``dataset`` and ``points`` are forwarded to ``cbook.violin_stats``;
    ``bw_method`` is forwarded to ``mlab.GaussianKDE``. The return value is
    the result of ``cbook.violin_stats``.
    """

    def _estimate(values, coords):
        all_equal = np.all(values[0] == values)
        if not all_equal:
            estimator = mlab.GaussianKDE(values, bw_method)
            return estimator.evaluate(coords)
        # Fall back gracefully if the vector contains only one distinct
        # value: mark the coordinates that coincide with it.
        return (values[0] == coords).astype(float)

    stats = cbook.violin_stats(dataset, _estimate, points=points)
    return stats
def plot_bindist(obs_table, b, base_pdf=None):
    """Yield a figure with the distribution of bin ``b`` for every result.

    Parameters
    ----------
    obs_table
        Table exposing ``Observable``, ``PDF`` and ``Result`` columns. It
        must contain exactly one distinct observable.
    b
        Zero-based bin index (shown one-based in the title).
    base_pdf
        Optional reference PDF. When given, each result's violin data is
        requested relative to the central value of the matching result.

    Yields
    ------
    ``((observable, b), figure)``

    Raises
    ------
    ValueError
        If ``obs_table`` holds more or fewer than one observable.
    """

    def _density(sample, grid):
        return mlab.GaussianKDE(sample, None).evaluate(grid)

    observables = obs_table.Observable.unique()
    if len(observables) != 1:
        raise ValueError("Must be only one observable")
    obs = observables[0]

    figure, ax = plt.subplots()
    plt.title("%s [Bin %d]" % (obs, b+1))
    colors = plotutils.color_names_to_rgb(colorlist)

    base = None
    if base_pdf:
        base = obs_table[obs_table.PDF.get_values()==base_pdf].Result[0]

    # Every successive curve is filled with half the opacity of the
    # previous one so that overlapping distributions stay visible.
    alpha = 1
    for result in obs_table.Result.unique():
        if base is None:
            data = result._violin_data()
        else:
            reference = base.central_value.as_matrix()
            data = result._violin_data(rel_to=reference)

        # ``_violin_data`` may hand back ready-made statistics (a list) or
        # raw samples that still need a KDE.
        if not isinstance(data, list):
            data = violin_stats(data, _density)
        stats = data[b]

        color = next(colors)
        fill_color = color + (alpha,)
        plt.plot(stats['coords'], stats['vals'], color=color,
                 label=result.pdf.label)
        plt.fill_between(stats['coords'], 0, stats['vals'], color=fill_color)
        alpha /= 2

    plt.ylabel("Distribution")
    xlabel = 'Ratio to %s' % base_pdf.label if base_pdf else "Observable value"
    plt.xlabel(xlabel)

    ax.yaxis.set_major_locator(MaxNLocator(nbins=10, prune="both"))
    ax.xaxis.set_major_locator(MaxNLocator(nbins=8, prune="both"))
    plt.legend()

    yield (obs, b), figure
def custom_violin_stats(data, weights):
    """Return violin statistics whose mean and median honour ``weights``.

    The density comes from ``violin_stats`` driven by the estimator that
    ``vdensity_with_weights(weights)`` builds. The ``"mean"`` and
    ``"median"`` entries of the first result are then replaced by their
    weighted counterparts; the other entries (such as the extrema) are left
    exactly as ``violin_stats`` produced them.
    """
    # Weighted median via the wquantiles module, weighted mean via numpy.
    weighted_median = wquantiles.quantile_1D(data, weights, 0.5)
    weighted_mean, _ = np.ma.average(data, weights=list(weights),
                                     returned=True)

    # violin_stats expects a callable taking (data, coords); the weights
    # are captured by the closure returned here.
    density = vdensity_with_weights(weights)
    results = violin_stats(data, density)

    first = results[0]
    first[u"mean"] = weighted_mean
    first[u"median"] = weighted_median
    return results
def custom_violin_stats(data, weights):
    """Compute violin statistics with a weighted mean and median.

    Taken from
    https://colab.research.google.com/drive/1cSnJGKJEqbllkPbF2z0cnfdwT40sUKKR#scrollTo=RIcLIr5XJRmx

    ``violin_stats`` is run with the density estimator built by
    ``vdensity_with_weights(weights)``; afterwards the ``"mean"`` and
    ``"median"`` entries of its first result are overwritten with the
    weighted values. Remaining entries are kept as ``violin_stats``
    populated them.
    """
    # Weighted median from the ``weighted`` module, weighted mean from numpy
    # (the returned sum of weights is not needed).
    median = weighted.quantile_1D(data, weights, 0.5)
    mean = np.ma.average(data, weights=list(weights), returned=True)[0]

    # The (data, coords) callable that violin_stats needs carries the
    # weights through a closure.
    results = violin_stats(data, vdensity_with_weights(weights))

    # Replace only the two statistics that depend on the weights.
    for key, value in ((u"mean", mean), (u"median", median)):
        results[0][key] = value
    return results
def violin_plot(data, normvalues=None, ax=None, bw_method=None, **kwargs):
    """Draw violins with a translucent fill and a solid outline.

    Each violin is drawn twice through ``ax.violin``: one set of bodies
    carries the fill (and optional hatch), the other only the edge.

    Parameters
    ----------
    data
        Either a list of precomputed violin statistics (dicts with at least
        a ``'vals'`` entry) or raw data handed to ``violin_stats``.
    normvalues
        Optional scalar or sequence (one per violin). When given, each
        violin's width is ``normvalue * max(vals)``, overriding ``widths``.
    ax
        Axes-like object providing ``violin``. Defaults to the current
        pyplot axes.
    bw_method
        Bandwidth selector forwarded to ``mlab.GaussianKDE``.
    **kwargs
        ``color``, ``label`` and ``hatches`` are consumed here; ``widths``
        is validated and expanded to one value per violin; everything else
        is forwarded to ``ax.violin``. ``color`` may be an RGB/RGBA sequence
        or a colour string.

    Returns
    -------
    (vp, handle, ournorms)
        ``vp`` is the result of the first ``ax.violin`` call, ``handle`` a
        legend patch (``None`` without ``label``) and ``ournorms`` the list
        of ``width / max(vals)`` per violin.

    Raises
    ------
    ValueError
        If ``normvalues`` or ``widths`` has the wrong number of entries.
    """

    def _kde_method(X, coords):
        kde = mlab.GaussianKDE(X, bw_method)
        return kde.evaluate(coords)

    # Work on a copy so the keys consumed here do not reach ax.violin.
    myargs = dict(kwargs)
    color = myargs.pop('color', None)
    label = myargs.pop('label', None)
    hatches = myargs.pop('hatches', None)

    stats = data if isinstance(data, list) else violin_stats(data, _kde_method)
    N = len(stats)

    if normvalues is not None:
        if np.isscalar(normvalues):
            normvalues = [normvalues] * N
        elif len(normvalues) != N:
            raise ValueError("Incorrect number of normvalues")
        myargs['widths'] = [normval*np.max(stat['vals'])
                            for normval, stat in zip(normvalues, stats)]

    widths = myargs.get('widths', 0.5)
    if np.isscalar(widths):
        widths = [widths] * N
    elif len(widths) != N:
        raise ValueError("Incorrect number of widths")
    myargs['widths'] = widths

    ournorms = [w/np.max(stat['vals']) for w, stat in zip(widths, stats)]

    if ax is None:
        # The signature advertises ax=None; fall back to the current axes
        # instead of failing on ``None.violin``.
        import matplotlib.pyplot as plt
        ax = plt.gca()

    vp = ax.violin(stats, **myargs)
    vp_edge = ax.violin(stats, **myargs)

    if color:
        # A colour string must be passed through whole: slicing it or
        # testing its length (as for RGBA tuples) would mangle names such
        # as 'green' or 'blue'.
        named = isinstance(color, str)
        has_alpha = not named and len(color) == 4
        outline = color if named else color[:3]

    for pc, edge in zip(vp['bodies'], vp_edge['bodies']):
        if color:
            if has_alpha:
                pc.set_alpha(color[3])
                edge.set_alpha(1)
            pc.set_facecolor(color)
            pc.set_edgecolor('none')
            edge.set_edgecolor(outline)
            edge.set_facecolor('none')
        if hatches:
            pc.set_hatch(hatches)

    if label:
        if not color:
            color = vp['bodies'][0].get_facecolor()[0]
        vp['bodies'][0].set_label(label)
        legend_edge = color if isinstance(color, str) else color[:3]
        handle = mpatches.Patch(facecolor=color, label=label,
                                hatch=hatches, edgecolor=legend_edge)
    else:
        handle = None

    return vp, handle, ournorms
def sinaplot(dataset, positions=None, vert=True, widths=0.5,
             showmeans=False, showextrema=True, showmedians=False,
             scaled=True, show_violin=False, points=100, bw_method=None,
             ax=None, scatter_kwargs=None, line_kwargs=None):
    """
    A cross between a violinplot and a scatterplot, from the R package
    sinaplot. Reimplemented by ripping off the violinplot function from
    matplotlib and tweaking a few bits.

    Every data point is drawn at its own value and displaced sideways by a
    uniform random amount bounded by the kernel density estimate at that
    value.

    Parameters
    ----------
    dataset
        Data passed to ``cbook.violin_stats`` and indexed as
        ``dataset[:, i]``, i.e. a 2-D array with one column per group.
    positions
        Sequence with one position per group; defaults to ``1..N``.
    vert
        Draw the groups vertically if true, horizontally otherwise.
    widths
        Scalar or sequence with one width per group.
    showmeans, showextrema, showmedians
        Toggle the corresponding statistic lines.
    scaled
        If true, all groups share one scale factor (the largest jitter
        overall); otherwise each group is scaled by its own largest jitter.
    show_violin
        Also fill the density outline behind the points.
    points
        Number of KDE evaluation points.
    bw_method
        Bandwidth selector forwarded to ``GaussianKDE``.
    ax
        Axes to draw on; a new ``plt.subplot()`` is used when omitted.
    scatter_kwargs
        Keyword arguments for the scatter call. ``color`` may be a single
        string or a sequence with one colour per group. The dict is not
        modified.
    line_kwargs
        Keyword arguments for the statistic lines.

    Returns
    -------
    dict
        Artists keyed by ``'scatter'`` and, depending on the flags,
        ``'bodies'``, ``'cmeans'``, ``'cmaxes'``, ``'cmins'``, ``'cbars'``
        and ``'cmedians'``.

    Raises
    ------
    ValueError
        If ``positions``, ``widths`` or the colours do not have one entry
        per group.
    """

    def _kde_method(X, coords):
        # fallback gracefully if the vector contains only one value
        if np.all(X[0] == X):
            return (X[0] == coords).astype(float)
        kde = GaussianKDE(X, bw_method)
        return kde.evaluate(coords)

    vpstats = cbook.violin_stats(dataset, _kde_method, points=points)

    if ax is None:
        ax = plt.subplot()

    if scatter_kwargs is None:
        scatter_kwargs = dict(s=20, color='b', marker='o', alpha=0.9)
    else:
        # Copy so that extracting the colour does not mutate the caller's
        # dict (which would break a second call with the same dict).
        scatter_kwargs = dict(scatter_kwargs)
    # Fall back to the default colour when the caller did not supply one.
    scatter_color = scatter_kwargs.pop('color', 'b')

    if line_kwargs is None:
        line_kwargs = dict(color='red', linewidth='1')

    # Collections to be returned
    artists = {}

    N = len(vpstats)
    datashape_message = ("List of violinplot statistics and `{0}` "
                         "values must have the same length")

    # Validate positions
    if positions is None:
        positions = np.arange(1, N + 1)
    elif len(positions) != N:
        raise ValueError(datashape_message.format("positions"))
    # asarray lets plain lists/tuples through; arrays are used as they are.
    positions = np.asarray(positions).reshape((N, 1))

    # Validate widths
    if np.isscalar(widths):
        widths = np.ones((N, 1)) * widths
    elif len(widths) != N:
        raise ValueError(datashape_message.format("widths"))
    widths = np.asarray(widths).reshape((N, 1))

    # Validate colors
    if isinstance(scatter_color, str):
        scatter_color = [scatter_color] * N
    elif len(scatter_color) != N:
        raise ValueError(datashape_message.format("scatter_color"))

    # Calculate ranges for statistics lines
    pmins = -0.25 * np.array(widths) + positions
    pmaxes = 0.25 * np.array(widths) + positions

    # Check whether we are rendering vertically or horizontally
    if vert:
        fill = ax.fill_betweenx
        perp_lines = ax.hlines
        par_lines = ax.vlines
    else:
        fill = ax.fill_between
        perp_lines = ax.vlines
        par_lines = ax.hlines

    # Calculate jittered scatter positions and render sinaplots
    jittered = []
    scatter_color_flattened = []
    for i, stats in enumerate(vpstats):
        x = dataset[:, i]
        # Uses numpy interpolate to estimate the limits for each point
        interp = np.interp(x, stats['coords'], stats['vals'])
        jittered.append(
            np.asarray([np.random.uniform(-r, r) for r in interp]))
        scatter_color_flattened.extend([scatter_color[i]] * len(x))

    # statistics for the means/medians/extremities
    means = [stats['mean'] for stats in vpstats]
    mins = [stats['min'] for stats in vpstats]
    maxes = [stats['max'] for stats in vpstats]
    medians = [stats['median'] for stats in vpstats]

    jittered = np.asarray(jittered)

    # get scale_factor (either scaled by largest value in all sinaplots or
    # not scaled)
    if scaled:
        scale_factor = np.ones((N, 1)) * jittered.max()
    else:
        scale_factor = jittered.max(1).reshape((N, 1))

    jittered = 0.5 * widths * jittered / scale_factor + positions

    # add background density plots
    if show_violin:
        # Render violins
        bodies = []
        for stats, pos, width, sf, col in zip(vpstats, positions, widths,
                                              scale_factor, scatter_color):
            # The 0.5 factor reflects the fact that we plot from v-p to
            # v+p
            vals = 0.5 * width * np.array(stats['vals']) / sf
            bodies.append(fill(stats['coords'], -vals + pos, vals + pos,
                               facecolor=col, alpha=0.2))
        artists['bodies'] = bodies

    # plot scatterplot on the requested axes (not on pyplot's current axes,
    # which may be a different one when ``ax`` is passed in)
    values = dataset.T.flatten()
    offsets = jittered.flatten()
    if vert:
        artists['scatter'] = ax.scatter(offsets, values,
                                        color=scatter_color_flattened,
                                        **scatter_kwargs)
    else:
        artists['scatter'] = ax.scatter(values, offsets,
                                        color=scatter_color_flattened,
                                        **scatter_kwargs)

    # Render means
    if showmeans:
        artists['cmeans'] = perp_lines(means, pmins, pmaxes, **line_kwargs)

    # Render extrema
    if showextrema:
        artists['cmaxes'] = perp_lines(maxes, pmins, pmaxes, **line_kwargs)
        artists['cmins'] = perp_lines(mins, pmins, pmaxes, **line_kwargs)
        artists['cbars'] = par_lines(positions, mins, maxes, **line_kwargs)

    # Render medians
    if showmedians:
        artists['cmedians'] = perp_lines(medians, pmins, pmaxes,
                                         **line_kwargs)

    return artists
def sinaplot(dataset, positions=None, vert=True, widths=0.5,
             showmeans=False, showextrema=True, showmedians=False,
             scaled=True, show_violin=False, points=100, bw_method=None,
             ax=None, scatter_kwargs=None, line_kwargs=None):
    """
    A cross between a violinplot and a scatterplot, from the R package
    sinaplot. Reimplemented by ripping off the violinplot function from
    matplotlib and tweaking a few bits.

    Every data point is drawn at its own value and displaced sideways by a
    uniform random amount bounded by the kernel density estimate at that
    value.

    Parameters
    ----------
    dataset
        Data passed to ``cbook.violin_stats`` and indexed as
        ``dataset[:, i]``, i.e. a 2-D array with one column per group.
    positions
        Sequence with one position per group; defaults to ``1..N``.
    vert
        Draw the groups vertically if true, horizontally otherwise.
    widths
        Scalar or sequence with one width per group.
    showmeans, showextrema, showmedians
        Toggle the corresponding statistic lines.
    scaled
        If true, all groups share one scale factor (the largest jitter
        overall); otherwise each group is scaled by its own largest jitter.
    show_violin
        Also fill the density outline behind the points.
    points
        Number of KDE evaluation points.
    bw_method
        Bandwidth selector forwarded to ``GaussianKDE``.
    ax
        Axes to draw on; a new ``plt.subplot()`` is used when omitted.
    scatter_kwargs
        Keyword arguments for the scatter call. ``color`` may be a single
        string or a sequence with one colour per group. The dict is not
        modified.
    line_kwargs
        Keyword arguments for the statistic lines.

    Returns
    -------
    dict
        Artists keyed by ``'scatter'`` and, depending on the flags,
        ``'bodies'``, ``'cmeans'``, ``'cmaxes'``, ``'cmins'``, ``'cbars'``
        and ``'cmedians'``.

    Raises
    ------
    ValueError
        If ``positions``, ``widths`` or the colours do not have one entry
        per group.
    """

    def _kde_method(X, coords):
        # fallback gracefully if the vector contains only one value
        if np.all(X[0] == X):
            return (X[0] == coords).astype(float)
        kde = GaussianKDE(X, bw_method)
        return kde.evaluate(coords)

    vpstats = cbook.violin_stats(dataset, _kde_method, points=points)

    if ax is None:
        ax = plt.subplot()

    if scatter_kwargs is None:
        scatter_kwargs = dict(s=20, color='b', marker='o', alpha=0.9)
    else:
        # Copy so that extracting the colour does not mutate the caller's
        # dict (which would break a second call with the same dict).
        scatter_kwargs = dict(scatter_kwargs)
    # Fall back to the default colour when the caller did not supply one.
    scatter_color = scatter_kwargs.pop('color', 'b')

    if line_kwargs is None:
        line_kwargs = dict(color='red', linewidth='1')

    # Collections to be returned
    artists = {}

    N = len(vpstats)
    datashape_message = ("List of violinplot statistics and `{0}` "
                         "values must have the same length")

    # Validate positions
    if positions is None:
        positions = np.arange(1, N + 1)
    elif len(positions) != N:
        raise ValueError(datashape_message.format("positions"))
    # asarray lets plain lists/tuples through; arrays are used as they are.
    positions = np.asarray(positions).reshape((N, 1))

    # Validate widths
    if np.isscalar(widths):
        widths = np.ones((N, 1)) * widths
    elif len(widths) != N:
        raise ValueError(datashape_message.format("widths"))
    widths = np.asarray(widths).reshape((N, 1))

    # Validate colors
    if isinstance(scatter_color, str):
        scatter_color = [scatter_color] * N
    elif len(scatter_color) != N:
        raise ValueError(datashape_message.format("scatter_color"))

    # Calculate ranges for statistics lines
    pmins = -0.25 * np.array(widths) + positions
    pmaxes = 0.25 * np.array(widths) + positions

    # Check whether we are rendering vertically or horizontally
    if vert:
        fill = ax.fill_betweenx
        perp_lines = ax.hlines
        par_lines = ax.vlines
    else:
        fill = ax.fill_between
        perp_lines = ax.vlines
        par_lines = ax.hlines

    # Calculate jittered scatter positions and render sinaplots
    jittered = []
    scatter_color_flattened = []
    for i, stats in enumerate(vpstats):
        x = dataset[:, i]
        # Uses numpy interpolate to estimate the limits for each point
        interp = np.interp(x, stats['coords'], stats['vals'])
        jittered.append(
            np.asarray([np.random.uniform(-r, r) for r in interp]))
        scatter_color_flattened.extend([scatter_color[i]] * len(x))

    # statistics for the means/medians/extremities
    means = [stats['mean'] for stats in vpstats]
    mins = [stats['min'] for stats in vpstats]
    maxes = [stats['max'] for stats in vpstats]
    medians = [stats['median'] for stats in vpstats]

    jittered = np.asarray(jittered)

    # get scale_factor (either scaled by largest value in all sinaplots or
    # not scaled)
    if scaled:
        scale_factor = np.ones((N, 1)) * jittered.max()
    else:
        scale_factor = jittered.max(1).reshape((N, 1))

    jittered = 0.5 * widths * jittered / scale_factor + positions

    # add background density plots
    if show_violin:
        # Render violins
        bodies = []
        for stats, pos, width, sf, col in zip(vpstats, positions, widths,
                                              scale_factor, scatter_color):
            # The 0.5 factor reflects the fact that we plot from v-p to
            # v+p
            vals = 0.5 * width * np.array(stats['vals']) / sf
            bodies.append(fill(stats['coords'], -vals + pos, vals + pos,
                               facecolor=col, alpha=0.2))
        artists['bodies'] = bodies

    # plot scatterplot on the requested axes (not on pyplot's current axes,
    # which may be a different one when ``ax`` is passed in)
    values = dataset.T.flatten()
    offsets = jittered.flatten()
    if vert:
        artists['scatter'] = ax.scatter(offsets, values,
                                        color=scatter_color_flattened,
                                        **scatter_kwargs)
    else:
        artists['scatter'] = ax.scatter(values, offsets,
                                        color=scatter_color_flattened,
                                        **scatter_kwargs)

    # Render means
    if showmeans:
        artists['cmeans'] = perp_lines(means, pmins, pmaxes, **line_kwargs)

    # Render extrema
    if showextrema:
        artists['cmaxes'] = perp_lines(maxes, pmins, pmaxes, **line_kwargs)
        artists['cmins'] = perp_lines(mins, pmins, pmaxes, **line_kwargs)
        artists['cbars'] = par_lines(positions, mins, maxes, **line_kwargs)

    # Render medians
    if showmedians:
        artists['cmedians'] = perp_lines(medians, pmins, pmaxes,
                                         **line_kwargs)

    return artists