def bin_edges_f(bin_method, mags_cols_cl):
    """
    Build one array of bin edges per photometric dimension.

    ``mags_cols_cl[0]`` holds the magnitudes and ``mags_cols_cl[1]`` the
    colors; the returned list contains the edges for all magnitudes first
    and then for all colors, in the order in which they are stored.

    An unrecognized ``bin_method`` yields an empty list.
    """
    numpy_rules = ('auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt')

    def uniform_edges(values, n_bins):
        # Only the edges (second element) of the histogram are of interest.
        return np.histogram(values, bins=n_bins)[1]

    def stepped_edges(values, step):
        # At least two bins, otherwise one bin per 'step' of data range.
        n_bins = max(2, (max(values) - min(values)) / step)
        return uniform_edges(values, int(n_bins))

    if bin_method in numpy_rules:
        mag_edges = col_edges = lambda v: uniform_edges(v, bin_method)
    elif bin_method == 'fixed':
        # Based on Bonatto & Bica (2007) 377, 3, 1301-1323 but using values
        # larger than those used there (0.25 for colors, 0.5 for magnitudes)
        mag_edges = lambda v: stepped_edges(v, 1.)
        col_edges = lambda v: stepped_edges(v, .5)
    elif bin_method == 'knuth':
        mag_edges = col_edges = lambda v: knuth_bin_width(
            v, return_bins=True, quiet=True)[1]
    elif bin_method == 'blocks':
        mag_edges = col_edges = bayesian_blocks
    elif bin_method == 'man':
        # TODO this method is currently hidden from the params file.
        # To be used when #325 is implemented. Currently used to test
        # multi-dimensional likelihoods.
        #
        # For 4 to 6 dimensions this appears to be a somewhat reasonable rule
        # of thumb for the number of bins per dimension. Many small bins
        # match the features of the observed cluster better but benefit
        # larger mass values; fewer large bins match masses better but lose
        # the finer details of the cluster.
        d = len(mags_cols_cl[0]) + len(mags_cols_cl[1])
        per_dim = int([15, 10, 7][d - 4])
        mag_edges = col_edges = lambda v: uniform_edges(v, per_dim)
    else:
        return []

    edges = [mag_edges(mag) for mag in mags_cols_cl[0]]
    edges.extend(col_edges(col) for col in mags_cols_cl[1])
    return edges
def test_knuth_bin_width(N=10000, rseed=0):
    """Check Knuth binning on a seeded normal sample and its 1-D input check."""
    rng = np.random.RandomState(rseed)
    sample = rng.randn(N)

    # The width must be the same whether or not the edges are requested.
    width, edges = knuth_bin_width(sample, return_bins=True)
    assert_allclose(len(edges), 59)
    width_only = knuth_bin_width(sample)
    assert width == width_only

    # Two-dimensional input is rejected.
    two_dim = rng.rand(2, 10)
    with pytest.raises(ValueError):
        knuth_bin_width(two_dim)
def test_knuth_bin_width(N=10000, rseed=0):
    """Check Knuth binning on a seeded normal sample and its 1-D input check."""
    rng = np.random.default_rng(rseed)
    sample = rng.standard_normal(N)

    # The width must be the same whether or not the edges are requested.
    width, edges = knuth_bin_width(sample, return_bins=True)
    assert_allclose(len(edges), 58)
    width_only = knuth_bin_width(sample)
    assert width == width_only

    # Two-dimensional input is rejected.
    two_dim = rng.random((2, 10))
    with pytest.raises(ValueError):
        knuth_bin_width(two_dim)
def histogram(a, bins=10, range=None, **kwargs):
    """Enhanced histogram

    This is a histogram function that enables the use of more sophisticated
    algorithms for determining bins.  Aside from the `bins` argument allowing
    a string that specifies how bins are computed, the parameters are the
    same as numpy.histogram().

    Parameters
    ----------
    a : array_like
        array of data to be histogrammed
    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'blocks' : use bayesian blocks for dynamic bin widths
        'knuth' : use Knuth's rule to determine bins
        'scotts' : use Scott's rule to determine bins
        'freedman' : use the Freedman-diaconis rule to determine bins
        Any other value is forwarded unchanged to numpy.histogram().
    range : tuple or None (optional)
        the minimum and maximum range for the histogram.  If not specified,
        it will be (x.min(), x.max())

    other keyword arguments are described in numpy.histogram().

    Returns
    -------
    hist : array
        The values of the histogram, as returned by numpy.histogram().
    bin_edges : array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    Raises
    ------
    ValueError
        If `bins` is a string that is not one of the codes listed above.

    See Also
    --------
    numpy.histogram
    astroML.plotting.hist
    """
    a = np.asarray(a)

    # Only strings are compared against the rule names: testing an ndarray of
    # edges with ``in``/``==`` is an elementwise comparison whose truth value
    # is ambiguous.
    if isinstance(bins, str):
        if bins not in ('blocks', 'knuth', 'scotts', 'freedman'):
            raise ValueError("unrecognized bin code: '{}'".format(bins))

        # if range is specified, we need to truncate the data for
        # the bin-finding routines
        if range is not None:
            a = a[(a >= range[0]) & (a <= range[1])]

        if bins == 'blocks':
            bins = astropy_stats.bayesian_blocks(a)
        elif bins == 'knuth':
            da, bins = astropy_stats.knuth_bin_width(a, True)
        elif bins == 'scotts':
            da, bins = astropy_stats.scott_bin_width(a, True)
        else:
            da, bins = astropy_stats.freedman_bin_width(a, True)

    return np.histogram(a, bins, range, **kwargs)
def knuth_bw_selector(dat_list):
    """Selects the kde bandwidth using Knuth's rule implemented in Astropy

    If Knuth's rule raises an error for a data array, Scott's rule is used
    for that array instead.

    Parameters
    ----------
    dat_list : list
        List of data arrays that will be used to generate a kde

    Returns
    -------
    bw_mean : float
        Mean of the bandwidths computed for all of the data arrays in
        dat_list
    """
    bw_list = []
    for dat in dat_list:
        try:
            bw = astrostats.knuth_bin_width(dat)
        except Exception:
            # 'Exception' rather than a bare except, so that
            # KeyboardInterrupt and SystemExit still propagate.
            print('Using Scott Rule!!')
            bw = astrostats.scott_bin_width(dat)
        bw_list.append(bw)
    return np.mean(bw_list)
def f(x, mode):
    """Return histogram bin edges for ``x``.

    ``mode == "knuth"`` uses astropy's Knuth rule; any other value is handed
    to ``np.histogram_bin_edges`` as its ``bins`` argument.
    """
    if mode != "knuth":
        return np.histogram_bin_edges(x, bins=mode)

    # https://docs.astropy.org/en/stable/api/astropy.stats.knuth_bin_width.html
    from astropy.stats import knuth_bin_width

    # With return_bins=True the call returns (width, edges); keep the edges.
    edges = knuth_bin_width(x, return_bins=True)[1]
    return edges
def cmd_hist(self, upper, lower, colour, cmd_colour):
    """Draw a histogram of ``colour`` on a new figure, titled ``cmd_colour``.

    The bin width comes from ``stats.knuth_bin_width`` and the edges run
    from the smallest value to just past the largest one. ``upper`` and
    ``lower`` are accepted but not used here.
    """
    plt.figure()
    width = stats.knuth_bin_width(colour)
    # Adding one width to the stop value makes sure the maximum is covered.
    edges = np.arange(min(colour), max(colour) + width, width)
    plt.hist(colour, bins=edges, label='Binned Data')
    plt.title(cmd_colour)
def prepObsMass(obs_mass, bin_edges):
    """
    Obtain the histogram of the observed masses.

    Parameters
    ----------
    obs_mass : array_like
        Values to be histogrammed.
    bin_edges : str, int or array_like
        'knuth' selects the edges with Knuth's rule and 'block' with
        Bayesian blocks. Anything else (another string, a number of bins, or
        an explicit sequence of edges) is passed to ``np.histogram`` as its
        ``bins`` argument.

    Returns
    -------
    list
        ``[bin_edges, cl_histo]``: the edges used and the counts per bin.
    """
    # Compare against the method names only when a string was given: an
    # ndarray of edges compared with '==' gives an elementwise result whose
    # truth value is ambiguous.
    if isinstance(bin_edges, str):
        if bin_edges == 'knuth':
            bin_edges = knuth_bin_width(
                obs_mass, return_bins=True, quiet=True)[1]
        elif bin_edges == 'block':
            bin_edges = bayesian_blocks(obs_mass)

    cl_histo, bin_edges = np.histogram(obs_mass, bins=bin_edges)
    return [bin_edges, cl_histo]
def get_bin_sizes_xy(x, y, algo='scott'):
    """Get bin edges and widths for two samples, to lower the binning bias.

    Parameters
    ----------
    x, y : array_like
        Samples to bin; the same rule is applied to each independently.
    algo : str
        One of 'scott', 'knuth' or 'freedman'.

    Returns
    -------
    tuple
        ``(bins_x, bins_y, width_x, width_y)``.

    Raises
    ------
    NotImplementedError
        If ``algo`` is not one of the supported rules.
    """
    from astropy.stats import (
        freedman_bin_width, scott_bin_width, knuth_bin_width)

    logger.info(" > Get smart bin sizes in 2D")

    rules = {
        'scott': scott_bin_width,
        'knuth': knuth_bin_width,
        'freedman': freedman_bin_width,
    }
    rule = rules.get(algo) if isinstance(algo, str) else None
    if rule is None:
        raise NotImplementedError("use scott, knuth or freedman")

    logger.info("use %s rule of thumb", algo)
    width_x, bins_x = rule(x, return_bins=True)
    width_y, bins_y = rule(y, return_bins=True)
    return bins_x, bins_y, width_x, width_y
def calc_bin(data, mode):
    """Return a bin-count estimate for ``data`` using the rule ``mode``.

    Parameters
    ----------
    data : sequence
        Sample to be binned; must support ``len``.
    mode : str
        'sqrt' returns ``sqrt(n)`` and 'sturges' returns ``log2(n) + 1``;
        neither is rounded, so both are floats. 'scott', 'freedman' and
        'knuth' return the length of the edge array produced by the matching
        ``*_bin_width`` function.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the rules above.
    """
    n = len(data)
    if mode == "sqrt":
        return np.sqrt(n)
    if mode == "sturges":
        return np.log2(n) + 1

    if mode == "scott":
        rule = scott_bin_width
    elif mode == "freedman":
        rule = freedman_bin_width
    elif mode == "knuth":
        rule = knuth_bin_width
    else:
        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError("unrecognized bin mode: '{}'".format(mode))

    _, edges = rule(data, return_bins=True)
    # NOTE(review): this is the number of edges (one more than the number of
    # bins); kept as-is for callers -- confirm that is what they expect.
    return len(edges)
def cmd_kde(self, upper, lower, colour):
    """Plot a Gaussian KDE of ``colour`` over its normalised histogram.

    The Knuth-rule bin width sets the histogram edges and is also passed as
    the KDE's ``bw_method``. ``upper`` and ``lower`` are accepted but not
    used here. The figure is shown before returning.
    """
    plt.figure()
    print(colour)
    width = stats.knuth_bin_width(colour)
    # Adding one width to the stop value makes sure the maximum is covered.
    edges = np.arange(min(colour), max(colour) + width, width)
    # Evaluate the density slightly beyond both ends of the data.
    grid = np.linspace(colour.min() - 1.0, colour.max() + 1.0, 500)
    # NOTE(review): a scalar bw_method is used by scipy as the KDE factor
    # rather than as an absolute bandwidth -- confirm this is intended.
    density = gaussian_kde(colour, bw_method=width)
    plt.plot(grid, density(grid), 'k', lw=2, label='KDE')
    plt.hist(colour, bins=edges, density=True, label='Binned Data')
    plt.legend()
    plt.xlabel('$(J-K)_0$')
    plt.ylabel('Normalised Density')
    plt.show()
def plotHistWithKnuth(data, axis, x_label=""):
    """
    Plot a normalised histogram of the data with Knuth-rule bins, together
    with the best-fitting Gaussian.

    data: data to be plotted. It must be a pandas Series (its ``count()``
        and ``name`` are used).
    axis: matplotlib axis instance to draw on.
    x_label: label for the x axis. The default "" means the name of the
        series is used.
    """
    from scipy.stats import norm

    # Obtain the bins using Knuth's rule
    dx, bins = knuth_bin_width(data, return_bins=True)

    # Plot the bins
    axis.hist(data, bins, density=True)

    # Obtain the gaussian distribution that best fits the data
    mu, sigma = norm.fit(data)
    x = np.linspace(round(data.min()), round(data.max()), 100)  # points to draw
    y = norm.pdf(x, mu, sigma)  # value of the gaussian at those points
    axis.plot(x, y, 'r--', linewidth=2)

    # Print some valuable info
    axis.text(1.0, 1.15,
              'Número total de trayectorias = ' + str(data.count()),
              verticalalignment='top', horizontalalignment='right',
              transform=axis.transAxes, fontsize=20)
    # Backslashes of the TeX commands are escaped explicitly ('\m' and '\s'
    # are invalid escape sequences); the '\n' is a real newline.
    axis.text(1.0, 0.9,
              '$\\mu =${0:.3f} \n $\\sigma =${1:.3f}'.format(mu, sigma),
              verticalalignment='top', horizontalalignment='right',
              transform=axis.transAxes, fontsize=20)

    # Use the given axis name, or default to the name of the data series
    axis.set_xlabel(x_label if len(x_label) else data.name)

    axis.grid()
    axis.set_ylabel('Número de trayectorias (normalizado)')
def knuth_bin_width(data, return_bins=False, disp=True):
    r"""Return the optimal histogram bin width using Knuth's rule [1]_

    This is a thin wrapper: ``data`` and ``return_bins`` are forwarded
    positionally to ``astropy_stats.knuth_bin_width`` and its result is
    returned unchanged.

    Parameters
    ----------
    data : array-like, ndim=1
        observed (one-dimensional) data
    return_bins : bool (optional)
        if True, then return the bin edges
    disp : bool (optional)
        accepted but not used by this wrapper; it is not forwarded.

    Returns
    -------
    dx : float
        optimal bin width. Bins are measured starting at the first data point.
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal number of bins is the value M which maximizes the function

    .. math::
        F(M|x,I) = n\log(M) + \log\Gamma(\frac{M}{2})
        - M\log\Gamma(\frac{1}{2})
        - \log\Gamma(\frac{2n+M}{2})
        + \sum_{k=1}^M \log\Gamma(n_k + \frac{1}{2})

    where :math:`\Gamma` is the Gamma function, :math:`n` is the number of
    data points, :math:`n_k` is the number of measurements in bin :math:`k`.

    References
    ----------
    .. [1] Knuth, K.H. "Optimal Data-Based Binning for Histograms".
           arXiv:0605197, 2006

    See Also
    --------
    KnuthF
    freedman_bin_width
    scotts_bin_width
    """
    return astropy_stats.knuth_bin_width(data, return_bins)
def knuth_bin_width(data, return_bins=False, disp=True):
    r"""Return the optimal histogram bin width using Knuth's rule [1]_

    This is a thin wrapper: ``data`` and ``return_bins`` are forwarded
    positionally to ``astropy_stats.knuth_bin_width`` and its result is
    returned unchanged.

    Parameters
    ----------
    data : array-like, ndim=1
        observed (one-dimensional) data
    return_bins : bool (optional)
        if True, then return the bin edges
    disp : bool (optional)
        accepted but not used by this wrapper; it is not forwarded.

    Returns
    -------
    dx : float
        optimal bin width. Bins are measured starting at the first data point.
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal number of bins is the value M which maximizes the function

    .. math::
        F(M|x,I) = n\log(M) + \log\Gamma(\frac{M}{2})
        - M\log\Gamma(\frac{1}{2})
        - \log\Gamma(\frac{2n+M}{2})
        + \sum_{k=1}^M \log\Gamma(n_k + \frac{1}{2})

    where :math:`\Gamma` is the Gamma function, :math:`n` is the number of
    data points, :math:`n_k` is the number of measurements in bin :math:`k`.

    References
    ----------
    .. [1] Knuth, K.H. "Optimal Data-Based Binning for Histograms".
           arXiv:0605197, 2006

    See Also
    --------
    KnuthF
    freedman_bin_width
    scotts_bin_width
    """
    return astropy_stats.knuth_bin_width(data, return_bins)
def get_bin_sizes_x(x, algo='scott'):
    """Get bin edges and width for one sample, to lower the binning bias.

    Parameters
    ----------
    x : array_like
        Sample to bin.
    algo : str
        One of 'scott', 'knuth', 'freedman' or 'blocks'.

    Returns
    -------
    tuple
        ``(bins_x, width_x)``. For the fixed-width rules ``width_x`` is the
        single width returned by the rule; for 'blocks' the bins have
        variable width, so ``width_x`` is the array of widths of consecutive
        bins.

    Raises
    ------
    NotImplementedError
        If ``algo`` is not one of the supported rules.
    """
    from astropy.stats import (
        freedman_bin_width, scott_bin_width, knuth_bin_width,
        bayesian_blocks)

    logger.info(" > Get smart bin sizes in 1D")

    if algo == 'scott':
        logger.info("use scott rule of thumb")
        width_x, bins_x = scott_bin_width(x, return_bins=True)
    elif algo == 'knuth':
        logger.info("use knuth rule of thumb")
        width_x, bins_x = knuth_bin_width(x, return_bins=True)
    elif algo == 'freedman':
        logger.info("use freedman rule of thumb")
        width_x, bins_x = freedman_bin_width(x, return_bins=True)
    elif algo == 'blocks':
        logger.info("use bayesian blocks rule of thumb")
        # bayesian_blocks returns only the array of edges and takes no
        # 'return_bins' argument, so the widths are derived from the edges.
        bins_x = bayesian_blocks(x)
        width_x = bins_x[1:] - bins_x[:-1]
    else:
        raise NotImplementedError("use scott, knuth, freedman or blocks")

    return bins_x, width_x
def hist(x, bins=10, range=None, *args, **kwargs):
    """Enhanced histogram

    This is a histogram function that enables the use of more sophisticated
    algorithms for determining bins.  Aside from the `bins` argument allowing
    a string that specifies how bins are computed, the parameters are the
    same as pylab.hist().

    Parameters
    ----------
    x : array_like
        array of data to be histogrammed

    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'blocks' : use bayesian blocks for dynamic bin widths
        'knuth' (or 'knuths') : use Knuth's rule to determine bins
        'scott' (or 'scotts') : use Scott's rule to determine bins
        'freedman' (or 'freedmans') : use the Freedman-diaconis rule to
        determine bins

    range : tuple or None (optional)
        the minimum and maximum range for the histogram.  If not specified,
        it will be (x.min(), x.max())

    ax : Axes instance (optional)
        specify the Axes on which to draw the histogram.  If not specified,
        then the current active axes will be used.

    **kwargs :
        other keyword arguments are described in pylab.hist().
        `weights` is dropped with a warning when `bins` is a string.

    Extra positional arguments (``*args``) are accepted but not forwarded.

    Raises
    ------
    ValueError
        If `bins` is a string that is not one of the codes listed above.
    """
    if isinstance(bins, str) and "weights" in kwargs:
        warnings.warn("weights argument is not supported: it will be ignored.")
        kwargs.pop('weights')

    x = np.asarray(x)

    if 'ax' in kwargs:
        ax = kwargs.pop('ax')
    else:
        # import here so that testing with Agg will work
        from matplotlib import pyplot as plt
        ax = plt.gca()

    # Only strings are compared against the rule names: testing an ndarray of
    # edges with ``in`` is an elementwise comparison whose truth value is
    # ambiguous.
    if isinstance(bins, str):
        if bins not in ('blocks', 'knuth', 'knuths', 'scott', 'scotts',
                        'freedman', 'freedmans'):
            raise ValueError("unrecognized bin code: '{}'".format(bins))

        # if range is specified, we need to truncate the data for
        # the bin-finding routines
        if range is not None:
            x = x[(x >= range[0]) & (x <= range[1])]

        if bins == 'blocks':
            bins = bayesian_blocks(x)
        elif bins in ('knuth', 'knuths'):
            dx, bins = knuth_bin_width(x, True)
        elif bins in ('scott', 'scotts'):
            dx, bins = scott_bin_width(x, True)
        else:
            dx, bins = freedman_bin_width(x, True)

    return ax.hist(x, bins, range, **kwargs)
def match(dataCm):
    """Performs the Match calculation in Eq. 1 of Breivik & Larson (2018)

    Parameters
    ----------
    dataCm : list
        List of two cumulative data sets for a single parameter

    Returns
    -------
    match : float
        log10(1 - match) computed from the two normalised histograms, or
        1e-9 when the bin width is below 1e-7
    binwidth : float
        Binwidth of histograms used for match computation
    """
    # COMPUTE THE BINWIDTH FOR THE MOST COMPLETE DATA SET:
    # NOTE: THIS WILL BE THE BINWIDTH FOR BOTH HISTOGRAMS
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", message="divide by zero encountered in double_scalars")
        try:
            bw, binEdges = astroStats.knuth_bin_width(
                np.array(dataCm[0]), return_bins=True)
        except Exception:
            # Fall back to Scott's rule when Knuth's rule fails.
            bw, binEdges = astroStats.scott_bin_width(
                np.array(dataCm[0]), return_bins=True)

    # Never bin finer than 1e-4.
    if bw < 1e-4:
        bw = 1e-4
        binEdges = np.arange(binEdges[0], binEdges[-1], bw)

    # BIN THE DATA:
    histo1, _ = astroStats.histogram(dataCm[0], bins=binEdges, density=True)
    histo2, _ = astroStats.histogram(dataCm[1], bins=binEdges, density=True)

    # COMPUTE THE MATCH:
    nominatorSum = np.sum(histo1 * histo2)
    denominator1Sum = np.sum(histo1 * histo1)
    denominator2Sum = np.sum(histo2 * histo2)

    binwidth = binEdges[1] - binEdges[0]
    if binwidth < 1e-7:
        # Return the scalar directly; it cannot be indexed like an array.
        return 1e-9, binwidth

    match_value = np.log10(
        1 - nominatorSum / np.sqrt(denominator1Sum * denominator2Sum))
    return match_value, binwidth
def knuth_bandwidth_determination(self, bw_selection='min'):
    """Set ``self.bw`` from the Knuth bin widths of the rows of ``self.data.T``.

    ``bw_selection`` names the array method used to reduce the per-row
    widths to one value (min, max or mean).
    """
    widths = [knuth_bin_width(column) for column in self.data.T]
    reducer = getattr(np.asarray(widths), bw_selection)
    self.bw = reducer()
def calc_bins_intervals(self, nbins=101, precision=None):
    r"""
    Calculate histogram bins and the matching intervals for every axis in
    `self._gb_axes`, storing the per-axis categorical intervals in
    `self._categoricals`.

    nbins: int, str, array-like
        If int, use `np.histogram_bin_edges` to calculate the bin edges.
        If str and nbins == "knuth" (case-insensitive), use
        `astropy.stats.knuth_bin_width` to calculate optimal bin widths.
        If str and nbins != "knuth", pass it to `np.histogram_bin_edges`
        as the bin specification.
        If array-like with one entry per axis, each entry is the bin
        specification for the corresponding axis; an array-like of any
        other length is used as-is for every axis.
    precision: int or None
        Precision at which to store intervals. If None, default to 5.
    """
    data = self.data
    bins = {}
    intervals = {}

    if precision is None:
        precision = 5

    gb_axes = self._gb_axes

    if isinstance(nbins, (str, int)) or (hasattr(nbins, "__iter__") and len(nbins) != len(gb_axes)):
        # Single parameter for `nbins`: apply it to every axis.
        nbins = {k: nbins for k in gb_axes}

    elif len(nbins) == len(gb_axes):
        # Passed one bin spec per axis
        nbins = {k: v for k, v in zip(gb_axes, nbins)}

    else:
        msg = f"Unrecognized `nbins`\ntype: {type(nbins)}\n bins:{nbins}"
        raise ValueError(msg)

    for k in self._gb_axes:
        b = nbins[k]
        # Numpy and Astropy don't like NaNs when calculating bins.
        # Infinities in bins (typically from log10(0)) also create problems.
        d = data.loc[:, k].replace([-np.inf, np.inf], np.nan).dropna()

        if isinstance(b, str):
            b = b.lower()

        if isinstance(b, str) and b == "knuth":
            # Referencing the name raises NameError when the import of
            # `knuth_bin_width` did not happen.
            try:
                assert knuth_bin_width
            except NameError:
                raise NameError("Astropy is unavailable.")

            dx, b = knuth_bin_width(d, return_bins=True)

        else:
            try:
                b = np.histogram_bin_edges(d, b)
            except MemoryError:
                # Clip the extremely large values and extremely small outliers.
                lo, up = d.quantile([0.0005, 0.9995])
                b = np.histogram_bin_edges(d.clip(lo, up), b)
            except AttributeError:
                # Fallback for when `np.histogram_bin_edges` is missing.
                c, b = np.histogram(d, b)

        # Sanity checks: edges must be unique and free of NaNs.
        assert np.unique(b).size == b.size
        try:
            assert not np.isnan(b).any()
        except TypeError:
            assert not b.isna().any()

        b = b.round(precision)

        # One right-closed interval per pair of consecutive edges.
        zipped = zip(b[:-1], b[1:])
        i = [pd.Interval(*b0b1, closed="right") for b0b1 in zipped]

        bins[k] = b
        # intervals[k] = pd.IntervalIndex(i)
        intervals[k] = pd.CategoricalIndex(i)

    # NOTE(review): `bins` is converted to a tuple but not stored or returned
    # in the code visible here -- confirm whether it should be kept.
    bins = tuple(bins.items())
    intervals = tuple(intervals.items())
    # self._intervals = intervals
    self._categoricals = intervals
def splitEnv(cluster, turn_off, isoch_phot, low_env_perc=50):
    """
    Split the stars in 'cluster' into single and binary systems, according
    to whether each star is closer to the lower envelope of the rotated
    sequence or to the binary envelope derived from it.

    Returns the cluster, the (lower, binary) envelopes, and the boolean
    masks for single and binary systems.

    TODO

    1. implement iterative outliers removal for the estimation of the MSRL
    2. don't use binary envelope. Instead use the following method:
      - estimate the MSRL
      - divide it in (rotated) magnitude bins
      - for each bin, count how many members there are below the MSRL
      - estimate the
    """
    # Optimal rotation angle, estimated using the best fit isochrone
    theta = rotIsoch(turn_off, isoch_phot)

    # Rotate the cluster by 'theta' around the point of maximum coordinates
    origin = (cluster[0].max(), cluster[1].max())
    cluster_rot = rotate(theta, cluster.T, origin).T
    rot_0, rot_1 = cluster_rot[0], cluster_rot[1]

    # Edges along the rotated sequence
    edges = knuth_bin_width(rot_1, return_bins=True, quiet=True)[1]
    # Drop the edges in the brightest portion
    edges = edges[edges > np.percentile(rot_1, .1)]
    # Add resolution to the low mass region: the last bin is split in four
    fine_edges = np.linspace(edges[-2], edges[-1], 5)
    edges = list(edges[:-2]) + list(fine_edges)

    # Lower envelope: for every non-empty bin, the requested percentile of
    # the first rotated coordinate at the center of the bin
    lower_env_rot = []
    for low, high in zip(edges[:-1], edges[1:]):
        in_bin = (rot_1 > low) & (rot_1 <= high)
        if in_bin.sum() > 0:
            center = (low + high) * .5
            lower_env_rot.append(
                [np.percentile(rot_0[in_bin], low_env_perc), center])

    # Rotate the lower envelope back to its original position
    lower_env = rotate(-theta, lower_env_rot, origin).T

    # Extend the envelope by 1 mag, extrapolating a line fitted to its last
    # three points
    poly = np.polyfit(lower_env[0][-3:], lower_env[1][-3:], deg=1)
    x_ext = lower_env[0][-1] + 1
    y_ext = np.polyval(poly, x_ext)
    lower_env = np.array([
        list(lower_env[0]) + [x_ext], list(lower_env[1]) + [y_ext]])

    # Binary envelope: the lower envelope combined with itself in magnitude
    mag_l, col_l = lower_env
    mag_binar = clusterHandle.mag_combine(mag_l, mag_l)

    # Generate extra points along both envelopes
    l_envelope = isochHandle.interp(lower_env)
    b_envelope = isochHandle.interp(np.array([mag_binar, col_l]))

    # cluster = remOutliers(cluster, l_envelope, col_max, mag_lim, delta)

    # Minimum distance of every star to each of the envelopes
    stars = cluster.T
    min_dist_l = cdist(stars, l_envelope.T).min(1)
    min_dist_b = cdist(stars, b_envelope.T).min(1)

    # A star at least as close to the binary envelope as to the lower one
    # (min_dist_l >= min_dist_b) is flagged as a binary system
    binar_msk = (min_dist_l - min_dist_b) >= 0
    single_msk = ~binar_msk

    return cluster, (l_envelope, b_envelope), single_msk, binar_msk
def hist(x, bins=10, range=None, *args, **kwargs):
    """Enhanced histogram

    This is a histogram function that enables the use of more sophisticated
    algorithms for determining bins.  Aside from the `bins` argument allowing
    a string that specifies how bins are computed, the parameters are the
    same as pylab.hist().

    Parameters
    ----------
    x : array_like
        array of data to be histogrammed

    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'blocks' : use bayesian blocks for dynamic bin widths
        'knuth' (or 'knuths') : use Knuth's rule to determine bins
        'scott' (or 'scotts') : use Scott's rule to determine bins
        'freedman' (or 'freedmans') : use the Freedman-diaconis rule to
        determine bins

    range : tuple or None (optional)
        the minimum and maximum range for the histogram.  If not specified,
        it will be (x.min(), x.max())

    ax : Axes instance (optional)
        specify the Axes on which to draw the histogram.  If not specified,
        then the current active axes will be used.

    **kwargs :
        other keyword arguments are described in pylab.hist().
        `weights` is dropped with a warning when `bins` is a string.

    Extra positional arguments (``*args``) are accepted but not forwarded.

    Raises
    ------
    ValueError
        If `bins` is a string that is not one of the codes listed above.
    """
    if isinstance(bins, str) and "weights" in kwargs:
        warnings.warn("weights argument is not supported: it will be ignored.")
        kwargs.pop('weights')

    x = np.asarray(x)

    if 'ax' in kwargs:
        ax = kwargs.pop('ax')
    else:
        # import here so that testing with Agg will work
        from matplotlib import pyplot as plt
        ax = plt.gca()

    # Only strings are compared against the rule names: testing an ndarray of
    # edges with ``in`` is an elementwise comparison whose truth value is
    # ambiguous.
    if isinstance(bins, str):
        if bins not in ('blocks', 'knuth', 'knuths', 'scott', 'scotts',
                        'freedman', 'freedmans'):
            raise ValueError("unrecognized bin code: '%s'" % bins)

        # if range is specified, we need to truncate the data for
        # the bin-finding routines
        if range is not None:
            x = x[(x >= range[0]) & (x <= range[1])]

        if bins == 'blocks':
            bins = bayesian_blocks(x)
        elif bins in ('knuth', 'knuths'):
            dx, bins = knuth_bin_width(x, True, disp=False)
        elif bins in ('scott', 'scotts'):
            dx, bins = scott_bin_width(x, True)
        else:
            dx, bins = freedman_bin_width(x, True)

    return ax.hist(x, bins, range, **kwargs)
def bin_edges_f(bin_method, mags_cols_cl, lkl_manual_bins=None, nbins=None,
                min_bins=2, max_bins=50):
    """
    Obtain bin edges for each photometric dimension using the cluster region
    diagram. The 'bin_edges' list will contain all magnitudes first, and then
    all colors (in the same order in which they are read).

    Parameters
    ----------
    bin_method : str
        Binning rule. An unrecognized value produces an empty list.
    mags_cols_cl : sequence
        ``mags_cols_cl[0]`` holds the magnitudes, ``mags_cols_cl[1]`` the
        colors.
    lkl_manual_bins : sequence, optional
        Used by 'manual': element 0 is the number of bins for every
        magnitude, element ``i + 1`` the number of bins for color ``i``.
    nbins : int, optional
        Used by 'optm': colors get ``nbins`` bins, magnitudes twice as many.
    min_bins, max_bins : int
        Lower and upper limits for the number of bins (cells) per dimension.
        Dimensions outside the limits are replaced by uniformly spaced edges
        over the same range with exactly ``min_bins`` / ``max_bins`` cells.

    Returns
    -------
    list
        One array of bin edges per dimension.
    """
    bin_edges = []

    if bin_method in (
            'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt'):
        for mag in mags_cols_cl[0]:
            bin_edges.append(np.histogram(mag, bins=bin_method)[1])
        for col in mags_cols_cl[1]:
            bin_edges.append(np.histogram(col, bins=bin_method)[1])

    elif bin_method == 'optm':
        for mag in mags_cols_cl[0]:
            bin_edges.append(np.histogram(mag, bins=nbins * 2)[1])
        for col in mags_cols_cl[1]:
            bin_edges.append(np.histogram(col, bins=nbins)[1])

    elif bin_method == 'fixed':
        # Based on Bonatto & Bica (2007) 377, 3, 1301-1323 but using larger
        # values than those used there (0.25 for colors and 0.5 for
        # magnitudes)
        for mag in mags_cols_cl[0]:
            b_num = int(round(max(2, (max(mag) - min(mag)) / 1.)))
            bin_edges.append(np.histogram(mag, bins=b_num)[1])
        for col in mags_cols_cl[1]:
            b_num = int(round(max(2, (max(col) - min(col)) / .5)))
            bin_edges.append(np.histogram(col, bins=b_num)[1])

    elif bin_method == 'knuth':
        for mag in mags_cols_cl[0]:
            bin_edges.append(
                knuth_bin_width(mag, return_bins=True, quiet=True)[1])
        for col in mags_cols_cl[1]:
            bin_edges.append(
                knuth_bin_width(col, return_bins=True, quiet=True)[1])

    elif bin_method == 'blocks':
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for mag in mags_cols_cl[0]:
                bin_edges.append(bayesian_blocks(mag))
            for col in mags_cols_cl[1]:
                bin_edges.append(bayesian_blocks(col))

    elif bin_method == 'blocks-max':
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for mag in mags_cols_cl[0]:
                bin_edges.append(slpitArr(bayesian_blocks(mag)))
            for col in mags_cols_cl[1]:
                bin_edges.append(slpitArr(bayesian_blocks(col), 1.))

    elif bin_method == 'manual':
        for mag in mags_cols_cl[0]:
            bin_edges.append(
                np.histogram(mag, bins=int(lkl_manual_bins[0]))[1])
        for i, col in enumerate(mags_cols_cl[1]):
            bin_edges.append(
                np.histogram(col, bins=int(lkl_manual_bins[i + 1]))[1])

    # TODO this method is currently hidden from the params file.
    # To be used when #325 is implemented. Currently used to test
    # multi-dimensional likelihoods.
    #
    # For 4 to 6 dimensions the rule below appears to be a somewhat reasonable
    # rule of thumb for the number of bins for each dimension.
    # There is a trade-off between a large number of smaller bins which
    # better match features of the observed cluster but benefits larger
    # mass values, and fewer larger bins which better match masses but losing
    # finer details of the cluster.
    elif bin_method == 'man':
        d = len(mags_cols_cl[0]) + len(mags_cols_cl[1])
        b_num = [15, 10, 7][d - 4]
        for mag in mags_cols_cl[0]:
            bin_edges.append(np.histogram(mag, bins=int(b_num))[1])
        for col in mags_cols_cl[1]:
            bin_edges.append(np.histogram(col, bins=int(b_num))[1])

    # Impose a minimum of 'min_bins' cells per dimension. The number of bins
    # is the number of edges minus 1.
    for i, be in enumerate(bin_edges):
        N_bins = len(be) - 1
        if N_bins < min_bins:
            bin_edges[i] = np.linspace(be[0], be[-1], min_bins + 1)

    # Impose a maximum of 'max_bins' cells per dimension. As above, N cells
    # require N + 1 edges.
    for i, be in enumerate(bin_edges):
        N_bins = len(be) - 1
        if N_bins > max_bins:
            bin_edges[i] = np.linspace(be[0], be[-1], max_bins + 1)

    return bin_edges
def comp_study(input_data, n_events, xlims=None, resamples=100, dist_name='2Gauss'):
    """Compare binning methods on one sample and save the resulting plots.

    A nominal sample of `n_events` values and `resamples` resampled sets are
    generated (for 'Gauss' and '2LP') or drawn from `input_data` (any other
    `dist_name`), always after seeding NumPy's global random state. The
    nominal sample is histogrammed with the Sturges, Doane, Scott,
    Freedman-Diaconis, Knuth, Rice, Sqrt, equal-population and Bayesian
    blocks binnings on a 3x3 grid of axes. For every binning the values
    returned by `rough` and `err_li` are collected, and then shown as a
    scatter plot and as a stacked bar chart of their ranks. Both figures are
    saved as PDF files under `bb_dir`.

    Parameters
    ----------
    input_data : array_like
        Pool of values to sample from; not used for 'Gauss' and '2LP'.
    n_events : int
        Size of the nominal sample and of every resample.
    xlims : sequence or None
        If given, its first and last elements overwrite the first and last
        Bayesian-block edges.
    resamples : int
        Number of resampled data sets.
    dist_name : str
        Selects the data source and seed; also used in titles and file names.
    """
    # NOTE(review): hard-coded, user-specific output directory.
    bb_dir = os.path.join('/Users/brianpollack/Coding/BayesianBlocks')
    do_log = True

    # data_nom = input_data[:n_events]
    # Build the nominal sample and the resamples. Each branch seeds the
    # global RNG first, so the draws are reproducible.
    if dist_name == 'Gauss':
        np.random.seed(88)
        data_nom = np.random.normal(125, 2, size=n_events)
        resample_list = np.random.normal(125, 2, size=(resamples, n_events))
        do_log = False

    elif dist_name == '2LP':
        # Mixture: 65% and 25% from two Laplace distributions, 10% uniform.
        np.random.seed(33)
        data_nom = np.concatenate(
            (np.random.laplace(loc=90, scale=5, size=int(n_events * 0.65)),
             np.random.laplace(loc=110, scale=1.5, size=int(n_events * 0.25)),
             np.random.uniform(low=80, high=120, size=int(n_events * 0.10))))
        resample_list = np.concatenate(
            (np.random.laplace(
                loc=90, scale=5, size=(resamples, int(n_events * 0.65))),
             np.random.laplace(
                 loc=110, scale=1.5, size=(resamples, int(n_events * 0.25))),
             np.random.uniform(
                 low=80, high=120, size=(resamples, int(n_events * 0.10)))),
            axis=1)
        do_log = False

    elif dist_name == 'jPT':
        np.random.seed(11)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data, size=(resamples, n_events), replace=True)

    elif dist_name == 'DY':
        np.random.seed(200)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data, size=(resamples, n_events), replace=True)

    else:
        np.random.seed(1)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data, size=(resamples, n_events), replace=True)

    # 3x3 grid: one panel per binning method.
    fig_hist, axes_hist = plt.subplots(3, 3, sharex=True, sharey=False, constrained_layout=True)
    fig_hist.suptitle(f'{dist_name} Distribution, N={n_events}', fontsize=22)
    # fig_hist.text(-0.03, 0.5, 'Entries/Bin Width', va='center', rotation='vertical', fontsize=20)
    # axes_hist[2][0].get_xaxis().set_ticks([])
    # axes_hist[2][1].get_xaxis().set_ticks([])
    # axes_hist[2][2].get_xaxis().set_ticks([])

    axes_hist[0][0].set_title('Sturges')
    hist_sturges_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins='sturges', errorbars=False, alpha=0.5, log=do_log, scale='binwidth', err_type='gaussian', ax=axes_hist[0][0])

    axes_hist[0][1].set_title('Doane')
    hist_doane_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins='doane', errorbars=False, alpha=0.5, log=do_log, scale='binwidth', err_type='gaussian', ax=axes_hist[0][1])

    axes_hist[0][2].set_title('Scott')
    hist_scott_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins='scott', errorbars=False, alpha=0.5, log=do_log, scale='binwidth', err_type='gaussian', ax=axes_hist[0][2])

    axes_hist[1][0].set_title('Freedman Diaconis')
    axes_hist[1][0].set_ylabel('Entries/Bin Width', fontsize=20)
    hist_fd_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins='fd', errorbars=False, alpha=0.5, log=do_log, scale='binwidth', err_type='gaussian', ax=axes_hist[1][0])

    # Knuth edges are computed explicitly and passed in as an array.
    axes_hist[1][1].set_title('Knuth')
    _, bk = knuth_bin_width(data_nom, return_bins=True)
    hist_knuth_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins=bk, errorbars=False, alpha=0.5, log=do_log, scale='binwidth', err_type='gaussian', ax=axes_hist[1][1])

    axes_hist[1][2].set_title('Rice')
    hist_rice_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins='rice', errorbars=False, alpha=0.5, log=do_log, scale='binwidth', err_type='gaussian', ax=axes_hist[1][2])

    axes_hist[2][0].set_title('Sqrt(N)')
    hist_sqrt_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins='sqrt', errorbars=False, alpha=0.5, log=do_log, scale='binwidth', err_type='gaussian', ax=axes_hist[2][0])

    # bep = bep_optimizer(data_nom)
    # _, bep = pd.qcut(data_nom, nep, retbins=True)

    # Plain NumPy histograms with the same binnings; these are the ones
    # passed to err_li below.
    hist_sturges = np.histogram(data_nom, bins='sturges')
    hist_doane = np.histogram(data_nom, bins='doane')
    hist_scott = np.histogram(data_nom, bins='scott')
    hist_fd = np.histogram(data_nom, bins='fd')
    hist_knuth = np.histogram(data_nom, bins=bk)
    hist_rice = np.histogram(data_nom, bins='rice')
    hist_sqrt = np.histogram(data_nom, bins='sqrt')

    # `rough` is applied to the bin-width-scaled plot histograms
    # (plotted as 'Wiggles' below).
    r_sturges = rough(hist_sturges_bw, plot=False)
    r_doane = rough(hist_doane_bw)
    r_scott = rough(hist_scott_bw)
    r_fd = rough(hist_fd_bw)
    r_knuth = rough(hist_knuth_bw, plot=False)
    r_rice = rough(hist_rice_bw)
    r_sqrt = rough(hist_sqrt_bw, plot=False)

    # `err_li` of the nominal sample against each histogram.
    eli_sturges = err_li(data_nom, hist_sturges)
    eli_doane = err_li(data_nom, hist_doane)
    eli_scott = err_li(data_nom, hist_scott)
    eli_fd = err_li(data_nom, hist_fd)
    eli_knuth = err_li(data_nom, hist_knuth)
    eli_rice = err_li(data_nom, hist_rice)
    eli_sqrt = err_li(data_nom, hist_sqrt)

    # Same metric for every resample, averaged afterwards
    # (plotted as 'Average Error' below).
    avg_eli_sturges = []
    avg_eli_doane = []
    avg_eli_scott = []
    avg_eli_fd = []
    avg_eli_knuth = []
    avg_eli_rice = []
    avg_eli_sqrt = []
    for i in resample_list:
        avg_eli_sturges.append(err_li(i, hist_sturges))
        avg_eli_doane.append(err_li(i, hist_doane))
        avg_eli_scott.append(err_li(i, hist_scott))
        avg_eli_fd.append(err_li(i, hist_fd))
        avg_eli_knuth.append(err_li(i, hist_knuth))
        avg_eli_rice.append(err_li(i, hist_rice))
        avg_eli_sqrt.append(err_li(i, hist_sqrt))

    avg_eli_sturges = np.mean(avg_eli_sturges)
    avg_eli_doane = np.mean(avg_eli_doane)
    avg_eli_scott = np.mean(avg_eli_scott)
    avg_eli_fd = np.mean(avg_eli_fd)
    avg_eli_knuth = np.mean(avg_eli_knuth)
    avg_eli_rice = np.mean(avg_eli_rice)
    avg_eli_sqrt = np.mean(avg_eli_sqrt)

    avg_eli_list = [
        avg_eli_sturges, avg_eli_doane, avg_eli_scott, avg_eli_fd,
        avg_eli_knuth, avg_eli_rice, avg_eli_sqrt
    ]
    r_list = [r_sturges, r_doane, r_scott, r_fd, r_knuth, r_rice, r_sqrt]

    elis_list = [
        eli_sturges, eli_doane, eli_scott, eli_fd, eli_knuth, eli_rice,
        eli_sqrt
    ]

    # Equal-population binning; the optimizer receives the metrics of the
    # seven methods above.
    axes_hist[2][1].set_title('Equal Population')
    bep = bep_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    hist_ep_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins=bep, errorbars=False, alpha=0.5, log=do_log, scale='binwidth', err_type='gaussian', ax=axes_hist[2][1])
    hist_ep = np.histogram(data_nom, bins=bep)
    r_ep = rough(hist_ep_bw)
    eli_ep = err_li(data_nom, hist_ep)
    avg_eli_ep = []
    for i in resample_list:
        avg_eli_ep.append(err_li(i, hist_ep))
    avg_eli_ep = np.mean(avg_eli_ep)

    # Bayesian blocks, with p0 chosen by bb_optimizer.
    axes_hist[2][2].set_title('Bayesian Blocks')
    p0 = bb_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    bb = bayesian_blocks(data_nom, p0=p0)
    if xlims:
        # Force the outermost edges to the requested limits.
        bb[0] = xlims[0]
        bb[-1] = xlims[-1]
    hist_bb_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins=bb, errorbars=False, alpha=1, log=do_log, scale='binwidth', err_type='gaussian', ax=axes_hist[2][2])
    # if n_events == 1000 and dist_name == '2LP':
    # axes_hist[2][2].set_ylim((0, 100))
    hist_bb = np.histogram(data_nom, bins=bb)
    r_bb = rough(hist_bb_bw, plot=False)
    eli_bb = err_li(data_nom, hist_bb)
    avg_eli_bb = []
    for i in resample_list:
        avg_eli_bb.append(err_li(i, hist_bb))
    avg_eli_bb = np.mean(avg_eli_bb)

    # Append EP and BB last, matching the order of the labels in `xs`.
    r_list.append(r_ep)
    r_list.append(r_bb)
    avg_eli_list.append(avg_eli_ep)
    avg_eli_list.append(avg_eli_bb)
    elis_list.append(eli_ep)
    elis_list.append(eli_bb)

    plt.savefig(bb_dir + f'/plots/bin_comp/hists_{dist_name}_{n_events}.pdf')

    xs = [
        'Sturges', 'Doane', 'Scott', 'FD', 'Knuth', 'Rice', 'Sqrt', 'EP', 'BB'
    ]

    fig_metric, axes_metric = plt.subplots(2, 1, constrained_layout=True)
    # NOTE(review): this sets the title of `fig_hist` again (without the
    # font size), not of the new `fig_metric` -- confirm which was intended.
    fig_hist.suptitle(f'{dist_name} Distribution, N={n_events}')

    # Scatter of the two metrics; Bayesian blocks is drawn as a large star.
    for i in range(len(elis_list)):
        if xs[i] == 'BB':
            axes_metric[0].scatter(avg_eli_list[i], r_list[i], label=xs[i], s=400, marker='*', c='k')
        else:
            axes_metric[0].scatter(avg_eli_list[i], r_list[i], label=xs[i], s=200)
    axes_metric[0].set_ylabel(r'$W_n$ (Wiggles)')
    axes_metric[0].set_xlabel(r'$\hat{E}$ (Average Error)')
    # ax = plt.gca()
    # ax.set_yscale('log')
    # ax.set_xscale('log')
    # ax.relim()
    # ax.autoscale_view()
    axes_metric[0].grid()
    axes_metric[0].legend(ncol=1, bbox_to_anchor=(1.05, 1.15), loc='upper left')
    axes_metric[0].set_title(f'{dist_name} Distribution, N={n_events}', fontsize=22)
    # plt.savefig(bb_dir+f'/plots/bin_comp/scat_{dist_name}_{n_events}.pdf')

    # plt.figure()
    # Rank the methods by each metric and stack the ranks; the last bar
    # (BB) is made opaque.
    rank_rough = rankdata(r_list, method='min')
    rank_avg_eli = rankdata(avg_eli_list, method='min')

    cont = axes_metric[1].bar(xs, rank_rough, 0.35, label=r'$W_n$ Ranking', alpha=0.5)
    cont[-1].set_alpha(1)
    cont = axes_metric[1].bar(xs, rank_avg_eli, 0.35, bottom=rank_rough, label=r'$\hat{E}$ Ranking', alpha=0.5)
    cont[-1].set_alpha(1)
    axes_metric[1].legend(loc='upper left', bbox_to_anchor=(1.0, 0.8))
    # axes_metric[1].set_title(f'Combined Ranking, {dist_name} Distribution, N={n_events}')
    axes_metric[1].set_xlabel('Binning Method')
    axes_metric[1].set_ylabel('Rank')
    plt.savefig(bb_dir + f'/plots/bin_comp/metric_{dist_name}_{n_events}.pdf')