def test_histogram_bin_range_raises(bins, hist_range):
    data = da.random.random(10, chunks=2)
    with pytest.raises(ValueError) as info:
        da.histogram(data, bins=bins, range=hist_range)
    err_msg = str(info.value)
    assert 'bins' in err_msg
    assert 'range' in err_msg

def test_histogram_normed_deprecation():
    x = da.arange(10)
    with pytest.raises(ValueError) as info:
        da.histogram(x, bins=[1, 2, 3], normed=True)
    assert 'density' in str(info.value)
    assert 'deprecated' in str(info.value).lower()

def test_histogram_extra_args_and_shapes():
    # Check for extra args and shapes
    bins = np.arange(0, 1.01, 0.01)
    v = da.random.random(100, chunks=10)
    data = [(v, bins, da.ones(100, chunks=v.chunks) * 5),
            (da.random.random((50, 50), chunks=10), bins,
             da.ones((50, 50), chunks=10) * 5)]

    for v, bins, w in data:
        # normed
        assert_eq(
            da.histogram(v, bins=bins, normed=True)[0],
            np.histogram(v, bins=bins, normed=True)[0])

        # density
        assert_eq(
            da.histogram(v, bins=bins, density=True)[0],
            np.histogram(v, bins=bins, density=True)[0])

        # weights
        assert_eq(
            da.histogram(v, bins=bins, weights=w)[0],
            np.histogram(v, bins=bins, weights=w)[0])

        # weights combined with density, checked against the NumPy reference
        assert_eq(
            da.histogram(v, bins=bins, weights=w, density=True)[0],
            np.histogram(v, bins=bins, weights=w, density=True)[0])

def _do_ctp_validation(data, adef, out_size, idxs):
    """ Calculate CTP validation (included in CTTH plot). """
    # detected ctth mask
    detected_clouds = da.logical_and(data['caliop_cma'] == 1,
                                     data['imager_cma'] == 1)
    detected_height = da.logical_and(detected_clouds,
                                     np.isfinite(data['imager_cth']))
    # find pps low and caliop low
    low_clouds_c = gc.get_calipso_low_clouds(data['caliop_cflag'])
    detected_low_c = np.logical_and(detected_height, low_clouds_c)
    low_clouds_pps = da.where(data['imager_ctp'] > 680., 1, 0)
    detected_low_pps = da.logical_and(detected_height, low_clouds_pps)

    # pattern: CALIOP_SEVIRI
    cld_cld_a = da.logical_and(detected_low_c == 1, detected_low_pps == 1)
    clr_cld_b = da.logical_and(detected_low_c == 0, detected_low_pps == 1)
    cld_clr_c = da.logical_and(detected_low_c == 1, detected_low_pps == 0)
    clr_clr_d = da.logical_and(detected_low_c == 0, detected_low_pps == 0)

    cld_cld_a = cld_cld_a.astype(np.int64)
    clr_cld_b = clr_cld_b.astype(np.int64)
    cld_clr_c = cld_clr_c.astype(np.int64)
    clr_clr_d = clr_clr_d.astype(np.int64)

    a, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                        weights=cld_cld_a, density=False)
    b, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                        weights=clr_cld_b, density=False)
    c, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                        weights=cld_clr_c, density=False)
    d, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                        weights=clr_clr_d, density=False)

    scu = ScoreUtils(a, b, c, d)
    scores = dict()
    scores['CTP low clouds POD'] = [
        scu.pod_1().reshape(adef.shape), 0, 1, 'rainbow'
    ]
    scores['CTP low clouds FAR'] = [
        scu.far_1().reshape(adef.shape), 0, 1, 'rainbow'
    ]
    scores['CTP low clouds POFD'] = [
        scu.pofd_1().reshape(adef.shape), 0, 1, 'rainbow'
    ]
    # scores['Heidke low clouds'] = [scu.heidke().reshape(adef.shape), 0, 1, 'rainbow']
    return scores

def do_ctp_validation(data, adef, out_size, idxs):
    """ Scores: low clouds detection """
    # detected ctth mask
    detected_clouds = da.logical_and(data['caliop_cma'] == 1,
                                     data['imager_cma'] == 1)
    detected_height = da.logical_and(detected_clouds,
                                     np.isfinite(data['imager_cth']))
    # find pps low and caliop low
    low_clouds_c = get_calipso_low_clouds(data['caliop_cflag'])
    detected_low_c = np.logical_and(detected_height, low_clouds_c)
    low_clouds_pps = da.where(data['imager_ctp'] > 680., 1, 0)
    detected_low_pps = da.logical_and(detected_height, low_clouds_pps)

    # pattern: CALIOP_SEVIRI
    cld_cld_a = da.logical_and(detected_low_c == 1, detected_low_pps == 1)
    clr_cld_b = da.logical_and(detected_low_c == 0, detected_low_pps == 1)
    cld_clr_c = da.logical_and(detected_low_c == 1, detected_low_pps == 0)
    clr_clr_d = da.logical_and(detected_low_c == 0, detected_low_pps == 0)

    cld_cld_a = cld_cld_a.astype(np.int64)
    clr_cld_b = clr_cld_b.astype(np.int64)
    cld_clr_c = cld_clr_c.astype(np.int64)
    clr_clr_d = clr_clr_d.astype(np.int64)

    a, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                        weights=cld_cld_a, density=False)
    b, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                        weights=clr_cld_b, density=False)
    c, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                        weights=cld_clr_c, density=False)
    d, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                        weights=clr_clr_d, density=False)

    # n = a + b + c + d
    # n2d = N.reshape(adef.shape)
    # scores = [hitrate(a, d, n).reshape(adef.shape),
    #           0.7, 1, 'rainbow']  # hitrate low PPS

    pod_low = a / (a + c)
    far_low = c / (a + c)

    scores = dict()
    scores['POD low clouds'] = [pod_low.reshape(adef.shape), 0.2, 1, 'rainbow']
    scores['FAR low clouds'] = [far_low.reshape(adef.shape), 0.2, 1, 'rainbow']
    return scores

def plot_hist(size_data, shape_data, range=(0, 10), bins=100):
    size_hist, size_edges = da.histogram(size_data, bins=bins, range=range)
    shape_hist, shape_edges = da.histogram(shape_data, bins=80, range=(0, 0.9))

    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(7, 4))
    ax1.semilogy(size_edges[:-1], size_hist.compute(), drawstyle='steps-mid')
    ax1.set_xlabel('FWHM/(2*sqrt(2*ln(2))) [pixels]')
    ax2.set_xlabel('Symmetry')
    ax2.semilogy(shape_edges[:-1], shape_hist.compute(), drawstyle='steps-mid')
    fig.savefig('size_symmetry_hist.png', format='png', dpi=300)
    plt.show()

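# Hedged usage sketch for plot_hist above (not from the original source).
# It assumes `import dask.array as da` and `import matplotlib.pyplot as plt`
# are in scope; the two random arrays below merely stand in for real size and
# symmetry measurements.
example_sizes = da.random.random(10_000, chunks=1_000) * 10    # fake size data in (0, 10)
example_shapes = da.random.random(10_000, chunks=1_000) * 0.9  # fake symmetry data in (0, 0.9)
plot_hist(example_sizes, example_shapes, range=(0, 10), bins=100)
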
def coarsen(fmask):
    # rename
    v1min, v1max, dv1 = lon_bins[0], lon_bins[-1], dl
    v2min, v2max, dv2 = lat_bins[0], lat_bins[-1], dl
    i1max = int(np.rint((v1max - v1min) / dv1)) + 1
    i2max = int(np.rint((v2max - v2min) / dv2)) + 1

    # meshgrid lon/lat, note: need transposing
    fmask = fmask.to_dataset()
    fmask['lon'] = (1. * fmask['longitude'] + 0. * fmask['latitude']).transpose()
    fmask['lat'] = (0. * fmask['longitude'] + 1. * fmask['latitude']).transpose()
    # need rechunking
    fmask = fmask.chunk(chunks)

    def get_index(v1, v2):
        ''' This function provides the index of (v1, v2) coupled value position
        in the 2D histogram array
        '''
        i1 = np.maximum(np.floor((v1 - v1min) / dv1) + 1, 0)
        i1 = np.minimum(i1, i1max)
        i2 = np.maximum(np.floor((v2 - v2min) / dv2) + 1, 0)
        i2 = np.minimum(i2, i2max)
        return i1 + i2 * (i1max + 1)

    # sum QA over coarse grid cells
    v12 = da.map_blocks(get_index, fmask['lon'].data, fmask['lat'].data,
                        dtype='float')
    h, lbins = da.histogram(v12,
                            bins=np.arange(-.5, (i1max + 1) * (i2max + 1) + 0.5, 1.),
                            weights=fmask['QA'].data)
    H = h.compute()

    # compute the number of points per grid cells
    hnorm, lbins = da.histogram(v12,
                                bins=np.arange(-.5, (i1max + 1) * (i2max + 1) + 0.5, 1.))
    Hnorm = 1. * hnorm.compute()
    Hnorm[np.where(Hnorm == 0)] = np.NaN

    # average the mask over coarse grid cells
    H = (H / Hnorm).reshape((i1max + 1, i2max + 1), order='F')

    cmask = xr.Dataset()
    # cmask['QA'] = (('longitude', 'latitude'), H[1:-1, 1:-1].transpose())
    cmask['QA'] = (('longitude', 'latitude'), H[1:-1, 1:-1])
    cmask.coords['longitude'] = (('longitude'), lon_center)
    cmask.coords['latitude'] = (('latitude'), lat_center)
    return cmask

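# Standalone sketch of the trick coarsen() relies on (hypothetical small grid;
# the module-level lon_bins/lat_bins/dl/chunks globals are not reproduced):
# a 2-D binning is done with a single 1-D da.histogram by collapsing the
# (i1, i2) cell indices into one linear index, histogramming twice (weighted
# sum and plain count), and dividing to get the per-cell mean.
import numpy as np
import dask.array as da

n1, n2 = 4, 3                                     # coarse grid shape
lon = da.random.random((6, 8), chunks=4) * n1     # fake coordinates, already in cell units
lat = da.random.random((6, 8), chunks=4) * n2
qa = da.random.random((6, 8), chunks=4)           # field to average

lin_idx = da.floor(lon) + da.floor(lat) * n1      # linear cell index in [0, n1 * n2)
edges = np.arange(-0.5, n1 * n2 + 0.5, 1.0)
cell_sum, _ = da.histogram(lin_idx, bins=edges, weights=qa)
cell_cnt, _ = da.histogram(lin_idx, bins=edges)

cnt = cell_cnt.compute().astype(float)
cnt[cnt == 0] = np.nan                            # avoid dividing by empty cells
cell_mean = (cell_sum.compute() / cnt).reshape(n2, n1)
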
def test_histogram():
    # Test for normal, flattened input
    n = 100
    v = da.random.random(n, chunks=10)
    bins = np.arange(0, 1.01, 0.01)
    (a1, b1) = da.histogram(v, bins=bins)
    (a2, b2) = np.histogram(v, bins=bins)

    # Check if the sum of the bins equals the number of samples
    assert a2.sum(axis=0) == n
    assert a1.sum(axis=0) == n
    assert_eq(a1, a2)
    assert same_keys(da.histogram(v, bins=bins)[0], a1)

def test_histogram():
    # Test for normal, flattened input
    n = 100
    v = da.random.random(n, chunks=10)
    bins = np.arange(0, 1.01, 0.01)
    (a1, b1) = da.histogram(v, bins=bins)
    (a2, b2) = np.histogram(v, bins=bins)

    # Check if the sum of the bins equals the number of samples
    assert a2.sum(axis=0) == n
    assert a1.sum(axis=0) == n
    assert eq(a1, a2)
    assert same_keys(da.histogram(v, bins=bins)[0], a1)

def _mask_sums_with_nan_if_not_skipna(self, skipna, data, out_size, sums):
    if not skipna:
        nans = np.isnan(data)
        nan_sums, _ = da.histogram(self.idxs[nans], bins=out_size,
                                   range=(0, out_size))
        sums = da.where(nan_sums > 0, np.nan, sums)
    return sums

def process(input_path, pedestal_path, output_path):
    reader = TIOReader(input_path)
    wf_calib = WaveformCalibrator(
        pedestal_path, reader.n_pixels, reader.n_samples
    )
    wfs = get_da(reader, wf_calib)

    mean, std, mean_pix, std_pix, (hist, edges) = da.compute(
        wfs.mean(),
        wfs.std(),
        wfs.mean(axis=(0, 2)),
        wfs.std(axis=(0, 2)),
        da.histogram(wfs, bins=1000, range=(-10, 10))
    )

    np.savez(
        output_path,
        mean=mean,
        std=std,
        mean_pix=mean_pix,
        std_pix=std_pix,
        hist=hist,
        edges=edges
    )

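# Standalone sketch of the single-pass da.compute idiom used in process()
# above (TIOReader / WaveformCalibrator are not reproduced; a random array
# stands in for the calibrated waveforms). Passing the (hist, edges) tuple
# returned by da.histogram straight into da.compute lets all reductions share
# one traversal of the data.
import dask.array as da

wfs = da.random.normal(0, 1, size=(1000, 32, 128), chunks=(100, 32, 128))
mean, std, (hist, edges) = da.compute(
    wfs.mean(),
    wfs.std(),
    da.histogram(wfs, bins=100, range=(-5, 5)),
)
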
def van_hove_distinct(onset, frame, bins, box=None, use_dask=True, comp=False,
                      bincount=True):
    r"""
    Compute the distinct part of the Van Hove autocorrelation function.

    .. math::
        G(r, t) = \sum_{i, j} \delta(|\vec r_i(0) - \vec r_j(t)| - r)
    """
    if box is None:
        box = onset.box.diagonal()
    dimension = len(box)
    N = len(onset)
    if use_dask:
        onset = darray.from_array(onset, chunks=(500, dimension)).reshape(1, N, dimension)
        frame = darray.from_array(frame, chunks=(500, dimension)).reshape(N, 1, dimension)
        dist = ((pbc_diff(onset, frame, box)**2).sum(axis=-1)**0.5)
        if np.diff(bins).std() < 1e6:
            dx = bins[0] - bins[1]
            hist = darray.bincount((dist // dx).astype(int), minlength=(len(bins) - 1))
        else:
            hist = darray.histogram(dist, bins=bins)[0]
        return hist.compute() / N
    else:
        if comp:
            dx = bins[1] - bins[0]
            minlength = len(bins) - 1

            def f(x):
                d = (pbc_diff(x, frame, box)**2).sum(axis=-1)**0.5
                return np.bincount((d // dx).astype(int), minlength=minlength)[:minlength]

            hist = sum(f(x) for x in onset)
        else:
            dist = (pbc_diff(onset.reshape(1, -1, 3),
                             frame.reshape(-1, 1, 3), box)**2).sum(axis=-1)**0.5
            hist = histogram(dist, bins=bins)[0]
        return hist / N

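# Hedged sketch of the dask branch's core idea in van_hove_distinct (pbc_diff
# and the trajectory objects are library code not reproduced here; plain
# Euclidean distances between random coordinates stand in for them).
import numpy as np
import dask.array as darray

N, dim = 200, 3
onset = darray.from_array(np.random.random((N, dim)), chunks=(100, dim)).reshape(1, N, dim)
frame = darray.from_array(np.random.random((N, dim)), chunks=(100, dim)).reshape(N, 1, dim)
dist = ((onset - frame) ** 2).sum(axis=-1) ** 0.5   # all N x N pair distances via broadcasting
bins = np.linspace(0, dim ** 0.5, 51)
hist = darray.histogram(dist, bins=bins)[0]
g_r = hist.compute() / N                            # distinct part, not yet shell-volume normalised
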
def get_hists(yearStr, monStr, varT, binVals, binWidth, maxValue):
    vars = [varT, 'seg_length', 'region_flag', 'ssh_flag']

    # if (monStr=='12'):
    #     dataOutPath=dataPathIS2+releaseStr+'/'+runStr+'/raw/'
    # else:
    #     dataOutPath=dataPathIS2+releaseStr+'/'+runStr+'/raw/'
    print(dataOutPath)

    dFbeams = cF.getProcessedATL10ShotdataNCDF(dataOutPath, yearStr=yearStr,
                                               ssh_mask=1, monStr=monStr,
                                               dayStr=dayStr, vars=vars,
                                               fNum=fNum, beamStr=beam)
    print('Got data')

    dFbeams = dFbeams.where(dFbeams[varT] > 0.0, drop=True)
    dFbeams = dFbeams.where(dFbeams[varT] < 30, drop=True)
    dFbeams = dFbeams.where(~np.isnan(dFbeams[varT]), drop=True)
    dFbeams = dFbeams.where(dFbeams.seg_length > 4, drop=True)
    dFbeams = dFbeams.where(dFbeams.seg_length < 200, drop=True)

    vals = dFbeams[varT][np.isin(dFbeams.region_flag, regions)]
    segs = dFbeams['seg_length'][np.isin(dFbeams.region_flag, regions)]

    weights = segs / segs.sum().values
    # counts[r, m] = vals.count().values
    meansT = (vals * segs).sum().values / segs.sum().values

    h, bins = da.histogram(vals.data, bins=size(binVals) - 1,
                           range=[0, maxValue], weights=weights.data)
    # histVals[m] = h
    histValsT = h.compute()

    return histValsT, meansT

def compute_scaling(df, region1, region2=None, dmin=int(1e1), dmax=int(1e7),
                    n_bins=50):
    import dask.array as da

    if region2 is None:
        region2 = region1

    distbins = numutils.logbins(dmin, dmax, N=n_bins)
    areas = contact_areas(distbins, region1, region2)

    df = df[(df["pos1"] >= region1[0])
            & (df["pos1"] < region1[1])
            & (df["pos2"] >= region2[0])
            & (df["pos2"] < region2[1])]
    dists = (df["pos2"] - df["pos1"]).values

    if isinstance(dists, da.Array):
        obs, _ = da.histogram(dists[(dists >= dmin) & (dists < dmax)],
                              bins=distbins)
    else:
        obs, _ = np.histogram(dists[(dists >= dmin) & (dists < dmax)],
                              bins=distbins)

    return distbins, obs, areas

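# Hedged sketch of the distance-histogram core of compute_scaling
# (numutils.logbins and contact_areas are library helpers not reproduced here;
# np.geomspace stands in for the log-spaced bins and a random dask array for
# the pos2 - pos1 distances).
import numpy as np
import dask.array as da

dmin, dmax, n_bins = 10, 10_000, 20
distbins = np.unique(np.geomspace(dmin, dmax, n_bins + 1).astype(int))
dists = da.random.randint(0, 2 * dmax, size=1_000_000, chunks=100_000)
obs, _ = da.histogram(dists[(dists >= dmin) & (dists < dmax)], bins=distbins)
print(obs.compute().sum())
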
def test_histogram_return_type():
    v = da.random.random(100, chunks=10)
    bins = np.arange(0, 1.01, 0.01)
    # Check if return type is same as hist
    bins = np.arange(0, 11, 1, dtype='i4')
    assert eq(da.histogram(v * 10, bins=bins)[0],
              np.histogram(v * 10, bins=bins)[0])

def uni_histogram(
    srs: dd.Series,
    srs_dtype: DType,
    cfg: Config,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical."""
    if isinstance(srs_dtype, Continuous):
        counts, edges = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        centers = (edges[:-1] + edges[1:]) / 2
        return counts, centers, edges
    elif isinstance(srs_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
        # Dask array's unique is way slower than value_counts on a Series
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()
        return (counts, centers)
    else:
        raise ValueError(f"Unsupported dtype {srs.dtype}")

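# Hedged illustration of the continuous branch above using plain dask
# (Config / DType come from the surrounding library and are not reproduced;
# the column data and bin count below are invented for the example).
import dask
import dask.array as da
import dask.dataframe as dd
import pandas as pd

srs = dd.from_pandas(pd.Series(range(100), name="x", dtype=float), npartitions=4)
vmin, vmax = dask.compute(srs.min(), srs.max())
counts, edges = da.histogram(srs.values, bins=10, range=(vmin, vmax))
centers = (edges[:-1] + edges[1:]) / 2
print(counts.compute(), centers)
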
def test_histogram_return_type():
    v = da.random.random(100, chunks=10)
    bins = np.arange(0, 1.01, 0.01)
    # Check if return type is same as hist
    bins = np.arange(0, 11, 1, dtype='i4')
    assert_eq(da.histogram(v * 10, bins=bins)[0],
              np.histogram(v * 10, bins=bins)[0])

def calc_hist(srs: dd.Series, bins: int, orig_df_len: int) -> Tuple[pd.DataFrame, float]:
    """
    Calculate a histogram over a given series.

    Parameters
    ----------
    srs : dd.Series
        one numerical column over which to compute the histogram
    bins : int
        number of bins to use in the histogram
    orig_df_len : int
        length of the original dataframe

    Returns
    -------
    Tuple[pd.DataFrame, float]:
        The histogram in a dataframe and the percent of missing values
    """
    miss_pct = round(srs.isna().sum().compute() / len(srs) * 100, 1)

    data = srs.dropna().values
    minv, maxv = data.min().compute(), data.max().compute()

    hist_arr, bins_arr = da.histogram(data, range=[minv, maxv], bins=bins)
    hist_arr = hist_arr.compute()
    intervals = _format_bin_intervals(bins_arr)
    hist_df = pd.DataFrame({
        "intervals": intervals,
        "left": bins_arr[:-1],
        "right": bins_arr[1:],
        "freq": hist_arr,
        "pct": hist_arr / orig_df_len * 100,
    })
    return hist_df, miss_pct

def test_histogram_bins_range_with_nan_array():
    # Regression test for issue #3977
    v = da.from_array(np.array([-2, np.nan, 2]), chunks=1)
    (a1, b1) = da.histogram(v, bins=10, range=(-3, 3))
    (a2, b2) = np.histogram(v, bins=10, range=(-3, 3))
    assert_eq(a1, a2)
    assert_eq(b1, b2)

def uni_histogram(
    srs: dd.Series,
    bins: int,
    dtype: Optional[DTypeDef] = None,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical."""
    if is_dtype(detect_dtype(srs, dtype), Continuous()):
        counts, edges = da.histogram(srs, bins, range=[srs.min(), srs.max()])
        centers = (edges[:-1] + edges[1:]) / 2
        return counts, centers, edges
    elif is_dtype(detect_dtype(srs, dtype), Nominal()):
        # Dask array's unique is way slower than value_counts on a Series
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()
        return (counts, centers)
    else:
        raise ValueError(f"Unsupported dtype {srs.dtype}")

def _cont_calcs(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a continuous column in plot(df)
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    if cfg.insight.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values

    # drop infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # histogram
    data["hist"] = da.histogram(srs, bins=cfg.hist.bins,
                                range=(srs.min(), srs.max()))

    if cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
        data["norm"] = normaltest(data["hist"][0])
        data["skew"] = skewtest(data["hist"][0])
        data["nneg"] = (srs < 0).sum()  # number of negative values
        data["nuniq"] = srs.nunique_approx()  # number of unique values
        data["nzero"] = (srs == 0).sum()  # number of zeros
        data["nreals"] = srs.shape[0]  # number of non-inf values

    return data

def calc_cat_stats(
    srs: dd.Series,
    df: dd.DataFrame,
    bins: int,
    nrows: int,
    nuniq: Optional[dd.core.Scalar] = None,
) -> Dict[str, Any]:
    """
    Calculate stats for a categorical column

    Parameters
    ----------
    srs
        a categorical column
    df
        groupby-count on the categorical column as a dataframe
    bins
        number of bins for the category length frequency histogram
    nrows
        number of rows before dropping null values
    nuniq
        number of unique values in the column
    """
    # pylint: disable=too-many-locals
    # overview stats
    stats = {
        "nrows": nrows,
        "npres": srs.shape[0],
        "nuniq": nuniq,  # if cfg.bar_endable or cfg.pie_enable else srs.nunique(),
        "mem_use": srs.memory_usage(deep=True),
        "first_rows": srs.reset_index(drop=True).loc[:4],
    }

    # length stats
    lengths = srs.str.len()
    minv, maxv = lengths.min(), lengths.max()
    hist = da.histogram(lengths.values, bins=bins, range=[minv, maxv])
    leng = {
        "Mean": lengths.mean(),
        "Standard Deviation": lengths.std(),
        "Median": lengths.quantile(0.5),
        "Minimum": minv,
        "Maximum": maxv,
    }

    # letter stats
    # computed on groupby-count:
    # compute the statistic for each group then multiply by the count of the group
    grp, col = df.columns
    lc_cnt = (df[grp].str.count(r"[a-z]") * df[col]).sum()
    uc_cnt = (df[grp].str.count(r"[A-Z]") * df[col]).sum()
    letter = {
        "Count": lc_cnt + uc_cnt,
        "Lowercase Letter": lc_cnt,
        "Space Separator": (df[grp].str.count(r"[ ]") * df[col]).sum(),
        "Uppercase Letter": uc_cnt,
        "Dash Punctuation": (df[grp].str.count(r"[-]") * df[col]).sum(),
        "Decimal Number": (df[grp].str.count(r"[0-9]") * df[col]).sum(),
    }

    return {"stats": stats, "len_stats": leng, "letter_stats": letter, "len_hist": hist}

def test_histogram_alternative_bins_range():
    v = da.random.random(100, chunks=10)
    bins = np.arange(0, 1.01, 0.01)
    # Other input
    (a1, b1) = da.histogram(v, bins=10, range=(0, 1))
    (a2, b2) = np.histogram(v, bins=10, range=(0, 1))
    assert eq(a1, a2)
    assert eq(b1, b2)

def getHist2(imgs, bins=np.arange(-2, 20, 0.05)):
    """ get intensity histogram from a stack of imgs """
    if isinstance(imgs, dask.array.Array):
        H = da.histogram(imgs[da.isfinite(imgs)], bins=bins)[0]
        return H
    else:
        H = np.histogram(imgs[np.isfinite(imgs)], bins)[0]
        return np.asarray(H)

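# Hedged usage sketch for getHist2 (not part of the original snippet): builds
# a small dask image stack containing NaNs and checks the dask path against
# the NumPy path. Assumes `import numpy as np`, `import dask`, and
# `import dask.array as da`, as the function above does.
stack = da.random.random((4, 64, 64), chunks=(1, 64, 64)) * 20 - 2
stack = da.where(stack > 17, np.nan, stack)          # inject some non-finite pixels
lazy_hist = getHist2(stack)                          # dask branch, still lazy
eager_hist = getHist2(stack.compute())               # NumPy branch, eager
assert lazy_hist.compute().sum() == eager_hist.sum()
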
def _mask_bins_with_nan_if_not_skipna(self, skipna, data, out_size, statistic):
    if not skipna:
        nans = np.isnan(data)
        nan_bins, _ = da.histogram(self.idxs[nans], bins=out_size,
                                   range=(0, out_size))
        statistic = da.where(nan_bins > 0, np.nan, statistic)
    return statistic

def test_histogram_extra_args_and_shapes():
    # Check for extra args and shapes
    bins = np.arange(0, 1.01, 0.01)
    v = da.random.random(100, chunks=10)
    data = [(v, bins, da.ones(100, chunks=v.chunks) * 5),
            (da.random.random((50, 50), chunks=10), bins,
             da.ones((50, 50), chunks=10) * 5)]

    for v, bins, w in data:
        # density
        assert_eq(da.histogram(v, bins=bins, density=True)[0],
                  np.histogram(v, bins=bins, density=True)[0])

        # weights
        assert_eq(da.histogram(v, bins=bins, weights=w)[0],
                  np.histogram(v, bins=bins, weights=w)[0])

        # weights combined with density, checked against the NumPy reference
        assert_eq(da.histogram(v, bins=bins, weights=w, density=True)[0],
                  np.histogram(v, bins=bins, weights=w, density=True)[0])

def histogram(a, **kwargs):
    y, bins = da.histogram(a.task, **kwargs)
    w = bins[1:] - bins[:-1]
    x = (bins[1:] + bins[:-1]) * 0.5
    x = Index(x, a.name, a.attrs)
    y = type(a)(y, coords=[x], name='number')
    w = type(a)(w, coords=[x], name='binwidth', attrs=a.attrs)
    return x, y, w

def get_sum(self, data, mask_all_nan=False):
    """Calculate sums for each bin with drop-in-a-bucket resampling.

    Parameters
    ----------
    data : Numpy or Dask array
    mask_all_nan : boolean (optional)
        Mask bins that have only NaN results, default: False

    Returns
    -------
    data : Numpy or Dask array
        Bin-wise sums in the target grid
    """
    LOG.info("Get sum of values in each location")
    if isinstance(data, xr.DataArray):
        data = data.data
    data = data.ravel()

    # Remove NaN values from the data when used as weights
    weights = da.where(np.isnan(data), 0, data)

    # Rechunk indices to match the data chunking
    if weights.chunks != self.idxs.chunks:
        self.idxs = da.rechunk(self.idxs, weights.chunks)

    # Calculate the sum of the data falling to each bin
    out_size = self.target_area.size
    sums, _ = da.histogram(self.idxs, bins=out_size, range=(0, out_size),
                           weights=weights, density=False)

    if mask_all_nan:
        nans = np.isnan(data)
        nan_sums, _ = da.histogram(self.idxs[nans], bins=out_size,
                                   range=(0, out_size))
        counts = self.get_count().ravel()
        sums = da.where(nan_sums == counts, np.nan, sums)

    return sums.reshape(self.target_area.shape)

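# Minimal, self-contained illustration of the drop-in-a-bucket idiom used by
# get_sum above (a sketch with made-up numbers, not code from the resampler):
# histogramming the target-bin index of every sample with the data values as
# weights yields the per-bin sum of the data.
import numpy as np
import dask.array as da

out_size = 4
idxs = da.from_array(np.array([0, 1, 1, 3, 3, 3]), chunks=3)    # target bin of each sample
vals = da.from_array(np.array([1., 2., 3., 4., 5., 6.]), chunks=3)

bin_sums, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                           weights=vals, density=False)
print(bin_sums.compute())   # [ 1.  5.  0. 15.]
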
def calc_cat_stats(srs: dd.Series, bins: int, nrows: int,
                   nuniq: Optional[dd.core.Scalar] = None) -> Dict[str, Any]:
    """
    Calculate stats for a categorical column

    Parameters
    ----------
    srs
        a categorical column
    nrows
        number of rows before dropping null values
    bins
        number of bins for the category length frequency histogram
    """
    # overview stats
    stats = {
        "nrows": nrows,
        "npres": srs.shape[0],
        "nuniq": nuniq,  # if cfg.bar_endable or cfg.pie_enable else srs.nunique(),
        "mem_use": srs.memory_usage(deep=True),
        "first_rows": srs.reset_index(drop=True).loc[:4],
    }

    # length stats
    lengths = srs.str.len()
    minv, maxv = lengths.min(), lengths.max()
    hist = da.histogram(lengths.values, bins=bins, range=[minv, maxv])
    leng = {
        "Mean": lengths.mean(),
        "Standard Deviation": lengths.std(),
        "Median": lengths.quantile(0.5),
        "Minimum": minv,
        "Maximum": maxv,
    }

    # letter stats
    letter = {
        "Count": srs.str.count(r"[a-zA-Z]").sum(),
        "Lowercase Letter": srs.str.count(r"[a-z]").sum(),
        "Space Separator": srs.str.count(r"[ ]").sum(),
        "Uppercase Letter": srs.str.count(r"[A-Z]").sum(),
        "Dash Punctuation": srs.str.count(r"[-]").sum(),
        "Decimal Number": srs.str.count(r"[0-9]").sum(),
    }

    return {
        "stats": stats,
        "len_stats": leng,
        "letter_stats": letter,
        "len_hist": hist
    }

def save_histograms(df, h_list, wp_list, out_hdf):
    """ Creates histograms for each working point and saves them into a dataframe """
    print("Saving histograms...")

    for h in h_list:

        ## For SumET this should only be plotted once!
        if h.name == "Tight_Final_SumET":
            hist, _ = da.histogram(
                df[h.name], bins=h.nbins, range=h.range, density=True
            )
            hist = hist.compute()
            hists = pd.DataFrame(
                data=hist,
                index=h.centers,
                columns=[h.name],
            )

        ## All other histograms are plotted per working point
        else:
            hists = []
            for wp in wp_list:
                hist, _ = da.histogram(
                    df[wp + "_" + h.name], bins=h.nbins, range=h.range, density=True
                )
                hists.append(hist.compute())
            hists = pd.DataFrame(
                data=np.vstack(hists).T,
                index=h.centers,
                columns=[wp + "_" + h.name for wp in wp_list],
            )

        ## Print and save
        key = h.name
        hists.to_hdf(out_hdf, h.name)
        print(" - " + key, "\n")

def dasky_histogram(a, bins=10, **kwargs):
    """Enhanced histogram for dask arrays.

    The range keyword is ignored. Reads the data at most two times - once to
    determine best bins (if required), and second time to actually calculate
    the histogram.

    Parameters
    ----------
    a : array_like
        array of data to be histogrammed
    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'scotts' : use Scott's rule to determine bins
        'freedman' : use the Freedman-Diaconis rule to determine bins
    other keyword arguments are described in numpy.histogram().

    Returns
    -------
    hist : array
        The values of the histogram. See `normed` and `weights` for a
        description of the possible semantics.
    bin_edges : array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    See Also
    --------
    numpy.histogram, astroML.plotting.hist
    """
    if not isinstance(a, da.Array):
        raise TypeError('the given array has to be a dask.Array')
    if a.ndim != 1:
        a = a.flatten()

    if bins == 'scotts':
        _, bins = dasky_scotts_bin_width(a, True)
    elif bins == 'freedman':
        _, bins = dasky_freedman_bin_width(a, True)
    elif isinstance(bins, str):
        raise ValueError("unrecognized bin code: '%s'" % bins)
    elif not np.iterable(bins):
        with ProgressBar():
            kwargs['range'] = da.compute(a.min(), a.max())

    h, bins = da.histogram(a, bins=bins, **kwargs)
    with ProgressBar():
        return h.compute(), bins

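# Hedged usage sketch for dasky_histogram above. An integer bin count is used
# so the example only depends on dask; the 'scotts' / 'freedman' string options
# additionally require the dasky_*_bin_width helpers referenced in the function.
x = da.random.normal(0.0, 1.0, size=100_000, chunks=10_000)
counts, edges = dasky_histogram(x, bins=50)
assert len(edges) == len(counts) + 1   # edges bracket the counts
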
def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Computations for a numerical column in plot(df)

    Parameters
    ----------
    srs
        srs over which to compute the barchart and insights
    bins
        number of bins in the bar chart
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    ## if cfg.insight.missing_enable:
    data["npres"] = srs.shape[0]

    ## if cfg.insight.infinity_enable:
    is_inf_srs = srs.isin({np.inf, -np.inf})
    data["ninf"] = is_inf_srs.sum()

    # remove infinite values
    srs = srs[~is_inf_srs]

    ## if cfg.hist_enable or config.insight.uniform_enable or cfg.insight.normal_enable:
    ## bins = cfg.hist_bins
    data["hist"] = da.histogram(srs, bins=bins, range=[srs.min(), srs.max()])

    ## if cfg.insight.uniform_enable:
    data["chisq"] = chisquare(data["hist"][0])

    ## if cfg.insight.normal_enable
    data["norm"] = normaltest(data["hist"][0])

    ## if cfg.insight.negative_enable:
    data["nneg"] = (srs < 0).sum()

    ## if cfg.insight.skew_enabled:
    data["skew"] = skewtest(data["hist"][0])

    ## if cfg.insight.unique_enabled:
    data["nuniq"] = srs.nunique()

    ## if cfg.insight.zero_enabled:
    data["nzero"] = (srs == 0).sum()

    return data

def hist1d_from_mpa_data(file_, xchannel, nxbins=1024, chunk_size=TYPICAL_DASK_CHUNK):
    with h5py.File(file_, "r") as f:
        config = f["CFG"]
        xmin = 0
        try:
            xmax = config[xchannel].attrs["range"]
        except:
            xmax = INVALID_ADC_VALUE - 1

        events = f["EVENTS"]
        xdata = da.from_array(events[xchannel], chunks=chunk_size)
        binned, ex = da.histogram(xdata, nxbins, range=(xmin, xmax))
        with DaskProgressBar():
            binned = binned.compute()

    return binned, ex

def test_histogram_alternative_bins_range():
    v = da.random.random(100, chunks=10)
    (a1, b1) = da.histogram(v, bins=10, range=(0, 1))
    (a2, b2) = np.histogram(v, bins=10, range=(0, 1))
    assert_eq(a1, a2)
    assert_eq(b1, b2)