def stadistics(self):
    headers = ["group", "mean", "std dev", "min", "25%", "50%", "75%", "max",
               "nonzero", "nonan", "unique", "dtype"]
    self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)
    table = []
    for group, (dtype, _) in self.dtypes.fields.items():
        values = dict()
        values["dtype"] = dtype
        values["group"] = group
        darray = self.data[group].da
        if dtype == np.dtype(float) or dtype == np.dtype(int):
            # Numeric groups: compute rounded summary statistics in one pass,
            # falling back to NaN-aware reductions when NaNs are present.
            da_mean = da.around(darray.mean(), decimals=3)
            da_std = da.around(darray.std(), decimals=3)
            da_min = da.around(darray.min(), decimals=3)
            da_max = da.around(darray.max(), decimals=3)
            result = dask.compute([da_mean, da_std, da_min, da_max])[0]
            values["mean"] = result[0] if not np.isnan(result[0]) else da.around(da.nanmean(darray), decimals=3).compute()
            values["std dev"] = result[1] if not np.isnan(result[0]) else da.around(da.nanstd(darray), decimals=3).compute()
            values["min"] = result[2] if not np.isnan(result[0]) else da.around(da.nanmin(darray), decimals=3).compute()
            values["max"] = result[3] if not np.isnan(result[0]) else da.around(da.nanmax(darray), decimals=3).compute()
            if len(self.shape[group]) == 1:
                da_percentile = da.around(da.percentile(darray, [25, 50, 75]), decimals=3)
                result = da_percentile.compute()
                values["25%"] = result[0]
                values["50%"] = result[1]
                values["75%"] = result[2]
            else:
                values["25%"] = "-"
                values["50%"] = "-"
                values["75%"] = "-"
            values["nonzero"] = da.count_nonzero(darray).compute()
            values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
            values["unique"] = "-"
        else:
            # Non-numeric groups: only the non-null and unique counts are meaningful.
            values["mean"] = "-"
            values["std dev"] = "-"
            values["min"] = "-"
            values["max"] = "-"
            values["25%"] = "-"
            values["50%"] = "-"
            values["75%"] = "-"
            values["nonzero"] = "-"
            values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
            vunique = darray.to_dask_dataframe().fillna('').nunique().compute()
            values["unique"] = vunique
        row = []
        for column in headers:
            row.append(values[column])
        table.append(row)
    print("# rows {}".format(self.shape[0]))
    return tabulate(table, headers)
def test_count_nonzero_str():
    x = np.array(list("Hello world"))
    d = da.from_array(x, chunks=(4,))

    x_c = np.count_nonzero(x)
    d_c = da.count_nonzero(d)

    assert x_c == d_c.compute()
def test_count_nonzero_obj():
    x = np.random.randint(10, size=(15, 16)).astype(object)
    d = da.from_array(x, chunks=(4, 5))

    x_c = np.count_nonzero(x)
    d_c = da.count_nonzero(d)

    if d_c.shape == tuple():
        assert x_c == d_c.compute()
    else:
        assert_eq(x_c, d_c)
def count_value(pixel_value):
    global total_count
    if total_count >= max_pixels:
        return 0
    if parallel:
        count = da.count_nonzero(dataset == pixel_value).compute()
    else:
        count = np.count_nonzero(dataset == pixel_value)
    total_count += count
    progress.setValue(int(total_count * 100 / max_pixels))
    return count
def test_count_nonzero_axis(axis):
    for shape, chunks in [((0, 0), (0, 0)), ((15, 16), (4, 5))]:
        x = np.random.randint(10, size=shape)
        d = da.from_array(x, chunks=chunks)

        x_c = np.count_nonzero(x, axis)
        d_c = da.count_nonzero(d, axis)

        if d_c.shape == tuple():
            assert x_c == d_c.compute()
        else:
            assert_eq(x_c, d_c)
def coverage_total(division_dict, cost_coverage=False):
    conus_path = DP.join("rasters", "albers", "acre", "masks", "conus.tif")
    code_path = DP.join("rasters", "albers", "acre", "cost_codes.tif")
    cost_path = DP.join("rasters/albers/acre/rent_map.tif")
    chunks = {"band": 1, "x": 5000, "y": 5000}

    # Read in the tifs
    codes = xr.open_rasterio(code_path, chunks=chunks)[0].data
    conus = xr.open_rasterio(conus_path, chunks=chunks)[0].data
    costs = xr.open_rasterio(cost_path, chunks=chunks)[0].data
    divisions = xr.open_rasterio(DIVISIONS_PATH, chunks=chunks)[0].data

    # Set nans to zero (count_nonzero counts nans)
    codes[da.isnan(codes)] = 0

    coverages = {}
    with Client():
        for key, item in tqdm(division_dict.items(), position=0):
            div = conus[divisions == key]
            total = da.count_nonzero(div)

            # If calculating costs
            if cost_coverage:
                coverage = codes[((costs > 0) | (codes == 9999)) & (divisions == key)]
                coded = da.count_nonzero(coverage)
            else:
                coverage = codes[divisions == key]
                coded = da.count_nonzero(coverage)

            ratio = coded / total
            coverages[item] = ratio.compute()

    df = pd.DataFrame(coverages, index=[0]).T
    df.columns = ["total_coverage"]

    return df
def get_counts(cost_coverage=False):
    """Get cell counts for each category."""
    code_dict = get_codes(cost_coverage)

    # Read in code and conus rasters
    chunks = {"band": 1, "x": 5000, "y": 5000}
    code_path = DP.join("rasters/albers/acre/cost_codes.tif")
    cost_path = DP.join("rasters/albers/acre/rent_map.tif")
    conus_path = DP.join("rasters/albers/acre/masks/conus.tif")
    codes = xr.open_rasterio(code_path, chunks=chunks)[0].data
    costs = xr.open_rasterio(cost_path, chunks=chunks)[0].data
    conus = xr.open_rasterio(conus_path, chunks=chunks)[0].data

    # Dask array's `count_nonzero` counts na values
    codes[da.isnan(codes)] = 0
    conus[da.isnan(conus)] = 0

    # If calculating costs
    if cost_coverage:
        coverage = codes[(costs > 0) | (codes == 9999)]  # No exclusion in cost
    else:
        coverage = codes.copy()

    # Extract code from dictionary
    blm_codes = code_dict["blm"]
    tribal_codes = code_dict["tribal"]
    state_codes = code_dict["state"]
    private_codes = code_dict["private"]

    # Arrays
    developable = conus[codes != 9999]
    dev_covered = coverage[coverage != 9999]
    excl = coverage[coverage == 9999]
    blm = coverage[da.isin(coverage, blm_codes)]
    tribal = coverage[da.isin(coverage, tribal_codes)]
    state = coverage[da.isin(coverage, state_codes)]
    private = coverage[da.isin(coverage, private_codes)]
    arrays = {"excl": excl, "blm": blm, "tribal": tribal, "state": state,
              "private": private, "covered": coverage, "total": conus,
              "developable": developable, "dev_covered": dev_covered}

    # Collect counts
    counts = {}
    with Client():
        for key, item in tqdm(arrays.items(), position=0):
            counts["n" + key] = da.count_nonzero(item).compute()

    return counts
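The raster scripts above zero out NaN cells before counting because NaN is a truthy value, so count_nonzero includes it. The following is a minimal sketch of that behavior; the toy values are illustrative only and are not taken from any of the projects above, and the in-place boolean-mask assignment assumes a reasonably recent dask release.

import numpy as np
import dask.array as da

# Toy array standing in for a raster band (illustrative values only).
x = da.from_array(np.array([0.0, 1.0, np.nan, 3.0]), chunks=2)

# NaN is nonzero, so it is included in the raw count.
print(da.count_nonzero(x).compute())              # 3  (1.0, nan, 3.0)

# Count only valid (non-NaN) entries, as in the stadistics() snippet above.
print(da.count_nonzero(da.notnull(x)).compute())  # 3  (0.0, 1.0, 3.0)

# Zeroing NaNs first, as the raster scripts do, removes them from the count.
x[da.isnan(x)] = 0
print(da.count_nonzero(x).compute())              # 2  (1.0, 3.0)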
def test_count_nonzero_obj_axis(axis):
    x = np.random.randint(10, size=(15, 16)).astype(object)
    d = da.from_array(x, chunks=(4, 5))

    x_c = np.count_nonzero(x, axis)
    d_c = da.count_nonzero(d, axis)

    if d_c.shape == tuple():
        assert x_c == d_c.compute()
    else:
        #######################################################
        # Workaround oddness with Windows and object arrays.  #
        #                                                     #
        # xref: https://github.com/numpy/numpy/issues/9468    #
        #######################################################
        assert_eq(x_c.astype(np.intp), d_c)
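The object- and string-dtype tests above rely on da.count_nonzero applying NumPy's truthiness rules chunk by chunk, so empty strings and the object 0 count as zero. A short sketch of that behavior, with toy arrays that are not taken from the test suite:

import numpy as np
import dask.array as da

# Strings follow NumPy truthiness: only the empty string counts as zero.
s = da.from_array(np.array(["a", "", "b", ""], dtype=object), chunks=2)
print(da.count_nonzero(s).compute())  # 2

# Object arrays of numbers behave like their numeric counterparts.
o = da.from_array(np.array([0, 1, 2, 0], dtype=object), chunks=2)
print(da.count_nonzero(o).compute())  # 2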
def two_steps(x, y, r0, r, method):
    n = y.shape[0]
    if method == 'uni':
        pis_uni = np.repeat(1 / n, n)
        x_uni, y_uni, pi_uni = subsample(x, y, r + r0, pis_uni)
        beta_uni = newton(x_uni, y_uni, pi_uni)
        result = beta_uni
    else:
        n1 = da.count_nonzero(y).compute()
        n0 = n - n1
        pis_prop = y.compute() * (1 / (2 * n1))
        for count in range(y.shape[0]):
            if pis_prop[count] == 0:
                pis_prop[count] = 1 / (2 * n0)
        pis_prop = da.from_array(pis_prop, chunks=(500000,))
        x_prop, y_prop, pis_prop = subsample(x, y, r0, pis_prop)
        beta0 = newton(x_prop, y_prop, pis_prop)
        if method == 'mvc':
            pi_mVc = pis_mVc(x, y, beta0)
            x_mVc, y_mVc, pismVc = subsample(x, y, r, pi_mVc)
            x1 = da.concatenate([x_prop, x_mVc])
            y1 = da.concatenate([y_prop, y_mVc])
            pis1 = da.concatenate([pis_prop, pismVc])
            beta_mVc = newton(x1, y1, pis1)
            result = beta_mVc
        elif method == 'mmse':
            pi_mMSE = pis_mMSE(x, y, x_prop, pis_prop, beta0)
            # pi_mMSE = pis_mMSE(x, y, beta0)
            x_mMSE, y_mMSE, pismMSE = subsample(x, y, r, pi_mMSE)
            x1 = np.append(x_prop, x_mMSE, axis=0)
            y1 = np.append(y_prop, y_mMSE, axis=0)
            pis1 = np.append(pis_prop, pismMSE, axis=0)
            beta_mMSE = newton(x1, y1, pis1)
            result = beta_mMSE
        elif method == 'lcc':
            pi_LCC = pis_LCC(x, y, beta0)
            x_LCC, y_LCC, pisLCC = subsample(x, y, r, pi_LCC)
            beta_LCC = mle(x_LCC, y_LCC) + beta0
            result = beta_LCC
    return result
def _benchmark_pca(self, gt):
    # Count alleles at each variant
    self.benchmark_profiler.start_benchmark('PCA: Count alleles')
    ac = gt.count_alleles()
    self.benchmark_profiler.end_benchmark()

    # Count number of multiallelic SNPs
    self.benchmark_profiler.start_benchmark('PCA: Count multiallelic SNPs')
    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        num_multiallelic_snps = da.count_nonzero(ac.max_allele() > 1).compute()
    else:
        num_multiallelic_snps = np.count_nonzero(ac.max_allele() > 1)
    self.benchmark_profiler.end_benchmark()
    del num_multiallelic_snps

    # Count number of biallelic singletons
    self.benchmark_profiler.start_benchmark('PCA: Count biallelic singletons')
    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        num_biallelic_singletons = da.count_nonzero(
            (ac.max_allele() == 1) & ac.is_singleton(1)).compute()
    else:
        num_biallelic_singletons = np.count_nonzero(
            (ac.max_allele() == 1) & ac.is_singleton(1))
    self.benchmark_profiler.end_benchmark()
    del num_biallelic_singletons

    # Apply filtering to remove singletons and multiallelic SNPs
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    flt_count = np.count_nonzero(flt)
    self.benchmark_profiler.start_benchmark(
        'PCA: Remove singletons and multiallelic SNPs')
    if flt_count > 0:
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            gf = gt.take(np.flatnonzero(flt), axis=0)
        else:
            gf = gt.compress(condition=flt, axis=0)
    else:
        # Don't apply filtering
        print('[Exec][PCA] Cannot remove singletons and multiallelic SNPs '
              'as no data would remain. Skipping...')
        gf = gt
    self.benchmark_profiler.end_benchmark()
    del ac, flt, flt_count

    # Transform genotype data into 2-dim matrix
    self.benchmark_profiler.start_benchmark('PCA: Transform genotype data for PCA')
    gn = gf.to_n_alt()
    self.benchmark_profiler.end_benchmark()
    del gf

    # Randomly choose subset of SNPs
    if self.bench_conf.pca_subset_size == -1:
        print('[Exec][PCA] Including all ({}) variants for PCA.'.format(gn.shape[0]))
        gnr = gn
    else:
        n = min(gn.shape[0], self.bench_conf.pca_subset_size)
        print('[Exec][PCA] Including {} random variants for PCA.'.format(n))
        vidx = np.random.choice(gn.shape[0], n, replace=False)
        vidx.sort()
        if self.bench_conf.genotype_array_type in [config.GENOTYPE_ARRAY_NORMAL,
                                                   config.GENOTYPE_ARRAY_CHUNKED]:
            gnr = gn.take(vidx, axis=0)
        elif self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Use indexing workaround since Dask Array's take() method is not
            # working properly
            gnr = gn[vidx]
        else:
            print('[Exec][PCA] Error: Unspecified genotype array type specified.')
            exit(1)
        del vidx

    if self.bench_conf.pca_ld_enabled:
        if self.bench_conf.genotype_array_type != config.GENOTYPE_ARRAY_DASK:
            # Apply LD pruning to subset of SNPs
            size = self.bench_conf.pca_ld_pruning_size
            step = self.bench_conf.pca_ld_pruning_step
            threshold = self.bench_conf.pca_ld_pruning_threshold
            n_iter = self.bench_conf.pca_ld_pruning_number_iterations
            self.benchmark_profiler.start_benchmark('PCA: Apply LD pruning')
            gnu = self._pca_ld_prune(gnr, size=size, step=step,
                                     threshold=threshold, n_iter=n_iter)
            self.benchmark_profiler.end_benchmark()
        else:
            print('[Exec][PCA] Cannot apply LD pruning because Dask genotype '
                  'arrays do not support this operation.')
            gnu = gnr
    else:
        print('[Exec][PCA] LD pruning disabled. Skipping this operation.')
        gnu = gnr

    # Run PCA analysis
    pca_num_components = self.bench_conf.pca_number_components
    scaler = self.bench_conf.pca_data_scaler

    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        # Rechunk Dask array to work with Dask's svd function
        # (single chunk for transposed column)
        gnu_pca_conv = gnu.rechunk({0: -1, 1: gt.values.chunksize[1]})
    else:
        gnu_pca_conv = gnu

    # Run conventional PCA analysis
    self.benchmark_profiler.start_benchmark(
        'PCA: Run conventional PCA analysis (scaler: {})'.format(
            scaler if scaler is not None else 'none'))
    coords, model = allel.pca(gnu_pca_conv, n_components=pca_num_components,
                              scaler=scaler)
    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        coords.compute()
    self.benchmark_profiler.end_benchmark()
    del gnu_pca_conv, coords, model

    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        # Rechunk Dask array to match original genotype chunk size
        gnu_pca_rand = gnu.rechunk((gt.values.chunksize[0], gt.values.chunksize[1]))
    else:
        gnu_pca_rand = gnu

    # Run randomized PCA analysis
    self.benchmark_profiler.start_benchmark(
        'PCA: Run randomized PCA analysis (scaler: {})'.format(
            scaler if scaler is not None else 'none'))
    coords, model = allel.randomized_pca(gnu_pca_rand,
                                         n_components=pca_num_components,
                                         scaler=scaler)
    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        coords.compute()
    self.benchmark_profiler.end_benchmark()
    del gnu_pca_rand, coords, model
in_cat[5]['Mass'] = in_cat[5]['BH_Mass']

# Make a combined catalog of all particles and masses
if sim_type == 'hydro':
    part_types = ['gas', 'DM', 'stars', 'BHs']
    comb_cat = nbk.MultipleSpeciesCatalog(part_types,
                                          in_cat[0], in_cat[1], in_cat[4], in_cat[5])
elif sim_type == 'baryons':
    part_types = ['gas', 'stars', 'BHs']
    comb_cat = nbk.MultipleSpeciesCatalog(part_types,
                                          in_cat[0], in_cat[4], in_cat[5])

if do_mass_moments_only:
    # Compute moments of particle mass distribution. I don't know a one-line
    # command for taking the mean over multiple fields, so I do it manually.
    m_len = {pt: comb_cat.compute(da.count_nonzero(comb_cat[pt + '/Mass']))
             for pt in part_types}
    m_sum = {pt: comb_cat.compute(comb_cat[pt + '/Mass'].sum()) for pt in part_types}
    m2_arr = {pt: comb_cat[pt + '/Mass']**2. for pt in part_types}
    m2_sum = {pt: comb_cat.compute(m2_arr[pt].sum()) for pt in part_types}
    m3_arr = {pt: comb_cat[pt + '/Mass']**3. for pt in part_types}
    m3_sum = {pt: comb_cat.compute(m3_arr[pt].sum()) for pt in part_types}

    # Convert dict views to lists so np.sum works under Python 3
    m_len_all = np.sum(list(m_len.values()))
    m_mean = np.sum(list(m_sum.values())) / m_len_all
    m2_mean = np.sum(list(m2_sum.values())) / m_len_all
    m3_mean = np.sum(list(m3_sum.values())) / m_len_all

    print_status(comm, start_time, 'Mean particle mass: %g' % m_mean)
    print_status(comm, start_time, 'Mean squared particle mass: %g' % m2_mean)
    print_status(comm, start_time, 'Mean cubed particle mass: %g' % m3_mean)
else: