def similarity_threshold(fine_image_t0):
    # Treat zeros as nodata so they do not bias the deviation estimate.
    fine_image_t0 = da.where(fine_image_t0 == 0, np.nan, fine_image_t0)
    st_dev = da.nanstd(fine_image_t0, axis=1)
    # `numberClass` is a module-level constant (the number of classes).
    sim_threshold = st_dev * 2 / numberClass
    print("Done similarity threshold!", sim_threshold)
    return sim_threshold
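# A minimal, hypothetical usage sketch for similarity_threshold. The input
# shape and the value of numberClass below are illustrative stand-ins, not
# taken from the source.
import numpy as np
import dask.array as da

numberClass = 4  # assumed module-level constant
fine_image_t0 = da.random.random((6, 100), chunks=(3, 50))
thresholds = similarity_threshold(fine_image_t0)
print(thresholds.compute())  # one threshold per row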
def _hotspots_dask_numpy(raster, kernel):
    # Apply the (normalized) kernel to the raster values.
    mean_array = convolve_2d(raster.data, kernel / kernel.sum())

    # Calculate z-scores against the global statistics.
    global_mean = da.nanmean(raster.data)
    global_std = da.nanstd(raster.data)
    # The zero-std guard is left commented out: evaluating it would force an
    # early compute of the lazy reduction.
    # if global_std == 0:
    #     raise ZeroDivisionError(
    #         "Standard deviation of the input raster values is 0.")
    z_array = (mean_array - global_mean) / global_std

    _func = partial(_calc_hotspots_numpy)
    pad_h = kernel.shape[0] // 2
    pad_w = kernel.shape[1] // 2
    out = z_array.map_overlap(_func,
                              depth=(pad_h, pad_w),
                              boundary=np.nan,
                              meta=np.array(()))
    return out
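# convolve_2d and _calc_hotspots_numpy above come from the surrounding
# library; the self-contained sketch below reproduces the same blockwise
# pattern with scipy.ndimage as a stand-in, not the library's actual API.
import numpy as np
import dask.array as da
from scipy.ndimage import convolve

data = da.random.random((100, 100), chunks=(50, 50))
kernel = np.ones((3, 3))

# Blockwise moving average with a one-cell halo, mirroring map_overlap above.
mean_array = data.map_overlap(
    lambda block: convolve(block, kernel / kernel.sum()),
    depth=1, boundary='reflect')

# Global z-scores; both reductions stay lazy until compute().
z = (mean_array - da.nanmean(data)) / da.nanstd(data)
print(z.compute().shape)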
def get_array_moments(
        array: da.core.Array,
        mean: bool = True,
        std: bool = True,
        std_method: str = 'binom',
        axis: int = 0) -> Tuple[Optional[da.core.Array], Optional[da.core.Array]]:
    """
    Computes the specified array moments.

    Parameters
    ----------
    array : array_like, shape (N, P)
        Array that moments will be computed from.
    mean : bool
        Whether to compute the mean of `array` along `axis`.
    std : bool
        Whether to compute the std of `array` along `axis`.
    std_method : str
        Method used to compute the standard deviation. Possible methods are:
            'norm'  ---> normal-distribution standard deviation (see np.std)
            'binom' ---> binomial standard deviation, sqrt(2*p*(1-p)),
                         where p = mean/2
    axis : int
        Axis to compute the mean and std along.

    Returns
    -------
    array_mean : da.core.Array, optional
        None if `mean` is False, otherwise the array mean.
    array_std : da.core.Array, optional
        None if `std` is False, otherwise the array std.
    """
    array_mean = None
    array_std = None
    if mean:
        array_mean = da.nanmean(array, axis=axis)
    if std:
        if std_method == 'binom':
            u = (array_mean if mean else da.nanmean(array, axis=axis)) / 2
            array_std = da.sqrt(2 * u * (1 - u))
        elif std_method == 'norm':
            array_std = da.nanstd(array, axis=axis)
        else:
            raise NotImplementedError(
                f'std_method, {std_method}, is not implemented')
    array_mean, array_std = persist(array_mean, array_std)
    return array_mean, array_std
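# Hypothetical call to get_array_moments on a genotype-like 0/1/2 matrix;
# the shape and chunking are illustrative only.
import dask.array as da

geno = da.random.randint(0, 3, size=(200, 50), chunks=(100, 25)).astype(float)
col_mean, col_std = get_array_moments(geno, mean=True, std=True,
                                      std_method='binom')
print(col_mean.compute()[:3], col_std.compute()[:3])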
def _calculate_summary_statistics(self):
    data = self._lazy_data()
    _raveled = data.ravel()
    _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
        da.nanmean(data),
        da.nanstd(data),
        da.nanmin(data),
        da.percentile(_raveled, [25, ]),
        da.percentile(_raveled, [50, ]),
        da.percentile(_raveled, [75, ]),
        da.nanmax(data),
    )
    return _mean, _std, _min, _q1, _q2, _q3, _max
def dasky_scotts_bin_width(data, return_bins=True):
    r"""Dask version of scotts_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using Scott's rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is:

    .. math::

        \Delta_b = \frac{3.5\sigma}{n^{1/3}}

    where :math:`\sigma` is the standard deviation of the data, and
    :math:`n` is the number of data points.

    See Also
    --------
    knuth_bin_width, freedman_bin_width, astroML.plotting.hist
    """
    if not isinstance(data, da.Array):
        raise TypeError('data has to be a dask array')
    if data.ndim != 1:
        data = data.flatten()

    n = data.size
    sigma = da.nanstd(data)
    dx = 3.5 * sigma / (n ** (1. / 3))
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = np.ceil((mx - mn) / c_dx)
        Nbins = max(1, Nbins)
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
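# Example use of dasky_scotts_bin_width together with da.histogram; the data
# here are synthetic.
import dask.array as da

data = da.random.normal(0, 1, size=100_000, chunks=10_000)
width, bins = dasky_scotts_bin_width(data, return_bins=True)
counts, edges = da.histogram(data, bins=bins)
print(width, counts.compute().sum())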
def test_nan():
    x = np.array([[1, np.nan, 3, 4],
                  [5, 6, 7, np.nan],
                  [9, 10, 11, 12]])
    d = da.from_array(x, chunks=(2, 2))

    assert_eq(np.nansum(x), da.nansum(d))
    assert_eq(np.nansum(x, axis=0), da.nansum(d, axis=0))
    assert_eq(np.nanmean(x, axis=1), da.nanmean(d, axis=1))
    assert_eq(np.nanmin(x, axis=1), da.nanmin(d, axis=1))
    assert_eq(np.nanmax(x, axis=(0, 1)), da.nanmax(d, axis=(0, 1)))
    assert_eq(np.nanvar(x), da.nanvar(d))
    assert_eq(np.nanstd(x, axis=0), da.nanstd(d, axis=0))
    assert_eq(np.nanargmin(x, axis=0), da.nanargmin(d, axis=0))
    assert_eq(np.nanargmax(x, axis=0), da.nanargmax(d, axis=0))
    assert_eq(np.nanprod(x), da.nanprod(d))
def test_nan():
    x = np.array([[1, np.nan, 3, 4],
                  [5, 6, 7, np.nan],
                  [9, 10, 11, 12]])
    d = da.from_array(x, blockshape=(2, 2))

    assert eq(np.nansum(x), da.nansum(d))
    assert eq(np.nansum(x, axis=0), da.nansum(d, axis=0))
    assert eq(np.nanmean(x, axis=1), da.nanmean(d, axis=1))
    assert eq(np.nanmin(x, axis=1), da.nanmin(d, axis=1))
    assert eq(np.nanmax(x, axis=(0, 1)), da.nanmax(d, axis=(0, 1)))
    assert eq(np.nanvar(x), da.nanvar(d))
    assert eq(np.nanstd(x, axis=0), da.nanstd(d, axis=0))
    assert eq(np.nanargmin(x, axis=0), da.nanargmin(d, axis=0))
    assert eq(np.nanargmax(x, axis=0), da.nanargmax(d, axis=0))

    with ignoring(AttributeError):
        assert eq(np.nanprod(x), da.nanprod(d))
def _calculate_summary_statistics(self, rechunk=True):
    if rechunk is True:
        # Use dask's auto rechunking instead of HyperSpy's, which should be
        # better suited for these operations.
        rechunk = "dask_auto"
    data = self._lazy_data(rechunk=rechunk)
    _raveled = data.ravel()
    _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
        da.nanmean(data),
        da.nanstd(data),
        da.nanmin(data),
        da.percentile(_raveled, [25, ]),
        da.percentile(_raveled, [50, ]),
        da.percentile(_raveled, [75, ]),
        da.nanmax(data),
    )
    return _mean, _std, _min, _q1, _q2, _q3, _max
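# The single da.compute call above evaluates all seven reductions in one
# pass over the task graph instead of one compute per statistic. A
# standalone sketch of the same pattern, with made-up data:
import dask.array as da

data = da.random.random((1_000, 1_000), chunks=(250, 250))
mean, std, q1, q2, q3 = da.compute(
    da.nanmean(data),
    da.nanstd(data),
    da.percentile(data.ravel(), [25]),
    da.percentile(data.ravel(), [50]),
    da.percentile(data.ravel(), [75]),
)
print(mean, std, q1[0], q2[0], q3[0])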
def test_nan():
    x = np.array([[1, np.nan, 3, 4],
                  [5, 6, 7, np.nan],
                  [9, 10, 11, 12]])
    d = da.from_array(x, chunks=(2, 2))

    assert_eq(np.nansum(x), da.nansum(d))
    assert_eq(np.nansum(x, axis=0), da.nansum(d, axis=0))
    assert_eq(np.nanmean(x, axis=1), da.nanmean(d, axis=1))
    assert_eq(np.nanmin(x, axis=1), da.nanmin(d, axis=1))
    assert_eq(np.nanmax(x, axis=(0, 1)), da.nanmax(d, axis=(0, 1)))
    assert_eq(np.nanvar(x), da.nanvar(d))
    assert_eq(np.nanstd(x, axis=0), da.nanstd(d, axis=0))
    assert_eq(np.nanargmin(x, axis=0), da.nanargmin(d, axis=0))
    assert_eq(np.nanargmax(x, axis=0), da.nanargmax(d, axis=0))
    assert_eq(nanprod(x), da.nanprod(d))
def _scott_bw_dask(data, return_bins=True):
    r"""Dask version of scotts_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using Scott's rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is:

    .. math::

        \Delta_b = \frac{3.5\sigma}{n^{1/3}}

    where :math:`\sigma` is the standard deviation of the data, and
    :math:`n` is the number of data points.
    """
    if not isinstance(data, da.Array):
        raise TypeError("Expected a dask array")
    if data.ndim != 1:
        data = data.flatten()

    n = data.size
    sigma = da.nanstd(data)
    dx = 3.5 * sigma * n ** (-1.0 / 3.0)
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = max(1, np.ceil((mx - mn) / c_dx))
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
def test_make_snp_array_case_normal(shape, threshold):
    # Assume a non-degenerate 2-D array.
    assume(shape[0] > 1 and shape[1] > 1)
    arr = da.random.random(size=shape)
    arr[arr > threshold] = float('nan')
    # Assert that every tested arr has a non-zero std for each column.
    assume(da.mean(da.nanstd(arr, axis=0) > 0) == 1)
    snp_array = utils.make_snp_array(arr, mean=True, std=True,
                                     std_method='norm', dtype='float')
    np.testing.assert_array_almost_equal(1 + snp_array.mean(axis=0),
                                         np.ones(shape[1]))
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, chunks=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, blockshape=(2, ))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
def test_make_snp_array_case_normal(shape, max_value, mask_nans):
    # Assume a non-degenerate 2-D array.
    assume(shape[0] > 1 and shape[1] > 1)
    arr = da.random.randint(0, max_value, size=shape)
    if mask_nans:
        arr[arr == max_value - 1] = float('nan')
    # Assert that every tested arr has a non-zero std for each column.
    assume(da.mean(da.nanstd(arr, axis=0) > 0) == 1)
    snp_array = utils.make_snp_array(arr, mean=True, std=True,
                                     std_method='norm', mask_nan=mask_nans,
                                     dtype='int8')
    np.testing.assert_array_almost_equal(1 + snp_array.mean(axis=0),
                                         np.ones(shape[1]))
def xsapr_clutter(files, clutter_thresh_min=0.0002,
                  clutter_thresh_max=1.5, radius=1,
                  write_radar=True, out_file=None, use_dask=False):
    """
    X-SAPR Wind Farm Clutter Calculation

    Parameters
    ----------
    files : list
        List of radar files used for X-SAPR clutter calculation.

    Other Parameters
    ----------------
    clutter_thresh_min : float
        Threshold above which clutter values are considered clutter, as long
        as they are also below clutter_thresh_max.
    clutter_thresh_max : float
        Threshold below which clutter values are considered clutter, as long
        as they are also above clutter_thresh_min.
    radius : int
        Radius of the area surrounding the clutter gate that will also be
        flagged as clutter.
    write_radar : bool
        Whether or not to write the clutter radar as a netCDF file.
        Default is True.
    out_file : string
        Location and filename to write the radar object to, if write_radar
        is True.
    use_dask : bool
        Use dask instead of running stats for the calculation (good for
        running in parallel).

    Returns
    -------
    clutter_radar : Radar
        Radar object with the clutter field that was calculated. This radar
        only has the clutter field, but maintains all other radar
        specifications.
    """
    def get_reflect_array(file, first_shape):
        """ Retrieves a reflectivity array for a radar volume. """
        try:
            radar = pyart.io.read(file)
            reflect_array = deepcopy(radar.fields['reflectivity']['data'])
            del radar
            if reflect_array.shape == first_shape:
                return reflect_array.filled(fill_value=np.nan)
        except TypeError:
            print(file + ' is corrupt...skipping!')
        return np.nan * np.zeros(first_shape)

    if use_dask is False:
        run_stats = _RunningStats()
        first_shape = 0
        for file in files:
            try:
                radar = pyart.io.read(file)
                reflect_array = radar.fields['reflectivity']['data']
                if first_shape == 0:
                    first_shape = reflect_array.shape
                    clutter_radar = radar
                # Push each volume once, skipping mismatched shapes.
                if reflect_array.shape == first_shape:
                    run_stats.push(reflect_array)
                del radar
            except TypeError:
                print(file + ' is corrupt...skipping!')
                continue
        mean = run_stats.mean()
        stdev = run_stats.standard_deviation()
        clutter_values = stdev / mean
    else:
        first_shape = 0
        i = 0
        while first_shape == 0:
            try:
                radar = pyart.io.read(files[i])
                reflect_array = radar.fields['reflectivity']['data']
                first_shape = reflect_array.shape
                clutter_radar = radar
            except TypeError:
                print(files[i] + ' is corrupt...skipping!')
                i = i + 1
                continue
        arrays = [delayed(get_reflect_array)(file, first_shape)
                  for file in files]
        array = [da.from_delayed(a, shape=first_shape, dtype=float)
                 for a in arrays]
        array = da.stack(array, axis=0)
        print('## Calculating mean in parallel...')
        mean = np.array(da.nanmean(array, axis=0))
        print('## Calculating standard deviation...')
        stdev = np.array(da.nanstd(array, axis=0))
        clutter_values = stdev / mean

    clutter_values = np.ma.masked_invalid(clutter_values)
    shape = clutter_values.shape
    mask = np.ma.getmask(clutter_values)
    is_clutters = np.argwhere(
        np.logical_and(clutter_values > clutter_thresh_min,
                       clutter_values < clutter_thresh_max))
    clutter_array = _clutter_marker(is_clutters, shape, mask, radius)
    clutter_radar.fields.clear()
    clutter_dict = _clutter_to_dict(clutter_array)
    clutter_radar.add_field('xsapr_clutter', clutter_dict,
                            replace_existing=True)
    if write_radar is True:
        pyart.io.write_cfradial(out_file, clutter_radar)
    del clutter_radar
    return
def stadistics(self):
    headers = ["group", "mean", "std dev", "min", "25%", "50%", "75%",
               "max", "nonzero", "nonan", "unique", "dtype"]
    self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)
    table = []
    for group, (dtype, _) in self.dtypes.fields.items():
        values = dict()
        values["dtype"] = dtype
        values["group"] = group
        darray = self.data[group].da
        if dtype == np.dtype(float) or dtype == np.dtype(int):
            da_mean = da.around(darray.mean(), decimals=3)
            da_std = da.around(darray.std(), decimals=3)
            da_min = da.around(darray.min(), decimals=3)
            da_max = da.around(darray.max(), decimals=3)
            result = dask.compute([da_mean, da_std, da_min, da_max])[0]
            # If the plain mean is NaN the data contain NaNs, so fall back
            # to the NaN-aware reductions for all four statistics.
            if np.isnan(result[0]):
                values["mean"] = da.around(da.nanmean(darray), decimals=3).compute()
                values["std dev"] = da.around(da.nanstd(darray), decimals=3).compute()
                values["min"] = da.around(da.nanmin(darray), decimals=3).compute()
                values["max"] = da.around(da.nanmax(darray), decimals=3).compute()
            else:
                values["mean"], values["std dev"], values["min"], values["max"] = result
            if len(self.shape[group]) == 1:
                da_percentile = da.around(da.percentile(darray, [25, 50, 75]),
                                          decimals=3)
                values["25%"], values["50%"], values["75%"] = da_percentile.compute()
            else:
                values["25%"] = values["50%"] = values["75%"] = "-"
            values["nonzero"] = da.count_nonzero(darray).compute()
            values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
            values["unique"] = "-"
        else:
            for key in ("mean", "std dev", "min", "max",
                        "25%", "50%", "75%", "nonzero"):
                values[key] = "-"
            values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
            values["unique"] = darray.to_dask_dataframe().fillna('').nunique().compute()
        table.append([values[column] for column in headers])
    print("# rows {}".format(self.shape[0]))
    return tabulate(table, headers)
def tall_clutter(files, config,
                 clutter_thresh_min=0.0002,
                 clutter_thresh_max=0.25, radius=1,
                 write_radar=True, out_file=None, use_dask=False):
    """
    Wind Farm Clutter Calculation

    Parameters
    ----------
    files : list
        List of radar files used for the clutter calculation.
    config : str
        String representing the configuration for the radar.
        Such possible configurations are listed in default_config.py

    Other Parameters
    ----------------
    clutter_thresh_min : float
        Threshold above which clutter values are considered clutter, as long
        as they are also below clutter_thresh_max.
    clutter_thresh_max : float
        Threshold below which clutter values are considered clutter, as long
        as they are also above clutter_thresh_min.
    radius : int
        Radius of the area surrounding the clutter gate that will also be
        flagged as clutter.
    write_radar : bool
        Whether or not to write the clutter radar as a netCDF file.
        Default is True.
    out_file : string
        Location and filename to write the radar object to, if write_radar
        is True.
    use_dask : bool
        Use dask instead of running stats for the calculation. This will
        reduce run time.

    Returns
    -------
    clutter_radar : Radar
        Radar object with the clutter field that was calculated. This radar
        only has the clutter field, but maintains all other radar
        specifications.
    """
    field_names = get_field_names(config)
    refl_field = field_names["reflectivity"]
    vel_field = field_names["velocity"]
    ncp_field = field_names["normalized_coherent_power"]

    def get_reflect_array(file, first_shape):
        """ Retrieves a reflectivity array for a radar volume. """
        try:
            radar = pyart.io.read(
                file, include_fields=[refl_field, ncp_field, vel_field])
            reflect_array = deepcopy(radar.fields[refl_field]['data'])
            ncp = radar.fields[ncp_field]['data']
            height = radar.gate_z["data"]
            up_in_the_air = height > 2000.0
            the_mask = np.logical_or.reduce(
                (ncp < 0.8, reflect_array.mask, up_in_the_air))
            reflect_array = np.ma.masked_where(the_mask, reflect_array)
            del radar
            if reflect_array.shape == first_shape:
                return reflect_array.filled(fill_value=np.nan)
        except (TypeError, OSError):
            print(file + ' is corrupt...skipping!')
        return np.nan * np.zeros(first_shape)

    if use_dask is False:
        run_stats = _RunningStats()
        first_shape = 0
        for file in files:
            try:
                radar = pyart.io.read(file)
                reflect_array = radar.fields[refl_field]['data']
                ncp = deepcopy(radar.fields[ncp_field]['data'])
                #reflect_array = np.ma.masked_where(ncp < 0.7, reflect_array)
                if first_shape == 0:
                    first_shape = reflect_array.shape
                    clutter_radar = radar
                # Push each volume once, skipping mismatched shapes.
                if reflect_array.shape == first_shape:
                    run_stats.push(reflect_array)
                del radar
            except (TypeError, OSError):
                print(file + ' is corrupt...skipping!')
                continue
        mean = run_stats.mean()
        stdev = run_stats.standard_deviation()
        clutter_values = stdev / mean
        clutter_values = np.ma.masked_invalid(clutter_values)
        clutter_values_no_mask = clutter_values.filled(
            clutter_thresh_max + 1)
    else:
        cluster = LocalCluster(n_workers=20, processes=True)
        client = Client(cluster)
        first_shape = 0
        i = 0
        while first_shape == 0:
            try:
                radar = pyart.io.read(files[i])
                reflect_array = radar.fields[refl_field]['data']
                first_shape = reflect_array.shape
                clutter_radar = radar
            except (TypeError, OSError):
                print(files[i] + ' is corrupt...skipping!')
                i = i + 1
                continue
        arrays = [delayed(get_reflect_array)(file, first_shape)
                  for file in files]
        array = [da.from_delayed(a, shape=first_shape, dtype=float)
                 for a in arrays]
        array = da.stack(array, axis=0)
        print('## Calculating mean in parallel...')
        mean = np.array(da.nanmean(array, axis=0))
        print('## Calculating standard deviation...')
        count = np.array(da.sum(da.isfinite(array), axis=0))
        stdev = np.array(da.nanstd(array, axis=0))
        clutter_values = stdev / mean
        clutter_values = np.ma.masked_invalid(clutter_values)
        clutter_values = np.ma.masked_where(
            np.logical_or(clutter_values.mask, count < 20),
            clutter_values)
        # Fill the mask so the threshold comparison below sees finite values.
        clutter_values_no_mask = clutter_values.filled(
            clutter_thresh_max + 1)

    shape = clutter_values.shape
    mask = np.ma.getmask(clutter_values)
    is_clutters = np.argwhere(
        np.logical_and.reduce((
            clutter_values_no_mask > clutter_thresh_min,
            clutter_values_no_mask < clutter_thresh_max,
        )))
    clutter_array = _clutter_marker(is_clutters, shape, mask, radius)
    clutter_radar.fields.clear()
    clutter_array = clutter_array.filled(0)
    clutter_dict = _clutter_to_dict(clutter_array)
    clutter_value_dict = _clutter_to_dict(clutter_values)
    clutter_value_dict["long_name"] = "Clutter value (std. dev/mean Z)"
    clutter_value_dict["standard_name"] = "clutter_value"
    clutter_radar.add_field('ground_clutter', clutter_dict,
                            replace_existing=True)
    clutter_radar.add_field('clutter_value', clutter_value_dict,
                            replace_existing=True)
    if write_radar is True:
        pyart.io.write_cfradial(out_file, clutter_radar)
    del clutter_radar
    return
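# The core clutter statistic above is a per-gate coefficient of variation
# (std/mean) across scans. A minimal sketch with made-up volume dimensions:
import dask.array as da

volumes = da.random.random((30, 360, 500), chunks=(5, 360, 500))  # (scan, ray, gate)
clutter_values = (da.nanstd(volumes, axis=0) /
                  da.nanmean(volumes, axis=0)).compute()
print(clutter_values.shape)  # (360, 500)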
def read_geno(bfile, freq_thresh, threads, check=False, max_memory=None,
              usable_snps=None, normalize=False, prefix='my_geno',
              thinning=None):
    chunks = (10000, 10000)
    # Set a cache to protect against memory spilling.
    if max_memory is not None:
        available_memory = max_memory
    else:
        available_memory = psutil.virtual_memory().available
    cache = Chest(available_memory=available_memory)
    # Read the files using pandas_plink; g is variants x samples.
    (bim, fam, g) = read_plink(bfile)
    g_std = da.nanstd(g, axis=1)
    if check:
        with ProgressBar():
            print('Removing invariant sites')
            idx = (g_std != 0).compute(cache=cache)
        g = g[idx, :]
        bim = bim[idx].copy().reset_index(drop=True)
        bim.i = bim.index.tolist()
        g_std = g_std[idx]
        del idx
        gc.collect()
    if usable_snps is not None:
        print('Restricting genotype to user specified variants')
        idx = sorted(bim[bim.snp.isin(usable_snps)].i.values)
        g = g[idx, :]
        g_std = g_std[idx]
        bim = bim[bim.i.isin(idx)].copy().reset_index(drop=True)
        bim.i = bim.index.tolist()
    # Allele frequencies: genotype sums per variant over 2N alleles.
    mafs = g.sum(axis=1) / (2 * g.shape[1]) if freq_thresh > 0 else None
    # Filter by MAF.
    if freq_thresh > 0:
        print('Filtering MAFs smaller than', freq_thresh)
        print('    Genotype matrix shape before', g.shape)
        assert freq_thresh < 0.5
        good = (mafs < (1 - float(freq_thresh))) & (mafs > float(freq_thresh))
        with ProgressBar():
            with dask.config.set(pool=ThreadPool(threads)):
                good, mafs = dask.compute(good, mafs, cache=cache)
        g = g[good, :]
        g_std = g_std[good]
        print('    Genotype matrix shape after', g.shape)
        bim = bim[good]
        bim['mafs'] = mafs[good]
        bim.reset_index(drop=True, inplace=True)
        bim.i = bim.index.tolist()
        del good
        gc.collect()
    if not is_transposed(g, bim.shape[0], fam.shape[0]):
        g = g.T
    if normalize:
        print('Normalizing to mean 0 and sd 1')
        # g is samples x variants here, so this is the per-variant mean.
        mean = da.nanmean(g.T, axis=1)
        g = (g - mean) / g_std
    if thinning is not None:
        print("Thinning genotype to %d variants" % thinning)
        idx = np.linspace(0, g.shape[1], num=thinning, dtype=int,
                          endpoint=False)
        bim = bim.reindex(index=idx)
        g = g[:, idx].rechunk('auto')
        bim['i'] = range(thinning)
    h5 = '%s.hdf5' % prefix
    if not os.path.isfile(h5):
        with ProgressBar(), h5py.File(h5, 'w') as hd5:
            print("Sending processed genotype to HDF5")
            chroms = sorted(bim.chrom.unique().astype(int))
            gr = bim.groupby('chrom')
            for chrom in chroms:
                df = gr.get_group(str(chrom))
                ch = g[:, df.i.values]
                ch = ch.rechunk(estimate_chunks(ch.shape, threads,
                                                memory=available_memory))
                print('\tChromosome %s: %d individuals %d variants' % (
                    chrom, ch.shape[0], ch.shape[1]))
                hd5.create_dataset('/%s' % chrom, data=ch.compute())
                del ch
            del gr
    return g, h5, bim, fam
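# Sketch of the normalize branch of read_geno in isolation: column-standardize
# a (samples x variants) genotype matrix. Shapes and chunking are illustrative.
import dask.array as da

g = da.random.randint(0, 3, size=(500, 1_000), chunks=(250, 500)).astype(float)
g_norm = (g - da.nanmean(g, axis=0)) / da.nanstd(g, axis=0)
print(g_norm.mean(axis=0).compute()[:3])  # approximately 0 per column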