def impute_mean(ds: Dataset, dim: str, variable='data'):
    """Mean-impute a variable in a dataset.

    Parameters
    ----------
    ds : Dataset
        Dataset with some variable to impute. When missing values are
        present, this **must** contain an `is_masked` variable
        (otherwise nothing will happen).
    dim : str
        Dimension over which means should be computed. For example, in
        a (variants, samples) data array, `dim='variants'` means that
        means are computed for each sample and missing values are
        replaced with that associated per-sample mean.
    variable : str
        Variable in `ds` to impute.

    Returns
    -------
    Dataset
        Dataset with `variable` imputed and `is_masked` dropped. Note
        that this often leads to a type change (e.g. int8 -> float64).
    """
    if 'is_masked' not in ds:
        return ds
    return (
        ds.assign(**{
            variable: lambda ds: xr.where(
                ds.is_masked, ds[variable].mean(dim=dim), ds[variable])
        })
        .drop('is_masked')
    )
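# A minimal usage sketch for impute_mean, on hypothetical toy data (not from
# the original source); assumes `Dataset` is xr.Dataset and missing entries
# are flagged by a boolean `is_masked` variable:
import numpy as np
import xarray as xr

toy = xr.Dataset({
    'data': (('variants', 'samples'), np.array([[0., 1.], [2., np.nan]])),
    'is_masked': (('variants', 'samples'),
                  np.array([[False, False], [False, True]])),
})
imputed = impute_mean(toy, dim='variants')
# The masked entry is replaced by the per-sample mean over 'variants':
assert float(imputed.data[1, 1]) == 1.0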
def _normalize_lon_360(ds: xr.Dataset) -> xr.Dataset:
    """
    Fix the longitude of the given dataset ``ds`` so that it ranges
    from -180 to +180 degrees.

    :param ds: The dataset whose longitudes may be given in the range
        0 to 360.
    :return: The fixed dataset or the original dataset.
    """
    if 'lon' not in ds.coords:
        return ds

    lon_var = ds.coords['lon']
    if len(lon_var.shape) != 1:
        return ds

    lon_size = lon_var.shape[0]
    if lon_size < 2:
        return ds

    lon_size_05 = lon_size // 2
    lon_values = lon_var.values
    if not np.any(lon_values[lon_size_05:] > 180.):
        return ds

    delta_lon = lon_values[1] - lon_values[0]
    var_names = [var_name for var_name in ds.data_vars]

    ds = ds.assign_coords(
        lon=xr.DataArray(np.linspace(-180. + 0.5 * delta_lon,
                                     +180. - 0.5 * delta_lon,
                                     lon_size),
                         dims=ds['lon'].dims,
                         attrs=dict(long_name='longitude',
                                    standard_name='longitude',
                                    units='degrees east')))
    ds = adjust_spatial_attrs_impl(ds, True)

    new_vars = dict()
    for var_name in var_names:
        var = ds[var_name]
        if len(var.dims) >= 1 and var.dims[-1] == 'lon':
            # Swap the two longitude halves so the values follow the
            # new -180..+180 coordinate ordering.
            values = np.copy(var.values)
            temp = np.copy(values[..., :lon_size_05])
            values[..., :lon_size_05] = values[..., lon_size_05:]
            values[..., lon_size_05:] = temp
            new_var = xr.DataArray(values, dims=var.dims, attrs=var.attrs)
            new_var.encoding = var.encoding
            new_vars[var_name] = new_var

    return ds.assign(**new_vars)
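# Worked numbers for the new longitude axis built above: with lon_size = 4
# and delta_lon = 90, the cell-center axis spans -180..+180 as
# [-135., -45., 45., 135.]:
import numpy as np

delta_lon, lon_size = 90., 4
lon = np.linspace(-180. + 0.5 * delta_lon, 180. - 0.5 * delta_lon, lon_size)
# lon == array([-135., -45., 45., 135.])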
def pack_variables(ds: Dataset) -> Dataset:
    # Remove dosage as it is unnecessary and should be redefined
    # based on encoded probabilities later (w/ reduced precision)
    ds = ds.drop_vars(["call_dosage", "call_dosage_mask"], errors="ignore")

    # Remove homozygous reference GP and redefine mask
    gp = ds["call_genotype_probability"][..., 1:]
    gp_mask = ds["call_genotype_probability_mask"].any(dim="genotypes")
    ds = ds.drop_vars(["call_genotype_probability",
                       "call_genotype_probability_mask"])
    ds = ds.assign(call_genotype_probability=gp,
                   call_genotype_probability_mask=gp_mask)
    return ds
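# Why dropping the first GP channel above is lossless (standalone check,
# assuming the probabilities along the "genotypes" dimension sum to 1, as
# for VCF/BGEN GP fields): P(hom-ref) is recoverable from the other two.
import numpy as np

gp = np.array([0.1, 0.7, 0.2])  # P(hom-ref), P(het), P(hom-alt)
assert np.isclose(gp[0], 1.0 - gp[1:].sum())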
def to(ds: Dataset) -> Dataset:
    if not is_shape_match(ds, {DIM_PLOIDY: 2, DIM_ALLELE: 2}):
        raise ValueError(
            'Dosage calculation currently only supported for bi-allelic, '
            'diploid arrays (ploidy and allele dims must have size 2)')
    # Get array slices for ref and alt probabilities on each chromosome
    c0ref, c1ref = ds.data[..., 0, 0], ds.data[..., 1, 0]
    c0alt, c1alt = ds.data[..., 0, 1], ds.data[..., 1, 1]
    # Compute dosage as float in [0, 2]
    data = c0ref * c1alt + c0alt * c1ref + 2 * c0alt * c1alt
    data = _mask(ds.assign(data=data))
    return _transmute(GenotypeDosageDataset.create, ds, data)
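# A quick arithmetic check of the dosage formula above, with made-up
# per-chromosome alt-allele probabilities p0 and p1 (independent of the
# Dataset wrapper): the expected alt dosage reduces to p0 + p1.
p0, p1 = 0.25, 0.5
dosage = (1 - p0) * p1 + p0 * (1 - p1) + 2 * p0 * p1
assert abs(dosage - (p0 + p1)) < 1e-12  # 0.75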
def set_crs(dset: xr.Dataset, crs, coords=None, data_vars=None):
    grid_mapping, _ = _load_crs(dset, crs)
    dset = dset.assign({grid_mapping.name: grid_mapping})
    if coords is not None:
        dset = _add_geoattrs_to_coords(dset, grid_mapping, coords)
    if data_vars is not None:
        dset = dset.copy()
        for v in data_vars:
            dset.data_vars[v].attrs['grid_mapping'] = grid_mapping.name
    return dset
def _normalize_lon_360(ds: xr.Dataset) -> xr.Dataset:
    """
    Fix the longitude of the given dataset ``ds`` so that it ranges
    from -180 to +180 degrees.

    :param ds: The dataset whose longitudes may be given in the range
        0 to 360.
    :return: The fixed dataset or the original dataset.
    """
    if 'lon' not in ds.coords:
        return ds

    lon_var = ds.coords['lon']
    if len(lon_var.shape) != 1:
        return ds

    lon_size = lon_var.shape[0]
    if lon_size < 2:
        return ds

    lon_size_05 = lon_size // 2
    lon_values = lon_var.values
    if not np.any(lon_values[lon_size_05:] > 180.):
        return ds

    delta_lon = lon_values[1] - lon_values[0]
    var_names = [var_name for var_name in ds.data_vars]

    ds = ds.assign_coords(
        lon=xr.DataArray(np.linspace(-180. + 0.5 * delta_lon,
                                     +180. - 0.5 * delta_lon,
                                     lon_size),
                         dims=ds['lon'].dims,
                         attrs=dict(long_name='longitude',
                                    standard_name='longitude',
                                    units='degrees east')))
    ds = adjust_spatial_attrs(ds, True)

    new_vars = dict()
    for var_name in var_names:
        var = ds[var_name]
        if 'lon' in var.dims:
            # Roll the data along 'lon' so the values follow the new
            # -180..+180 coordinate ordering.
            new_var = var.roll(lon=lon_size_05, roll_coords=False)
            new_var.encoding.update(var.encoding)
            new_vars[var_name] = new_var

    return ds.assign(**new_vars)
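# A self-contained sketch of the roll used above: the second half of the data
# along 'lon' moves to the front, matching the new west-to-east axis (toy
# values, no adjust_spatial_attrs involved):
import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(4.), dims='lon',
                  coords={'lon': [0., 90., 180., 270.]})
rolled = da.roll(lon=2, roll_coords=False)
# rolled.values == [2., 3., 0., 1.]; the function then assigns the new
# -180..+180 cell-center axis (here [-135., -45., 45., 135.]) as 'lon'.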
def encode_cube(cube: xr.Dataset,
                grid_mapping: Optional[GridMapping] = None,
                non_cube_subset: Optional[xr.Dataset] = None) \
        -> xr.Dataset:
    """
    Encode a *cube* with its *grid_mapping*, and additional variables
    in *non_cube_subset*, into a new dataset.

    This is the inverse of the operation :func:`decode_cube`::

        cube, gm, non_cube = decode_cube(dataset)
        dataset = encode_cube(cube, gm, non_cube)

    The returned dataset comprises all variables in *cube*, whose
    dimensions should be ("time", [...], y_dim_name, x_dim_name),
    where y_dim_name and x_dim_name are defined by *grid_mapping*,
    if given.

    If *grid_mapping* is not geographic, a new variable "crs" will be
    added that holds CF-compliant attributes which encode the cube's
    spatial CRS. *non_cube_subset*, if given, may be used to add
    non-cube variables to the resulting dataset.

    :param cube: Data cube dataset, whose dimensions should be
        ("time", [...], y_dim_name, x_dim_name).
    :param grid_mapping: Optional grid mapping for *cube*.
    :param non_cube_subset: An optional dataset providing non-cube
        data variables.
    :return: The encoded dataset.
    """
    if non_cube_subset is not None:
        dataset = cube.assign(**non_cube_subset.data_vars)
    else:
        dataset = cube

    if grid_mapping is None:
        return dataset

    if grid_mapping.crs.is_geographic \
            and grid_mapping.is_regular \
            and grid_mapping.xy_dim_names == ('lon', 'lat') \
            and grid_mapping.xy_var_names == ('lon', 'lat'):
        # No need to add a CRS variable
        return dataset

    return dataset.assign(crs=xr.DataArray(0, attrs=grid_mapping.crs.to_cf()))
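# A short sketch of the CF-style scalar "crs" variable created above, using
# pyproj directly; pyproj.CRS.to_cf() is assumed to be the same call the
# grid mapping's crs object provides:
import pyproj
import xarray as xr

crs = pyproj.CRS.from_epsg(32633)  # UTM zone 33N, a non-geographic CRS
crs_var = xr.DataArray(0, attrs=crs.to_cf())
# crs_var.attrs now holds CF grid-mapping attributes such as
# 'grid_mapping_name' and 'crs_wkt'.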
def regional_with_and_without_flow(region):
    in_region_buses = n.buses.query('country == @region').index
    region_branches = branches.query('bus0 in @in_region_buses '
                                     'or bus1 in @in_region_buses')
    buses_i = (pd.Index(region_branches.bus0.unique()) |
               pd.Index(region_branches.bus1.unique()) |
               in_region_buses)
    vicinity_buses = buses_i.difference(in_region_buses)
    branches_i = region_branches.index

    K = Incidence(n, branch_components).loc[buses_i]

    # Create regional injection pattern with nodal injection at the
    # border accounting for the cross-border flow
    p = (K @ f)
    # p.loc[in_region_buses] ==
    # network_injection(n, snapshots).loc[snapshots, in_region_buses].T

    # Modified injection pattern without transit
    im = upper(p.loc[vicinity_buses])
    ex = lower(p.loc[vicinity_buses])

    largerImport_b = im.sum('bus') > -ex.sum('bus')
    scaleImport = (im.sum('bus') + ex.sum('bus')) / im.sum('bus')
    scaleExport = (im.sum('bus') + ex.sum('bus')) / ex.sum('bus')
    netImOrEx = (im * scaleImport).where(largerImport_b, (ex * scaleExport))
    p_wo = xr.concat([p.loc[in_region_buses], netImOrEx], dim='bus')\
             .reindex(bus=buses_i).fillna(0)

    if 'Link' in branch_components:
        H = xr.concat((PTDF(n, branch_components, snapshot=sn)
                       for sn in snapshots), dim='snapshot')\
              .sel(branch=branches_i)
        # f == H @ p
    else:
        H = PTDF(n, branch_components).sel(branch=branches_i)

    f_wo = H.reindex(bus=buses_i).dot(p_wo, 'bus')

    res = Dataset({'flow_with_transit': f.sel(branch=branches_i),
                   'flow_without_transit': f_wo})\
        .assign_coords(country=region)
    return res.assign(transit_flow=res.flow_with_transit -
                                   res.flow_without_transit)
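# A numeric sketch of the import/export netting above (toy numbers): vicinity
# injections split into imports (im > 0) and exports (ex < 0); the smaller
# side is folded into the larger so only the net cross-border exchange remains.
import numpy as np

im = np.array([3.0, 1.0])          # imports at two vicinity buses, sum = 4.0
ex = np.array([-2.0, -0.5])        # exports, sum = -2.5
net = im.sum() + ex.sum()          # 1.5 net import, so imports are kept
scale_import = net / im.sum()      # 0.375
net_im_or_ex = im * scale_import   # [1.125, 0.375], sums to the 1.5 net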
def apply_data_var_remap(xds: xr.Dataset, var_name: str, map_func) -> xr.Dataset:
    """Apply map_func to the values in the given data variable."""
    import numpy as np

    def mb(array):
        vals = array.values
        newvals = np.ndarray(vals.shape, vals.dtype)
        if len(vals) > 0:
            newvals = np.vectorize(map_func, [vals.dtype])(vals)
        return xr.DataArray(data=newvals, coords=array.coords,
                            dims=array.dims, name=array.name,
                            attrs=array.attrs)

    assert isinstance(xds[var_name], xr.DataArray), \
        f"######### ERROR: trying to remap the data variable {var_name} " \
        f"which is a {type(xds[var_name])} but a {xr.DataArray} was expected!"
    var_val = xds[var_name].map_blocks(mb)
    return xds.assign({var_name: var_val})
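# A hedged usage sketch for apply_data_var_remap on a small chunked variable
# (map_blocks operates per block; the variable name 'flags' is hypothetical):
import numpy as np
import xarray as xr

xds = xr.Dataset({'flags': ('x', np.array([0, 1, 2]))}).chunk({'x': 2})
remapped = apply_data_var_remap(xds, 'flags', lambda v: v * 10)
# remapped.flags.values == [0, 10, 20]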
def insert_column_integrated_vars(
    ds: xr.Dataset, column_integrated_vars: Sequence[str]
) -> xr.Dataset:
    """Insert column-integrated (<*>) terms; a thin wrapper around the
    vcm.calc.thermo functions."""
    for var in column_integrated_vars:
        column_integrated_name = f"column_integrated_{var}"
        if "Q1" in var:
            da = vcm.column_integrated_heating_from_isochoric_transition(
                ds[var], ds[DELP]
            )
        elif "Q2" in var:
            da = -vcm.minus_column_integrated_moistening(ds[var], ds[DELP])
            da = da.assign_attrs(
                {"long_name": "column integrated moistening", "units": "mm/day"}
            )
        else:
            da = vcm.mass_integrate(ds[var], ds[DELP], dim="z")
        ds = ds.assign({column_integrated_name: da})
    return ds
def add_covariates(ds: Dataset, npc: int = 20) -> Dataset:
    # See https://github.com/Nealelab/UK_Biobank_GWAS/blob/67289386a851a213f7bb470a3f0f6af95933b041/0.1/22.run_regressions.py#L71
    ds = (
        ds
        .assign(sample_age_at_recruitment_2=lambda ds:
                ds["sample_age_at_recruitment"] ** 2)
        .assign(sample_sex_x_age=lambda ds:
                ds["sample_genetic_sex"] * ds["sample_age_at_recruitment"])
        .assign(sample_sex_x_age_2=lambda ds:
                ds["sample_genetic_sex"] * ds["sample_age_at_recruitment_2"])
    )
    covariates = np.column_stack([
        ds["sample_age_at_recruitment"].values,
        ds["sample_age_at_recruitment_2"].values,
        ds["sample_genetic_sex"].values,
        ds["sample_sex_x_age"].values,
        ds["sample_sex_x_age_2"].values,
        ds["sample_principal_component"].values[:, :npc],
    ])
    assert np.all(np.isfinite(covariates))
    ds["sample_covariate"] = xr.DataArray(covariates,
                                          dims=("samples", "covariates"))
    ds["sample_covariate"] = ds.sample_covariate.pipe(
        lambda x: (x - x.mean(dim="samples")) / x.std(dim="samples"))
    assert np.all(np.isfinite(ds.sample_covariate))
    return ds
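# A standalone sketch of the standardization step above (toy values): each
# covariate column is centered and scaled to unit variance over 'samples'.
import numpy as np
import xarray as xr

x = xr.DataArray(np.array([[1., 10.], [3., 30.]]),
                 dims=("samples", "covariates"))
z = (x - x.mean(dim="samples")) / x.std(dim="samples")
assert np.allclose(z.mean(dim="samples"), 0.0)  # columns centered
assert np.allclose(z.std(dim="samples"), 1.0)   # unit (population) std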
def process_SToF(dataset: xr.Dataset):
    """
    This isn't the best unit conversion function because it doesn't
    properly take into account the Jacobian of the coordinate
    conversion. This can be fixed by multiplying each channel by the
    appropriate amount, but it might still be best to use the
    alternative method.

    :param dataset:
    :return:
    """
    e_min = dataset.attrs.get('E_min', 1)
    e_max = dataset.attrs.get('E_max', 10)
    de = dataset.attrs.get('dE', 0.01)
    # np.linspace requires an integer sample count
    ke_axis = np.linspace(e_min, e_max, int(round((e_max - e_min) / de)))

    dataset = transform_dataarray_axis(
        build_KE_coords_to_time_coords(dataset, ke_axis),
        'time', 'eV', ke_axis, dataset, lambda x: x,
    )

    dataset = dataset.rename({'t_up': 'up', 't_down': 'down'})

    if 'up' in dataset.data_vars:
        # Apply the Sherman function corrections
        sherman = dataset.attrs.get('sherman', 0.2)
        polarization = 1 / sherman * (dataset.up - dataset.down) / (
            dataset.up + dataset.down)
        new_up = (dataset.up + dataset.down) * (1 + polarization)
        new_down = (dataset.up + dataset.down) * (1 - polarization)
        dataset = dataset.assign(up=new_up, down=new_down)

    return dataset
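# A quick numeric check of the Sherman correction above (made-up counts,
# sherman = 0.2 as in the default): P = (up - down) / (S * (up + down)),
# and the corrected channels are (up + down) * (1 ± P).
up, down, S = 120.0, 80.0, 0.2
P = (up - down) / (S * (up + down))  # 40 / 40 = 1.0
new_up = (up + down) * (1 + P)       # 400.0
new_down = (up + down) * (1 - P)     # 0.0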
def assign_net_physics_terms(ds: xr.Dataset) -> xr.Dataset:
    net_terms: Mapping[Hashable, Any] = {
        "net_heating": net_heating_from_physics(ds),
        "net_precipitation": net_precipitation_from_physics(ds),
    }
    return ds.assign(net_terms)
def resample_in_space(dataset: xr.Dataset,
                      source_gm: GridMapping = None,
                      target_gm: GridMapping = None,
                      var_configs: Mapping[Hashable, Mapping[str, Any]] = None):
    """
    Resample a dataset in the spatial dimensions.

    If the source grid mapping *source_gm* is not given, it is derived
    from *dataset*: ``source_gm = GridMapping.from_dataset(dataset)``.

    If the target grid mapping *target_gm* is not given, it is derived
    from *source_gm*: ``target_gm = source_gm.to_regular()``.

    If *source_gm* is almost equal to *target_gm*, this function is a
    no-op and *dataset* is returned unchanged. Otherwise the function
    computes a spatially resampled version of *dataset* and returns it.

    Using *var_configs*, the resampling of individual variables can be
    configured. If given, *var_configs* must be a mapping from variable
    names to configuration dictionaries which can have the following
    properties:

    * ``spline_order`` (int) - The order of spline polynomials used
      for interpolating. It is used for upsampling only. Possible
      values are 0 to 5. Default is 1 (bi-linear) for floating point
      variables, and 0 (= nearest neighbor) for integer and bool
      variables.
    * ``aggregator`` (str) - An optional aggregating function. It is
      used for downsampling only. Examples are ``numpy.nanmean``,
      ``numpy.nanmin``, ``numpy.nanmax``. Default is ``numpy.nanmean``
      for floating point variables, and None (= nearest neighbor) for
      integer and bool variables.
    * ``recover_nan`` (bool) - Whether a special algorithm shall be
      used that is able to recover values that would otherwise yield
      NaN during resampling. Default is True for floating point
      variables, and False for integer and bool variables.

    Note that *var_configs* is only used if the resampling involves an
    affine transformation. This is true if the CRS of *source_gm* and
    the CRS of *target_gm* are equal and one of two cases is given:

    1. *source_gm* is regular. In this case the resampling is the
       affine transformation and the result is returned directly.
    2. *source_gm* is not regular and has a lower resolution than
       *target_gm*. In this case *dataset* is downsampled first using
       an affine transformation. Then the result is rectified.

    In all other cases, no affine transformation is applied and the
    resampling is a direct rectification.

    :param dataset: The source dataset.
    :param source_gm: The source grid mapping.
    :param target_gm: The target grid mapping. Must be regular.
    :param var_configs: Optional resampling configurations for
        individual variables.
    :return: The spatially resampled dataset.
    """
    if source_gm is None:
        # No source grid mapping given, so derive it from dataset
        source_gm = GridMapping.from_dataset(dataset)

    if target_gm is None:
        # No target grid mapping given, so derive it from source
        target_gm = source_gm.to_regular()

    if source_gm.is_close(target_gm):
        # Source and target grid mappings are almost equal
        return dataset

    # target_gm must be regular
    GridMapping.assert_regular(target_gm, name='target_gm')

    # Are source and target both geographic grid mappings?
    both_geographic = source_gm.crs.is_geographic \
                      and target_gm.crs.is_geographic

    if both_geographic or source_gm.crs == target_gm.crs:
        # CRSes are both geographic or their CRSes are equal:
        if source_gm.is_regular:
            # If also the source is regular, then resampling reduces
            # to an affine transformation.
            return affine_transform_dataset(
                dataset,
                source_gm=source_gm,
                target_gm=target_gm,
                var_configs=var_configs,
            )

        # If the source is not regular, we need to rectify it, so the
        # target is regular. Our rectification implementation only
        # works correctly if source pixel size >= target pixel size.
        # Therefore check if we must downscale the source first.
        x_scale = source_gm.x_res / target_gm.x_res
        y_scale = source_gm.y_res / target_gm.y_res
        if x_scale > _SCALE_LIMIT and y_scale > _SCALE_LIMIT:
            # Source pixel size >= target pixel size.
            # We can rectify directly.
            return rectify_dataset(dataset,
                                   source_gm=source_gm,
                                   target_gm=target_gm)

        # Source has higher resolution than target.
        # Downscale first, then rectify.
        if source_gm.is_regular:
            downscaled_gm = source_gm.scale((x_scale, y_scale))
            downscaled_dataset = resample_dataset(
                dataset,
                ((x_scale, 1, 0), (1, y_scale, 0)),
                size=downscaled_gm.size,
                tile_size=source_gm.tile_size,
                xy_dim_names=source_gm.xy_dim_names,
                var_configs=var_configs,
            )
        else:
            _, downscaled_size = scale_xy_res_and_size(source_gm.xy_res,
                                                       source_gm.size,
                                                       (x_scale, y_scale))
            downscaled_dataset = resample_dataset(
                dataset,
                ((x_scale, 1, 0), (1, y_scale, 0)),
                size=downscaled_size,
                tile_size=source_gm.tile_size,
                xy_dim_names=source_gm.xy_dim_names,
                var_configs=var_configs,
            )
            downscaled_gm = GridMapping.from_dataset(
                downscaled_dataset,
                tile_size=source_gm.tile_size,
                prefer_crs=source_gm.crs)
        return rectify_dataset(downscaled_dataset,
                               source_gm=downscaled_gm,
                               target_gm=target_gm)

    # CRSes are not both geographic and they are different, so
    # transform the source_gm so its CRS matches the target CRS:
    transformed_source_gm = source_gm.transform(target_gm.crs)
    transformed_x, transformed_y = transformed_source_gm.xy_coords
    reprojected_dataset = resample_in_space(
        dataset.assign(transformed_x=transformed_x,
                       transformed_y=transformed_y),
        source_gm=transformed_source_gm,
        target_gm=target_gm)
    if not target_gm.crs.is_geographic:
        # Add 'crs' variable according to CF conventions
        reprojected_dataset = reprojected_dataset.assign(
            crs=xr.DataArray(0, attrs=target_gm.crs.to_cf()))
    return reprojected_dataset
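# A hedged usage sketch for resample_in_space. The import paths below assume
# the xcube package layout this code resembles and are not taken from the
# snippet itself:
#
#     from xcube.core.gridmapping import GridMapping
#     from xcube.core.resampling import resample_in_space
#
# source_gm = GridMapping.from_dataset(dataset)  # derive source grid mapping
# target_gm = source_gm.to_regular()             # regular target grid
# resampled = resample_in_space(dataset,
#                               source_gm=source_gm,
#                               target_gm=target_gm)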
def process_dataset_function(dataset: xr.Dataset,
                             name: str = None,
                             value: int = None):
    dataset = dataset.copy()
    return dataset.assign(**{name: value})
def to(ds: Dataset) -> Dataset:
    """Convert to genotype counts"""
    data = _mask(ds.assign(data=(ds.data > 0).sum(dim=DIM_PLOIDY)))
    return _transmute(GenotypeCountDataset.create, ds, data)
def my_postprocessor(ds: xr.Dataset) -> xr.Dataset:
    return ds.assign(crs=xr.DataArray(42))
def to(ds: Dataset, contig: int) -> Dataset:
    """Convert to haplotype calls"""
    # FIXME: nonsense for testing
    data = _mask(ds.assign(data=ds.data[..., contig]))
    return _transmute(HaplotypeCallDataset.create, ds, data)
def standardize_coords(
    ds: xr.Dataset,
    time_shift=-timedelta(minutes=7, seconds=30)
) -> xr.Dataset:
    ds_shifted = ds.assign(time=ds.time + time_shift)
    return gfdl_to_standard(ds_shifted).drop("tile")