def array2netcdf(dar: xr.DataArray, path: Optional[str] = None, **kwargs) -> Optional[bytes]:
    """
    Write a data array to netcdf format.

    A special form of this function is provided to serialize any python
    objects in the attrs dict using cloudpickle.

    Parameters
    ----------
    dar
        Data Array containing continuous data
    path
        The path to save the file to disk; if None, a bytes representation is returned
    kwargs
        Keyword args passed to the to_netcdf function

    Notes
    -----
    See :meth:`xarray.DataArray.to_netcdf` for supported kwargs.
    """
    byte_str = cloudpickle.dumps(dar.attrs)
    old_attrs = dar.attrs
    dar.attrs = {"attrs": np.frombuffer(byte_str, dtype=np.int8)}
    out = dar.to_netcdf(path=path, **kwargs)
    dar.attrs = old_attrs
    return out
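# Illustrative round-trip sketch for array2netcdf (not part of the original
# module): the attrs dict is cloudpickled into an int8 array, so it can be
# recovered from the written file with cloudpickle.loads. The file name and
# attrs content below are made up.
import cloudpickle
import numpy as np
import xarray as xr

da = xr.DataArray(np.random.rand(4), dims="time", name="signal")
da.attrs["meta"] = {"processor": lambda v: v * 2}   # not netCDF-serializable on its own

array2netcdf(da, path="example.nc")

loaded = xr.open_dataarray("example.nc")
recovered_attrs = cloudpickle.loads(loaded.attrs["attrs"].tobytes())  # original attrs dict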
def set_attrs_for_station(field: xr.DataArray, field_record: typing.Dict) -> xr.DataArray:
    """
    Set attributes for station vertical-profile data.

    Parameters
    ----------
    field
        Station profile data array.
    field_record
        Record describing the field (``name``, ``long_name``, ``units``).

    Returns
    -------
    xr.DataArray
        The input field with its attrs replaced.
    """
    if "lon_0" in field.attrs:
        lon_0 = field.attrs["lon_0"]
    else:
        lon_0 = field.longitude.item()

    if "lat_0" in field.attrs:
        lat_0 = field.attrs["lat_0"]
    else:
        lat_0 = field.latitude.item()

    field.attrs = {
        "name": field_record["name"],
        "long_name": field_record["long_name"],
        "units": field_record["units"],
        "lat_0": lat_0,
        "lon_0": lon_0,
    }
    return field
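# Hypothetical usage sketch: the field_record keys mirror the ones accessed
# above; the station coordinates and values are invented for illustration.
import numpy as np
import xarray as xr

profile = xr.DataArray(
    np.random.rand(10),
    dims="level",
    coords={"longitude": 116.4, "latitude": 39.9},
)
field_record = {"name": "t", "long_name": "air temperature", "units": "K"}
profile = set_attrs_for_station(profile, field_record)
print(profile.attrs["lon_0"], profile.attrs["lat_0"])  # 116.4 39.9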
def get_dataset(self, key, info):
    """Load a dataset."""
    if self._channel != key.name:
        return

    logger.debug('Reading %s.', key.name)
    # FIXME: get this from MTD_MSIL1C.xml
    quantification_value = 10000.
    jp2 = glymur.Jp2k(self.filename)
    bitdepth = 0
    for seg in jp2.codestream.segment:
        try:
            bitdepth = max(bitdepth, seg.bitdepth[0])
        except AttributeError:
            pass

    jp2.dtype = (np.uint8 if bitdepth <= 8 else np.uint16)

    # Initialize the jp2 reader / doesn't work in a multi-threaded context.
    # jp2[0, 0]
    # data = da.from_array(jp2, chunks=CHUNK_SIZE) / quantification_value * 100

    data = da.from_delayed(delayed(jp2.read)(), jp2.shape, jp2.dtype)
    data = data.rechunk(CHUNK_SIZE) / quantification_value * 100

    proj = DataArray(data, dims=['y', 'x'])
    proj.attrs = info.copy()
    proj.attrs['units'] = '%'
    proj.attrs['platform_name'] = self.platform_name
    return proj
def from_tree(cls, tree, ctx):
    """
    Converts basic types representing YAML trees into an 'xarray.DataArray'.

    Parameters
    ----------
    tree :
        An instance of a basic Python type (possibly nested) that
        corresponds to a YAML subtree.
    ctx :
        An instance of the 'AsdfFile' object that is being constructed.

    Returns
    -------
    xarray.DataArray :
        An instance of the 'xarray.DataArray' type.
    """
    data = tree["data"].data
    dims = tree["data"].dimensions
    coords = {}
    for coordinate in tree["coordinates"]:
        coords[coordinate.name] = (coordinate.dimensions, coordinate.data)

    obj = DataArray(data=data, coords=coords, dims=dims)
    obj.attrs = tree["attributes"]
    return obj
def scale_and_clip_dataarray(dataarray: xr.DataArray, *, scale_factor=1, add_offset=0,
                             clip_range=None, valid_range=None, new_nodata=-999,
                             new_dtype='int16'):
    orig_attrs = dataarray.attrs
    nodata = dataarray.attrs['nodata']

    mask = dataarray.data == nodata
    # add another mask here for if data > 10000 then also make that nodata

    dataarray = dataarray * scale_factor + add_offset

    if clip_range is not None:
        dataarray = dataarray.clip(*clip_range)

    dataarray = dataarray.astype(new_dtype)
    dataarray.data[mask] = new_nodata

    if valid_range is not None:
        valid_min, valid_max = valid_range
        dataarray = dataarray.where(dataarray >= valid_min, new_nodata)
        dataarray = dataarray.where(dataarray <= valid_max, new_nodata)

    dataarray.attrs = orig_attrs
    dataarray.attrs['nodata'] = new_nodata

    return dataarray
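# Illustrative call (values invented): scale a float band with a -999 nodata
# sentinel into an integer array while preserving the original attrs.
import numpy as np
import xarray as xr

band = xr.DataArray(
    np.array([[0.1, 0.5], [-999.0, 1.2]]),
    dims=("y", "x"),
    attrs={"nodata": -999.0},
)
scaled = scale_and_clip_dataarray(
    band,
    scale_factor=10000,
    clip_range=(0, 10000),
    valid_range=(0, 10000),
    new_nodata=-999,
    new_dtype="int16",
)
print(scaled.values)           # nodata pixel stays -999, the others become 1000, 5000, 10000
print(scaled.attrs["nodata"])  # -999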
def read(self):
    """
    :return: DataArray objects populated with data read from eeg files.
        The size of the output is
        number of channels x number of start offsets x number of time series points.
        The corresponding DataArray axes are: 'channels', 'start_offsets', 'offsets'
    """
    eventdata, read_ok_mask = self.read_file(self.dataroot, self.channels,
                                             self.start_offsets, self.read_size)
    # multiply by the gain
    eventdata *= self.params_dict['gain']

    eventdata = DataArray(
        eventdata,
        dims=[self.channel_name, 'start_offsets', 'offsets'],
        coords={
            self.channel_name: self.channels,
            'start_offsets': self.start_offsets.copy(),
            'offsets': np.arange(self.read_size),
            'samplerate': self.params_dict['samplerate']
        })

    from copy import deepcopy
    eventdata.attrs = deepcopy(self.params_dict)

    return eventdata, read_ok_mask
def get_dataset(self, key, info):
    """Load a dataset."""
    if self._channel != key['name']:
        return

    logger.debug('Reading %s.', key['name'])
    # FIXME: get this from MTD_MSIL1C.xml
    quantification_value = 10000.
    jp2 = glymur.Jp2k(self.filename)
    bitdepth = 0
    for seg in jp2.codestream.segment:
        try:
            bitdepth = max(bitdepth, seg.bitdepth[0])
        except AttributeError:
            pass

    jp2.dtype = (np.uint8 if bitdepth <= 8 else np.uint16)

    # Initialize the jp2 reader / doesn't work in a multi-threaded context.
    # jp2[0, 0]
    # data = da.from_array(jp2, chunks=CHUNK_SIZE) / quantification_value * 100

    data = da.from_delayed(delayed(jp2.read)(), jp2.shape, jp2.dtype)
    data = data.rechunk(CHUNK_SIZE) / quantification_value * 100

    proj = DataArray(data, dims=['y', 'x'])
    proj.attrs = info.copy()
    proj.attrs['units'] = '%'
    proj.attrs['platform_name'] = self.platform_name
    return proj
def read(self):
    """
    :return: DataArray objects populated with data read from eeg files.
        The size of the output is
        number of channels x number of start offsets x number of time series points.
        The corresponding DataArray axes are: 'channels', 'start_offsets', 'offsets'
    """
    eventdata, read_ok_mask = self.read_file(self.dataroot, self.channels,
                                             self.start_offsets, self.read_size)

    # multiply by the gain
    eventdata *= self.params_dict['gain']

    eventdata = DataArray(eventdata,
                          dims=[self.channel_name, 'start_offsets', 'offsets'],
                          coords={
                              self.channel_name: self.channels,
                              'start_offsets': self.start_offsets.copy(),
                              'offsets': np.arange(self.read_size),
                              'samplerate': self.params_dict['samplerate']
                          })

    from copy import deepcopy
    eventdata.attrs = deepcopy(self.params_dict)

    return eventdata, read_ok_mask
def deg2mpm(da: xr.DataArray) -> xr.DataArray:
    """Convert ``xarray.Data[Array,set]`` from degree to meter/meter."""
    attrs = da.attrs
    da = np.tan(np.deg2rad(da))
    da.attrs = attrs
    da.name = "slope"
    da.attrs["units"] = "meters/meters"
    return da
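# Quick sanity check (illustrative values): a 45-degree slope equals 1 m/m.
import numpy as np
import xarray as xr

slope_deg = xr.DataArray([0.0, 30.0, 45.0], dims="x", attrs={"source": "DEM"})
slope_mpm = deg2mpm(slope_deg)
print(slope_mpm.values)                           # approximately [0. 0.577 1.]
print(slope_mpm.name, slope_mpm.attrs["units"])   # slope meters/meters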
def _to_file(self, data: xr.DataArray) -> None:
    """Save the model sequences to file.

    Create the desired parent directory and delete existing file with same
    name, if necessary, and then dump to file.
    """
    assert isinstance(data, xr.DataArray), "data must be data array"
    data.attrs = self._get_attributes()  # save all the model parameters
    if os.path.isfile(self._get_filename()):
        os.remove(self._get_filename())
    data.to_netcdf(self._get_filename(), format="netCDF4", engine="netcdf4")
def set_attrs_for_lat_section(field: xr.DataArray, field_record: typing.Dict) -> xr.DataArray:
    if "lon_0" in field.attrs:
        lon_0 = field.attrs["lon_0"]
    else:
        lon_0 = field.longitude.item()

    field.attrs = {
        "name": field_record["name"],
        "long_name": field_record["long_name"],
        "units": field_record["units"],
        "lon_0": lon_0,
    }
    return field
def get_dataset(self, key, info):
    """Get the dataset referred to by `key`."""
    angles = self._get_coarse_dataset(key, info)
    if angles is None:
        return

    # Fill gaps at edges of swath
    darr = DataArray(angles, dims=['y', 'x'])
    darr = darr.bfill('x')
    darr = darr.ffill('x')
    angles = darr.data

    res = self.interpolate_angles(angles, key['resolution'])

    proj = DataArray(res, dims=['y', 'x'])
    proj.attrs = info.copy()
    proj.attrs['units'] = 'degrees'
    proj.attrs['platform_name'] = self.platform_name
    return proj
def read(self):
    """Read EEG data.

    Returns
    -------
    event_data : DataArray
        Populated with data read from eeg files. The size of the output is
        number of channels * number of start offsets * number of time series
        points. The corresponding DataArray axes are: 'channels',
        'start_offsets', 'offsets'
    read_ok_mask : np.ndarray
        Mask of chunks that were properly read.

    Notes
    -----
    This method should *not* be overridden by subclasses. Instead, override
    the :meth:`read_file` method to implement new file types (see for
    example the HDF5 reader).

    """
    eventdata, read_ok_mask = self.read_file(self.dataroot, self.channel_labels,
                                             self.start_offsets, self.read_size)

    # multiply by the gain
    eventdata *= self.params_dict['gain']

    eventdata = DataArray(eventdata,
                          dims=[self.channel_name, 'start_offsets', 'offsets'],
                          coords={
                              self.channel_name: self.channels,
                              'start_offsets': self.start_offsets.copy(),
                              'offsets': np.arange(self.read_size),
                              'samplerate': self.params_dict['samplerate']
                          })

    from copy import deepcopy
    eventdata.attrs = deepcopy(self.params_dict)

    return eventdata, read_ok_mask
def get_dataset(self, key, info):
    """Get the dataset referred to by `key`."""
    angles = self._get_coarse_dataset(key, info)
    if angles is None:
        return

    # Fill gaps at edges of swath
    darr = DataArray(angles, dims=['y', 'x'])
    darr = darr.bfill('x')
    darr = darr.ffill('x')
    angles = darr.data

    res = self.interpolate_angles(angles, key.resolution)

    proj = DataArray(res, dims=['y', 'x'])
    proj.attrs = info.copy()
    proj.attrs['units'] = 'degrees'
    proj.attrs['platform_name'] = self.platform_name
    return proj
def agg_time(array: xr.DataArray, ndayagg: int = 1, method: str = 'mean',
             firstday: pd.Timestamp = None, rolling: bool = False) -> xr.DataArray:
    """
    Aggregates a daily time dimension, which should be continuous,
    otherwise non-neighbouring values are taken together.
    Returns a left-stamped aggregation of ndayagg days.
    For non-rolling aggregation it is possible to supply a firstday,
    to sync the blocks with another timeseries. Trailing NaNs are removed.
    """
    assert (np.diff(array.time) == np.timedelta64(1, 'D')).all(), \
        "time axis should be continuous and daily to be aggregated, though nan is allowed"

    if firstday is not None:
        array = array.sel(time=slice(firstday, None))

    if rolling:
        name = array.name
        attrs = array.attrs
        f = getattr(array.rolling({'time': ndayagg}, center=False), method)  # stamped right
        array = f()
        # Left stamping, trailing nans removed
        array = array.assign_coords(
            time=array.time - pd.Timedelta(str(ndayagg - 1) + 'D')).isel(
            time=slice(ndayagg - 1, None))
        array.name = name
        array.attrs = attrs
    else:
        input_length = len(array.time)
        f = getattr(array.resample(time=str(ndayagg) + 'D',
                                   closed='left', label='left'), method)
        array = f(dim='time', keep_attrs=True, skipna=False)
        if (input_length % ndayagg) != 0:
            # Remove the last aggregation, if it has not been based on the full ndayagg
            array = array.isel(time=slice(0, -1, None))

    return array
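# Illustrative use (synthetic data): left-stamped 5-day block means and a
# 5-day rolling mean over a continuous daily series.
import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range("2000-01-01", periods=20, freq="D")
daily = xr.DataArray(np.arange(20.0), dims="time", coords={"time": time}, name="t2m")

blocks = agg_time(daily, ndayagg=5, method="mean")                 # 4 left-stamped blocks
rolled = agg_time(daily, ndayagg=5, method="mean", rolling=True)   # rolling variant
print(blocks.time.values[0], float(blocks[0]))  # 2000-01-01, mean of first five values = 2.0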
def test_weighted_operations_keep_attr(operation, as_dataset, keep_attrs):

    weights = DataArray(np.random.randn(2, 2), attrs=dict(attr="weights"))
    data = DataArray(np.random.randn(2, 2))

    if as_dataset:
        data = data.to_dataset(name="data")

    data.attrs = dict(attr="weights")

    result = getattr(data.weighted(weights), operation)(keep_attrs=True)

    if operation == "sum_of_weights":
        assert weights.attrs == result.attrs
    else:
        assert data.attrs == result.attrs

    result = getattr(data.weighted(weights), operation)(keep_attrs=None)
    assert not result.attrs

    result = getattr(data.weighted(weights), operation)(keep_attrs=False)
    assert not result.attrs
def test_weighted_operations_keep_attr(operation, as_dataset, keep_attrs):

    weights = DataArray(np.random.randn(2, 2), attrs=dict(attr="weights"))
    data = DataArray(np.random.randn(2, 2))

    if as_dataset:
        data = data.to_dataset(name="data")

    data.attrs = dict(attr="weights")

    kwargs = {"keep_attrs": keep_attrs}
    if operation == "quantile":
        kwargs["q"] = 0.5

    result = getattr(data.weighted(weights), operation)(**kwargs)

    if operation == "sum_of_weights":
        assert result.attrs == (weights.attrs if keep_attrs else {})
        assert result.attrs == (weights.attrs if keep_attrs else {})
    else:
        assert result.attrs == (weights.attrs if keep_attrs else {})
        assert result.attrs == (data.attrs if keep_attrs else {})
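# What the tests above exercise, in plain form (illustrative): attribute
# propagation through DataArray.weighted reductions.
import numpy as np
import xarray as xr

data = xr.DataArray(np.random.randn(2, 2), attrs={"attr": "data"})
weights = xr.DataArray(np.array([[1.0, 2.0], [3.0, 4.0]]))

print(data.weighted(weights).mean(keep_attrs=True).attrs)    # {'attr': 'data'}
print(data.weighted(weights).mean(keep_attrs=False).attrs)   # {}
print(data.weighted(weights).sum_of_weights().attrs)         # {} (keep_attrs defaults to None)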
def _get_data(data: xr.DataArray, dataset_id: dict) -> xr.DataArray:
    """Get a dataset."""
    if dataset_id.get('resolution'):
        data.attrs['resolution'] = dataset_id['resolution']

    attrs = data.attrs.copy()

    fill = attrs.get('_FillValue')
    factor = attrs.pop('scale_factor', (np.ones(1, dtype=data.dtype))[0])
    offset = attrs.pop('add_offset', (np.zeros(1, dtype=data.dtype))[0])
    valid_range = attrs.get('valid_range', [None])
    if isinstance(valid_range, np.ndarray):
        attrs["valid_range"] = valid_range.tolist()

    flags = not data.attrs.get("SCALED", 1) and any(data.attrs.get("flag_values", [None]))
    if not flags:
        data = data.where(data != fill)
        data = _CLAVRxHelper._scale_data(data, factor, offset)
        # don't need _FillValue if it has been applied.
        attrs.pop('_FillValue', None)

    if all(valid_range):
        valid_min = _CLAVRxHelper._scale_data(valid_range[0], factor, offset)
        valid_max = _CLAVRxHelper._scale_data(valid_range[1], factor, offset)
        if flags:
            data = data.where((data >= valid_min) & (data <= valid_max), fill)
        else:
            data = data.where((data >= valid_min) & (data <= valid_max))
        attrs['valid_range'] = [valid_min, valid_max]

    data.attrs = _remove_attributes(attrs)

    return data
def grid_data(
    x,
    y,
    var,
    bins=None,
    how="mean",
    interp_lim=6,
    verbose=True,
    return_xarray=True,
):
    """
    Grids the input variable to bins for depth/dens (y) and time/dive (x).

    The bins can be specified to be non-uniform to adapt to variable sampling
    intervals of the profile. It is useful to use the ``gt.plot.bin_size``
    function to identify the sampling intervals. The bins are averaged (mean)
    by default but can also be the ``median, std, count``.

    Parameters
    ----------
    x : np.array, dtype=float, shape=[n, ]
        The horizontal values by which to bin; needs to be in a pseudo-discrete
        format already. Dive number or ``time_average_per_dive`` are the
        standard inputs for this variable. Has ``p`` unique values.
    y : np.array, dtype=float, shape=[n, ]
        The vertical values that will be binned; typically depth, but can
        also be density or any other variable.
    bins : np.array, dtype=float; shape=[q, ], default=[0 : 1 : max_depth]
        Define the bin edges for y with this function. If not defined,
        defaults to one meter bins.
    how : str, default='mean'
        The string form of a function that can be applied to pandas.Groupby
        objects. These include ``mean, median, std, count``.
    interp_lim : int, default=6
        Sets the maximum extent to which NaNs will be filled.

    Returns
    -------
    glider_section : xarray.DataArray, shape=[p, q]
        A 2D section in the format specified by the ``return_xarray`` input.

    Raises
    ------
    UserWarning
        Triggers when ``x`` does not have discrete values.
    """
    from numpy import array, c_, diff, unique
    from pandas import Series, cut
    from xarray import DataArray

    xvar, yvar = x.copy(), y.copy()
    z = Series(var)
    y = array(y)
    x = array(x)

    u = unique(x).size
    s = x.size
    if (u / s) > 0.2:
        raise UserWarning(
            "The x input array must be pseudo-discrete (dives or dive_time). "
            "{:.0f}% of x is unique (max 20% unique)".format(u / s * 100))

    chunk_depth = 50
    # -DB this might not work if the user uses anything other than depth, e.g.
    # density. chunk_depth would in that case apply to density, which will
    # probably have a range that is much smaller than 50.
    optimal_bins, avg_sample_freq = get_optimal_bins(y, chunk_depth)
    if bins is None:
        bins = optimal_bins

    # warning if bin average is smaller than average bin size
    # -DB this is not raised as a warning; it is just useful information
    # conveyed to the user. Further, none of this works out if y is not depth,
    # since avg_sample_freq will not make sense otherwise.
    if verbose:
        avg_bin_size = diff(bins).mean()
        print(("Mean bin size = {:.2f}\n"
               "Mean depth binned ({} m) vertical sampling frequency = {:.2f}"
               ).format(avg_bin_size, chunk_depth, avg_sample_freq))

    labels = c_[bins[:-1], bins[1:]].mean(axis=1)  # -DB creates the mean bin values
    bins = cut(y, bins, labels=labels)
    # -DB creates a new variable where instead of the variable the bin category
    # is mentioned (sort of like a discretization)

    grp = Series(z).groupby([x, bins])  # -DB put z into the many bins (like 2D hist)
    grp_agg = getattr(grp, how)()  # -DB basically does grp.how() or in this case grp.mean()
    gridded = grp_agg.unstack(level=0)
    gridded = gridded.reindex(labels.astype(float))

    if interp_lim > 0:
        gridded = gridded.interpolate(limit=interp_lim).bfill(limit=interp_lim)

    if not return_xarray:
        return gridded

    if return_xarray:
        dummy = transfer_nc_attrs(getframe(), var, var, "_vert_binned")
        xda = DataArray(gridded)
        if isinstance(var, DataArray):
            xda.attrs = dummy.attrs
            xda.name = dummy.name
        if isinstance(yvar, DataArray):
            y = xda.dims[0]
            xda[y].attrs = yvar.attrs
            xda = xda.rename({y: yvar.name})
        if isinstance(xvar, DataArray):
            x = xda.dims[1]
            xda[x].attrs = xvar.attrs
            xda = xda.rename({x: xvar.name})

        return xda
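# Illustrative call with synthetic glider-like arrays (assumes the module's
# own helpers used inside grid_data, e.g. get_optimal_bins and
# transfer_nc_attrs, are importable alongside it).
import numpy as np

dives = np.repeat(np.arange(1, 11), 200)            # 10 dives, 200 samples each
depth = np.tile(np.linspace(0, 100, 200), 10)       # 0-100 m per dive
temp = 20 - 0.1 * depth + np.random.randn(depth.size) * 0.05

section = grid_data(dives, depth, temp,
                    bins=np.arange(0, 101, 10),     # 10 m depth bins
                    return_xarray=False)
print(section.shape)                                # (depth bins, dives)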
def read(self):
    """
    :return: DataArray objects populated with data read from eeg files.
        The size of the output is
        number of channels x number of start offsets x number of time series points.
        The corresponding DataArray axes are: 'channels', 'start_offsets', 'offsets'
    """
    if self.read_size < 0:
        self.read_size = int(self.get_file_size() / self.file_format.data_size)

    # allocate space for data
    eventdata = np.empty((len(self.channels), len(self.start_offsets), self.read_size),
                         dtype=float) * np.nan
    read_ok_mask = np.ones(shape=(len(self.channels), len(self.start_offsets)), dtype=bool)

    # loop over channels
    for c, channel in enumerate(self.channels):
        try:
            eegfname = self.dataroot + '.' + channel
        except TypeError:
            eegfname = self.dataroot + '.' + channel.decode()

        with open(eegfname, 'rb') as efile:
            # loop over start offsets
            for e, start_offset in enumerate(self.start_offsets):
                # rejecting negative offset
                if start_offset < 0:
                    read_ok_mask[c, e] = False
                    print(('Cannot read from negative offset %d in file %s' %
                           (start_offset, eegfname)))
                    continue

                # seek to the position in the file
                efile.seek(self.file_format.data_size * start_offset, 0)

                # read the data
                data = efile.read(int(self.file_format.data_size * self.read_size))

                # convert from string to array based on the format
                # hard-codes little endian
                fmt = ('<' + str(int(len(data) / self.file_format.data_size)) +
                       self.file_format.format_string)
                data = np.array(struct.unpack(fmt, data))

                # make sure we got some data
                if len(data) < self.read_size:
                    read_ok_mask[c, e] = False
                    print(('Cannot read full chunk of data for offset ' + str(start_offset) +
                           '. End of read interval is outside the bounds of file ' + str(eegfname)))
                else:
                    # append it to the eventdata
                    eventdata[c, e, :] = data

    # multiply by the gain
    eventdata *= self.params_dict['gain']

    eventdata = DataArray(eventdata,
                          dims=['channels', 'start_offsets', 'offsets'],
                          coords={
                              'channels': self.channels,
                              'start_offsets': self.start_offsets.copy(),
                              'offsets': np.arange(self.read_size),
                              'samplerate': self.params_dict['samplerate']
                          })

    from copy import deepcopy
    eventdata.attrs = deepcopy(self.params_dict)

    return eventdata, read_ok_mask