def test_decode_cf_time_bounds():

    da = DataArray(np.arange(6, dtype='int64').reshape((3, 2)),
                   coords={'time': [1, 2, 3]},
                   dims=('time', 'nbnd'), name='time_bnds')

    attrs = {'units': 'days since 2001-01',
             'calendar': 'standard',
             'bounds': 'time_bnds'}

    ds = da.to_dataset()
    ds['time'].attrs.update(attrs)
    _update_bounds_attributes(ds.variables)
    assert ds.variables['time_bnds'].attrs == {'units': 'days since 2001-01',
                                               'calendar': 'standard'}
    dsc = decode_cf(ds)
    assert dsc.time_bnds.dtype == np.dtype('M8[ns]')
    dsc = decode_cf(ds, decode_times=False)
    assert dsc.time_bnds.dtype == np.dtype('int64')

    # Do not overwrite existing attrs
    ds = da.to_dataset()
    ds['time'].attrs.update(attrs)
    bnd_attr = {'units': 'hours since 2001-01', 'calendar': 'noleap'}
    ds['time_bnds'].attrs.update(bnd_attr)
    _update_bounds_attributes(ds.variables)
    assert ds.variables['time_bnds'].attrs == bnd_attr

    # If bounds variable not available do not complain
    ds = da.to_dataset()
    ds['time'].attrs.update(attrs)
    ds['time'].attrs['bounds'] = 'fake_var'
    _update_bounds_attributes(ds.variables)
def test_sweep_data(self, get_loader): if isinstance(self, MeasuredDataVolume): pytest.skip("requires synthetic data") if get_loader == "netcdf4" and self.format == "GAMIC": pytest.skip("gamic needs hdf-based loader") with self.get_volume_data( get_loader, decode_coords=False, mask_and_scale=False, decode_times=False, chunks=None, parallel=False, ) as vol: for i, ts in enumerate(vol): if "02" in self.name: ds = create_dataset(i, nrays=361) else: ds = create_dataset(i) for j, swp in enumerate(ts): xr.testing.assert_equal(swp.data, ds) with self.get_volume_data( get_loader, decode_coords=True, mask_and_scale=False, decode_times=True, chunks=None, parallel=False, ) as vol: for i, ts in enumerate(vol): for j, swp in enumerate(ts): data = create_dataset(i) data = data.assign_coords(create_coords(i).coords) data = data.assign_coords( create_site(self.data["where"]["attrs"]).coords) data = data.assign_coords( {"sweep_mode": "azimuth_surveillance"}) data = xr.decode_cf(data, mask_and_scale=False) xr.testing.assert_equal(swp.data, data) with self.get_volume_data( get_loader, decode_coords=True, mask_and_scale=True, decode_times=True, chunks=None, parallel=False, ) as vol: for i, ts in enumerate(vol): for j, swp in enumerate(ts): data = create_dataset(i, type=self.format) data = data.assign_coords(create_coords(i).coords) data = data.assign_coords( create_site(self.data["where"]["attrs"]).coords) data = data.assign_coords( {"sweep_mode": "azimuth_surveillance"}) data = xr.decode_cf(data) xr.testing.assert_equal(swp.data, data) del swp del ts del vol gc.collect()
async def get_shaped_resultcube(self, shape_query: ShapeQuery) -> xr.DataArray:

    fs = set()
    ps = await asyncio.gather(*[
        self.dataset_files(when) for when in shape_query.temporal.dates()
    ])
    for f in ps:
        if f is not None and Path(f).is_file():
            fs.add(f)
    for f in fs:
        logger.debug("Confirmed: %s" % f)

    if len(fs) == 1:
        with xr.open_dataset(*fs) as ds:
            ds = xr.decode_cf(ds)
            ds.attrs['var_name'] = "fmc_mean"
            tr = ds.sel(time=slice(
                shape_query.temporal.start.strftime("%Y-%m-%d"),
                shape_query.temporal.finish.strftime("%Y-%m-%d")))
            return tr
    elif len(fs) > 1:
        # open_mfdataset expects the collection of paths as a single argument
        with xr.open_mfdataset(list(fs)) as ds:
            ds = xr.decode_cf(ds)
            ds.attrs['var_name'] = "fmc_mean"
            ts = ds.sel(time=slice(
                shape_query.temporal.start.strftime("%Y-%m-%d"),
                shape_query.temporal.finish.strftime("%Y-%m-%d")))
            return ts
    else:
        logger.debug("No files available/gathered for that space/time.")
        return xr.DataArray([])
def time_decoder(xds):
    if 'time' not in xds:
        return xds

    if xds.time.attrs.get('units', None):
        xds = xr.decode_cf(xds)
        xds['time'] = xr.DataArray(xds.time.values.astype('datetime64[D]'),
                                   dims=('time', ))
        return xds

    t = xds.time.values.astype(int)

    # nies workaround (they use seconds since 1980)
    if t[0] > 10000:
        t = (t / 86400).astype(int)
        add_processing(xds, 'time units converted from seconds to days since 1980')

    xds['time'] = xr.DataArray(data=t,
                               dims=('time', ),
                               coords={'time': t},
                               attrs=dict(units='days since 1980',
                                          calendar='gregorian'))
    xds = xr.decode_cf(xds)
    return xds
def test_moment_data(self, get_loader): if isinstance(self, MeasuredDataVolume): pytest.skip("requires synthetic data") if get_loader == 'netcdf4' and self.format == 'GAMIC': pytest.skip("gamic needs hdf-based loader") with self.get_volume_data(get_loader, decode_coords=False, mask_and_scale=False, decode_times=False, chunks=None, parallel=False) as vol: for i, ts in enumerate(vol): if '02' in self.name: ds = create_dataset(i, nrays=361)['DBZH'] else: ds = create_dataset(i)['DBZH'] for j, swp in enumerate(ts): for k, mom in enumerate(swp): xr.testing.assert_equal(mom.data, ds) with self.get_volume_data(get_loader, decode_coords=True, mask_and_scale=False, decode_times=True, chunks=None, parallel=False) as vol: for i, ts in enumerate(vol): for j, swp in enumerate(ts): for k, mom in enumerate(swp): data = create_dataset(i) data = data.assign_coords(create_coords(i).coords) data = data.assign_coords( create_site(self.data['where']['attrs']).coords) data = data.assign_coords( {'sweep_mode': 'azimuth_surveillance'}) data = xr.decode_cf(data, mask_and_scale=False) xr.testing.assert_equal(mom.data, data['DBZH']) with self.get_volume_data(get_loader, decode_coords=True, mask_and_scale=True, decode_times=True, chunks=None, parallel=False) as vol: for i, ts in enumerate(vol): for j, swp in enumerate(ts): for k, mom in enumerate(swp): data = create_dataset(i, type=self.format) data = data.assign_coords(create_coords(i).coords) data = data.assign_coords( create_site(self.data['where']['attrs']).coords) data = data.assign_coords( {'sweep_mode': 'azimuth_surveillance'}) data = xr.decode_cf(data) xr.testing.assert_equal(mom.data, data['DBZH']) del mom del swp del ts del vol gc.collect()
def open(self, *args, **kwargs):
    kwargs["decode_times"] = False
    da = super().open(*args, **kwargs)
    da["time"], _ = fix_time_units(da["time"])
    if hasattr(da, "to_dataset"):
        return xr.decode_cf(da.to_dataset())
    else:
        return xr.decode_cf(da)
def _load(f, v):
    dataset = xr.open_dataset(f, decode_cf=False)
    if "time_bnds" in dataset:
        tb = dataset['time_bnds'].mean(axis=1)
        dataset['time'].values = tb
        dataset = xr.decode_cf(dataset)
    else:
        dataset = xr.decode_cf(dataset)
    return dataset[v]
def read(self, file_info, fields=None, mapping=None, **kwargs):
    """Read SEVIRI HDF5 files and load them to an xarray.Dataset

    Args:
        file_info: Path and name of the file as string or FileInfo object.
            This can also be a tuple/list of file names or a path with
            asterisk.
        fields: ...
        **kwargs: Additional keyword arguments that are valid for
            :class:`typhon.files.handlers.common.NetCDF4`.

    Returns:
        An xarray.Dataset object.
    """
    self._ensure_local_filesystem(file_info)

    # Here, the user fields overwrite the standard fields:
    if fields is None:
        raise NotImplementedError(
            "Loading complete HDF5 files without giving explicit field "
            "names is not yet implemented!")

    # keys are dimension size, values are dimension names
    dim_dict = {}

    # Load the dataset from the file:
    with h5py.File(file_info.path, 'r') as file:
        dataset = xr.Dataset()

        for field in fields:
            if field not in file:
                raise KeyError(f"No field named '{field}'!")

            dims = []
            for dim_size in file[field].shape:
                dim_name = dim_dict.get(dim_size, None)
                if dim_name is None:
                    dim_name = f"dim_{len(dim_dict)}"
                    dim_dict[dim_size] = dim_name
                dims.append(dim_name)

            dataset[field] = xr.DataArray(
                file[field],
                dims=dims,
                # Currently, some attributes may contain byte-strings that
                # are not nice for further processing
                attrs={},  # dict(file[field].attrs)
            )

        # decode_cf returns a new dataset, so the result must be kept
        dataset = xr.decode_cf(dataset, **kwargs)
        dataset.load()

        return _xarray_rename_fields(dataset, mapping)
def decode_cf(self, dataset: xr.Dataset) -> xr.Dataset:
    """------------------------------------------------------------------------------------
    Decodes the dataset according to CF conventions. This helps ensure that the dataset is
    formatted correctly after it has been constructed from unstandardized sources or
    heavily modified.

    Args:
        dataset (xr.Dataset): The dataset to decode.

    Returns:
        xr.Dataset: The decoded dataset.

    ------------------------------------------------------------------------------------"""
    # We have to make sure that time variables do not have units set as attrs, and
    # instead have units set on the encoding or else xarray will crash when trying
    # to save: https://github.com/pydata/xarray/issues/3739
    for variable in dataset.variables.values():
        if variable.data.dtype.type == np.datetime64 and "units" in variable.attrs:
            units = variable.attrs["units"]
            del variable.attrs["units"]
            variable.encoding["units"] = units

    # Leaving the "dtype" entry in the encoding causes a crash when calling
    # `dataset.to_netcdf`. Related to but not fixed by
    # https://github.com/pydata/xarray/pull/4684
    ds = xr.decode_cf(dataset)
    for variable in ds.variables.values():
        if variable.data.dtype.type == np.datetime64:
            if "dtype" in variable.encoding:
                del variable.encoding["dtype"]
    return ds
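# Illustrative sketch only (not part of any snippet above): a minimal, self-contained
# example of the attrs-vs-encoding workaround used in the decode_cf method above.
# The dataset and file name here are invented for demonstration.
import numpy as np
import xarray as xr

demo = xr.Dataset(coords={"time": np.array(["2001-01-01", "2001-01-02"],
                                           dtype="datetime64[ns]")})
demo["time"].attrs["units"] = "days since 2001-01-01"
# Datetime data plus a "units" attribute makes to_netcdf() raise; moving the units into
# the encoding lets xarray re-encode the times itself on save.
demo["time"].encoding["units"] = demo["time"].attrs.pop("units")
demo.to_netcdf("demo_times.nc")  # hypothetical output path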
def test_maybe_apply_time_shift_ts(gfdl_data_loader, ds, var_name,
                                   generate_file_set_args):
    ds = xr.decode_cf(ds)
    da = ds[var_name]
    result = gfdl_data_loader._maybe_apply_time_shift(
        da.copy(), **generate_file_set_args)[TIME_STR]
    assert result.identical(da[TIME_STR])
def test_sel_time(): time_bounds = np.array([[0, 31], [31, 59], [59, 90]]) nv = np.array([0, 1]) time = np.array([15, 46, 74]) data = np.zeros((3)) var_name = 'a' ds = xr.DataArray(data, coords=[time], dims=[TIME_STR], name=var_name).to_dataset() ds[TIME_BOUNDS_STR] = xr.DataArray(time_bounds, coords=[time, nv], dims=[TIME_STR, BOUNDS_STR], name=TIME_BOUNDS_STR) units_str = 'days since 2000-01-01 00:00:00' ds[TIME_STR].attrs['units'] = units_str ds = ensure_time_avg_has_cf_metadata(ds) ds = set_grid_attrs_as_coords(ds) ds = xr.decode_cf(ds) da = ds[var_name] start_date = np.datetime64('2000-02-01') end_date = np.datetime64('2000-03-31') result = sel_time(da, start_date, end_date) assert result[SUBSET_START_DATE_STR].values == start_date assert result[SUBSET_END_DATE_STR].values == end_date
def test_assert_has_data_for_time_str_input(): time_bounds = np.array([[0, 31], [31, 59], [59, 90]]) nv = np.array([0, 1]) time = np.array([15, 46, 74]) data = np.zeros((3)) var_name = 'a' ds = xr.DataArray(data, coords=[time], dims=[TIME_STR], name=var_name).to_dataset() ds[TIME_BOUNDS_STR] = xr.DataArray(time_bounds, coords=[time, nv], dims=[TIME_STR, BOUNDS_STR], name=TIME_BOUNDS_STR) units_str = 'days since 2000-01-01 00:00:00' ds[TIME_STR].attrs['units'] = units_str ds = ensure_time_avg_has_cf_metadata(ds) ds = set_grid_attrs_as_coords(ds) ds = xr.decode_cf(ds) da = ds[var_name] start_date = '2000-01-01' end_date = '2000-03-31' _assert_has_data_for_time(da, start_date, end_date) start_date_bad = '1999-12-31' end_date_bad = '2000-04-01' # With strings these checks are disabled _assert_has_data_for_time(da, start_date_bad, end_date) _assert_has_data_for_time(da, start_date, end_date_bad) _assert_has_data_for_time(da, start_date_bad, end_date_bad)
def preprocess_time(x):
    # TODO: if monthly, use beginning of period time step
    '''Convert time to initialization and forecast lead time (to fit into
    orthogonal matrices)
    Input Dims: lat x lon x Time
    Output Dims: lat x lon x init_time x fore_time_i'''

    # Set record dimension of 'time' to the beginning of averaging period 'average_T1'
    x['time'] = x.average_T1

    # Grab forecast times
    xtimes = xr.decode_cf(x).time.values

    # Get initialization time
    x.coords['init_time'] = xtimes[0]  # get first one
    x.coords['init_time'].attrs['comments'] = 'Initialization time of forecast'

    # Get forecast time in days from initialization
    x.rename({'time': 'fore_time_i'}, inplace=True)
    x.coords['fore_time_i'] = np.arange(0, 12, 1)
    x.fore_time_i.attrs['units'] = 'Index of forecast dates'

    # Store actual forecast dates
    x.coords['fore_time'] = xr.DataArray(xtimes, dims=('fore_time_i'),
                                         coords={'fore_time_i': x.fore_time_i})
    x.fore_time.attrs['comments'] = 'Date of forecast'

    return x
def to_xarray(self, enhance=False):
    """
    Convert Wave data from a Pandas DataFrame to an xarray Dataset.

    Args:
        enhance (bool, optional): Rename variables to something meaningful and
            add useful attributes. Defaults to False.

    Returns:
        xarray.Dataset: xarray dataset containing converted wave data.
    """
    logging.info("Converting wave data to xarray dataset")

    # Set dataframe to indexes defined during class initialization
    tdf = self.data.set_index(self.df_index).drop(
        ["TIME", "TYRS", "TMON", "TDAY", "THRS", "TMIN", "TSEC"], axis=1)

    # Initialize xarray dataset
    ds = tdf.to_xarray()

    # Assign header data to global attributes
    ds = ds.assign_attrs(self.metadata)

    if enhance is True:
        # global_attr = required_global_attributes(required_attributes, time_start, time_end)
        ds = self.enhance_xarray(ds)
        ds = xr.decode_cf(ds)

    return ds
def read(self, filename, **kwargs):
    """Read and parse a NetCDF file and load it to an xarray.Dataset

    Args:
        filename: Path and name of the file as string or FileInfo object.
        **kwargs: Additional keyword arguments that are allowed for the
            :class:`~typhon.files.handlers.common.NetCDF4` class.

    Returns:
        An xarray.Dataset object.
    """
    # Make sure that the standard fields are always imported:
    fields = kwargs.pop("fields", None)
    if fields is not None:
        fields = {"time", "lat", "lon"} | set(fields)

    # xarray has problems with decoding the time variable correctly. Hence,
    # we disable it here:
    decode_cf = kwargs.pop("decode_cf", True)
    data = super().read(filename, fields=fields, decode_cf=False, **kwargs)

    # Then we fix the problem (we need int64 instead of int32):
    attrs = data["time"].attrs.copy()
    data["time"] = data["time"].astype(int)
    data["time"].attrs = attrs

    # Do the decoding now (but only if the user wanted it!)
    if decode_cf:
        return xr.decode_cf(data)
    return data
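# Illustrative sketch only: the integer-widening workaround shown above, reproduced on a
# toy dataset so the pattern is self-contained. The variable names are invented here.
import numpy as np
import xarray as xr

raw = xr.Dataset({"time": ("obs", np.array([0, 3600, 7200], dtype="int32"),
                           {"units": "seconds since 2000-01-01"})})
attrs = raw["time"].attrs.copy()
raw["time"] = raw["time"].astype("int64")  # widen before decoding
raw["time"].attrs = attrs                  # astype may drop attrs, so restore them
decoded = xr.decode_cf(raw)                # "time" now decodes to datetime64 values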
def diagnosis(name, path_in, path_out, varname):

    def decode_month_since(time):
        start = time.attrs['units'].split(' ')[2]
        return pd.date_range(start, periods=len(time), freq='1M')

    hr = xr.open_mfdataset(path_in, decode_times=False)
    var = hr[varname].copy(deep=True)
    if 'month' in hr['time'].attrs['units']:
        var['time'] = decode_month_since(hr['time'])
    else:
        var['time'] = xr.decode_cf(hr)['time'].to_index()
    hr.close()

    fig = plt.figure(figsize=(6.5, 8))
    gs = gridspec.GridSpec(2, 1, hspace=0.2, height_ratios=[0.8, 1.2])

    # Time series
    ax = plt.subplot(gs[0])
    ax.plot(var['time'].to_index(), var.mean(dim=['lat', 'lon']).values)
    ax.set_title(name + ' time series')

    # Map
    ax = plt.subplot(gs[1], projection=ccrs.PlateCarree())
    ax.coastlines()
    ax.gridlines()
    cf = ax.contourf(var.lon, var.lat, var.mean(dim='time'), cmap='Spectral')
    plt.colorbar(cf, ax=ax, orientation='horizontal', pad=0.05)
    ax.set_title(name + ' climatology')

    fig.savefig(path_out, dpi=600., bbox_inches='tight')
    plt.close(fig)
def prep_time_data(ds): """Prepare time coordinate information in Dataset for use in aospy. 1. If the Dataset contains a time bounds coordinate, add attributes representing the true beginning and end dates of the time interval used to construct the Dataset 2. If the Dataset contains a time bounds coordinate, overwrite the time coordinate values with the averages of the time bounds at each timestep 3. Decode the times into np.datetime64 objects for time indexing Parameters ---------- ds : Dataset Pre-processed Dataset with time coordinate renamed to internal_names.TIME_STR Returns ------- Dataset The processed Dataset """ ds = ensure_time_as_index(ds) if TIME_BOUNDS_STR in ds: ds = ensure_time_avg_has_cf_metadata(ds) ds[TIME_STR] = average_time_bounds(ds) else: logging.warning("dt array not found. Assuming equally spaced " "values in time, even though this may not be " "the case") ds = add_uniform_time_weights(ds) return xr.decode_cf(ds, decode_times=True, decode_coords=False, mask_and_scale=True)
def read_mls_o3(year='2005', day='0*', min_lat=-10, max_lat=10):
    """
    Load several days of MLS O3 data...files are large. A better computer might be able
    to handle a year. Filter to input latitude band, remove low quality data (re. MLS
    documentation).
    **** Monthly mean is not calculated because full months are not necessarily loaded. ***

    :param year: string. Default is '2005'.
    :param day: string. Default is '0*' which loads data for days 1 to 99 of year.
    :param min_lat: minimum latitude to include in means. Default is -10.
    :param max_lat: maximum latitude to include in means. Default is 10.
    :return: Save a new netcdf file that is smaller and quicker to load.
    """
    o3 = xr.open_mfdataset(
        r'/home/kimberlee/ValhallaData/MLS/L2O3v04-20/MLS-Aura_L2GP-O3_v04-20-c01_%sd%s.he5' % (year, day),
        group='HDFEOS/SWATHS/O3/Data Fields/', concat_dim='nTimes')
    geo = xr.open_mfdataset(
        r'/home/kimberlee/ValhallaData/MLS/L2O3v04-20/MLS-Aura_L2GP-O3_v04-20-c01_%sd%s.he5' % (year, day),
        group='HDFEOS/SWATHS/O3/Geolocation Fields/', concat_dim='nTimes')
    mls = xr.merge([o3, geo])
    mls = mls.drop(['AscDescMode', 'L2gpPrecision', 'L2gpValue', 'ChunkNumber',
                    'LineOfSightAngle', 'LocalSolarTime', 'OrbitGeodeticAngle',
                    'SolarZenithAngle'])

    mls.Time.attrs['units'] = 'Seconds since 01-01-1993'
    mls = xr.decode_cf(mls)
    mls = mls.swap_dims({'nTimes': 'Time'}, inplace=True)
    mls = mls.dropna(dim="Time")

    mls = mls.where((mls.Latitude > min_lat) & (mls.Latitude < max_lat))
    mls = mls.where(((mls.Status % 2) == 0) & (mls.Quality > 1.0) &
                    (mls.Convergence < 1.03) & (mls.O3Precision > 0))

    mls = mls.resample('MS', dim='Time', how='mean')
    mls.to_netcdf(path='/home/kimberlee/Masters/NO2/MLS_O3_monthlymeans/quarters/MLS-O3-%s-%s.nc' % (year, day),
                  mode='w')
    return
def test_maybe_apply_time_shift_ts(gfdl_data_loader, ds_with_time_bounds, var_name,
                                   generate_file_set_args):
    ds = xr.decode_cf(ds_with_time_bounds)
    da = ds[var_name]
    result = gfdl_data_loader._maybe_apply_time_shift(
        da.copy(), **generate_file_set_args)[TIME_STR]
    assert result.identical(da[TIME_STR])
def read_mls_n2o(year='2005', min_lat=-10, max_lat=10):
    """
    Load a year's worth of MLS N2O data. Filter to input latitude band, remove low
    quality data (re. MLS documentation) and calculate monthly mean.

    :param year: string. Default is '2005'.
    :param min_lat: minimum latitude to include in means. Default is -10.
    :param max_lat: maximum latitude to include in means. Default is 10.
    :return: Save a new netcdf file that is smaller and quicker to load.
    """
    n2o = xr.open_mfdataset(
        r'/home/kimberlee/ValhallaData/MLS/L2N2Ov-04-23/MLS-Aura_L2GP-N2O_v04-20-c01_%s*.he5' % year,
        group='HDFEOS/SWATHS/N2O/Data Fields/', concat_dim='nTimes')
    geo = xr.open_mfdataset(
        r'/home/kimberlee/ValhallaData/MLS/L2N2Ov-04-23/MLS-Aura_L2GP-N2O_v04-20-c01_%s*.he5' % year,
        group='HDFEOS/SWATHS/N2O/Geolocation Fields/', concat_dim='nTimes')
    mls = xr.merge([n2o, geo])
    mls = mls.drop(['AscDescMode', 'L2gpPrecision', 'L2gpValue', 'ChunkNumber',
                    'LineOfSightAngle', 'LocalSolarTime', 'OrbitGeodeticAngle',
                    'SolarZenithAngle'])

    mls.Time.attrs['units'] = 'Seconds since 01-01-1993'
    mls = xr.decode_cf(mls)
    mls = mls.swap_dims({'nTimes': 'Time'}, inplace=True)
    mls = mls.dropna(dim="Time")

    mls = mls.where((mls.Latitude > min_lat) & (mls.Latitude < max_lat))
    mls = mls.where((mls.Status % 2) == 0)
    mls = mls.where(mls.Quality > 1.4)
    mls = mls.where(mls.Convergence < 1.01)

    mls = mls.resample('MS', dim='Time', how='mean')
    mls.to_netcdf(path='/home/kimberlee/Masters/NO2/MLS_N2O_monthlymeans/MLS-N2O-%s.nc' % year, mode='w')
    return
def test_maybe_apply_time_offset_ts(self):
    ds = xr.decode_cf(self.ds)
    da = ds[self.var_name]
    result = self.DataLoader._maybe_apply_time_shift(
        da.copy(), **self.generate_file_set_args)[TIME_STR]
    assert result.identical(da[TIME_STR])
def test_multiplication(): HEIGHT = 100 POINTSPEC = 3 BTIME = 5 scale_factor = (1 / HEIGHT) * 1000 fds = create_flexdust_test_data(seed=None) fpds = create_test_data(seed=None) ds_orr, pre_ds, out_data = process_per_pointspec(fpds, fds, x0=None, x1=None, y0=None, y1=None, height=HEIGHT) produced_ds = xr.decode_cf(pre_ds.to_dataset(name='spec001_mr')) fpds = fpds.rename({'latitude': 'lat', 'longitude': 'lon'}) produced_ds = produced_ds.isel(time=POINTSPEC, btime=BTIME) test_time = produced_ds.time + produced_ds.btime fpds = fpds.sel(time=test_time.values, pointspec=POINTSPEC, height=HEIGHT, nageclass=0)['spec001_mr'] fds = fds.sel(time=test_time.values)['Emission'] fpds_times_fds = (fpds * fds).values * scale_factor assert pytest.approx( produced_ds['spec001_mr'].sum(dim=['lon', 'lat']).values, 0.01) == fpds_times_fds.sum()
def preprocess_time_monthly(x):
    '''Preprocesses time variables from GFDL format to SIPN2 format.
    Convert time to initialization and forecast lead time (to fit into orthogonal matrices).
    Input Dims: lat x lon x Time
    Output Dims: lat x lon x init_time x fore_time
    Where we represent fore_time as monthly increments'''

    Nmons = x.average_T1.size
    m_i = np.arange(0, Nmons)
    m_dt = ['month' for _ in m_i]  # list of 'month'

    # Set record dimension of 'time' to the beginning of averaging period 'average_T1'
    x['time'] = x.average_T1

    # Grab forecast times
    xtimes = xr.decode_cf(x).time.values

    # Get initialization time
    x.coords['init_time'] = xtimes[0]  # get first one
    x.coords['init_time'].attrs['comments'] = 'Initialization time of forecast'

    # Get forecast time (as timedeltas from init_time)
    x.rename({'time': 'fore_time'}, inplace=True)
    x.coords['fore_time'] = xr.DataArray(m_i, dims='fore_time')

    # Set time offset for index in fore_time
    x.coords['fore_offset'] = xr.DataArray(m_dt, dims='fore_time',
                                           coords={'fore_time': x.fore_time})

    return x
def perform_cmip6_query(self, config, query_string: str) -> xr.Dataset: df_sub = config.df.query(query_string) if df_sub.zstore.values.size == 0: return df_sub mapper = config.fs.get_mapper(df_sub.zstore.values[-1]) logging.debug("[CMIP6_IO] df_sub: {}".format(df_sub)) ds = xr.open_zarr(mapper, consolidated=True, mask_and_scale=True) # print("Time encoding: {} - {}".format(ds.indexes['time'], ds.indexes['time'].dtype)) if not ds.indexes["time"].dtype in ["datetime64[ns]", "object"]: time_object = ds.indexes['time'].to_datetimeindex( ) # pd.DatetimeIndex([ds["time"].values[0]]) # Convert if necessary if time_object[0].year == 1: times = ds.indexes['time'].to_datetimeindex( ) # pd.DatetimeIndex([ds["time"].values]) times_plus_2000 = [] for t in times: times_plus_2000.append( cftime.DatetimeNoLeap(t.year + 2000, t.month, t.day, t.hour)) ds["time"].values = times_plus_2000 ds = xr.decode_cf(ds) return ds
def test_assert_has_data_for_time(): time_bounds = np.array([[0, 31], [31, 59], [59, 90]]) nv = np.array([0, 1]) time = np.array([15, 46, 74]) data = np.zeros((3)) var_name = 'a' ds = xr.DataArray(data, coords=[time], dims=[TIME_STR], name=var_name).to_dataset() ds[TIME_BOUNDS_STR] = xr.DataArray(time_bounds, coords=[time, nv], dims=[TIME_STR, BOUNDS_STR], name=TIME_BOUNDS_STR) units_str = 'days since 2000-01-01 00:00:00' ds[TIME_STR].attrs['units'] = units_str ds = ensure_time_avg_has_cf_metadata(ds) ds = set_grid_attrs_as_coords(ds) ds = xr.decode_cf(ds) da = ds[var_name] start_date = np.datetime64('2000-01-01') end_date = np.datetime64('2000-03-31') _assert_has_data_for_time(da, start_date, end_date) start_date_bad = np.datetime64('1999-12-31') end_date_bad = np.datetime64('2000-04-01') with pytest.raises(AssertionError): _assert_has_data_for_time(da, start_date_bad, end_date) with pytest.raises(AssertionError): _assert_has_data_for_time(da, start_date, end_date_bad) with pytest.raises(AssertionError): _assert_has_data_for_time(da, start_date_bad, end_date_bad)
def perform_cmip6_query(conf, query_string):
    df_sub = conf.df.query(query_string)
    if df_sub.zstore.values.size == 0:
        return df_sub

    mapper = conf.fs.get_mapper(df_sub.zstore.values[-1])
    ds = xr.open_zarr(mapper, consolidated=True)
    print("Time encoding: {} - {}".format(ds.indexes['time'], ds.indexes['time'].dtype))

    if not ds.indexes["time"].dtype in ["datetime64[ns]", "object"]:
        time_object = ds.indexes['time'].to_datetimeindex()  # pd.DatetimeIndex([ds["time"].values[0]])
        print(time_object, time_object.year)

        # Convert if necessary
        if time_object[0].year == 1:
            times = ds.indexes['time'].to_datetimeindex()  # pd.DatetimeIndex([ds["time"].values])
            times_plus_2000 = []
            for t in times:
                times_plus_2000.append(
                    cftime.DatetimeNoLeap(t.year + 2000, t.month, t.day, t.hour))
            ds["time"].values = times_plus_2000
            ds = xr.decode_cf(ds)
    return ds
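# Illustrative sketch only: shifting cftime dates whose year decodes as 1 forward by 2000
# years, the same correction the query helpers above apply. The toy data below is invented
# for the example and requires the cftime package.
import cftime
import xarray as xr

toy = xr.Dataset(coords={"time": [cftime.DatetimeNoLeap(1, m, 15) for m in (1, 2, 3)]})
shifted = [cftime.DatetimeNoLeap(t.year + 2000, t.month, t.day, t.hour)
           for t in toy["time"].values]
toy["time"] = shifted  # times now fall in year 2001 instead of year 1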
def getRadarVar(filePath, refTime, varName):
    """
    This function reads the radar netCDF files and extracts the desired variable.

    Arguments
    ---------
    filePath : Path to the netCDF file
    refTime : String specifying the starting time, for example 1970-01-01 00:00:00
    varName : Name of the desired variable

    Returns
    -------
    dataArray : xarray DataArray
        The extracted DataArray
    """
    timeAtt = 'seconds since {refTime} UTC'.format(refTime=refTime)

    tempDS = xr.open_dataset(filePath)
    tempDS.time.attrs['units'] = timeAtt
    tempDS = xr.decode_cf(tempDS)
    tempDSZe = tempDS[varName]

    return tempDSZe
def load_dataset(path, ens_mems=40): logger.info('Extract data from {0:s} for {1:03d} ensemble members'.format( path, ens_mems)) ds_ens = [] pbar_mem = tqdm(range(ens_mems)) for mem in pbar_mem: path_mem = path.format(mem + 1) pbar_mem.write('Extract {0:s}'.format(path_mem)) found_paths = sorted(list(glob.glob(path_mem))) ds_mem = xr.open_mfdataset(found_paths, parallel=True, combine='nested', concat_dim='time', chunks={'time': 1}, decode_cf=False, decode_times=False) ds_ens.append(ds_mem) logger.info('Starting to concat ensemble') ds_ens = xr.concat(ds_ens, dim='ensemble') ds_ens = ds_ens.chunk({'ensemble': 1, 'time': 1}) logger.info('Starting to decode cf-conventions') ds_ens = xr.decode_cf(ds_ens) logger.info('Concatenated data from {0:s} for {1:03d} ensemble ' 'members'.format(path, ens_mems)) return ds_ens
def pop_decode_time(var):
    varname = var.name
    time = var.time
    time.values = time.values - 16
    # var = var.assign_coords(time=time)
    ds = xr.decode_cf(var.to_dataset(), decode_times=True)
    return ds[varname]
def create_ray_time(i, decode=False, nrays=360):
    time_data = (create_startazT(i, nrays=nrays) +
                 create_stopazT(i, nrays=nrays)) / 2.
    da = xr.DataArray(time_data, dims=['azimuth'], attrs=io.xarray.time_attrs)
    if decode:
        da = xr.decode_cf(xr.Dataset({'arr': da})).arr
    return da
def import_nc_file(filepath, variables):
    # TODO: Specify whether a variable is constant wrt. a dimension via the
    # command line.
    # It seems that the whole "cell_methods" and "units" heuristic doesn't
    # really work. So we're back to being simple again. If there's a
    # "cell_methods" attribute containing "time", the value is aggregated,
    # otherwise it's instantaneous. Being constant wrt. time has to be
    # specified manually.
    dataset = xr.open_dataset(filepath, decode_cf=False)
    if ("time" in dataset.variables
            and "units" not in dataset[dataset["time"].attrs["bounds"]].attrs):
        dataset["time_bnds"].attrs["units"] = dataset["time"].units
    dataset = xr.decode_cf(dataset)
    vs = [v for v in variables if v in dataset.variables.keys()]
    return [{
        "name": v,
        "dataset": dataset,
        "time": dataset["time"].attrs["bounds"]
        if "time:" in dataset[v].attrs.get("cell_methods", "")
        else "time",
    } for v in vs]
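# Illustrative sketch only: copying the parent time units onto a bounds variable before
# decode_cf, the same pattern the snippet above uses for files that omit them. All names
# and values below are invented for the example.
import numpy as np
import xarray as xr

bnds_demo = xr.Dataset(
    {"time_bnds": (("time", "nv"), np.array([[0, 1], [1, 2]], dtype="int64"))},
    coords={"time": ("time", np.array([0, 1], dtype="int64"),
                     {"units": "days since 2000-01-01", "bounds": "time_bnds"})},
)
bnds_demo["time_bnds"].attrs["units"] = bnds_demo["time"].attrs["units"]
decoded = xr.decode_cf(bnds_demo)  # both "time" and "time_bnds" now decode to datetime64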
def open_flatds(filename, writeable=False, with_dask=False): data = np.memmap(filename, dtype="uint8", mode="r+" if writeable else "r") if np.any(data[:len(MAGIC)] != MAGIC): raise ValueError("file \"{}\" is not a flatds file".format(filename)) if data[len(MAGIC)] != 0: raise ValueError("unknown header location") # header is in the back header_location = data[-8:].view("uint64")[0] header = msgpack.unpackb(data[header_location:-8], raw=False) def get_var(props): if len(props["d"]) > 0: dims, shape = zip(*[header["dims"][d] for d in props["d"]]) size = np.prod(shape) * props["is"] else: dims = () shape = () size = props["is"] ofs = props["ofs"] d = data[ofs:ofs+size] d = d.view(dtype=props["t"]) d = np.lib.stride_tricks.as_strided(d, shape, props["st"], subok=True, writeable=writeable) attrs = props.get("attrs", {}) if with_dask: import dask.array as da d = da.from_array(d) return xr.DataArray(d, dims=dims, attrs=attrs) variables = {name: get_var(p) for name, p in header["vars"].items()} attrs = header.get("attrs", {}) return xr.decode_cf(xr.Dataset(variables, attrs=attrs))
def test_write_store(self):
    expected = create_test_data()
    with self.create_store() as store:
        expected.dump_to_store(store)
        # we need to cf decode the store because it has time and
        # non-dimension coordinates
        actual = xr.decode_cf(store)
        self.assertDatasetAllClose(expected, actual)
def test_decode_cf(calendar):
    days = [1., 2., 3.]
    da = DataArray(days, coords=[days], dims=['time'], name='test')
    ds = da.to_dataset()

    for v in ['test', 'time']:
        ds[v].attrs['units'] = 'days since 2001-01-01'
        ds[v].attrs['calendar'] = calendar

    if not has_cftime_or_netCDF4 and calendar not in _STANDARD_CALENDARS:
        with pytest.raises(ValueError):
            ds = decode_cf(ds)
    else:
        ds = decode_cf(ds)
        if calendar not in _STANDARD_CALENDARS:
            assert ds.test.dtype == np.dtype('O')
        else:
            assert ds.test.dtype == np.dtype('M8[ns]')
def test_decode_cf_enable_cftimeindex(calendar, enable_cftimeindex): days = [1., 2., 3.] da = DataArray(days, coords=[days], dims=['time'], name='test') ds = da.to_dataset() for v in ['test', 'time']: ds[v].attrs['units'] = 'days since 2001-01-01' ds[v].attrs['calendar'] = calendar if (not has_cftime and enable_cftimeindex and calendar not in coding.times._STANDARD_CALENDARS): with pytest.raises(ValueError): with set_options(enable_cftimeindex=enable_cftimeindex): ds = decode_cf(ds) else: with set_options(enable_cftimeindex=enable_cftimeindex): ds = decode_cf(ds) if (enable_cftimeindex and calendar not in coding.times._STANDARD_CALENDARS): assert ds.test.dtype == np.dtype('O') else: assert ds.test.dtype == np.dtype('M8[ns]')
def test_maybe_apply_time_shift(data_loader, ds, inst_ds, var_name, generate_file_set_args): ds = xr.decode_cf(ds) da = ds[var_name] result = data_loader._maybe_apply_time_shift( da.copy(), **generate_file_set_args)[TIME_STR] assert result.identical(da[TIME_STR]) offset = data_loader._maybe_apply_time_shift( da.copy(), {'days': 1}, **generate_file_set_args) result = offset[TIME_STR] expected = da[TIME_STR] + np.timedelta64(1, 'D') expected[TIME_STR] = expected assert result.identical(expected)
def test_assert_has_data_for_time_cftime_datetimes(calendar, date_type): time_bounds = np.array([[0, 2], [2, 4], [4, 6]]) nv = np.array([0, 1]) time = np.array([1, 3, 5]) data = np.zeros((3)) var_name = 'a' ds = xr.DataArray(data, coords=[time], dims=[TIME_STR], name=var_name).to_dataset() ds[TIME_BOUNDS_STR] = xr.DataArray(time_bounds, coords=[time, nv], dims=[TIME_STR, BOUNDS_STR], name=TIME_BOUNDS_STR) units_str = 'days since 0002-01-02 00:00:00' ds[TIME_STR].attrs['units'] = units_str ds[TIME_STR].attrs['calendar'] = calendar ds = ensure_time_avg_has_cf_metadata(ds) ds = set_grid_attrs_as_coords(ds) with warnings.catch_warnings(record=True): with xr.set_options(enable_cftimeindex=True): ds = xr.decode_cf(ds) da = ds[var_name] start_date = date_type(2, 1, 2) end_date = date_type(2, 1, 8) _assert_has_data_for_time(da, start_date, end_date) start_date_bad = date_type(2, 1, 1) end_date_bad = date_type(2, 1, 9) with pytest.raises(AssertionError): _assert_has_data_for_time(da, start_date_bad, end_date) with pytest.raises(AssertionError): _assert_has_data_for_time(da, start_date, end_date_bad) with pytest.raises(AssertionError): _assert_has_data_for_time(da, start_date_bad, end_date_bad)
def test_maybe_apply_time_shift_inst(gfdl_data_loader, inst_ds, var_name, generate_file_set_args): inst_ds = xr.decode_cf(inst_ds) generate_file_set_args['dtype_in_time'] = 'inst' generate_file_set_args['intvl_in'] = '3hr' da = inst_ds[var_name] result = gfdl_data_loader._maybe_apply_time_shift( da.copy(), **generate_file_set_args)[TIME_STR] expected = da[TIME_STR] + np.timedelta64(-3, 'h') expected[TIME_STR] = expected assert result.identical(expected) generate_file_set_args['intvl_in'] = 'daily' da = inst_ds[var_name] result = gfdl_data_loader._maybe_apply_time_shift( da.copy(), **generate_file_set_args)[TIME_STR] expected = da[TIME_STR] expected[TIME_STR] = expected assert result.identical(expected)
def _prep_time_data(ds): """Prepare time coordinate information in Dataset for use in aospy. 1. If the Dataset contains a time bounds coordinate, add attributes representing the true beginning and end dates of the time interval used to construct the Dataset 2. If the Dataset contains a time bounds coordinate, overwrite the time coordinate values with the averages of the time bounds at each timestep 3. Decode the times into np.datetime64 objects for time indexing Parameters ---------- ds : Dataset Pre-processed Dataset with time coordinate renamed to internal_names.TIME_STR Returns ------- Dataset The processed Dataset """ ds = times.ensure_time_as_index(ds) if TIME_BOUNDS_STR in ds: ds = times.ensure_time_avg_has_cf_metadata(ds) ds[TIME_STR] = times.average_time_bounds(ds) else: logging.warning("dt array not found. Assuming equally spaced " "values in time, even though this may not be " "the case") ds = times.add_uniform_time_weights(ds) # Suppress enable_cftimeindex is a no-op warning; we'll keep setting it for # now to maintain backwards compatibility for older xarray versions. with warnings.catch_warnings(): warnings.filterwarnings('ignore') with xr.set_options(enable_cftimeindex=True): ds = xr.decode_cf(ds, decode_times=True, decode_coords=False, mask_and_scale=True) return ds
def to_xarray(self, **kwargs): """ Convert the field to a :class:`xarray.Dataset` with CF metadata interpretation. Limitations: * Bounds are treated as data arrays inside the ``xarray`` dataset. * Integer masked arrays are upcast to float data types in ``xarray``. * Group hierarchies are not supported in ``xarray``. :keyword bool decode_cf: (``=True``) If ``True``, run the ``xarray`` function ``decode_cf`` on the returned dataset. :param dict kwargs: Optional keyword arguments to dataset creation. See :meth:`ocgis.VariableCollection.to_xarray` for additional information. :rtype: :class:`xarray.Dataset` """ from xarray import decode_cf kwargs = kwargs.copy() should_decode_cf = kwargs.pop('decode_cf', True) ret = super(Field, self).to_xarray(**kwargs) if should_decode_cf: ret = decode_cf(ret) return ret
def load_variable(var_name, path_to_file, method='xarray', fix_times=True, **extr_kwargs): """ Interface for loading an extracted variable into memory, using either iris or xarray. If `path_to_file` is instead a raw dataset, then the entire contents of the file will be loaded! Parameters ---------- var_name : string The name of the variable to load path_to_file : string Location of file containing variable method : string Choose between 'iris' or 'xarray' fix_times : bool Correct the timestamps to the middle of the bounds in the variable metadata (CESM puts them at the right boundary which sucks!) extr_kwargs : dict Additional keyword arguments to pass to the extractor """ logger.info("Loading %s from %s" % (var_name, path_to_file)) if method == "iris": raise NotImplementedError("`iris` deprecated with Python 3") # cf = lambda c : c.var_name == var_name # cubes = iris.load(path_to_file, iris.Constraint(cube_func=cf), # **extr_kwargs) # # if not cubes: # raise RuntimeError("Could not find '%s' in cube" % var_name) # # assert len(cubes) == 1 # # c = cubes[0] # # if fix_times: # times = c.coord('time') # assert hasattr(times, 'bounds') # # bnds = times.bounds # mean_times = np.mean(bnds, axis=1) # # times.points = mean_times # # return c elif method == "xarray": ds = xarray.open_dataset(path_to_file, decode_cf=False, **extr_kwargs) # Fix time unit, if necessary interval, timestamp = ds.time.units.split(" since ") timestamp = timestamp.split(" ") yr, mm, dy = timestamp[0].split("-") if int(yr) < 1650: yr = 2001 yr = str(yr) # Re-construct at Jan 01, 2001 and re-set timestamp[0] = "-".join([yr, mm, dy]) new_units = " ".join([interval, "since"] + timestamp) ds.time.attrs['units'] = new_units if fix_times: assert hasattr(ds, 'time_bnds') bnds = ds.time_bnds.values mean_times = np.mean(bnds, axis=1) ds.time.values = mean_times # Be pedantic and check that we don't have a "missing_value" attr for field in ds: if hasattr(ds[field], 'missing_value'): del ds[field].attrs['missing_value'] # Lazy decode CF ds = xarray.decode_cf(ds) return ds
def open_mdsdataset(dirname, iters='all', prefix=None, read_grid=True,
                    delta_t=1, ref_date=None, calendar='gregorian',
                    geometry='sphericalpolar', grid_vars_to_coords=True,
                    swap_dims=False, endian=">", chunks=None,
                    ignore_unknown_vars=False,):
    """Open MITgcm-style mds (.data / .meta) file output as xarray dataset.

    Parameters
    ----------
    dirname : string
        Path to the directory where the mds .data and .meta files are stored
    iters : list, optional
        The iteration numbers of the files to be read. If `None`, no data
        files will be read.
    prefix : list, optional
        List of different filename prefixes to read. Default is to read all
        available files.
    read_grid : bool, optional
        Whether to read the grid data
    delta_t : number, optional
        The timestep used in the model. (Can't be inferred.)
    ref_date : string, optional
        A date string corresponding to the zero timestep. E.g. "1990-1-1 0:0:0".
        See CF conventions [1]_
    calendar : string, optional
        A calendar allowed by CF conventions [1]_
    geometry : {'sphericalpolar', 'cartesian', 'llc'}
        MITgcm grid geometry specifier.
    swap_dims : boolean, optional
        Whether to swap the logical dimensions for physical ones.
    endian : {'=', '>', '<'}, optional
        Endianness of variables. Default for MITgcm is ">" (big endian)
    chunks : int or dict, optional
        If chunks is provided, it is used to load the new dataset into dask arrays.
    ignore_unknown_vars : boolean, optional
        Don't raise an error if unknown variables are encountered while reading
        the dataset.

    Returns
    -------
    dset : xarray.Dataset
        Dataset object containing all coordinates and variables.

    References
    ----------
    .. [1] http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch04s04.html
    """
    # get frame info for history
    frame = inspect.currentframe()
    _, _, _, arg_values = inspect.getargvalues(frame)
    del arg_values['frame']
    function_name = inspect.getframeinfo(frame)[2]

    # some checks for argument consistency
    if swap_dims and not read_grid:
        raise ValueError("If swap_dims==True, read_grid must be True.")

    # We either have a single iter, in which case we create a fresh store,
    # or a list of iters, in which case we combine.
    if iters == 'all':
        iters = _get_all_iternums(dirname, file_prefixes=prefix)
    if iters is None:
        iternum = None
    else:
        try:
            iternum = int(iters)
        # if not we probably have some kind of list
        except TypeError:
            if len(iters) == 1:
                iternum = int(iters[0])
            else:
                # We have to check to make sure we have the same prefixes at
                # each timestep...otherwise we can't combine the datasets.
                first_prefixes = prefix or _get_all_matching_prefixes(
                    dirname, iters[0])
                for iternum in iters:
                    these_prefixes = _get_all_matching_prefixes(
                        dirname, iternum, prefix
                    )
                    # don't care about order
                    if set(these_prefixes) != set(first_prefixes):
                        raise IOError("Could not find the expected file "
                                      "prefixes %s at iternum %g. (Instead "
                                      "found %s)" % (repr(first_prefixes),
                                                     iternum,
                                                     repr(these_prefixes)))

                # chunk at least by time
                chunks = chunks or {}

                # recursively open each dataset at a time
                datasets = [open_mdsdataset(
                        dirname, iters=iternum, delta_t=delta_t,
                        read_grid=False, swap_dims=False, prefix=prefix,
                        ref_date=ref_date, calendar=calendar,
                        geometry=geometry,
                        grid_vars_to_coords=grid_vars_to_coords,
                        endian=endian, chunks=chunks,
                        ignore_unknown_vars=ignore_unknown_vars)
                    for iternum in iters]
                # now add the grid
                if read_grid:
                    datasets.insert(0, open_mdsdataset(
                        dirname, iters=None, delta_t=delta_t,
                        read_grid=True, swap_dims=False, prefix=prefix,
                        ref_date=ref_date, calendar=calendar,
                        geometry=geometry,
                        grid_vars_to_coords=grid_vars_to_coords,
                        endian=endian, chunks=chunks,
                        ignore_unknown_vars=ignore_unknown_vars))
                # apply chunking
                ds = xr.auto_combine(datasets)
                if swap_dims:
                    ds = _swap_dimensions(ds, geometry)
                return ds

    store = _MDSDataStore(dirname, iternum, delta_t, read_grid,
                          prefix, ref_date, calendar, geometry, endian,
                          ignore_unknown_vars=ignore_unknown_vars)
    ds = xr.Dataset.load_store(store)

    if swap_dims:
        ds = _swap_dimensions(ds, geometry)

    if grid_vars_to_coords:
        ds = _set_coords(ds)

    # turn all the auxiliary grid variables into coordinates
    # if grid_vars_to_coords:
    #     for k in _grid_variables:
    #         ds.set_coords(k, inplace=True)
    #     ds.set_coords('iter', inplace=True)

    if ref_date:
        ds = xr.decode_cf(ds)

    # do we need more fancy logic (like open_dataset), or is this enough
    if chunks is not None:
        ds = ds.chunk(chunks)

    # set attributes for CF conventions
    ds.attrs['Conventions'] = "CF-1.6"
    ds.attrs['title'] = "netCDF wrapper of MITgcm MDS binary data"
    ds.attrs['source'] = "MITgcm"
    arg_string = ', '.join(['%s=%s' % (str(k), repr(v))
                            for (k, v) in arg_values.items()])
    ds.attrs['history'] = ('Created by calling '
                           '`%s(%s)`' % (function_name, arg_string))

    return ds
# Grid info loc = 'http://barataria.tamu.edu:8080/thredds/dodsC/NcML/txla_nesting6.nc' # grid_filename = '/atch/raid1/zhangxq/Projects/txla_nesting6/txla_grd_v4_new.nc' # grid = tracpy.inout.readgrid(grid_filename, usebasemap=True, llcrnrlat=22.85, llcrnrlon=-97.9, urcrnrlat=30.5) # # actually using psi grid here despite the name # xpsi = np.asanyarray(grid['xpsi'].T, order='C') # ypsi = np.asanyarray(grid['ypsi'].T, order='C') # xr = np.asanyarray(grid['xr'].T, order='C') # yr = np.asanyarray(grid['yr'].T, order='C') ds = xr.open_dataset(loc, decode_cf=False) ds['temp'].attrs['missing_value'] = ds['temp'].attrs['_FillValue'] key='salt' ds[key].attrs['missing_value'] = ds[key].attrs['_FillValue'] ds=xr.decode_cf(ds) # current arrows cdx = 7; cdy = 11 # in indices wdx = 25; wdy = 40 # in indices, wind arrows # Colormap for model output if var == 'salt': levels = (37-np.exp(np.linspace(0,np.log(37.), 10)))[::-1] # log for salinity, 0 to 36 levels[0] = 0 # levels = (37-np.exp(np.linspace(0,np.log(36.), 10)))[::-1]-1 # log for salinity, 0 to 35 cmap = calc_cmap(cmo.haline, levels) # cmap = cmPong.salinity(cmo.haline, levels) # cmap = cmPong.salinity('YlGnBu_r', levels) ilevels = [0,1,2,3,4,5,8] # which levels to label ticks = [int(tick) for tick in levels[ilevels]] # plot ticks