def test_infer_freq_invalid_inputs():
    """Check ``xr.infer_freq`` on inputs that must raise or yield no frequency."""
    # Non-datetime DataArray
    with pytest.raises(ValueError, match="must contain datetime-like objects"):
        xr.infer_freq(xr.DataArray([0, 1, 2]))

    monthly = xr.cftime_range("1990-02-03", periods=4, freq="MS")

    # 2D DataArray
    with pytest.raises(ValueError, match="must be 1D"):
        xr.infer_freq(xr.DataArray([monthly, monthly]))

    # CFTimeIndex too short
    with pytest.raises(ValueError, match="Need at least 3 dates to infer frequency"):
        xr.infer_freq(monthly[:2])

    # Non-monotonic input
    shuffled = monthly[np.array([0, 2, 1, 3])]
    assert xr.infer_freq(shuffled) is None

    # Non-unique input
    duplicated = monthly[np.array([0, 1, 1, 2])]
    assert xr.infer_freq(duplicated) is None

    # No unique frequency (here 1st step is MS, second is 2MS)
    uneven = monthly[np.array([0, 1, 3])]
    assert xr.infer_freq(uneven) is None

    # Same, but for QS
    quarterly = xr.cftime_range("1990-02-03", periods=4, freq="QS")
    assert xr.infer_freq(quarterly[np.array([0, 1, 3])]) is None
def missing_from_context(da, freq, src_timestep=None, **indexer):
    """Call ``FromContext.execute`` on `da`, inferring the source timestep if it is not given.

    When `src_timestep` is falsy it is replaced by ``xr.infer_freq(da.time)``.
    Extra keyword arguments are forwarded as the `indexer` mapping, and an
    empty `options` dict is passed.
    """
    if not src_timestep:
        src_timestep = xr.infer_freq(da.time)
    return FromContext.execute(da, freq, src_timestep, options={}, indexer=indexer)
def check_freq(var: xr.DataArray, freq: Union[str, Sequence[str]], strict: bool = True):
    """Raise an error if the series does not have one of the expected temporal frequencies.

    A series whose frequency cannot be inferred (e.g. because it is not
    monotonically increasing) is rejected as well.

    Parameters
    ----------
    var : xr.DataArray
      Input array.
    freq : str or sequence of str
      The expected temporal frequencies, using Pandas frequency terminology
      ({'A', 'M', 'D', 'H', 'T', 'S', 'L', 'U'} and multiples thereof).
      To test strictly for 'W', pass '7D' with `strict=True`.
      This ignores the start flag and the anchor (ex: 'AS-JUL' will validate against 'Y').
    strict : bool
      Whether multiples of the frequencies are considered invalid or not. With `strict` set to False,
      a '3H' series will not raise an error if freq is set to 'H'.

    Raises
    ------
    ValidationError
      If the frequency cannot be inferred or does not match `freq`.
    """
    if isinstance(freq, str):
        freq = [freq]
    exp_base = [parse_offset(frq)[1] for frq in freq]

    v_freq = xr.infer_freq(var.time)
    if v_freq is None:
        raise ValidationError(
            "Unable to infer the frequency of the time series. "
            "To mute this, set xclim's option data_validation='log'.")

    v_base = parse_offset(v_freq)[1]
    if v_base in exp_base:
        # The base matches: non-strict mode accepts any multiple of it.
        if not strict:
            return
        # Strict mode additionally needs at least one exact match.
        if not all(compare_offsets(v_freq, "!=", frq) for frq in freq):
            return

    raise ValidationError(
        f"Frequency of time series not {'strictly' if strict else ''} in {freq}. "
        "To mute this, set xclim's option data_validation='log'.")
def at_least_n_valid(da, freq, n=1, src_timestep=None, **indexer):
    """Build an ``AtLeastNValid`` checker for `da` and call it with `n`.

    When `src_timestep` is falsy it is replaced by ``xr.infer_freq(da.time)``.
    Extra keyword arguments are forwarded to the checker's constructor.
    """
    if not src_timestep:
        src_timestep = xr.infer_freq(da.time)
    checker = AtLeastNValid(da, freq, src_timestep, **indexer)
    return checker(n=n)
def missing_pct(da, freq, tolerance, src_timestep=None, **indexer):
    """Build a ``MissingPct`` checker for `da` and call it with `tolerance`.

    When `src_timestep` is falsy it is replaced by ``xr.infer_freq(da.time)``.
    Extra keyword arguments are forwarded to the checker's constructor.
    """
    if not src_timestep:
        src_timestep = xr.infer_freq(da.time)
    checker = MissingPct(da, freq, src_timestep, **indexer)
    return checker(tolerance=tolerance)
def get_wrf_pw_at_dsea_gnss_coord(path=des_path, work_path=work_yuval,
                                  point=None):
    """Extract the WRF PW series at the grid cell nearest to a point.

    Every ``pw_wrfout*.nc`` file found in `path` is loaded, the grid cell
    nearest to the target point is selected, and the per-file selections are
    concatenated along 'Time' and passed through ``get_unique_index``.

    The target point is the 'dsea' station's (lat, lon) read from the station
    table, unless `point` is given, in which case `point` is used instead.
    The inferred time frequency and the selected grid location of each file
    are printed.
    """
    from PW_stations import produce_geo_gnss_solved_stations
    import xarray as xr
    from aux_gps import (
        get_nearest_lat_lon_for_xy,
        path_glob,
        get_unique_index,
    )

    stations = produce_geo_gnss_solved_stations(path=work_path / 'gis',
                                                plot=False)
    dsea_point = stations.loc['dsea'][['lat', 'lon']].astype(float).values

    selections = []
    for wrf_file in path_glob(path, 'pw_wrfout*.nc'):
        pw_all = xr.load_dataset(wrf_file)
        print(xr.infer_freq(pw_all['Time']))
        if point is not None:
            print('looking for {} at wrf.'.format(point))
            dsea_point = point
        loc = get_nearest_lat_lon_for_xy(pw_all['XLAT'], pw_all['XLONG'],
                                         dsea_point)
        print(loc)
        row, col = loc[0][0], loc[0][1]
        selections.append(pw_all.isel(south_north=row, west_east=col))

    combined = xr.concat(selections, 'Time')
    return get_unique_index(combined, dim='Time')
def test_convert_calendar_missing(source, target, freq):
    """Check ``convert_calendar`` with ``missing=np.nan`` against a regular range in the target calendar."""

    def _year_end(cal):
        # The 360_day calendar has no December 31st.
        return "2004-12-31" if cal != "360_day" else "2004-12-30"

    src = DataArray(
        date_range("2004-01-01", _year_end(source), freq=freq, calendar=source),
        dims=("time", ),
        name="time",
    )
    values = np.linspace(0, 1, src.size)
    da_src = DataArray(values, dims=("time", ), coords={"time": src})

    out = convert_calendar(da_src, target, missing=np.nan, align_on="date")
    assert infer_freq(out.time) == freq

    expected = date_range("2004-01-01", _year_end(target), freq=freq, calendar=target)
    np.testing.assert_array_equal(out.time, expected)

    if freq == "M":
        return

    # Steps added by `missing` must be NaN; all others must match the
    # conversion done without filling.
    out_without_missing = convert_calendar(da_src, target, align_on="date")
    filled_mask = ~out.time.isin(out_without_missing.time)
    expected_nan = out.isel(time=filled_mask)
    assert expected_nan.isnull().all()

    expected_not_nan = out.sel(time=out_without_missing.time)
    assert_identical(expected_not_nan, out_without_missing)
def check_daily(var: xr.DataArray):
    """Raise an error if the series is not recognized as daily.

    The check compares ``xr.infer_freq(var.time)`` with "D", so a series whose
    frequency cannot be inferred is rejected too.

    Note that this does not check for gaps in the series.

    Raises
    ------
    ValidationError
      If the inferred frequency is not "D".
    """
    inferred = xr.infer_freq(var.time)
    if inferred == "D":
        return
    raise ValidationError("time series is not recognized as daily.")
def __init__(self, da, freq, src_timestep, **indexer):
    """Resolve the source timestep and store the outputs of ``self.prepare``.

    If `src_timestep` is None it is inferred from ``da.time``; a ValueError is
    raised when the inference fails as well. The two values returned by
    ``self.prepare`` are stored as ``self.null`` and ``self.count``.
    """
    timestep = src_timestep
    if timestep is None:
        timestep = xr.infer_freq(da.time)
    if timestep is None:
        raise ValueError(
            "`src_timestep` must be given as it cannot be inferred.")
    self.null, self.count = self.prepare(da, freq, timestep, **indexer)
def test_infer_freq(freq, calendar):
    """A 3-element ``cftime_range`` built with `freq` must be inferred back as `freq`."""
    index = xr.cftime_range(
        "2000-01-01", periods=3, freq=freq, calendar=calendar
    )
    assert xr.infer_freq(index) == freq
def align_synoptic_class_with_daily_dataset(ds, time_dim='time'):
    """Attach the synoptic classification to a daily dataset.

    Asserts that ``ds[time_dim]`` has an inferred frequency of 'D', then
    assigns the 'class' and 'upper_class' variables of the synoptic
    classification to ``ds`` as 'syn_class' and 'upper_class'. `ds` is
    modified in place and also returned.
    """
    import xarray as xr

    inferred = xr.infer_freq(ds[time_dim])
    assert inferred == 'D'

    syn = read_synoptic_classification(report=False).to_xarray()
    for target, source in (('syn_class', 'class'),
                           ('upper_class', 'upper_class')):
        ds[target] = syn[source]
    return ds
def missing_wmo(da, freq, nm=11, nc=5, src_timestep=None, **indexer):
    """Run ``MissingWMO`` at monthly ("M") frequency, then reduce the result to `freq`.

    When `src_timestep` is falsy it is replaced by ``xr.infer_freq(da.time)``.
    The monthly result is resampled to `freq` with ``any()``.
    """
    if not src_timestep:
        src_timestep = xr.infer_freq(da.time)
    monthly_checker = MissingWMO(da, "M", src_timestep, **indexer)
    monthly = monthly_checker(nm=nm, nc=nc)
    return monthly.resample(time=freq).any()
def date_range_like(source, calendar):
    """Generate a datetime array with the same frequency, start and end as another one, but in a different calendar.

    Parameters
    ----------
    source : xr.DataArray
      1D datetime coordinate DataArray
    calendar : str
      New calendar name.

    Raises
    ------
    ValueError
      If the source's frequency was not found.

    Returns
    -------
    xr.DataArray
      1D datetime coordinate with the same start, end and frequency as the source, but in the new calendar.
      The start date is assumed to exist in the target calendar.
      If the end date doesn't exist, the code tries 1 and 2 calendar days before.
      Exception when the source is in 360_day and the end of the range is the 30th of a 31-days month,
      then the 31st is appended to the range.
      If the source is already in the requested calendar, it is returned as is.
    """
    freq = xr.infer_freq(source)
    if freq is None:
        raise ValueError(
            "`date_range_like` was unable to generate a range as the source frequency was not inferrable."
        )

    src_cal = get_calendar(source)
    if src_cal == calendar:
        return source

    index = source.indexes[source.dims[0]]
    last_src = index[-1]

    end = _convert_datetime(last_src, calendar=calendar)
    # An invalid day (end of month) converts to NaN: step back one day, then
    # two (360_day to non-leap February) until the date exists.
    for days_back in (1, 2):
        if end is not np.nan:
            break
        earlier = last_src.replace(day=last_src.day - days_back)
        end = _convert_datetime(earlier, calendar=calendar)

    if src_cal == "360_day" and last_src.day == 30 and end.daysinmonth == 31:
        # For the specific case of daily data from 360_day source, the last day is expected to be "missing"
        end = end.replace(day=31)

    start = _convert_datetime(index[0], calendar=calendar)
    new_range = date_range(start, end, freq=freq, calendar=calendar)
    return xr.DataArray(new_range, dims=source.dims, name=source.dims[0])
def check_daily(var):
    """Validate that the series is daily and does not go backward in time.

    Raises
    ------
    ValidationError
      If the frequency inferred from ``var.time`` is not "D", or if the time
      index is not monotonically increasing.
    """
    inferred = xr.infer_freq(var.time.to_pandas())
    if inferred != "D":
        raise ValidationError("time series is not recognized as daily.")

    # Check that the series does not go backward in time
    time_index = var.indexes["time"]
    if not time_index.is_monotonic_increasing:
        raise ValidationError("time index is not monotonically increasing.")
def missing_wmo(da, freq, nm=11, nc=5, src_timestep=None, **indexer):
    """Call ``MissingWMO.execute`` on `da`, inferring the source timestep if it is not given.

    When `src_timestep` is falsy it is replaced by ``xr.infer_freq(da.time)``.
    `nm` and `nc` are passed through the `options` dict and the extra keyword
    arguments through the `indexer` mapping.
    """
    if not src_timestep:
        src_timestep = xr.infer_freq(da.time)
    options = dict(nm=nm, nc=nc)
    return MissingWMO.execute(da, freq, src_timestep, options=options, indexer=indexer)
def _to_quarter(
    pr: Optional[xarray.DataArray] = None,
    tas: Optional[xarray.DataArray] = None,
) -> xarray.DataArray:
    """Convert daily, weekly or monthly time series to quarterly time series according to ANUCLIM specifications.

    Exactly one of `pr` or `tas` must be given. Daily input is first
    aggregated to 7-day periods; the result is then a rolling aggregate over
    13 weekly steps or 3 monthly steps (mean for `tas`, sum of amounts for `pr`).

    Parameters
    ----------
    pr : xarray.DataArray, optional
      Precipitation series.
    tas : xarray.DataArray, optional
      Temperature series.

    Returns
    -------
    xarray.DataArray
      Rolling quarterly aggregate, rechunked with ``time=-1``.

    Raises
    ------
    ValueError
      If both or neither of `pr` and `tas` are given, or if the sampling
      frequency cannot be inferred.
    NotImplementedError
      If the sampling frequency is not daily, weekly or monthly.
    """
    if tas is not None and pr is not None:
        raise ValueError(
            "Supply only one variable, 'tas' (exclusive) or 'pr'.")
    # Checked before touching `.time`; otherwise the missing-input case fails
    # with an AttributeError on None instead of this message.
    if tas is None and pr is None:
        raise ValueError("No variables supplied.")

    freq = xarray.infer_freq((tas if tas is not None else pr).time)
    if freq is None:
        raise ValueError("Can't infer sampling frequency of the input data.")

    if freq.upper().startswith("D"):
        if tas is not None:
            tas = tg_mean(tas, freq="7D")
        if pr is not None:
            # Accumulate on a week
            # Ensure units are back to a "rate" for rate2amount below
            pr = convert_units_to(precip_accumulation(pr, freq="7D"), "mm")
            pr.attrs["units"] = "mm/week"
        freq = "W"

    if freq.upper().startswith("W"):
        window = 13
    elif freq.upper().startswith("M"):
        window = 3
    else:
        raise NotImplementedError(
            f'Unknown input time frequency "{freq}": must be one of "D", "W" or "M".'
        )

    if tas is not None:
        tas = ensure_chunk_size(tas, time=np.ceil(window / 2))
        out = tas.rolling(time=window, center=False).mean(skipna=False)
        out.attrs = tas.attrs
    else:
        pr = ensure_chunk_size(pr, time=np.ceil(window / 2))
        pram = rate2amount(pr)
        out = pram.rolling(time=window, center=False).sum()
        out.attrs = pr.attrs
        out.attrs["units"] = pram.units

    out = ensure_chunk_size(out, time=-1)
    return out
def water_budget(
    pr: xarray.DataArray,
    tasmin: Optional[xarray.DataArray] = None,
    tasmax: Optional[xarray.DataArray] = None,
    tas: Optional[xarray.DataArray] = None,
    method: str = "BR65",
) -> xarray.DataArray:
    r"""Precipitation minus potential evapotranspiration.

    Precipitation minus potential evapotranspiration as a measure of an approximated surface water budget,
    where the potential evapotranspiration is calculated with a given method.

    Parameters
    ----------
    pr : xarray.DataArray
      Daily precipitation.
    tasmin : xarray.DataArray
      Minimum daily temperature.
    tasmax : xarray.DataArray
      Maximum daily temperature.
    tas : xarray.DataArray
      Mean daily temperature.
    method : str
      Method to use to calculate the potential evapotranspiration.

    Notes
    -----
    Available methods are listed in the description of xclim.indicators.atmos.potential_evapotranspiration.
    When the computed evapotranspiration has an inferred frequency of "MS", precipitation is
    averaged to the same monthly frequency before the subtraction.

    Returns
    -------
    xarray.DataArray,
      Precipitation minus potential evapotranspiration.
    """
    pr = convert_units_to(pr, "kg m-2 s-1")
    pet = xci.potential_evapotranspiration(
        tasmin=tasmin, tasmax=tasmax, tas=tas, method=method
    )

    pet_is_monthly = xarray.infer_freq(pet.time) == "MS"
    if pet_is_monthly:
        with xarray.set_options(keep_attrs=True):
            pr = pr.resample(time="MS").mean(dim="time")

    budget = pr - pet
    budget.attrs["units"] = pr.attrs["units"]
    return budget
def infer_sampling_units(da: xr.DataArray,
                         deffreq: str = "D") -> Tuple[int, str]:
    """Infer a multiplicator and the units corresponding to one sampling period.

    The frequency is inferred from ``da.time``; if `xr.infer_freq` fails,
    `deffreq` is used instead.

    Raises
    ------
    ValueError
      If the base frequency has no entry in ``FREQ_UNITS``.
    """
    inferred = xr.infer_freq(da.time)
    freq = deffreq if inferred is None else inferred
    multi, base, _ = parse_offset(freq)
    try:
        # An empty multiplier means a single period.
        return int(multi or "1"), FREQ_UNITS[base]
    except KeyError:
        raise ValueError(
            f"Sampling frequency {freq} has no corresponding units.")
def test_infer_freq_valid_types():
    """``xr.infer_freq`` accepts cftime, datetime and timedelta indexes, bare or wrapped in a DataArray."""
    daily_indexes = (
        xr.cftime_range("2000-01-01", periods=3, freq="D"),
        pd.date_range("2000-01-01", periods=3, freq="D"),
        pd.timedelta_range(start="1D", periods=3, freq="D"),
    )
    for index in daily_indexes:
        assert xr.infer_freq(index) == "D"
        assert xr.infer_freq(xr.DataArray(index)) == "D"
def test_convert_calendar_missing(source, target, freq):
    """Check that ``convert_calendar`` with ``missing=np.nan`` keeps the source frequency."""
    last_day = "2004-12-31" if source != "360_day" else "2004-12-30"
    times = date_range("2004-01-01", last_day, freq=freq, calendar=source)
    src = xr.DataArray(times, dims=("time",), name="time")

    values = np.linspace(0, 1, src.size)
    da_src = xr.DataArray(values, dims=("time",), coords={"time": src})

    out = convert_calendar(da_src, target, missing=np.nan, align_on="date")
    assert xr.infer_freq(out.time) == freq

    if source == "360_day":
        # The source stops on December 30th; the converted axis must reach the 31st.
        assert out.time[-1].dt.day == 31
def infer_sampling_units(
    da: xr.DataArray,
    deffreq: Optional[str] = "D",
    dim: str = "time",
) -> Tuple[int, str]:
    """Infer a multiplicator and the units corresponding to one sampling period.

    Parameters
    ----------
    da : xr.DataArray
      A DataArray from which to take coordinate `dim`.
    deffreq : str
      If no frequency is inferred from `da[dim]`, take this one.
    dim : str
      Dimension from which to infer the frequency.

    Raises
    ------
    ValueError
      If the frequency has no exact corresponding units.

    Returns
    -------
    m : int
      The magnitude (number of base periods per period)
    u : str
      Units as a string, understandable by pint.
    """
    inferred = xr.infer_freq(getattr(da, dim))
    freq = deffreq if inferred is None else inferred

    multi, base, _, _ = parse_offset(freq)
    try:
        unit = FREQ_UNITS[base]
    except KeyError:
        raise ValueError(
            f"Sampling frequency {freq} has no corresponding units.")

    if (multi, unit) == (7, "d"):
        # Special case for weekly frequency. xarray's CFTimeOffsets do not have "W".
        return 1, "week"
    return multi, unit
def check_freq(var: xr.DataArray, freq: str, strict: bool = True):
    """Raise an error if the series does not have the expected temporal frequency.

    A series whose frequency cannot be inferred (e.g. because it is not
    monotonically increasing) is rejected as well.

    Parameters
    ----------
    var : xr.DataArray
      Input array.
    freq : str
      The temporal frequency defined using the Pandas frequency strings, e.g. 'A', 'M', 'D', 'H', 'T',
      'S'. Note that a 3-hourly time series is declared as '3H'.
    strict : bool
      Whether or not multiples of the frequency are considered invalid. With `strict` set to False, a '3H' series
      will not raise an error if freq is set to 'H'.

    Raises
    ------
    ValidationError
      If the inferred frequency is not `freq` (or, when `strict` is False,
      does not contain `freq`), or if no frequency can be inferred.
    """
    v_freq = xr.infer_freq(var.time)
    if v_freq == freq:
        return

    # `infer_freq` yields None when no frequency is found; guard the
    # substring test so that case is reported as a ValidationError.
    if not strict and v_freq is not None and freq in v_freq:
        return

    raise ValidationError(
        f"Time series has temporal frequency `{v_freq}`, expected `{freq}`."
    )
def _get_number_of_elements_by_year(time):
    """Get the number of elements in time in a year by inferring its sampling frequency.

    Only calendars with uniform year lengths are supported : 360_day, noleap, all_leap.

    Raises
    ------
    ValueError
      If the calendar is one of the standard (non-uniform) calendars, or if
      the sampling period does not evenly divide a year.
    """
    cal = get_calendar(time)

    # Calendar check
    non_uniform = ["standard", "gregorian", "default", "proleptic_gregorian"]
    if cal in non_uniform:
        raise ValueError(
            "For moving window computations, the data must have a uniform calendar (360_day, no_leap or all_leap)"
        )

    mult, freq, _, _ = parse_offset(xr.infer_freq(time))
    days_in_year = max_doy[cal]
    per_year = {
        "Q": 4,
        "M": 12,
        "D": days_in_year,
        "H": days_in_year * 24,
    }
    # Base frequencies absent from the table count as one element per year.
    n_in_year = per_year.get(freq, 1) / mult
    if n_in_year % 1 != 0:
        raise ValueError(
            f"Sampling frequency of the data must be Q, M, D or H and evenly divide a year (got {mult}{freq})."
        )
    return int(n_in_year)
def _rate_and_amount_converter(da: xr.DataArray,
                               dim: str = "time",
                               to: str = "amount",
                               out_units: str = None) -> xr.DataArray:
    """Private function performing the actual conversion for :py:func:`rate2amount` and :py:func:`amount2rate`.

    Parameters
    ----------
    da : xr.DataArray
      Input data; its `units` attribute is read on the variable-period path.
    dim : str
      Name of the sampling dimension.
    to : {'amount', 'rate'}
      Direction of the conversion: multiply by ('amount') or divide by
      ('rate') the sampling period.
    out_units : str, optional
      If given, the result is converted to these units.

    Raises
    ------
    ValueError
      If `to` is neither 'amount' nor 'rate'.
    """
    # Fail fast, before any frequency inference or arithmetic.
    if to not in ("amount", "rate"):
        raise ValueError("Argument `to` must be one of 'amount' or 'rate'.")

    m = 1
    u = None  # Default to assume a non-uniform axis
    label = "lower"
    time = da[dim]

    try:
        freq = xr.infer_freq(da[dim])
    except ValueError:
        freq = None

    if freq is not None:
        multi, base, start_anchor, _ = parse_offset(freq)
        if base in ["M", "Q", "A"]:
            start = time.indexes[dim][0]
            if not start_anchor:
                # Anchor is on the end of the period, subtract 1 period.
                start = start - xr.coding.cftime_offsets.to_offset(freq)
                # In the diff below, assign to upper label!
                label = "upper"
            # We generate "time" with an extra element, so we do not need to repeat the last element below.
            time = xr.DataArray(
                date_range(start,
                           periods=len(time) + 1,
                           freq=freq,
                           calendar=get_calendar(time)),
                dims=(dim, ),
                name=dim,
                attrs=da[dim].attrs,
            )
        else:
            m, u = multi, FREQ_UNITS[base]

    # Freq is month, season or year, which are not constant units, or simply freq is not inferrable.
    if u is None:
        # Get sampling period lengths in nanoseconds
        # In the case with no freq, last period as the same length as the one before.
        # In the case with freq in M, Q, A, this has been dealt with above in `time`
        # and `label` has been updated accordingly.
        dt = (time.diff(dim, label=label).reindex({
            dim: da[dim]
        }, method="ffill").astype(float))
        dt = dt / 1e9  # Convert to seconds

        if to == "amount":
            tu = (str2pint(da.units) * str2pint("s")).to_reduced_units()
            out = da * dt * tu.m
        else:
            tu = (str2pint(da.units) / str2pint("s")).to_reduced_units()
            out = (da / dt) * tu.m

        out.attrs["units"] = pint2cfunits(tu)
    else:
        q = units.Quantity(m, u)
        if to == "amount":
            out = pint_multiply(da, q)
        else:
            out = pint_multiply(da, 1 / q)

    if out_units:
        out = convert_units_to(out, out_units)

    return out
def visualize_synoptic_class_on_time_series(da_ts, path=climate_path, ax=None, leg_ncol=1, add_mm=False, leg_loc=1, second_da_ts=None, twin=None):
    """Plot a daily time series and mark each day with its synoptic class.

    The series is drawn as a line; then, for every synoptic class present in
    the period, the matching days are over-plotted as colored markers whose
    legend label holds the class abbreviation and its number of days.

    Parameters
    ----------
    da_ts : xarray object
        Daily series to plot. If it is an ``xr.Dataset`` each variable is
        drawn (up to three line styles are defined) and the last variable is
        used for the remaining steps.
    path : path-like
        Passed to ``read_synoptic_classification``.
    ax : matplotlib axes, optional
        Axes to draw on; a new figure is created when None.
    leg_ncol, leg_loc
        Passed to ``ax.legend`` as `ncol` and `loc`.
    add_mm : bool
        If True, also draw the output of
        ``replace_time_series_with_its_group(da_ts, grp='month')``.
    second_da_ts : xarray object, optional
        Second series; it is plotted with a dashed line and its correlation
        with `da_ts` (overall and for October only) is written on the axes.
    twin : sequence, optional
        y-limits applied to the twin axis when one is created.

    Returns
    -------
    ax : the matplotlib axes that were drawn on.

    NOTE(review): the block nesting below was reconstructed from a source
    without indentation - confirm against the original file.
    """
    import xarray as xr
    import matplotlib.pyplot as plt
    from aux_gps import replace_time_series_with_its_group
    # The set of dims has no guaranteed order; presumably da_ts is 1-D - TODO confirm.
    time_dim = list(set(da_ts.dims))[0]
    assert xr.infer_freq(da_ts[time_dim]) == 'D'
    if ax is None:
        fig, ax = plt.subplots()
    # also calc the monthly means:
    if add_mm:
        da_ts_mm = replace_time_series_with_its_group(da_ts, grp='month')
        da_ts_mm.plot.line('k-.', ax=ax)
    if isinstance(da_ts, xr.Dataset):
        styles = ['r-', 'g-', 'b-']
        lns = []
        for i, st in enumerate(da_ts):
            lbl = st.upper()
            ln = da_ts[st].plot.line(styles[i], lw=2, ax=ax, zorder=20, label=lbl)
            lns.append(ln)
        # Continue with the last variable of the dataset only.
        da_ts = da_ts[st]
    else:
        # plot daily values:
        da_ts.plot.line('k-', lw=2, ax=ax, zorder=20)
    if second_da_ts is not None:
        # record the corr between second_da_ts and da_ts:
        corr_all = xr.corr(da_ts, second_da_ts).item()
        corr_oct = xr.corr(
            da_ts.sel(time=da_ts['time.month'] == 10),
            second_da_ts.sel(time=second_da_ts['time.month'] == 10)).item()
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        textstr = '\n'.join([
            'r_all = {:.2f}'.format(corr_all),
            'r_just_Oct = {:.2f}'.format(corr_oct)
        ])
        # textstr = 'r_all = {:.2f}'.format(corr)
        # place a text box in upper left in axes coords
        ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=14, verticalalignment='top', bbox=props)
        # Share the axis when both series declare the same units; otherwise
        # (or when a 'units' attribute is missing) draw on a twin y-axis.
        try:
            if second_da_ts.attrs['units'] == da_ts.attrs['units']:
                second_da_ts.plot.line('k--', lw=2, ax=ax, marker='o')
            else:
                twinx = ax.twinx()
                second_da_ts.plot.line('k--', lw=2, ax=twinx, marker='o')
                if twin is not None:
                    twinx.set_ylim(*twin)
        except KeyError:
            twinx = ax.twinx()
            second_da_ts.plot.line('k--', lw=2, ax=twinx, marker='o')
            if twin is not None:
                twinx.set_ylim(*twin)
    # ymin, ymax = ax.get_ylim()
    df = read_synoptic_classification(path, report=False)
    ind = da_ts.to_dataframe().index
    da_ts = align_synoptic_class_with_daily_dataset(da_ts)
    # Keep only the classification rows matching the plotted dates.
    df = df.loc[ind]
    color_dict, edge_dict = choose_color_for_synoptic_classification()
    # df['color'] = df['class'].map(color_dict)
    # monthly count of synoptics:
    month_counts = agg_month_count_syn_class(freq=False)
    min_year = da_ts[time_dim].min().dt.year.item()
    min_month = da_ts[time_dim].min().dt.month.item()
    max_year = da_ts[time_dim].max().dt.year.item()
    max_month = da_ts[time_dim].max().dt.month.item()
    min_dt = '{}-{}'.format(min_year, min_month)
    max_dt = '{}-{}'.format(max_year, max_month)
    # NOTE(review): month_counts is not read again below (only in a commented-out line).
    month_counts = month_counts.sel(time=slice(min_dt, max_dt))
    # alternative count since we need not just monthly but by time slice:
    grp_dict = df.groupby('class').groups
    for key_class, key_ind in grp_dict.items():
        color = color_dict[key_class]
        edge_color = edge_dict[key_class]
        abbr = add_class_abbr(key_class)
        # abbr_count = month_counts.sel(syn_cls=key_class).sum().item()
        abbr_count = df[df['class'] == key_class].count().values[0]
        abbr_label = r'${{{}}}$: {}'.format(abbr, int(abbr_count))
        # for ind, row in df.iterrows():
        # Markers only (lw=0) for the days belonging to this class.
        da_ts[da_ts['syn_class'] == key_class].plot.line(
            'k-', lw=0, ax=ax, marker='o', markersize=20, markerfacecolor=color, markeredgewidth=2, markeredgecolor=edge_color, label=abbr_label)
        # ax.vlines(key_ind, 0, 80, colors=color, alpha=0.4, lw=10,
        # label=abbr_label)
    ax.legend(ncol=leg_ncol, labelspacing=1.5, fontsize=12, loc=leg_loc)
    ax.grid()
    return ax
def freq(self):
    """Return the frequency inferred from ``self._obj``, or None when the object is not temporal."""
    return xr.infer_freq(self._obj) if self.is_temporal else None
def missing_any(da, freq, src_timestep=None, **indexer):
    """Build a ``MissingAny`` checker for `da` and call it without arguments.

    When `src_timestep` is falsy it is replaced by ``xr.infer_freq(da.time)``.
    Extra keyword arguments are forwarded to the checker's constructor.
    """
    if not src_timestep:
        src_timestep = xr.infer_freq(da.time)
    checker = MissingAny(da, freq, src_timestep, **indexer)
    return checker()
def aggregate_between_dates(
    data: xr.DataArray,
    start: Union[xr.DataArray, DayOfYearStr],
    end: Union[xr.DataArray, DayOfYearStr],
    op: str = "sum",
    freq: Optional[str] = None,
) -> xr.DataArray:
    """Aggregate the data over a period between start and end dates and apply the operator on the aggregated data.

    Parameters
    ----------
    data : xr.DataArray
      Data to aggregate between start and end dates.
    start : xr.DataArray or DayOfYearStr
      Start dates (as day-of-year) for the aggregation periods.
    end : xr.DataArray or DayOfYearStr
      End (as day-of-year) dates for the aggregation periods.
    op : {'min', 'max', 'sum', 'mean', 'std'}
      Operator.
    freq : str
      Resampling frequency. When None, it is inferred from the time
      coordinate of `start` and/or `end`.

    Raises
    ------
    ValueError
      If `freq` is None and the bounds yield no frequency or different frequencies.

    Returns
    -------
    xarray.DataArray, [dimensionless]
      Aggregated data between the start and end dates. If the end date is before the start date, returns np.nan.
      If there is no start and/or end date, returns np.nan.
    """

    def _get_days(_bound, _group, _base_time):
        """Get bound in number of days since base_time. Bound can be a days_since array or a DayOfYearStr."""
        if isinstance(_bound, str):
            b_i = rl.index_of_date(_group.time, _bound, max_idxs=1)  # noqa
            if not len(b_i):
                return None
            first_time = _group.time.isel(time=0)
            return (_group.time.isel(time=b_i[0]) - first_time).dt.days

        if _base_time in _bound.time:
            return _bound.sel(time=_base_time)
        return None

    def _as_days_since(_bound):
        """Convert an array bound to `data`'s calendar and to days-since; strings pass through."""
        if isinstance(_bound, str):
            return _bound
        converted = convert_calendar(_bound, cal)
        converted.attrs["calendar"] = cal
        return doy_to_days_since(converted)

    if freq is None:
        frequencies = []
        for bound in (start, end):
            # String bounds carry no time coordinate and therefore no frequency.
            try:
                inferred = xr.infer_freq(bound.time)
            except AttributeError:
                inferred = None
            frequencies.append(inferred)

        good_freq = set(frequencies) - {None}
        if len(good_freq) != 1:
            raise ValueError(
                f"Non-inferrable resampling frequency or inconsistent frequencies. Got start, end = {frequencies}."
                " Please consider providing `freq` manually.")
        freq = good_freq.pop()

    cal = get_calendar(data, dim="time")
    start = _as_days_since(start)
    end = _as_days_since(end)

    pieces = []
    for base_time, indexes in data.resample(time=freq).groups.items():
        # get group slice
        group = data.isel(time=indexes)

        start_d = _get_days(start, group, base_time)
        end_d = _get_days(end, group, base_time)

        if start_d is None or end_d is None:
            # Get an array with the good shape, put nans and add the new time.
            blank = (group.isel(time=0) * np.nan).expand_dims(time=[base_time])
            pieces.append(blank)
            continue

        # convert bounds for this group
        days = (group.time - base_time).dt.days
        days[days < 0] = np.nan

        in_window = (days >= start_d) & (days <= end_d - 1)
        masked = group.where(in_window)
        res = getattr(masked, op)(dim="time", skipna=True)

        invalid = (start_d > end_d) | (start_d.isnull()) | (end_d.isnull())
        res = xr.where(invalid, np.nan, res)

        # Re-add the time dimension with the period's base time.
        pieces.append(res.expand_dims(time=[base_time]))

    return xr.concat(pieces, dim="time")
def open_dataset(
    infiles,
    file_format=None,
    chunks=None,
    metadata_file=None,
    variables=[],
    spatial_coords=None,
    shapefile=None,
    shapefile_label_header=None,
    shape_overlap=None,
    combine_shapes=False,
    spatial_agg="none",
    lat_dim="lat",
    lon_dim="lon",
    standard_calendar=False,
    no_leap_days=False,
    rolling_sum_window=None,
    time_freq=None,
    time_agg=None,
    month=None,
    reset_times=False,
    complete_time_agg_periods=False,
    input_freq=None,
    time_dim="time",
    isel={},
    sel={},
    scale_factors={},
    units={},
    units_timing="end",
):
    """Create an xarray Dataset from one or more data files.

    Parameters
    ----------
    infiles : str or list
        Input file path/s
    file_format : str, optional
        Formats/engines accepted by xarray.open_dataset (e.g. netcdf4, zarr, cfgrid).
        Estimated if not provided.
    chunks : dict, optional
        Chunks for xarray.open_mfdataset
    metadata_file : str
        YAML file path specifying required file metadata changes
    variables : list, optional
        Subset of variables of interest
    spatial_coords : list, optional
        Coordinates for spatial point or box selection.
        List of length 2 [lat, lon], 4 [south bound, north bound, east bound, west bound]
    shapefile : str, optional
        Shapefile for spatial subseting
    shapefile_label_header : str
        Name of the shapefile column containing the region names
    shape_overlap : float, optional
        Fraction that a grid cell must overlap with a shape to be included.
        If no fraction is provided, grid cells are selected if their centre
        point falls within the shape.
    combine_shapes : bool, default False
        Add a region that combines all shapes
    spatial_agg : {'mean', 'sum', 'weighted_mean'}, optional
        Spatial aggregation method
    lat_dim: str, default 'lat'
        Name of the latitude dimension in infiles
    lon_dim: str, default 'lon'
        Name of the longitude dimension in infiles
    no_leap_days : bool, default False
        Remove leap days from data
    rolling_sum_window : int, default None
        Apply a rolling sum with this window width
    time_freq : {'A-DEC', 'M', 'Q-NOV', 'A-NOV', 'A-AUG'}, optional
        Target temporal frequency for resampling
    time_agg : {'mean', 'sum', 'min', 'max'}, optional
        Temporal aggregation method
    standard_calendar : bool, default False
        Force a common calendar on all input files
    month : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, optional
        Select a single month from the dataset
    reset_times : bool, default False
        Shift time values after resampling so months match initial date
    complete_time_agg_periods : bool default False
        Limit temporal aggregation output to complete years/months
    input_freq : {'A', 'Q', 'M', 'D'}, optional
        Input time frequency for resampling (estimated if not provided)
    time_dim: str, default 'time'
        Name of the time dimension in infiles
    isel : dict, optional
        Selection using xarray.Dataset.isel
    sel : dict, optional
        Selection using xarray.Dataset.sel
    scale_factors : dict, optional
        Divide input data by this value.
        Variable/s (keys) and scale factor (values).
        Scale factors can be a float or "days_in_month"
    units : dict, optional
        Variable/s (keys) and desired units (values)
    units_timing : str, {'start', 'middle', 'end'}, default 'end'
        When to perform the unit conversions in units
        Middle is after the spatial aggregation but before the temporal aggregation

    Returns
    -------
    ds : xarray Dataset

    Raises
    ------
    ValueError
        If `spatial_coords` is not None and does not have 2 or 4 elements, or
        if `input_freq` is not given and cannot be inferred from the first
        time steps.
    AssertionError
        If `time_freq` is given without `time_agg` or `variables`.
    """
    preprocess = time_utils.switch_calendar if standard_calendar else None
    engine = file_format if file_format else _guess_file_format(infiles)
    ds = xr.open_mfdataset(infiles, engine=engine, preprocess=preprocess, use_cftime=True, chunks=chunks)

    # Metadata
    if metadata_file:
        ds = _fix_metadata(ds, metadata_file)

    # Variable selection
    if variables:
        ds = ds[variables]

    # General selection/subsetting
    if isel:
        ds = ds.isel(isel)
    if sel:
        ds = ds.sel(sel)
    if month:
        ds = time_utils.select_month(ds, month, init_month=reset_times, time_dim=time_dim)

    # Scale factors
    if scale_factors:
        with xr.set_options(keep_attrs=True):
            for var, scale_factor in scale_factors.items():
                if scale_factor == "days_in_month":
                    ds[var] = ds[var] / ds[time_dim].dt.days_in_month
                else:
                    ds[var] = ds[var] / scale_factor

    # Unit conversion (at start)
    if units and (units_timing == "start"):
        for var, target_units in units.items():
            ds[var] = general_utils.convert_units(ds[var], target_units)

    # Spatial subsetting and aggregation
    # With a shapefile, aggregation is deferred to the shapefile selection.
    spatial_coord_agg = "none" if shapefile else spatial_agg
    if spatial_coords is None:
        pass
    elif len(spatial_coords) == 4:
        ds = spatial_selection.select_box_region(
            ds,
            spatial_coords,
            agg=spatial_coord_agg,
            lat_dim=lat_dim,
            lon_dim=lon_dim,
        )
    elif len(spatial_coords) == 2:
        ds = spatial_selection.select_point_region(ds, spatial_coords, lat_dim=lat_dim, lon_dim=lon_dim)
    else:
        msg = "coordinate selection must be None, a box (list of 4 floats) or a point (list of 2 floats)"
        raise ValueError(msg)

    if shapefile:
        shapes = gp.read_file(shapefile)
        ds = spatial_selection.select_shapefile_regions(
            ds,
            shapes,
            agg=spatial_agg,
            overlap_fraction=shape_overlap,
            header=shapefile_label_header,
            combine_shapes=combine_shapes,
            lat_dim=lat_dim,
            lon_dim=lon_dim,
        )

    # Unit conversion (at middle)
    if units and (units_timing == "middle"):
        for var, target_units in units.items():
            ds[var] = general_utils.convert_units(ds[var], target_units)

    # Temporal aggregation
    if no_leap_days:
        # Index by `time_dim` (not a hard-coded "time") so that datasets with
        # a differently named time dimension are handled too.
        is_leap_day = (ds[time_dim].dt.month == 2) & (ds[time_dim].dt.day == 29)
        ds = ds.sel({time_dim: ~is_leap_day})
    if rolling_sum_window:
        ds = ds.rolling({time_dim: rolling_sum_window}).sum()
    if time_freq:
        assert time_agg, "Provide a time_agg"
        assert variables, "Variables argument is required for temporal aggregation"
        if not input_freq:
            # Only the first three time steps are used for the inference.
            inferred_freq = xr.infer_freq(ds.indexes[time_dim][0:3])
            if inferred_freq is None:
                raise ValueError(
                    "Could not infer the input time frequency; please provide input_freq"
                )
            # Keep the first character of the inferred frequency string.
            input_freq = inferred_freq[0]
        ds = time_utils.temporal_aggregation(
            ds,
            time_freq,
            input_freq,
            time_agg,
            variables,
            reset_times=reset_times,
            complete=complete_time_agg_periods,
        )

    output_freq = time_freq[0] if time_freq else input_freq
    if output_freq:
        ds[time_dim].attrs["frequency"] = output_freq

    # Unit conversion (at end)
    if units and (units_timing == "end"):
        for var, target_units in units.items():
            ds[var] = general_utils.convert_units(ds[var], target_units)

    assert type(ds) == xr.core.dataset.Dataset
    ds = ds.squeeze(drop=True)

    return ds
def my_shift(init, lead):
    """Shift CFTimeIndex init by amount lead in units lead_unit.

    `init` may be an index or an ``xr.DataArray`` (converted with
    ``to_index``). The lead units are read from ``lead.attrs["units"]``.

    For 360-day calendars, leads in years, seasons and months are first
    converted to days (360, 90 and 30 days respectively). Remaining leads in
    years, seasons or months are applied with ``init.shift`` using a monthly
    frequency string; all other leads are added as a ``pd.Timedelta``.

    Raises
    ------
    CoordinateError
        If the lead is in years, seasons or months, the calendar is not a
        360-day one, and the lead is not an integer.
    ValueError
        If ``month_anchor_check(init)`` returns None for a lead in years,
        seasons or months.

    NOTE(review): `lead_unit` is only assigned when `lead` is an
    ``xr.DataArray``; any other input would fail with a NameError below -
    confirm callers always pass a DataArray.
    """
    if isinstance(init, xr.DataArray):
        init = init.to_index()
    init_calendar = init.calendar
    if isinstance(lead, xr.DataArray):
        lead_unit = lead.attrs["units"]
        lead = lead.values

    if lead_unit in ["years", "seasons", "months"] and "360" not in init_calendar:
        if int(lead) != float(lead):
            raise CoordinateError(
                f'Require integer leads if lead.attrs["units"]="{lead_unit}" in '
                f'["years", "seasons", "months"] and calendar="{init_calendar}" '
                'not "360_day".')
        lead = int(lead)

    if "360" in init_calendar:  # use pd.Timedelta
        if lead_unit == "years":
            lead = lead * 360
            lead_unit = "D"
        elif lead_unit == "seasons":
            lead = lead * 90
            lead_unit = "D"
        elif lead_unit == "months":
            lead_unit = "D"
            lead = lead * 30

    if lead_unit in ["years", "seasons", "months"]:
        # use init_freq reconstructed from anchor and lead unit
        from xarray.coding.frequencies import month_anchor_check

        anchor_check = month_anchor_check(init)  # returns None, ce or cs
        if anchor_check is not None:
            # First letter of the unit: "Y", "S" or "M" at this point.
            lead_freq_string = lead_unit[0].upper()
            # go down to monthly freq
            if lead_freq_string == "Y":
                lead_freq_string = "12M"
            elif lead_freq_string == "S":
                lead_freq_string = "3M"
            anchor = anchor_check[-1].upper()  # S/E for start/end of month
            if anchor == "E":
                anchor = ""
            lead_freq = f"{lead_freq_string}{anchor}"
            # NOTE(review): lead_freq_string is "12M", "3M" or "M" here, so
            # this branch looks unreachable - confirm before relying on it.
            if lead_freq_string in ["A", "Q"]:  # add month info again
                init_freq = xr.infer_freq(init)
                if init_freq:
                    if "-" in init_freq:
                        lead_freq = lead_freq + "-" + init_freq.split("-")[-1]
        else:
            raise ValueError(
                f"could not shift init={init} in calendar={init_calendar} by "
                f" lead={lead} {lead_unit}")
        return init.shift(lead, lead_freq)
    else:  # lower freq
        # reducing pentads, weeks (W) to days
        if lead_unit == "weeks":
            lead_unit = "W"
        elif lead_unit == "pentads":
            lead = lead * 5
            lead_unit = "D"
        return init + pd.Timedelta(float(lead), lead_unit)