def test_file_pattern_concat_merge():
    concat = ConcatDim(name="time", keys=list(range(3)))
    merge = MergeDim(name="variable", keys=["foo", "bar"])

    def format_function(time, variable):
        return f"T_{time}_V_{variable}"

    fp = FilePattern(format_function, merge, concat)
    assert fp.dims == {"variable": 2, "time": 3}
    assert fp.shape == (2, 3)
    assert fp.merge_dims == ["variable"]
    assert fp.concat_dims == ["time"]
    assert fp.nitems_per_input == {"time": None}
    assert fp.concat_sequence_lens == {"time": None}
    expected_keys = [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]
    assert list(fp) == expected_keys
    fnames = []
    for key in expected_keys:
        fname = format_function(variable=merge.keys[key[0]], time=concat.keys[key[1]])
        assert fp[key] == fname
        fnames.append(fname)
    assert list(fp.items()) == list(zip(expected_keys, fnames))
def test_file_pattern_concat():
    concat = ConcatDim(name="time", keys=list(range(3)))

    def format_function(time):
        return f"T_{time}"

    fp = FilePattern(format_function, concat)
    assert fp.dims == {"time": 3}
    assert fp.shape == (3,)
    assert fp.merge_dims == []
    assert fp.concat_dims == ["time"]
    assert fp.nitems_per_input == {"time": None}
    assert fp.concat_sequence_lens == {"time": None}
    expected_keys = [(0,), (1,), (2,)]
    assert list(fp) == expected_keys
    for key in expected_keys:
        assert fp[key] == format_function(key[0])
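# Hedged sketch, not part of the original test suite: ConcatDim also takes a
# third argument, nitems_per_file, which the fixtures below pass positionally.
# Assuming it feeds the nitems_per_input / concat_sequence_lens properties
# checked above, supplying it should replace the None values with concrete
# lengths, roughly as follows.
def sketch_file_pattern_concat_with_nitems():
    concat = ConcatDim(name="time", keys=list(range(3)), nitems_per_file=24)
    fp = FilePattern(lambda time: f"T_{time}", concat)
    # Expected under the stated assumption: each input contributes 24 items,
    # so the full concat sequence spans 24 * 3 items.
    assert fp.nitems_per_input == {"time": 24}
    assert fp.concat_sequence_lens == {"time": 72}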
def multifile_pattern(self, time_step: int = 479, longitude_step: int = 47) -> FilePattern:
    """Produces a FilePattern over temporary NetCDF files of test data."""
    time_dim = ConcatDim('time', list(range(0, 360 * 4, time_step)))
    longitude_dim = ConcatDim('longitude', list(range(0, 144, longitude_step)))

    with tempfile.TemporaryDirectory() as tmpdir:

        def make_path(time: int, longitude: int) -> str:
            return f'{tmpdir}/era5-{time}-{longitude}.nc'

        for time in time_dim.keys:
            for long in longitude_dim.keys:
                chunk = self.test_data.isel(
                    time=slice(time, time + time_step),
                    longitude=slice(long, long + longitude_step))
                chunk.to_netcdf(make_path(time, long))
        yield FilePattern(make_path, time_dim, longitude_dim)
def netCDFtoZarr_sequential_multi_variable_recipe(
    daily_xarray_dataset, netcdf_local_paths_by_variable, tmp_target, tmp_cache, tmp_metadata_target
):
    paths, items_per_file, fnames_by_variable, path_format = netcdf_local_paths_by_variable
    time_index = list(range(len(paths) // 2))

    def format_function(variable, time):
        return path_format.format(variable=variable, time=time)

    file_pattern = FilePattern(
        format_function,
        ConcatDim("time", time_index, items_per_file),
        MergeDim("variable", ["foo", "bar"]),
    )
    kwargs = dict(
        inputs_per_chunk=1,
        target=tmp_target,
        input_cache=tmp_cache,
        metadata_cache=tmp_metadata_target,
    )
    return recipes.XarrayZarrRecipe, file_pattern, kwargs, daily_xarray_dataset, tmp_target
class ExpandDimensionsByKeyTest(test_util.TestCase):

    def setUp(self):
        self.test_data = test_util.dummy_era5_surface_dataset()
        self.level = ConcatDim("level", list(range(91, 100)))
        self.pattern = FilePattern(lambda level: f"gs://dir/{level}.nc", self.level)

    def test_expands_dimensions(self):
        for i, (index, _) in enumerate(self.pattern.items()):
            actual = _expand_dimensions_by_key(self.test_data, index, self.pattern)

            expected_dims = dict(self.test_data.dims)
            expected_dims.update({"level": 1})

            self.assertEqual(expected_dims, dict(actual.dims))
            self.assertEqual(np.array([self.level.keys[i]]), actual["level"])

    def test_raises_error_when_dataset_is_not_found(self):
        index = (DimIndex('boat', 0, 1, CombineOp.CONCAT),)
        with self.assertRaisesRegex(ValueError, "boat"):
            _expand_dimensions_by_key(self.test_data, index, self.pattern)
def pattern_from_testdata(self) -> FilePattern:
    """Produces a FilePattern for a temporary NetCDF file of test data."""
    with tempfile.TemporaryDirectory() as tmpdir:
        target = f'{tmpdir}/era5.nc'
        self.test_data.to_netcdf(target)
        yield FilePattern(lambda: target)
import pandas as pd

from pangeo_forge_recipes.patterns import ConcatDim, FilePattern
from pangeo_forge_recipes.recipes import XarrayZarrRecipe

start_date = "1981-09-01"


def format_function(time):
    base = pd.Timestamp(start_date)
    day = base + pd.Timedelta(days=time)
    input_url_pattern = (
        "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation"
        "/v2.1/access/avhrr/{day:%Y%m}/oisst-avhrr-v02r01.{day:%Y%m%d}.nc"
    )
    return input_url_pattern.format(day=day)


dates = pd.date_range(start_date, "2021-01-05", freq="D")
pattern = FilePattern(format_function, ConcatDim("time", range(len(dates)), 1))
recipe = XarrayZarrRecipe(pattern, inputs_per_chunk=20, cache_inputs=True)
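# Minimal sketch (an addition, not part of the original example): the pattern
# can be spot-checked before building the recipe using the same iteration API
# exercised in the tests above, e.g. to confirm the first few generated OISST
# URLs look right.
for index, url in list(pattern.items())[:3]:
    print(index, url)  # prints an index key and the corresponding source URL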