def test_file_pattern_concat_merge():
    """A FilePattern with one merge dim and one concat dim exposes the
    expected metadata, iteration order, and key-to-filename mapping."""
    concat = ConcatDim(name="time", keys=list(range(3)))
    merge = MergeDim(name="variable", keys=["foo", "bar"])

    def format_function(time, variable):
        return f"T_{time}_V_{variable}"

    fp = FilePattern(format_function, merge, concat)

    # Dimension metadata follows declaration order: merge dim, then concat.
    assert fp.dims == {"variable": 2, "time": 3}
    assert fp.shape == (2, 3)
    assert fp.merge_dims == ["variable"]
    assert fp.concat_dims == ["time"]
    assert fp.nitems_per_input == {"time": None}
    assert fp.concat_sequence_lens == {"time": None}

    # Keys enumerate the (variable, time) index grid in row-major order.
    expected_keys = [(v, t) for v in range(2) for t in range(3)]
    assert list(fp) == expected_keys

    fnames = [
        format_function(variable=merge.keys[v], time=concat.keys[t])
        for v, t in expected_keys
    ]
    for key, fname in zip(expected_keys, fnames):
        assert fp[key] == fname
    assert list(fp.items()) == list(zip(expected_keys, fnames))
def test_file_pattern_concat():
    """A FilePattern with a single ConcatDim: metadata, keys, filenames."""
    concat = ConcatDim(name="time", keys=list(range(3)))

    def format_function(time):
        return f"T_{time}"

    fp = FilePattern(format_function, concat)

    assert fp.dims == {"time": 3}
    assert fp.shape == (3, )
    assert fp.merge_dims == []
    assert fp.concat_dims == ["time"]
    assert fp.nitems_per_input == {"time": None}
    assert fp.concat_sequence_lens == {"time": None}

    # One 1-tuple key per concat index; each maps to its formatted name.
    expected_keys = [(i, ) for i in range(3)]
    assert list(fp) == expected_keys
    for key in expected_keys:
        assert fp[key] == format_function(key[0])
    def multifile_pattern(self,
                          time_step: int = 479,
                          longitude_step: int = 47) -> FilePattern:
        """Produces a FilePattern for a temporary NetCDF file of test data."""
        # Chunk self.test_data along time and longitude, write every chunk to
        # its own NetCDF file in a temp dir, and yield a matching FilePattern.
        time_dim = ConcatDim('time', list(range(0, 360 * 4, time_step)))
        longitude_dim = ConcatDim('longitude',
                                  list(range(0, 144, longitude_step)))

        with tempfile.TemporaryDirectory() as tmpdir:

            def make_path(time: int, longitude: int) -> str:
                return f'{tmpdir}/era5-{time}-{longitude}.nc'

            for t in time_dim.keys:
                for lon in longitude_dim.keys:
                    piece = self.test_data.isel(
                        time=slice(t, t + time_step),
                        longitude=slice(lon, lon + longitude_step))
                    piece.to_netcdf(make_path(t, lon))
            # Yield (not return) so tmpdir stays alive while the caller uses
            # the pattern; cleanup happens when the generator is resumed.
            yield FilePattern(make_path, time_dim, longitude_dim)
def netCDFtoZarr_sequential_multi_variable_recipe(
        daily_xarray_dataset, netcdf_local_paths_by_variable, tmp_target,
        tmp_cache, tmp_metadata_target):
    """Assemble the fixture tuple for a multi-variable XarrayZarrRecipe."""
    (paths, items_per_file, fnames_by_variable,
     path_format) = netcdf_local_paths_by_variable
    # Half the paths per time step — presumably one file per variable for
    # each of the two merged variables; verify against the fixture.
    time_index = list(range(len(paths) // 2))

    def format_function(variable, time):
        return path_format.format(variable=variable, time=time)

    file_pattern = FilePattern(
        format_function,
        ConcatDim("time", time_index, items_per_file),
        MergeDim("variable", ["foo", "bar"]),
    )
    kwargs = {
        "inputs_per_chunk": 1,
        "target": tmp_target,
        "input_cache": tmp_cache,
        "metadata_cache": tmp_metadata_target,
    }
    return (recipes.XarrayZarrRecipe, file_pattern, kwargs,
            daily_xarray_dataset, tmp_target)
class ExpandDimensionsByKeyTest(test_util.TestCase):
    """Tests for _expand_dimensions_by_key with a single "level" ConcatDim."""

    def setUp(self):
        self.test_data = test_util.dummy_era5_surface_dataset()
        self.level = ConcatDim("level", list(range(91, 100)))
        self.pattern = FilePattern(lambda level: f"gs://dir/{level}.nc",
                                   self.level)

    def test_expands_dimensions(self):
        """Each pattern key adds a length-1 "level" dim holding that key."""
        for i, (index, _) in enumerate(self.pattern.items()):
            actual = _expand_dimensions_by_key(self.test_data, index,
                                               self.pattern)

            expected_dims = dict(self.test_data.dims)
            expected_dims.update({"level": 1})

            self.assertEqual(expected_dims, dict(actual.dims))
            # Fix: assertEqual on array-likes depends on the ambiguous
            # truthiness of an elementwise comparison (it only worked here
            # because the array has one element); compare arrays explicitly.
            np.testing.assert_array_equal(
                np.array([self.level.keys[i]]), actual["level"])

    def test_raises_error_when_dataset_is_not_found(self):
        """An index naming an unknown dimension ("boat") raises ValueError."""
        index = (DimIndex('boat', 0, 1, CombineOp.CONCAT), )
        with self.assertRaisesRegex(ValueError, "boat"):
            _expand_dimensions_by_key(self.test_data, index, self.pattern)
 def pattern_from_testdata(self) -> FilePattern:
     """Produces a FilePattern for a temporary NetCDF file of test data."""
     with tempfile.TemporaryDirectory() as tmpdir:
         path = f'{tmpdir}/era5.nc'
         self.test_data.to_netcdf(path)
         # Zero-argument format function: a single-file pattern, no dims.
         # Yield keeps tmpdir alive until the caller is done with it.
         yield FilePattern(lambda: path)
 def setUp(self):
     """Build dummy ERA5 data and a one-dim ("level") file pattern."""
     self.test_data = test_util.dummy_era5_surface_dataset()
     self.level = ConcatDim("level", list(range(91, 100)))
     # NOTE(review): the format function's parameter name "level" appears to
     # be matched to the dim name by FilePattern — keep them identical.
     make_url = lambda level: f"gs://dir/{level}.nc"
     self.pattern = FilePattern(make_url, self.level)
# Example #8
import pandas as pd
from pangeo_forge_recipes.patterns import ConcatDim, FilePattern
from pangeo_forge_recipes.recipes import XarrayZarrRecipe

# First day of the OISST AVHRR record used by this recipe.
start_date = "1981-09-01"


def format_function(time):
    """Map an integer day offset from ``start_date`` to an OISST file URL."""
    day = pd.Timestamp(start_date) + pd.Timedelta(days=time)
    input_url_pattern = (
        "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation"
        "/v2.1/access/avhrr/{day:%Y%m}/oisst-avhrr-v02r01.{day:%Y%m%d}.nc")
    return input_url_pattern.format(day=day)


# One file per day from start_date through 2021-01-05, one item per file.
dates = pd.date_range(start_date, "2021-01-05", freq="D")
time_dim = ConcatDim("time", range(len(dates)), 1)
pattern = FilePattern(format_function, time_dim)
recipe = XarrayZarrRecipe(pattern, inputs_per_chunk=20, cache_inputs=True)