Code example #1
File: rechunk_test.py  Project: google/xarray-beam
    def test_rechunk_not_all_dimensions(self):
        data = np.random.RandomState(0).randint(2**30, size=(10, 20, 30))
        ds = xarray.Dataset({'foo': (('time', 'x', 'y'), data)})
        key = xbeam.Key({'x': 0, 'y': 0})
        y_split_with_time_key = ([(key.with_offsets(time=0), ds)]
                                 | xbeam.SplitChunks({'y': 3}))
        x_split = [(key, ds)] | xbeam.SplitChunks({'x': 2})
        actual = x_split | xbeam.Rechunk(
            dim_sizes=ds.sizes,
            source_chunks={
                'x': 2,
                'y': -1
            },
            target_chunks={
                'x': -1,
                'y': 3
            },
            itemsize=8,
            max_mem=10_000,
        )
        self.assertIdenticalChunks(actual, y_split_with_time_key)

        with self.assertRaisesRegex(
                ValueError,
                'source_chunks and target_chunks have different keys',
        ):
            xbeam.Rechunk(
                dim_sizes=ds.sizes,
                source_chunks={'x': 2},
                target_chunks={'y': 3},
                itemsize=8,
                max_mem=10_000,
            )
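
The tests above apply xarray-beam transforms directly to in-memory lists of (key, dataset) pairs, which Beam evaluates eagerly. For context, here is a minimal illustrative sketch (not from rechunk_test.py) of the same Rechunk call inside an actual Beam pipeline, reusing the array shape and chunk sizes from the test above; beam.Create and beam.Map(print) are just placeholders for a real source and sink.

import apache_beam as beam
import numpy as np
import xarray
import xarray_beam as xbeam

# Same shape and chunking scheme as in the test above; purely illustrative.
data = np.random.RandomState(0).randint(2**30, size=(10, 20, 30))
ds = xarray.Dataset({'foo': (('time', 'x', 'y'), data)})

with beam.Pipeline() as p:
    (
        p
        | beam.Create([(xbeam.Key({'x': 0, 'y': 0}), ds)])
        | xbeam.SplitChunks({'x': 2})
        | xbeam.Rechunk(
            dim_sizes=ds.sizes,
            source_chunks={'x': 2, 'y': -1},
            target_chunks={'x': -1, 'y': 3},
            itemsize=8,
            max_mem=10_000,
        )
        | beam.Map(print)  # each element is an (xbeam.Key, xarray.Dataset) pair
    )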
Code example #2
File: rechunk_test.py  Project: google/xarray-beam
 def test_rechunk_1d(self, size, max_mem, source_chunks, target_chunks):
     data = np.random.RandomState(0).randint(2**30, size=(size, ))
     ds = xarray.Dataset({'foo': ('x', data)})
     key = xbeam.Key({'x': 0})
     inputs = [(key, ds)] | xbeam.SplitChunks({'x': source_chunks})
     expected = [(key, ds)] | xbeam.SplitChunks({'x': target_chunks})
     actual = inputs | xbeam.Rechunk(
         dim_sizes=ds.sizes,
         source_chunks={'x': source_chunks},
         target_chunks={'x': target_chunks},
         itemsize=1,
         max_mem=max_mem,
     )
     self.assertIdenticalChunks(actual, expected)
Code example #3
def main(argv):
    # By passing chunks=None, we use Xarray's lazy loading instead of Dask. This
    # results in much less data being passed from the launch script to workers.
    source_dataset = xarray.open_zarr(
        INPUT_PATH.value,
        chunks=None,
        consolidated=True,
    )

    # This lazy "template" allows us to set up the Zarr outputs before running
    # the pipeline. We don't really need to supply a template here because the
    # outputs are small (the template argument to ChunksToZarr is optional), but
    # it makes the pipeline slightly more efficient.
    max_month = source_dataset.time.dt.month.max().item()  # normally 12
    template = (
        source_dataset.chunk()
        .pipe(xarray.zeros_like)
        .isel(time=0, drop=True)
        .expand_dims(month=np.arange(1, max_month + 1), hour=np.arange(24))
    )
    output_chunks = {'hour': 1, 'month': 1}

    with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
        (root
         | xbeam.DatasetToChunks(source_dataset, {'time': 31})
         | xbeam.SplitChunks({'time': 1})
         | beam.MapTuple(rekey_chunk_on_month_hour)
         | xbeam.Mean.PerKey()
         | xbeam.ChunksToZarr(OUTPUT_PATH.value, template, output_chunks))
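
INPUT_PATH, OUTPUT_PATH and RUNNER are command-line flags defined elsewhere in this script, and rekey_chunk_on_month_hour is not shown in the excerpt. A plausible sketch of that helper, consistent with how it is used above (each incoming chunk holds a single time step, and the output is keyed and dimensioned by month and hour), might look like this:

def rekey_chunk_on_month_hour(key, chunk):
    # Assumed implementation, not part of this excerpt: swap the 'time'
    # offset/dimension for 'month' and 'hour' ones, so that Mean.PerKey()
    # averages all time steps that share the same (month, hour).
    month = chunk.time.dt.month.item()
    hour = chunk.time.dt.hour.item()
    new_key = key.with_offsets(time=None, month=month - 1, hour=hour)
    new_chunk = chunk.squeeze('time', drop=True).expand_dims(
        month=[month], hour=[hour])
    return new_key, new_chunk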
Code example #4
File: rechunk_test.py  Project: google/xarray-beam
 def test_consolidate_and_split_only_some_dims(self):
     chunk_data = np.arange(0, 10).reshape(2, 5)
     split = [
         (xbeam.Key({'x': 0, 'y': 0}),
          xarray.Dataset({'foo': (('x', 'y'), chunk_data)})),
         (xbeam.Key({'x': 0, 'y': 5}),
          xarray.Dataset({'foo': (('x', 'y'), chunk_data + 10)})),
     ]
     all_data = np.concatenate([chunk_data, chunk_data + 10], axis=1)
     consolidated = [
         (xbeam.Key({'x': 0, 'y': 0}),
          xarray.Dataset({'foo': (('x', 'y'), all_data)})),
     ]
     with self.subTest('ConsolidateChunks'):
         actual = split | xbeam.ConsolidateChunks({'y': 10})
         self.assertIdenticalChunks(actual, consolidated)
     with self.subTest('SplitChunks'):
         actual = consolidated | xbeam.SplitChunks({'y': 5})
         self.assertIdenticalChunks(actual, split)
Code example #5
File: rechunk_test.py  Project: google/xarray-beam
 def test_consolidate_and_split_chunks(self):
     consolidated = [
         (xbeam.Key({'x': 0}), xarray.Dataset({'foo': ('x', np.arange(0, 10))})),
         (xbeam.Key({'x': 10}), xarray.Dataset({'foo': ('x', np.arange(10, 20))})),
     ]
     split = [
         (xbeam.Key({'x': 0}), xarray.Dataset({'foo': ('x', np.arange(0, 5))})),
         (xbeam.Key({'x': 5}), xarray.Dataset({'foo': ('x', np.arange(5, 10))})),
         (xbeam.Key({'x': 10}), xarray.Dataset({'foo': ('x', np.arange(10, 15))})),
         (xbeam.Key({'x': 15}), xarray.Dataset({'foo': ('x', np.arange(15, 20))})),
     ]
     with self.subTest('ConsolidateChunks'):
         actual = split | xbeam.ConsolidateChunks({'x': 10})
         self.assertIdenticalChunks(actual, consolidated)
     with self.subTest('SplitChunks'):
         actual = consolidated | xbeam.SplitChunks({'x': 5})
         self.assertIdenticalChunks(actual, split)
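
In all of these pairs, xbeam.Key records the integer start offset of a chunk along each dimension, which is what lets ConsolidateChunks and SplitChunks work out where each piece belongs. A tiny illustrative sketch (not from the test file) of constructing and deriving keys, assuming with_offsets() returns a new key with the given offsets replaced, as in code example #1, and drops a dimension when passed None:

import xarray_beam as xbeam

key = xbeam.Key({'x': 0, 'y': 5})   # chunk starting at x=0, y=5
shifted = key.with_offsets(x=10)    # new key with the 'x' offset replaced
reduced = key.with_offsets(y=None)  # new key with the 'y' offset dropped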
Code example #6
File: rechunk_test.py  Project: google/xarray-beam
 def test_rechunk_uneven_2d(self):
     data = np.random.RandomState(0).randint(2**30, size=(100, 100))
     ds = xarray.Dataset({'foo': (('x', 'y'), data)})
     key = xbeam.Key({'x': 0, 'y': 0})
     inputs = [(key, ds)] | xbeam.SplitChunks({'x': 12})
     expected = [(key, ds)] | xbeam.SplitChunks({'y': 15})
     actual = inputs | xbeam.Rechunk(
         dim_sizes=ds.sizes,
         source_chunks={
             'x': 12,
             'y': -1
         },
         target_chunks={
             'x': -1,
             'y': 15
         },
         itemsize=1,
         max_mem=100 * 100 // 2,  # half the full size
     )
     self.assertIdenticalChunks(actual, expected)
Code example #7
File: rechunk_test.py  Project: google/xarray-beam
 def test_rechunk_end_to_end(self):
     data = np.random.RandomState(0).randint(2**30, size=(10, 20, 30))
     ds = xarray.Dataset({'foo': (('time', 'x', 'y'), data)})
     key = xbeam.Key({'time': 0, 'x': 0, 'y': 0})
     time_split = [(key, ds)] | xbeam.SplitChunks({'time': 1})
     space_split = [(key, ds)] | xbeam.SplitChunks({'x': 5, 'y': 5})
     with self.subTest('time-to-space'):
         actual = time_split | xbeam.Rechunk(
             dim_sizes=ds.sizes,
             source_chunks={
                 'time': 1,
                 'x': 20,
                 'y': 30
             },
             target_chunks={
                 'time': 10,
                 'x': 5,
                 'y': 5
             },
             itemsize=8,
             max_mem=10_000,
         )
         self.assertIdenticalChunks(actual, space_split)
     with self.subTest('space-to-time'):
         actual = space_split | xbeam.Rechunk(
             dim_sizes=ds.sizes,
             source_chunks={
                 'time': 10,
                 'x': 5,
                 'y': 5
             },
             target_chunks={
                 'time': 1,
                 'x': 20,
                 'y': 30
             },
             itemsize=8,
             max_mem=10_000,
         )
         self.assertIdenticalChunks(actual, time_split)
Code example #8
File: rechunk_test.py  Project: google/xarray-beam
 def test_split_uneven_chunks(self):
     inputs = [
         (xbeam.Key({'x': 0}), xarray.Dataset({'foo': ('x', np.arange(0, 5))})),
         (xbeam.Key({'x': 5}), xarray.Dataset({'foo': ('x', np.arange(5, 10))})),
     ]
     expected = [
         (xbeam.Key({'x': 0}), xarray.Dataset({'foo': ('x', np.arange(0, 3))})),
         (xbeam.Key({'x': 3}), xarray.Dataset({'foo': ('x', np.arange(3, 5))})),
         (xbeam.Key({'x': 5}), xarray.Dataset({'foo': ('x', np.arange(5, 6))})),
         (xbeam.Key({'x': 6}), xarray.Dataset({'foo': ('x', np.arange(6, 9))})),
         (xbeam.Key({'x': 9}), xarray.Dataset({'foo': ('x', np.arange(9, 10))})),
     ]
     actual = inputs | xbeam.SplitChunks({'x': 3})
     self.assertIdenticalChunks(actual, expected)
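
As the expected output shows, SplitChunks re-splits each input chunk independently, so the input boundary at x=5 (not a multiple of 3) yields uneven pieces. An illustrative follow-on sketch (not from the test file), merging those pieces back into a single chunk with ConsolidateChunks, mirroring code example #5:

import numpy as np
import xarray
import xarray_beam as xbeam

inputs = [
    (xbeam.Key({'x': 0}), xarray.Dataset({'foo': ('x', np.arange(0, 5))})),
    (xbeam.Key({'x': 5}), xarray.Dataset({'foo': ('x', np.arange(5, 10))})),
]
# Splitting into chunks of 3 and then consolidating into chunks of 10 yields a
# single chunk keyed at x=0 that contains all ten values.
merged = inputs | xbeam.SplitChunks({'x': 3}) | xbeam.ConsolidateChunks({'x': 10})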
Code example #9
 def test_2d_chunks_to_zarr(self, coords):
     dataset = xarray.Dataset(
         {'foo': (('x', 'y'), np.arange(0, 60, 10).reshape(3, 2))},
         coords=coords,
     )
     with self.subTest('partial key'):
         inputs = [(xbeam.Key({'x': 0}), dataset)]
         temp_dir = self.create_tempdir().full_path
         inputs | xbeam.ChunksToZarr(temp_dir)
         result = xarray.open_zarr(temp_dir, consolidated=True)
         xarray.testing.assert_identical(dataset, result)
     with self.subTest('split along partial key'):
         inputs = [(xbeam.Key({'x': 0}), dataset)]
         temp_dir = self.create_tempdir().full_path
         inputs | xbeam.SplitChunks({'x': 1}) | xbeam.ChunksToZarr(temp_dir)
         result = xarray.open_zarr(temp_dir, consolidated=True)
         xarray.testing.assert_identical(dataset, result)
     with self.subTest('full key'):
         inputs = [(xbeam.Key({'x': 0, 'y': 0}), dataset)]
         temp_dir = self.create_tempdir().full_path
         inputs | xbeam.ChunksToZarr(temp_dir)
         result = xarray.open_zarr(temp_dir, consolidated=True)
         xarray.testing.assert_identical(dataset, result)
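
Code example #3 noted that ChunksToZarr can also take a lazy template describing the full output. A small illustrative sketch (not from this test; the output path is hypothetical) of supplying one for a dataset like the one above, with coords omitted for brevity:

import numpy as np
import xarray
import xarray_beam as xbeam

dataset = xarray.Dataset(
    {'foo': (('x', 'y'), np.arange(0, 60, 10).reshape(3, 2))})

# A lazy, zeroed-out copy of the dataset describes the variables, dtypes and
# dimensions of the final Zarr store without holding real data in memory.
template = xarray.zeros_like(dataset.chunk())

inputs = [(xbeam.Key({'x': 0, 'y': 0}), dataset)]
inputs | xbeam.ChunksToZarr('/tmp/example_output.zarr', template)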