Ejemplo n.º 1
0
    def test_dataset_to_chunks_whole(self):
        """Check that a dataset requested as a single chunk is emitted whole.

        Both ``chunks={'x': -1}`` and an empty ``chunks`` mapping are expected
        to produce exactly one (key, chunk) pair: a zero-offset key along
        ``x`` paired with the full dataset.
        """
        dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
        expected = [(xbeam.Key({'x': 0}), dataset)]

        # The same expectation applies to each way of asking for "no split".
        for chunk_spec in ({'x': -1}, {}):
            pipeline = test_util.EagerPipeline()
            actual = pipeline | xbeam.DatasetToChunks(dataset, chunks=chunk_spec)
            self.assertIdenticalChunks(actual, expected)
Ejemplo n.º 2
0
 def test_validate_chunks_compose_in_pipeline(self):
     """Check that ValidateEachChunk can be chained after DatasetToChunks.

     The single whole-dataset chunk is expected to come out of the
     validation step identical to how it went in.
     """
     dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
     expected = [(xbeam.Key({'x': 0}), dataset)]

     # Build the pipeline stage by stage so each step is visible.
     pipeline = test_util.EagerPipeline()
     chunked = pipeline | xbeam.DatasetToChunks(dataset, chunks={'x': -1})
     actual = chunked | xbeam.ValidateEachChunk()

     self.assertIdenticalChunks(actual, expected)
Ejemplo n.º 3
0
def main(argv):
    """Run a Beam pipeline that averages the input Zarr dataset per key.

    The dataset at ``INPUT_PATH`` is split into single-``time`` chunks,
    re-keyed with ``rekey_chunk_on_month_hour`` (defined elsewhere), averaged
    per key and written to ``OUTPUT_PATH`` as Zarr.

    Args:
        argv: Command-line arguments forwarded to ``beam.Pipeline``.
    """
    # chunks=None selects Xarray's own lazy loading instead of Dask, which
    # results in much less data being passed from the launch script to the
    # workers.
    source_dataset = xarray.open_zarr(
        INPUT_PATH.value, chunks=None, consolidated=True)

    # Build a lazy "template" so the Zarr outputs can be set up before the
    # pipeline runs. The outputs are small, so the template is not strictly
    # needed (the template argument of ChunksToZarr is optional), but passing
    # one makes the pipeline slightly more efficient.
    max_month = source_dataset.time.dt.month.max().item()  # normally 12
    lazy_zeros = xarray.zeros_like(source_dataset.chunk())
    without_time = lazy_zeros.isel(time=0, drop=True)
    template = without_time.expand_dims(
        month=np.arange(1, max_month + 1),
        hour=np.arange(24),
    )
    output_chunks = {'hour': 1, 'month': 1}

    with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
        coarse = root | xbeam.DatasetToChunks(source_dataset, {'time': 31})
        single_steps = coarse | xbeam.SplitChunks({'time': 1})
        rekeyed = single_steps | beam.MapTuple(rekey_chunk_on_month_hour)
        averaged = rekeyed | xbeam.Mean.PerKey()
        _ = averaged | xbeam.ChunksToZarr(
            OUTPUT_PATH.value, template, output_chunks)
Ejemplo n.º 4
0
    def test_dataset_to_chunks_multiple(self):
        """Check that a 6-element dataset is emitted as two 3-element chunks.

        The same expected output is asserted for three ways of requesting
        the split: a pre-chunked dataset, a pre-chunked dataset with
        ``num_threads=2``, and an explicit ``chunks`` argument.
        """
        dataset = xarray.Dataset({'foo': ('x', np.arange(6))})
        leading = (xbeam.Key({'x': 0}), dataset.head(x=3))
        trailing = (xbeam.Key({'x': 3}), dataset.tail(x=3))
        expected = [leading, trailing]

        # Chunk sizes taken from the dataset itself.
        pipeline = test_util.EagerPipeline()
        actual = pipeline | xbeam.DatasetToChunks(dataset.chunk({'x': 3}))
        self.assertIdenticalChunks(actual, expected)

        # Same input, with num_threads=2.
        pipeline = test_util.EagerPipeline()
        actual = pipeline | xbeam.DatasetToChunks(
            dataset.chunk({'x': 3}), num_threads=2)
        self.assertIdenticalChunks(actual, expected)

        # Chunk sizes passed explicitly for an unchunked dataset.
        pipeline = test_util.EagerPipeline()
        actual = pipeline | xbeam.DatasetToChunks(dataset, chunks={'x': 3})
        self.assertIdenticalChunks(actual, expected)
Ejemplo n.º 5
0
 def test_dataset_to_chunks_vars(self):
     """Check that split_vars=True emits one chunk per variable per offset.

     For each of the two 3-element pieces along ``x``, a separate
     (key, chunk) pair is expected for 'foo' and then for 'bar'.
     """
     dataset = xarray.Dataset({
         'foo': ('x', np.arange(6)),
         'bar': ('x', -np.arange(6)),
     })
     # (offset along x, corresponding slice of the dataset)
     pieces = [(0, dataset.head(x=3)), (3, dataset.tail(x=3))]
     expected = [
         (xbeam.Key({'x': offset}, {name}), piece[[name]])
         for offset, piece in pieces
         for name in ('foo', 'bar')
     ]

     pipeline = test_util.EagerPipeline()
     actual = pipeline | xbeam.DatasetToChunks(
         dataset, chunks={'x': 3}, split_vars=True)
     self.assertIdenticalChunks(actual, expected)
Ejemplo n.º 6
0
    def test_rechunk_zarr_to_zarr(self, template_method, split_vars):
        """Round-trip a dataset through a Zarr-to-Zarr rechunking pipeline.

        A random dataset is written to a source Zarr store with
        ``source_chunks``, rechunked to ``target_chunks`` by a Beam pipeline,
        written to a destination store, and then read back and compared with
        the original in-memory dataset.

        Args:
            template_method: How the template handed to ``ChunksToZarr`` is
                supplied. One of ``'eager'`` (the rechunked on-disk dataset
                itself), ``'lazy'`` (the same dataset wrapped as a Beam
                singleton side input) or ``'infer'`` (``None``).
            split_vars: Forwarded to ``DatasetToChunks``.

        Raises:
            ValueError: If ``template_method`` is not one of the values above.
        """
        src_dir = self.create_tempdir('source').full_path
        dest_dir = self.create_tempdir('destination').full_path

        source_chunks = {'t': 1, 'x': 100, 'y': 120}
        target_chunks = {'t': -1, 'x': 20, 'y': 20}

        # Fixed seed keeps the test data reproducible.
        rs = np.random.RandomState(0)
        raw_data = rs.randint(2**30, size=(60, 100, 120))  # 5.76 MB
        dataset = xarray.Dataset({
            'foo': (('t', 'x', 'y'), raw_data),
            'bar': (('t', 'x', 'y'), raw_data - 1),
        })
        dataset.chunk(source_chunks).to_zarr(src_dir, consolidated=True)

        on_disk = xarray.open_zarr(src_dir, consolidated=True)
        on_disk_chunked = on_disk.chunk(target_chunks)
        with beam.Pipeline('DirectRunner') as pipeline:
            # make template
            if template_method == 'eager':
                target_template = on_disk_chunked
            elif template_method == 'lazy':
                # The side input must be created on this pipeline, so it can
                # only be built inside the `with` block.
                target_template = beam.pvalue.AsSingleton(
                    pipeline | beam.Create([on_disk_chunked]))
            elif template_method == 'infer':
                target_template = None
            else:
                # Fail with a clear message rather than an unbound-local error
                # further down when the template is first used.
                raise ValueError(
                    f'unexpected template_method: {template_method!r}')
            # run pipeline
            (pipeline
             | xbeam.DatasetToChunks(on_disk, split_vars=split_vars)
             | xbeam.Rechunk(
                 on_disk.sizes,
                 source_chunks,
                 target_chunks,
                 itemsize=8,
                 max_mem=10_000_000,  # require two stages
             )
             | xbeam.ChunksToZarr(dest_dir, target_template))
        roundtripped = xarray.open_zarr(dest_dir,
                                        consolidated=True,
                                        chunks=False)

        xarray.testing.assert_identical(roundtripped, dataset)
Ejemplo n.º 7
0
def main(argv):
  """Rechunk the Zarr dataset at INPUT_PATH and write it to OUTPUT_PATH.

  Chunks go from whole latitude/longitude with 31 steps of time to 5x5
  latitude/longitude with whole time.

  Args:
    argv: Command-line arguments forwarded to ``beam.Pipeline``.
  """
  source_dataset = xarray.open_zarr(
      INPUT_PATH.value,
      chunks=None,
      consolidated=True,
  )
  template = source_dataset.chunk().pipe(xarray.zeros_like)
  source_chunks = {'latitude': -1, 'longitude': -1, 'time': 31}
  target_chunks = {'latitude': 5, 'longitude': 5, 'time': -1}

  with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
    # Note: splitting across the 19 variables in this dataset is a critical
    # optimization step here, because it allows rechunking to make use of
    # much larger intermediate chunks.
    per_variable = root | xbeam.DatasetToChunks(
        source_dataset, source_chunks, split_vars=True
    )
    rechunked = per_variable | xbeam.Rechunk(
        source_dataset.sizes,
        source_chunks,
        target_chunks,
        itemsize=4,
    )
    _ = rechunked | xbeam.ChunksToZarr(
        OUTPUT_PATH.value, template, target_chunks
    )