def flow(self):
    """Build the Prefect flow for a single zarr target.

    Downloads every source file into the cache, converts each cached
    netCDF file to zarr, then combines all of them into the one target
    archive.

    Raises
    ------
    ValueError
        If ``self.targets`` is not a one-element list.
    """
    # A zarr archive is a single store, so exactly one target is allowed.
    if len(self.targets) != 1:
        raise ValueError(
            "Zarr target requires self.targets be a length one list")
    target = self.targets[0]

    with Flow(self.name,
              storage=self.storage,
              environment=self.environment) as _flow:
        # Fetch every source into the cache location.
        cached = download.map(
            self.sources,
            cache_location=unmapped(self.cache_location),
        )

        # Convert each cached netCDF file into a zarr store.
        zarr_sources = nc2zarr.map(
            cached,
            cache_location=unmapped(self.cache_location),
        )

        # Merge all per-file stores into the single target archive.
        combine_and_write(zarr_sources, target)

    return _flow
# Example 2
    def flow(self):
        """Assemble the Prefect flow.

        Resolves one source URL per day, downloads each into the cache,
        batches the cached files into chunks, writes every chunk to the
        target, and finally consolidates the zarr metadata.
        """
        with Flow(self.name) as _flow:
            # One URL per entry in self.days (mapped task output).
            day_urls = source_url.map(self.days)

            # Download each URL into the cache; the non-mapped keyword
            # argument must be wrapped in unmapped.
            cached = download.map(
                day_urls,
                cache_location=unmapped(self.cache_location),
            )

            # Group the cached files into fixed-size batches.
            batches = chunk(cached, size=self.files_per_chunk)

            # Write each batch to the target store.
            written = combine_and_write.map(
                batches,
                unmapped(self.target_location),
                unmapped(self.concat_dim),
            )

            # Consolidate metadata once all writes are done.
            consolidate_metadata(written, self.target_location)

        return _flow
# Example 3
    def flow(self):
        """Construct the tutorial Prefect flow.

        Maps ``source_url`` over the days, downloads each resulting URL
        into the cache, groups the cached files into chunks of five,
        writes each chunk to the target, then consolidates the zarr
        metadata. See
        https://docs.prefect.io/core/concepts/mapping.html for how
        Prefect mapping and ``unmapped`` inputs work.
        """
        with Flow(self.name) as flow:
            # Map the `source_url` task over the days: one output URL
            # per day.
            urls = source_url.map(self.days)

            # Download the raw data into the cache. A mapped output
            # (urls) feeds straight into another Task.map call; plain
            # arguments must be wrapped in `prefect.unmapped`.
            # https://docs.prefect.io/core/concepts/mapping.html#unmapped-inputs
            cached = download.map(
                urls, cache_location=unmapped(self.cache_location))

            # The individual daily files are too small for analysis, so
            # batch them into groups of five with pangeo_forge's chunk
            # helper.
            batches = pangeo_forge.utils.chunk(cached, size=5)

            # Combine each batch and write it to its final destination.
            writes = combine_and_write.map(
                batches,
                unmapped(self.target_location),
                append_dim=unmapped("time"),
                concat_dim=unmapped("time"),
            )

            # Consolidate the metadata of the finished dataset.
            consolidate_metadata(self.target_location, writes=writes)

        return flow
# Example 4
    def flow(self):
        """Build the Prefect flow.

        Downloads all sources into the cache, writes them to the target
        in chunks of ``self.files_per_chunk`` files, then consolidates
        the zarr metadata of the target store.
        """
        with Flow(self.name,
                  storage=self.storage,
                  environment=self.environment) as _flow:
            # download to cache
            nc_sources = download.map(
                self.sources,
                cache_location=unmapped(self.cache_location),
            )

            # One combine_and_write task per chunk. Only the first chunk
            # runs with first=True (creating the store); later chunks
            # append.
            first = True
            write_tasks = []
            for source_group in chunked_iterable(nc_sources,
                                                 self.files_per_chunk):
                write_task = combine_and_write(source_group,
                                               self.target_location,
                                               self.concat_dim,
                                               first=first)
                write_tasks.append(write_task)
                first = False

            # BUG FIX: the original passed the undefined name
            # `target_path`; the target store is `self.target_location`
            # (consistent with every other reference in this method).
            # NOTE(review): no explicit dependency on write_tasks is set
            # here — confirm Prefect orders consolidation after the
            # writes elsewhere.
            consolidate_metadata(self.target_location)

        return _flow