Example 1
    def merge_files(path, target, remove_source=False):
        """Merge files (the output of this Block) into one single file.

        Optionally removes the source files.
        """
        path = utils.safe_abspath(path)
        target = utils.safe_abspath(target)

        if os.path.exists(target):
            raise IOError("Target '{}' already exists".format(target))

        target_base, ext = os.path.splitext(target)
        source_paths = glob.glob(os.path.join(path, '*' + ext))
        if len(source_paths) == 0:
            raise IOError(
                "No source files found with matching extension '{}'".format(
                    ext))
        elif len(source_paths) == 1:
            # shortcut for a single file: copy/move all base_name.* files
            # (e.g. a shapefile consists of multiple files)
            source_base = os.path.splitext(source_paths[0])[0]
            move_or_copy = shutil.move if remove_source else shutil.copy
            for file_path in glob.glob(source_base + '.*'):
                move_or_copy(file_path,
                             target_base + os.path.splitext(file_path)[1])
            return

        with utils.fiona_env():
            # first detect the driver, crs, schema and encoding
            with fiona.collection(source_paths[0], "r") as source:
                kwargs = {
                    "driver": source.driver,
                    "crs": source.crs,
                    "schema": source.schema,
                }
                if source.encoding:
                    kwargs["encoding"] = source.encoding

            with fiona.collection(target, "w", **kwargs) as out:
                for source_path in source_paths:
                    with fiona.collection(source_path, "r") as source:
                        out.writerecords(v for k, v in source.items())
                    if remove_source:
                        os.remove(source_path)

            if remove_source:
                try:
                    os.rmdir(path)
                except OSError:  # directory not empty: do nothing
                    pass
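
A minimal usage sketch of merge_files (the paths are illustrative; the call style matches Example 8 below, where it is invoked as GeometryFileSink.merge_files):

    # merge all per-tile shapefiles in /tmp/tiles into a single target file,
    # removing the source tiles afterwards (illustrative paths)
    GeometryFileSink.merge_files("/tmp/tiles", "/data/merged.shp", remove_source=True)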
Example 2
 def gdal_dataset(self):
     try:
         return self._gdal_dataset
     except AttributeError:
         path = utils.safe_abspath(self.url)
         self._gdal_dataset = gdal.Open(path)
         return self._gdal_dataset
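
The same open-once caching could also be written with functools.cached_property (a sketch assuming Python 3.8+; the enclosing class name is hypothetical):

    from functools import cached_property

    class RasterSource:  # hypothetical enclosing class
        @cached_property
        def gdal_dataset(self):
            # gdal.Open runs once; the result is cached on the instance
            return gdal.Open(utils.safe_abspath(self.url))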
Example 3
    def process(data, process_kwargs):
        if "features" not in data or len(data["features"]) == 0:
            return data  # do nothing for non-feature or empty requests

        features = data["features"].copy()
        projection = data["projection"]
        path = utils.safe_abspath(process_kwargs["url"])
        fields = process_kwargs["fields"]
        extension = process_kwargs["extension"]
        driver = GeometryFileSink.supported_extensions[extension]

        # generate the directory if necessary
        os.makedirs(path, exist_ok=True)

        # the target filename is a deterministic hash of the request
        filename = ".".join([process_kwargs["hash"], extension])

        # add the index to the columns if necessary
        index_name = features.index.name
        if index_name in fields.values() and index_name not in features.columns:
            features[index_name] = features.index

        # select the relevant columns (this returns a copy)
        features = features[["geometry"] + list(fields.values())]

        # rename the columns
        features.columns = ["geometry"] + list(fields.keys())

        # generate the file
        features.to_file(os.path.join(path, filename), driver=driver)

        result = geopandas.GeoDataFrame(index=features.index)
        result["saved"] = True
        return {"features": result, "projection": projection}
Example 4
    def process(data, process_kwargs):
        if "features" not in data or len(data["features"]) == 0:
            return data  # do nothing for non-feature or empty requests

        features = data["features"].copy()
        projection = data["projection"]
        path = utils.safe_abspath(process_kwargs["url"])
        fields = process_kwargs["fields"]
        extension = process_kwargs["extension"]
        driver = GeometryFileSink.supported_extensions[extension]

        # generate the directory if necessary
        os.makedirs(path, exist_ok=True)

        # the target filename is a deterministic hash of the request
        filename = ".".join([process_kwargs["hash"], extension])

        # add the index to the columns if necessary
        index_name = features.index.name
        if index_name in fields.values() and index_name not in features.columns:
            features[index_name] = features.index

        # select the relevant columns (this returns a copy)
        features = features[["geometry"] + list(fields.values())]

        # rename the columns
        features.columns = ["geometry"] + list(fields.keys())

        # serialize nested fields (lists or dicts)
        for col in fields.keys():
            series = features[col]
            if series.dtype == object or (
                str(series.dtype) == "category"
                and series.cat.categories.dtype == object
            ):
                features[col] = series.map(_to_json)

        # convert categoricals
        for col in fields.keys():
            series = features[col]
            if str(series.dtype) == "category":
                features[col] = series.astype(series.cat.categories.dtype)

        # GeoJSON needs reprojection to EPSG:4326
        if driver == "GeoJSON" and projection.upper() != "EPSG:4326":
            features = utils.geodataframe_transform(features, projection,
                                                    "EPSG:4326")

        # generate the file
        features.to_file(os.path.join(path, filename), driver=driver)

        result = geopandas.GeoDataFrame(index=features.index)
        result["saved"] = True
        return {"features": result, "projection": projection}
Example 5
    def process(process_kwargs):
        mode = process_kwargs["mode"]

        # handle empty requests
        if mode == "empty_vals":
            return
        elif mode == "empty_time":
            return {"time": []}
        elif mode == "empty_meta":
            return {"meta": []}

        # handle time requests
        if mode == "time":
            start = process_kwargs["start"]
            length = process_kwargs["length"]
            delta = process_kwargs["delta"]
            return {"time": [start + i * delta for i in range(length)]}

        # open the dataset
        url = process_kwargs["url"]
        path = utils.safe_abspath(url)
        dataset = gdal.Open(path)
        first_band = process_kwargs["first_band"]
        last_band = process_kwargs["last_band"]

        # handle meta requests
        if mode == "meta":
            return {
                "meta": [
                    dataset.GetRasterBand(i + 1).GetMetadata_Dict()
                    for i in range(first_band, last_band + 1)
                ]
            }

        # handle 'vals' requests
        dtype = process_kwargs["dtype"]
        no_data_value = process_kwargs["fillvalue"]
        bbox = process_kwargs["bbox"]
        width = process_kwargs["width"]
        height = process_kwargs["height"]
        length = last_band - first_band + 1

        # return an empty array if 0-sized data was requested
        if width == 0 or height == 0:
            return np.empty((length, height, width), dtype=dtype)

        # transform the requested bounding box to indices into the array
        shape = dataset.RasterCount, dataset.RasterYSize, dataset.RasterXSize
        gt = utils.GeoTransform(dataset.GetGeoTransform())
        ranges, padding = gt.get_array_ranges(bbox, shape)
        read_shape = [rng[1] - rng[0] for rng in ranges]

        # return nodata immediately if the read range is empty
        if any(x <= 0 for x in read_shape):
            result = np.full(
                shape=(length, height, width), fill_value=no_data_value, dtype=dtype
            )
            return {"values": result, "no_data_value": no_data_value}

        # read arrays from file
        result = np.empty([length] + read_shape, dtype=dtype)
        for k in range(length):
            band = dataset.GetRasterBand(first_band + k + 1)
            result[k] = band.ReadAsArray(
                int(ranges[1][0]),
                int(ranges[0][0]),
                int(read_shape[1]),
                int(read_shape[0]),
            )

        # pad the data to the shape given by the index
        if padding is not None:
            padding = ((0, 0),) + padding  # for the time axis
            result = np.pad(result, padding, "constant", constant_values=no_data_value)

        # zoom to the desired height and width
        result = utils.zoom_raster(result, no_data_value, height, width)

        # fill nan values if they popped up
        result[~np.isfinite(result)] = no_data_value
        return {"values": result, "no_data_value": no_data_value}
Example 6
    def process(url, request):
        path = utils.safe_abspath(url)

        # convert the requested projection to a geopandas CRS
        crs = utils.get_crs(request["projection"])

        # convert the requested shapely geometry object to a GeoSeries
        filt_geom = gpd.GeoSeries([request["geometry"]], crs=crs)

        # acquire the data, filtering on the filt_geom bbox
        f = gpd.GeoDataFrame.from_file(path,
                                       bbox=filt_geom,
                                       layer=request["layer"])
        if len(f) == 0:
            # return directly if there is no data
            if request.get("mode") == "extent":
                return {"projection": request["projection"], "extent": None}
            else:  # this takes modes 'centroid' and 'intersects'
                return {
                    "projection": request["projection"],
                    "features": gpd.GeoDataFrame([]),
                }

        f.set_index(request["id_field"], inplace=True)

        # apply the non-geometry field filters first
        mask = None
        for field, value in request["filters"].items():
            if field not in f.columns:
                continue
            _mask = f[field] == value
            if mask is None:
                mask = _mask
            else:
                mask &= _mask
        if mask is not None:
            f = f[mask]

        # convert the data to the requested crs
        utils.geodataframe_transform(f, utils.crs_to_srs(f.crs),
                                     request["projection"])

        # compute the bounds of each geometry and filter on min_size
        min_size = request.get("min_size")
        if min_size:
            bounds = f["geometry"].bounds
            widths = bounds["maxx"] - bounds["minx"]
            heights = bounds["maxy"] - bounds["miny"]
            f = f[(widths > min_size) | (heights > min_size)]

        # only return geometries that truly intersect the requested geometry
        if request["mode"] == "centroid":
            with warnings.catch_warnings():  # geopandas warns if in WGS84
                warnings.simplefilter("ignore")
                f = f[f["geometry"].centroid.within(filt_geom.iloc[0])]
        else:
            f = f[f["geometry"].intersects(filt_geom.iloc[0])]

        if request.get("mode") == "extent":
            return {
                "projection": request["projection"],
                "extent": tuple(f.total_bounds),
            }
        else:  # this takes modes 'centroid' and 'intersects'
            # truncate the number of geometries if necessary
            if request.get("limit") and len(f) > request["limit"]:
                f = f.iloc[:request["limit"]]
            elif request.get("limit") is None:
                global_limit = config.get("geomodeling.geometry-limit")
                if len(f) > global_limit:
                    raise RuntimeError(
                        "The amount of returned geometries exceeded "
                        "the maximum of {} geometries.".format(global_limit))

            return {"projection": request["projection"], "features": f}
Example 7
 def path(self):
     return utils.safe_abspath(self.url)
Example 8
def to_file(source,
            url,
            fields=None,
            tile_size=None,
            dry_run=False,
            **request):
    """Utility function to export data from a GeometryBlock to a file on disk.

    You need to specify the target file path as well as the extent geometry
    you want to save.

    Args:
      source (GeometryBlock): the block the data is coming from
      url (str): The target file path. The extension determines the format. For
        supported formats, consult GeometryFileSink.supported_extensions.
      fields (dict): a mapping that relates column names to output file field
        names, ``{<output file field name>: <column name>, ...}``.
      tile_size (int): Optionally use this for large exports to stay within
        memory constraints. The export is split into tiles of the given size
        (units are determined by the projection); afterwards the tiles are
        merged.
      dry_run (bool): Do nothing, only validate the arguments.
      geometry (shapely Geometry): Limit exported objects to objects whose
        centroid intersects with this geometry.
      projection (str): The projection as a WKT string or EPSG code.
        Sets the projection of the geometry argument, the target
        projection of the data, and the tiling projection.
      mode (str): one of ``{"intersects", "centroid"}``, default "centroid"
      start (datetime): start date as UTC datetime
      stop (datetime): stop date as UTC datetime
      **request: see GeometryBlock request specification

    Relevant settings can be adapted as follows:
      >>> from dask import config
      >>> config.set({"geomodeling.root": '/my/output/data/path'})
      >>> config.set({"temporary_directory": '/my/alternative/tmp/dir'})
    """
    if "mode" not in request:
        request["mode"] = "centroid"

    path = utils.safe_abspath(url)
    extension = os.path.splitext(path)[1]

    TmpDir = DryRunTempDir if dry_run else tempfile.TemporaryDirectory
    with TmpDir(dir=config.get("temporary_directory", None)) as tmpdir:
        sink = GeometryFileSink(source,
                                tmpdir,
                                extension=extension,
                                fields=fields)

        # wrap the sink in a GeometryTiler
        if tile_size is not None:
            sink = GeometryTiler(sink, tile_size, request["projection"])

        if dry_run:
            return

        # export the dataset to the tmpdir (full dataset or multiple tiles)
        sink.get_data(**request)

        # copy the file / gather the tiles to the target location
        GeometryFileSink.merge_files(tmpdir, path)
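
A minimal usage sketch (the source block, paths, and field mapping are illustrative):

    from shapely.geometry import box

    to_file(
        source=parcels_block,              # a hypothetical GeometryBlock
        url="/data/export.geojson",        # the extension selects the driver
        fields={"name": "field_a"},        # {<output field>: <column>, ...}
        geometry=box(0, 0, 10000, 10000),  # extent to export
        projection="EPSG:28992",
    )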