Example #1
class SpatialKeySchemaTest(BaseTestClass):
    expected_keys = {'col': 7, 'row': 3}

    sc = BaseTestClass.pysc._jsc.sc()
    ew = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.SpatialKeyWrapper

    java_rdd = ew.testOut(sc)
    ser = ProtoBufSerializer(spatial_key_decoder, spatial_key_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.first()._asdict()

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def result_checker(self, actual_keys, expected_keys):
        self.assertDictEqual(actual_keys, expected_keys)

    def test_encoded_keys(self):
        actual_encoded = [spatial_key_encoder(x) for x in self.rdd.collect()]
        proto_spatial_key = keyMessages_pb2.ProtoSpatialKey()

        proto_spatial_key.col = 7
        proto_spatial_key.row = 3

        expected_encoded = proto_spatial_key.SerializeToString()

        self.assertEqual(actual_encoded[0], expected_encoded)

    def test_decoded_keys(self):
        self.assertDictEqual(self.collected, self.expected_keys)


class ByteTileSchemaTest(BaseTestClass):
    tiles = [
        Tile.from_numpy_array(np.int8([0, 0, 1, 1]).reshape(2, 2), -128),
        Tile.from_numpy_array(np.int8([1, 2, 3, 4]).reshape(2, 2), -128),
        Tile.from_numpy_array(np.int8([5, 6, 7, 8]).reshape(2, 2), -128)
    ]

    sc = BaseTestClass.pysc._jsc.sc()
    tw = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.ByteArrayTileWrapper

    java_rdd = tw.testOut(sc)
    ser = ProtoBufSerializer(tile_decoder, tile_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    def test_encoded_tiles(self):
        expected_encoded = [to_pb_tile(x) for x in self.collected]

        for actual, expected in zip(self.tiles, expected_encoded):
            cells = actual.cells
            rows, cols = cells.shape

            self.assertEqual(expected.cols, cols)
            self.assertEqual(expected.rows, rows)
            self.assertEqual(expected.cellType.nd, actual.no_data_value)
            self.assertEqual(expected.cellType.dataType,
                             mapped_data_types[actual.cell_type])

    def test_decoded_tiles(self):
        for actual, expected in zip(self.collected, self.tiles):
            self.assertTrue((actual.cells == expected.cells).all())
            self.assertTrue(actual.cells.dtype == expected.cells.dtype)
            self.assertEqual(actual.cells.shape, expected.cells.shape)


class TupleSchemaTest(BaseTestClass):
    extent = {
        'epsg': 2004,
        'extent': {
            'xmax': 1.0,
            'xmin': 0.0,
            'ymax': 1.0,
            'ymin': 0.0
        },
        'proj4': None
    }

    arr = np.int8([0, 0, 1, 1]).reshape(2, 2)
    bands = [arr, arr, arr]
    multiband_tile = np.array(bands)
    multiband_dict = Tile(multiband_tile, 'BYTE', -128)

    sc = BaseTestClass.pysc._jsc.sc()
    ew = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.TupleWrapper

    java_rdd = ew.testOut(sc)

    decoder = create_partial_tuple_decoder(key_type="ProjectedExtent")
    encoder = create_partial_tuple_encoder(key_type="ProjectedExtent")

    ser = ProtoBufSerializer(decoder, encoder)
    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    @pytest.mark.skipif(
        'TRAVIS' in os.environ,
        reason="Encoding using methods in Main causes issues on Travis")
    def test_encoded_tuples(self):
        proto_tuple = tupleMessages_pb2.ProtoTuple()

        self.extent['extent'] = Extent(**self.extent['extent'])
        proto_extent = to_pb_projected_extent(ProjectedExtent(**self.extent))
        proto_multiband = to_pb_multibandtile(self.multiband_dict)

        proto_tuple.projectedExtent.CopyFrom(proto_extent)
        proto_tuple.tiles.CopyFrom(proto_multiband)

        bs = proto_tuple.SerializeToString()
        actual_encoded = [self.ser.dumps(x) for x in self.collected]

        for actual in actual_encoded:
            self.assertEqual(bs, actual)

    def test_decoded_tuples(self):
        expected_tuples = [(self.extent, self.multiband_dict),
                           (self.extent, self.multiband_dict),
                           (self.extent, self.multiband_dict)]

        for actual, expected in zip(self.collected, expected_tuples):
            (actual_extent, actual_tile) = actual
            (expected_extent, expected_tile) = expected

            self.assertTrue((actual_tile.cells == expected_tile.cells).all())
            self.assertDictEqual(actual_extent._asdict(), expected_extent)


class TemporalProjectedExtentSchemaTest(BaseTestClass):
    extents = [
        Extent(0.0, 0.0, 1.0, 1.0),
        Extent(1.0, 2.0, 3.0, 4.0),
        Extent(5.0, 6.0, 7.0, 8.0),
    ]

    time = datetime.datetime.strptime("2016-08-24T09:00:00Z",
                                      '%Y-%m-%dT%H:%M:%SZ')

    expected_tpextents = [
        TemporalProjectedExtent(epsg=2004, extent=extents[0],
                                instant=time)._asdict(),
        TemporalProjectedExtent(epsg=2004, extent=extents[1],
                                instant=time)._asdict(),
        TemporalProjectedExtent(epsg=2004, extent=extents[2],
                                instant=time)._asdict()
    ]

    sc = BaseTestClass.pysc._jsc.sc()
    ew = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.TemporalProjectedExtentWrapper

    java_rdd = ew.testOut(sc)
    ser = ProtoBufSerializer(temporal_projected_extent_decoder,
                             temporal_projected_extent_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = [tpex._asdict() for tpex in rdd.collect()]

    @pytest.fixture(scope='class', autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def result_checker(self, actual_tpe, expected_tpe):
        for actual, expected in zip(actual_tpe, expected_tpe):
            self.assertDictEqual(actual, expected)

    def test_encoded_tpextents(self):
        actual_encoded = [
            temporal_projected_extent_encoder(x) for x in self.rdd.collect()
        ]

        for tpex in self.expected_tpextents:
            tpex['extent'] = Extent(**tpex['extent'])

        expected_encoded = [
            to_pb_temporal_projected_extent(TemporalProjectedExtent(**ex)).SerializeToString()
            for ex in self.expected_tpextents
        ]

        for actual, expected in zip(actual_encoded, expected_encoded):
            self.assertEqual(actual, expected)

    def test_decoded_tpextents(self):
        self.result_checker(self.collected, self.expected_tpextents)
Example #5
def get(uri,
        extensions=['.shp', '.SHP'],
        num_partitions=None,
        s3_client=DEFAULT_S3_CLIENT):
    """Creates an ``RDD[Feature]`` from Shapefile(s) that are located on the local file system, ``HDFS``,
    or ``S3``.

    The ``properties`` of the ``Feature``\s in the ``RDD`` will contain the attributes of their
    respective geometry in a ``dict``. All keys and values of each ``dict`` will be ``str``\s regardless
    of how the attribute is represented in the Shapefile.

    Note:
        This feature is currently experimental and will most likely change in the coming versions of
        GPS.

    Note:
        When reading from S3, the desired files **must** be publicly readable. Otherwise, you will
        get 403 errors.

        Due to the nature of how GPS reads Shapefile(s) from S3, the ``mock`` S3 Client cannot
        currently be used.

    Args:
        uri (str or [str]): The path or list of paths to the desired Shapefile(s)/directory(ies).
        extensions ([str], optional): A list of the extensions that the Shapefile(s) have.
            These are ``.shp`` and ``.SHP`` by default.
        num_partitions (int, optional): The number of partitions Spark
            will make when the ``RDD`` is created. If ``None``, then the
            ``defaultParallelism`` will be used.
        s3_client (str, optional): Which ``S3Client`` to use when reading
            Shapefiles from S3. There are currently two options: ``default`` and
            ``mock``. Defaults to :const:`~geopyspark.geotrellis.constants.DEFAULT_S3_CLIENT`.

            Note:
                ``mock`` should only be used in unit tests and debugging.

    Returns:
        ``RDD[:class:`~geopyspark.geotrellis.Feature`]``
    """

    pysc = get_spark_context()

    num_partitions = num_partitions or pysc.defaultParallelism

    shapefile = pysc._gateway.jvm.geopyspark.geotools.shapefile.ShapefileRDD

    if isinstance(uri, (list, tuple)):
        jrdd = shapefile.get(pysc._jsc.sc(), uri, extensions, num_partitions,
                             s3_client)
    else:
        jrdd = shapefile.get(pysc._jsc.sc(), [uri], extensions, num_partitions,
                             s3_client)

    ser = ProtoBufSerializer(feature_decoder, None)

    return create_python_rdd(jrdd, ser)
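For reference, a minimal usage sketch of the reader above. The URI and attribute names are placeholders, the module path in the import is an assumption, and it presumes a SparkContext has already been started through geopyspark.

# Hypothetical usage of the Shapefile reader above (placeholder URI).
# Assumes a SparkContext was already created (e.g. geopyspark_conf + SparkContext)
# and, when reading from S3, that the files are publicly readable.
from geopyspark.geotools.shapefile import get  # assumed module path

features = get('s3://example-public-bucket/parcels/parcels.shp')

# Each element is a Feature; its properties dict holds the attribute table of the
# geometry, with all keys and values as strings.
first = features.first()
print(first.geometry, first.properties)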
Example #6
class MultibandSchemaTest(BaseTestClass):
    arr = np.int8([0, 0, 1, 1]).reshape(2, 2)
    no_data = -128
    arr_dict = Tile(arr, 'BYTE', no_data)
    band_dicts = [arr_dict, arr_dict, arr_dict]

    bands = [arr, arr, arr]
    multiband_tile = np.array(bands)
    multiband_dict = Tile(multiband_tile, 'BYTE', no_data)

    sc = BaseTestClass.pysc._jsc.sc()
    mw = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.ArrayMultibandTileWrapper

    java_rdd = mw.testOut(sc)
    ser = ProtoBufSerializer(multibandtile_decoder, multibandtile_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def test_encoded_multibands(self):
        actual_encoded = [multibandtile_encoder(x) for x in self.collected]

        proto_tile = tileMessages_pb2.ProtoTile()
        cell_type = tileMessages_pb2.ProtoCellType()

        cell_type.nd = self.no_data
        cell_type.hasNoData = True
        cell_type.dataType = 1

        proto_tile.cols = 2
        proto_tile.rows = 2
        proto_tile.sint32Cells.extend(self.arr.flatten().tolist())
        proto_tile.cellType.CopyFrom(cell_type)

        proto_multiband = tileMessages_pb2.ProtoMultibandTile()
        proto_multiband.tiles.extend([proto_tile, proto_tile, proto_tile])
        bs = proto_multiband.SerializeToString()

        expected_encoded = [bs, bs, bs]

        for actual, expected in zip(actual_encoded, expected_encoded):
            self.assertEqual(actual, expected)

    def test_decoded_multibands(self):
        expected_multibands = [
            self.multiband_dict, self.multiband_dict, self.multiband_dict
        ]

        for actual, expected in zip(self.collected, expected_multibands):
            self.assertTrue((actual.cells == expected.cells).all())
Example #7
class FeatureCellValueSchemaTest(BaseTestClass):
    sc = BaseTestClass.pysc._jsc.sc()
    fw = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.FeatureCellValueWrapper

    java_rdd = fw.testOut(sc)
    ser = ProtoBufSerializer(feature_cellvalue_decoder,
                             feature_cellvalue_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))

    point = Point(0, 2)
    line_1 = LineString(
        [point, Point(1, 3),
         Point(2, 4),
         Point(3, 5),
         Point(4, 6)])
    line_2 = LineString(
        [Point(5, 7),
         Point(6, 8),
         Point(7, 9),
         Point(8, 10),
         Point(9, 11)])
    multi_line = MultiLineString([line_1, line_2])

    features = [
        Feature(point, CellValue(2, 1)),
        Feature(line_1, CellValue(1, 0)),
        Feature(multi_line, CellValue(1, 0))
    ]

    collected = rdd.collect()

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def test_decoder(self):
        geoms = [g.geometry for g in self.collected]
        ms = [m.properties for m in self.collected]

        for x in self.features:
            self.assertTrue(x.geometry in geoms)
            self.assertTrue(x.properties in ms)

    def test_encoder(self):
        expected_encoded = [
            to_pb_feature_cellvalue(f).SerializeToString()
            for f in self.features
        ]
        actual_encoded = [feature_cellvalue_encoder(f) for f in self.collected]

        for x in expected_encoded:
            self.assertTrue(x in actual_encoded)
Example #8
class ExtentSchemaTest(BaseTestClass):
    ew = BaseTestClass.pysc._gateway.jvm.geopyspark.geotrellis.tests.schemas.ExtentWrapper
    java_rdd = ew.testOut(BaseTestClass.pysc._jsc.sc())
    ser = ProtoBufSerializer(extent_decoder, extent_encoder)
    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    expected_extents = [{
        "xmin": 0.0,
        "ymin": 0.0,
        "xmax": 1.0,
        "ymax": 1.0
    }, {
        "xmin": 1.0,
        "ymin": 2.0,
        "xmax": 3.0,
        "ymax": 4.0
    }, {
        "xmin": 5.0,
        "ymin": 6.0,
        "xmax": 7.0,
        "ymax": 8.0
    }]

    @pytest.fixture(scope='class', autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def result_checker(self, actual_result, expected_result):
        for actual, expected in zip(actual_result, expected_result):
            self.assertDictEqual(actual, expected)

    def test_decoded_extents(self):
        actual_decoded = [
            from_pb_extent(ex)._asdict() for ex in self.collected
        ]
        self.result_checker(actual_decoded, self.expected_extents)

    def test_encoded_extents(self):
        expected_encoded = [
            to_pb_extent(Extent(**x)).SerializeToString()
            for x in self.expected_extents
        ]
        actual_encoded = [extent_encoder(x) for x in self.collected]
        for actual, expected in zip(actual_encoded, expected_encoded):
            self.assertEqual(actual, expected)


class ProjectedExtentSchemaTest(BaseTestClass):
    projected_extents = [
        {'epsg': 2004, 'extent': {'xmax': 1.0, 'xmin': 0.0, 'ymax': 1.0, 'ymin': 0.0}, 'proj4': None},
        {'epsg': 2004, 'extent': {'xmax': 3.0, 'xmin': 1.0, 'ymax': 4.0, 'ymin': 2.0}, 'proj4': None},
        {'epsg': 2004, 'extent': {'xmax': 7.0, 'xmin': 5.0, 'ymax': 8.0, 'ymin': 6.0}, 'proj4': None}]

    sc = BaseTestClass.pysc._jsc.sc()
    ew = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.ProjectedExtentWrapper

    java_rdd = ew.testOut(sc)
    ser = ProtoBufSerializer(projected_extent_decoder,
                             projected_extent_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = [pex._asdict() for pex in rdd.collect()]

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def result_checker(self, actual_pe, expected_pe):
        for actual, expected in zip(actual_pe, expected_pe):
            self.assertDictEqual(actual, expected)

    def test_encoded_pextents(self):
        actual_encoded = [projected_extent_encoder(x) for x in self.rdd.collect()]

        for pex in self.projected_extents:
            pex['extent'] = Extent(**pex['extent'])

        expected_encoded = [
            to_pb_projected_extent(ProjectedExtent(**ex)).SerializeToString() for ex in self.projected_extents
        ]

        for actual, expected in zip(actual_encoded, expected_encoded):
            self.assertEqual(actual, expected)

    def test_decoded_pextents(self):
        self.result_checker(self.collected, self.projected_extents)
Example #10
def rasterize_features(features,
                       crs,
                       zoom,
                       cell_type=CellType.FLOAT64,
                       options=None,
                       zindex_cell_type=CellType.INT8,
                       partition_strategy=None):
    """Rasterizes a collection of :class:`~geopyspark.vector_pipe.Feature`\s.

    Args:
        features (pyspark.RDD[Feature]): A Python ``RDD`` that
            contains :class:`~geopyspark.vector_pipe.Feature`\s.

            Note:
                The ``properties`` of each ``Feature`` must be an instance of
                :class:`~geopyspark.vector_pipe.CellValue`.
        crs (str or int): The CRS of the input geometry.
        zoom (int): The zoom level of the output raster.

            Note:
                Not all rasterized ``Feature``\s may be present in the resulting layer
                if the ``zoom`` is not high enough.
        cell_type (str or :class:`~geopyspark.geotrellis.constants.CellType`): Which data type the
            cells should be when created. Defaults to ``CellType.FLOAT64``.
        options (:class:`~geopyspark.geotrellis.RasterizerOptions`, optional): Pixel intersection options.
        zindex_cell_type (str or :class:`~geopyspark.geotrellis.constants.CellType`): Which data type
            the ``Z-Index`` cells are. Defaults to ``CellType.INT8``.
        partition_strategy (:class:`~geopyspark.HashPartitionStrategy` or :class:`~geopyspark.SpatialPartitionStrategy`, optional):
            Sets the ``Partitioner`` for the resulting layer and how many partitions it has.
            Defaults to ``None``.

            If ``None``, then the output layer will have the default ``Partitioner`` and a number
            of partitions determined by the method.

            If ``partition_strategy`` is set but has no ``num_partitions``, then the resulting layer
            will have the ``Partitioner`` specified in the strategy with the same number of
            partitions the source layer had.

            If ``partition_strategy`` is set and has a ``num_partitions``, then the resulting layer
            will have the ``Partitioner`` and number of partitions specified in the strategy.

    Returns:
        :class:`~geopyspark.geotrellis.layer.TiledRasterLayer`
    """

    if isinstance(crs, int):
        crs = str(crs)

    pysc = get_spark_context()
    rasterizer = pysc._gateway.jvm.geopyspark.geotrellis.SpatialTiledRasterLayer.rasterizeFeaturesWithZIndex

    ser = ProtoBufSerializer(feature_cellvalue_decoder, feature_cellvalue_encoder)
    reserialized_rdd = features._reserialize(ser)

    srdd = rasterizer(reserialized_rdd._jrdd.rdd(),
                      crs,
                      zoom,
                      CellType(cell_type).value,
                      options,
                      CellType(zindex_cell_type).value,
                      partition_strategy)

    return TiledRasterLayer(LayerType.SPATIAL, srdd)
    # Helper that wraps a Java RDD of Features with the ProtoBuf Feature serializer
    # and returns a Python RDD.
    def _get_rdd(self, jrdd):
        ser = ProtoBufSerializer(feature_decoder, feature_encoder)

        return create_python_rdd(jrdd, ser)
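A minimal usage sketch for rasterize_features. The geometries, CRS, and zoom are placeholders; the import path for Feature and CellValue is an assumption, and the positional CellValue arguments (value, then Z-index) follow the test classes shown above.

# Hypothetical usage of rasterize_features (placeholder geometries, CRS, and zoom).
from shapely.geometry import Point

from geopyspark import get_spark_context
from geopyspark.vector_pipe import Feature, CellValue  # assumed import path

pysc = get_spark_context()

features = pysc.parallelize([
    Feature(Point(0, 0).buffer(10.0), CellValue(2, 1)),    # value 2, Z-index 1
    Feature(Point(50, 50).buffer(10.0), CellValue(1, 0)),  # value 1, Z-index 0
])

# Burn the features into a tiled layer, treating the geometries as EPSG:4326.
tiled_layer = rasterize_features(features, crs=4326, zoom=11)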
Example #12
class SpaceTimeKeySchemaTest(BaseTestClass):
    time = datetime.datetime.strptime("2016-08-24T09:00:00Z",
                                      '%Y-%m-%dT%H:%M:%SZ')

    expected_keys = [
        SpaceTimeKey(7, 3, time)._asdict(),
        SpaceTimeKey(9, 4, time)._asdict(),
        SpaceTimeKey(11, 5, time)._asdict(),
    ]

    sc = BaseTestClass.pysc._jsc.sc()
    ew = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.SpaceTimeKeyWrapper

    java_rdd = ew.testOut(sc)
    ser = ProtoBufSerializer(space_time_key_decoder, space_time_key_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = [stk._asdict() for stk in rdd.collect()]

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def result_checker(self, actual_keys, expected_keys):
        for actual, expected in zip(actual_keys, expected_keys):
            self.assertDictEqual(actual, expected)

    def test_encoded_keys(self):
        actual_encoded = [
            space_time_key_encoder(x) for x in self.rdd.collect()
        ]
        expected_encoded = []

        for x in self.expected_keys:
            proto_space_time_key = keyMessages_pb2.ProtoSpaceTimeKey()

            proto_space_time_key.col = x['col']
            proto_space_time_key.row = x['row']
            proto_space_time_key.instant = _convert_to_unix_time(x['instant'])

            expected_encoded.append(proto_space_time_key.SerializeToString())

        for actual, expected in zip(actual_encoded, expected_encoded):
            self.assertEqual(actual, expected)

    def test_decoded_keys(self):
        self.result_checker(self.collected, self.expected_keys)


class FeatureSchemaTest(BaseTestClass):
    sc = BaseTestClass.pysc._jsc.sc()
    fw = BaseTestClass.pysc._jvm.geopyspark.vectorpipe.tests.schemas.FeatureWrapper

    java_rdd = fw.testOut(sc)
    ser = ProtoBufSerializer(feature_decoder, feature_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))

    metadata = Properties(element_id=1993,
                          user="******",
                          uid=19144,
                          changeset=10,
                          version=24,
                          minor_version=5,
                          timestamp=parser.parse("2012-06-05T07:00:00UTC"),
                          visible=True,
                          tags={
                              'amenity': 'embassy',
                              'diplomatic': 'embassy',
                              'country': 'azavea'
                          })

    point = Point(0, 2)
    line_1 = LineString(
        [point, Point(1, 3),
         Point(2, 4),
         Point(3, 5),
         Point(4, 6)])
    line_2 = LineString(
        [Point(5, 7),
         Point(6, 8),
         Point(7, 9),
         Point(8, 10),
         Point(9, 11)])
    multi_line = MultiLineString([line_1, line_2])

    features = [
        Feature(point, metadata),
        Feature(line_1, metadata),
        Feature(multi_line, metadata)
    ]

    collected = rdd.collect()

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def test_decoder(self):
        geoms = [g.geometry for g in self.collected]
        ms = [m.properties for m in self.collected]

        for x in self.features:
            self.assertTrue(x.geometry in geoms)
            self.assertTrue(x.properties in ms)

    def test_encoder(self):
        expected_encoded = [
            to_pb_feature(f).SerializeToString() for f in self.features
        ]
        actual_encoded = [feature_encoder(f) for f in self.collected]

        for x in expected_encoded:
            self.assertTrue(x in actual_encoded)