Example no. 1
0
    def test_point_serializer(self):
        """Round-trip two Point columns through Spark SQL and check st_distance.

        Builds a one-row DataFrame with two GeometryType columns, registers it
        as a temp view, and asserts the SQL-computed distance between the two
        points (same x, y differing by 3) is exactly 3.0.
        """
        rows = [[1, Point(21.0, 56.0), Point(21.0, 59.0)]]

        point_schema = t.StructType([
            t.StructField("id", IntegerType(), True),
            t.StructField("geom_from", GeometryType(), True),
            t.StructField("geom_to", GeometryType(), True)
        ])

        df = self.spark.createDataFrame(rows, point_schema)
        df.createOrReplaceTempView("points")

        result = self.spark.sql(
            "select st_distance(geom_from, geom_to) from points"
        ).collect()[0][0]
        assert result == 3.0
Example no. 2
0
    def test_multipolygon_serialization(self):
        """Serialize a MultiPolygon through Spark SQL and check st_area.

        One polygon is a 2x2 square (area 4) with a 0.5x0.5 hole (area 0.25),
        the other a unit square (area 1): total area 4.75.
        """
        exterior = [(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]
        # Renamed from `int`, which shadowed the builtin.
        interior = [(1, 1), (1, 1.5), (1.5, 1.5), (1.5, 1), (1, 1)]

        polygons = [
            Polygon(exterior, [interior]),
            Polygon([[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]])
        ]
        multipolygon = MultiPolygon(polygons)

        data = [
            [1, multipolygon]
        ]

        schema = t.StructType(
            [
                t.StructField("id", IntegerType(), True),
                t.StructField("geom", GeometryType(), True)
            ]
        )

        # Use self.spark for consistency with the sibling tests; the bare
        # `spark` name relied on a module-level global that may not exist.
        self.spark.createDataFrame(
            data,
            schema
        ).createOrReplaceTempView("polygon")
        # st_area returns an area, not a length.
        area = self.spark.sql(
            "select st_area(geom) from polygon").collect()[0][0]
        self.assertEqual(area, 4.75)
    def test_point_rdd(self):
        """Load a PointRDD and convert it to DataFrames with and without a schema.

        Verifies the first geometry's WKT after applying an explicit
        (geom, name) schema to the raw spatial RDD.
        """
        point_rdd = PointRDD(
            sparkContext=self.sc,
            InputLocation=crs_test_point,
            Offset=0,
            splitter=splitter,
            carryInputData=True,
            partitions=numPartitions,
            newLevel=StorageLevel.MEMORY_ONLY,
        )

        # Each record becomes [geometry, *tab-separated user data fields].
        rows = point_rdd.rawSpatialRDD.map(
            lambda record: [record.geom, *record.getUserData().split("\t")]
        )

        # Schema-less conversion: column types are inferred.
        self.spark.createDataFrame(rows).show()

        schema = StructType([
            StructField("geom", GeometryType()),
            StructField("name", StringType()),
        ])

        typed_df = self.spark.createDataFrame(rows, schema)
        typed_df.show()

        first_geom = typed_df.take(1)[0][0]
        assert first_geom.wkt == "POINT (32.324142 -88.331492)"
Example no. 4
0
    def to_bytes(cls, geom: BaseGeometry) -> List[int]:
        """Serialize a shapely geometry to its byte representation.

        Looks up the parser registered for the geometry's (lowercased) class
        name, tags the instance with the Spark GeometryType UDT, and delegates
        serialization to the parser.

        :param geom: shapely geometry instance to serialize
        :return: serialized bytes as a list of ints (parser-defined)
        :raises KeyError: if no parser is registered for the geometry type
        """
        from geo_pyspark.sql.types import GeometryType
        # __name__ is already a str; no str() wrapper needed.
        geom_name = geom.__class__.__name__.lower()

        # Keep the try body to just the lookup so unrelated KeyErrors
        # (e.g. from GeometryType construction) are not swallowed.
        try:
            appr_parser = PARSERS[geom_name]
        except KeyError:
            raise KeyError(
                f"Parser for geometry {geom_name} is not available") from None
        geom.__UDT__ = GeometryType()
        return appr_parser.serialize(geom, BinaryBuffer())
Example no. 5
0
    def test_multipoint_serializer(self):
        """Round-trip a MultiPoint through a DataFrame and compare equality."""
        source_multipoint = MultiPoint([[21.0, 56.0], [21.0, 57.0]])
        rows = [[1, source_multipoint]]

        mp_schema = t.StructType([
            t.StructField("id", IntegerType(), True),
            t.StructField("geom", GeometryType(), True)
        ])

        collected = self.spark.createDataFrame(rows, mp_schema).collect()
        round_tripped = collected[0][1]

        assert round_tripped == source_multipoint
    def test_list_to_rdd_and_df(self):
        """Create a DataFrame from a parallelized list of (geom, str, int) rows.

        Applies the explicit schema so the declared column names, types and
        non-nullability are actually exercised (previously `schema` was built
        but never passed to createDataFrame, leaving Spark to infer types).
        """
        point_data = [[Point(21, 52.0), "1", 1], [Point(22, 52.0), "2", 2],
                      [Point(23.0, 52), "3", 3], [Point(23, 54), "4", 4],
                      [Point(24.0, 56.0), "5", 5]]
        schema = StructType([
            StructField("geom", GeometryType(), False),
            StructField("id_1", StringType(), False),
            StructField("id_2", IntegerType(), False),
        ])

        rdd_data = self.spark.sparkContext.parallelize(point_data)
        df = self.spark.createDataFrame(rdd_data, schema)
        df.show()
        df.printSchema()
Example no. 7
0
    def test_multilinestring_serialization(self):
        """Serialize a MultiLineString via Spark SQL and check st_length.

        Two unit-length segments, so the total length is 2.0.
        """
        geometry = MultiLineString([[[0, 1], [1, 1]], [[2, 2], [3, 2]]])
        rows = [[1, geometry]]

        mls_schema = t.StructType([
            t.StructField("id", IntegerType(), True),
            t.StructField("geom", GeometryType(), True)
        ])

        df = self.spark.createDataFrame(rows, mls_schema)
        df.createOrReplaceTempView("multilinestring")

        total_length = self.spark.sql(
            "select st_length(geom) from multilinestring"
        ).collect()[0][0]
        assert total_length == 2.0
Example no. 8
0
    def test_linestring_serialization(self):
        """Serialize a LineString via Spark SQL and check st_length.

        A horizontal line from x=0 to x=12 at y=1 has length 12.0.
        """
        geometry = LineString([(0.0, 1.0), (1, 1), (12.0, 1.0)])
        rows = [[1, geometry]]

        line_schema = t.StructType([
            t.StructField("id", IntegerType(), True),
            t.StructField("geom", GeometryType(), True)
        ])

        df = self.spark.createDataFrame(rows, line_schema)
        df.createOrReplaceTempView("line")

        total_length = self.spark.sql(
            "select st_length(geom) from line"
        ).collect()[0][0]
        assert total_length == 12.0
Example no. 9
0
    def test_polygon_serialization(self):
        """Serialize a Polygon with a hole via Spark SQL and check st_area.

        A 2x2 square (area 4) minus a 0.5x0.5 hole (area 0.25) = 3.75.
        """
        exterior = [(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]
        # Renamed from `int`, which shadowed the builtin.
        interior = [(1, 1), (1, 1.5), (1.5, 1.5), (1.5, 1), (1, 1)]

        polygon = Polygon(exterior, [interior])

        data = [[1, polygon]]

        schema = t.StructType([
            t.StructField("id", IntegerType(), True),
            t.StructField("geom", GeometryType(), True)
        ])

        self.spark.createDataFrame(data,
                                   schema).createOrReplaceTempView("polygon")

        # st_area returns an area, not a length.
        area = self.spark.sql(
            "select st_area(geom) from polygon").collect()[0][0]
        assert area == 3.75
Example no. 10
0
def assign_udt_shapely_objects(geoms: List[type]) -> bool:
    """Attach the Spark GeometryType UDT to each given shapely geometry class.

    Setting ``__UDT__`` on a class lets PySpark serialize instances of that
    class as GeometryType columns.

    The original annotation ``List[type(BaseGeometry)]`` evaluated
    ``type(BaseGeometry)`` — the metaclass, not a usable type expression;
    ``List[type]`` correctly describes a list of classes.

    :param geoms: geometry classes (subclasses of BaseGeometry) to tag
    :return: always True
    """
    from geo_pyspark.sql.types import GeometryType
    for geom in geoms:
        geom.__UDT__ = GeometryType()
    return True