Example #1
    def test_meta(self):
        assert not is_greater_or_equal_version("1.1.5", "1.2.0")
        assert is_greater_or_equal_version("1.2.0", "1.1.5")
        assert is_greater_or_equal_version("1.3.5", "1.2.0")
        assert not is_greater_or_equal_version("", "1.2.0")
        assert not is_greater_or_equal_version("1.3.5", "")
        SedonaMeta.version = "1.2.0"
        assert SedonaMeta.version == "1.2.0"
        SedonaMeta.version = "1.3.0"
        assert SedonaMeta.version == "1.3.0"
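
A minimal sketch of what a helper like is_greater_or_equal_version could look like, inferred from the assertions above. This is an assumption, not the actual Sedona implementation; note that the test expects an empty version string on either side to compare as False:

def is_greater_or_equal_version(version_a: str, version_b: str) -> bool:
    # Empty version strings never satisfy the comparison (see the asserts above).
    if not version_a or not version_b:
        return False
    # Split dotted versions into integer components; Python compares
    # lists lexicographically, which matches version ordering here.
    return [int(part) for part in version_a.split(".")] >= \
           [int(part) for part in version_b.split(".")]
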
Example #2
    def test_read_to_valid_geometry_rdd(self):
        if is_greater_or_equal_version(SedonaMeta.version, "1.0.0"):
            geo_json_rdd = GeoJsonReader.readToGeometryRDD(
                self.sc,
                geo_json_geom_with_feature_property,
                True,
                False
            )

            assert geo_json_rdd.rawSpatialRDD.count() == 1001

            geo_json_rdd = GeoJsonReader.readToGeometryRDD(
                self.sc,
                geo_json_geom_without_feature_property,
                True,
                False
            )

            assert geo_json_rdd.rawSpatialRDD.count() == 10

            geo_json_rdd = GeoJsonReader.readToGeometryRDD(
                self.sc,
                geo_json_with_invalid_geometries,
                False,
                False
            )

            assert geo_json_rdd.rawSpatialRDD.count() == 2

            geo_json_rdd = GeoJsonReader.readToGeometryRDD(
                self.sc,
                geo_json_with_invalid_geometries
            )
            assert geo_json_rdd.rawSpatialRDD.count() == 3
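
The examples above run inside a test harness where self.sc is already provided. Below is a minimal standalone sketch of the same reader call; the file path and app name are placeholders, and the import paths and SedonaRegistrator call are assumptions based on pre-1.4 Sedona Python releases:

from pyspark.sql import SparkSession
from sedona.core.formatMapper import GeoJsonReader
from sedona.register import SedonaRegistrator

spark = SparkSession.builder.appName("geojson-demo").getOrCreate()
SedonaRegistrator.registerAll(spark)

# The third and fourth arguments mirror the tests above:
# allowInvalidGeometries=True, skipSyntacticallyInvalidGeometries=False.
geo_json_rdd = GeoJsonReader.readToGeometryRDD(
    spark.sparkContext, "/path/to/features.json", True, False)
print(geo_json_rdd.rawSpatialRDD.count())
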
Example #3
    def test_read_to_geometry_rdd_invalid_syntax(self):
        if is_greater_or_equal_version(SedonaMeta.version, "1.2.0"):
            geojson_rdd = GeoJsonReader.readToGeometryRDD(
                self.sc, geo_json_with_invalid_geom_with_feature_property,
                False, True)

            assert geojson_rdd.rawSpatialRDD.count() == 1
Example #4
    def test_read_to_geometry_rdd(self):
        print(SedonaMeta.version)
        if is_greater_or_equal_version(SedonaMeta.version, "1.2.0"):
            geo_json_rdd = GeoJsonReader.readToGeometryRDD(
                self.sc, geo_json_geom_with_feature_property)

            assert geo_json_rdd.rawSpatialRDD.count() == 1001

            geo_json_rdd = GeoJsonReader.readToGeometryRDD(
                self.sc, geo_json_geom_without_feature_property)

            assert geo_json_rdd.rawSpatialRDD.count() == 10
Example #5
    def test_read_to_include_id_rdd(self):
        if is_greater_or_equal_version(SedonaMeta.version, "1.2.0"):
            # Positional form of the call; the result is immediately replaced
            # by the equivalent keyword-argument form below.
            geo_json_rdd = GeoJsonReader.readToGeometryRDD(
                self.sc, geo_json_contains_id, True, False)

            geo_json_rdd = GeoJsonReader.readToGeometryRDD(
                sc=self.sc,
                inputPath=geo_json_contains_id,
                allowInvalidGeometries=True,
                skipSyntacticallyInvalidGeometries=False)
            assert geo_json_rdd.rawSpatialRDD.count() == 1
            # The number of attribute fields differs across Sedona versions.
            try:
                assert len(geo_json_rdd.fieldNames) == 2
            except AssertionError:
                assert len(geo_json_rdd.fieldNames) == 3
Example #6
class TestAdapter(TestBase):
    def test_read_csv_point_into_spatial_rdd(self):
        df = self.spark.read.\
            format("csv").\
            option("delimiter", "\t").\
            option("header", "false").\
            load(area_lm_point_input_location)

        df.show()
        df.createOrReplaceTempView("inputtable")

        spatial_df = self.spark.sql(
            "select ST_PointFromText(inputtable._c0,\",\") as arealandmark from inputtable"
        )
        spatial_df.show()
        spatial_df.printSchema()

        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "arealandmark")
        spatial_rdd.analyze()
        Adapter.toDf(spatial_rdd, self.spark).show()

    def test_read_csv_point_into_spatial_rdd_by_passing_coordinates(self):
        df = self.spark.read.format("csv").\
            option("delimiter", ",").\
            option("header", "false").\
            load(area_lm_point_input_location)

        df.show()
        df.createOrReplaceTempView("inputtable")

        spatial_df = self.spark.sql(
            "select ST_Point(cast(inputtable._c0 as Decimal(24,20)),cast(inputtable._c1 as Decimal(24,20))) as arealandmark from inputtable"
        )

        spatial_df.show()
        spatial_df.printSchema()

    def test_read_csv_point_into_spatial_rdd_with_unique_id_by_passing_coordinates(
            self):
        df = self.spark.read.format("csv").\
            option("delimiter", ",").\
            option("header", "false").\
            load(area_lm_point_input_location)

        df.show()
        df.createOrReplaceTempView("inputtable")

        spatial_df = self.spark.sql(
            "select ST_Point(cast(inputtable._c0 as Decimal(24,20)),cast(inputtable._c1 as Decimal(24,20))) as arealandmark from inputtable"
        )

        spatial_df.show()
        spatial_df.printSchema()

    def test_read_mixed_wkt_geometries_into_spatial_rdd(self):
        df = self.spark.read.format("csv").\
            option("delimiter", "\t").\
            option("header", "false").load(mixed_wkt_geometry_input_location)

        df.show()
        df.createOrReplaceTempView("inputtable")
        spatial_df = self.spark.sql(
            "select ST_GeomFromWKT(inputtable._c0) as usacounty from inputtable"
        )
        spatial_df.show()
        spatial_df.printSchema()
        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "usacounty")
        spatial_rdd.analyze()
        Adapter.toDf(spatial_rdd, self.spark).show()
        assert len(Adapter.toDf(spatial_rdd, self.spark).columns) == 1
        Adapter.toDf(spatial_rdd, self.spark).show()

    def test_read_mixed_wkt_geometries_into_spatial_rdd_with_unique_id(self):
        df = self.spark.read.format("csv").\
            option("delimiter", "\t").\
            option("header", "false").\
            load(mixed_wkt_geometry_input_location)

        df.show()
        df.createOrReplaceTempView("inputtable")

        spatial_df = self.spark.sql(
            "select ST_GeomFromWKT(inputtable._c0) as usacounty, inputtable._c3, inputtable._c5 from inputtable"
        )
        spatial_df.show()
        spatial_df.printSchema()

        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "usacounty")
        spatial_rdd.analyze()
        assert len(Adapter.toDf(spatial_rdd, self.spark).columns) == 3
        Adapter.toDf(spatial_rdd, self.spark).show()

    def test_read_shapefile_to_dataframe(self):
        spatial_rdd = ShapefileReader.readToGeometryRDD(
            self.spark.sparkContext, shape_file_input_location)
        spatial_rdd.analyze()
        logging.info(spatial_rdd.fieldNames)
        df = Adapter.toDf(spatial_rdd, self.spark)
        df.show()

    def test_read_shapefile_with_missing_to_dataframe(self):
        spatial_rdd = ShapefileReader.\
            readToGeometryRDD(self.spark.sparkContext, shape_file_with_missing_trailing_input_location)

        spatial_rdd.analyze()
        logging.info(spatial_rdd.fieldNames)

        df = Adapter.toDf(spatial_rdd, self.spark)
        df.show()

    def test_geojson_to_dataframe(self):
        spatial_rdd = PolygonRDD(self.spark.sparkContext,
                                 geojson_input_location,
                                 FileDataSplitter.GEOJSON, True)

        spatial_rdd.analyze()
        Adapter.toDf(spatial_rdd, self.spark).show()
        df = Adapter.toDf(spatial_rdd, self.spark)

        assert (df.columns[1] == "STATEFP")

    def test_convert_spatial_join_result_to_dataframe(self):
        polygon_wkt_df = self.spark.read.format("csv").option(
            "delimiter",
            "\t").option("header",
                         "false").load(mixed_wkt_geometry_input_location)
        polygon_wkt_df.createOrReplaceTempView("polygontable")

        polygon_df = self.spark.sql(
            "select ST_GeomFromWKT(polygontable._c0) as usacounty from polygontable"
        )
        polygon_rdd = Adapter.toSpatialRdd(polygon_df, "usacounty")

        polygon_rdd.analyze()

        point_csv_df = self.spark.read.format("csv").option(
            "delimiter",
            ",").option("header", "false").load(area_lm_point_input_location)
        point_csv_df.createOrReplaceTempView("pointtable")

        point_df = self.spark.sql(
            "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
        )

        point_rdd = Adapter.toSpatialRdd(point_df, "arealandmark")
        point_rdd.analyze()

        point_rdd.spatialPartitioning(GridType.QUADTREE)
        polygon_rdd.spatialPartitioning(point_rdd.getPartitioner())

        point_rdd.buildIndex(IndexType.QUADTREE, True)

        join_result_point_rdd = JoinQuery.\
            SpatialJoinQueryFlat(point_rdd, polygon_rdd, True, True)

        join_result_df = Adapter.toDf(join_result_point_rdd, self.spark)
        join_result_df.show()

        join_result_df2 = Adapter.toDf(join_result_point_rdd, ["abc", "def"],
                                       list(), self.spark)
        join_result_df2.show()

    def test_distance_join_result_to_dataframe(self):
        point_csv_df = self.spark.\
            read.\
            format("csv").\
            option("delimiter", ",").\
            option("header", "false").load(
                area_lm_point_input_location
        )
        point_csv_df.createOrReplaceTempView("pointtable")
        point_df = self.spark.sql(
            "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
        )

        point_rdd = Adapter.toSpatialRdd(point_df, "arealandmark")
        point_rdd.analyze()

        polygon_wkt_df = self.spark.read.\
            format("csv").\
            option("delimiter", "\t").\
            option("header", "false").load(
                mixed_wkt_geometry_input_location
        )

        polygon_wkt_df.createOrReplaceTempView("polygontable")
        polygon_df = self.spark.\
            sql("select ST_GeomFromWKT(polygontable._c0) as usacounty from polygontable")

        polygon_rdd = Adapter.toSpatialRdd(polygon_df, "usacounty")
        polygon_rdd.analyze()
        circle_rdd = CircleRDD(polygon_rdd, 0.2)

        point_rdd.spatialPartitioning(GridType.QUADTREE)
        circle_rdd.spatialPartitioning(point_rdd.getPartitioner())

        point_rdd.buildIndex(IndexType.QUADTREE, True)

        join_result_pair_rdd = JoinQuery.\
            DistanceJoinQueryFlat(point_rdd, circle_rdd, True, True)

        join_result_df = Adapter.toDf(join_result_pair_rdd, self.spark)
        join_result_df.printSchema()
        join_result_df.show()

    def test_load_id_column_data_check(self):
        spatial_rdd = PolygonRDD(self.spark.sparkContext,
                                 geojson_id_input_location,
                                 FileDataSplitter.GEOJSON, True)
        spatial_rdd.analyze()
        df = Adapter.toDf(spatial_rdd, self.spark)
        df.show()
        # The column count differs across Sedona versions.
        try:
            assert len(df.columns) == 3
        except AssertionError:
            assert len(df.columns) == 4
        assert df.count() == 1

    def _create_spatial_point_table(self) -> DataFrame:
        df = self.spark.read.\
            format("csv").\
            option("delimiter", "\t").\
            option("header", "false").\
            load(area_lm_point_input_location)

        df.createOrReplaceTempView("inputtable")

        spatial_df = self.spark.sql(
            "select ST_PointFromText(inputtable._c0,\",\") as geom from inputtable"
        )

        return spatial_df

    def test_to_spatial_rdd_df_and_geom_field_name(self):
        spatial_df = self._create_spatial_point_table()

        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "geom")
        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "s")
        spatial_rdd.analyze()

        assert spatial_rdd.approximateTotalCount == 121960
        assert spatial_rdd.boundaryEnvelope == Envelope(
            -179.147236, 179.475569, -14.548699, 71.35513400000001)

    def test_to_spatial_rdd_df(self):
        spatial_df = self._create_spatial_point_table()

        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "geometry")

        spatial_rdd.analyze()

        assert spatial_rdd.approximateTotalCount == 121960
        assert spatial_rdd.boundaryEnvelope == Envelope(
            -179.147236, 179.475569, -14.548699, 71.35513400000001)

    @pytest.mark.skipif(is_greater_or_equal_version(version, "1.0.0"),
                        reason="Deprecated in Sedona")
    def test_to_spatial_rdd_df_geom_column_id(self):
        df = self.spark.read.\
            format("csv").\
            option("delimiter", "\t").\
            option("header", "false").\
            load(mixed_wkt_geometry_input_location)

        df_shorter = df.select(
            col("_c0").alias("geom"),
            col("_c6").alias("county_name"))
        df_shorter.createOrReplaceTempView("county_data")

        spatial_df = self.spark.sql(
            "SELECT ST_GeomFromWKT(geom) as geom, county_name FROM county_data"
        )
        spatial_df.show()

    def test_to_df_srdd_fn_spark(self):
        spatial_rdd = PolygonRDD(self.spark.sparkContext,
                                 geojson_input_location,
                                 FileDataSplitter.GEOJSON, True)
        spatial_rdd.analyze()
        assert spatial_rdd.approximateTotalCount == 1001

        spatial_columns = [
            "state_id", "county_id", "tract_id", "bg_id", "fips", "fips_short",
            "bg_nr", "type", "code1", "code2"
        ]
        spatial_df = Adapter.toDf(spatial_rdd, spatial_columns, self.spark)

        spatial_df.show()

        assert spatial_df.columns == ["geometry", *spatial_columns]
        assert spatial_df.count() == 1001
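
All of the examples assume a shared TestBase exposing self.spark and self.sc, plus module-level imports from the Sedona Python package. Below is a minimal sketch of such a harness; the master, app name, and exact import paths are assumptions (based on pre-1.4 Sedona releases), not the project's actual test fixtures:

from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils.adapter import Adapter
from sedona.core.formatMapper import GeoJsonReader
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.core.SpatialRDD import PolygonRDD, CircleRDD
from sedona.core.enums import FileDataSplitter, GridType, IndexType
from sedona.core.spatialOperator import JoinQuery
from sedona.core.geom.envelope import Envelope

class TestBase:
    # local[*] master and app name are placeholders for a local run.
    spark = SparkSession.builder.\
        master("local[*]").\
        appName("sedona-examples").\
        getOrCreate()
    # Registers the ST_ SQL functions used throughout the examples.
    SedonaRegistrator.registerAll(spark)
    sc = spark.sparkContext
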