def test_range_query_flat_to_df(self):
    """Range query over an indexed point RDD converts to a DataFrame.

    Checks the raw RDD size, the geometries and schema of the DataFrame
    built without column names, and the columns when names are passed.
    """
    poi_point_rdd = WktReader.readToGeometryRDD(self.sc, bank_csv_path, 1,
                                                False, False)
    poi_point_rdd.analyze()
    poi_point_rdd.spatialPartitioning(GridType.QUADTREE)
    poi_point_rdd.buildIndex(IndexType.QUADTREE, False)

    result = RangeQueryRaw.SpatialRangeQuery(
        poi_point_rdd, loads("POLYGON((0 0, 0 20, 20 20, 20 0, 0 0))"),
        True, True)

    rdd = result.to_rdd()

    # len() instead of the dunder call rdd.collect().__len__()
    assert len(rdd.collect()) == 4

    df_without_column_names = Adapter.toDf(result, self.spark)
    raw_geometries = self.__row_to_list(df_without_column_names.collect())

    assert [point[0].wkt for point in raw_geometries] == [
        'POINT (9 8)', 'POINT (4 3)', 'POINT (12 1)', 'POINT (11 5)'
    ]
    assert df_without_column_names.count() == 4
    assert df_without_column_names.schema == StructType(
        [StructField("geometry", GeometryType())])

    # With explicit attribute column names the geometry column comes first.
    df = Adapter.toDf(result, self.spark, ["poi_id", "poi_name"])
    assert df.count() == 4
    assert df.columns == ["geometry", "poi_id", "poi_name"]
def test_spatial_join_query_flat_to_df(self):
    """Flat point-in-polygon join converts to DataFrames.

    Verifies the row count with default and user-supplied column names,
    the resulting schema, and the joined geometries (order-insensitive).
    """
    poi_point_rdd = WktReader.readToGeometryRDD(self.sc, bank_csv_path, 1,
                                                False, False)
    areas_polygon_rdd = WktReader.readToGeometryRDD(
        self.sc, areas_csv_path, 1, False, False)
    poi_point_rdd.analyze()
    areas_polygon_rdd.analyze()

    poi_point_rdd.spatialPartitioning(GridType.QUADTREE)
    areas_polygon_rdd.spatialPartitioning(poi_point_rdd.getPartitioner())

    jvm_sedona_rdd = JoinQueryRaw.SpatialJoinQueryFlat(
        poi_point_rdd, areas_polygon_rdd, False, True)

    pois_within_areas_with_default_column_names = Adapter.toDf(
        jvm_sedona_rdd, self.spark)
    assert pois_within_areas_with_default_column_names.count() == 5

    pois_within_areas_with_passed_column_names = Adapter.toDf(
        jvm_sedona_rdd, ["area_id", "area_name"], ["poi_id", "poi_name"],
        self.spark)
    assert pois_within_areas_with_passed_column_names.count() == 5
    assert pois_within_areas_with_passed_column_names.columns == [
        "leftgeometry", "area_id", "area_name", "rightgeometry", "poi_id",
        "poi_name"
    ]
    assert pois_within_areas_with_default_column_names.schema == StructType([
        StructField("leftgeometry", GeometryType()),
        StructField("rightgeometry", GeometryType()),
    ])

    # Parenthesized call chains instead of backslash continuations.
    left_geometries_raw = (pois_within_areas_with_default_column_names
                           .selectExpr("ST_AsText(leftgeometry)")
                           .collect())
    left_geometries = self.__row_to_list(left_geometries_raw)
    right_geometries_raw = (pois_within_areas_with_default_column_names
                            .selectExpr("ST_AsText(rightgeometry)")
                            .collect())
    right_geometries = self.__row_to_list(right_geometries_raw)

    # Ignore the ordering of these; set comprehensions replace set([...]).
    # (The duplicate polygon in the original set([...]) collapsed anyway.)
    assert {geom[0] for geom in left_geometries} == {
        'POLYGON ((0 4, -3 3, -8 6, -6 8, -2 9, 0 4))',
        'POLYGON ((10 3, 10 6, 14 6, 14 3, 10 3))',
        'POLYGON ((2 2, 2 4, 3 5, 7 5, 9 3, 8 1, 4 1, 2 2))',
        'POLYGON ((-1 -1, -1 -3, -2 -5, -6 -8, -5 -2, -3 -2, -1 -1))',
    }
    assert {geom[0] for geom in right_geometries} == {
        'POINT (-3 5)', 'POINT (11 5)', 'POINT (4 3)', 'POINT (-1 -1)',
        'POINT (-4 -5)'
    }
def test_geojson_to_dataframe_raw(self):
    """GeoJSON polygons load into a DataFrame with attribute columns.

    NOTE(review): renamed from ``test_geojson_to_dataframe`` — a later
    method in this class reuses that exact name, so this definition was
    shadowed and never executed by the test runner.
    """
    spatial_rdd = PolygonRDD(self.spark.sparkContext, geojson_input_location,
                             FileDataSplitter.GEOJSON, True)
    spatial_rdd.analyze()
    Adapter.toDf(spatial_rdd, self.spark).show()
    df = Adapter.toDf(spatial_rdd, self.spark)
    # First column after the geometry is the STATEFP attribute.
    assert (df.columns[1] == "STATEFP")
def test_read_shapefile_to_dataframe(self):
    """Smoke test: shapefile -> GeometryRDD -> DataFrame without errors."""
    geometry_rdd = ShapefileReader.readToGeometryRDD(
        self.spark.sparkContext, shape_file_input_location)
    geometry_rdd.analyze()
    logging.info(geometry_rdd.fieldNames)
    converted = Adapter.toDf(geometry_rdd, self.spark)
    converted.show()
def test_read_mixed_wkt_geometries_into_spatial_rdd(self):
    """WKT CSV -> geometry DataFrame -> SpatialRDD -> DataFrame round trip.

    The round-trip DataFrame must contain exactly one (geometry) column.
    """
    df = (self.spark.read.format("csv")
          .option("delimiter", "\t")
          .option("header", "false")
          .load(mixed_wkt_geometry_input_location))
    df.show()
    df.createOrReplaceTempView("inputtable")
    spatial_df = self.spark.sql(
        "select ST_GeomFromWKT(inputtable._c0) as usacounty from inputtable"
    )
    spatial_df.show()
    spatial_df.printSchema()
    spatial_rdd = Adapter.toSpatialRdd(spatial_df, "usacounty")
    spatial_rdd.analyze()
    # Convert once instead of three separate Adapter.toDf calls,
    # and use len() rather than columns.__len__().
    round_trip_df = Adapter.toDf(spatial_rdd, self.spark)
    round_trip_df.show()
    assert len(round_trip_df.columns) == 1
def test_read_csv_point_into_spatial_rdd(self):
    """Smoke test: CSV points -> geometry DataFrame -> SpatialRDD -> DataFrame."""
    input_df = (self.spark.read
                .format("csv")
                .option("delimiter", "\t")
                .option("header", "false")
                .load(area_lm_point_input_location))
    input_df.show()
    input_df.createOrReplaceTempView("inputtable")

    landmark_df = self.spark.sql(
        "select ST_PointFromText(inputtable._c0,\",\") as arealandmark from inputtable"
    )
    landmark_df.show()
    landmark_df.printSchema()

    landmark_rdd = Adapter.toSpatialRdd(landmark_df, "arealandmark")
    landmark_rdd.analyze()
    Adapter.toDf(landmark_rdd, self.spark).show()
def test_geojson_to_dataframe(self):
    """GeoJSON polygons load into a DataFrame; geometry is re-parsed from WKT."""
    polygon_rdd = PolygonRDD(self.spark.sparkContext, geojson_input_location,
                             FileDataSplitter.GEOJSON, True)
    polygon_rdd.analyze()

    df = (Adapter.toDf(polygon_rdd, self.spark)
          .withColumn("geometry", expr("ST_GeomFromWKT(geometry)")))
    df.show()

    # STATEFP is expected to follow the geometry column.
    assert df.columns[1] == "STATEFP"
def test_load_id_column_data_check(self):
    """GeoJSON with an id column loads as a single-row DataFrame.

    The column count varies between environments (3 or 4 columns), so
    accept either — this replaces the original try/except-AssertionError
    pattern with a single membership assertion.
    """
    spatial_rdd = PolygonRDD(self.spark.sparkContext,
                             geojson_id_input_location,
                             FileDataSplitter.GEOJSON, True)
    spatial_rdd.analyze()
    df = Adapter.toDf(spatial_rdd, self.spark)
    df.show()
    assert len(df.columns) in (3, 4)
    assert df.count() == 1
def test_convert_spatial_join_result_to_dataframe(self):
    """Smoke test: WKT polygons joined with CSV points convert to DataFrames."""
    county_wkt_df = (self.spark.read.format("csv")
                     .option("delimiter", "\t")
                     .option("header", "false")
                     .load(mixed_wkt_geometry_input_location))
    county_wkt_df.createOrReplaceTempView("polygontable")
    county_df = self.spark.sql(
        "select ST_GeomFromWKT(polygontable._c0) as usacounty from polygontable"
    )
    county_rdd = Adapter.toSpatialRdd(county_df, "usacounty")
    county_rdd.analyze()

    landmark_csv_df = (self.spark.read.format("csv")
                       .option("delimiter", ",")
                       .option("header", "false")
                       .load(area_lm_point_input_location))
    landmark_csv_df.createOrReplaceTempView("pointtable")
    landmark_df = self.spark.sql(
        "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
    )
    landmark_rdd = Adapter.toSpatialRdd(landmark_df, "arealandmark")
    landmark_rdd.analyze()

    # Partition points, align polygons to the same partitioner, index points.
    landmark_rdd.spatialPartitioning(GridType.QUADTREE)
    county_rdd.spatialPartitioning(landmark_rdd.getPartitioner())
    landmark_rdd.buildIndex(IndexType.QUADTREE, True)

    joined_rdd = JoinQuery.SpatialJoinQueryFlat(landmark_rdd, county_rdd,
                                                True, True)

    # Default column names, then user-supplied left names with no right names.
    Adapter.toDf(joined_rdd, self.spark).show()
    Adapter.toDf(joined_rdd, ["abc", "def"], [], self.spark).show()
def test_to_df_srdd_fn_spark(self):
    """PolygonRDD with explicit attribute column names converts to a DataFrame."""
    polygon_rdd = PolygonRDD(self.spark.sparkContext, geojson_input_location,
                             FileDataSplitter.GEOJSON, True)
    polygon_rdd.analyze()
    assert polygon_rdd.approximateTotalCount == 1001

    attribute_columns = [
        "state_id", "county_id", "tract_id", "bg_id", "fips", "fips_short",
        "bg_nr", "type", "code1", "code2"
    ]
    polygon_df = Adapter.toDf(polygon_rdd, attribute_columns, self.spark)
    polygon_df.show()

    # Geometry column first, then the supplied attribute names, one row per feature.
    assert polygon_df.columns == ["geometry"] + attribute_columns
    assert polygon_df.count() == 1001
def test_distance_join_query_flat_to_df(self):
    """Distance join of points against 2.0-radius circles converts to a DataFrame."""
    points = WktReader.readToGeometryRDD(self.sc, bank_csv_path, 1, False,
                                         False)
    circles = CircleRDD(points, 2.0)
    circles.analyze()
    points.analyze()

    points.spatialPartitioning(GridType.QUADTREE)
    circles.spatialPartitioning(points.getPartitioner())

    pair_rdd = JoinQueryRaw.DistanceJoinQueryFlat(points, circles, False,
                                                  True)
    joined_df = Adapter.toDf(pair_rdd, ["poi_from_id", "poi_from_name"],
                             ["poi_to_id", "poi_to_name"], self.spark)

    assert joined_df.count() == 10
    expected_columns = [
        "leftgeometry", "poi_from_id", "poi_from_name", "rightgeometry",
        "poi_to_id", "poi_to_name"
    ]
    assert joined_df.columns == expected_columns
def test_spatial_join_to_df(self):
    """spatialJoin with JoinParams converts to a DataFrame with named columns."""
    points = WktReader.readToGeometryRDD(self.sc, bank_csv_path, 1, False,
                                         False)
    areas = WktReader.readToGeometryRDD(self.sc, areas_csv_path, 1, False,
                                        False)
    points.analyze()
    areas.analyze()

    points.spatialPartitioning(GridType.QUADTREE)
    areas.spatialPartitioning(points.getPartitioner())

    joined_rdd = JoinQueryRaw.spatialJoin(points, areas, JoinParams())
    joined_df = Adapter.toDf(joined_rdd, ["area_id", "area_name"],
                             ["poi_id", "poi_name"], self.spark)

    assert joined_df.count() == 5
    expected_columns = [
        "leftgeometry", "area_id", "area_name", "rightgeometry", "poi_id",
        "poi_name"
    ]
    assert joined_df.columns == expected_columns
def run_test(test_name, num_partitions, max_vertices):
    """Benchmark one Sedona RDD spatial join of GNAF points against boundaries.

    Builds a local Spark session, loads points and boundary polygons from
    parquet, runs an indexed SpatialJoinQueryFlat, optionally writes the
    result, and reports the timing.

    Relies on module-level names not defined here: ``input_path``,
    ``output_path``, ``bdy_name``, ``bdy_id``, ``log_file`` — presumably
    configured at script top level; verify against the surrounding file.

    :param test_name: label for the run; runs containing "warmup" write
        output files and print instead of appending to ``log_file``.
    :param num_partitions: repartition factor for both input DataFrames.
    :param max_vertices: boundary simplification level used to pick the
        parquet directory name, or None for the unsimplified boundaries.
    """
    # create spark session object
    spark = (
        SparkSession.builder.master("local[*]")
        .appName("Spatial Join SQL Benchmark")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        # Sedona + geotools jars pulled from Maven at session start.
        .config(
            "spark.jars.packages",
            'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
            'org.datasyslab:geotools-wrapper:geotools-24.1')
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.executor.cores", 4)
        .config("spark.driver.memory", "8g")
        .getOrCreate())

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    start_time = datetime.now()

    # load gnaf points and create geoms
    # "state" is renamed so it won't collide with the boundary "state" column.
    point_df = (spark.read.parquet(
        os.path.join(input_path, "address_principals"))
        .select("gnaf_pid", "state", "geom")
        .withColumnRenamed("state", "gnaf_state")
        .repartition(num_partitions, "gnaf_state"))

    # load boundaries and create geoms
    if max_vertices is not None:
        bdy_vertex_name = "{}_{}".format(bdy_name, max_vertices)
    else:
        bdy_vertex_name = bdy_name

    bdy_df = (spark.read.parquet(os.path.join(input_path, bdy_vertex_name))
              .select(bdy_id, "state", "geom")
              .repartition(num_partitions, "state")
              .cache())
    bdy_count = bdy_df.count()

    # create RDDs - analysed partitioned and indexed
    point_rdd = Adapter.toSpatialRdd(point_df, "geom")
    bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")

    # DataFrames are no longer needed once the RDDs exist.
    point_df.unpersist()
    bdy_df.unpersist()

    point_rdd.analyze()
    bdy_rdd.analyze()

    # Partition by the point distribution; boundaries reuse the same partitioner.
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())

    point_rdd.buildIndex(IndexType.RTREE, True)
    bdy_rdd.buildIndex(IndexType.RTREE, True)

    # run join query
    join_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(point_rdd, bdy_rdd,
                                                      True, True)

    # convert SedonaPairRDD to dataframe
    join_df = Adapter.toDf(join_pair_rdd, bdy_rdd.fieldNames,
                           point_rdd.fieldNames, spark)
    # join_df.printSchema()
    # | -- leftgeometry: geometry(nullable=true)
    # | -- <bdy_id>: string(nullable=true)
    # | -- state: string(nullable=true)
    # | -- rightgeometry: geometry(nullable=true)
    # | -- gnaf_pid: string(nullable=true)
    # | -- gnaf_state: string(nullable=true)

    # Keep only id columns and drop duplicate point/boundary pairs.
    join_df2 = (join_df
                # .filter((join_df["state"] == join_df["gnaf_state"]))
                .select("gnaf_pid", bdy_id, "state")
                .dropDuplicates(["gnaf_pid", bdy_id])
                .cache()
                )

    # output to files
    if "warmup" in test_name:
        name = "gnaf_rdd_{}_{}_{}".format(bdy_id, max_vertices,
                                          num_partitions)
        (join_df2.repartition(50).write.partitionBy("state")
         .option("compression", "gzip")
         .mode("overwrite")
         .parquet(os.path.join(output_path, name)))

    # output vars
    join_count = join_df2.count()
    time_taken = datetime.now() - start_time

    # Warmup runs print; timed runs append a CSV row to the log file.
    if "warmup" in test_name:
        print("{},{},{},{},{},{}".format(test_name, join_count, bdy_count,
                                         max_vertices, num_partitions,
                                         time_taken))
    else:
        log_file.write("{},{},{},{},{},{}\n".format(test_name, join_count,
                                                    bdy_count, max_vertices,
                                                    num_partitions,
                                                    time_taken))

    # cleanup
    spark.stop()