def test_range_query_flat_to_df(self):
    """Run a raw range query and convert its result to DataFrames.

    Geometries are read from ``bank_csv_path``, partitioned and indexed with
    a quadtree, and queried with a fixed polygon window. The result is then
    checked as an RDD, as a DataFrame with the default schema, and as a
    DataFrame with caller-supplied attribute column names.
    """
    poi_point_rdd = WktReader.readToGeometryRDD(
        self.sc, bank_csv_path, 1, False, False)
    poi_point_rdd.analyze()
    poi_point_rdd.spatialPartitioning(GridType.QUADTREE)
    poi_point_rdd.buildIndex(IndexType.QUADTREE, False)

    query_window = loads("POLYGON((0 0, 0 20, 20 20, 20 0, 0 0))")
    result = RangeQueryRaw.SpatialRangeQuery(
        poi_point_rdd, query_window, True, True)

    rdd = result.to_rdd()
    assert len(rdd.collect()) == 4

    # Without explicit names only the geometry column is produced.
    df_without_column_names = Adapter.toDf(result, self.spark)
    raw_geometries = self.__row_to_list(df_without_column_names.collect())
    assert [point[0].wkt for point in raw_geometries] == [
        'POINT (9 8)', 'POINT (4 3)', 'POINT (12 1)', 'POINT (11 5)'
    ]
    assert df_without_column_names.count() == 4
    assert df_without_column_names.schema == StructType(
        [StructField("geometry", GeometryType())])

    # With explicit names the attribute columns follow the geometry column.
    df = Adapter.toDf(result, self.spark, ["poi_id", "poi_name"])
    assert df.count() == 4
    assert df.columns == ["geometry", "poi_id", "poi_name"]
def test_spatial_join_query_flat_to_df(self):
    """Convert a flat spatial join result to DataFrames and verify them.

    Points from ``bank_csv_path`` are joined against polygons from
    ``areas_csv_path``. The pair result is converted once with default
    column names and once with explicit left/right attribute names; row
    counts, column names, schema and the joined geometries are checked.
    """
    poi_point_rdd = WktReader.readToGeometryRDD(
        self.sc, bank_csv_path, 1, False, False)
    areas_polygon_rdd = WktReader.readToGeometryRDD(
        self.sc, areas_csv_path, 1, False, False)
    poi_point_rdd.analyze()
    areas_polygon_rdd.analyze()

    # Both sides must share the same partitioner before joining.
    poi_point_rdd.spatialPartitioning(GridType.QUADTREE)
    areas_polygon_rdd.spatialPartitioning(poi_point_rdd.getPartitioner())

    jvm_sedona_rdd = JoinQueryRaw.SpatialJoinQueryFlat(
        poi_point_rdd, areas_polygon_rdd, False, True)

    pois_within_areas_with_default_column_names = Adapter.toDf(
        jvm_sedona_rdd, self.spark)
    assert pois_within_areas_with_default_column_names.count() == 5

    pois_within_areas_with_passed_column_names = Adapter.toDf(
        jvm_sedona_rdd,
        ["area_id", "area_name"],
        ["poi_id", "poi_name"],
        self.spark)
    assert pois_within_areas_with_passed_column_names.count() == 5
    assert pois_within_areas_with_passed_column_names.columns == [
        "leftgeometry", "area_id", "area_name",
        "rightgeometry", "poi_id", "poi_name"
    ]

    assert pois_within_areas_with_default_column_names.schema == StructType(
        [
            StructField("leftgeometry", GeometryType()),
            StructField("rightgeometry", GeometryType()),
        ])

    left_geometries_raw = (
        pois_within_areas_with_default_column_names
        .selectExpr("ST_AsText(leftgeometry)")
        .collect()
    )
    left_geometries = self.__row_to_list(left_geometries_raw)

    right_geometries_raw = (
        pois_within_areas_with_default_column_names
        .selectExpr("ST_AsText(rightgeometry)")
        .collect()
    )
    right_geometries = self.__row_to_list(right_geometries_raw)

    # Ignore the ordering of these. The join yields five rows but only four
    # distinct polygons, so the expected set has four members.
    assert {geom[0] for geom in left_geometries} == {
        'POLYGON ((0 4, -3 3, -8 6, -6 8, -2 9, 0 4))',
        'POLYGON ((10 3, 10 6, 14 6, 14 3, 10 3))',
        'POLYGON ((2 2, 2 4, 3 5, 7 5, 9 3, 8 1, 4 1, 2 2))',
        'POLYGON ((-1 -1, -1 -3, -2 -5, -6 -8, -5 -2, -3 -2, -1 -1))',
    }
    assert {geom[0] for geom in right_geometries} == {
        'POINT (-3 5)',
        'POINT (11 5)',
        'POINT (4 3)',
        'POINT (-1 -1)',
        'POINT (-4 -5)',
    }
def test_to_spatial_rdd_df_and_geom_field_name(self):
    """Convert the spatial point table to an RDD by geometry field name.

    The conversion is issued for the field names "geom" and "s"; only the
    RDD from the second call is analysed and checked for its approximate
    count and boundary envelope.
    """
    point_table = self._create_spatial_point_table()

    # Both calls are kept; the first result is overwritten by the second.
    spatial_rdd = Adapter.toSpatialRdd(point_table, "geom")
    spatial_rdd = Adapter.toSpatialRdd(point_table, "s")
    spatial_rdd.analyze()

    assert spatial_rdd.approximateTotalCount == 121960

    expected_envelope = Envelope(
        -179.147236, 179.475569, -14.548699, 71.35513400000001)
    assert spatial_rdd.boundaryEnvelope == expected_envelope
def test_geojson_to_dataframe(self):
    """Load a GeoJSON file as a PolygonRDD and convert it to a DataFrame.

    The DataFrame is displayed and its second column is expected to be
    named "STATEFP".
    """
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext,
        geojson_input_location,
        FileDataSplitter.GEOJSON,
        True)
    spatial_rdd.analyze()

    # Convert once and reuse the DataFrame for both the display and the check.
    df = Adapter.toDf(spatial_rdd, self.spark)
    df.show()
    assert df.columns[1] == "STATEFP"
def test_read_shapefile_to_dataframe(self):
    """Read a shapefile into a geometry RDD, log its field names and show it as a DataFrame."""
    spark_context = self.spark.sparkContext
    shapefile_rdd = ShapefileReader.readToGeometryRDD(
        spark_context, shape_file_input_location)
    shapefile_rdd.analyze()

    logging.info(shapefile_rdd.fieldNames)

    shapefile_df = Adapter.toDf(shapefile_rdd, self.spark)
    shapefile_df.show()
def test_read_csv_point_into_spatial_rdd(self):
    """Parse point text from a CSV file into a spatial RDD and back to a DataFrame.

    The file is read with a tab delimiter and no header; the first column
    is passed to ``ST_PointFromText`` with "," as its second argument.
    """
    raw_df = (
        self.spark.read
        .format("csv")
        .option("delimiter", "\t")
        .option("header", "false")
        .load(area_lm_point_input_location)
    )
    raw_df.show()
    raw_df.createOrReplaceTempView("inputtable")

    point_query = (
        'select ST_PointFromText(inputtable._c0,",") as arealandmark '
        'from inputtable'
    )
    landmark_df = self.spark.sql(point_query)
    landmark_df.show()
    landmark_df.printSchema()

    landmark_rdd = Adapter.toSpatialRdd(landmark_df, "arealandmark")
    landmark_rdd.analyze()

    round_trip_df = Adapter.toDf(landmark_rdd, self.spark)
    round_trip_df.show()
def test_read_mixed_wkt_geometries_into_spatial_rdd_with_unique_id(self):
    """Load mixed WKT geometries plus two attribute columns into a spatial RDD.

    The geometry is parsed from column ``_c0`` and columns ``_c3`` and
    ``_c5`` are carried along, so the DataFrame converted back from the
    RDD is expected to have three columns.
    """
    df = (
        self.spark.read
        .format("csv")
        .option("delimiter", "\t")
        .option("header", "false")
        .load(mixed_wkt_geometry_input_location)
    )
    df.show()
    df.createOrReplaceTempView("inputtable")

    spatial_df = self.spark.sql(
        "select ST_GeomFromWKT(inputtable._c0) as usacounty, inputtable._c3, inputtable._c5 from inputtable"
    )
    spatial_df.show()
    spatial_df.printSchema()

    spatial_rdd = Adapter.toSpatialRdd(spatial_df, "usacounty")
    spatial_rdd.analyze()

    # Convert once and reuse the DataFrame for both the check and the display.
    result_df = Adapter.toDf(spatial_rdd, self.spark)
    assert len(result_df.columns) == 3
    result_df.show()
def test_distance_join_result_to_dataframe(self):
    """Distance-join points against circles built from polygons and show the result.

    Points come from a comma-delimited CSV file, polygons from a
    tab-delimited WKT file. The polygons are wrapped in a ``CircleRDD``
    with 0.2 as its second argument, and the flat join result is converted
    to a DataFrame whose schema and rows are printed.
    """
    # Point side.
    points_csv = (
        self.spark.read
        .format("csv")
        .option("delimiter", ",")
        .option("header", "false")
        .load(area_lm_point_input_location)
    )
    points_csv.createOrReplaceTempView("pointtable")
    point_query = "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
    point_df = self.spark.sql(point_query)
    point_rdd = Adapter.toSpatialRdd(point_df, "arealandmark")
    point_rdd.analyze()

    # Polygon side.
    polygons_wkt = (
        self.spark.read
        .format("csv")
        .option("delimiter", "\t")
        .option("header", "false")
        .load(mixed_wkt_geometry_input_location)
    )
    polygons_wkt.createOrReplaceTempView("polygontable")
    polygon_query = "select ST_GeomFromWKT(polygontable._c0) as usacounty from polygontable"
    polygon_df = self.spark.sql(polygon_query)
    polygon_rdd = Adapter.toSpatialRdd(polygon_df, "usacounty")
    polygon_rdd.analyze()

    circle_rdd = CircleRDD(polygon_rdd, 0.2)

    # The circles reuse the partitioner chosen for the points.
    point_rdd.spatialPartitioning(GridType.QUADTREE)
    circle_rdd.spatialPartitioning(point_rdd.getPartitioner())
    point_rdd.buildIndex(IndexType.QUADTREE, True)

    pair_rdd = JoinQuery.DistanceJoinQueryFlat(
        point_rdd, circle_rdd, True, True)

    joined_df = Adapter.toDf(pair_rdd, self.spark)
    joined_df.printSchema()
    joined_df.show()
def test_geojson_to_dataframe(self):
    """Convert a GeoJSON PolygonRDD to a DataFrame and rebuild its geometry column.

    The "geometry" column is replaced by ``ST_GeomFromWKT(geometry)``; the
    second column is expected to be named "STATEFP".
    """
    polygon_rdd = PolygonRDD(
        self.spark.sparkContext,
        geojson_input_location,
        FileDataSplitter.GEOJSON,
        True,
    )
    polygon_rdd.analyze()

    converted = Adapter.toDf(polygon_rdd, self.spark)
    df = converted.withColumn("geometry", expr("ST_GeomFromWKT(geometry)"))
    df.show()

    second_column = df.columns[1]
    assert second_column == "STATEFP"
def test_load_id_column_data_check(self):
    """Load a GeoJSON file that carries an id column and check the DataFrame shape.

    The converted DataFrame must have either three or four columns and
    exactly one row.
    """
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext,
        geojson_id_input_location,
        FileDataSplitter.GEOJSON,
        True)
    spatial_rdd.analyze()

    df = Adapter.toDf(spatial_rdd, self.spark)
    df.show()

    # Both column counts are accepted; state that directly rather than
    # catching the AssertionError of a first attempt.
    assert len(df.columns) in (3, 4)
    assert df.count() == 1
def test_to_df_srdd_fn_spark(self):
    """Convert a GeoJSON PolygonRDD to a DataFrame using explicit field names.

    Exercises the ``Adapter.toDf(rdd, field_names, spark)`` argument order
    and checks the resulting column names and row count.
    """
    polygon_rdd = PolygonRDD(
        self.spark.sparkContext,
        geojson_input_location,
        FileDataSplitter.GEOJSON,
        True,
    )
    polygon_rdd.analyze()
    assert polygon_rdd.approximateTotalCount == 1001

    spatial_columns = [
        "state_id",
        "county_id",
        "tract_id",
        "bg_id",
        "fips",
        "fips_short",
        "bg_nr",
        "type",
        "code1",
        "code2",
    ]
    spatial_df = Adapter.toDf(polygon_rdd, spatial_columns, self.spark)
    spatial_df.show()

    # The geometry column comes first, followed by the supplied names.
    expected_columns = ["geometry"] + spatial_columns
    assert spatial_df.columns == expected_columns
    assert spatial_df.count() == 1001
def get_bdy_rdd(spark, bdy):
    """Load one boundary table from Postgres and return it as an analysed spatial RDD.

    Args:
        spark: Spark session handed to ``get_dataframe_from_postgres``.
        bdy: Mapping with the keys "id_field", "name_field" and "name",
            which are substituted into the SQL query.

    Returns:
        The RDD produced by ``Adapter.toSpatialRdd`` after ``analyze()``.
    """
    # load boundaries from Postgres
    query_template = (
        "SELECT {}, name as {}, st_astext(geom) as wkt_geom "
        "FROM admin_bdys_202008.{}_analysis"
    )
    sql = query_template.format(bdy["id_field"], bdy["name_field"], bdy["name"])
    wkt_df = get_dataframe_from_postgres(spark, sql)

    # create geometries from WKT strings into new DataFrame
    with_geom = wkt_df.withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)"))
    geom_df = with_geom.drop("wkt_geom")

    # create rdd
    output_rdd = Adapter.toSpatialRdd(geom_df, "geom")
    output_rdd.analyze()

    geom_df.unpersist()
    wkt_df.unpersist()

    return output_rdd
def test_distance_join_query_flat_to_df(self):
    """Distance-join the POI points against circles built from the same points.

    The raw flat join result is converted to a DataFrame with explicit
    left/right attribute names; the row count and column order are checked.
    """
    points = WktReader.readToGeometryRDD(
        self.sc, bank_csv_path, 1, False, False)
    circles = CircleRDD(points, 2.0)

    circles.analyze()
    points.analyze()

    # The circles reuse the partitioner chosen for the points.
    points.spatialPartitioning(GridType.QUADTREE)
    circles.spatialPartitioning(points.getPartitioner())

    raw_pairs = JoinQueryRaw.DistanceJoinQueryFlat(
        points, circles, False, True)

    left_names = ["poi_from_id", "poi_from_name"]
    right_names = ["poi_to_id", "poi_to_name"]
    pairs_df = Adapter.toDf(raw_pairs, left_names, right_names, self.spark)

    assert pairs_df.count() == 10
    assert pairs_df.columns == [
        "leftgeometry",
        "poi_from_id",
        "poi_from_name",
        "rightgeometry",
        "poi_to_id",
        "poi_to_name",
    ]
def test_spatial_join_to_df(self):
    """Join POI points with area polygons via ``JoinQueryRaw.spatialJoin`` and convert to a DataFrame.

    The join uses default ``JoinParams()``; the resulting DataFrame is
    checked for its row count and column order.
    """
    points = WktReader.readToGeometryRDD(
        self.sc, bank_csv_path, 1, False, False)
    areas = WktReader.readToGeometryRDD(
        self.sc, areas_csv_path, 1, False, False)

    points.analyze()
    areas.analyze()

    # The areas reuse the partitioner chosen for the points.
    points.spatialPartitioning(GridType.QUADTREE)
    areas.spatialPartitioning(points.getPartitioner())

    raw_pairs = JoinQueryRaw.spatialJoin(points, areas, JoinParams())

    area_names = ["area_id", "area_name"]
    poi_names = ["poi_id", "poi_name"]
    sedona_df = Adapter.toDf(raw_pairs, area_names, poi_names, self.spark)

    assert sedona_df.count() == 5
    assert sedona_df.columns == [
        "leftgeometry",
        "area_id",
        "area_name",
        "rightgeometry",
        "poi_id",
        "poi_name",
    ]
def rdd_filesave_join():
    """Tag GNAF points with each boundary via an RDD spatial join and save the pairs as text.

    A partitioned and indexed ``PointRDD`` is built from the GNAF CSV file.
    For every entry of ``bdy_list`` the boundary parquet data is loaded,
    joined against the points, and the resulting pair RDD is written with
    ``saveAsTextFile``. The Spark session is stopped even if a step fails.
    """
    logger.info("\t - RDD file save join start")

    full_start_time = datetime.now()

    # ----------------------------------------------------------
    # get spark session and context
    # ----------------------------------------------------------

    start_time = datetime.now()

    spark = create_spark_session()
    try:
        sc = spark.sparkContext
        sedona_version = pkg_resources.get_distribution("sedona").version

        logger.info(
            "\t - PySpark {} session initiated with Apache Sedona {}: {}".format(
                sc.version, sedona_version, datetime.now() - start_time))

        # ----------------------------------------------------------
        # create GNAF PointRDD from CSV file
        # ----------------------------------------------------------

        start_time = datetime.now()

        offset = 0  # The point long/lat fields start at column 0
        carry_other_attributes = True  # include non-geo columns

        point_rdd = PointRDD(
            sc,
            os.path.join(output_path, gnaf_csv_file_path),
            offset,
            FileDataSplitter.CSV,
            carry_other_attributes)
        point_rdd.analyze()

        # add partitioning and indexing
        point_rdd.spatialPartitioning(GridType.KDBTREE)
        point_rdd.buildIndex(IndexType.RTREE, True)

        # set Spark storage type - set to MEMORY_AND_DISK if low on memory
        point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

        logger.info("\t\t - GNAF RDD created: {}".format(datetime.now() - start_time))

        # ----------------------------------------------------------
        # get boundary tags using a spatial join
        # ----------------------------------------------------------

        for bdy in bdy_list:
            start_time = datetime.now()

            # load boundaries
            # create geometries from WKT strings into new DataFrame
            bdy_df = spark.read.parquet(os.path.join(output_path, bdy["name"])) \
                .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
                .drop("wkt_geom")

            # create bdy rdd
            bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")
            bdy_rdd.analyze()

            bdy_df.unpersist()

            # the boundaries reuse the partitioner chosen for the points
            bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())
            # no need to persist(?) - used once
            bdy_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

            # run the join - returns a PairRDD with 1 boundary to 1-N points
            # e.g. [Geometry: Polygon userData: WA32 TANGNEY WA, [Geometry: Point userData: GAWA_146792426 WA, ...]]
            result_pair_rdd = JoinQuery.SpatialJoinQueryFlat(
                point_rdd, bdy_rdd, True, True)

            # jim = result_pair_rdd.take(10)
            # for row in jim:
            #     print(row)

            result_pair_rdd.saveAsTextFile(
                os.path.join(output_path,
                             "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

            # # flat map values to have one point to bdy matched pair
            # flat_mapped_rdd = result_pair_rdd.flatMapValues(lambda x: x)
            #
            # # map values to create RDD row of gnaf & bdy IDs, plus state data
            # mapped_rdd = flat_mapped_rdd.map(
            #     lambda x: [x[1].getUserData().split("\t")[0],
            #                x[0].getUserData().split("\t")[0],
            #                x[0].getUserData().split("\t")[1]]
            # )
            #
            # # convert result to a dataframe of the following schema
            # schema = t.StructType([t.StructField("gnaf_pid", t.StringType(), False),
            #                        t.StructField(bdy["id_field"], t.StringType(), False),
            #                        t.StructField(bdy["name_field"], t.StringType(), False)])
            #
            # join_df = spark.createDataFrame(mapped_rdd, schema)
            #
            # # save result to disk
            # join_df.write \
            #     .option("compression", "gzip") \
            #     .mode("overwrite") \
            #     .parquet(os.path.join(output_path, "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

            logger.info("\t\t - GNAF points bdy tagged with {}: {}".format(
                bdy["name"], datetime.now() - start_time))
    finally:
        # cleanup - always release the session, including on failure
        spark.stop()

    logger.info("\t - RDD file save join done: {}".format(datetime.now() - full_start_time))
def run_test(test_name, num_partitions, max_vertices):
    """Run one RDD-based spatial join benchmark of GNAF points against a boundary dataset.

    Args:
        test_name: Label written to the result line. When it contains
            "warmup" the joined rows are also written to parquet and the
            result line is printed instead of being written to ``log_file``.
        num_partitions: Number of partitions used when repartitioning both
            input DataFrames.
        max_vertices: Suffix selecting the boundary dataset
            ``"<bdy_name>_<max_vertices>"``; ``None`` selects ``bdy_name``.

    The Spark session is stopped even if a step fails.
    """
    # create spark session object
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("Spatial Join SQL Benchmark")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.jars.packages",
                'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                'org.datasyslab:geotools-wrapper:geotools-24.1')
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.executor.cores", 4)
        .config("spark.driver.memory", "8g")
        .getOrCreate()
    )

    try:
        # Add Sedona functions and types to Spark
        SedonaRegistrator.registerAll(spark)

        start_time = datetime.now()

        # load gnaf points and create geoms
        point_df = (
            spark.read.parquet(os.path.join(input_path, "address_principals"))
            .select("gnaf_pid", "state", "geom")
            .withColumnRenamed("state", "gnaf_state")
            .repartition(num_partitions, "gnaf_state")
        )

        # load boundaries and create geoms
        if max_vertices is not None:
            bdy_vertex_name = "{}_{}".format(bdy_name, max_vertices)
        else:
            bdy_vertex_name = bdy_name

        bdy_df = (
            spark.read.parquet(os.path.join(input_path, bdy_vertex_name))
            .select(bdy_id, "state", "geom")
            .repartition(num_partitions, "state")
            .cache()
        )
        bdy_count = bdy_df.count()

        # create RDDs - analysed partitioned and indexed
        point_rdd = Adapter.toSpatialRdd(point_df, "geom")
        bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")

        point_df.unpersist()
        bdy_df.unpersist()

        point_rdd.analyze()
        bdy_rdd.analyze()

        # the boundaries reuse the partitioner chosen for the points
        point_rdd.spatialPartitioning(GridType.KDBTREE)
        bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())

        point_rdd.buildIndex(IndexType.RTREE, True)
        bdy_rdd.buildIndex(IndexType.RTREE, True)

        # run join query
        join_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(
            point_rdd, bdy_rdd, True, True)

        # convert SedonaPairRDD to dataframe
        join_df = Adapter.toDf(
            join_pair_rdd, bdy_rdd.fieldNames, point_rdd.fieldNames, spark)
        # join_df.printSchema()

        # | -- leftgeometry: geometry(nullable=true)
        # | -- <bdy_id>: string(nullable=true)
        # | -- state: string(nullable=true)
        # | -- rightgeometry: geometry(nullable=true)
        # | -- gnaf_pid: string(nullable=true)
        # | -- gnaf_state: string(nullable=true)

        join_df2 = (
            join_df
            # .filter((join_df["state"] == join_df["gnaf_state"]))
            .select("gnaf_pid", bdy_id, "state")
            .dropDuplicates(["gnaf_pid", bdy_id])
            .cache()
        )

        # output to files
        if "warmup" in test_name:
            name = "gnaf_rdd_{}_{}_{}".format(bdy_id, max_vertices, num_partitions)

            (
                join_df2.repartition(50)
                .write
                .partitionBy("state")
                .option("compression", "gzip")
                .mode("overwrite")
                .parquet(os.path.join(output_path, name))
            )

        # output vars
        join_count = join_df2.count()
        time_taken = datetime.now() - start_time

        result_line = "{},{},{},{},{},{}".format(
            test_name, join_count, bdy_count,
            max_vertices, num_partitions, time_taken)

        if "warmup" in test_name:
            print(result_line)
        else:
            log_file.write(result_line + "\n")
    finally:
        # cleanup - always release the session, including on failure
        spark.stop()