def test_to_spatial_rdd_df_and_geom_field_name(self):
    spatial_df = self._create_spatial_point_table()

    # convert the DataFrame to a SpatialRDD, addressing the geometry column by field name
    spatial_rdd = Adapter.toSpatialRdd(spatial_df, "geom")
    spatial_rdd = Adapter.toSpatialRdd(spatial_df, "s")
    spatial_rdd.analyze()

    # analyze() populates the record count and the bounding envelope
    assert spatial_rdd.approximateTotalCount == 121960
    assert spatial_rdd.boundaryEnvelope == Envelope(
        -179.147236, 179.475569, -14.548699, 71.35513400000001)
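# For context: a minimal, self-contained sketch of building a point DataFrame with a
# geometry column and converting it by field name, as the test above does. The session
# setup, sample coordinates and column names are illustrative assumptions, not the
# actual test fixture.
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import KryoSerializer, SedonaKryoRegistrator
from sedona.utils.adapter import Adapter

spark = (SparkSession.builder
         .master("local[*]")
         .appName("adapter-demo")
         .config("spark.serializer", KryoSerializer.getName)
         .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
         .getOrCreate())
SedonaRegistrator.registerAll(spark)

# two sample points (Sydney and Melbourne), turned into a geometry column named "geom"
point_df = spark.createDataFrame(
    [(1, 151.2093, -33.8688), (2, 144.9631, -37.8136)], ["id", "lon", "lat"]
).selectExpr(
    "id",
    "ST_Point(cast(lon as Decimal(24,20)), cast(lat as Decimal(24,20))) as geom")

point_rdd = Adapter.toSpatialRdd(point_df, "geom")
point_rdd.analyze()
print(point_rdd.approximateTotalCount, point_rdd.boundaryEnvelope)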
def test_distance_join_result_to_dataframe(self):
    point_csv_df = self.spark.\
        read.\
        format("csv").\
        option("delimiter", ",").\
        option("header", "false").load(
            area_lm_point_input_location
        )
    point_csv_df.createOrReplaceTempView("pointtable")

    point_df = self.spark.sql(
        "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
    )
    point_rdd = Adapter.toSpatialRdd(point_df, "arealandmark")
    point_rdd.analyze()

    polygon_wkt_df = self.spark.read.\
        format("csv").\
        option("delimiter", "\t").\
        option("header", "false").load(
            mixed_wkt_geometry_input_location
        )
    polygon_wkt_df.createOrReplaceTempView("polygontable")

    polygon_df = self.spark.\
        sql("select ST_GeomFromWKT(polygontable._c0) as usacounty from polygontable")
    polygon_rdd = Adapter.toSpatialRdd(polygon_df, "usacounty")
    polygon_rdd.analyze()

    # wrap each polygon in a circle of radius 0.2 so a distance join can be run
    circle_rdd = CircleRDD(polygon_rdd, 0.2)

    # partition both RDDs on the same grid and index the points
    point_rdd.spatialPartitioning(GridType.QUADTREE)
    circle_rdd.spatialPartitioning(point_rdd.getPartitioner())
    point_rdd.buildIndex(IndexType.QUADTREE, True)

    join_result_pair_rdd = JoinQuery.\
        DistanceJoinQueryFlat(point_rdd, circle_rdd, True, True)

    # convert the flat pair RDD back to a DataFrame
    join_result_df = Adapter.toDf(join_result_pair_rdd, self.spark)
    join_result_df.printSchema()
    join_result_df.show()
def test_read_mixed_wkt_geometries_into_spatial_rdd(self):
    df = self.spark.read.format("csv").\
        option("delimiter", "\t").\
        option("header", "false").load(mixed_wkt_geometry_input_location)
    df.show()
    df.createOrReplaceTempView("inputtable")

    spatial_df = self.spark.sql(
        "select ST_GeomFromWKT(inputtable._c0) as usacounty from inputtable"
    )
    spatial_df.show()
    spatial_df.printSchema()

    spatial_rdd = Adapter.toSpatialRdd(spatial_df, "usacounty")
    spatial_rdd.analyze()

    Adapter.toDf(spatial_rdd, self.spark).show()
    assert len(Adapter.toDf(spatial_rdd, self.spark).columns) == 1
    Adapter.toDf(spatial_rdd, self.spark).show()
def test_read_csv_point_into_spatial_rdd(self):
    df = self.spark.read.\
        format("csv").\
        option("delimiter", "\t").\
        option("header", "false").\
        load(area_lm_point_input_location)
    df.show()
    df.createOrReplaceTempView("inputtable")

    spatial_df = self.spark.sql(
        "select ST_PointFromText(inputtable._c0,\",\") as arealandmark from inputtable"
    )
    spatial_df.show()
    spatial_df.printSchema()

    spatial_rdd = Adapter.toSpatialRdd(spatial_df, "arealandmark")
    spatial_rdd.analyze()

    Adapter.toDf(spatial_rdd, self.spark).show()
def get_bdy_rdd(spark, bdy):
    # load boundaries from Postgres
    sql = """SELECT {}, name as {}, st_astext(geom) as wkt_geom
             FROM admin_bdys_202008.{}_analysis""".format(
        bdy["id_field"], bdy["name_field"], bdy["name"])
    bdy_df = get_dataframe_from_postgres(spark, sql)

    # create geometries from WKT strings into new DataFrame
    bdy_df2 = bdy_df \
        .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
        .drop("wkt_geom")

    # create rdd
    output_rdd = Adapter.toSpatialRdd(bdy_df2, "geom")
    output_rdd.analyze()

    bdy_df2.unpersist()
    bdy_df.unpersist()

    return output_rdd
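# get_dataframe_from_postgres isn't shown above; a minimal sketch of such a helper using
# Spark's built-in JDBC reader. The connection URL, credentials and driver setup are
# placeholder assumptions - adjust for your environment and make sure the Postgres JDBC
# driver is on the Spark classpath.
def get_dataframe_from_postgres(spark, sql):
    return (spark.read.format("jdbc")
            .option("url", "jdbc:postgresql://localhost:5432/geo")  # placeholder host/db
            .option("query", sql)
            .option("user", "postgres")        # placeholder credentials
            .option("password", "password")
            .option("driver", "org.postgresql.Driver")
            .load())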
def rdd_filesave_join():
    logger.info("\t - RDD file save join start")
    full_start_time = datetime.now()

    # ----------------------------------------------------------
    # get spark session and context
    # ----------------------------------------------------------

    start_time = datetime.now()

    spark = create_spark_session()
    sc = spark.sparkContext
    sedona_version = pkg_resources.get_distribution("sedona").version

    logger.info(
        "\t - PySpark {} session initiated with Apache Sedona {}: {}".format(
            sc.version, sedona_version, datetime.now() - start_time))

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------

    start_time = datetime.now()

    offset = 0  # the point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path),
                         offset, FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t\t - GNAF RDD created: {}".format(datetime.now() - start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------

    for bdy in bdy_list:
        start_time = datetime.now()

        # load boundaries
        # create geometries from WKT strings into new DataFrame
        bdy_df = spark.read.parquet(os.path.join(output_path, bdy["name"])) \
            .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
            .drop("wkt_geom")

        # create bdy rdd
        bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")
        bdy_rdd.analyze()

        bdy_df.unpersist()

        bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())
        bdy_rdd.spatialPartitionedRDD.persist(
            StorageLevel.MEMORY_ONLY)  # no need to persist(?) - used once

        # run the join - returns a PairRDD with 1 boundary to 1-N points
        # e.g. [Geometry: Polygon userData: WA32 TANGNEY WA,
        #       [Geometry: Point userData: GAWA_146792426 WA, ...]]
        result_pair_rdd = JoinQuery.SpatialJoinQueryFlat(
            point_rdd, bdy_rdd, True, True)

        # jim = result_pair_rdd.take(10)
        # for row in jim:
        #     print(row)

        result_pair_rdd.saveAsTextFile(
            os.path.join(output_path, "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

        # # flat map values to have one point to bdy matched pair
        # flat_mapped_rdd = result_pair_rdd.flatMapValues(lambda x: x)
        #
        # # map values to create RDD row of gnaf & bdy IDs, plus state data
        # mapped_rdd = flat_mapped_rdd.map(
        #     lambda x: [x[1].getUserData().split("\t")[0],
        #                x[0].getUserData().split("\t")[0],
        #                x[0].getUserData().split("\t")[1]]
        # )
        #
        # # convert result to a dataframe of the following schema
        # schema = t.StructType([t.StructField("gnaf_pid", t.StringType(), False),
        #                        t.StructField(bdy["id_field"], t.StringType(), False),
        #                        t.StructField(bdy["name_field"], t.StringType(), False)])
        #
        # join_df = spark.createDataFrame(mapped_rdd, schema)
        #
        # # save result to disk
        # join_df.write \
        #     .option("compression", "gzip") \
        #     .mode("overwrite") \
        #     .parquet(os.path.join(output_path, "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

        logger.info("\t\t - GNAF points bdy tagged with {}: {}".format(
            bdy["name"], datetime.now() - start_time))

    # cleanup
    spark.stop()

    logger.info("\t - RDD file save join done: {}".format(datetime.now() - full_start_time))
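# bdy_list (iterated above) is assumed to be a list of boundary metadata dicts with the
# keys used in get_bdy_rdd() and rdd_filesave_join(); the entries below are illustrative
# examples only, not the actual configuration.
bdy_list = [
    {"name": "commonwealth_electorates", "id_field": "ce_pid", "name_field": "electorate"},
    {"name": "local_government_areas", "id_field": "lga_pid", "name_field": "lga_name"},
]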
def run_test(test_name, num_partitions, max_vertices):
    # create spark session object
    spark = (SparkSession.builder
             .master("local[*]")
             .appName("Spatial Join SQL Benchmark")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
             .config("spark.jars.packages",
                     'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                     'org.datasyslab:geotools-wrapper:geotools-24.1')
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.executor.cores", 4)
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    start_time = datetime.now()

    # load gnaf points and create geoms
    point_df = (spark.read.parquet(os.path.join(input_path, "address_principals"))
                .select("gnaf_pid", "state", "geom")
                .withColumnRenamed("state", "gnaf_state")
                .repartition(num_partitions, "gnaf_state"))

    # load boundaries and create geoms
    if max_vertices is not None:
        bdy_vertex_name = "{}_{}".format(bdy_name, max_vertices)
    else:
        bdy_vertex_name = bdy_name

    bdy_df = (spark.read.parquet(os.path.join(input_path, bdy_vertex_name))
              .select(bdy_id, "state", "geom")
              .repartition(num_partitions, "state")
              .cache())
    bdy_count = bdy_df.count()

    # create RDDs - analysed, partitioned and indexed
    point_rdd = Adapter.toSpatialRdd(point_df, "geom")
    bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")

    point_df.unpersist()
    bdy_df.unpersist()

    point_rdd.analyze()
    bdy_rdd.analyze()

    point_rdd.spatialPartitioning(GridType.KDBTREE)
    bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())

    point_rdd.buildIndex(IndexType.RTREE, True)
    bdy_rdd.buildIndex(IndexType.RTREE, True)

    # run join query
    join_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(point_rdd, bdy_rdd, True, True)

    # convert SedonaPairRDD to dataframe
    join_df = Adapter.toDf(join_pair_rdd, bdy_rdd.fieldNames, point_rdd.fieldNames, spark)

    # join_df.printSchema()
    # |-- leftgeometry: geometry (nullable = true)
    # |-- <bdy_id>: string (nullable = true)
    # |-- state: string (nullable = true)
    # |-- rightgeometry: geometry (nullable = true)
    # |-- gnaf_pid: string (nullable = true)
    # |-- gnaf_state: string (nullable = true)

    join_df2 = (join_df
                # .filter(join_df["state"] == join_df["gnaf_state"])
                .select("gnaf_pid", bdy_id, "state")
                .dropDuplicates(["gnaf_pid", bdy_id])
                .cache()
                )

    # output to files
    if "warmup" in test_name:
        name = "gnaf_rdd_{}_{}_{}".format(bdy_id, max_vertices, num_partitions)
        (join_df2.repartition(50)
         .write.partitionBy("state")
         .option("compression", "gzip")
         .mode("overwrite")
         .parquet(os.path.join(output_path, name)))

    # output vars
    join_count = join_df2.count()
    time_taken = datetime.now() - start_time

    if "warmup" in test_name:
        print("{},{},{},{},{},{}".format(test_name, join_count, bdy_count,
                                         max_vertices, num_partitions, time_taken))
    else:
        log_file.write("{},{},{},{},{},{}\n".format(test_name, join_count, bdy_count,
                                                    max_vertices, num_partitions, time_taken))

    # cleanup
    spark.stop()
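# One way run_test() might be driven: a warmup pass (which also writes the parquet
# output), then timed runs over a grid of partition counts and vertex limits. The
# specific values, the log file path and the loop ranges are assumptions for illustration.
if __name__ == "__main__":
    log_file = open("rdd_join_benchmark_results.csv", "a")  # placeholder results file
    run_test("warmup", 96, 100)
    for num_partitions in (48, 96, 192):
        for max_vertices in (None, 100, 500):
            run_test("rdd_join", num_partitions, max_vertices)
    log_file.close()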