def test_outside_point_join_correctness(self):
    self.once_before_all()
    window_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY)
    object_rdd = PointRDD(
        self.sc.parallelize(self.test_outside_point_set),
        StorageLevel.MEMORY_ONLY)
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd, True, False).collect()
    assert len(result) == 0

    result_no_index = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd, False, False).collect()
    assert len(result_no_index) == 0
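# A hedged note on the positional flags used throughout these tests (inferred
# from usage in this file, not confirmed against the Sedona docs): the third
# argument to JoinQuery.SpatialJoinQuery appears to toggle use of a spatial
# index, and the fourth whether geometries that merely intersect a window's
# boundary count as matches. The collected result is grouped per window as
# [(window GeoData, [matched GeoData, ...])]; a minimal helper sketch:
def count_grouped_matches(result):
    # total matches across all windows in a grouped join result
    return sum(len(matches) for _, matches in result)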
def test_spatial_join_query_with_polygon_rdd_using_index(self):
    query_rdd = PolygonRDD(
        self.sc, input_location_query_polygon, splitter, True, num_partitions,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    spatial_rdd = PointRDD(
        self.sc, input_location, offset, splitter, True, num_partitions,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")

    query_rdd.analyze()
    spatial_rdd.analyze()

    spatial_rdd.spatialPartitioning(grid_type)
    spatial_rdd.buildIndex(IndexType.RTREE, True)
    query_rdd.spatialPartitioning(spatial_rdd.getPartitioner())

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    assert result[1][0].getUserData() is not None
    for data in result:
        if len(data[1]) != 0:
            for right_data in data[1]:
                assert right_data.getUserData() is not None
def test_on_boundary_point_join_correctness(self):
    window_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY)
    object_rdd = PointRDD(
        self.sc.parallelize(self.test_on_boundary_point_set),
        StorageLevel.MEMORY_ONLY)
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd, True, False).collect()
    self.verify_join_result(result)

    result_no_index = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd, False, False).collect()
    self.verify_join_result(result_no_index)
def test_inside_point_join_correctness(self):
    self.once_before_all()
    window_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set))
    object_rdd = PointRDD(self.sc.parallelize(self.test_inside_point_set))
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd, True, False).collect()
    self.verify_join_result(result)

    result_no_index = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd, False, False).collect()
    self.verify_join_result(result_no_index)
def test_outside_polygon_distance_join_correctness(self):
    center_geometry_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY)
    window_rdd = CircleRDD(center_geometry_rdd, 0.1)
    object_rdd = PolygonRDD(
        self.sc.parallelize(self.test_outside_polygon_set),
        StorageLevel.MEMORY_ONLY)
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.DistanceJoinQuery(object_rdd, window_rdd, True, True).collect()
    assert len(result) == 0

    result_no_index = JoinQuery.DistanceJoinQuery(object_rdd, window_rdd, False, True).collect()
    assert len(result_no_index) == 0
def dynamic_rtree_int(self, query_rdd, num_partitions, use_legacy_apis, grid_type,
                      index_type, expected_count):
    spatial_rdd = self.create_point_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

    join_params = JoinParams(True, index_type, JoinBuildSide.LEFT)
    results = JoinQuery.spatialJoin(query_rdd, spatial_rdd, join_params).collect()

    self.sanity_check_flat_join_results(results)
    assert expected_count == len(results)
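# Hedged note (an assumption based on how these tests consume results): unlike
# SpatialJoinQuery, which groups matches per window, JoinQuery.spatialJoin with
# JoinParams appears to yield flat (left, right) pairs, which is why the count
# above is compared against len(results) directly rather than summed per
# window. A hypothetical illustration of that flat shape:
def sanity_check_flat_pairs_sketch(results):
    # both sides of each pair should be GeoData-like objects
    for left, right in results:
        assert left is not None and right is not None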
def nested_loop(self, query_rdd, num_partitions, grid_type, use_legacy_apis, expected_count):
    spatial_rdd = self.create_point_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    self.sanity_check_join_results(result)
    assert expected_count == self.count_join_results(result)
def test_nested_loop(self, num_partitions, use_legacy_apis, grid_type, intersects):
    query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
    spatial_rdd = self.create_polygon_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, intersects).collect()

    self.sanity_check_join_results(result)
    assert self.get_expected_count(intersects) == self.count_join_results(result)
def bdy_tag(spark, point_rdd, bdy):
    start_time = datetime.now()

    # load boundaries
    bdy_rdd = get_bdy_rdd(spark, bdy)
    bdy_rdd.analyze()

    bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())
    # bdy_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)  # no need to persist(?) - used once

    # run the join - returns a PairRDD with 1 boundary to 1-N points
    # e.g. [Geometry: Polygon userData: WA32 TANGNEY WA, [Geometry: Point userData: GAWA_146792426 WA, ...]]
    result_pair_rdd = JoinQuery.SpatialJoinQuery(point_rdd, bdy_rdd, True, True)
    # print(result_pair_rdd.take(1))

    # flat map values to have one point to bdy matched pair
    flat_mapped_rdd = result_pair_rdd.flatMapValues(lambda x: x)

    # map values to create RDD row of gnaf & bdy IDs, plus state data
    mapped_rdd = flat_mapped_rdd.map(lambda x: [
        x[1].getUserData().split("\t")[0],
        x[0].getUserData().split("\t")[0],
        x[0].getUserData().split("\t")[1]
    ])
    # jim = mapped_rdd.take(10)
    # for row in jim:
    #     print(row)

    # convert result to a dataframe of the following schema
    schema = t.StructType([
        t.StructField("gnaf_pid", t.StringType(), False),
        t.StructField(bdy["id_field"], t.StringType(), False),
        t.StructField(bdy["name_field"], t.StringType(), False)
    ])
    join_df = spark.createDataFrame(mapped_rdd, schema)
    # join_df.printSchema()
    # join_df.show(10, False)

    # save result to disk
    export_to_parquet(join_df, "gnaf_with_{}".format(bdy["name"]))

    # num_joined_points = join_df.count()  # this can be an expensive operation

    # cleanup datasets in memory
    join_df.unpersist()
    mapped_rdd.unpersist()
    flat_mapped_rdd.unpersist()
    result_pair_rdd.unpersist()
    # bdy_rdd.unpersist()  # no method for SpatialRDD

    logger.info("\t - GNAF points bdy tagged with {}: {}".format(
        bdy["name"], datetime.now() - start_time))
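# `get_bdy_rdd` is defined elsewhere in this project; a plausible sketch,
# assuming it mirrors the DataFrame-to-SpatialRDD pattern used in
# rdd_filesave_join() below. The parquet path, "wkt_geom" column name, and the
# module-level `output_path` are assumptions taken from that function, not
# confirmed definitions.
def get_bdy_rdd_sketch(spark, bdy):
    # read boundary polygons as WKT and convert to geometries
    bdy_df = spark.read.parquet(os.path.join(output_path, bdy["name"])) \
        .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
        .drop("wkt_geom")
    # wrap the DataFrame as a Sedona SpatialRDD keyed on the geometry column
    return Adapter.toSpatialRdd(bdy_df, "geom")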
def test_index_int(self, num_partitions, use_legacy_apis, grid_type, index_type):
    query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
    spatial_rdd = self.create_linestring_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)
    spatial_rdd.buildIndex(index_type, True)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    self.sanity_check_join_results(result)
    assert match_count == self.count_join_results(result)
def test_index_int(self, num_partitions, grid_type, index_type, intersects):
    query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
    spatial_rdd = self.create_polygon_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type)
    spatial_rdd.buildIndex(index_type, True)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, True, intersects).collect()

    self.sanity_check_join_results(result)
    assert self.get_expected_with_original_duplicates_count(intersects) == \
        self.count_join_results(result)
def index_int(self, query_rdd, num_partitions, grid_type, index_type, expected_count):
    spatial_rdd = self.create_point_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type)
    spatial_rdd.buildIndex(index_type, True)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    self.sanity_check_join_results(result)
    # was `assert expected_count, self.count_join_results(result)`, which always
    # passes for non-zero expected_count because the second expression is
    # treated as the assertion message, not compared
    assert expected_count == self.count_join_results(result)
def test_indexed_rdd_assignment(self):
    object_rdd = PointRDD(
        self.sc, point_rdd_input_location, point_rdd_offset, point_rdd_splitter, True)
    query_window_rdd = CircleRDD(object_rdd, 0.1)
    object_rdd.analyze()
    object_rdd.spatialPartitioning(GridType.QUADTREE)
    object_rdd.buildIndex(IndexType.QUADTREE, True)

    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    object_rdd.buildIndex(IndexType.RTREE, False)

    object_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
    query_window_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    query_window_rdd.jvmSpatialPartitionedRDD.count()
    object_rdd.indexedRDD.count()

    import time

    # first pass: joins run against persisted, indexed RDDs
    start = time.time()
    for _ in range(each_query_loop_times):
        result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, True, True).count()
    diff = time.time() - start

    # second pass: rebuild the RDDs without persisting and repeat the same joins
    object_rdd = PointRDD(
        self.sc, point_rdd_input_location, point_rdd_offset, point_rdd_splitter, True)
    query_window_rdd = CircleRDD(object_rdd, 0.1)
    object_rdd.analyze()
    object_rdd.spatialPartitioning(GridType.QUADTREE)
    object_rdd.buildIndex(IndexType.QUADTREE, True)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())
    object_rdd.buildIndex(IndexType.RTREE, False)

    start1 = time.time()
    for _ in range(each_query_loop_times):
        result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, True, True).count()
def test_dynamic_index_int(self, num_partitions, use_legacy_apis, grid_type, index_type, intersects):
    query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
    spatial_rdd = self.create_polygon_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

    join_params = JoinParams(intersects, index_type, JoinBuildSide.LEFT)
    result = JoinQuery.spatialJoin(query_rdd, spatial_rdd, join_params).collect()

    self.sanity_check_flat_join_results(result)

    expected_count = self.get_expected_with_original_duplicates_count(intersects) \
        if self.expect_to_preserve_original_duplicates(grid_type) \
        else self.get_expected_count(intersects)
    assert expected_count == len(result)
def test_index_int(self, num_partitions, grid_type, index_type):
    query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
    spatial_rdd = self.create_linestring_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type)
    spatial_rdd.buildIndex(index_type, True)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    self.sanity_check_join_results(result)
    expected_count = match_with_original_duplicates_count \
        if self.expect_to_preserve_original_duplicates(grid_type) else match_count
    assert expected_count == self.count_join_results(result)
def test_spatial_join_query(self):
    point_rdd = PointRDD(self.sc, point_path, 4, FileDataSplitter.WKT, True)
    polygon_rdd = PolygonRDD(self.sc, counties_path, 2, 3, FileDataSplitter.WKT, True)
    point_rdd.analyze()

    point_rdd.spatialPartitioning(GridType.KDBTREE)
    polygon_rdd.spatialPartitioning(point_rdd.getPartitioner())

    result = JoinQuery.SpatialJoinQuery(point_rdd, polygon_rdd, True, False)
    print(result.count())
def test_distance_join_query(self):
    object_rdd = PointRDD(
        self.sc, point_rdd_input_location, point_rdd_offset, point_rdd_splitter,
        True, StorageLevel.MEMORY_ONLY)
    query_window_rdd = CircleRDD(object_rdd, 0.1)

    object_rdd.spatialPartitioning(GridType.QUADTREE)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    object_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    assert object_rdd.spatialPartitionedRDD.is_cached
    query_window_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, False, True).count()
def test_distance_join_query(self):
    object_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False)
    query_window_rdd = CircleRDD(object_rdd, 0.1)
    object_rdd.analyze()

    object_rdd.spatialPartitioning(GridType.QUADTREE)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    for _ in range(each_query_loop_times):
        result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, False, True).count()
def test_dynamic_index_int(self, num_partitions, grid_type, index_type):
    query_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
    spatial_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type)

    join_params = JoinParams(True, index_type, JoinBuildSide.LEFT)
    result = JoinQuery.spatialJoin(query_rdd, spatial_rdd, join_params).collect()

    self.sanity_check_flat_join_results(result)
    expected_count = match_with_original_duplicates_count \
        if self.expect_to_preserve_original_duplicates(grid_type) else match_count
    assert expected_count == len(result)
def test_spatial_join_query(self):
    query_window_rdd = PolygonRDD(
        self.sc, polygon_rdd_input_location, polygon_rdd_start_offset,
        polygon_rdd_end_offset, polygon_rdd_splitter, True)
    object_rdd = PointRDD(
        self.sc, point_rdd_input_location, point_rdd_offset, point_rdd_splitter,
        True, StorageLevel.MEMORY_ONLY)

    object_rdd.spatialPartitioning(join_query_partitioning_type)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    object_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    query_window_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        result_size = JoinQuery.SpatialJoinQuery(object_rdd, query_window_rdd, False, True).count()
def test_nested_loop(self, num_partitions, grid_type):
    query_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
    spatial_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    count = 0
    for el in result:
        count += len(el[1])

    self.sanity_check_join_results(result)
    expected_count = match_with_original_duplicates_count \
        if self.expect_to_preserve_original_duplicates(grid_type) else match_count
    assert expected_count == self.count_join_results(result)
def test_nested_loop(self, num_partitions, use_legacy_apis, grid_type):
    query_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
    spatial_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    count = 0
    for el in result:
        count += len(el[1])

    self.sanity_check_join_results(result)
    assert match_count == self.count_join_results(result)
def test_loading_spatial_rdd_from_disc(self):
    point_rdd = load_spatial_rdd_from_disc(
        self.sc, os.path.join(disc_location, "point"), GeoType.POINT)
    point_index_rdd = load_spatial_index_rdd_from_disc(
        self.sc, os.path.join(disc_location, "point_index"))
    point_rdd.indexedRawRDD = point_index_rdd

    assert point_rdd.indexedRawRDD is not None
    assert isinstance(point_rdd, PointRDD)

    point_rdd.analyze()
    print(point_rdd.boundaryEnvelope)

    polygon_rdd = load_spatial_rdd_from_disc(
        self.sc, os.path.join(disc_location, "polygon"), GeoType.POLYGON)
    polygon_index_rdd = load_spatial_index_rdd_from_disc(
        self.sc, os.path.join(disc_location, "polygon_index"))
    polygon_rdd.indexedRawRDD = polygon_index_rdd
    polygon_rdd.analyze()
    print(polygon_rdd.boundaryEnvelope)

    assert polygon_rdd.indexedRawRDD is not None
    assert isinstance(polygon_rdd, PolygonRDD)

    linestring_rdd = load_spatial_rdd_from_disc(
        self.sc, os.path.join(disc_location, "line_string"), GeoType.LINESTRING)
    linestring_index_rdd = load_spatial_index_rdd_from_disc(
        self.sc, os.path.join(disc_location, "line_string_index"))
    linestring_rdd.indexedRawRDD = linestring_index_rdd

    assert linestring_rdd.indexedRawRDD is not None
    assert isinstance(linestring_rdd, LineStringRDD)

    linestring_rdd.analyze()
    print(linestring_rdd.boundaryEnvelope)

    linestring_rdd.spatialPartitioning(GridType.KDBTREE)
    polygon_rdd.spatialPartitioning(linestring_rdd.getPartitioner())

    polygon_rdd.buildIndex(IndexType.RTREE, True)
    linestring_rdd.buildIndex(IndexType.RTREE, True)

    result = JoinQuery.SpatialJoinQuery(linestring_rdd, polygon_rdd, True, True).collect()
    print(result)
    remove_directory(disc_location)
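# Hedged reading of the pattern above (inferred from usage here, not from the
# Sedona docs): indexedRawRDD seems to hold per-partition spatial indexes built
# over the raw, un-partitioned RDD, which is why it can be serialized to disk
# separately and re-attached to a freshly loaded SpatialRDD before analyze() is
# called; buildIndex(..., True) later builds new indexes over the spatially
# partitioned RDD for the join itself.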
def test_spatial_join_query_and_build_index_on_points_on_the_fly(self):
    query_window = PolygonRDD(
        self.sc, polygon_rdd_input_location, polygon_rdd_start_offset,
        polygon_rdd_end_offset, polygon_rdd_splitter, True)
    object_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False)
    object_rdd.analyze()

    object_rdd.spatialPartitioning(join_query_partitioning_type)
    query_window.spatialPartitioning(object_rdd.getPartitioner())

    for _ in range(each_query_loop_times):
        result_size = JoinQuery.SpatialJoinQuery(object_rdd, query_window, True, False).count()
def test_spatial_join_query_and_build_index_on_polygons_on_the_fly(self):
    query_window_rdd = PolygonRDD(
        self.sc, polygon_rdd_input_location, polygon_rdd_start_offset,
        polygon_rdd_end_offset, polygon_rdd_splitter, True)
    object_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False)
    object_rdd.analyze()

    object_rdd.spatialPartitioning(join_query_partitioning_type)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    for _ in range(each_query_loop_times):
        join_params = JoinParams(False, polygon_rdd_index_type, JoinBuildSide.LEFT)
        result_size = JoinQuery.spatialJoin(query_window_rdd, object_rdd, join_params).count()
def test_polygon_distance_join_with_crs_transformation(self):
    query_rdd = PolygonRDD(
        self.sc, input_location_query_polygon, splitter, True, num_partitions,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3857")
    window_rdd = CircleRDD(query_rdd, 0.1)
    object_rdd = PolygonRDD(
        self.sc, input_location_query_polygon, splitter, True, num_partitions,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3857")

    object_rdd.rawJvmSpatialRDD.jsrdd.repartition(4)
    object_rdd.spatialPartitioning(GridType.KDBTREE)
    object_rdd.buildIndex(IndexType.RTREE, True)
    window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    results = JoinQuery.DistanceJoinQuery(object_rdd, window_rdd, True, False).collect()
    assert len(results) == 5467

    for data in results:
        for polygon_data in data[1]:
            assert Circle(data[0].geom, 0.1).covers(polygon_data.geom)
def rdd_filesave_join():
    logger.info("\t - RDD file save join start")
    full_start_time = datetime.now()

    # ----------------------------------------------------------
    # get spark session and context
    # ----------------------------------------------------------
    start_time = datetime.now()

    spark = create_spark_session()
    sc = spark.sparkContext
    sedona_version = pkg_resources.get_distribution("sedona").version

    logger.info("\t - PySpark {} session initiated with Apache Sedona {}: {}".format(
        sc.version, sedona_version, datetime.now() - start_time))

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------
    start_time = datetime.now()

    offset = 0  # the point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path),
                         offset, FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t\t - GNAF RDD created: {}".format(datetime.now() - start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------
    for bdy in bdy_list:
        start_time = datetime.now()

        # load boundaries
        # create geometries from WKT strings into new DataFrame
        bdy_df = spark.read.parquet(os.path.join(output_path, bdy["name"])) \
            .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
            .drop("wkt_geom")

        # create bdy rdd
        bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")
        bdy_rdd.analyze()

        bdy_df.unpersist()

        bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())
        bdy_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)  # no need to persist(?) - used once

        # run the join - SpatialJoinQueryFlat returns a PairRDD with one
        # (boundary, point) pair per match
        # e.g. [Geometry: Polygon userData: WA32 TANGNEY WA, Geometry: Point userData: GAWA_146792426 WA]
        result_pair_rdd = JoinQuery.SpatialJoinQueryFlat(point_rdd, bdy_rdd, True, True)

        # jim = result_pair_rdd.take(10)
        # for row in jim:
        #     print(row)

        result_pair_rdd.saveAsTextFile(
            os.path.join(output_path, "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

        # # flat map values to have one point to bdy matched pair
        # flat_mapped_rdd = result_pair_rdd.flatMapValues(lambda x: x)
        #
        # # map values to create RDD row of gnaf & bdy IDs, plus state data
        # mapped_rdd = flat_mapped_rdd.map(
        #     lambda x: [x[1].getUserData().split("\t")[0],
        #                x[0].getUserData().split("\t")[0],
        #                x[0].getUserData().split("\t")[1]]
        # )
        #
        # # convert result to a dataframe of the following schema
        # schema = t.StructType([t.StructField("gnaf_pid", t.StringType(), False),
        #                        t.StructField(bdy["id_field"], t.StringType(), False),
        #                        t.StructField(bdy["name_field"], t.StringType(), False)])
        #
        # join_df = spark.createDataFrame(mapped_rdd, schema)
        #
        # # save result to disk
        # join_df.write \
        #     .option("compression", "gzip") \
        #     .mode("overwrite") \
        #     .parquet(os.path.join(output_path, "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

        logger.info("\t\t - GNAF points bdy tagged with {}: {}".format(
            bdy["name"], datetime.now() - start_time))

    # cleanup
    spark.stop()

    logger.info("\t - RDD file save join done: {}".format(datetime.now() - full_start_time))
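# Hedged usage sketch: rdd_filesave_join() relies on module-level scaffolding
# (logger, create_spark_session, output_path, gnaf_csv_file_path, bdy_list)
# that is defined elsewhere in this script and not shown here.
if __name__ == "__main__":
    import logging
    logging.basicConfig(level=logging.INFO)  # the module's own logger setup is assumed
    rdd_filesave_join()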