def test_overlapped_linestring_join_correctness(self):
    """Join the overlapping linestrings with the polygon windows and verify both runs.

    Both RDDs are parallelized from fixtures held on ``self`` and prepared
    with a QUADTREE grid.  The spatial join is executed twice -- third
    argument ``True``, then ``False`` -- and every collected result is handed
    to ``self.verify_join_result``.
    """
    windows = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY,
    )
    linestrings = LineStringRDD(
        self.sc.parallelize(self.test_overlapped_linestring_set),
        StorageLevel.MEMORY_ONLY,
    )
    self.prepare_rdd(linestrings, windows, GridType.QUADTREE)

    # NOTE(review): the third positional argument is presumably the
    # "use index" switch -- confirm against the JoinQuery API.
    for index_flag in (True, False):
        joined = JoinQuery.SpatialJoinQuery(
            linestrings, windows, index_flag, True
        ).collect()
        self.verify_join_result(joined)
def test_inside_point_join_correctness(self):
    """Join the inside-point fixture with the polygon windows and verify both runs.

    Calls ``self.once_before_all()`` first, builds both RDDs from fixtures on
    ``self`` (no explicit storage level), prepares them with a QUADTREE grid
    and runs the spatial join with the third argument ``True`` and then
    ``False``.  Each collected result goes through ``self.verify_join_result``.
    """
    self.once_before_all()

    windows = PolygonRDD(self.sc.parallelize(self.test_polygon_window_set))
    points = PointRDD(self.sc.parallelize(self.test_inside_point_set))
    self.prepare_rdd(points, windows, GridType.QUADTREE)

    # NOTE(review): third argument is presumably the "use index" switch.
    for index_flag in (True, False):
        joined = JoinQuery.SpatialJoinQuery(
            points, windows, index_flag, False
        ).collect()
        self.verify_join_result(joined)
def test_outside_point_join_correctness(self):
    """Check that the outside-point fixture produces an empty join result.

    After ``self.once_before_all()``, both RDDs are built with
    ``StorageLevel.MEMORY_ONLY`` and prepared with a QUADTREE grid.  The
    spatial join is collected twice (third argument ``True``, then ``False``)
    and each result must have length zero.
    """
    self.once_before_all()

    windows = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY,
    )
    points = PointRDD(
        self.sc.parallelize(self.test_outside_point_set),
        StorageLevel.MEMORY_ONLY,
    )
    self.prepare_rdd(points, windows, GridType.QUADTREE)

    # NOTE(review): third argument is presumably the "use index" switch.
    for index_flag in (True, False):
        joined = JoinQuery.SpatialJoinQuery(
            points, windows, index_flag, False
        ).collect()
        assert 0 == len(joined)
def test_outside_polygon_distance_join_correctness(self):
    """Check that the outside-polygon fixture produces an empty distance join.

    The polygon windows are wrapped in a ``CircleRDD`` built with ``0.1``
    (presumably the distance/radius -- confirm against the CircleRDD API),
    both sides are prepared with a QUADTREE grid, and the distance join is
    collected twice (third argument ``True``, then ``False``).  Each result
    must have length zero.
    """
    centers = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY,
    )
    circles = CircleRDD(centers, 0.1)
    polygons = PolygonRDD(
        self.sc.parallelize(self.test_outside_polygon_set),
        StorageLevel.MEMORY_ONLY,
    )
    self.prepare_rdd(polygons, circles, GridType.QUADTREE)

    # NOTE(review): third argument is presumably the "use index" switch.
    for index_flag in (True, False):
        joined = JoinQuery.DistanceJoinQuery(
            polygons, circles, index_flag, True
        ).collect()
        assert 0 == len(joined)
def test_indexed_rdd_assignment(self):
    """Run the indexed distance join twice: once on persisted RDDs, once on fresh ones.

    Pass 1 builds the point/circle pair, partitions and indexes it, persists
    and materialises (``count()``) the indexed and partitioned RDDs, then runs
    the distance join ``each_query_loop_times`` times.  Pass 2 rebuilds the
    same pair without persisting and repeats the join loop.

    NOTE(review): both passes record timestamps, but neither elapsed value is
    asserted on or reported anywhere in this block.
    """
    import time

    # ---- pass 1: persisted index / partitions ----
    points = PointRDD(
        self.sc, point_rdd_input_location, point_rdd_offset, point_rdd_splitter, True
    )
    circles = CircleRDD(points, 0.1)
    points.analyze()
    points.spatialPartitioning(GridType.QUADTREE)
    points.buildIndex(IndexType.QUADTREE, True)
    circles.spatialPartitioning(points.getPartitioner())
    # A second buildIndex call follows the first one, exactly as in the original.
    points.buildIndex(IndexType.RTREE, False)

    points.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
    circles.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    circles.jvmSpatialPartitionedRDD.count()
    points.indexedRDD.count()

    first_pass_started = time.time()
    for _ in range(each_query_loop_times):
        join_count = JoinQuery.DistanceJoinQuery(points, circles, True, True).count()
    first_pass_elapsed = time.time() - first_pass_started

    # ---- pass 2: same pipeline, nothing persisted ----
    points = PointRDD(
        self.sc, point_rdd_input_location, point_rdd_offset, point_rdd_splitter, True
    )
    circles = CircleRDD(points, 0.1)
    points.analyze()
    points.spatialPartitioning(GridType.QUADTREE)
    points.buildIndex(IndexType.QUADTREE, True)
    circles.spatialPartitioning(points.getPartitioner())
    points.buildIndex(IndexType.RTREE, False)

    second_pass_started = time.time()
    for _ in range(each_query_loop_times):
        join_count = JoinQuery.DistanceJoinQuery(points, circles, True, True).count()
def test_nested_loop(self, num_partitions, use_legacy_apis, grid_type, intersects):
    """Polygon-vs-polygon spatial join with the third join argument ``False``.

    Builds the query and spatial polygon RDDs, partitions them via
    ``self.partition_rdds``, collects the join (``intersects`` is forwarded
    as the fourth argument), sanity-checks the result and compares
    ``self.count_join_results`` with ``self.get_expected_count(intersects)``.
    """
    windows = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
    polygons = self.create_polygon_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(windows, polygons, grid_type, use_legacy_apis)

    joined = JoinQuery.SpatialJoinQuery(polygons, windows, False, intersects).collect()
    self.sanity_check_join_results(joined)

    expected = self.get_expected_count(intersects)
    actual = self.count_join_results(joined)
    assert expected == actual
def bdy_tag(spark, point_rdd, bdy):
    """Join points to boundaries and export the matched ID pairs to Parquet.

    Args:
        spark: Object used to build the output DataFrame via
            ``createDataFrame``; also passed to ``get_bdy_rdd``.
        point_rdd: Point RDD whose partitioner (``getPartitioner()``) is
            reused to partition the boundaries; it is the first join input.
        bdy: Mapping read for the ``"id_field"``, ``"name_field"`` and
            ``"name"`` keys; also passed to ``get_bdy_rdd``.

    The elapsed time is logged through the module-level ``logger``.
    """
    started_at = datetime.now()

    # Load the boundaries and partition them the same way as the points.
    # They are not persisted: the original author notes they are used once.
    boundaries = get_bdy_rdd(spark, bdy)
    boundaries.analyze()
    boundaries.spatialPartitioning(point_rdd.getPartitioner())

    # Run the join - returns a pair RDD with 1 boundary to 1-N points, e.g.
    # [Geometry: Polygon userData: WA32 TANGNEY WA,
    #  [Geometry: Point userData: GAWA_146792426 WA, ...]]
    boundary_to_points = JoinQuery.SpatialJoinQuery(point_rdd, boundaries, True, True)

    # Flatten so that each record is a single (boundary, point) pair.
    boundary_point_pairs = boundary_to_points.flatMapValues(lambda matched: matched)

    def to_row(pair):
        # User data is tab-separated; the point's first field and the
        # boundary's first two fields make up the output row.
        point_fields = pair[1].getUserData().split("\t")
        boundary_fields = pair[0].getUserData().split("\t")
        return [point_fields[0], boundary_fields[0], boundary_fields[1]]

    rows = boundary_point_pairs.map(to_row)

    # Convert the rows to a DataFrame with the following schema.
    columns = [
        t.StructField("gnaf_pid", t.StringType(), False),
        t.StructField(bdy["id_field"], t.StringType(), False),
        t.StructField(bdy["name_field"], t.StringType(), False),
    ]
    tagged_df = spark.createDataFrame(rows, t.StructType(columns))

    # Save the result to disk.  (Counting the joined rows here was left out
    # by the original author because it can be an expensive operation.)
    export_to_parquet(tagged_df, "gnaf_with_{}".format(bdy["name"]))

    # Clean up datasets in memory.  The original notes that SpatialRDD has no
    # unpersist method, so the boundaries are not released here.
    tagged_df.unpersist()
    rows.unpersist()
    boundary_point_pairs.unpersist()
    boundary_to_points.unpersist()

    logger.info("\t - GNAF points bdy tagged with {}: {}".format(
        bdy["name"], datetime.now() - started_at))
def test_index_int(self, num_partitions, use_legacy_apis, grid_type, index_type):
    """Rectangle self-join after building an index of ``index_type`` on the spatial side.

    Both RDDs are created from ``input_location``, partitioned via
    ``self.partition_rdds``, and the spatial RDD is indexed with
    ``buildIndex(index_type, True)``.  The collected join is sanity-checked
    and its count compared with the module-level ``match_count``.
    """
    windows = self.create_rectangle_rdd(input_location, splitter, num_partitions)
    rectangles = self.create_rectangle_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(windows, rectangles, grid_type, use_legacy_apis)
    rectangles.buildIndex(index_type, True)

    joined = JoinQuery.SpatialJoinQuery(rectangles, windows, False, True).collect()
    self.sanity_check_join_results(joined)

    actual = self.count_join_results(joined)
    assert match_count == actual
def nested_loop(self, query_rdd, num_partitions, grid_type, use_legacy_apis, expected_count):
    """Join a freshly created point RDD against ``query_rdd`` and check the match count.

    Args:
        query_rdd: Window-side RDD supplied by the caller.
        num_partitions: Forwarded to ``self.create_point_rdd``.
        grid_type: Forwarded to ``self.partition_rdds``.
        use_legacy_apis: Forwarded to ``self.partition_rdds``.
        expected_count: Value that ``self.count_join_results`` must equal.
    """
    points = self.create_point_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, points, grid_type, use_legacy_apis)

    joined = JoinQuery.SpatialJoinQuery(points, query_rdd, False, True).collect()
    self.sanity_check_join_results(joined)

    actual = self.count_join_results(joined)
    assert expected_count == actual
def dynamic_rtree_int(self, query_rdd, num_partitions, use_legacy_apis, grid_type, index_type,
                      expected_count):
    """Run ``JoinQuery.spatialJoin`` with explicit ``JoinParams`` and check the pair count.

    A point RDD is created and partitioned together with ``query_rdd``; the
    join parameters are ``JoinParams(True, index_type, JoinBuildSide.LEFT)``.
    The collected result is checked with
    ``self.sanity_check_flat_join_results`` and its length must equal
    ``expected_count``.
    """
    points = self.create_point_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, points, grid_type, use_legacy_apis)

    params = JoinParams(True, index_type, JoinBuildSide.LEFT)
    pairs = JoinQuery.spatialJoin(query_rdd, points, params).collect()

    self.sanity_check_flat_join_results(pairs)
    assert expected_count == len(pairs)
def test_nested_loop(self, num_partitions, use_legacy_apis, grid_type):
    """Rectangle self-join with the third join argument ``False``.

    Both RDDs are created from ``input_location`` and partitioned via
    ``self.partition_rdds``.  The collected join is sanity-checked and the
    value of ``self.count_join_results`` is compared with the module-level
    ``match_count``.
    """
    query_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
    spatial_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    # The match total comes solely from count_join_results; no separate
    # hand-rolled tally is kept because nothing would read it.
    self.sanity_check_join_results(result)
    assert match_count == self.count_join_results(result)
def test_spatial_join_query(self):
    """Join WKT points against WKT county polygons on a KDB-tree partitioning.

    The points are analyzed and partitioned with ``GridType.KDBTREE``; the
    polygons reuse the points' partitioner.  The size of the join result is
    printed -- this block makes no assertion.
    """
    points = PointRDD(self.sc, point_path, 4, FileDataSplitter.WKT, True)
    counties = PolygonRDD(self.sc, counties_path, 2, 3, FileDataSplitter.WKT, True)

    points.analyze()
    points.spatialPartitioning(GridType.KDBTREE)
    counties.spatialPartitioning(points.getPartitioner())

    joined = JoinQuery.SpatialJoinQuery(points, counties, True, False)
    match_total = joined.count()
    print(match_total)
def test_dynamic_index_int(self, num_partitions, use_legacy_apis, grid_type, index_type, intersects):
    """Polygon ``spatialJoin`` with ``JoinParams(intersects, index_type, LEFT)``.

    The expected pair count depends on
    ``self.expect_to_preserve_original_duplicates(grid_type)``: when it is
    truthy the "with original duplicates" figure is used, otherwise the plain
    expected count.
    """
    windows = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
    polygons = self.create_polygon_rdd(input_location, splitter, num_partitions)
    self.partition_rdds(windows, polygons, grid_type, use_legacy_apis)

    params = JoinParams(intersects, index_type, JoinBuildSide.LEFT)
    pairs = JoinQuery.spatialJoin(windows, polygons, params).collect()
    self.sanity_check_flat_join_results(pairs)

    if self.expect_to_preserve_original_duplicates(grid_type):
        expected = self.get_expected_with_original_duplicates_count(intersects)
    else:
        expected = self.get_expected_count(intersects)

    assert expected == len(pairs)
def test_distance_join_query(self):
    """Repeat a distance join (third argument ``False``) ``each_query_loop_times`` times.

    The circle windows are derived from the point RDD with ``0.1`` and share
    its QUADTREE partitioner.  Only ``count()`` is invoked on each join; the
    value is not asserted on.
    """
    points = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False,
    )
    circles = CircleRDD(points, 0.1)

    points.analyze()
    points.spatialPartitioning(GridType.QUADTREE)
    circles.spatialPartitioning(points.getPartitioner())

    for _ in range(each_query_loop_times):
        join_count = JoinQuery.DistanceJoinQuery(points, circles, False, True).count()
def test_spatial_join_query_and_build_index_on_points_on_the_fly(self):
    """Repeat a spatial join (third argument ``True``) ``each_query_loop_times`` times.

    The polygon windows reuse the partitioner of the analyzed point RDD.
    Only ``count()`` is invoked on each join; the value is not asserted on.
    """
    polygon_windows = PolygonRDD(
        self.sc,
        polygon_rdd_input_location,
        polygon_rdd_start_offset,
        polygon_rdd_end_offset,
        polygon_rdd_splitter,
        True,
    )
    points = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False,
    )

    points.analyze()
    # The module-level name is spelled "partitionin" where it is defined.
    points.spatialPartitioning(join_query_partitionin_type)
    polygon_windows.spatialPartitioning(points.getPartitioner())

    for _ in range(each_query_loop_times):
        join_count = JoinQuery.SpatialJoinQuery(points, polygon_windows, True, False).count()
def test_loading_spatial_rdd_from_disc(self):
    """Reload point, polygon and linestring RDDs (plus index RDDs) from disk and join them.

    For each geometry kind the spatial RDD and its index RDD are loaded from
    sub-directories of ``disc_location``, the index is attached as
    ``indexedRawRDD``, the RDD type is asserted, and the boundary envelope is
    printed after ``analyze()``.  Finally the linestrings and polygons are
    partitioned, indexed with R-trees and joined; the collected result is
    printed.
    """
    # ---- points ----
    point_dir = os.path.join(disc_location, "point")
    points = load_spatial_rdd_from_disc(self.sc, point_dir, GeoType.POINT)
    point_index_dir = os.path.join(disc_location, "point_index")
    points.indexedRawRDD = load_spatial_index_rdd_from_disc(self.sc, point_index_dir)
    assert points.indexedRawRDD is not None
    assert isinstance(points, PointRDD)
    points.analyze()
    print(points.boundaryEnvelope)

    # ---- polygons (analyzed and printed before the assertions) ----
    polygon_dir = os.path.join(disc_location, "polygon")
    polygons = load_spatial_rdd_from_disc(self.sc, polygon_dir, GeoType.POLYGON)
    polygon_index_dir = os.path.join(disc_location, "polygon_index")
    polygons.indexedRawRDD = load_spatial_index_rdd_from_disc(self.sc, polygon_index_dir)
    polygons.analyze()
    print(polygons.boundaryEnvelope)
    assert polygons.indexedRawRDD is not None
    assert isinstance(polygons, PolygonRDD)

    # ---- linestrings ----
    linestring_dir = os.path.join(disc_location, "line_string")
    linestrings = load_spatial_rdd_from_disc(self.sc, linestring_dir, GeoType.LINESTRING)
    linestring_index_dir = os.path.join(disc_location, "line_string_index")
    linestrings.indexedRawRDD = load_spatial_index_rdd_from_disc(self.sc, linestring_index_dir)
    assert linestrings.indexedRawRDD is not None
    assert isinstance(linestrings, LineStringRDD)
    linestrings.analyze()
    print(linestrings.boundaryEnvelope)

    # ---- join: polygons reuse the linestrings' grids ----
    linestrings.spatialPartitioning(GridType.RTREE)
    polygons.spatialPartitioning(linestrings.grids)
    polygons.buildIndex(IndexType.RTREE, True)
    linestrings.buildIndex(IndexType.RTREE, True)

    joined = JoinQuery.SpatialJoinQuery(linestrings, polygons, True, True).collect()
    print(joined)
def test_distance_join_query(self):
    """Distance join over persisted, QUADTREE-partitioned point and circle RDDs.

    After partitioning, both ``spatialPartitionedRDD`` attributes are
    persisted with ``MEMORY_ONLY`` and the point side is asserted to be
    cached.  The join (third argument ``False``) is then counted
    ``each_query_loop_times`` times; the count itself is not asserted on.
    """
    points = PointRDD(
        self.sc,
        point_rdd_input_location,
        point_rdd_offset,
        point_rdd_splitter,
        True,
        StorageLevel.MEMORY_ONLY,
    )
    circles = CircleRDD(points, 0.1)

    points.spatialPartitioning(GridType.QUADTREE)
    circles.spatialPartitioning(points.getPartitioner())

    points.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    assert points.spatialPartitionedRDD.is_cached
    circles.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        join_count = JoinQuery.DistanceJoinQuery(points, circles, False, True).count()
def test_spatial_join_query(self):
    """Spatial join over persisted, partitioned point and polygon RDDs.

    The polygons reuse the points' partitioner; both
    ``jvmSpatialPartitionedRDD`` attributes are persisted with
    ``MEMORY_ONLY``.  The join (third argument ``False``) is counted
    ``each_query_loop_times`` times; the count itself is not asserted on.
    """
    polygon_windows = PolygonRDD(
        self.sc,
        polygon_rdd_input_location,
        polygon_rdd_start_offset,
        polygon_rdd_end_offset,
        polygon_rdd_splitter,
        True,
    )
    points = PointRDD(
        self.sc,
        point_rdd_input_location,
        point_rdd_offset,
        point_rdd_splitter,
        True,
        StorageLevel.MEMORY_ONLY,
    )

    points.spatialPartitioning(join_query_partitioning_type)
    polygon_windows.spatialPartitioning(points.getPartitioner())

    points.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    polygon_windows.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        join_count = JoinQuery.SpatialJoinQuery(points, polygon_windows, False, True).count()
def test_spatial_join_query_and_build_index_on_polygons_on_the_fly(self):
    """Repeat ``JoinQuery.spatialJoin`` with ``JoinParams`` built on each iteration.

    The polygon windows reuse the partitioner of the analyzed point RDD.
    Each iteration creates
    ``JoinParams(False, polygon_rdd_index_type, JoinBuildSide.LEFT)`` and
    counts the join of the polygons with the points; the count is not
    asserted on.
    """
    polygon_windows = PolygonRDD(
        self.sc,
        polygon_rdd_input_location,
        polygon_rdd_start_offset,
        polygon_rdd_end_offset,
        polygon_rdd_splitter,
        True,
    )
    points = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False,
    )

    points.analyze()
    # The module-level name is spelled "partitionin" where it is defined.
    points.spatialPartitioning(join_query_partitionin_type)
    polygon_windows.spatialPartitioning(points.getPartitioner())

    for _ in range(each_query_loop_times):
        params = JoinParams(False, polygon_rdd_index_type, JoinBuildSide.LEFT)
        join_count = JoinQuery.spatialJoin(polygon_windows, points, params).count()
def test_spatial_join_query_with_polygon_rdd(self):
    """Join CRS-transformed points with polygons and check user data is present.

    Both RDDs are constructed with the CRS strings ``"epsg:4326"`` and
    ``"epsg:3005"``; the polygons are partitioned with the points' grids.
    The first element of the second result entry, and every matched element
    of every entry, must return non-``None`` user data.
    """
    windows = PolygonRDD(
        self.sc,
        input_location_query_polygon,
        splitter,
        True,
        num_partitions,
        StorageLevel.MEMORY_ONLY,
        "epsg:4326",
        "epsg:3005",
    )
    points = PointRDD(
        self.sc,
        input_location,
        offset,
        splitter,
        True,
        num_partitions,
        StorageLevel.MEMORY_ONLY,
        "epsg:4326",
        "epsg:3005",
    )

    points.spatialPartitioning(grid_type)
    windows.spatialPartitioning(points.grids)

    joined = JoinQuery.SpatialJoinQuery(points, windows, False, True).collect()

    assert joined[1][0].getUserData() is not None
    for entry in joined:
        matches = entry[1]
        if len(matches) == 0:
            continue
        for match in matches:
            assert match.getUserData() is not None
def test_polygon_distance_join_with_crs_transformation(self):
    """Distance-join CRS-transformed polygons against circles built from the same input.

    Both polygon RDDs are constructed with the CRS strings ``"epsg:4326"``
    and ``"epsg:3857"``.  The object side is partitioned and indexed with
    R-trees and the circles reuse its grids.  The collected result must
    contain 5467 entries, and for each entry a ``Circle`` built from the key
    geometry with ``0.1`` must cover every matched geometry.
    """
    window_polygons = PolygonRDD(
        self.sc,
        input_location_query_polygon,
        splitter,
        True,
        num_partitions,
        StorageLevel.MEMORY_ONLY,
        "epsg:4326",
        "epsg:3857",
    )
    circles = CircleRDD(window_polygons, 0.1)
    polygons = PolygonRDD(
        self.sc,
        input_location_query_polygon,
        splitter,
        True,
        num_partitions,
        StorageLevel.MEMORY_ONLY,
        "epsg:4326",
        "epsg:3857",
    )

    # NOTE(review): the value returned by repartition() is discarded here --
    # confirm whether this call is meant to have any effect.
    polygons.rawJvmSpatialRDD.jsrdd.repartition(4)
    polygons.spatialPartitioning(GridType.RTREE)
    polygons.buildIndex(IndexType.RTREE, True)
    circles.spatialPartitioning(polygons.grids)

    joined = JoinQuery.DistanceJoinQuery(polygons, circles, True, False).collect()
    assert len(joined) == 5467

    for entry in joined:
        for matched in entry[1]:
            assert Circle(entry[0].geom, 0.1).covers(matched.geom)