def test_creating_point_rdd(self):
    """Load WKT points from disk and verify the distinct-geometry count."""
    rdd = PointRDD(self.spark._sc, point_path, 4, FileDataSplitter.WKT, True)
    rdd.analyze()
    cnt = rdd.countWithoutDuplicates()
    assert cnt == 12872, f"Point RDD should have 12872 but found {cnt}"
def test_spatial_knn_correctness(self):
    """Indexed and non-indexed KNN must return geometrically identical top-k sets."""
    point_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                         StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    plain_result = KNNQuery.SpatialKnnQuery(point_rdd, query_point, top_k, False)
    point_rdd.buildIndex(IndexType.RTREE, False)
    indexed_result = KNNQuery.SpatialKnnQuery(point_rdd, query_point, top_k, True)

    # Order both result sets by distance to the query point before comparing.
    def by_distance(geo_data):
        return distance_sorting_functions(geo_data, query_point)

    plain_sorted = sorted(plain_result, key=by_distance)
    indexed_sorted = sorted(indexed_result, key=by_distance)

    # Pairwise geometric distance between corresponding hits must be zero.
    difference = sum(
        plain_sorted[i].geom.distance(indexed_sorted[i].geom)
        for i in range(top_k)
    )
    assert difference == 0
def test_build_index_without_set_grid(self):
    """An R-tree index can be built on a raw RDD with no spatial partitioning."""
    rdd = PointRDD(sparkContext=self.sc,
                   InputLocation=input_location,
                   Offset=offset,
                   splitter=splitter,
                   carryInputData=True,
                   partitions=num_partitions,
                   newLevel=StorageLevel.MEMORY_ONLY)
    rdd.buildIndex(IndexType.RTREE, False)
def test_spatial_knn_query_using_index(self):
    """KNN query on an indexed RDD returns results that carry user data.

    Fix: the original asserted ``result.__len__() > -1``, which is vacuously
    true for any list; assert a non-empty result instead (consistent with the
    other KNN-with-index test in this suite).
    """
    point_rdd = PointRDD(self.sc, input_location, offset, splitter, False)
    point_rdd.buildIndex(IndexType.RTREE, False)
    for _ in range(self.loop_times):
        result = KNNQuery.SpatialKnnQuery(
            point_rdd, self.query_point, self.top_k, False)
        assert len(result) > 0
        assert result[0].getUserData() is not None
def test_knn_query_with_index(self):
    """Smoke test: repeatedly run an indexed KNN query without errors."""
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
    object_rdd.buildIndex(point_rdd_index_type, False)
    for _ in range(each_query_loop_times):
        KNNQuery.SpatialKnnQuery(object_rdd, knn_query_point, 1000, True)
def test_range_query_using_index(self):
    """Smoke test: repeatedly run an indexed spatial range query.

    Fix: the original ended with ``.count`` (a bound-method reference that is
    never invoked), so the count was never materialized; call ``.count()``
    as the non-indexed range-query test does.
    """
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
    object_rdd.buildIndex(point_rdd_index_type, False)
    for _ in range(each_query_loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            object_rdd, range_query_window, False, True).count()
def test_spatial_knn_query_using_index(self):
    """KNN query on a CRS-transformed, indexed RDD yields non-empty results with user data."""
    point_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                         StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    point_rdd.buildIndex(IndexType.RTREE, False)
    for _ in range(loop_times):
        hits = KNNQuery.SpatialKnnQuery(point_rdd, query_point, top_k, False)
        assert len(hits) > 0
        assert hits[0].getUserData() is not None
def test_point_rdd(self):
    """First four raw geometries match the known coordinates of the input file."""
    point_rdd = PointRDD(sparkContext=self.sc,
                         InputLocation=point_rdd_input_location,
                         Offset=point_rdd_offset,
                         splitter=point_rdd_splitter,
                         carryInputData=False)
    collected = point_rdd.getRawSpatialRDD().collect()
    expected_coordinates = [
        [-88.331492, 32.324142],
        [-88.175933, 32.360763],
        [-88.388954, 32.357073],
        [-88.221102, 32.35078],
    ]
    actual_coordinates = [[gd.geom.x, gd.geom.y] for gd in collected[:4]]
    assert actual_coordinates == expected_coordinates[:4]
def test_crs_tranformed_spatial_range_query_using_index(self):
    """Smoke test: indexed range query on a CRS-transformed RDD.

    Fix: the original ended with ``.count`` (a bound-method reference that is
    never invoked), so the count was never materialized; call ``.count()``.
    """
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False,
                          newLevel=StorageLevel.DISK_ONLY,
                          sourceEpsgCRSCode="epsg:4326",
                          targetEpsgCode="epsg:3005")
    object_rdd.buildIndex(point_rdd_index_type, False)
    for _ in range(each_query_loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            object_rdd, range_query_window, False, True).count()
def readToPointRDD(cls, sc: SparkContext, inputPath: str) -> PointRDD:
    """Read a shapefile into a :class:`PointRDD`.

    :param sc: active SparkContext used to reach the JVM.
    :param inputPath: path of the shapefile input.
    :return: a PointRDD wrapping the JVM-side spatial RDD.
    """
    ShapefileReader.validate_imports()
    # Delegate the actual parsing to the JVM-side ShapefileReader.
    jvm_srdd = sc._jvm.ShapefileReader.readToPointRDD(sc._jsc, inputPath)
    point_rdd = PointRDD()
    point_rdd.set_srdd(jvm_srdd)
    return point_rdd
def test_spatial_range_query_using_index(self):
    """Indexed RDD range query: stable result size and user data present."""
    spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, False)
    spatial_rdd.buildIndex(IndexType.RTREE, False)
    for _ in range(self.loop_times):
        query = RangeQuery.SpatialRangeQuery(
            spatial_rdd, self.query_envelope, False, False)
        assert query.count() == 2830
    sample = RangeQuery.SpatialRangeQuery(
        spatial_rdd, self.query_envelope, False, False).take(10)
    assert sample[1].getUserData() is not None
def test_crs_transform(self):
    """CRSTransform from epsg:4326 to epsg:3857 reprojects the geometries."""
    spatial_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=crs_test_point,
                           Offset=0,
                           splitter=splitter,
                           carryInputData=True,
                           partitions=numPartitions,
                           newLevel=StorageLevel.MEMORY_ONLY)
    spatial_rdd.CRSTransform("epsg:4326", "epsg:3857")
    first_record = spatial_rdd.rawSpatialRDD.collect()[0]
    assert first_record.geom.wkt == "POINT (-9833016.710450118 3805934.914254189)"
def test_get_crs_transformation(self):
    """getCRStransformation flips from False to True after CRSTransform is applied."""
    spatial_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=crs_test_point,
                           Offset=0,
                           splitter=splitter,
                           carryInputData=True,
                           partitions=numPartitions,
                           newLevel=StorageLevel.MEMORY_ONLY)
    # Freshly loaded RDD has no transformation recorded yet.
    assert not spatial_rdd.getCRStransformation()
    spatial_rdd.CRSTransform("epsg:4326", "epsg:3857")
    assert spatial_rdd.getCRStransformation()
def test_spatial_range_query_using_index(self):
    """Range query on a CRS-transformed, indexed RDD returns 3127 hits with user data."""
    spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                           StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    spatial_rdd.buildIndex(IndexType.RTREE, False)
    for _ in range(loop_times):
        hits = RangeQuery.SpatialRangeQuery(
            spatial_rdd, query_envelope, False, False).count()
        assert hits == 3127
    sample = RangeQuery.SpatialRangeQuery(
        spatial_rdd, query_envelope, False, False).take(10)
    assert sample[1].getUserData() is not None
def test_empty_constructor(self):
    """A PointRDD built via the empty constructor can adopt another RDD's raw JVM RDD."""
    source_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=input_location,
                          Offset=offset,
                          splitter=splitter,
                          carryInputData=True,
                          partitions=num_partitions,
                          newLevel=StorageLevel.MEMORY_ONLY)
    source_rdd.buildIndex(IndexType.RTREE, False)
    copied_rdd = PointRDD()
    copied_rdd.rawJvmSpatialRDD = source_rdd.rawJvmSpatialRDD
    copied_rdd.analyze()
def test_get_source_epsg_code(self):
    """getSourceEpsgCode is empty before a CRS transform and set afterwards."""
    spatial_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=crs_test_point,
                           Offset=0,
                           splitter=splitter,
                           carryInputData=True,
                           partitions=numPartitions,
                           newLevel=StorageLevel.MEMORY_ONLY)
    assert spatial_rdd.getSourceEpsgCode() == ""
    spatial_rdd.CRSTransform("epsg:4326", "epsg:3857")
    assert spatial_rdd.getSourceEpsgCode() == "epsg:4326"
def test_point_rdd(self):
    """A PointRDD converts to a DataFrame, with and without an explicit schema."""
    spatial_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=crs_test_point,
                           Offset=0,
                           splitter=splitter,
                           carryInputData=True,
                           partitions=numPartitions,
                           newLevel=StorageLevel.MEMORY_ONLY)
    # Expand each record into [geometry, *tab-separated user-data fields].
    rows = spatial_rdd.rawSpatialRDD.map(
        lambda record: [record.geom, *record.getUserData().split("\t")])
    self.spark.createDataFrame(rows).show()
    schema = StructType([
        StructField("geom", GeometryType()),
        StructField("name", StringType())
    ])
    typed_df = self.spark.createDataFrame(rows, schema)
    typed_df.show()
    assert typed_df.take(1)[0][0].wkt == "POINT (32.324142 -88.331492)"
def test_circle_rdd(self):
    """A CircleRDD can be derived from a PointRDD and its raw geometries collected."""
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
    circle_rdd = CircleRDD(object_rdd, 0.1)
    geometries = circle_rdd.getRawSpatialRDD().collect()
    print([gd.geom.wkt for gd in geometries])
def create_spatial_rdd(self):
    """Build and return the shared PointRDD fixture used by the tests."""
    return PointRDD(sparkContext=self.sc,
                    InputLocation=input_file_location,
                    Offset=offset,
                    splitter=splitter,
                    carryInputData=True,
                    partitions=numPartitions,
                    newLevel=StorageLevel.MEMORY_ONLY)
def test_spatial_join_query_with_polygon_rdd(self):
    """Indexed spatial join of CRS-transformed points against polygons keeps user data."""
    query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter,
                           True, num_partitions, StorageLevel.MEMORY_ONLY,
                           "epsg:4326", "epsg:3005")
    spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                           num_partitions, StorageLevel.MEMORY_ONLY,
                           "epsg:4326", "epsg:3005")
    spatial_rdd.spatialPartitioning(grid_type)
    query_rdd.spatialPartitioning(spatial_rdd.grids)
    result = JoinQuery.SpatialJoinQuery(
        spatial_rdd, query_rdd, False, True).collect()
    assert result[1][0].getUserData() is not None
    # Every matched right-side geometry must still carry its user data.
    for left, matches in result:
        for match in matches:
            assert match.getUserData() is not None
def test_distance_join_query_using_index(self):
    """Smoke test: indexed distance join between points and derived circles.

    Fix: the original ended with ``.count`` (a bound-method reference that is
    never invoked), so the join result was never counted; call ``.count()``.
    """
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
    query_window_rdd = CircleRDD(object_rdd, 0.1)
    object_rdd.analyze()
    object_rdd.spatialPartitioning(GridType.QUADTREE)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())
    object_rdd.buildIndex(IndexType.RTREE, True)
    for _ in range(each_query_loop_times):
        result_size = JoinQuery.DistanceJoinQuery(
            object_rdd, query_window_rdd, True, True).count()
def test_spatial_range_query(self):
    """Smoke test: repeatedly run a non-indexed spatial range query and log its size."""
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
    for _ in range(each_query_loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            object_rdd, range_query_window, False, False).count()
        logging.info(result_size)
def test_equal_partitioning(self):
    """EQUALGRID partitioning preserves the distinct-geometry count."""
    spatial_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=input_location,
                           Offset=offset,
                           splitter=splitter,
                           carryInputData=False,
                           partitions=10,
                           newLevel=StorageLevel.MEMORY_ONLY)
    spatial_rdd.analyze()
    spatial_rdd.spatialPartitioning(GridType.EQUALGRID)
    for grid_envelope in spatial_rdd.grids:
        print("PointRDD spatial partitioning grids: " + str(grid_envelope))
    # Partitioning may duplicate boundary points; dedup counts must agree.
    assert (spatial_rdd.countWithoutDuplicates()
            == spatial_rdd.countWithoutDuplicatesSPRDD())
def test_spatial_join_using_index(self):
    """Smoke test: repeatedly run an indexed spatial join of points against polygons."""
    query_window = PolygonRDD(self.sc, polygon_rdd_input_location,
                              polygon_rdd_start_offset, polygon_rdd_end_offset,
                              polygon_rdd_splitter, True)
    object_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
    object_rdd.analyze()
    object_rdd.spatialPartitioning(join_query_partitionin_type)
    query_window.spatialPartitioning(object_rdd.getPartitioner())
    object_rdd.buildIndex(point_rdd_index_type, True)
    for _ in range(each_query_loop_times):
        result_size = JoinQuery.SpatialJoinQuery(
            object_rdd, query_window, True, False).count()
def test_r_tree_spatial_partitioning(self):
    """RTREE partitioning preserves the distinct-geometry count."""
    spatial_rdd = PointRDD(sparkContext=self.sc,
                           InputLocation=input_location,
                           Offset=offset,
                           splitter=splitter,
                           carryInputData=True,
                           partitions=10,
                           newLevel=StorageLevel.MEMORY_ONLY)
    spatial_rdd.analyze()
    spatial_rdd.spatialPartitioning(GridType.RTREE)
    for grid_envelope in spatial_rdd.grids:
        print(grid_envelope)
    assert (spatial_rdd.countWithoutDuplicates()
            == spatial_rdd.countWithoutDuplicatesSPRDD())
def test_save_as_geo_json_with_data(self, remove_wkb_directory):
    """Round-trip via GeoJSON: saved output reloads with the same record count."""
    original_rdd = PointRDD(sparkContext=self.sc,
                            InputLocation=inputLocation,
                            Offset=offset,
                            splitter=splitter,
                            carryInputData=True,
                            partitions=numPartitions,
                            newLevel=StorageLevel.MEMORY_ONLY)
    original_rdd.saveAsGeoJSON(test_save_as_wkb_with_data)
    reloaded_rdd = PointRDD(sparkContext=self.sc,
                            InputLocation=test_save_as_wkb_with_data,
                            splitter=FileDataSplitter.GEOJSON,
                            carryInputData=True,
                            partitions=numPartitions,
                            newLevel=StorageLevel.MEMORY_ONLY)
    assert reloaded_rdd.rawSpatialRDD.count() == original_rdd.rawSpatialRDD.count()
def test_empty_constructor_test(self):
    """An empty-constructed PointRDD can adopt and analyze another RDD's raw JVM RDD."""
    source_rdd = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
    cloned_rdd = PointRDD()
    cloned_rdd.rawJvmSpatialRDD = source_rdd.rawJvmSpatialRDD
    cloned_rdd.analyze()
def test_on_boundary_point_join_correctness(self):
    """Join correctness for points lying exactly on window boundaries (with and without index)."""
    window_rdd = PolygonRDD(self.sc.parallelize(self.test_polygon_window_set),
                            StorageLevel.MEMORY_ONLY)
    object_rdd = PointRDD(self.sc.parallelize(self.test_on_boundary_point_set),
                          StorageLevel.MEMORY_ONLY)
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    indexed = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, True, False).collect()
    self.verify_join_result(indexed)

    unindexed = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, False, False).collect()
    self.verify_join_result(unindexed)
def test_outside_point_join_correctness(self):
    """Points entirely outside all windows must produce an empty join result."""
    self.once_before_all()
    window_rdd = PolygonRDD(self.sc.parallelize(self.test_polygon_window_set),
                            StorageLevel.MEMORY_ONLY)
    object_rdd = PointRDD(self.sc.parallelize(self.test_outside_point_set),
                          StorageLevel.MEMORY_ONLY)
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    indexed = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, True, False).collect()
    assert len(indexed) == 0

    unindexed = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, False, False).collect()
    assert len(unindexed) == 0
def test_spatial_join_query(self):
    """Smoke test: KDB-tree-partitioned join of WKT points against county polygons."""
    point_rdd = PointRDD(self.sc, point_path, 4, FileDataSplitter.WKT, True)
    polygon_rdd = PolygonRDD(self.sc, counties_path, 2, 3,
                             FileDataSplitter.WKT, True)
    point_rdd.analyze()
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    polygon_rdd.spatialPartitioning(point_rdd.getPartitioner())
    joined = JoinQuery.SpatialJoinQuery(point_rdd, polygon_rdd, True, False)
    print(joined.count())