def test_saving_to_disc_index_point(self, remove_spatial_rdd_disc_dir):
    # The body builds and saves a point index, so the test is named accordingly.
    from tests.properties.point_properties import input_location, offset, splitter, num_partitions
    point_rdd = PointRDD(
        self.sc,
        input_location,
        offset,
        splitter,
        True,
        num_partitions,
        StorageLevel.MEMORY_ONLY
    )
    point_rdd.buildIndex(IndexType.RTREE, False)
    point_rdd.indexedRawRDD.saveAsObjectFile(
        os.path.join(disc_object_location, "point_index"))

def test_spatial_knn_correctness(self):
    point_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                         StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    result_no_index = KNNQuery.SpatialKnnQuery(point_rdd, query_point, top_k, False)
    point_rdd.buildIndex(IndexType.RTREE, False)
    result_with_index = KNNQuery.SpatialKnnQuery(point_rdd, query_point, top_k, True)

    sorted_result_no_index = sorted(
        result_no_index,
        key=lambda geo_data: distance_sorting_functions(geo_data, query_point))
    sorted_result_with_index = sorted(
        result_with_index,
        key=lambda geo_data: distance_sorting_functions(geo_data, query_point))

    # Indexed and non-indexed KNN must return geometries at identical distances
    # from the query point, so the accumulated pairwise gap should be zero.
    difference = 0
    for x in range(top_k):
        difference += sorted_result_no_index[x].geom.distance(
            sorted_result_with_index[x].geom)

    assert difference == 0

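# distance_sorting_functions is used as the sort key above but is not defined
# in this section. A minimal sketch of what such a key needs to do, assuming
# each GeoData result exposes its Shapely geometry as .geom:
def distance_sorting_functions(geo_data, query_point):
    # Planar distance from the result geometry to the query point.
    return geo_data.geom.distance(query_point)
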
def load(cls, sc: SparkContext, path: str) -> SpatialRDD:
    jvm = sc._jvm
    point_rdd = PointRDD()
    srdd = SpatialObjectLoaderAdapter(jvm).load_point_spatial_rdd(sc._jsc, path)
    point_rdd.set_srdd(srdd)
    return point_rdd

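# Usage sketch, not part of the source. Assumptions: the load classmethod
# above is exposed on a loader class, here called SpatialObjectLoader (name
# hypothetical), and the path was produced by rawJvmSpatialRDD.saveAsObjectFile
# as in the save tests later in this section.
def example_reload_points(sc: SparkContext) -> PointRDD:
    point_rdd = SpatialObjectLoader.load(sc, os.path.join(disc_object_location, "point"))
    point_rdd.analyze()  # recompute the boundary envelope and count after reload
    return point_rdd
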
def test_creating_point_rdd(self):
    point_rdd = PointRDD(self.spark._sc, point_path, 4, FileDataSplitter.WKT, True)
    point_rdd.analyze()
    cnt = point_rdd.countWithoutDuplicates()
    assert cnt == 12872, f"Point RDD should have 12872 points but found {cnt}"

def test_spatial_knn_query_using_index(self):
    point_rdd = PointRDD(self.sc, input_location, offset, splitter, False)
    point_rdd.buildIndex(IndexType.RTREE, False)

    for _ in range(self.loop_times):
        result = KNNQuery.SpatialKnnQuery(point_rdd, self.query_point, self.top_k, False)
        # len(result) can never be negative, so assert a non-empty result instead.
        assert len(result) > 0
        assert result[0].getUserData() is not None

def test_build_index_without_set_grid(self):
    spatial_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        Offset=offset,
        splitter=splitter,
        carryInputData=True,
        partitions=num_partitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.buildIndex(IndexType.RTREE, False)

def test_spatial_knn_query_using_index(self):
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)
    object_rdd.buildIndex(point_rdd_index_type, False)
    object_rdd.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        result = KNNQuery.SpatialKnnQuery(object_rdd, knn_query_point, 1000, True)

def test_knn_query_with_index(self):
    object_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False
    )
    object_rdd.buildIndex(point_rdd_index_type, False)

    for _ in range(each_query_loop_times):
        result = KNNQuery.SpatialKnnQuery(object_rdd, knn_query_point, 1000, True)

def test_spatial_knn_query_using_index(self):
    point_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                         StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    point_rdd.buildIndex(IndexType.RTREE, False)

    for _ in range(loop_times):
        result = KNNQuery.SpatialKnnQuery(point_rdd, query_point, top_k, False)
        assert len(result) > 0
        assert result[0].getUserData() is not None

def test_range_query_using_index(self):
    object_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False
    )
    object_rdd.buildIndex(point_rdd_index_type, False)

    for _ in range(each_query_loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            object_rdd, range_query_window, False, True).count()

def test_spatial_range_query_using_index(self):
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)
    object_rdd.buildIndex(point_rdd_index_type, False)
    object_rdd.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY)
    assert object_rdd.indexedRawRDD.is_cached

    for _ in range(each_query_loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            object_rdd, range_query_window, False, True).count()

def test_crs_transformed_spatial_range_query_using_index(self):
    object_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False,
        newLevel=StorageLevel.DISK_ONLY,
        sourceEpsgCRSCode="epsg:4326",
        targetEpsgCRSCode="epsg:3005"
    )
    object_rdd.buildIndex(point_rdd_index_type, False)

    for _ in range(each_query_loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            object_rdd, range_query_window, False, True).count()

def test_point_rdd(self):
    point_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False
    )
    collected_points = point_rdd.getRawSpatialRDD().collect()

    points_coordinates = [
        [-88.331492, 32.324142], [-88.175933, 32.360763],
        [-88.388954, 32.357073], [-88.221102, 32.35078]
    ]

    assert [[geo_data.geom.x, geo_data.geom.y]
            for geo_data in collected_points[:4]] == points_coordinates

def test_crs_transform(self):
    spatial_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=crs_test_point,
        Offset=0,
        splitter=splitter,
        carryInputData=True,
        partitions=numPartitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.CRSTransform("epsg:4326", "epsg:3857")

    assert spatial_rdd.rawSpatialRDD.collect()[0].geom.wkt == \
        "POINT (-9833016.710450118 3805934.914254189)"

def test_spatial_range_query_using_index(self):
    spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, False)
    spatial_rdd.buildIndex(IndexType.RTREE, False)

    for _ in range(self.loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            spatial_rdd, self.query_envelope, False, False).count()
        assert result_size == 2830

    assert RangeQuery.SpatialRangeQuery(
        spatial_rdd, self.query_envelope, False, False
    ).take(10)[1].getUserData() is not None

def test_spatial_range_query_using_index(self):
    spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                           StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    spatial_rdd.buildIndex(IndexType.RTREE, False)

    for _ in range(loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            spatial_rdd, query_envelope, False, False).count()
        assert result_size == 3127

    assert RangeQuery.SpatialRangeQuery(
        spatial_rdd, query_envelope, False, False
    ).take(10)[1].getUserData() is not None

def test_get_crs_transformation(self):
    spatial_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=crs_test_point,
        Offset=0,
        splitter=splitter,
        carryInputData=True,
        partitions=numPartitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    assert not spatial_rdd.getCRStransformation()

    spatial_rdd.CRSTransform("epsg:4326", "epsg:3857")
    assert spatial_rdd.getCRStransformation()

def test_empty_constructor(self):
    spatial_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        Offset=offset,
        splitter=splitter,
        carryInputData=True,
        partitions=num_partitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.buildIndex(IndexType.RTREE, False)

    # An empty PointRDD can be rebuilt from another RDD's JVM handle.
    spatial_rdd_copy = PointRDD()
    spatial_rdd_copy.rawJvmSpatialRDD = spatial_rdd.rawJvmSpatialRDD
    spatial_rdd_copy.analyze()

def test_get_source_epsg_code(self):
    spatial_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=crs_test_point,
        Offset=0,
        splitter=splitter,
        carryInputData=True,
        partitions=numPartitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    assert spatial_rdd.getSourceEpsgCode() == ""

    spatial_rdd.CRSTransform("epsg:4326", "epsg:3857")
    assert spatial_rdd.getSourceEpsgCode() == "epsg:4326"

def test_point_rdd(self):
    spatial_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=crs_test_point,
        Offset=0,
        splitter=splitter,
        carryInputData=True,
        partitions=numPartitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    raw_spatial_rdd = spatial_rdd.rawSpatialRDD.map(
        lambda x: [x.geom, *x.getUserData().split("\t")])
    self.spark.createDataFrame(raw_spatial_rdd).show()

    schema = StructType([
        StructField("geom", GeometryType()),
        StructField("name", StringType())
    ])

    spatial_rdd_with_schema = self.spark.createDataFrame(
        raw_spatial_rdd, schema, verifySchema=False)
    spatial_rdd_with_schema.show()

    assert spatial_rdd_with_schema.take(1)[0][0].wkt == \
        "POINT (32.324142 -88.331492)"

def readToPointRDD(cls, sc: SparkContext, inputPath: str) -> PointRDD:
    """Read a shapefile into a PointRDD.

    :param sc: SparkContext used to reach the JVM
    :param inputPath: path to the directory containing the shapefile
    :return: PointRDD wrapping the loaded geometries
    """
    ShapefileReader.validate_imports()
    jvm = sc._jvm
    jsc = sc._jsc
    srdd = jvm.ShapefileReader.readToPointRDD(jsc, inputPath)
    spatial_rdd = PointRDD()
    spatial_rdd.set_srdd(srdd)
    return spatial_rdd

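# Usage sketch, not part of the source. It assumes the classmethod above
# belongs to ShapefileReader (matching the jvm.ShapefileReader call it wraps)
# and that "/data/points" is a hypothetical directory holding a point shapefile.
def example_read_points(sc: SparkContext) -> PointRDD:
    point_rdd = ShapefileReader.readToPointRDD(sc, "/data/points")
    point_rdd.analyze()  # computes the boundary envelope and record count
    return point_rdd
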
def test_saving_to_disc_spatial_rdd_point(self, remove_spatial_rdd_disc_dir):
    from tests.properties.point_properties import input_location, offset, splitter, num_partitions
    point_rdd = PointRDD(
        self.sc,
        input_location,
        offset,
        splitter,
        True,
        num_partitions,
        StorageLevel.MEMORY_ONLY
    )
    point_rdd.rawJvmSpatialRDD.saveAsObjectFile(
        os.path.join(disc_object_location, "point"))

def test_spatial_knn_query(self):
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)
    object_rdd.rawJvmSpatialRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        result = KNNQuery.SpatialKnnQuery(object_rdd, knn_query_point, 1000, False)

def test_circle_rdd(self):
    object_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False
    )
    # Wrap every point in a circle of radius 0.1, in the units of the data's CRS.
    circle_rdd = CircleRDD(object_rdd, 0.1)
    collected_data = circle_rdd.getRawSpatialRDD().collect()
    print([geo_data.geom.wkt for geo_data in collected_data])

def test_spatial_range_query(self):
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)
    object_rdd.rawJvmSpatialRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            object_rdd, range_query_window, False, False).count()

def test_distance_join_query(self):
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)
    query_window_rdd = CircleRDD(object_rdd, 0.1)

    object_rdd.spatialPartitioning(GridType.QUADTREE)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    object_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    assert object_rdd.spatialPartitionedRDD.is_cached
    query_window_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        result_size = JoinQuery.DistanceJoinQuery(
            object_rdd, query_window_rdd, False, True).count()

def test_spatial_join_query_with_polygon_rdd(self):
    query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter, True,
                           num_partitions, StorageLevel.MEMORY_ONLY,
                           "epsg:4326", "epsg:3005")
    spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                           num_partitions, StorageLevel.MEMORY_ONLY,
                           "epsg:4326", "epsg:3005")

    spatial_rdd.spatialPartitioning(grid_type)
    query_rdd.spatialPartitioning(spatial_rdd.grids)

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    assert result[1][0].getUserData() is not None
    for data in result:
        if len(data[1]) != 0:
            for right_data in data[1]:
                assert right_data.getUserData() is not None

def test_spatial_join_query(self):
    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset, polygon_rdd_end_offset,
                                  polygon_rdd_splitter, True)
    object_rdd = PointRDD(self.sc, point_rdd_input_location, point_rdd_offset,
                          point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)

    object_rdd.spatialPartitioning(join_query_partitioning_type)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    object_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    query_window_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        result_size = JoinQuery.SpatialJoinQuery(
            object_rdd, query_window_rdd, False, True).count()

def create_spatial_rdd(self):
    spatial_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=input_file_location,
        Offset=offset,
        splitter=splitter,
        carryInputData=True,
        partitions=numPartitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    return spatial_rdd

def test_save_as_geo_json_with_data(self, remove_wkb_directory):
    spatial_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=inputLocation,
        Offset=offset,
        splitter=splitter,
        carryInputData=True,
        partitions=numPartitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.saveAsGeoJSON(test_save_as_wkb_with_data)

    # Read the saved GeoJSON back and check that no records were lost.
    result_geojson = PointRDD(
        sparkContext=self.sc,
        InputLocation=test_save_as_wkb_with_data,
        splitter=FileDataSplitter.GEOJSON,
        carryInputData=True,
        partitions=numPartitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    assert result_geojson.rawSpatialRDD.count() == spatial_rdd.rawSpatialRDD.count()