def test_empty_constructor(self):
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=splitter,
        carryInputData=True,
        partitions=num_partitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    spatial_rdd.spatialPartitioning(grid_type)
    spatial_rdd.buildIndex(IndexType.RTREE, True)
    spatial_rdd_copy = PolygonRDD()
    spatial_rdd_copy.rawSpatialRDD = spatial_rdd
    spatial_rdd_copy.analyze()
def test_overlapped_polygon_join_correctness(self):
    window_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY
    )
    object_rdd = PolygonRDD(
        self.sc.parallelize(self.test_overlapped_polygon_set),
        StorageLevel.MEMORY_ONLY
    )
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, True, True).collect()
    self.verify_join_result(result)

    result_no_index = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, False, True).collect()
    self.verify_join_result(result_no_index)
def test_spatial_knn_correctness(self):
    polygon_rdd = PolygonRDD(self.sc, input_location, splitter, True)

    result_no_index = KNNQuery.SpatialKnnQuery(
        polygon_rdd, self.query_point, self.top_k, False)
    polygon_rdd.buildIndex(IndexType.RTREE, False)
    result_with_index = KNNQuery.SpatialKnnQuery(
        polygon_rdd, self.query_point, self.top_k, True)

    sorted_result_no_index = sorted(
        result_no_index,
        key=lambda geo_data: distance_sorting_functions(
            geo_data, self.query_point))
    sorted_result_with_index = sorted(
        result_with_index,
        key=lambda geo_data: distance_sorting_functions(
            geo_data, self.query_point))

    # The indexed and non-indexed kNN results must match geometry for
    # geometry, so the summed pairwise distance must be exactly zero.
    difference = 0
    for x in range(self.top_k):
        difference += sorted_result_no_index[x].geom.distance(
            sorted_result_with_index[x].geom)
    assert difference == 0
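# A minimal sketch (assumption, not necessarily the suite's actual
# definition) of the distance_sorting_functions helper used above: the sort
# key is the distance from a result geometry to the query point, so both kNN
# result lists are ordered nearest-first before being compared element-wise.
def distance_sorting_functions(geo_data, query_point):
    return geo_data.geom.distance(query_point)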
def test_load_id_column_data_check(self):
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext,
        geojson_id_input_location,
        FileDataSplitter.GEOJSON,
        True
    )
    spatial_rdd.analyze()
    df = Adapter.toDf(spatial_rdd, self.spark)
    df.show()
    assert len(df.columns) == 4
    assert df.count() == 1
def test_outside_polygon_join_correctness(self):
    window_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY
    )
    object_rdd = PolygonRDD(
        self.sc.parallelize(self.test_outside_polygon_set),
        StorageLevel.MEMORY_ONLY
    )
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, True, False).collect()
    assert len(result) == 0

    result_no_index = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, False, False).collect()
    assert len(result_no_index) == 0
def test_outside_polygon_distance_join_correctness(self):
    center_geometry_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY
    )
    window_rdd = CircleRDD(center_geometry_rdd, 0.1)
    object_rdd = PolygonRDD(
        self.sc.parallelize(self.test_outside_polygon_set),
        StorageLevel.MEMORY_ONLY
    )
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.DistanceJoinQuery(
        object_rdd, window_rdd, True, True).collect()
    assert len(result) == 0

    result_no_index = JoinQuery.DistanceJoinQuery(
        object_rdd, window_rdd, False, True).collect()
    assert len(result_no_index) == 0
def test_spatial_knn_query_using_index(self):
    polygon_rdd = PolygonRDD(self.sc, input_location, splitter, True)
    polygon_rdd.buildIndex(IndexType.RTREE, False)

    for i in range(self.loop_times):
        result = KNNQuery.SpatialKnnQuery(
            polygon_rdd, self.query_point, self.top_k, True)
        # The result must be non-empty; result[0] below relies on this.
        assert len(result) > 0
        assert result[0].getUserData() is not None
def test_build_index_without_set_grid(self):
    spatial_rdd = PolygonRDD(
        self.sc,
        input_location,
        FileDataSplitter.CSV,
        carryInputData=True,
        partitions=num_partitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    spatial_rdd.buildIndex(IndexType.RTREE, False)
def test_creating_polygon_rdd(self):
    polygon_rdd = PolygonRDD(
        self.spark._sc, counties_path, 2, 3, FileDataSplitter.WKT, True)
    polygon_rdd.analyze()
    cnt = polygon_rdd.countWithoutDuplicates()
    assert cnt == 407, f"Polygon RDD should have 407 polygons but found {cnt}"
def test_wkb_constructor(self):
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location_wkb,
        splitter=FileDataSplitter.WKB,
        carryInputData=True,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    assert spatial_rdd.approximateTotalCount == 103
    assert spatial_rdd.boundaryEnvelope is not None
    assert spatial_rdd.rawSpatialRDD.take(1)[0].getUserData() == \
        "31\t039\t00835841\t31039\tCuming\tCuming County\t06\tH1\tG4020\t\t\t\tA\t1477895811\t10447360\t+41.9158651\t-096.7885168"
def test_geojson_to_dataframe(self):
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext,
        geojson_input_location,
        FileDataSplitter.GEOJSON,
        True
    )
    spatial_rdd.analyze()
    df = Adapter.toDf(spatial_rdd, self.spark). \
        withColumn("geometry", expr("ST_GeomFromWKT(geometry)"))
    df.show()
    assert df.columns[1] == "STATEFP"
def test_spatial_range_query(self):
    spatial_rdd = PolygonRDD(
        self.sc, input_location, splitter, True, StorageLevel.MEMORY_ONLY)
    for i in range(self.loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            spatial_rdd, self.query_envelope, False, False).count()
        assert result_size == 704
    assert RangeQuery.SpatialRangeQuery(
        spatial_rdd, self.query_envelope, False, False
    ).take(10)[0].getUserData() is not None
def test_hilbert_curve_spatial_partitioning(self):
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=splitter,
        carryInputData=True,
        partitions=10,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    spatial_rdd.spatialPartitioning(GridType.HILBERT)

    for envelope in spatial_rdd.grids:
        print(envelope)
def test_voronoi_spatial_partitioning(self):
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=FileDataSplitter.CSV,
        carryInputData=True,
        partitions=10,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    spatial_rdd.spatialPartitioning(GridType.VORONOI)

    for envelope in spatial_rdd.grids:
        print(envelope)
def test_spatial_join_query(self):
    point_rdd = PointRDD(self.sc, point_path, 4, FileDataSplitter.WKT, True)
    polygon_rdd = PolygonRDD(
        self.sc, counties_path, 2, 3, FileDataSplitter.WKT, True)
    point_rdd.analyze()

    point_rdd.spatialPartitioning(GridType.KDBTREE)
    polygon_rdd.spatialPartitioning(point_rdd.getPartitioner())

    result = JoinQuery.SpatialJoinQuery(point_rdd, polygon_rdd, True, False)
    print(result.count())
def readToPolygonRDD(cls, sc: SparkContext, inputPath: str) -> PolygonRDD:
    """Read shapefile data into a PolygonRDD.

    :param sc: SparkContext used to create the underlying JVM spatial RDD
    :param inputPath: location of the shapefile data to read
    :return: a PolygonRDD wrapping the JVM RDD produced by ShapefileReader
    """
    ShapefileReader.validate_imports()
    jvm = sc._jvm
    jsc = sc._jsc
    srdd = jvm.ShapefileReader.readToPolygonRDD(jsc, inputPath)
    spatial_rdd = PolygonRDD()
    spatial_rdd.set_srdd(srdd)
    return spatial_rdd
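# Usage sketch for readToPolygonRDD. The import path is assumed from this
# package's layout and the shapefile path is a placeholder, not a file from
# this repository:
#
#     from geo_pyspark.core.formatMapper.shapefileParser import ShapefileReader
#
#     polygon_rdd = ShapefileReader.readToPolygonRDD(sc, "/data/shapefiles/counties")
#     polygon_rdd.analyze()
#     print(polygon_rdd.approximateTotalCount)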
def test_polygon_distance_join_with_crs_transformation(self):
    query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter,
                           True, num_partitions, StorageLevel.MEMORY_ONLY,
                           "epsg:4326", "epsg:3857")
    window_rdd = CircleRDD(query_rdd, 0.1)
    object_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter,
                            True, num_partitions, StorageLevel.MEMORY_ONLY,
                            "epsg:4326", "epsg:3857")

    object_rdd.rawJvmSpatialRDD.jsrdd.repartition(4)
    object_rdd.spatialPartitioning(GridType.RTREE)
    object_rdd.buildIndex(IndexType.RTREE, True)
    window_rdd.spatialPartitioning(object_rdd.grids)

    results = JoinQuery.DistanceJoinQuery(
        object_rdd, window_rdd, True, False).collect()
    assert len(results) == 5467

    for data in results:
        for polygon_data in data[1]:
            assert Circle(data[0].geom, 0.1).covers(polygon_data.geom)
def test_mbr(self):
    polygon_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=FileDataSplitter.CSV,
        carryInputData=True,
        partitions=num_partitions
    )
    rectangle_rdd = polygon_rdd.MinimumBoundingRectangle()
    result = rectangle_rdd.rawSpatialRDD.collect()

    for el in result:
        print(el.geom.wkt)
    print(result)
    assert len(result) > 0
def test_inside_point_join_correctness(self):
    self.once_before_all()
    window_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set))
    object_rdd = PointRDD(self.sc.parallelize(self.test_inside_point_set))
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, True, False).collect()
    self.verify_join_result(result)

    result_no_index = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, False, False).collect()
    self.verify_join_result(result_no_index)
def test_spatial_join_query_and_build_index_on_points_on_the_fly(self):
    query_window = PolygonRDD(self.sc, polygon_rdd_input_location,
                              polygon_rdd_start_offset,
                              polygon_rdd_end_offset,
                              polygon_rdd_splitter, True)
    object_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False
    )
    object_rdd.analyze()
    object_rdd.spatialPartitioning(join_query_partitionin_type)
    query_window.spatialPartitioning(object_rdd.getPartitioner())

    for i in range(each_query_loop_times):
        result_size = JoinQuery.SpatialJoinQuery(
            object_rdd, query_window, True, False).count()
def test_polygon_rdd(self):
    polygon_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=polygon_rdd_input_location,
        startOffset=polygon_rdd_start_offset,
        endOffset=polygon_rdd_end_offset,
        splitter=polygon_rdd_splitter,
        carryInputData=True
    )
    collected_polygon_rdd = polygon_rdd.getRawSpatialRDD().collect()
    input_wkt_polygons = [
        "POLYGON ((-74.020753 40.836454, -74.020753 40.843768, -74.018162 40.843768, -74.018162 40.836454, -74.020753 40.836454))",
        "POLYGON ((-74.018978 40.837712, -74.018978 40.852181, -74.014938 40.852181, -74.014938 40.837712, -74.018978 40.837712))",
        "POLYGON ((-74.021683 40.833253, -74.021683 40.834288, -74.021368 40.834288, -74.021368 40.833253, -74.021683 40.833253))"
    ]
    assert [geo_data.geom.wkt
            for geo_data in collected_polygon_rdd][:3] == input_wkt_polygons
def test_geojson_constructor(self):
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location_geo_json,
        splitter=FileDataSplitter.GEOJSON,
        carryInputData=True,
        partitions=4,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    assert spatial_rdd.approximateTotalCount == 1001
    assert spatial_rdd.boundaryEnvelope is not None
    assert spatial_rdd.rawSpatialRDD.take(1)[0].getUserData() == \
        "01\t077\t011501\t5\t1500000US010770115015\t010770115015\t5\tBG\t6844991\t32636"
    assert spatial_rdd.rawSpatialRDD.take(2)[1].getUserData() == \
        "01\t045\t021102\t4\t1500000US010450211024\t010450211024\t4\tBG\t11360854\t0"
    assert spatial_rdd.fieldNames == [
        "STATEFP", "COUNTYFP", "TRACTCE", "BLKGRPCE", "AFFGEOID",
        "GEOID", "NAME", "LSAD", "ALAND", "AWATER"
    ]
def test_spatial_join_query_with_polygon_rdd(self):
    query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter,
                           True, num_partitions, StorageLevel.MEMORY_ONLY,
                           "epsg:4326", "epsg:3005")
    spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                           num_partitions, StorageLevel.MEMORY_ONLY,
                           "epsg:4326", "epsg:3005")

    spatial_rdd.spatialPartitioning(grid_type)
    query_rdd.spatialPartitioning(spatial_rdd.grids)

    result = JoinQuery.SpatialJoinQuery(
        spatial_rdd, query_rdd, False, True).collect()

    assert result[1][0].getUserData() is not None
    for data in result:
        if len(data[1]) != 0:
            for right_data in data[1]:
                assert right_data.getUserData() is not None
def test_spatial_join_query_and_build_index_on_polygons_on_the_fly(self):
    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset,
                                  polygon_rdd_end_offset,
                                  polygon_rdd_splitter, True)
    object_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False
    )
    object_rdd.analyze()
    object_rdd.spatialPartitioning(join_query_partitionin_type)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    for i in range(each_query_loop_times):
        join_params = JoinParams(False, polygon_rdd_index_type,
                                 JoinBuildSide.LEFT)
        result_size = JoinQuery.spatialJoin(
            query_window_rdd, object_rdd, join_params).count()
def test_to_df_srdd_fn_spark(self):
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext,
        geojson_input_location,
        FileDataSplitter.GEOJSON,
        True
    )
    spatial_rdd.analyze()
    assert spatial_rdd.approximateTotalCount == 1001

    spatial_columns = [
        "state_id", "county_id", "tract_id", "bg_id",
        "fips", "fips_short", "bg_nr", "type", "code1", "code2"
    ]
    spatial_df = Adapter.toDf(spatial_rdd, spatial_columns, self.spark)
    spatial_df.show()

    assert spatial_df.columns == ["geometry", *spatial_columns]
    assert spatial_df.count() == 1001
def create_polygon_rdd(self, location, splitter, num_partitions):
    rdd = PolygonRDD(self.sc, location, splitter, True, num_partitions)
    return PolygonRDD(rdd.rawJvmSpatialRDD, StorageLevel.MEMORY_ONLY)
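# Usage sketch for create_polygon_rdd (hypothetical arguments). The helper
# constructs the RDD twice on purpose: the second call uses the
# (rawJvmSpatialRDD, StorageLevel) overload, which is the constructor form
# that attaches an explicit storage level to the copy:
#
#     polygon_rdd = self.create_polygon_rdd(input_location,
#                                           FileDataSplitter.CSV,
#                                           num_partitions)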
def test_constructor(self):
    # Keyword-argument constructor with input file, splitter, partitions,
    # and storage level
    spatial_rdd_core = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=splitter,
        carryInputData=True,
        partitions=num_partitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

    # The same constructor, positional form
    spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True,
                                  num_partitions, StorageLevel.MEMORY_ONLY)
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

    # From an existing JVM RDD
    spatial_rdd = PolygonRDD(rawSpatialRDD=spatial_rdd_core.rawJvmSpatialRDD)
    self.compare_spatial_rdd(spatial_rdd, input_boundary)

    # From a JVM RDD with a CRS transformation
    spatial_rdd = PolygonRDD(spatial_rdd_core.rawJvmSpatialRDD,
                             "epsg:4326", "epsg:5070")
    self.compare_spatial_rdd(spatial_rdd, query_envelope)
    assert spatial_rdd.getSourceEpsgCode() == "epsg:4326"
    assert spatial_rdd.getTargetEpsgCode() == "epsg:5070"

    spatial_rdd = PolygonRDD(
        rawSpatialRDD=spatial_rdd_core.rawJvmSpatialRDD,
        sourceEpsgCode="epsg:4326",
        targetEpsgCode="epsg:5070"
    )
    assert spatial_rdd.getSourceEpsgCode() == "epsg:4326"
    assert spatial_rdd.getTargetEpsgCode() == "epsg:5070"
    self.compare_spatial_rdd(spatial_rdd, query_envelope)

    # From a JVM RDD with a storage level
    spatial_rdd = PolygonRDD(rawSpatialRDD=spatial_rdd.rawJvmSpatialRDD,
                             newLevel=StorageLevel.MEMORY_ONLY)
    self.compare_spatial_rdd(spatial_rdd, query_envelope)

    spatial_rdd = PolygonRDD(spatial_rdd_core.rawJvmSpatialRDD,
                             StorageLevel.MEMORY_ONLY)
    self.compare_spatial_rdd(spatial_rdd, input_boundary)

    # Empty constructor
    spatial_rdd = PolygonRDD()

    # Offset-based constructors, with and without an explicit partition count
    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset,
                                  polygon_rdd_end_offset,
                                  polygon_rdd_splitter, True, 2)
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000

    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset,
                                  polygon_rdd_end_offset,
                                  polygon_rdd_splitter, True)
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000

    spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True,
                                  num_partitions)
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

    spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True)
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

    # Offset-based constructors with a storage level
    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset,
                                  polygon_rdd_end_offset,
                                  polygon_rdd_splitter, True, 5,
                                  StorageLevel.MEMORY_ONLY)
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000

    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset,
                                  polygon_rdd_end_offset,
                                  polygon_rdd_splitter, True,
                                  StorageLevel.MEMORY_ONLY)
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000

    spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True, 5,
                                  StorageLevel.MEMORY_ONLY)
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

    spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True,
                                  StorageLevel.MEMORY_ONLY)
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

    # JVM RDD with a storage level and a CRS transformation
    spatial_rdd = PolygonRDD(spatial_rdd_core.rawJvmSpatialRDD,
                             StorageLevel.MEMORY_ONLY,
                             "epsg:4326", "epsg:5070")
    self.compare_spatial_rdd(spatial_rdd, query_envelope)

    # Offset-based constructors with a storage level and a CRS transformation
    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset,
                                  polygon_rdd_end_offset,
                                  polygon_rdd_splitter, True, 5,
                                  StorageLevel.MEMORY_ONLY,
                                  "epsg:4326", "epsg:5070")
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000

    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset,
                                  polygon_rdd_end_offset,
                                  polygon_rdd_splitter, True,
                                  StorageLevel.MEMORY_ONLY,
                                  "epsg:4326", "epsg:5070")
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000

    spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True, 5,
                                  StorageLevel.MEMORY_ONLY,
                                  "epsg:4326", "epsg:5070")
    self.compare_spatial_rdd(spatial_rdd_core, query_envelope)

    spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True,
                                  StorageLevel.MEMORY_ONLY,
                                  "epsg:4326", "epsg:5070")

    spatial_rdd_core = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=splitter,
        carryInputData=True,
        newLevel=StorageLevel.MEMORY_ONLY,
        sourceEpsgCRSCode="epsg:4326",
        targetEpsgCode="epsg:5070"
    )
    self.compare_spatial_rdd(spatial_rdd_core, query_envelope)
def getCenterPolygonAsSpatialRDD(self) -> 'PolygonRDD':
    from geo_pyspark.core.SpatialRDD import PolygonRDD
    srdd = self._srdd.getCenterPolygonAsSpatialRDD()
    polygon_rdd = PolygonRDD()
    polygon_rdd.set_srdd(srdd)
    return polygon_rdd
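# Usage sketch (assumption: this method belongs to CircleRDD, whose circles
# here are built from polygon centers, as in the distance-join tests above):
#
#     circle_rdd = CircleRDD(polygon_rdd, 0.1)
#     center_polygons = circle_rdd.getCenterPolygonAsSpatialRDD()
#     center_polygons.analyze()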