def test_empty_constructor(self):
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=splitter,
        carryInputData=True,
        partitions=num_partitions,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    spatial_rdd.spatialPartitioning(grid_type)
    spatial_rdd.buildIndex(IndexType.RTREE, True)

    spatial_rdd_copy = PolygonRDD()
    spatial_rdd_copy.rawJvmSpatialRDD = spatial_rdd.rawJvmSpatialRDD
    spatial_rdd_copy.analyze()

def test_geo_data_convert_polygon_rdd(self):
    polygon = Polygon([(0, 0), (0, 1), (1, 1), (1, 0), (0, 0)])

    # A polygon with one hole; avoid shadowing the builtins `int`.
    exterior = [(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]
    interior = [(1, 1), (1.5, 1), (1.5, 1.5), (1, 1.5), (1, 1)]
    polygon2 = Polygon(exterior, [interior])

    wkt = "POLYGON ((-71.1776585052917 42.3902909739571, -71.1776820268866 42.3903701743239, -71.1776063012595 42.3903825660754, -71.1775826583081 42.3903033653531, -71.1776585052917 42.3902909739571))"
    polygon3 = loads(wkt)

    polygons = [
        GeoData(geom=polygon, userData="a"),
        GeoData(geom=polygon2, userData="b"),
        GeoData(geom=polygon3, userData="c"),
    ]

    rdd_data = self.sc.parallelize(polygons)
    polygon_rdd = PolygonRDD(rdd_data)

    collected_data = polygon_rdd.rawSpatialRDD.collect()
    sorted_collected_data = sorted(collected_data, key=lambda x: x.userData)
    assert all(
        geo1 == geo2 for geo1, geo2 in zip(polygons, sorted_collected_data)
    )

def test_outside_polygon_join_correctness(self):
    window_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY)
    object_rdd = PolygonRDD(
        self.sc.parallelize(self.test_outside_polygon_set),
        StorageLevel.MEMORY_ONLY)
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, True, False).collect()
    assert len(result) == 0

    result_no_index = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, False, False).collect()
    assert len(result_no_index) == 0

def test_overlapped_polygon_join_correctness(self):
    window_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY)
    object_rdd = PolygonRDD(
        self.sc.parallelize(self.test_overlapped_polygon_set),
        StorageLevel.MEMORY_ONLY)
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, True, True).collect()
    self.verify_join_result(result)

    result_no_index = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, False, True).collect()
    self.verify_join_result(result_no_index)

def test_saving_to_disc_spatial_rdd_polygon(self, remove_spatial_rdd_disc_dir):
    from tests.properties.polygon_properties import input_location, splitter, num_partitions
    polygon_rdd = PolygonRDD(
        self.sc, input_location, splitter, True, num_partitions,
        StorageLevel.MEMORY_ONLY)
    polygon_rdd.rawJvmSpatialRDD.saveAsObjectFile(
        os.path.join(disc_object_location, "polygon"))

def test_spatial_knn_correctness(self):
    polygon_rdd = PolygonRDD(self.sc, input_location, splitter, True)

    result_no_index = KNNQuery.SpatialKnnQuery(
        polygon_rdd, self.query_point, self.top_k, False)
    polygon_rdd.buildIndex(IndexType.RTREE, False)
    result_with_index = KNNQuery.SpatialKnnQuery(
        polygon_rdd, self.query_point, self.top_k, True)

    sorted_result_no_index = sorted(
        result_no_index,
        key=lambda geo_data: distance_sorting_functions(
            geo_data, self.query_point))
    sorted_result_with_index = sorted(
        result_with_index,
        key=lambda geo_data: distance_sorting_functions(
            geo_data, self.query_point))

    # The indexed and non-indexed queries must return geometries at
    # identical distances from the query point.
    difference = 0
    for x in range(self.top_k):
        difference += sorted_result_no_index[x].geom.distance(
            sorted_result_with_index[x].geom)
    assert difference == 0

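# The distance_sorting_functions helper used above is defined elsewhere in
# the test suite. A minimal sketch, assuming it simply orders results by
# planar distance from the query point:
#
#     def distance_sorting_functions(geo_data, query_point):
#         return geo_data.geom.distance(query_point)
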
def test_spatial_join_query_with_polygon_rdd_using_index(self):
    query_rdd = PolygonRDD(
        self.sc, input_location_query_polygon, splitter, True,
        num_partitions, StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")
    spatial_rdd = PointRDD(
        self.sc, input_location, offset, splitter, True, num_partitions,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005")

    query_rdd.analyze()
    spatial_rdd.analyze()

    spatial_rdd.spatialPartitioning(grid_type)
    spatial_rdd.buildIndex(IndexType.RTREE, True)
    query_rdd.spatialPartitioning(spatial_rdd.getPartitioner())

    result = JoinQuery.SpatialJoinQuery(
        spatial_rdd, query_rdd, False, True).collect()

    assert result[1][0].getUserData() is not None
    # Iterating an empty match list is a no-op, so no emptiness check is needed.
    for data in result:
        for right_data in data[1]:
            assert right_data.getUserData() is not None

def load(cls, sc: SparkContext, path: str) -> SpatialRDD:
    jvm = sc._jvm
    polygon_rdd = PolygonRDD()
    srdd = SpatialObjectLoaderAdapter(jvm).load_polygon_spatial_rdd(
        sc._jsc, path)
    polygon_rdd.set_srdd(srdd)
    return polygon_rdd

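# A minimal usage sketch for load(), assuming a PolygonRDD was previously
# written with saveAsObjectFile (as in test_saving_to_disc_spatial_rdd_polygon).
# The enclosing class of load() is not shown in this snippet, so
# `SpatialObjectLoader` below is a hypothetical stand-in for it:
#
#     saved_path = os.path.join(disc_object_location, "polygon")
#     polygon_rdd = SpatialObjectLoader.load(sc, saved_path)
#     polygon_rdd.analyze()
#     assert polygon_rdd.approximateTotalCount > 0
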
def test_saving_to_disc_index_polygon(self):
    from tests.properties.polygon_properties import input_location, splitter, num_partitions
    polygon_rdd = PolygonRDD(
        self.sc, input_location, splitter, True, num_partitions,
        StorageLevel.MEMORY_ONLY)
    polygon_rdd.buildIndex(IndexType.RTREE, False)
    polygon_rdd.indexedRawRDD.saveAsObjectFile(
        os.path.join(disc_location, "polygon_index"))

def test_spatial_knn_query_using_index(self):
    polygon_rdd = PolygonRDD(self.sc, input_location, splitter, True)
    polygon_rdd.buildIndex(IndexType.RTREE, False)

    for i in range(self.loop_times):
        result = KNNQuery.SpatialKnnQuery(
            polygon_rdd, self.query_point, self.top_k, True)
        assert len(result) >= 0  # smoke check: the query completed
        assert result[0].getUserData() is not None

def test_outside_polygon_distance_join_correctness(self):
    center_geometry_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set),
        StorageLevel.MEMORY_ONLY)
    window_rdd = CircleRDD(center_geometry_rdd, 0.1)
    object_rdd = PolygonRDD(
        self.sc.parallelize(self.test_outside_polygon_set),
        StorageLevel.MEMORY_ONLY)
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.DistanceJoinQuery(
        object_rdd, window_rdd, True, True).collect()
    assert len(result) == 0

    result_no_index = JoinQuery.DistanceJoinQuery(
        object_rdd, window_rdd, False, True).collect()
    assert len(result_no_index) == 0

def test_build_index_without_set_grid(self):
    spatial_rdd = PolygonRDD(
        self.sc, input_location, FileDataSplitter.CSV,
        carryInputData=True, partitions=num_partitions,
        newLevel=StorageLevel.MEMORY_ONLY)
    spatial_rdd.analyze()
    spatial_rdd.buildIndex(IndexType.RTREE, False)

def test_creating_polygon_rdd(self):
    polygon_rdd = PolygonRDD(
        self.spark._sc, counties_path, 2, 3, FileDataSplitter.WKT, True)
    polygon_rdd.analyze()
    cnt = polygon_rdd.countWithoutDuplicates()
    assert cnt == 407, f"Polygon RDD should have 407 polygons, found {cnt}"

def test_geojson_to_dataframe(self):
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext, geojson_input_location,
        FileDataSplitter.GEOJSON, True)
    spatial_rdd.analyze()

    df = Adapter.toDf(spatial_rdd, self.spark)
    df.show()
    assert df.columns[1] == "STATEFP"

def test_geojson_to_dataframe_with_geometry_conversion(self):
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext, geojson_input_location,
        FileDataSplitter.GEOJSON, True)
    spatial_rdd.analyze()

    df = Adapter.toDf(spatial_rdd, self.spark) \
        .withColumn("geometry", expr("ST_GeomFromWKT(geometry)"))
    df.show()
    assert df.columns[1] == "STATEFP"

def test_spatial_range_query(self):
    spatial_rdd = PolygonRDD(
        self.sc, input_location, splitter, True, StorageLevel.MEMORY_ONLY)

    for i in range(self.loop_times):
        result_size = RangeQuery.SpatialRangeQuery(
            spatial_rdd, self.query_envelope, False, False).count()
        assert result_size == 704

    assert RangeQuery.SpatialRangeQuery(
        spatial_rdd, self.query_envelope, False, False
    ).take(10)[0].getUserData() is not None

def test_wkb_constructor(self):
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location_wkb,
        splitter=FileDataSplitter.WKB,
        carryInputData=True,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    assert spatial_rdd.approximateTotalCount == 103
    assert spatial_rdd.boundaryEnvelope is not None
    assert spatial_rdd.rawSpatialRDD.take(1)[0].getUserData() == \
        "31\t039\t00835841\t31039\tCuming\tCuming County\t06\tH1\tG4020\t\t\t\tA\t1477895811\t10447360\t+41.9158651\t-096.7885168"

def test_voronoi_spatial_partitioning(self):
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=FileDataSplitter.CSV,
        carryInputData=True,
        partitions=10,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    spatial_rdd.spatialPartitioning(GridType.VORONOI)

    for envelope in spatial_rdd.grids:
        print(envelope)

def test_load_id_column_data_check(self):
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext, geojson_id_input_location,
        FileDataSplitter.GEOJSON, True)
    spatial_rdd.analyze()
    df = Adapter.toDf(spatial_rdd, self.spark)
    df.show()
    # Depending on the GeoJSON properties present, either 3 or 4 columns
    # are loaded.
    assert len(df.columns) in (3, 4)
    assert df.count() == 1

def test_hilbert_curve_spatial_partitioning(self):
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=splitter,
        carryInputData=True,
        partitions=10,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    spatial_rdd.spatialPartitioning(GridType.HILBERT)

    for envelope in spatial_rdd.grids:
        print(envelope)

def readToPolygonRDD(cls, sc: SparkContext, inputPath: str) -> PolygonRDD:
    """
    Read a shapefile into a PolygonRDD.

    :param sc: SparkContext used to create the underlying JVM RDD
    :param inputPath: path to the input shapefile directory
    :return: PolygonRDD wrapping the loaded geometries
    """
    jvm = sc._jvm
    jsc = sc._jsc
    srdd = jvm.ShapefileReader.readToPolygonRDD(jsc, inputPath)
    spatial_rdd = PolygonRDD()
    spatial_rdd.set_srdd(srdd)
    return spatial_rdd

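# A minimal usage sketch for readToPolygonRDD(), assuming `ShapefileReader`
# is the enclosing class and `shapefile_input_location` is a hypothetical
# path to a directory holding the .shp/.shx/.dbf files:
#
#     spatial_rdd = ShapefileReader.readToPolygonRDD(sc, shapefile_input_location)
#     spatial_rdd.analyze()
#     print(spatial_rdd.approximateTotalCount)
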
def test_spatial_join_query(self):
    point_rdd = PointRDD(self.sc, point_path, 4, FileDataSplitter.WKT, True)
    polygon_rdd = PolygonRDD(
        self.sc, counties_path, 2, 3, FileDataSplitter.WKT, True)
    point_rdd.analyze()

    point_rdd.spatialPartitioning(GridType.KDBTREE)
    polygon_rdd.spatialPartitioning(point_rdd.getPartitioner())

    result = JoinQuery.SpatialJoinQuery(
        point_rdd, polygon_rdd, True, False)
    print(result.count())

def test_polygon_distance_join_with_crs_transformation(self):
    query_rdd = PolygonRDD(
        self.sc, input_location_query_polygon, splitter, True,
        num_partitions, StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3857")
    window_rdd = CircleRDD(query_rdd, 0.1)
    object_rdd = PolygonRDD(
        self.sc, input_location_query_polygon, splitter, True,
        num_partitions, StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3857")

    object_rdd.rawJvmSpatialRDD.jsrdd.repartition(4)
    object_rdd.spatialPartitioning(GridType.KDBTREE)
    object_rdd.buildIndex(IndexType.RTREE, True)
    window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    results = JoinQuery.DistanceJoinQuery(
        object_rdd, window_rdd, True, False).collect()
    assert len(results) == 5467

    # Every matched polygon must be covered by the circle of radius 0.1
    # built around the corresponding query geometry.
    for data in results:
        for polygon_data in data[1]:
            assert Circle(data[0].geom, 0.1).covers(polygon_data.geom)

def test_mbr(self):
    polygon_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=FileDataSplitter.CSV,
        carryInputData=True,
        partitions=num_partitions
    )
    rectangle_rdd = polygon_rdd.MinimumBoundingRectangle()
    result = rectangle_rdd.rawSpatialRDD.collect()

    for el in result:
        print(el.geom.wkt)
    print(result)
    assert len(result) >= 0  # smoke check: the call completed

def test_geojson_constructor(self):
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location_geo_json,
        splitter=FileDataSplitter.GEOJSON,
        carryInputData=True,
        partitions=4,
        newLevel=StorageLevel.MEMORY_ONLY
    )
    spatial_rdd.analyze()
    assert spatial_rdd.approximateTotalCount == 1001
    assert spatial_rdd.boundaryEnvelope is not None
    assert spatial_rdd.rawSpatialRDD.take(1)[0].getUserData() == \
        "01\t077\t011501\t5\t1500000US010770115015\t010770115015\t5\tBG\t6844991\t32636"
    assert spatial_rdd.rawSpatialRDD.take(2)[1].getUserData() == \
        "01\t045\t021102\t4\t1500000US010450211024\t010450211024\t4\tBG\t11360854\t0"
    assert spatial_rdd.fieldNames == [
        "STATEFP", "COUNTYFP", "TRACTCE", "BLKGRPCE", "AFFGEOID",
        "GEOID", "NAME", "LSAD", "ALAND", "AWATER"
    ]

def test_spatial_join_query(self):
    query_window_rdd = PolygonRDD(
        self.sc,
        polygon_rdd_input_location,
        polygon_rdd_start_offset,
        polygon_rdd_end_offset,
        polygon_rdd_splitter,
        True
    )
    object_rdd = PointRDD(
        self.sc,
        point_rdd_input_location,
        point_rdd_offset,
        point_rdd_splitter,
        True,
        StorageLevel.MEMORY_ONLY)

    object_rdd.spatialPartitioning(join_query_partitioning_type)
    query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

    object_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
    query_window_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

    for _ in range(each_query_loop_times):
        result_size = JoinQuery.SpatialJoinQuery(
            object_rdd, query_window_rdd, False, True).count()

def test_inside_point_join_correctness(self):
    self.once_before_all()
    window_rdd = PolygonRDD(
        self.sc.parallelize(self.test_polygon_window_set))
    object_rdd = PointRDD(self.sc.parallelize(self.test_inside_point_set))
    self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

    result = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, True, False).collect()
    self.verify_join_result(result)

    result_no_index = JoinQuery.SpatialJoinQuery(
        object_rdd, window_rdd, False, False).collect()
    self.verify_join_result(result_no_index)

def test_to_df_srdd_fn_spark(self):
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext, geojson_input_location,
        FileDataSplitter.GEOJSON, True)
    spatial_rdd.analyze()
    assert spatial_rdd.approximateTotalCount == 1001

    spatial_columns = [
        "state_id", "county_id", "tract_id", "bg_id",
        "fips", "fips_short", "bg_nr", "type", "code1", "code2"
    ]
    spatial_df = Adapter.toDf(spatial_rdd, spatial_columns, self.spark)
    spatial_df.show()

    assert spatial_df.columns == ["geometry", *spatial_columns]
    assert spatial_df.count() == 1001

def test_spatial_join_query_and_build_index_on_points_on_the_fly(self):
    query_window = PolygonRDD(
        self.sc,
        polygon_rdd_input_location,
        polygon_rdd_start_offset,
        polygon_rdd_end_offset,
        polygon_rdd_splitter,
        True
    )
    object_rdd = PointRDD(
        sparkContext=self.sc,
        InputLocation=point_rdd_input_location,
        Offset=point_rdd_offset,
        splitter=point_rdd_splitter,
        carryInputData=False
    )
    object_rdd.analyze()

    object_rdd.spatialPartitioning(join_query_partitioning_type)
    query_window.spatialPartitioning(object_rdd.getPartitioner())

    for i in range(each_query_loop_times):
        result_size = JoinQuery.SpatialJoinQuery(
            object_rdd, query_window, True, False).count()

def test_polygon_rdd(self):
    polygon_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=polygon_rdd_input_location,
        startOffset=polygon_rdd_start_offset,
        endOffset=polygon_rdd_end_offset,
        splitter=polygon_rdd_splitter,
        carryInputData=True
    )
    collected_polygon_rdd = polygon_rdd.getRawSpatialRDD().collect()

    input_wkt_polygons = [
        "POLYGON ((-74.020753 40.836454, -74.020753 40.843768, -74.018162 40.843768, -74.018162 40.836454, -74.020753 40.836454))",
        "POLYGON ((-74.018978 40.837712, -74.018978 40.852181, -74.014938 40.852181, -74.014938 40.837712, -74.018978 40.837712))",
        "POLYGON ((-74.021683 40.833253, -74.021683 40.834288, -74.021368 40.834288, -74.021368 40.833253, -74.021683 40.833253))"
    ]
    assert [geo_data.geom.wkt
            for geo_data in collected_polygon_rdd][:3] == input_wkt_polygons