def test_spatial_join_query_with_polygon_rdd_using_index(self):
    """Join an indexed PointRDD with polygon query windows and verify
    that user data survives the join on matched geometries.

    Both RDDs are loaded with a CRS transform (epsg:4326 -> epsg:3005);
    the point side is partitioned and R-tree indexed, and the query side
    adopts the same partitioner so the join can run partition-local.
    """
    query_rdd = PolygonRDD(
        self.sc, input_location_query_polygon, splitter, True, num_partitions,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005"
    )
    spatial_rdd = PointRDD(
        self.sc, input_location, offset, splitter, True, num_partitions,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:3005"
    )
    query_rdd.analyze()
    spatial_rdd.analyze()

    spatial_rdd.spatialPartitioning(grid_type)
    spatial_rdd.buildIndex(IndexType.RTREE, True)
    # Query side must share the data side's partitioner for the join.
    query_rdd.spatialPartitioning(spatial_rdd.getPartitioner())

    result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False, True).collect()

    assert result[1][0].getUserData() is not None
    # Each result entry is (window geometry, matched geometries); empty
    # match sets are fine, but every matched geometry must carry its data.
    # (Original guarded with `data[1].__len__() != 0`, which is redundant:
    # iterating an empty collection is already a no-op.)
    for _window, matches in result:
        for matched_geom in matches:
            assert matched_geom.getUserData() is not None
def test_build_index_without_set_grid(self):
    """Building an R-tree index on the raw RDD must work even when no
    spatial partitioning grid has been applied first."""
    spatial_rdd = PolygonRDD(
        self.sc,
        input_location,
        FileDataSplitter.CSV,
        carryInputData=True,
        partitions=num_partitions,
        newLevel=StorageLevel.MEMORY_ONLY,
    )
    spatial_rdd.analyze()
    # buildOnSpatialPartitionedRDD=False: index the raw (unpartitioned) RDD.
    spatial_rdd.buildIndex(IndexType.RTREE, False)
def test_creating_polygon_rdd(self):
    """Load county polygons from a WKT file (geometry in columns 2-3) and
    check the de-duplicated record count."""
    polygon_rdd = PolygonRDD(self.spark._sc, counties_path, 2, 3, FileDataSplitter.WKT, True)
    polygon_rdd.analyze()
    count = polygon_rdd.countWithoutDuplicates()
    assert count == 407, f"Polygon RDD should have 407 but found {count}"
def test_geojson_to_dataframe(self):
    """Convert a GeoJSON-backed PolygonRDD to a DataFrame and check that
    the first attribute column is STATEFP.

    The original built the DataFrame twice (once just to ``.show()`` it);
    convert once and reuse the result.
    """
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext, geojson_input_location, FileDataSplitter.GEOJSON, True
    )
    spatial_rdd.analyze()
    df = Adapter.toDf(spatial_rdd, self.spark)
    df.show()
    assert df.columns[1] == "STATEFP"
def test_geojson_to_dataframe(self):
    """GeoJSON RDD -> DataFrame, re-parsing the geometry column with
    ST_GeomFromWKT, then check the first attribute column name."""
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext, geojson_input_location, FileDataSplitter.GEOJSON, True
    )
    spatial_rdd.analyze()
    df = Adapter.toDf(spatial_rdd, self.spark).withColumn(
        "geometry", expr("ST_GeomFromWKT(geometry)")
    )
    df.show()
    assert df.columns[1] == "STATEFP"
def test_wkb_constructor(self):
    """Load polygons from WKB and verify count, boundary envelope, and the
    carried user data of the first record."""
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location_wkb,
        splitter=FileDataSplitter.WKB,
        carryInputData=True,
        newLevel=StorageLevel.MEMORY_ONLY,
    )
    spatial_rdd.analyze()

    assert spatial_rdd.approximateTotalCount == 103
    assert spatial_rdd.boundaryEnvelope is not None

    first_record = spatial_rdd.rawSpatialRDD.take(1)[0]
    assert first_record.getUserData() == "31\t039\t00835841\t31039\tCuming\tCuming County\t06\tH1\tG4020\t\t\t\tA\t1477895811\t10447360\t+41.9158651\t-096.7885168"
def test_load_id_column_data_check(self):
    """GeoJSON input with an id field should yield 3 or 4 columns and
    exactly one row.

    The original accepted either column count via
    ``try: assert len == 3 / except AssertionError: assert len == 4`` —
    an anti-pattern (exceptions as control flow) replaced here by a single
    membership assertion. Which count appears presumably depends on the
    Sedona/Adapter version — TODO confirm.
    """
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext, geojson_id_input_location, FileDataSplitter.GEOJSON, True
    )
    spatial_rdd.analyze()
    df = Adapter.toDf(spatial_rdd, self.spark)
    df.show()
    assert len(df.columns) in (3, 4)
    assert df.count() == 1
def test_voronoi_spatial_partitioning(self):
    """Voronoi partitioning must succeed and produce printable grid
    envelopes (smoke test — no value assertions)."""
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=FileDataSplitter.CSV,
        carryInputData=True,
        partitions=10,
        newLevel=StorageLevel.MEMORY_ONLY,
    )
    spatial_rdd.analyze()
    spatial_rdd.spatialPartitioning(GridType.VORONOI)
    for grid_envelope in spatial_rdd.grids:
        print(grid_envelope)
def test_hilbert_curve_spatial_partitioning(self):
    """Hilbert-curve partitioning must succeed and produce printable grid
    envelopes (smoke test — no value assertions)."""
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=splitter,
        carryInputData=True,
        partitions=10,
        newLevel=StorageLevel.MEMORY_ONLY,
    )
    spatial_rdd.analyze()
    spatial_rdd.spatialPartitioning(GridType.HILBERT)
    for grid_envelope in spatial_rdd.grids:
        print(grid_envelope)
def test_empty_constructor(self):
    """A PolygonRDD built with no arguments can adopt another RDD's raw
    JVM spatial RDD and then be analyzed."""
    source_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location,
        splitter=splitter,
        carryInputData=True,
        partitions=num_partitions,
        newLevel=StorageLevel.MEMORY_ONLY,
    )
    source_rdd.analyze()
    source_rdd.spatialPartitioning(grid_type)
    source_rdd.buildIndex(IndexType.RTREE, True)

    # Empty construction, then wire in the raw JVM RDD by assignment.
    copied_rdd = PolygonRDD()
    copied_rdd.rawJvmSpatialRDD = source_rdd.rawJvmSpatialRDD
    copied_rdd.analyze()
def test_geojson_constructor(self):
    """Load polygons from GeoJSON and verify count, envelope, carried user
    data of the first two records, and the parsed field names."""
    spatial_rdd = PolygonRDD(
        sparkContext=self.sc,
        InputLocation=input_location_geo_json,
        splitter=FileDataSplitter.GEOJSON,
        carryInputData=True,
        partitions=4,
        newLevel=StorageLevel.MEMORY_ONLY,
    )
    spatial_rdd.analyze()

    assert spatial_rdd.approximateTotalCount == 1001
    assert spatial_rdd.boundaryEnvelope is not None

    first_two = spatial_rdd.rawSpatialRDD.take(2)
    assert first_two[0].getUserData() == "01\t077\t011501\t5\t1500000US010770115015\t010770115015\t5\tBG\t6844991\t32636"
    assert first_two[1].getUserData() == "01\t045\t021102\t4\t1500000US010450211024\t010450211024\t4\tBG\t11360854\t0"

    expected_fields = ["STATEFP", "COUNTYFP", "TRACTCE", "BLKGRPCE", "AFFGEOID",
                       "GEOID", "NAME", "LSAD", "ALAND", "AWATER"]
    assert spatial_rdd.fieldNames == expected_fields
def test_to_df_srdd_fn_spark(self):
    """Adapter.toDf with explicit column names must prepend a geometry
    column and keep every row."""
    spatial_rdd = PolygonRDD(
        self.spark.sparkContext, geojson_input_location, FileDataSplitter.GEOJSON, True
    )
    spatial_rdd.analyze()
    assert spatial_rdd.approximateTotalCount == 1001

    spatial_columns = ["state_id", "county_id", "tract_id", "bg_id", "fips",
                       "fips_short", "bg_nr", "type", "code1", "code2"]
    spatial_df = Adapter.toDf(spatial_rdd, spatial_columns, self.spark)
    spatial_df.show()

    assert spatial_df.columns == ["geometry"] + spatial_columns
    assert spatial_df.count() == 1001
def test_constructor(self):
    """Exercise every PolygonRDD constructor overload.

    Walks through keyword/positional file-based constructors, raw-JVM-RDD
    constructors (with and without CRS transform / storage level), and
    offset-based constructors, asserting counts or comparing boundaries
    after each. Statement order matters: later constructions reuse
    ``spatial_rdd_core.rawJvmSpatialRDD`` from earlier ones.
    """
    # --- file-based constructor: full keyword form ---
    spatial_rdd_core = PolygonRDD(
        sparkContext=self.sc, InputLocation=input_location, splitter=splitter,
        carryInputData=True, partitions=num_partitions,
        newLevel=StorageLevel.MEMORY_ONLY)
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

    # --- same constructor, positional form ---
    spatial_rdd_core = PolygonRDD(
        self.sc, input_location, splitter, True, num_partitions,
        StorageLevel.MEMORY_ONLY)
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

    # --- wrap an existing raw JVM spatial RDD (no transform) ---
    spatial_rdd = PolygonRDD(rawSpatialRDD=spatial_rdd_core.rawJvmSpatialRDD)
    self.compare_spatial_rdd(spatial_rdd, input_boundary)

    # --- raw JVM RDD with CRS transform (positional) ---
    spatial_rdd = PolygonRDD(spatial_rdd_core.rawJvmSpatialRDD, "epsg:4326", "epsg:5070")
    self.compare_spatial_rdd(spatial_rdd, query_envelope)
    assert spatial_rdd.getSourceEpsgCode() == "epsg:4326"
    assert spatial_rdd.getTargetEpsgCode() == "epsg:5070"

    # --- raw JVM RDD with CRS transform (keyword) ---
    spatial_rdd = PolygonRDD(rawSpatialRDD=spatial_rdd_core.rawJvmSpatialRDD,
                             sourceEpsgCode="epsg:4326", targetEpsgCode="epsg:5070")
    assert spatial_rdd.getSourceEpsgCode() == "epsg:4326"
    assert spatial_rdd.getTargetEpsgCode() == "epsg:5070"
    self.compare_spatial_rdd(spatial_rdd, query_envelope)

    # --- re-wrap the (already transformed) raw RDD with a storage level ---
    spatial_rdd = PolygonRDD(rawSpatialRDD=spatial_rdd.rawJvmSpatialRDD,
                             newLevel=StorageLevel.MEMORY_ONLY)
    self.compare_spatial_rdd(spatial_rdd, query_envelope)

    # --- raw JVM RDD + storage level, no transform ---
    spatial_rdd = PolygonRDD(spatial_rdd_core.rawJvmSpatialRDD, StorageLevel.MEMORY_ONLY)
    self.compare_spatial_rdd(spatial_rdd, input_boundary)

    # Empty constructor must not raise.
    # NOTE(review): the result is never used or analyzed here.
    spatial_rdd = PolygonRDD()

    # --- offset-based constructors (start/end geometry columns) ---
    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset, polygon_rdd_end_offset,
                                  polygon_rdd_splitter, True, 2)
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000

    # Same, without an explicit partition count.
    query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset, polygon_rdd_end_offset,
                                  polygon_rdd_splitter, True)
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000

    # --- file-based constructors without a storage level ---
    spatial_rdd_core = PolygonRDD(
        self.sc, input_location, splitter, True, num_partitions
    )
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)
    spatial_rdd_core = PolygonRDD(
        self.sc, input_location, splitter, True
    )
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

    # --- offset-based constructors with a storage level ---
    query_window_rdd = PolygonRDD(
        self.sc, polygon_rdd_input_location, polygon_rdd_start_offset,
        polygon_rdd_end_offset, polygon_rdd_splitter, True, 5,
        StorageLevel.MEMORY_ONLY
    )
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000
    query_window_rdd = PolygonRDD(
        self.sc, polygon_rdd_input_location, polygon_rdd_start_offset,
        polygon_rdd_end_offset, polygon_rdd_splitter, True,
        StorageLevel.MEMORY_ONLY
    )
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000

    # --- file-based constructors with a storage level ---
    spatial_rdd_core = PolygonRDD(
        self.sc, input_location, splitter, True, 5, StorageLevel.MEMORY_ONLY
    )
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)
    spatial_rdd_core = PolygonRDD(
        self.sc, input_location, splitter, True, StorageLevel.MEMORY_ONLY
    )
    self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

    # --- raw JVM RDD + storage level + CRS transform ---
    spatial_rdd = PolygonRDD(
        spatial_rdd_core.rawJvmSpatialRDD, StorageLevel.MEMORY_ONLY,
        "epsg:4326", "epsg:5070"
    )
    self.compare_spatial_rdd(spatial_rdd, query_envelope)

    # --- offset-based constructors with storage level + CRS transform ---
    query_window_rdd = PolygonRDD(
        self.sc, polygon_rdd_input_location, polygon_rdd_start_offset,
        polygon_rdd_end_offset, polygon_rdd_splitter, True, 5,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:5070"
    )
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000
    query_window_rdd = PolygonRDD(
        self.sc, polygon_rdd_input_location, polygon_rdd_start_offset,
        polygon_rdd_end_offset, polygon_rdd_splitter, True,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:5070"
    )
    assert query_window_rdd.analyze()
    assert query_window_rdd.approximateTotalCount == 3000

    # --- file-based constructors with storage level + CRS transform ---
    spatial_rdd_core = PolygonRDD(
        self.sc, input_location, splitter, True, 5,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:5070"
    )
    self.compare_spatial_rdd(spatial_rdd_core, query_envelope)
    # NOTE(review): this positional variant's result is immediately
    # overwritten below without being compared — construction only.
    spatial_rdd_core = PolygonRDD(
        self.sc, input_location, splitter, True,
        StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:5070"
    )
    # Keyword form (note the `sourceEpsgCRSCode` parameter name here).
    spatial_rdd_core = PolygonRDD(
        sparkContext=self.sc, InputLocation=input_location, splitter=splitter,
        carryInputData=True, newLevel=StorageLevel.MEMORY_ONLY,
        sourceEpsgCRSCode="epsg:4326", targetEpsgCode="epsg:5070"
    )
    self.compare_spatial_rdd(spatial_rdd_core, query_envelope)