Example #1
0
 def test_load_id_column_data_check(self):
     """Convert the GeoJSON-with-id polygon RDD to a DataFrame and check its shape.

     The resulting DataFrame must have exactly four columns and one row.
     """
     polygons = PolygonRDD(
         self.spark.sparkContext,
         geojson_id_input_location,
         FileDataSplitter.GEOJSON,
         True,
     )
     polygons.analyze()

     frame = Adapter.toDf(polygons, self.spark)
     frame.show()

     assert len(frame.columns) == 4
     assert frame.count() == 1
Example #2
0
    def test_spatial_knn_correctness(self):
        """Check that the KNN query gives the same result with and without an index.

        Both result lists are ordered with ``distance_sorting_functions``
        relative to ``self.query_point``; the summed distance between the
        same-ranked geometries of the two lists must be zero.
        """
        def by_distance(geo_data):
            return distance_sorting_functions(geo_data, self.query_point)

        polygons = PolygonRDD(self.sc, input_location, splitter, True)

        # Query once before the R-tree index exists, then build it and query again.
        plain_hits = KNNQuery.SpatialKnnQuery(
            polygons, self.query_point, self.top_k, False)
        polygons.buildIndex(IndexType.RTREE, False)
        indexed_hits = KNNQuery.SpatialKnnQuery(
            polygons, self.query_point, self.top_k, True)

        plain_sorted = sorted(plain_hits, key=by_distance)
        indexed_sorted = sorted(indexed_hits, key=by_distance)

        # Index by rank (rather than zip) so a short result list still raises.
        total_gap = sum(
            plain_sorted[rank].geom.distance(indexed_sorted[rank].geom)
            for rank in range(self.top_k)
        )

        assert total_gap == 0
Example #3
0
 def test_spatial_knn_query_using_index(self):
     """Run the indexed KNN query ``self.loop_times`` times and check each result.

     Every run must return at least one element, and the first element must
     carry user data.
     """
     polygon_rdd = PolygonRDD(self.sc, input_location, splitter, True)
     polygon_rdd.buildIndex(IndexType.RTREE, False)
     for _ in range(self.loop_times):
         result = KNNQuery.SpatialKnnQuery(polygon_rdd, self.query_point,
                                           self.top_k, True)
         # A length is never negative, so compare against 0 to make the
         # check meaningful before indexing into the result.
         assert len(result) > 0
         assert result[0].getUserData() is not None
Example #4
0
    def test_creating_polygon_rdd(self):
        """Load the counties WKT input as a PolygonRDD and check its de-duplicated count."""
        counties = PolygonRDD(
            self.spark._sc,
            counties_path,
            2,
            3,
            FileDataSplitter.WKT,
            True,
        )
        counties.analyze()

        cnt = counties.countWithoutDuplicates()
        assert cnt == 407, f"Polygon RDD should have 407 but found {cnt}"
Example #5
0
 def test_wkb_constructor(self):
     """Build a PolygonRDD from the WKB input and verify count, envelope and user data."""
     expected_user_data = "31\t039\t00835841\t31039\tCuming\tCuming County\t06\tH1\tG4020\t\t\t\tA\t1477895811\t10447360\t+41.9158651\t-096.7885168"

     wkb_rdd = PolygonRDD(
         sparkContext=self.sc,
         InputLocation=input_location_wkb,
         splitter=FileDataSplitter.WKB,
         carryInputData=True,
         newLevel=StorageLevel.MEMORY_ONLY,
     )
     wkb_rdd.analyze()

     assert wkb_rdd.approximateTotalCount == 103
     assert wkb_rdd.boundaryEnvelope is not None

     first_record = wkb_rdd.rawSpatialRDD.take(1)[0]
     assert first_record.getUserData() == expected_user_data
Example #6
0
    def test_geojson_to_dataframe(self):
        """Convert the GeoJSON polygon RDD to a DataFrame and check the second column name.

        The ``geometry`` column is replaced by ``ST_GeomFromWKT(geometry)``
        before the DataFrame is shown and inspected.
        """
        geojson_rdd = PolygonRDD(
            self.spark.sparkContext,
            geojson_input_location,
            FileDataSplitter.GEOJSON,
            True,
        )
        geojson_rdd.analyze()

        raw_df = Adapter.toDf(geojson_rdd, self.spark)
        df = raw_df.withColumn("geometry", expr("ST_GeomFromWKT(geometry)"))
        df.show()

        assert df.columns[1] == "STATEFP"
Example #7
0
    def test_spatial_range_query_using_index(self):
        """Run the range query ``self.loop_times`` times after building an R-tree index.

        Every run must count 704 results, and the first of ten sampled
        results must carry user data.
        """
        polygons = PolygonRDD(
            self.sc, input_location, splitter, True, StorageLevel.MEMORY_ONLY)
        polygons.buildIndex(IndexType.RTREE, False)

        # NOTE(review): the final argument is False on every call even though
        # an index was just built -- confirm whether it is the "use index" flag.
        for _ in range(self.loop_times):
            matches = RangeQuery.SpatialRangeQuery(
                polygons, self.query_envelope, False, False)
            assert matches.count() == 704

        sample = RangeQuery.SpatialRangeQuery(
            polygons, self.query_envelope, False, False).take(10)
        assert sample[0].getUserData() is not None
Example #8
0
    def test_spatial_join_query(self):
        """Join WKT points against the county polygons and print the result count.

        The points are analyzed and KDB-tree partitioned; the polygons reuse
        the points' partitioner. No assertion is made on the count.
        """
        points = PointRDD(
            self.sc, point_path, 4, FileDataSplitter.WKT, True)
        counties = PolygonRDD(
            self.sc, counties_path, 2, 3, FileDataSplitter.WKT, True)

        points.analyze()
        points.spatialPartitioning(GridType.KDBTREE)
        counties.spatialPartitioning(points.getPartitioner())

        joined = JoinQuery.SpatialJoinQuery(points, counties, True, False)
        print(joined.count())
Example #9
0
    def readToPolygonRDD(cls, sc: SparkContext, inputPath: str) -> PolygonRDD:
        """Read ``inputPath`` through the JVM ``ShapefileReader`` into a PolygonRDD.

        :param sc: Spark context whose ``_jvm`` gateway and ``_jsc`` Java
            context are handed to the JVM reader.
        :param inputPath: location passed unchanged to the JVM
            ``ShapefileReader.readToPolygonRDD``.
        :return: a new ``PolygonRDD`` wrapping the JVM spatial RDD.
        """
        ShapefileReader.validate_imports()
        jvm_rdd = sc._jvm.ShapefileReader.readToPolygonRDD(sc._jsc, inputPath)
        polygon_rdd = PolygonRDD()
        polygon_rdd.set_srdd(jvm_rdd)
        return polygon_rdd
Example #10
0
    def test_mbr(self):
        """Compute the minimum bounding rectangles of the CSV polygons and check the result.

        The rectangles are collected, their WKT is printed, and the collected
        list must contain at least one element.
        """
        polygon_rdd = PolygonRDD(sparkContext=self.sc,
                                 InputLocation=input_location,
                                 splitter=FileDataSplitter.CSV,
                                 carryInputData=True,
                                 partitions=num_partitions)

        rectangle_rdd = polygon_rdd.MinimumBoundingRectangle()

        result = rectangle_rdd.rawSpatialRDD.collect()

        for el in result:
            print(el.geom.wkt)
        print(result)
        # A length is never negative, so require at least one rectangle;
        # otherwise this assertion could not fail.
        assert len(result) > 0
    def test_overlapped_polygon_join_correctness(self):
        """Join the overlapped polygon set against the window polygons and verify the result.

        The join is run twice, with the third argument True and then False
        (the second run is the "no index" case), and each collected result is
        passed to ``self.verify_join_result``.
        """
        windows = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY)
        overlapped = PolygonRDD(
            self.sc.parallelize(self.test_overlapped_polygon_set),
            StorageLevel.MEMORY_ONLY)
        self.prepare_rdd(overlapped, windows, GridType.QUADTREE)

        for use_index in (True, False):
            pairs = JoinQuery.SpatialJoinQuery(
                overlapped, windows, use_index, True).collect()
            self.verify_join_result(pairs)
    def test_outside_polygon_join_correctness(self):
        """Join the "outside" polygon set against the window polygons and expect no results.

        The join is run twice, with the third argument True and then False
        (the second run is the "no index" case); both results must be empty.
        """
        windows = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY)
        outside = PolygonRDD(
            self.sc.parallelize(self.test_outside_polygon_set),
            StorageLevel.MEMORY_ONLY)
        self.prepare_rdd(outside, windows, GridType.QUADTREE)

        for use_index in (True, False):
            pairs = JoinQuery.SpatialJoinQuery(
                outside, windows, use_index, False).collect()
            assert len(pairs) == 0
    def test_outside_polygon_distance_join_correctness(self):
        """Distance-join the "outside" polygon set against circles and expect no results.

        The window is a ``CircleRDD`` built from the window polygons with a
        second argument of 0.1. The join is run twice, with the third
        argument True and then False; both results must be empty.
        """
        centers = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY)
        circles = CircleRDD(centers, 0.1)
        outside = PolygonRDD(
            self.sc.parallelize(self.test_outside_polygon_set),
            StorageLevel.MEMORY_ONLY)
        self.prepare_rdd(outside, circles, GridType.QUADTREE)

        for use_index in (True, False):
            pairs = JoinQuery.DistanceJoinQuery(
                outside, circles, use_index, True).collect()
            assert len(pairs) == 0
Example #14
0
    def test_spatial_join_query_and_build_index_on_points_on_the_fly(self):
        """Run the point/polygon spatial join ``each_query_loop_times`` times.

        The points are analyzed and spatially partitioned; the query windows
        reuse the points' partitioner. The test only checks that the join and
        its ``count()`` complete; no value is asserted.
        """
        query_window = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset,
                                  polygon_rdd_end_offset, polygon_rdd_splitter,
                                  True)
        object_rdd = PointRDD(sparkContext=self.sc,
                              InputLocation=point_rdd_input_location,
                              Offset=point_rdd_offset,
                              splitter=point_rdd_splitter,
                              carryInputData=False)
        object_rdd.analyze()
        object_rdd.spatialPartitioning(join_query_partitionin_type)
        query_window.spatialPartitioning(object_rdd.getPartitioner())

        for _ in range(each_query_loop_times):
            # The count itself is unused, but the call is kept because it is
            # what triggers evaluation of the join.
            JoinQuery.SpatialJoinQuery(object_rdd, query_window,
                                       True, False).count()
Example #15
0
 def test_geojson_constructor(self):
     """Build a PolygonRDD from GeoJSON and verify count, envelope, user data and field names."""
     expected_first = "01\t077\t011501\t5\t1500000US010770115015\t010770115015\t5\tBG\t6844991\t32636"
     expected_second = "01\t045\t021102\t4\t1500000US010450211024\t010450211024\t4\tBG\t11360854\t0"
     expected_fields = [
         "STATEFP", "COUNTYFP", "TRACTCE", "BLKGRPCE", "AFFGEOID", "GEOID",
         "NAME", "LSAD", "ALAND", "AWATER",
     ]

     geojson_rdd = PolygonRDD(
         sparkContext=self.sc,
         InputLocation=input_location_geo_json,
         splitter=FileDataSplitter.GEOJSON,
         carryInputData=True,
         partitions=4,
         newLevel=StorageLevel.MEMORY_ONLY,
     )
     geojson_rdd.analyze()

     assert geojson_rdd.approximateTotalCount == 1001
     assert geojson_rdd.boundaryEnvelope is not None

     # Two separate take() calls, as in the original checks.
     first_record = geojson_rdd.rawSpatialRDD.take(1)[0]
     assert first_record.getUserData() == expected_first
     second_record = geojson_rdd.rawSpatialRDD.take(2)[1]
     assert second_record.getUserData() == expected_second

     assert geojson_rdd.fieldNames == expected_fields
Example #16
0
    def test_polygon_rdd(self):
        """Load the polygon input and compare the WKT of the first three collected geometries."""
        expected_wkt = [
            "POLYGON ((-74.020753 40.836454, -74.020753 40.843768, -74.018162 40.843768, -74.018162 40.836454, -74.020753 40.836454))",
            "POLYGON ((-74.018978 40.837712, -74.018978 40.852181, -74.014938 40.852181, -74.014938 40.837712, -74.018978 40.837712))",
            "POLYGON ((-74.021683 40.833253, -74.021683 40.834288, -74.021368 40.834288, -74.021368 40.833253, -74.021683 40.833253))",
        ]

        polygons = PolygonRDD(
            sparkContext=self.sc,
            InputLocation=polygon_rdd_input_location,
            startOffset=polygon_rdd_start_offset,
            endOffset=polygon_rdd_end_offset,
            splitter=polygon_rdd_splitter,
            carryInputData=True,
        )

        collected = polygons.getRawSpatialRDD().collect()
        first_three = [record.geom.wkt for record in collected[:3]]

        assert first_three == expected_wkt
Example #17
0
    def test_spatial_join_query_and_build_index_on_polygons_on_the_fly(self):
        """Run ``JoinQuery.spatialJoin`` ``each_query_loop_times`` times with explicit join params.

        The points are analyzed and spatially partitioned; the query windows
        reuse the points' partitioner. The test only checks that the join and
        its ``count()`` complete; no value is asserted.
        """
        query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                      polygon_rdd_start_offset,
                                      polygon_rdd_end_offset,
                                      polygon_rdd_splitter, True)

        object_rdd = PointRDD(sparkContext=self.sc,
                              InputLocation=point_rdd_input_location,
                              Offset=point_rdd_offset,
                              splitter=point_rdd_splitter,
                              carryInputData=False)
        object_rdd.analyze()
        object_rdd.spatialPartitioning(join_query_partitionin_type)
        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        # The parameters do not change between iterations, so build them once.
        join_params = JoinParams(False, polygon_rdd_index_type,
                                 JoinBuildSide.LEFT)

        for _ in range(each_query_loop_times):
            # The count itself is unused, but the call is kept because it is
            # what triggers evaluation of the join.
            JoinQuery.spatialJoin(query_window_rdd, object_rdd,
                                  join_params).count()
    def test_spatial_join_query_with_polygon_rdd(self):
        """Join points against query polygons and check that the results carry user data.

        Both RDDs are built with the EPSG arguments ``epsg:4326`` and
        ``epsg:3005``. The polygons are partitioned with the points' grids.
        """
        polygons = PolygonRDD(
            self.sc, input_location_query_polygon, splitter, True,
            num_partitions, StorageLevel.MEMORY_ONLY,
            "epsg:4326", "epsg:3005")
        points = PointRDD(
            self.sc, input_location, offset, splitter, True,
            num_partitions, StorageLevel.MEMORY_ONLY,
            "epsg:4326", "epsg:3005")

        points.spatialPartitioning(grid_type)
        polygons.spatialPartitioning(points.grids)

        result = JoinQuery.SpatialJoinQuery(
            points, polygons, False, True).collect()

        # First element of the second result entry.
        assert result[1][0].getUserData() is not None

        # An empty second element simply yields no iterations.
        for entry in result:
            for right_data in entry[1]:
                assert right_data.getUserData() is not None
Example #19
0
    def test_to_df_srdd_fn_spark(self):
        """Convert the GeoJSON polygon RDD to a DataFrame with explicit column names.

        The DataFrame must have ``geometry`` followed by the supplied names
        as its columns, and 1001 rows.
        """
        geojson_rdd = PolygonRDD(
            self.spark.sparkContext,
            geojson_input_location,
            FileDataSplitter.GEOJSON,
            True,
        )
        geojson_rdd.analyze()
        assert geojson_rdd.approximateTotalCount == 1001

        spatial_columns = [
            "state_id", "county_id", "tract_id", "bg_id",
            "fips", "fips_short", "bg_nr", "type", "code1", "code2",
        ]
        spatial_df = Adapter.toDf(geojson_rdd, spatial_columns, self.spark)
        spatial_df.show()

        expected_columns = ["geometry"] + spatial_columns
        assert spatial_df.columns == expected_columns
        assert spatial_df.count() == 1001
Example #20
0
 def test_build_index_without_set_grid(self):
     """Build an R-tree index on a CSV PolygonRDD without spatially partitioning it first."""
     csv_polygons = PolygonRDD(
         self.sc,
         input_location,
         FileDataSplitter.CSV,
         carryInputData=True,
         partitions=num_partitions,
         newLevel=StorageLevel.MEMORY_ONLY,
     )
     csv_polygons.analyze()
     csv_polygons.buildIndex(IndexType.RTREE, False)
    def test_inside_point_join_correctness(self):
        """Join the "inside" point set against the window polygons and verify the result.

        The join is run twice, with the third argument True and then False
        (the second run is the "no index" case), and each collected result is
        passed to ``self.verify_join_result``.
        """
        self.once_before_all()

        windows = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set))
        inside_points = PointRDD(
            self.sc.parallelize(self.test_inside_point_set))
        self.prepare_rdd(inside_points, windows, GridType.QUADTREE)

        for use_index in (True, False):
            pairs = JoinQuery.SpatialJoinQuery(
                inside_points, windows, use_index, False).collect()
            self.verify_join_result(pairs)
Example #22
0
    def test_hilbert_curve_spatial_partitioning(self):
        """Partition a PolygonRDD with ``GridType.HILBERT`` and print each resulting grid."""
        polygons = PolygonRDD(
            sparkContext=self.sc,
            InputLocation=input_location,
            splitter=splitter,
            carryInputData=True,
            partitions=10,
            newLevel=StorageLevel.MEMORY_ONLY,
        )
        polygons.analyze()
        polygons.spatialPartitioning(GridType.HILBERT)

        for grid in polygons.grids:
            print(grid)
Example #23
0
    def test_voronoi_spatial_partitioning(self):
        """Partition a CSV PolygonRDD with ``GridType.VORONOI`` and print each resulting grid."""
        polygons = PolygonRDD(
            sparkContext=self.sc,
            InputLocation=input_location,
            splitter=FileDataSplitter.CSV,
            carryInputData=True,
            partitions=10,
            newLevel=StorageLevel.MEMORY_ONLY,
        )
        polygons.analyze()
        polygons.spatialPartitioning(GridType.VORONOI)

        for grid in polygons.grids:
            print(grid)
    def test_polygon_distance_join_with_crs_transformation(self):
        """Distance-join polygons against circles built from the same input.

        Both PolygonRDDs are created with the EPSG arguments ``epsg:4326``
        and ``epsg:3857``. The join must return 5467 entries, and for each
        entry ``Circle(key_geometry, 0.1)`` must cover every matched polygon.
        """
        centers = PolygonRDD(
            self.sc, input_location_query_polygon, splitter, True,
            num_partitions, StorageLevel.MEMORY_ONLY,
            "epsg:4326", "epsg:3857")
        circles = CircleRDD(centers, 0.1)

        polygons = PolygonRDD(
            self.sc, input_location_query_polygon, splitter, True,
            num_partitions, StorageLevel.MEMORY_ONLY,
            "epsg:4326", "epsg:3857")

        # NOTE(review): the value returned by repartition() is discarded, so
        # this call presumably does not change ``polygons`` -- confirm intent.
        polygons.rawJvmSpatialRDD.jsrdd.repartition(4)
        polygons.spatialPartitioning(GridType.RTREE)
        polygons.buildIndex(IndexType.RTREE, True)
        circles.spatialPartitioning(polygons.grids)

        results = JoinQuery.DistanceJoinQuery(
            polygons, circles, True, False).collect()

        assert len(results) == 5467

        for entry in results:
            for polygon_data in entry[1]:
                assert Circle(entry[0].geom, 0.1).covers(polygon_data.geom)
Example #25
0
 def test_empty_constructor(self):
     """Fill a no-argument PolygonRDD from a prepared one and analyze it.

     The source RDD is analyzed, spatially partitioned and R-tree indexed
     before being assigned to the empty instance.
     """
     source_rdd = PolygonRDD(
         sparkContext=self.sc,
         InputLocation=input_location,
         splitter=splitter,
         carryInputData=True,
         partitions=num_partitions,
         newLevel=StorageLevel.MEMORY_ONLY,
     )
     source_rdd.analyze()
     source_rdd.spatialPartitioning(grid_type)
     source_rdd.buildIndex(IndexType.RTREE, True)

     empty_rdd = PolygonRDD()
     # NOTE(review): the whole PolygonRDD (not its raw RDD) is assigned to
     # ``rawSpatialRDD`` here -- confirm this is what the setter expects.
     empty_rdd.rawSpatialRDD = source_rdd
     empty_rdd.analyze()
Example #26
0
 def create_polygon_rdd(self, location, splitter, num_partitions):
     """Load polygons from ``location`` and re-wrap their raw JVM RDD.

     :param location: input location passed to ``PolygonRDD``.
     :param splitter: splitter passed to ``PolygonRDD``.
     :param num_partitions: partition argument passed to ``PolygonRDD``.
     :return: a ``PolygonRDD`` built from the loaded RDD's
         ``rawJvmSpatialRDD`` with ``StorageLevel.MEMORY_ONLY``.
     """
     loaded = PolygonRDD(self.sc, location, splitter, True, num_partitions)
     raw_jvm_rdd = loaded.rawJvmSpatialRDD
     return PolygonRDD(raw_jvm_rdd, StorageLevel.MEMORY_ONLY)
Example #27
0
    def test_constructor(self):
        """Exercise the PolygonRDD constructor forms used by this suite.

        Each constructed RDD is either compared against ``input_boundary`` or
        ``query_envelope`` through ``self.compare_spatial_rdd``, or analyzed
        and checked for an ``approximateTotalCount`` of 3000. The exact
        positional/keyword form of every call is the thing under test, so the
        calls must not be reordered or rewritten.
        """
        # File input, every argument passed by keyword.
        spatial_rdd_core = PolygonRDD(sparkContext=self.sc,
                                      InputLocation=input_location,
                                      splitter=splitter,
                                      carryInputData=True,
                                      partitions=num_partitions,
                                      newLevel=StorageLevel.MEMORY_ONLY)
        self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

        # Same arguments, passed positionally.
        spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True,
                                      num_partitions, StorageLevel.MEMORY_ONLY)

        self.compare_spatial_rdd(spatial_rdd_core, input_boundary)
        # Construction from an existing raw JVM spatial RDD.
        spatial_rdd = PolygonRDD(
            rawSpatialRDD=spatial_rdd_core.rawJvmSpatialRDD)
        self.compare_spatial_rdd(spatial_rdd, input_boundary)
        # Raw JVM RDD plus two EPSG codes, positional then keyword form.
        spatial_rdd = PolygonRDD(spatial_rdd_core.rawJvmSpatialRDD,
                                 "epsg:4326", "epsg:5070")
        self.compare_spatial_rdd(spatial_rdd, query_envelope)
        assert spatial_rdd.getSourceEpsgCode() == "epsg:4326"
        assert spatial_rdd.getTargetEpsgCode() == "epsg:5070"
        spatial_rdd = PolygonRDD(
            rawSpatialRDD=spatial_rdd_core.rawJvmSpatialRDD,
            sourceEpsgCode="epsg:4326",
            targetEpsgCode="epsg:5070")
        assert spatial_rdd.getSourceEpsgCode() == "epsg:4326"
        assert spatial_rdd.getTargetEpsgCode() == "epsg:5070"
        self.compare_spatial_rdd(spatial_rdd, query_envelope)
        # Raw JVM RDD plus storage level, keyword then positional form.
        spatial_rdd = PolygonRDD(rawSpatialRDD=spatial_rdd.rawJvmSpatialRDD,
                                 newLevel=StorageLevel.MEMORY_ONLY)
        self.compare_spatial_rdd(spatial_rdd, query_envelope)
        spatial_rdd = PolygonRDD(spatial_rdd_core.rawJvmSpatialRDD,
                                 StorageLevel.MEMORY_ONLY)
        self.compare_spatial_rdd(spatial_rdd, input_boundary)
        # No-argument constructor; the instance is not used afterwards.
        spatial_rdd = PolygonRDD()

        # Offset-based file input with a trailing 2
        # (presumably the partition count -- confirm).
        query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                      polygon_rdd_start_offset,
                                      polygon_rdd_end_offset,
                                      polygon_rdd_splitter, True, 2)
        # analyze() must return a truthy value.
        assert query_window_rdd.analyze()
        assert query_window_rdd.approximateTotalCount == 3000

        # Offset-based file input without the trailing argument.
        query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                      polygon_rdd_start_offset,
                                      polygon_rdd_end_offset,
                                      polygon_rdd_splitter, True)
        assert query_window_rdd.analyze()
        assert query_window_rdd.approximateTotalCount == 3000

        # File input without offsets, with and without ``num_partitions``.
        spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True,
                                      num_partitions)

        self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

        spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True)

        self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

        # Offset-based file input with 5 and a storage level.
        query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                      polygon_rdd_start_offset,
                                      polygon_rdd_end_offset,
                                      polygon_rdd_splitter, True, 5,
                                      StorageLevel.MEMORY_ONLY)

        assert query_window_rdd.analyze()
        assert query_window_rdd.approximateTotalCount == 3000

        # Offset-based file input with a storage level only.
        query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                      polygon_rdd_start_offset,
                                      polygon_rdd_end_offset,
                                      polygon_rdd_splitter, True,
                                      StorageLevel.MEMORY_ONLY)

        assert query_window_rdd.analyze()
        assert query_window_rdd.approximateTotalCount == 3000

        # File input without offsets, storage level with and without 5.
        spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True,
                                      5, StorageLevel.MEMORY_ONLY)

        self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

        spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True,
                                      StorageLevel.MEMORY_ONLY)

        self.compare_spatial_rdd(spatial_rdd_core, input_boundary)

        # Raw JVM RDD with storage level and two EPSG codes.
        spatial_rdd = PolygonRDD(spatial_rdd_core.rawJvmSpatialRDD,
                                 StorageLevel.MEMORY_ONLY, "epsg:4326",
                                 "epsg:5070")
        self.compare_spatial_rdd(spatial_rdd, query_envelope)

        # Offset-based file input with 5, storage level and two EPSG codes.
        query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                      polygon_rdd_start_offset,
                                      polygon_rdd_end_offset,
                                      polygon_rdd_splitter, True, 5,
                                      StorageLevel.MEMORY_ONLY, "epsg:4326",
                                      "epsg:5070")

        assert query_window_rdd.analyze()
        assert query_window_rdd.approximateTotalCount == 3000

        # Same, without the 5.
        query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                      polygon_rdd_start_offset,
                                      polygon_rdd_end_offset,
                                      polygon_rdd_splitter, True,
                                      StorageLevel.MEMORY_ONLY, "epsg:4326",
                                      "epsg:5070")

        assert query_window_rdd.analyze()
        assert query_window_rdd.approximateTotalCount == 3000

        # File input without offsets, with 5, storage level and two EPSG codes.
        spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True,
                                      5, StorageLevel.MEMORY_ONLY, "epsg:4326",
                                      "epsg:5070")

        self.compare_spatial_rdd(spatial_rdd_core, query_envelope)
        # This instance is only constructed; it is rebound below before any
        # comparison is made.
        spatial_rdd_core = PolygonRDD(self.sc, input_location, splitter, True,
                                      StorageLevel.MEMORY_ONLY, "epsg:4326",
                                      "epsg:5070")

        # NOTE(review): this keyword is spelled ``sourceEpsgCRSCode`` while the
        # rawSpatialRDD form above uses ``sourceEpsgCode`` -- confirm both
        # match the constructor signatures.
        spatial_rdd_core = PolygonRDD(sparkContext=self.sc,
                                      InputLocation=input_location,
                                      splitter=splitter,
                                      carryInputData=True,
                                      newLevel=StorageLevel.MEMORY_ONLY,
                                      sourceEpsgCRSCode="epsg:4326",
                                      targetEpsgCode="epsg:5070")

        self.compare_spatial_rdd(spatial_rdd_core, query_envelope)
Example #28
0
 def getCenterPolygonAsSpatialRDD(self) -> 'PolygonRDD':
     """Wrap the result of the JVM object's ``getCenterPolygonAsSpatialRDD`` in a PolygonRDD.

     :return: a new ``PolygonRDD`` whose JVM RDD is taken from ``self._srdd``.
     """
     # Imported inside the method rather than at module level
     # (presumably to avoid a circular import -- confirm).
     from geo_pyspark.core.SpatialRDD import PolygonRDD

     jvm_center_rdd = self._srdd.getCenterPolygonAsSpatialRDD()
     center_polygons = PolygonRDD()
     center_polygons.set_srdd(jvm_center_rdd)
     return center_polygons