Exemple #1
0
    def test_overlapped_linestring_join_correctness(self):
        """Join overlapped linestrings with the polygon windows and verify the result.

        The spatial join is executed twice, with the third positional flag set
        to ``True`` and then ``False`` (the "no index" run); each collected
        result is handed to ``verify_join_result``.
        """
        windows = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY)
        linestrings = LineStringRDD(
            self.sc.parallelize(self.test_overlapped_linestring_set),
            StorageLevel.MEMORY_ONLY)
        self.prepare_rdd(linestrings, windows, GridType.QUADTREE)

        for use_index in (True, False):
            joined = JoinQuery.SpatialJoinQuery(
                linestrings, windows, use_index, True).collect()
            self.verify_join_result(joined)
Exemple #2
0
    def test_inside_point_join_correctness(self):
        """Join the inside-point fixture with the polygon windows and verify the result.

        Runs ``once_before_all`` first, then executes the spatial join with the
        third positional flag ``True`` and again with ``False`` (the
        "no index" run), verifying each collected result.
        """
        self.once_before_all()

        windows = PolygonRDD(self.sc.parallelize(self.test_polygon_window_set))
        points = PointRDD(self.sc.parallelize(self.test_inside_point_set))
        self.prepare_rdd(points, windows, GridType.QUADTREE)

        for use_index in (True, False):
            joined = JoinQuery.SpatialJoinQuery(
                points, windows, use_index, False).collect()
            self.verify_join_result(joined)
Exemple #3
0
    def test_outside_point_join_correctness(self):
        """Check that the outside-point fixture produces an empty spatial join.

        Runs ``once_before_all`` first. The join is executed with the third
        positional flag ``True`` and again with ``False`` (the "no index"
        run); both collected results must be empty.
        """
        self.once_before_all()
        window_rdd = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY)
        object_rdd = PointRDD(self.sc.parallelize(self.test_outside_point_set),
                              StorageLevel.MEMORY_ONLY)
        self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

        result = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd, True,
                                            False).collect()
        # Use the len() builtin rather than calling the dunder directly.
        assert len(result) == 0

        result_no_index = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd,
                                                     False, False).collect()
        assert len(result_no_index) == 0
Exemple #4
0
    def test_outside_polygon_distance_join_correctness(self):
        """Check that the outside-polygon fixture produces an empty distance join.

        The query windows are circles of radius 0.1 built around the polygon
        window fixture. The distance join is executed with the third
        positional flag ``True`` and again with ``False`` (the "no index"
        run); both collected results must be empty.
        """
        center_geometry_rdd = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY)
        window_rdd = CircleRDD(center_geometry_rdd, 0.1)
        object_rdd = PolygonRDD(
            self.sc.parallelize(self.test_outside_polygon_set),
            StorageLevel.MEMORY_ONLY)
        self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

        result = JoinQuery.DistanceJoinQuery(object_rdd, window_rdd, True,
                                             True).collect()
        # Use the len() builtin rather than calling the dunder directly.
        assert len(result) == 0

        result_no_index = JoinQuery.DistanceJoinQuery(object_rdd, window_rdd,
                                                      False, True).collect()
        assert len(result_no_index) == 0
Exemple #5
0
    def test_indexed_rdd_assignment(self):
        """Run the indexed distance join repeatedly, with and without persisted inputs.

        Round one persists the indexed point RDD and the partitioned circle
        RDD and materialises both with ``count()`` before querying. Round two
        rebuilds both RDDs from scratch and queries them without persisting.
        Wall-clock timestamps are taken around the loops but nothing is
        asserted.
        """
        import time

        def build_inputs():
            # Points are analysed, partitioned and indexed; the circles are
            # partitioned with the points' partitioner.
            points = PointRDD(self.sc, point_rdd_input_location,
                              point_rdd_offset, point_rdd_splitter, True)
            circles = CircleRDD(points, 0.1)
            points.analyze()
            points.spatialPartitioning(GridType.QUADTREE)
            points.buildIndex(IndexType.QUADTREE, True)
            circles.spatialPartitioning(points.getPartitioner())
            points.buildIndex(IndexType.RTREE, False)
            return points, circles

        def run_queries(points, circles):
            for _ in range(each_query_loop_times):
                result_size = JoinQuery.DistanceJoinQuery(
                    points, circles, True, True).count()

        # Round one: persisted and pre-materialised inputs.
        persisted_points, persisted_circles = build_inputs()
        persisted_points.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
        persisted_circles.jvmSpatialPartitionedRDD.persist(
            StorageLevel.MEMORY_ONLY)
        persisted_circles.jvmSpatialPartitionedRDD.count()
        persisted_points.indexedRDD.count()

        began = time.time()
        run_queries(persisted_points, persisted_circles)
        elapsed_persisted = time.time() - began

        # Round two: freshly built inputs, nothing persisted.
        fresh_points, fresh_circles = build_inputs()

        began_fresh = time.time()
        run_queries(fresh_points, fresh_circles)
    def test_nested_loop(self, num_partitions, use_legacy_apis, grid_type, intersects):
        """Join two polygon RDDs with the third join flag off and check the match count.

        The collected result is sanity-checked and its count compared with
        ``get_expected_count(intersects)``.
        """
        window_polygons = self.create_polygon_rdd(
            query_polygon_set, splitter, num_partitions)
        data_polygons = self.create_polygon_rdd(
            input_location, splitter, num_partitions)
        self.partition_rdds(
            window_polygons, data_polygons, grid_type, use_legacy_apis)

        joined = JoinQuery.SpatialJoinQuery(
            data_polygons, window_polygons, False, intersects
        ).collect()
        self.sanity_check_join_results(joined)

        expected = self.get_expected_count(intersects)
        actual = self.count_join_results(joined)
        assert expected == actual
Exemple #7
0
def bdy_tag(spark, point_rdd, bdy):
    """Spatially join points to a boundary set and export the ID pairs to parquet.

    Args:
        spark: session used to load the boundaries and build the DataFrame.
        point_rdd: spatial RDD of points; its partitioner is reused for the
            boundaries (presumably it is already partitioned - verify
            against the caller).
        bdy: mapping describing the boundary set; the "id_field",
            "name_field" and "name" keys are read here and the whole
            mapping is passed on to ``get_bdy_rdd``.
    """
    started_at = datetime.now()

    # Load the boundaries and partition them exactly like the points.
    boundaries = get_bdy_rdd(spark, bdy)
    boundaries.analyze()
    boundaries.spatialPartitioning(point_rdd.getPartitioner())

    # The join yields one boundary paired with its 1-N matching points.
    matches = JoinQuery.SpatialJoinQuery(point_rdd, boundaries, True, True)

    # Flatten to one (boundary, point) pair per match.
    boundary_point_pairs = matches.flatMapValues(lambda points: points)

    def to_row(pair):
        # User data is tab separated: the point contributes its first field,
        # the boundary its first and second fields.
        point_fields = pair[1].getUserData().split("\t")
        boundary_fields = pair[0].getUserData().split("\t")
        return [point_fields[0], boundary_fields[0], boundary_fields[1]]

    rows = boundary_point_pairs.map(to_row)

    # Three non-nullable string columns, named from the boundary settings.
    column_names = ("gnaf_pid", bdy["id_field"], bdy["name_field"])
    schema = t.StructType([
        t.StructField(column, t.StringType(), False)
        for column in column_names
    ])
    join_df = spark.createDataFrame(rows, schema)

    # Save the result to disk.
    export_to_parquet(join_df, "gnaf_with_{}".format(bdy["name"]))

    # Release the intermediate datasets. The boundary SpatialRDD is skipped:
    # the original author noted it has no unpersist method.
    for dataset in (join_df, rows, boundary_point_pairs, matches):
        dataset.unpersist()

    logger.info("\t - GNAF points bdy tagged with {}: {}".format(
        bdy["name"],
        datetime.now() - started_at))
    def test_index_int(self, num_partitions, use_legacy_apis, grid_type, index_type):
        """Join the rectangle input with itself after building an index and check the count.

        Both RDDs are created from ``input_location``; the spatial side gets
        an index of ``index_type`` before the join. The collected result is
        sanity-checked and its count compared with ``match_count``.
        """
        window_rects = self.create_rectangle_rdd(
            input_location, splitter, num_partitions)
        data_rects = self.create_rectangle_rdd(
            input_location, splitter, num_partitions)

        self.partition_rdds(
            window_rects, data_rects, grid_type, use_legacy_apis)
        data_rects.buildIndex(index_type, True)

        joined = JoinQuery.SpatialJoinQuery(
            data_rects, window_rects, False, True
        ).collect()
        self.sanity_check_join_results(joined)

        actual = self.count_join_results(joined)
        assert match_count == actual
Exemple #9
0
    def nested_loop(self, query_rdd, num_partitions, grid_type,
                    use_legacy_apis, expected_count):
        """Join the point input against ``query_rdd`` and compare the match count.

        A point RDD is created from ``input_location``, both RDDs are
        partitioned, and the collected join result is sanity-checked before
        its count is compared with ``expected_count``.
        """
        points = self.create_point_rdd(
            input_location, splitter, num_partitions)
        self.partition_rdds(query_rdd, points, grid_type, use_legacy_apis)

        joined = JoinQuery.SpatialJoinQuery(
            points, query_rdd, False, True
        ).collect()
        self.sanity_check_join_results(joined)

        actual = self.count_join_results(joined)
        assert expected_count == actual
Exemple #10
0
    def dynamic_rtree_int(self, query_rdd, num_partitions, use_legacy_apis,
                          grid_type, index_type, expected_count):
        """Run ``JoinQuery.spatialJoin`` against the point input and check the result size.

        A point RDD is created from ``input_location`` and partitioned together
        with ``query_rdd``. The flat join result is sanity-checked and its
        length compared with ``expected_count``.
        """
        spatial_rdd = self.create_point_rdd(input_location, splitter,
                                            num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)
        join_params = JoinParams(True, index_type, JoinBuildSide.LEFT)
        results = JoinQuery.spatialJoin(query_rdd, spatial_rdd,
                                        join_params).collect()

        self.sanity_check_flat_join_results(results)

        # Use the len() builtin rather than calling the dunder directly.
        assert expected_count == len(results)
    def test_nested_loop(self, num_partitions, use_legacy_apis, grid_type):
        """Join the rectangle input with itself and check the match count.

        Both RDDs are created from ``input_location``. The collected result is
        sanity-checked and its count (via ``count_join_results``) compared
        with ``match_count``.
        """
        query_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
        spatial_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

        result = JoinQuery.SpatialJoinQuery(
            spatial_rdd, query_rdd, False, True).collect()

        # The former hand-rolled `count` accumulation was never read; the
        # assertion below relies on count_join_results instead.
        self.sanity_check_join_results(result)
        assert match_count == self.count_join_results(result)
    def test_spatial_join_query(self):
        """Join WKT points with WKT county polygons and print the number of result rows.

        The points are analysed and KDB-tree partitioned; the polygons are
        partitioned with the points' partitioner. Nothing is asserted.
        """
        points = PointRDD(
            self.sc, point_path, 4, FileDataSplitter.WKT, True)
        counties = PolygonRDD(
            self.sc, counties_path, 2, 3, FileDataSplitter.WKT, True)

        points.analyze()
        points.spatialPartitioning(GridType.KDBTREE)
        counties.spatialPartitioning(points.getPartitioner())

        joined = JoinQuery.SpatialJoinQuery(points, counties, True, False)
        row_total = joined.count()
        print(row_total)
    def test_dynamic_index_int(self, num_partitions, use_legacy_apis, grid_type, index_type, intersects):
        """Run ``JoinQuery.spatialJoin`` on two polygon RDDs and check the result size.

        The expected size depends on ``grid_type``: when
        ``expect_to_preserve_original_duplicates`` is true for it, the
        "with original duplicates" count is used, otherwise the plain
        expected count.
        """
        query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
        spatial_rdd = self.create_polygon_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

        join_params = JoinParams(intersects, index_type, JoinBuildSide.LEFT)
        result = JoinQuery.spatialJoin(query_rdd, spatial_rdd, join_params).collect()

        self.sanity_check_flat_join_results(result)

        if self.expect_to_preserve_original_duplicates(grid_type):
            expected_count = self.get_expected_with_original_duplicates_count(intersects)
        else:
            expected_count = self.get_expected_count(intersects)
        # Use the len() builtin rather than calling the dunder directly.
        assert expected_count == len(result)
    def test_distance_join_query(self):
        """Run the distance join ``each_query_loop_times`` times with the third flag off.

        Query windows are circles of radius 0.1 around the points themselves.
        Each iteration only counts the result; nothing is asserted.
        """
        points = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)
        circles = CircleRDD(points, 0.1)

        points.analyze()
        points.spatialPartitioning(GridType.QUADTREE)
        circles.spatialPartitioning(points.getPartitioner())

        for _ in range(each_query_loop_times):
            joined = JoinQuery.DistanceJoinQuery(points, circles, False, True)
            result_size = joined.count()
    def test_spatial_join_query_and_build_index_on_points_on_the_fly(self):
        """Run the spatial join ``each_query_loop_times`` times with the third flag on.

        The points are analysed and partitioned; the polygon windows are
        partitioned with the points' partitioner. Each iteration only counts
        the result; nothing is asserted.
        """
        windows = PolygonRDD(self.sc, polygon_rdd_input_location,
                             polygon_rdd_start_offset,
                             polygon_rdd_end_offset, polygon_rdd_splitter,
                             True)
        points = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)

        points.analyze()
        points.spatialPartitioning(join_query_partitionin_type)
        windows.spatialPartitioning(points.getPartitioner())

        for _ in range(each_query_loop_times):
            joined = JoinQuery.SpatialJoinQuery(points, windows, True, False)
            result_size = joined.count()
Exemple #16
0
    def test_loading_spatial_rdd_from_disc(self):
        """Reload point, polygon and linestring RDDs (with indexes) from disk and join two of them.

        Each RDD is read from ``disc_location/<name>`` and its index from
        ``disc_location/<name>_index``. The types and attached indexes are
        asserted, the boundary envelopes are printed, and finally the
        linestrings are joined with the polygons and the result printed.
        """
        def load_with_index(name, geo_type):
            # Read the geometry RDD, then attach the separately stored index.
            rdd = load_spatial_rdd_from_disc(
                self.sc, os.path.join(disc_location, name), geo_type
            )
            rdd.indexedRawRDD = load_spatial_index_rdd_from_disc(
                self.sc, os.path.join(disc_location, name + "_index")
            )
            return rdd

        points = load_with_index("point", GeoType.POINT)
        assert points.indexedRawRDD is not None
        assert isinstance(points, PointRDD)
        points.analyze()
        print(points.boundaryEnvelope)

        polygons = load_with_index("polygon", GeoType.POLYGON)
        polygons.analyze()
        print(polygons.boundaryEnvelope)
        assert polygons.indexedRawRDD is not None
        assert isinstance(polygons, PolygonRDD)

        linestrings = load_with_index("line_string", GeoType.LINESTRING)
        assert linestrings.indexedRawRDD is not None
        assert isinstance(linestrings, LineStringRDD)
        linestrings.analyze()
        print(linestrings.boundaryEnvelope)

        # Partition the polygons on the linestrings' grids, then index both.
        linestrings.spatialPartitioning(GridType.RTREE)
        polygons.spatialPartitioning(linestrings.grids)
        polygons.buildIndex(IndexType.RTREE, True)
        linestrings.buildIndex(IndexType.RTREE, True)

        joined = JoinQuery.SpatialJoinQuery(
            linestrings, polygons, True, True
        ).collect()
        print(joined)
Exemple #17
0
    def test_distance_join_query(self):
        """Run the distance join ``each_query_loop_times`` times over persisted partitioned RDDs.

        Query windows are circles of radius 0.1 around the points. Both
        spatially partitioned RDDs are persisted in memory, and the points'
        partitioned RDD is asserted to be cached before the loop.
        """
        points = PointRDD(self.sc, point_rdd_input_location,
                          point_rdd_offset, point_rdd_splitter, True,
                          StorageLevel.MEMORY_ONLY)
        circles = CircleRDD(points, 0.1)

        points.spatialPartitioning(GridType.QUADTREE)
        circles.spatialPartitioning(points.getPartitioner())

        # The attribute is re-read for the assertion rather than held in a local.
        points.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
        assert points.spatialPartitionedRDD.is_cached
        circles.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

        for _ in range(each_query_loop_times):
            joined = JoinQuery.DistanceJoinQuery(points, circles, False, True)
            result_size = joined.count()
Exemple #18
0
    def test_spatial_join_query(self):
        """Run the spatial join ``each_query_loop_times`` times over persisted partitioned RDDs.

        The polygon windows are partitioned with the points' partitioner and
        both JVM-side partitioned RDDs are persisted in memory. Each iteration
        only counts the result; nothing is asserted.
        """
        windows = PolygonRDD(self.sc, polygon_rdd_input_location,
                             polygon_rdd_start_offset,
                             polygon_rdd_end_offset,
                             polygon_rdd_splitter, True)
        points = PointRDD(self.sc, point_rdd_input_location,
                          point_rdd_offset, point_rdd_splitter, True,
                          StorageLevel.MEMORY_ONLY)

        points.spatialPartitioning(join_query_partitioning_type)
        windows.spatialPartitioning(points.getPartitioner())

        points.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
        windows.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

        for _ in range(each_query_loop_times):
            joined = JoinQuery.SpatialJoinQuery(points, windows, False, True)
            result_size = joined.count()
    def test_spatial_join_query_and_build_index_on_polygons_on_the_fly(self):
        """Run ``JoinQuery.spatialJoin`` ``each_query_loop_times`` times with LEFT build side.

        The points are analysed and partitioned; the polygon windows are
        partitioned with the points' partitioner. Fresh ``JoinParams`` are
        built on every iteration and only the result count is taken; nothing
        is asserted.
        """
        windows = PolygonRDD(self.sc, polygon_rdd_input_location,
                             polygon_rdd_start_offset,
                             polygon_rdd_end_offset,
                             polygon_rdd_splitter, True)
        points = PointRDD(sparkContext=self.sc,
                          InputLocation=point_rdd_input_location,
                          Offset=point_rdd_offset,
                          splitter=point_rdd_splitter,
                          carryInputData=False)

        points.analyze()
        points.spatialPartitioning(join_query_partitionin_type)
        windows.spatialPartitioning(points.getPartitioner())

        for _ in range(each_query_loop_times):
            params = JoinParams(
                False, polygon_rdd_index_type, JoinBuildSide.LEFT)
            joined = JoinQuery.spatialJoin(windows, points, params)
            resultSize = joined.count()
    def test_spatial_join_query_with_polygon_rdd(self):
        """Join CRS-transformed points with CRS-transformed polygons and check user data survives.

        Both RDDs are constructed with the CRS pair "epsg:4326" / "epsg:3005".
        After the join, the first element of the second result entry and
        every matched geometry must carry non-``None`` user data.
        """
        query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter,
                               True, num_partitions, StorageLevel.MEMORY_ONLY,
                               "epsg:4326", "epsg:3005")

        spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                               num_partitions, StorageLevel.MEMORY_ONLY,
                               "epsg:4326", "epsg:3005")
        spatial_rdd.spatialPartitioning(grid_type)

        query_rdd.spatialPartitioning(spatial_rdd.grids)

        result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False,
                                            True).collect()
        assert result[1][0].getUserData() is not None

        # Iterating an empty match collection is already a no-op, so no
        # explicit emptiness guard is needed.
        for data in result:
            for right_data in data[1]:
                assert right_data.getUserData() is not None
    def test_polygon_distance_join_with_crs_transformation(self):
        """Distance-join CRS-transformed polygons against circles built around the same input.

        Both polygon RDDs are constructed from ``input_location_query_polygon``
        with the CRS pair "epsg:4326" / "epsg:3857". The join must return
        5467 entries, and for every entry a circle of radius 0.1 around the
        key geometry must cover each matched polygon.
        """
        query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter,
                               True, num_partitions, StorageLevel.MEMORY_ONLY,
                               "epsg:4326", "epsg:3857")
        window_rdd = CircleRDD(query_rdd, 0.1)

        object_rdd = PolygonRDD(self.sc, input_location_query_polygon,
                                splitter, True, num_partitions,
                                StorageLevel.MEMORY_ONLY, "epsg:4326",
                                "epsg:3857")

        # NOTE(review): the value returned by repartition(4) is discarded -
        # confirm whether this call is meant to have any effect.
        object_rdd.rawJvmSpatialRDD.jsrdd.repartition(4)
        object_rdd.spatialPartitioning(GridType.RTREE)
        object_rdd.buildIndex(IndexType.RTREE, True)
        window_rdd.spatialPartitioning(object_rdd.grids)

        results = JoinQuery.DistanceJoinQuery(object_rdd, window_rdd, True,
                                              False).collect()

        # Use the len() builtin rather than calling the dunder directly.
        assert len(results) == 5467

        for data in results:
            for polygon_data in data[1]:
                assert Circle(data[0].geom, 0.1).covers(polygon_data.geom)