Example #1
    def test_outside_point_join_correctness(self):
        self.once_before_all()
        window_rdd = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY)
        object_rdd = PointRDD(self.sc.parallelize(self.test_outside_point_set),
                              StorageLevel.MEMORY_ONLY)
        self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

        result = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd, True,
                                            False).collect()
        assert len(result) == 0

        result_no_index = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd,
                                                     False, False).collect()
        assert len(result_no_index) == 0
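
Several of the correctness tests on this page (Examples #1, #3, #4 and #5) call a prepare_rdd helper that these excerpts don't include. Below is a minimal sketch of what such a helper plausibly does, reconstructed from the analyze/partition/index pattern in Example #2; the body is an assumption, not the original implementation.

    # Sketch only: prepare_rdd is not part of the excerpts on this page.
    # The body mirrors the pattern used in Example #2 and is an assumption.
    def prepare_rdd(self, object_rdd, window_rdd, grid_type):
        # Compute envelopes and counts so spatial partitioning has statistics.
        object_rdd.analyze()
        window_rdd.analyze()
        # Partition the object side by the chosen grid, then co-partition the
        # window side with the same partitioner so partitions line up pairwise.
        object_rdd.spatialPartitioning(grid_type)
        window_rdd.spatialPartitioning(object_rdd.getPartitioner())
        # Build a per-partition index on the object side for the
        # use_index=True variants of the join.
        object_rdd.buildIndex(IndexType.RTREE, True)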
Example #2
    def test_spatial_join_query_with_polygon_rdd_using_index(self):
        query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter,
                               True, num_partitions, StorageLevel.MEMORY_ONLY,
                               "epsg:4326", "epsg:3005")

        spatial_rdd = PointRDD(self.sc, input_location, offset, splitter, True,
                               num_partitions, StorageLevel.MEMORY_ONLY,
                               "epsg:4326", "epsg:3005")

        query_rdd.analyze()
        spatial_rdd.analyze()

        spatial_rdd.spatialPartitioning(grid_type)

        spatial_rdd.buildIndex(IndexType.RTREE, True)

        query_rdd.spatialPartitioning(spatial_rdd.getPartitioner())

        result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False,
                                            True).collect()

        assert result[1][0].getUserData() is not None

        for data in result:
            if len(data[1]) != 0:
                for right_data in data[1]:
                    assert right_data.getUserData() is not None
Example #3
    def test_on_boundary_point_join_correctness(self):
        window_rdd = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY)
        object_rdd = PointRDD(
            self.sc.parallelize(self.test_on_boundary_point_set),
            StorageLevel.MEMORY_ONLY)
        self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

        result = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd, True,
                                            False).collect()
        self.verify_join_result(result)

        result_no_index = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd,
                                                     False, False).collect()
        self.verify_join_result(result_no_index)
Example #4
    def test_inside_point_join_correctness(self):
        self.once_before_all()

        window_rdd = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set))

        object_rdd = PointRDD(self.sc.parallelize(self.test_inside_point_set))
        self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

        result = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd, True,
                                            False).collect()
        self.verify_join_result(result)

        result_no_index = JoinQuery.SpatialJoinQuery(object_rdd, window_rdd,
                                                     False, False).collect()
        self.verify_join_result(result_no_index)
Example #5
    def test_outside_polygon_distance_join_correctness(self):
        center_geometry_rdd = PolygonRDD(
            self.sc.parallelize(self.test_polygon_window_set),
            StorageLevel.MEMORY_ONLY)
        window_rdd = CircleRDD(center_geometry_rdd, 0.1)
        object_rdd = PolygonRDD(
            self.sc.parallelize(self.test_outside_polygon_set),
            StorageLevel.MEMORY_ONLY)
        self.prepare_rdd(object_rdd, window_rdd, GridType.QUADTREE)

        result = JoinQuery.DistanceJoinQuery(object_rdd, window_rdd, True,
                                             True).collect()
        assert len(result) == 0

        result_no_index = JoinQuery.DistanceJoinQuery(object_rdd, window_rdd,
                                                      False, True).collect()
        assert len(result_no_index) == 0
Example #6
    def dynamic_rtree_int(self, query_rdd, num_partitions, use_legacy_apis, grid_type, index_type, expected_count):
        spatial_rdd = self.create_point_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)
        join_params = JoinParams(True, index_type, JoinBuildSide.LEFT)
        results = JoinQuery.spatialJoin(query_rdd, spatial_rdd, join_params).collect()

        self.sanity_check_flat_join_results(results)

        assert expected_count == len(results)
Example #7
    def nested_loop(self, query_rdd, num_partitions, grid_type, use_legacy_apis, expected_count):
        spatial_rdd = self.create_point_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(
            query_rdd, spatial_rdd, grid_type, use_legacy_apis)

        result = JoinQuery.SpatialJoinQuery(
            spatial_rdd, query_rdd, False, True).collect()

        self.sanity_check_join_results(result)
        assert expected_count == self.count_join_results(result)
Example #8
    def test_nested_loop(self, num_partitions, use_legacy_apis, grid_type, intersects):
        query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
        spatial_rdd = self.create_polygon_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

        result = JoinQuery.SpatialJoinQuery(
            spatial_rdd, query_rdd, False, intersects).collect()

        self.sanity_check_join_results(result)
        assert self.get_expected_count(intersects) == self.count_join_results(result)
Example #9
def bdy_tag(spark, point_rdd, bdy):
    start_time = datetime.now()

    # load boundaries
    bdy_rdd = get_bdy_rdd(spark, bdy)
    bdy_rdd.analyze()

    bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())
    # bdy_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)  # no need to persist(?) - used once

    # run the join - returns a PairRDD with 1 boundary to 1-N points
    # e.g. [Geometry: Polygon userData: WA32       TANGNEY WA, [Geometry: Point userData: GAWA_146792426	WA, ...]]
    result_pair_rdd = JoinQuery.SpatialJoinQuery(point_rdd, bdy_rdd, True,
                                                 True)
    # print(result_pair_rdd.take(1))

    # flat map values to have one point to bdy matched pair
    flat_mapped_rdd = result_pair_rdd.flatMapValues(lambda x: x)

    # map values to create RDD row of gnaf & bdy IDs, plus state data
    mapped_rdd = flat_mapped_rdd.map(lambda x: [
        x[1].getUserData().split("\t")[0], x[0].getUserData().split("\t")[0],
        x[0].getUserData().split("\t")[1]
    ])
    # jim = mapped_rdd.take(10)
    # for row in jim:
    #     print(row)

    # convert result to a dataframe of the following schema
    schema = t.StructType([
        t.StructField("gnaf_pid", t.StringType(), False),
        t.StructField(bdy["id_field"], t.StringType(), False),
        t.StructField(bdy["name_field"], t.StringType(), False)
    ])

    join_df = spark.createDataFrame(mapped_rdd, schema)
    # join_df.printSchema()
    # join_df.show(10, False)

    # save result to disk
    export_to_parquet(join_df, "gnaf_with_{}".format(bdy["name"]))

    # num_joined_points = join_df.count()  # this can be an expensive operation

    # cleanup datasets in memory
    join_df.unpersist()
    mapped_rdd.unpersist()
    flat_mapped_rdd.unpersist()
    result_pair_rdd.unpersist()
    # bdy_rdd.unpersist()  # no method for SpatialRDD

    logger.info("\t - GNAF points bdy tagged with {}: {}".format(
        bdy["name"],
        datetime.now() - start_time))
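
A hedged usage sketch for bdy_tag above, assuming a Spark session and a GNAF PointRDD prepared the same way as in Example #27; the helper names and every path and field value below are illustrative, not from the original code.

# Hypothetical driver for bdy_tag(); values are assumptions for illustration.
spark = create_spark_session()
sc = spark.sparkContext

# GNAF points: long/lat fields start at column 0, keep the non-geo columns.
point_rdd = PointRDD(sc, gnaf_csv_file_path, 0, FileDataSplitter.CSV, True)
point_rdd.analyze()
point_rdd.spatialPartitioning(GridType.KDBTREE)

# One dict per boundary layer; bdy_tag() reads the "name", "id_field" and
# "name_field" keys (illustrative values shown).
bdy = {"name": "state_electorates",
       "id_field": "se_pid",
       "name_field": "electorate_name"}
bdy_tag(spark, point_rdd, bdy)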
Example #10
    def test_index_int(self, num_partitions, use_legacy_apis, grid_type, index_type):
        query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
        spatial_rdd = self.create_linestring_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)
        spatial_rdd.buildIndex(index_type, True)

        result = JoinQuery.SpatialJoinQuery(
            spatial_rdd, query_rdd, False, True).collect()

        self.sanity_check_join_results(result)
        assert match_count == self.count_join_results(result)
Example #11
    def test_index_int(self, num_partitions, grid_type, index_type, intersects):
        query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
        spatial_rdd = self.create_polygon_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type)
        spatial_rdd.buildIndex(index_type, True)

        result = JoinQuery.SpatialJoinQuery(
            spatial_rdd, query_rdd, True, intersects).collect()

        self.sanity_check_join_results(result)
        assert self.get_expected_with_original_duplicates_count(intersects) == self.count_join_results(result)
Example #12
    def index_int(self, query_rdd, num_partitions, grid_type, index_type,
                  expected_count):
        spatial_rdd = self.create_point_rdd(input_location, splitter,
                                            num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type)
        spatial_rdd.buildIndex(index_type, True)

        result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False,
                                            True).collect()

        self.sanity_check_join_results(result)
        assert expected_count == self.count_join_results(result)
Example #13
    def test_indexed_rdd_assignment(self):
        object_rdd = PointRDD(
            self.sc, point_rdd_input_location, point_rdd_offset, point_rdd_splitter, True)
        query_window_rdd = CircleRDD(object_rdd, 0.1)
        object_rdd.analyze()
        object_rdd.spatialPartitioning(GridType.QUADTREE)
        object_rdd.buildIndex(IndexType.QUADTREE, True)

        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.buildIndex(IndexType.RTREE, False)

        object_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)
        query_window_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
        query_window_rdd.jvmSpatialPartitionedRDD.count()
        object_rdd.indexedRDD.count()

        import time

        start = time.time()
        for _ in range(each_query_loop_times):
            result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, True, True).count()
        diff = time.time() - start

        object_rdd = PointRDD(
            self.sc, point_rdd_input_location, point_rdd_offset, point_rdd_splitter, True)
        query_window_rdd = CircleRDD(object_rdd, 0.1)

        object_rdd.analyze()
        object_rdd.spatialPartitioning(GridType.QUADTREE)
        object_rdd.buildIndex(IndexType.QUADTREE, True)

        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.buildIndex(IndexType.RTREE, False)

        start1 = time.time()
        for _ in range(each_query_loop_times):
            result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, True, True).count()
        # Elapsed time for the rebuilt, non-persisted RDDs, for comparison
        # with the persisted/indexed timing in diff above.
        diff1 = time.time() - start1
Example #14
    def test_dynamic_index_int(self, num_partitions, use_legacy_apis, grid_type, index_type, intersects):
        query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
        spatial_rdd = self.create_polygon_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

        join_params = JoinParams(intersects, index_type, JoinBuildSide.LEFT)
        result = JoinQuery.spatialJoin(query_rdd, spatial_rdd, join_params).collect()

        self.sanity_check_flat_join_results(result)

        expected_count = self.get_expected_with_original_duplicates_count(intersects) \
            if self.expect_to_preserve_original_duplicates(grid_type) else self.get_expected_count(intersects)
        assert expected_count == len(result)
Example #15
    def test_index_int(self, num_partitions, grid_type, index_type):
        query_rdd = self.create_polygon_rdd(query_polygon_set, splitter, num_partitions)
        spatial_rdd = self.create_linestring_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type)
        spatial_rdd.buildIndex(index_type, True)

        result = JoinQuery.SpatialJoinQuery(
            spatial_rdd, query_rdd, False, True).collect()

        self.sanity_check_join_results(result)
        expected_count = match_with_original_duplicates_count \
            if self.expect_to_preserve_original_duplicates(grid_type) else match_count
        assert expected_count == self.count_join_results(result)
Example #16
    def test_spatial_join_query(self):
        point_rdd = PointRDD(self.sc, point_path, 4, FileDataSplitter.WKT,
                             True)

        polygon_rdd = PolygonRDD(self.sc, counties_path, 2, 3,
                                 FileDataSplitter.WKT, True)

        point_rdd.analyze()
        point_rdd.spatialPartitioning(GridType.KDBTREE)
        polygon_rdd.spatialPartitioning(point_rdd.getPartitioner())
        result = JoinQuery.SpatialJoinQuery(point_rdd, polygon_rdd, True,
                                            False)

        print(result.count())
Example #17
    def test_distance_join_query(self):
        object_rdd = PointRDD(
            self.sc, point_rdd_input_location, point_rdd_offset, point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)
        query_window_rdd = CircleRDD(object_rdd, 0.1)

        object_rdd.spatialPartitioning(GridType.QUADTREE)
        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
        assert object_rdd.spatialPartitionedRDD.is_cached

        query_window_rdd.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

        for _ in range(each_query_loop_times):
            result_size = JoinQuery.DistanceJoinQuery(object_rdd, query_window_rdd, False, True).count()
Example #18
    def test_distance_join_query(self):
        object_rdd = PointRDD(sparkContext=self.sc,
                              InputLocation=point_rdd_input_location,
                              Offset=point_rdd_offset,
                              splitter=point_rdd_splitter,
                              carryInputData=False)
        query_window_rdd = CircleRDD(object_rdd, 0.1)
        object_rdd.analyze()
        object_rdd.spatialPartitioning(GridType.QUADTREE)
        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        for _ in range(each_query_loop_times):
            result_size = JoinQuery.DistanceJoinQuery(object_rdd,
                                                      query_window_rdd, False,
                                                      True).count()
Example #19
    def test_dynamic_index_int(self, num_partitions, grid_type, index_type):
        query_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
        spatial_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type)

        join_params = JoinParams(True, index_type, JoinBuildSide.LEFT)
        result = JoinQuery.spatialJoin(query_rdd, spatial_rdd, join_params).collect()

        self.sanity_check_flat_join_results(result)

        expected_count = match_with_original_duplicates_count \
            if self.expect_to_preserve_original_duplicates(grid_type) else match_count

        assert expected_count == len(result)
Example #20
    def test_spatial_join_query(self):
        query_window_rdd = PolygonRDD(
            self.sc, polygon_rdd_input_location, polygon_rdd_start_offset, polygon_rdd_end_offset,
            polygon_rdd_splitter, True
        )
        object_rdd = PointRDD(
            self.sc, point_rdd_input_location, point_rdd_offset, point_rdd_splitter, True, StorageLevel.MEMORY_ONLY)

        object_rdd.spatialPartitioning(join_query_partitioning_type)
        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        object_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)
        query_window_rdd.jvmSpatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

        for _ in range(each_query_loop_times):
            result_size = JoinQuery.SpatialJoinQuery(object_rdd, query_window_rdd, False, True).count()
Example #21
    def test_nested_loop(self, num_partitions, grid_type):
        query_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)
        spatial_rdd = self.create_rectangle_rdd(input_location, splitter, num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type)

        result = JoinQuery.SpatialJoinQuery(
            spatial_rdd, query_rdd, False, True).collect()

        count = sum(len(el[1]) for el in result)
        self.sanity_check_join_results(result)
        expected_count = match_with_original_duplicates_count \
            if self.expect_to_preserve_original_duplicates(grid_type) else match_count
        assert expected_count == self.count_join_results(result)
Example #22
    def test_nested_loop(self, num_partitions, use_legacy_apis, grid_type):
        query_rdd = self.create_rectangle_rdd(input_location, splitter,
                                              num_partitions)
        spatial_rdd = self.create_rectangle_rdd(input_location, splitter,
                                                num_partitions)

        self.partition_rdds(query_rdd, spatial_rdd, grid_type, use_legacy_apis)

        result = JoinQuery.SpatialJoinQuery(spatial_rdd, query_rdd, False,
                                            True).collect()

        count = sum(len(el[1]) for el in result)
        self.sanity_check_join_results(result)
        assert match_count == self.count_join_results(result)
Example #23
    def test_loading_spatial_rdd_from_disc(self):
        point_rdd = load_spatial_rdd_from_disc(
            self.sc, os.path.join(disc_location, "point"), GeoType.POINT)
        point_index_rdd = load_spatial_index_rdd_from_disc(
            self.sc, os.path.join(disc_location, "point_index"))
        point_rdd.indexedRawRDD = point_index_rdd

        assert point_rdd.indexedRawRDD is not None
        assert isinstance(point_rdd, PointRDD)
        point_rdd.analyze()
        print(point_rdd.boundaryEnvelope)

        polygon_rdd = load_spatial_rdd_from_disc(
            self.sc, os.path.join(disc_location, "polygon"), GeoType.POLYGON)
        polygon_index_rdd = load_spatial_index_rdd_from_disc(
            self.sc, os.path.join(disc_location, "polygon_index"))
        polygon_rdd.indexedRawRDD = polygon_index_rdd
        polygon_rdd.analyze()

        print(polygon_rdd.boundaryEnvelope)

        assert polygon_rdd.indexedRawRDD is not None
        assert isinstance(polygon_rdd, PolygonRDD)

        linestring_rdd = load_spatial_rdd_from_disc(
            self.sc, os.path.join(disc_location, "line_string"),
            GeoType.LINESTRING)
        linestring_index_rdd = load_spatial_index_rdd_from_disc(
            self.sc, os.path.join(disc_location, "line_string_index"))
        linestring_rdd.indexedRawRDD = linestring_index_rdd

        assert linestring_rdd.indexedRawRDD is not None
        assert isinstance(linestring_rdd, LineStringRDD)

        linestring_rdd.analyze()
        print(linestring_rdd.boundaryEnvelope)

        linestring_rdd.spatialPartitioning(GridType.KDBTREE)
        polygon_rdd.spatialPartitioning(linestring_rdd.getPartitioner())
        polygon_rdd.buildIndex(IndexType.RTREE, True)
        linestring_rdd.buildIndex(IndexType.RTREE, True)

        result = JoinQuery.SpatialJoinQuery(linestring_rdd, polygon_rdd, True,
                                            True).collect()

        print(result)
        remove_directory(disc_location)
Example #24
    def test_spatial_join_query_and_build_index_on_points_on_the_fly(self):
        query_window = PolygonRDD(self.sc, polygon_rdd_input_location,
                                  polygon_rdd_start_offset,
                                  polygon_rdd_end_offset, polygon_rdd_splitter,
                                  True)
        object_rdd = PointRDD(sparkContext=self.sc,
                              InputLocation=point_rdd_input_location,
                              Offset=point_rdd_offset,
                              splitter=point_rdd_splitter,
                              carryInputData=False)
        object_rdd.analyze()
        object_rdd.spatialPartitioning(join_query_partitionin_type)
        query_window.spatialPartitioning(object_rdd.getPartitioner())

        for _ in range(each_query_loop_times):
            result_size = JoinQuery.SpatialJoinQuery(object_rdd, query_window,
                                                     True, False).count()
Example #25
    def test_spatial_join_query_and_build_index_on_polygons_on_the_fly(self):
        query_window_rdd = PolygonRDD(self.sc, polygon_rdd_input_location,
                                      polygon_rdd_start_offset,
                                      polygon_rdd_end_offset,
                                      polygon_rdd_splitter, True)

        object_rdd = PointRDD(sparkContext=self.sc,
                              InputLocation=point_rdd_input_location,
                              Offset=point_rdd_offset,
                              splitter=point_rdd_splitter,
                              carryInputData=False)
        object_rdd.analyze()
        object_rdd.spatialPartitioning(join_query_partitionin_type)
        query_window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        for _ in range(each_query_loop_times):
            join_params = JoinParams(False, polygon_rdd_index_type,
                                     JoinBuildSide.LEFT)
            result_size = JoinQuery.spatialJoin(query_window_rdd, object_rdd,
                                                join_params).count()
Example #26
    def test_polygon_distance_join_with_crs_transformation(self):
        query_rdd = PolygonRDD(self.sc, input_location_query_polygon, splitter,
                               True, num_partitions, StorageLevel.MEMORY_ONLY,
                               "epsg:4326", "epsg:3857")
        window_rdd = CircleRDD(query_rdd, 0.1)

        object_rdd = PolygonRDD(self.sc, input_location_query_polygon,
                                splitter, True, num_partitions,
                                StorageLevel.MEMORY_ONLY, "epsg:4326",
                                "epsg:3857")

        object_rdd.rawJvmSpatialRDD.jsrdd.repartition(4)
        object_rdd.spatialPartitioning(GridType.KDBTREE)
        object_rdd.buildIndex(IndexType.RTREE, True)
        window_rdd.spatialPartitioning(object_rdd.getPartitioner())

        results = JoinQuery.DistanceJoinQuery(object_rdd, window_rdd, True,
                                              False).collect()
        assert len(results) == 5467

        for data in results:
            for polygon_data in data[1]:
                assert Circle(data[0].geom, 0.1).covers(polygon_data.geom)
Example #27
def rdd_filesave_join():
    logger.info("\t - RDD file save join start")

    full_start_time = datetime.now()

    # ----------------------------------------------------------
    # get spark session and context
    # ----------------------------------------------------------

    start_time = datetime.now()

    spark = create_spark_session()
    sc = spark.sparkContext
    sedona_version = pkg_resources.get_distribution("sedona").version

    logger.info(
        "\t - PySpark {} session initiated with Apache Sedona {}: {}".format(
            sc.version, sedona_version,
            datetime.now() - start_time))

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------

    start_time = datetime.now()

    offset = 0  # The point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path),
                         offset, FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t\t - GNAF RDD created: {}".format(datetime.now() -
                                                     start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------

    for bdy in bdy_list:
        start_time = datetime.now()

        # load boundaries
        # create geometries from WKT strings into new DataFrame
        bdy_df = spark.read.parquet(os.path.join(output_path, bdy["name"])) \
            .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
            .drop("wkt_geom")

        # create bdy rdd
        bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")
        bdy_rdd.analyze()

        bdy_df.unpersist()

        bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())
        bdy_rdd.spatialPartitionedRDD.persist(
            StorageLevel.MEMORY_ONLY)  # no need to persist(?) - used once

        # run the join - returns a PairRDD with 1 boundary to 1-N points
        # e.g. [Geometry: Polygon userData: WA32       TANGNEY WA, [Geometry: Point userData: GAWA_146792426	WA, ...]]
        result_pair_rdd = JoinQuery.SpatialJoinQueryFlat(
            point_rdd, bdy_rdd, True, True)
        # jim = result_pair_rdd.take(10)
        # for row in jim:
        #     print(row)

        result_pair_rdd.saveAsTextFile(
            os.path.join(output_path,
                         "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

        # # flat map values to have one point to bdy matched pair
        # flat_mapped_rdd = result_pair_rdd.flatMapValues(lambda x: x)
        #
        # # map values to create RDD row of gnaf & bdy IDs, plus state data
        # mapped_rdd = flat_mapped_rdd.map(
        #     lambda x: [x[1].getUserData().split("\t")[0],
        #                x[0].getUserData().split("\t")[0],
        #                x[0].getUserData().split("\t")[1]]
        # )
        #
        # # convert result to a dataframe of the following schema
        # schema = t.StructType([t.StructField("gnaf_pid", t.StringType(), False),
        #                        t.StructField(bdy["id_field"], t.StringType(), False),
        #                        t.StructField(bdy["name_field"], t.StringType(), False)])
        #
        # join_df = spark.createDataFrame(mapped_rdd, schema)
        #
        # # save result to disk
        # join_df.write \
        #     .option("compression", "gzip") \
        #     .mode("overwrite") \
        #     .parquet(os.path.join(output_path, "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

        logger.info("\t\t - GNAF points bdy tagged with {}: {}".format(
            bdy["name"],
            datetime.now() - start_time))

    # cleanup
    spark.stop()

    logger.info("\t - RDD file save join done: {}".format(datetime.now() -
                                                          full_start_time))
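
Finally, a minimal entry point for the job above, assuming the module-level configuration it references (create_spark_session, output_path, bdy_list, logger) is defined in the same file:

# Hypothetical entry point; runs the file-save join end to end.
if __name__ == "__main__":
    rdd_filesave_join()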