    def test_range_query_flat_to_df(self):
        poi_point_rdd = WktReader.readToGeometryRDD(self.sc, bank_csv_path, 1,
                                                    False, False)

        poi_point_rdd.analyze()

        poi_point_rdd.spatialPartitioning(GridType.QUADTREE)
        poi_point_rdd.buildIndex(IndexType.QUADTREE, False)

        result = RangeQueryRaw.SpatialRangeQuery(
            poi_point_rdd, loads("POLYGON((0 0, 0 20, 20 20, 20 0, 0 0))"),
            True, True)

        rdd = result.to_rdd()

        assert len(rdd.collect()) == 4

        df_without_column_names = Adapter.toDf(result, self.spark)

        raw_geometries = self.__row_to_list(df_without_column_names.collect())

        assert [point[0].wkt for point in raw_geometries] == [
            'POINT (9 8)', 'POINT (4 3)', 'POINT (12 1)', 'POINT (11 5)'
        ]
        assert df_without_column_names.count() == 4
        assert df_without_column_names.schema == StructType(
            [StructField("geometry", GeometryType())])

        df = Adapter.toDf(result, self.spark, ["poi_id", "poi_name"])

        assert df.count() == 4
        assert df.columns == ["geometry", "poi_id", "poi_name"]
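    # Hedged sketch (not shown in the source snippets): the tests above and
    # below call a private helper self.__row_to_list; a minimal assumed
    # implementation flattens each pyspark.sql.Row from collect() into a
    # plain Python list.
    def __row_to_list(self, rows):
        return [list(row) for row in rows]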
Example 2
    def test_spatial_join_query_flat_to_df(self):
        poi_point_rdd = WktReader.readToGeometryRDD(self.sc, bank_csv_path, 1,
                                                    False, False)
        areas_polygon_rdd = WktReader.readToGeometryRDD(
            self.sc, areas_csv_path, 1, False, False)
        poi_point_rdd.analyze()
        areas_polygon_rdd.analyze()

        poi_point_rdd.spatialPartitioning(GridType.QUADTREE)
        areas_polygon_rdd.spatialPartitioning(poi_point_rdd.getPartitioner())

        jvm_sedona_rdd = JoinQueryRaw.SpatialJoinQueryFlat(
            poi_point_rdd, areas_polygon_rdd, False, True)

        pois_within_areas_with_default_column_names = Adapter.toDf(
            jvm_sedona_rdd, self.spark)

        assert pois_within_areas_with_default_column_names.count() == 5

        pois_within_areas_with_passed_column_names = Adapter.toDf(
            jvm_sedona_rdd, ["area_id", "area_name"], ["poi_id", "poi_name"],
            self.spark)

        assert pois_within_areas_with_passed_column_names.count() == 5

        assert pois_within_areas_with_passed_column_names.columns == [
            "leftgeometry", "area_id", "area_name", "rightgeometry", "poi_id",
            "poi_name"
        ]

        assert pois_within_areas_with_default_column_names.schema == StructType(
            [
                StructField("leftgeometry", GeometryType()),
                StructField("rightgeometry", GeometryType()),
            ])

        left_geometries_raw = pois_within_areas_with_default_column_names. \
            selectExpr("ST_AsText(leftgeometry)"). \
            collect()

        left_geometries = self.__row_to_list(left_geometries_raw)

        right_geometries_raw = pois_within_areas_with_default_column_names. \
            selectExpr("ST_AsText(rightgeometry)"). \
            collect()

        right_geometries = self.__row_to_list(right_geometries_raw)

        # Ignore the ordering of these
        assert set(geom[0] for geom in left_geometries) == set([
            'POLYGON ((0 4, -3 3, -8 6, -6 8, -2 9, 0 4))',
            'POLYGON ((10 3, 10 6, 14 6, 14 3, 10 3))',
            'POLYGON ((2 2, 2 4, 3 5, 7 5, 9 3, 8 1, 4 1, 2 2))',
            'POLYGON ((-1 -1, -1 -3, -2 -5, -6 -8, -5 -2, -3 -2, -1 -1))',
            'POLYGON ((-1 -1, -1 -3, -2 -5, -6 -8, -5 -2, -3 -2, -1 -1))'
        ])
        assert set(geom[0] for geom in right_geometries) == set([
            'POINT (-3 5)', 'POINT (11 5)', 'POINT (4 3)', 'POINT (-1 -1)',
            'POINT (-4 -5)'
        ])
Example 3
    def test_to_spatial_rdd_df_and_geom_field_name(self):
        spatial_df = self._create_spatial_point_table()

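        # Note: toSpatialRdd is called with two different geometry column
        # names below; only the RDD built from the second call ("s") is
        # analyzed and asserted against.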
        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "geom")
        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "s")
        spatial_rdd.analyze()

        assert spatial_rdd.approximateTotalCount == 121960
        assert spatial_rdd.boundaryEnvelope == Envelope(
            -179.147236, 179.475569, -14.548699, 71.35513400000001)
Example 4
    def test_geojson_to_dataframe(self):
        spatial_rdd = PolygonRDD(self.spark.sparkContext,
                                 geojson_input_location,
                                 FileDataSplitter.GEOJSON, True)

        spatial_rdd.analyze()
        Adapter.toDf(spatial_rdd, self.spark).show()
        df = Adapter.toDf(spatial_rdd, self.spark)

        assert (df.columns[1] == "STATEFP")
Example 5
    def test_read_shapefile_to_dataframe(self):
        spatial_rdd = ShapefileReader.readToGeometryRDD(
            self.spark.sparkContext, shape_file_input_location)
        spatial_rdd.analyze()
        logging.info(spatial_rdd.fieldNames)
        df = Adapter.toDf(spatial_rdd, self.spark)
        df.show()
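        # Hedged addition (not in the source snippet): to carry the shapefile
        # attribute names into DataFrame columns, the RDD's field names can be
        # passed through the multi-argument Adapter.toDf form used in the
        # later examples.
        df_named = Adapter.toDf(spatial_rdd, spatial_rdd.fieldNames, self.spark)
        df_named.show()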
Example 6
    def test_read_csv_point_into_spatial_rdd(self):
        df = self.spark.read.\
            format("csv").\
            option("delimiter", "\t").\
            option("header", "false").\
            load(area_lm_point_input_location)

        df.show()
        df.createOrReplaceTempView("inputtable")

        spatial_df = self.spark.sql(
            "select ST_PointFromText(inputtable._c0,\",\") as arealandmark from inputtable"
        )
        spatial_df.show()
        spatial_df.printSchema()

        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "arealandmark")
        spatial_rdd.analyze()
        Adapter.toDf(spatial_rdd, self.spark).show()
Example 7
    def test_read_mixed_wkt_geometries_into_spatial_rdd_with_unique_id(self):
        df = self.spark.read.format("csv").\
            option("delimiter", "\t").\
            option("header", "false").\
            load(mixed_wkt_geometry_input_location)

        df.show()
        df.createOrReplaceTempView("inputtable")

        spatial_df = self.spark.sql(
            "select ST_GeomFromWKT(inputtable._c0) as usacounty, inputtable._c3, inputtable._c5 from inputtable"
        )
        spatial_df.show()
        spatial_df.printSchema()

        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "usacounty")
        spatial_rdd.analyze()
        assert len(Adapter.toDf(spatial_rdd, self.spark).columns) == 3
        Adapter.toDf(spatial_rdd, self.spark).show()
Example 8
    def test_distance_join_result_to_dataframe(self):
        point_csv_df = self.spark.\
            read.\
            format("csv").\
            option("delimiter", ",").\
            option("header", "false").load(
                area_lm_point_input_location
        )
        point_csv_df.createOrReplaceTempView("pointtable")
        point_df = self.spark.sql(
            "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
        )

        point_rdd = Adapter.toSpatialRdd(point_df, "arealandmark")
        point_rdd.analyze()

        polygon_wkt_df = self.spark.read.\
            format("csv").\
            option("delimiter", "\t").\
            option("header", "false").load(
                mixed_wkt_geometry_input_location
        )

        polygon_wkt_df.createOrReplaceTempView("polygontable")
        polygon_df = self.spark.\
            sql("select ST_GeomFromWKT(polygontable._c0) as usacounty from polygontable")

        polygon_rdd = Adapter.toSpatialRdd(polygon_df, "usacounty")
        polygon_rdd.analyze()
        circle_rdd = CircleRDD(polygon_rdd, 0.2)

        point_rdd.spatialPartitioning(GridType.QUADTREE)
        circle_rdd.spatialPartitioning(point_rdd.getPartitioner())

        point_rdd.buildIndex(IndexType.QUADTREE, True)

        join_result_pair_rdd = JoinQuery.\
            DistanceJoinQueryFlat(point_rdd, circle_rdd, True, True)

        join_result_df = Adapter.toDf(join_result_pair_rdd, self.spark)
        join_result_df.printSchema()
        join_result_df.show()
Example 9
    def test_geojson_to_dataframe(self):
        spatial_rdd = PolygonRDD(self.spark.sparkContext,
                                 geojson_input_location,
                                 FileDataSplitter.GEOJSON, True)

        spatial_rdd.analyze()

        df = Adapter.toDf(spatial_rdd, self.spark).\
            withColumn("geometry", expr("ST_GeomFromWKT(geometry)"))
        df.show()
        assert (df.columns[1] == "STATEFP")
Example 10
    def test_load_id_column_data_check(self):
        spatial_rdd = PolygonRDD(self.spark.sparkContext,
                                 geojson_id_input_location,
                                 FileDataSplitter.GEOJSON, True)
        spatial_rdd.analyze()
        df = Adapter.toDf(spatial_rdd, self.spark)
        df.show()
        try:
            assert len(df.columns) == 3
        except AssertionError:
            assert len(df.columns) == 4
        assert df.count() == 1
Example 11
    def test_to_df_srdd_fn_spark(self):
        spatial_rdd = PolygonRDD(self.spark.sparkContext,
                                 geojson_input_location,
                                 FileDataSplitter.GEOJSON, True)
        spatial_rdd.analyze()
        assert spatial_rdd.approximateTotalCount == 1001

        spatial_columns = [
            "state_id", "county_id", "tract_id", "bg_id", "fips", "fips_short",
            "bg_nr", "type", "code1", "code2"
        ]
        spatial_df = Adapter.toDf(spatial_rdd, spatial_columns, self.spark)

        spatial_df.show()

        assert spatial_df.columns == ["geometry", *spatial_columns]
        assert spatial_df.count() == 1001
Example 12
def get_bdy_rdd(spark, bdy):
    # load boundaries from Postgres
    sql = """SELECT {}, name as {}, st_astext(geom) as wkt_geom
             FROM admin_bdys_202008.{}_analysis""".format(
        bdy["id_field"], bdy["name_field"], bdy["name"])
    bdy_df = get_dataframe_from_postgres(spark, sql)

    # create geometries from WKT strings into new DataFrame
    bdy_df2 = bdy_df\
        .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
        .drop("wkt_geom")

    # create rdd
    output_rdd = Adapter.toSpatialRdd(bdy_df2, "geom")
    output_rdd.analyze()

    bdy_df2.unpersist()
    bdy_df.unpersist()

    return output_rdd
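
# Hedged usage sketch (not from the source): the boundary dict keys mirror
# what get_bdy_rdd reads (id_field, name_field, name); the values below are
# placeholders, and `spark` is assumed to be an active, Sedona-registered
# SparkSession.
example_bdy = {"id_field": "lga_pid", "name_field": "lga_name", "name": "lga"}
bdy_rdd = get_bdy_rdd(spark, example_bdy)
bdy_rdd.spatialPartitioning(GridType.KDBTREE)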
Example 13
    def test_distance_join_query_flat_to_df(self):
        poi_point_rdd = WktReader.readToGeometryRDD(self.sc, bank_csv_path, 1,
                                                    False, False)
        circle_rdd = CircleRDD(poi_point_rdd, 2.0)

        circle_rdd.analyze()
        poi_point_rdd.analyze()

        poi_point_rdd.spatialPartitioning(GridType.QUADTREE)
        circle_rdd.spatialPartitioning(poi_point_rdd.getPartitioner())

        jvm_sedona_rdd = JoinQueryRaw.DistanceJoinQueryFlat(
            poi_point_rdd, circle_rdd, False, True)
        df_sedona_rdd = Adapter.toDf(jvm_sedona_rdd,
                                     ["poi_from_id", "poi_from_name"],
                                     ["poi_to_id", "poi_to_name"], self.spark)

        assert df_sedona_rdd.count() == 10
        assert df_sedona_rdd.columns == [
            "leftgeometry", "poi_from_id", "poi_from_name", "rightgeometry",
            "poi_to_id", "poi_to_name"
        ]
Example 14
    def test_spatial_join_to_df(self):
        poi_point_rdd = WktReader.readToGeometryRDD(self.sc, bank_csv_path, 1,
                                                    False, False)
        areas_polygon_rdd = WktReader.readToGeometryRDD(
            self.sc, areas_csv_path, 1, False, False)
        poi_point_rdd.analyze()
        areas_polygon_rdd.analyze()

        poi_point_rdd.spatialPartitioning(GridType.QUADTREE)
        areas_polygon_rdd.spatialPartitioning(poi_point_rdd.getPartitioner())

        jvm_sedona_rdd = JoinQueryRaw.spatialJoin(poi_point_rdd,
                                                  areas_polygon_rdd,
                                                  JoinParams())
        sedona_df = Adapter.toDf(jvm_sedona_rdd, ["area_id", "area_name"],
                                 ["poi_id", "poi_name"], self.spark)

        assert sedona_df.count() == 5
        assert sedona_df.columns == [
            "leftgeometry", "area_id", "area_name", "rightgeometry", "poi_id",
            "poi_name"
        ]
Example 15
def rdd_filesave_join():
    logger.info("\t - RDD file save join start")

    full_start_time = datetime.now()

    # ----------------------------------------------------------
    # get spark session and context
    # ----------------------------------------------------------

    start_time = datetime.now()

    spark = create_spark_session()
    sc = spark.sparkContext
    sedona_version = pkg_resources.get_distribution("sedona").version

    logger.info(
        "\t - PySpark {} session initiated with Apache Sedona {}: {}".format(
            sc.version, sedona_version,
            datetime.now() - start_time))

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------

    start_time = datetime.now()

    offset = 0  # The point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path),
                         offset, FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t\t - GNAF RDD created: {}".format(datetime.now() -
                                                     start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------

    for bdy in bdy_list:
        start_time = datetime.now()

        # load boundaries
        # create geometries from WKT strings into new DataFrame
        bdy_df = spark.read.parquet(os.path.join(output_path, bdy["name"])) \
            .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
            .drop("wkt_geom")

        # create bdy rdd
        bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")
        bdy_rdd.analyze()

        bdy_df.unpersist()

        bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())
        bdy_rdd.spatialPartitionedRDD.persist(
            StorageLevel.MEMORY_ONLY)  # no need to persist(?) - used once

        # run the join - returns a PairRDD with 1 boundary to 1-N points
        # e.g. [Geometry: Polygon userData: WA32       TANGNEY WA, [Geometry: Point userData: GAWA_146792426	WA, ...]]
        result_pair_rdd = JoinQuery.SpatialJoinQueryFlat(
            point_rdd, bdy_rdd, True, True)
        # jim = result_pair_rdd.take(10)
        # for row in jim:
        #     print(row)

        result_pair_rdd.saveAsTextFile(
            os.path.join(output_path,
                         "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

        # # flat map values to have one point to bdy matched pair
        # flat_mapped_rdd = result_pair_rdd.flatMapValues(lambda x: x)
        #
        # # map values to create RDD row of gnaf & bdy IDs, plus state data
        # mapped_rdd = flat_mapped_rdd.map(
        #     lambda x: [x[1].getUserData().split("\t")[0],
        #                x[0].getUserData().split("\t")[0],
        #                x[0].getUserData().split("\t")[1]]
        # )
        #
        # # convert result to a dataframe of the following schema
        # schema = t.StructType([t.StructField("gnaf_pid", t.StringType(), False),
        #                        t.StructField(bdy["id_field"], t.StringType(), False),
        #                        t.StructField(bdy["name_field"], t.StringType(), False)])
        #
        # join_df = spark.createDataFrame(mapped_rdd, schema)
        #
        # # save result to disk
        # join_df.write \
        #     .option("compression", "gzip") \
        #     .mode("overwrite") \
        #     .parquet(os.path.join(output_path, "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

        logger.info("\t\t - GNAF points bdy tagged with {}: {}".format(
            bdy["name"],
            datetime.now() - start_time))

    # cleanup
    spark.stop()

    logger.info("\t - RDD file save join done: {}".format(datetime.now() -
                                                          full_start_time))
Example 16
def run_test(test_name, num_partitions, max_vertices):

    # create spark session object
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("Spatial Join SQL Benchmark")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.jars.packages",
                'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,'
                'org.datasyslab:geotools-wrapper:geotools-24.1')
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.executor.cores", 4)
        .config("spark.driver.memory", "8g")
        .getOrCreate()
    )

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    start_time = datetime.now()

    # load gnaf points and create geoms
    point_df = (
        spark.read.parquet(os.path.join(input_path, "address_principals"))
        .select("gnaf_pid", "state", "geom")
        .withColumnRenamed("state", "gnaf_state")
        .repartition(num_partitions, "gnaf_state")
    )

    # load boundaries and create geoms
    if max_vertices is not None:
        bdy_vertex_name = "{}_{}".format(bdy_name, max_vertices)
    else:
        bdy_vertex_name = bdy_name

    bdy_df = (
        spark.read.parquet(os.path.join(input_path, bdy_vertex_name))
        .select(bdy_id, "state", "geom")
        .repartition(num_partitions, "state")
        .cache()
    )
    bdy_count = bdy_df.count()

    # create RDDs - analysed partitioned and indexed
    point_rdd = Adapter.toSpatialRdd(point_df, "geom")
    bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")

    point_df.unpersist()
    bdy_df.unpersist()

    point_rdd.analyze()
    bdy_rdd.analyze()

    point_rdd.spatialPartitioning(GridType.KDBTREE)
    bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())

    point_rdd.buildIndex(IndexType.RTREE, True)
    bdy_rdd.buildIndex(IndexType.RTREE, True)

    # run join query
    join_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(point_rdd, bdy_rdd, True,
                                                      True)

    # convert SedonaPairRDD to dataframe
    join_df = Adapter.toDf(join_pair_rdd, bdy_rdd.fieldNames,
                           point_rdd.fieldNames, spark)
    # join_df.printSchema()

    # | -- leftgeometry: geometry(nullable=true)
    # | -- <bdy_id>: string(nullable=true)
    # | -- state: string(nullable=true)
    # | -- rightgeometry: geometry(nullable=true)
    # | -- gnaf_pid: string(nullable=true)
    # | -- gnaf_state: string(nullable=true)

    join_df2 = (join_df
                # .filter((join_df["state"] == join_df["gnaf_state"]))
                .select("gnaf_pid", bdy_id, "state")
                .dropDuplicates(["gnaf_pid", bdy_id])
                .cache()
                )

    # output to files
    if "warmup" in test_name:
        name = "gnaf_rdd_{}_{}_{}".format(bdy_id, max_vertices, num_partitions)

        (join_df2.repartition(50)
         .write.partitionBy("state")
         .option("compression", "gzip")
         .mode("overwrite")
         .parquet(os.path.join(output_path, name)))

    # output vars
    join_count = join_df2.count()
    time_taken = datetime.now() - start_time

    if "warmup" in test_name:
        print("{},{},{},{},{},{}".format(test_name, join_count, bdy_count,
                                         max_vertices, num_partitions,
                                         time_taken))
    else:
        log_file.write("{},{},{},{},{},{}\n".format(test_name, join_count,
                                                    bdy_count, max_vertices,
                                                    num_partitions,
                                                    time_taken))

    # cleanup
    spark.stop()
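
# Hedged driver sketch (not from the source): run_test depends on module-level
# names such as input_path, output_path, bdy_name, bdy_id and log_file; the
# parameter values below are illustrative only.
if __name__ == "__main__":
    run_test("warmup_lga", num_partitions=100, max_vertices=1000)
    run_test("lga", num_partitions=100, max_vertices=1000)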