Example 1
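This unit test converts a spatial point DataFrame to a SpatialRDD by passing an explicit field name to Adapter.toSpatialRdd, then checks the approximate record count and boundary envelope reported after analyze().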
    def test_to_spatial_rdd_df_and_geom_field_name(self):
        spatial_df = self._create_spatial_point_table()

        # Adapter.toSpatialRdd takes the DataFrame and a field name;
        # the second call overwrites the RDD created from the "geom" field
        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "geom")
        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "s")
        spatial_rdd.analyze()

        assert spatial_rdd.approximateTotalCount == 121960
        assert spatial_rdd.boundaryEnvelope == Envelope(
            -179.147236, 179.475569, -14.548699, 71.35513400000001)
Example 2
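This test builds a point RDD and a polygon-derived CircleRDD, runs an indexed distance join with JoinQuery.DistanceJoinQueryFlat, and converts the resulting pair RDD back to a DataFrame with Adapter.toDf.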
    def test_distance_join_result_to_dataframe(self):
        point_csv_df = (
            self.spark.read
            .format("csv")
            .option("delimiter", ",")
            .option("header", "false")
            .load(area_lm_point_input_location)
        )
        point_csv_df.createOrReplaceTempView("pointtable")
        point_df = self.spark.sql(
            "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
        )

        point_rdd = Adapter.toSpatialRdd(point_df, "arealandmark")
        point_rdd.analyze()

        polygon_wkt_df = (
            self.spark.read
            .format("csv")
            .option("delimiter", "\t")
            .option("header", "false")
            .load(mixed_wkt_geometry_input_location)
        )

        polygon_wkt_df.createOrReplaceTempView("polygontable")
        polygon_df = self.spark.sql(
            "select ST_GeomFromWKT(polygontable._c0) as usacounty from polygontable"
        )

        polygon_rdd = Adapter.toSpatialRdd(polygon_df, "usacounty")
        polygon_rdd.analyze()
        # wrap each polygon with the 0.2 search radius used by the distance join
        circle_rdd = CircleRDD(polygon_rdd, 0.2)

        # partition the points, then partition the circles with the same grid
        point_rdd.spatialPartitioning(GridType.QUADTREE)
        circle_rdd.spatialPartitioning(point_rdd.getPartitioner())

        # index the partitioned point RDD
        point_rdd.buildIndex(IndexType.QUADTREE, True)

        # flat distance join: one output pair per matching point/circle combination
        join_result_pair_rdd = JoinQuery.DistanceJoinQueryFlat(
            point_rdd, circle_rdd, True, True)

        join_result_df = Adapter.toDf(join_result_pair_rdd, self.spark)
        join_result_df.printSchema()
        join_result_df.show()
Example 3
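This test loads mixed WKT geometries from a tab-delimited file, converts them to a geometry column with ST_GeomFromWKT, turns the DataFrame into a SpatialRDD, and asserts that converting the RDD back to a DataFrame yields a single column.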
    def test_read_mixed_wkt_geometries_into_spatial_rdd(self):
        df = self.spark.read.format("csv").\
            option("delimiter", "\t").\
            option("header", "false").load(mixed_wkt_geometry_input_location)

        df.show()
        df.createOrReplaceTempView("inputtable")
        spatial_df = self.spark.sql(
            "select ST_GeomFromWKT(inputtable._c0) as usacounty from inputtable"
        )
        spatial_df.show()
        spatial_df.printSchema()
        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "usacounty")
        spatial_rdd.analyze()
        converted_df = Adapter.toDf(spatial_rdd, self.spark)
        converted_df.show()
        assert len(converted_df.columns) == 1
Example 4
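This test parses point coordinates with ST_PointFromText, converts the resulting DataFrame to a SpatialRDD, and converts it back to a DataFrame for display.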
    def test_read_csv_point_into_spatial_rdd(self):
        df = self.spark.read.\
            format("csv").\
            option("delimiter", "\t").\
            option("header", "false").\
            load(area_lm_point_input_location)

        df.show()
        df.createOrReplaceTempView("inputtable")

        spatial_df = self.spark.sql(
            "select ST_PointFromText(inputtable._c0,\",\") as arealandmark from inputtable"
        )
        spatial_df.show()
        spatial_df.printSchema()

        spatial_rdd = Adapter.toSpatialRdd(spatial_df, "arealandmark")
        spatial_rdd.analyze()
        Adapter.toDf(spatial_rdd, self.spark).show()
Example 5
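This helper reads boundary polygons from Postgres as WKT, builds geometries with ST_GeomFromWKT, and returns an analyzed SpatialRDD.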
def get_bdy_rdd(spark, bdy):
    # load boundaries from Postgres
    sql = """SELECT {}, name as {}, st_astext(geom) as wkt_geom
             FROM admin_bdys_202008.{}_analysis""".format(
        bdy["id_field"], bdy["name_field"], bdy["name"])
    bdy_df = get_dataframe_from_postgres(spark, sql)

    # create geometries from WKT strings into new DataFrame
    bdy_df2 = bdy_df\
        .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
        .drop("wkt_geom")

    # create rdd
    output_rdd = Adapter.toSpatialRdd(bdy_df2, "geom")
    output_rdd.analyze()

    bdy_df2.unpersist()
    bdy_df.unpersist()

    return output_rdd
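A call to get_bdy_rdd might look like the sketch below; the dictionary keys match the ones the function reads, but the table and field values are hypothetical and not taken from the original script.

# hypothetical usage of get_bdy_rdd - the boundary table and field names are illustrative only
bdy = {
    "name": "local_government_areas",  # queried as admin_bdys_202008.local_government_areas_analysis
    "id_field": "lga_pid",             # ID column selected as-is
    "name_field": "lga_name",          # alias applied to the "name" column
}
lga_rdd = get_bdy_rdd(spark, bdy)
print(lga_rdd.approximateTotalCount, lga_rdd.boundaryEnvelope)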
Example 6
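This function loads GNAF address points into a partitioned, indexed PointRDD, then loops over a list of boundary datasets, spatially joining each one against the points and saving the resulting pair RDDs as text files.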
def rdd_filesave_join():
    logger.info("\t - RDD file save join start")

    full_start_time = datetime.now()

    # ----------------------------------------------------------
    # get spark session and context
    # ----------------------------------------------------------

    start_time = datetime.now()

    spark = create_spark_session()
    sc = spark.sparkContext
    sedona_version = pkg_resources.get_distribution("sedona").version

    logger.info(
        "\t - PySpark {} session initiated with Apache Sedona {}: {}".format(
            sc.version, sedona_version,
            datetime.now() - start_time))

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------

    start_time = datetime.now()

    offset = 0  # The point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path),
                         offset, FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t\t - GNAF RDD created: {}".format(datetime.now() -
                                                     start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------

    for bdy in bdy_list:
        start_time = datetime.now()

        # load boundaries
        # create geometries from WKT strings into new DataFrame
        bdy_df = spark.read.parquet(os.path.join(output_path, bdy["name"])) \
            .withColumn("geom", f.expr("st_geomFromWKT(wkt_geom)")) \
            .drop("wkt_geom")

        # create bdy rdd
        bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")
        bdy_rdd.analyze()

        bdy_df.unpersist()

        bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())
        bdy_rdd.spatialPartitionedRDD.persist(
            StorageLevel.MEMORY_ONLY)  # no need to persist(?) - used once

        # run the join - returns a PairRDD with 1 boundary to 1-N points
        # e.g. [Geometry: Polygon userData: WA32       TANGNEY WA, [Geometry: Point userData: GAWA_146792426	WA, ...]]
        result_pair_rdd = JoinQuery.SpatialJoinQueryFlat(
            point_rdd, bdy_rdd, True, True)
        # jim = result_pair_rdd.take(10)
        # for row in jim:
        #     print(row)

        result_pair_rdd.saveAsTextFile(
            os.path.join(output_path,
                         "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

        # # flat map values to have one point to bdy matched pair
        # flat_mapped_rdd = result_pair_rdd.flatMapValues(lambda x: x)
        #
        # # map values to create RDD row of gnaf & bdy IDs, plus state data
        # mapped_rdd = flat_mapped_rdd.map(
        #     lambda x: [x[1].getUserData().split("\t")[0],
        #                x[0].getUserData().split("\t")[0],
        #                x[0].getUserData().split("\t")[1]]
        # )
        #
        # # convert result to a dataframe of the following schema
        # schema = t.StructType([t.StructField("gnaf_pid", t.StringType(), False),
        #                        t.StructField(bdy["id_field"], t.StringType(), False),
        #                        t.StructField(bdy["name_field"], t.StringType(), False)])
        #
        # join_df = spark.createDataFrame(mapped_rdd, schema)
        #
        # # save result to disk
        # join_df.write \
        #     .option("compression", "gzip") \
        #     .mode("overwrite") \
        #     .parquet(os.path.join(output_path, "rdd_file_save_gnaf_with_{}".format(bdy["name"])))

        logger.info("\t\t - GNAF points bdy tagged with {}: {}".format(
            bdy["name"],
            datetime.now() - start_time))

    # cleanup
    spark.stop()

    logger.info("\t - RDD file save join done: {}".format(datetime.now() -
                                                          full_start_time))
Example 7
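This benchmark creates a Sedona-enabled Spark session, converts GNAF point and boundary DataFrames into partitioned, indexed SpatialRDDs, runs a flat spatial join with JoinQueryRaw, and records the join count and elapsed time.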
def run_test(test_name, num_partitions, max_vertices):

    # create spark session object
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("Spatial Join SQL Benchmark")
        .config("spark.sql.session.timeZone", "UTC")
        .config("spark.sql.debug.maxToStringFields", 100)
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("spark.jars.packages",
                "org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,"
                "org.datasyslab:geotools-wrapper:geotools-24.1")
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.executor.cores", 4)
        .config("spark.driver.memory", "8g")
        .getOrCreate()
    )

    # Add Sedona functions and types to Spark
    SedonaRegistrator.registerAll(spark)

    start_time = datetime.now()

    # load gnaf points and create geoms
    point_df = (
        spark.read.parquet(os.path.join(input_path, "address_principals"))
        .select("gnaf_pid", "state", "geom")
        .withColumnRenamed("state", "gnaf_state")
        .repartition(num_partitions, "gnaf_state")
    )

    # load boundaries and create geoms
    if max_vertices is not None:
        bdy_vertex_name = "{}_{}".format(bdy_name, max_vertices)
    else:
        bdy_vertex_name = bdy_name

    bdy_df = (
        spark.read.parquet(os.path.join(input_path, bdy_vertex_name))
        .select(bdy_id, "state", "geom")
        .repartition(num_partitions, "state")
        .cache()
    )
    bdy_count = bdy_df.count()

    # create RDDs - analysed partitioned and indexed
    point_rdd = Adapter.toSpatialRdd(point_df, "geom")
    bdy_rdd = Adapter.toSpatialRdd(bdy_df, "geom")

    point_df.unpersist()
    bdy_df.unpersist()

    point_rdd.analyze()
    bdy_rdd.analyze()

    point_rdd.spatialPartitioning(GridType.KDBTREE)
    bdy_rdd.spatialPartitioning(point_rdd.getPartitioner())

    point_rdd.buildIndex(IndexType.RTREE, True)
    bdy_rdd.buildIndex(IndexType.RTREE, True)

    # run join query
    join_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(point_rdd, bdy_rdd, True,
                                                      True)

    # convert SedonaPairRDD to dataframe
    join_df = Adapter.toDf(join_pair_rdd, bdy_rdd.fieldNames,
                           point_rdd.fieldNames, spark)
    # join_df.printSchema()

    # | -- leftgeometry: geometry(nullable=true)
    # | -- <bdy_id>: string(nullable=true)
    # | -- state: string(nullable=true)
    # | -- rightgeometry: geometry(nullable=true)
    # | -- gnaf_pid: string(nullable=true)
    # | -- gnaf_state: string(nullable=true)

    join_df2 = (join_df
                # .filter((join_df["state"] == join_df["gnaf_state"]))
                .select("gnaf_pid", bdy_id, "state")
                .dropDuplicates(["gnaf_pid", bdy_id])
                .cache()
                )

    # output to files
    if "warmup" in test_name:
        name = "gnaf_rdd_{}_{}_{}".format(bdy_id, max_vertices, num_partitions)

        (join_df2
         .repartition(50)
         .write
         .partitionBy("state")
         .option("compression", "gzip")
         .mode("overwrite")
         .parquet(os.path.join(output_path, name)))

    # output vars
    join_count = join_df2.count()
    time_taken = datetime.now() - start_time

    if "warmup" in test_name:
        print("{},{},{},{},{},{}".format(test_name, join_count, bdy_count,
                                         max_vertices, num_partitions,
                                         time_taken))
    else:
        log_file.write("{},{},{},{},{},{}\n".format(test_name, join_count,
                                                    bdy_count, max_vertices,
                                                    num_partitions,
                                                    time_taken))

    # cleanup
    spark.stop()
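run_test relies on module-level names (input_path, output_path, bdy_name, bdy_id and log_file) that are defined outside this excerpt. A minimal driver, with purely illustrative parameter values and log file name, could look like this:

# hypothetical driver - parameter values and the log file name are illustrative only
if __name__ == "__main__":
    with open("rdd_join_benchmark.csv", "w") as log_file:
        # a "warmup" run prints its result and writes the joined parquet output
        run_test("warmup", num_partitions=100, max_vertices=20)

        # subsequent runs append their timings to log_file
        for num_partitions in (50, 100, 200):
            run_test("rdd_join", num_partitions, max_vertices=20)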