Example #1
    def spark(self):
        if not hasattr(self, "__spark"):
            upload_jars()
            spark = (SparkSession.builder
                     .config("spark.serializer", KryoSerializer.getName)
                     .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
                     .master("local[*]")
                     .getOrCreate())

            GeoSparkRegistrator.registerAll(spark)

            setattr(self, "__spark", spark)
        return getattr(self, "__spark")
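    # imports assumed by this excerpt (not shown above); with geospark 1.3.x they
    # would typically be:
    #   from pyspark.sql import SparkSession
    #   from geospark.register import GeoSparkRegistrator, upload_jars
    #   from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
    # the method itself is presumably wrapped in @property so callers can use
    # self.spark as a lazily created, cached session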
Example #2
def main():
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    # theoretically only need to do this once
    upload_jars()

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.executor.cores", 1)
             .config("spark.cores.max", num_processors)
             .config("spark.driver.memory", "8g")
             .config("spark.driver.maxResultSize", "1g")
             .getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    # set Sedona spatial indexing and partitioning config in Spark session
    # (no effect on the "small" spatial join query in this script. Will improve bigger queries)
    spark.conf.set("geospark.global.index", "true")
    spark.conf.set("geospark.global.indextype", "rtree")
    spark.conf.set("geospark.join.gridtype", "kdbtree")

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # load boundaries (geometries are Well Known Text strings)
    bdy_wkt_df = spark.read.parquet(os.path.join(input_path, "boundaries"))
    # bdy_wkt_df.printSchema()
    # bdy_wkt_df.show(5)

    # create view to enable SQL queries
    bdy_wkt_df.createOrReplaceTempView("bdy_wkt")

    # create geometries from WKT strings into new DataFrame
    # new DF will be spatially indexed automatically
    bdy_df = spark.sql(
        "select bdy_id, st_geomFromWKT(wkt_geom) as geometry from bdy_wkt")

    # repartition and cache for performance (no effect on the "small" spatial join query here)
    # bdy_df.repartition(spark.sparkContext.defaultParallelism).cache().count()
    # bdy_df.printSchema()
    # bdy_df.show(5)

    # create view to enable SQL queries
    bdy_df.createOrReplaceTempView("bdy")

    logger.info("\t - Loaded and spatially enabled {:,} boundaries: {}".format(
        bdy_df.count(),
        datetime.now() - start_time))
    start_time = datetime.now()

    # load points (spatial data is lat/long fields)
    point_wkt_df = spark.read.parquet(os.path.join(input_path, "points"))
    # point_wkt_df.printSchema()
    # point_wkt_df.show(5)

    # create view to enable SQL queries
    point_wkt_df.createOrReplaceTempView("point_wkt")

    # create geometries from lat/long fields into new DataFrame
    # new DF will be spatially indexed automatically
    sql = """select point_id, 
                    st_point(cast(longitude as decimal(9, 6)), cast(latitude as decimal(8, 6))) as geometry
             from point_wkt"""
    point_df = spark.sql(sql)

    # repartition and cache for performance (no effect on the "small" spatial join query here)
    # point_df.repartition(spark.sparkContext.defaultParallelism).cache().count()
    # point_df.printSchema()
    # point_df.show(5)

    # create view to enable SQL queries
    point_df.createOrReplaceTempView("pnt")

    logger.info("\t - Loaded and spatially enabled {:,} points: {}".format(
        point_df.count(),
        datetime.now() - start_time))
    start_time = datetime.now()

    # run spatial join to boundary tag the points
    # notes:
    #   - spatial partitions and indexes for join will be created automatically
    #   - it's an inner join, so points that fall outside every boundary will be dropped
    #     (a commented alternative that keeps them is sketched below)
    sql = """SELECT pnt.point_id,
                    bdy.bdy_id, 
                    pnt.geometry
             FROM pnt
             INNER JOIN bdy ON ST_Intersects(pnt.geometry, bdy.geometry)"""
    join_df = spark.sql(sql)
    # join_df.explain()
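    # hedged alternative if untagged points must be kept: the optimised spatial
    # join only works as an INNER JOIN, so left join its result back onto the
    # full point view instead (names below are illustrative, not from the script)
    # tag_df = join_df.select("point_id", "bdy_id")
    # tag_df.createOrReplaceTempView("tag")
    # all_point_df = spark.sql("""SELECT pnt.point_id, tag.bdy_id, pnt.geometry
    #                             FROM pnt
    #                             LEFT OUTER JOIN tag ON pnt.point_id = tag.point_id""")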

    # # output join DataFrame
    # join_df.write.option("compression", "gzip") \
    #     .mode("overwrite") \
    #     .parquet(os.path.join(input_path, "output"))

    num_joined_points = join_df.count()

    join_df.printSchema()
    join_df.show(5)

    logger.info("\t - {:,} points were boundary tagged: {}".format(
        num_joined_points,
        datetime.now() - start_time))

    # cleanup
    spark.stop()
Example #3
def main():
    start_time = datetime.now()

    # copy gnaf tables to CSV
    pg_conn = psycopg2.connect(local_pg_connect_string)
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, state
                 FROM gnaf_202008.{}
             ) TO STDOUT WITH CSV"""
    # sql = """COPY (
    #              SELECT gnaf_pid, street_locality_pid, locality_pid, alias_principal, primary_secondary, building_name,
    #                     lot_number, flat_number, level_number, number_first, number_last, street_name, street_type,
    #                     street_suffix, address, locality_name, postcode, state, locality_postcode, confidence,
    #                     legal_parcel_id, mb_2011_code, mb_2016_code, latitude, longitude, geocode_type, reliability
    #              FROM gnaf_202008.{}
    #          ) TO STDOUT WITH CSV"""

    # address principals
    with open(os.path.join(output_path, "gnaf_light.csv"), 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)
        # pg_cur.copy_expert(sql.format("address_principals") + " HEADER", csv_file)

    # address aliases
    with open(os.path.join(output_path, "gnaf_light.csv"), 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_conn.close()

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() -
                                                              start_time))
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.cores.max", cpu_count())
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # load gnaf points
    df = spark.read \
        .option("header", True) \
        .option("inferSchema", True) \
        .csv(input_file_name)
    # df.printSchema()
    # df.show()

    # # manually assign field types (not needed here as inferSchema works)
    # df2 = (df
    #        .withColumn("confidence", df.confidence.cast(t.ShortType()))
    #        .withColumn("mb_2011_code", df.mb_2011_code.cast(t.LongType()))
    #        .withColumn("mb_2016_code", df.mb_2016_code.cast(t.LongType()))
    #        .withColumn("reliability", df.reliability.cast(t.ShortType()))
    #        .withColumn("longitude", df.longitude.cast(t.DoubleType()))
    #        .withColumn("latitude", df.latitude.cast(t.DoubleType()))
    #        )
    # # df2.printSchema()
    # # df2.show()

    # add point geometries (the repartition-by-longitude code for 400-500k row partitions is left commented out below)
    gnaf_df = df.withColumn("geom", f.expr("ST_Point(longitude, latitude)"))
    # .withColumnRenamed("gnaf_pid", "id")
    # .withColumn("partition_id", (f.percent_rank().over(Window.partitionBy().orderBy("longitude")) * f.lit(100.0))
    #             .cast(t.ShortType())) \
    # .repartitionByRange(100, "partition_id") \
    # gnaf_df.printSchema()

    # check partition counts
    gnaf_df.groupBy(f.spark_partition_id()).count().show()

    # write gnaf to gzipped parquet
    export_to_parquet(gnaf_df, "gnaf")
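    # export_to_parquet isn't shown in this snippet; based on the gzipped parquet
    # log message at the end of this function, a minimal sketch (not the original
    # helper) could be:
    # def export_to_parquet(df, name):
    #     df.write.option("compression", "gzip") \
    #         .mode("overwrite") \
    #         .parquet(os.path.join(output_path, name))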

    # export PG boundary tables to parquet
    export_bdys(spark, "commonwealth_electorates", "ce_pid")
    export_bdys(spark, "local_government_areas", "lga_pid")
    export_bdys(spark, "local_government_wards", "ward_pid")
    export_bdys(spark, "state_lower_house_electorates", "se_lower_pid")
    export_bdys(spark, "state_upper_house_electorates", "se_upper_pid")

    # cleanup
    spark.stop()

    logger.info(
        "\t - GNAF and boundaries exported to gzipped parquet files: {}".
        format(datetime.now() - start_time))
Example #4
import string, sys, re

import pandas as pd
import geopandas as gpd
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from geospark.register import upload_jars
from geospark.register import GeoSparkRegistrator

# Create Spark Session
spark = SparkSession.builder.\
    appName("SparkSessionExample").\
    getOrCreate()

# Uses the findspark Python package to upload the JAR files to the executors and nodes.
upload_jars()

# Registers all GeoSparkSQL functions
GeoSparkRegistrator.registerAll(spark)

# Load matrix of coordinates and US county data into Spark and GeoPandas
original_matrix_df = spark.read.format("csv").option("header", "true").load("geospark_matrix.csv")
original_geo_df = gpd.read_file("cb_2018_us_county_500k/cb_2018_us_county_500k.shp")

# Map each Polygon in the geometry field of original_geo_df to WKT (well-known text) format and load the result into Spark as counties_df
wkts = map(lambda g: str(g.to_wkt()), original_geo_df.geometry)
original_geo_df['wkt'] = pd.Series(wkts)
original_geo_df = original_geo_df.drop("geometry", axis=1)
counties_df = spark.createDataFrame(original_geo_df)

# Use Spark SQL to create new column location with each location as ST_POINT
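# The original example is truncated here. A minimal sketch of the step the
# comment above describes, assuming the matrix CSV has "longitude" and
# "latitude" columns (the column and result names are assumptions):
original_matrix_df.createOrReplaceTempView("matrix")
matrix_df = spark.sql(
    "select *, ST_Point(cast(longitude as decimal(24, 20)), cast(latitude as decimal(24, 20))) as location from matrix")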
Example #5
def main():
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "12g")
             .getOrCreate())
    #              .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    #              .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    #              .config("spark.sql.autoBroadcastJoinThreshold", -1)
    #              .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    #              .config("spark.driver.maxResultSize", "1g")
    #
    #              .config("spark.executor.cores", 1)
    #              .config("spark.executor.memory", "2g")

    # Register Apache Sedona (geospark) UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    logger.info("PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))
    logger.info("\t - Running on Python {}".format(
        sys.version.replace("\n", " ")))
    start_time = datetime.now()

    # # load gzip csv files
    # df = spark.read.csv(input_file_name)
    # # df = spark.read.csv(os.path.join(output_path, "testing"))
    # # df = spark.read.csv(os.path.join(output_path, "sydney"))
    # # df.printSchema()
    # # df.show()
    #
    # # # create small dataset to speed testing up
    # # testing_df = df.filter(f.col("_c0").isin(vehicle_id_list)).cache()
    # # print(testing_df.count())
    # # testing_df.repartition(1).write.option("compression", "gzip") \
    # #     .mode("overwrite") \
    # #     .csv(os.path.join(output_path, "testing"))
    #
    # # fix column types and names - for some unknown reason it's 3-4x faster than enforcing schema on load
    # df2 = (df.withColumnRenamed("_c0", "vehicle_id")
    #        .withColumn("longitude", df["_c1"].cast(t.DoubleType()))
    #        .withColumn("latitude", df["_c2"].cast(t.DoubleType()))
    #        .withColumn("speed", df["_c3"].cast(t.DoubleType()))
    #        .withColumn("bearing", df["_c4"].cast(t.DoubleType()))
    #        .withColumn("time_utc", df["_c5"].cast(t.TimestampType()))
    #        .withColumn("unix_time", df["_c6"].cast(t.IntegerType()))
    #        .withColumn("geom", f.expr("st_point(longitude, latitude)"))
    #        .drop("_c1")
    #        .drop("_c2")
    #        .drop("_c3")
    #        .drop("_c4")
    #        .drop("_c5")
    #        .drop("_c6")
    #        .repartition(f.to_date(f.col("time_utc")))
    #        )
    # # df2.printSchema()
    # # df2.show(10, False)
    #
    # df2.write.option("compression", "gzip") \
    #     .mode("overwrite") \
    #     .parquet(os.path.join(output_path, "step_1_schema_applied"))
    #
    # df.unpersist()
    # df2.unpersist()

    schema_df = spark.read.parquet(
        os.path.join(output_path, "step_1_schema_applied"))
    schema_df.createOrReplaceTempView("point")

    # # # get counts
    # # sql = """SELECT count(distinct vehicle_id) as unique_id_count,
    # #                 count(*) as point_count
    # #          FROM point"""
    # # area_df = spark.sql(sql)
    # # area_df.show()
    #
    # logger.info("Step 1 : {} points loaded : {}".format(schema_df.count(), datetime.now() - start_time))
    # # start_time = datetime.now()

    # --------------------------
    # output stuff
    # --------------------------

    # get_time_gap_stats(spark)
    export_trip_segments(spark)
    # export_small_area_data(spark)
    # export_single_id_data(spark)
    # export_trip_and_stop_data(spark)

    # --------------------------

    # cleanup
    spark.stop()
    pg_pool.closeall()
Example #6
def main():
    start_time = datetime.now()

    # ----------------------------------------------------------
    # copy gnaf tables from Postgres to a CSV file - a one off
    #   - export required fields only and no header
    # ----------------------------------------------------------

    pg_conn = pg_pool.getconn()
    pg_cur = pg_conn.cursor()

    sql = """COPY (
                 SELECT longitude, latitude, gnaf_pid, locality_pid, locality_name, postcode, state
                 FROM gnaf_202008.{}
             ) TO STDOUT WITH CSV"""

    # address principals
    with open(gnaf_csv_file_path, 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)

    # append address aliases
    with open(gnaf_csv_file_path, 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_pool.putconn(pg_conn)

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() -
                                                              start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create Spark session and context
    # ----------------------------------------------------------

    # upload Apache Sedona JARs
    upload_jars()

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Register Apache Sedona UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (no effect on the "small" spatial join query in this script. Will improve bigger queries)
    # spark.conf.set("geospark.global.index", "true")
    # spark.conf.set("geospark.global.indextype", "rtree")
    # spark.conf.set("geospark.join.gridtype", "kdbtree")

    sc = spark.sparkContext

    logger.info("\t - PySpark {} session initiated: {}".format(
        sc.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # ----------------------------------------------------------
    # create GNAF PointRDD from CSV file
    # ----------------------------------------------------------

    offset = 0  # The point long/lat fields start at column 0
    carry_other_attributes = True  # include non-geo columns

    point_rdd = PointRDD(sc, os.path.join(output_path, gnaf_csv_file_path),
                         offset, FileDataSplitter.CSV, carry_other_attributes)
    point_rdd.analyze()

    # add partitioning and indexing
    point_rdd.spatialPartitioning(GridType.KDBTREE)
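    # second argument True builds the index on the spatially partitioned RDD,
    # so the spatial join below can use it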
    point_rdd.buildIndex(IndexType.RTREE, True)

    # set Spark storage type - set to MEMORY_AND_DISK if low on memory
    point_rdd.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    logger.info("\t - GNAF RDD created: {}".format(datetime.now() -
                                                   start_time))

    # ----------------------------------------------------------
    # get boundary tags using a spatial join
    # ----------------------------------------------------------

    for bdy in bdy_list:
        bdy_tag(spark, point_rdd, bdy)
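    # bdy_tag isn't shown in this snippet; it presumably loads each boundary
    # layer into a polygon SpatialRDD, partitions it with point_rdd's partitioner,
    # runs an indexed spatial join (e.g. JoinQuery.SpatialJoinQueryFlat) against
    # point_rdd, and writes the point/boundary-id pairs to parquet as
    # "gnaf_with_<bdy name>" (the files merged and deleted further down)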

    # point_rdd.unpersist()  # no such method on a SpatialRDD

    # ----------------------------------------------------------
    # merge boundary tag dataframes with GNAF records
    #   - required because spatial joins are INNER JOIN only,
    #     need to add untagged GNAF points
    # ----------------------------------------------------------

    start_time = datetime.now()

    # create gnaf dataframe and SQL view
    gnaf_df = spark.read \
        .option("header", False) \
        .option("inferSchema", True) \
        .csv(gnaf_csv_file_path) \
        .drop("_C0") \
        .drop("_C1") \
        .withColumnRenamed("_C2", "gnaf_pid") \
        .withColumnRenamed("_C3", "locality_pid") \
        .withColumnRenamed("_C4", "locality_name") \
        .withColumnRenamed("_C5", "postcode") \
        .withColumnRenamed("_C6", "state")
    # gnaf_df.printSchema()
    # gnaf_df.show(10, False)

    gnaf_df.createOrReplaceTempView("pnt")

    # add bdy tags, one bdy type at a time
    for bdy in bdy_list:
        gnaf_df = join_bdy_tags(spark, bdy)
        gnaf_df.createOrReplaceTempView("pnt")
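    # join_bdy_tags isn't shown here; assuming each "gnaf_with_<bdy name>" parquet
    # holds (gnaf_pid, <bdy id>) pairs, a minimal sketch is a left join so untagged
    # GNAF points are kept (the bdy["name"] / bdy["id"] keys are assumptions):
    # def join_bdy_tags(spark, bdy):
    #     spark.read.parquet(os.path.join(output_path, "gnaf_with_{}".format(bdy["name"]))) \
    #         .createOrReplaceTempView("tag")
    #     return spark.sql("""SELECT pnt.*, tag.{}
    #                         FROM pnt
    #                         LEFT OUTER JOIN tag ON pnt.gnaf_pid = tag.gnaf_pid""".format(bdy["id"]))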

    # # add point geoms for output to Postgres - in the PostGIS specific EWKT format
    # final_df = gnaf_df.withColumn("geom", f.expr("concat('SRID=4326;POINT (', longitude, ' ', latitude, ')')")) \
    #     .drop("longitude") \
    #     .drop("latitude")
    # # final_df.printSchema()
    # # final_df.show(10, False)

    logger.info("\t - Boundary tags merged: {}".format(datetime.now() -
                                                       start_time))

    # output result to Postgres
    export_to_postgres(gnaf_df, "testing2.gnaf_with_bdy_tags",
                       os.path.join(output_path, "temp_gnaf_with_bdy_tags"),
                       True)

    # cleanup
    spark.stop()

    # delete intermediate bdy tag files and GNAF csv file
    for bdy in bdy_list:
        shutil.rmtree(
            os.path.join(output_path, "gnaf_with_{}".format(bdy["name"])))

    os.remove(gnaf_csv_file_path)
Example #7
def main():
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (SparkSession.builder
             .master("local[*]")
             .appName("query")
             .config("spark.sql.session.timeZone", "UTC")
             .config("spark.sql.debug.maxToStringFields", 100)
             .config("spark.serializer", KryoSerializer.getName)
             .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
             .config("spark.cores.max", num_processors)
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.driver.memory", "8g")
             .getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    # # set Sedona spatial indexing and partitioning config in Spark session
    # # (slowed down the "small" spatial join query in this script. Might improve bigger queries)
    # spark.conf.set("geospark.global.index", "true")
    # spark.conf.set("geospark.global.indextype", "rtree")
    # spark.conf.set("geospark.join.gridtype", "kdbtree")
    # spark.conf.set("geospark.join.numpartition", "-1")
    # spark.conf.set("geospark.join.indexbuildside", "right")
    # spark.conf.set("geospark.join.spatitionside", "right")

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version,
        datetime.now() - start_time))
    start_time = datetime.now()

    # # load gnaf points and create geoms
    # df = spark.read \
    #     .option("header", True) \
    #     .option("inferSchema", True) \
    #     .csv(input_file_name)
    #
    # point_df = df \
    #     .withColumn("geom", f.expr("ST_Point(longitude, latitude)")) \
    #     .cache()

    point_df = spark.read.parquet(os.path.join(output_path, "gnaf")).select(
        "gnaf_pid", "state", "geom")
    # point_df = gnaf_df.select("gnaf_pid", "state", "geom")
    # point_df = gnaf_df.select("gnaf_pid", "state", "longitude", "latitude", "geom")\
    #     .repartitionByRange(100, "longitude")

    point_df.createOrReplaceTempView("pnt")

    logger.info("\t - Loaded {:,} GNAF points: {}".format(
        point_df.count(),
        datetime.now() - start_time))

    # boundary tag gnaf points
    bdy_tag(spark, "commonwealth_electorates", "ce_pid")
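    # bdy_tag isn't shown in this snippet; given the parquet read below, it
    # presumably runs an ST_Intersects join between the "pnt" view and the named
    # boundary table (as in the earlier spatial join example) and writes the
    # tagged points to parquet as "gnaf_with_<bdy name>"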

    point_df.unpersist()

    # tag_df.printSchema()

    point_df = spark.read.parquet(
        os.path.join(output_path,
                     "gnaf_with_{}".format("commonwealth_electorates")))

    # point_df.createOrReplaceTempView("pnt")

    # bdy_tag(spark, "local_government_areas", "lga_pid")
    # tag_df2.printSchema()

    # point_df.unpersist()
    #
    # point_df = spark.read.parquet(os.path.join(output_path, "gnaf_with_{}".format("local_government_areas")))
    # # point_df.createOrReplaceTempView("pnt")
    #
    # # bdy_tag(spark, "local_government_wards", "ward_pid")
    # # bdy_tag(spark, "state_lower_house_electorates", "se_lower_pid")
    # # bdy_tag(spark, "state_upper_house_electorates", "se_upper_pid")
    #
    # bdy_ids = "ce_pid text, lga_pid text"
    #
    # final_df = point_df.withColumn("wkt_geom", f.expr("concat('SRID=4326;POINT (', st_x(geom), ' ', st_y(geom), ')')"))\
    #     .drop("geom")
    # # final_df.printSchema()
    #
    # # output to postgres, via CSV
    # table_name = "gnaf_with_bdy_tags"
    # export_to_postgres(final_df, "testing2.{}".format(table_name), bdy_ids, os.path.join(output_path, table_name))

    # cleanup
    spark.stop()