def draw_world_weighted_point_map(spark):
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "continent string, country string, locationId string, longitude double, latitude double,"
        "currentConfirmedCount int, confirmedCount int, suspectedCount int, curedCount int, deadCount int, "
        "updateTime timestamp").load(country_csv).cache()

    df.createOrReplaceTempView("COVID_country")
    register_funcs(spark)

    # 1
    res1 = spark.sql(
        "select ST_Point(longitude, latitude) as point from COVID_country ")
    res1.createOrReplaceTempView("res1")
    # drop rows whose coordinates could not be parsed
    res1 = spark.sql("select * from res1 where point != 'POINT (nan nan)' ")
    res1.show(20, False)

    vega1 = vega_weighted_pointmap(
        3000, 2000, [-289.095983, -73.863121, 289.095983, 73.863121],
        "#EEEEEE", [2, 60], [6], 1.0, "EPSG:4326")
    res_png1 = weighted_pointmap(res1, vega1)
    save_png(res_png1, './COVID_country_weighted_point_map1.png')

    # the view was created with createOrReplaceTempView, so drop it as a
    # session-local temp view rather than a global one
    spark.catalog.dropTempView("COVID_country")
def draw_heat_map(spark):
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, "
        "passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, "
        "dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, "
        "total_amount double, buildingid_pickup long, buildingid_dropoff long, "
        "buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")

    res = spark.sql(
        "select pickup_latitude as x, pickup_longitude as y, passenger_count as w from nyc_taxi"
    )
    res.printSchema()
    res.createOrReplaceTempView("pickup")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Transform(ST_Point(x, y), 'EPSG:4326', 'EPSG:3857') as pickup_point, w from pickup"
    )
    res.show(20, False)
    res.createOrReplaceTempView("project")

    res = spark.sql(
        "select Projection(pickup_point, 'POINT (4534000 -12510000)', 'POINT (4538000 -12513000)', 1024, 896) as point, w from project"
    )
    res.show(20, False)

    vega_heat_map = VegaHeatMap(300, 200, 10.0)
    vega = vega_heat_map.build()
    res = heatmap(res, vega)
    save_png(res, '/tmp/heatmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
def draw_heat_map(spark):
    start_time = time.time()
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, "
        "passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, "
        "dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, "
        "total_amount double, buildingid_pickup long, buildingid_dropoff long, "
        "buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)
    # keep only pickups inside the Manhattan bounding polygon
    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, passenger_count as w from nyc_taxi "
        "where ST_Within(ST_Point(pickup_longitude, pickup_latitude), ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )
    res.show()

    vega = vega_heatmap(1024, 896,
                        [-73.998427, 40.730309, -73.954348, 40.780816],
                        10.0, 'EPSG:4326')
    res = heatmap(vega, res)
    save_png(res, '/tmp/heatmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
    print("--- %s seconds ---" % (time.time() - start_time))
def draw_point_map(spark):
    # File 0_5M_nyc_build.csv is generated from New York taxi data and the
    # taxi zone shapefile. The data is available at
    # https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, "
        "passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, "
        "dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, "
        "total_amount double, buildingid_pickup long, buildingid_dropoff long, "
        "buildingtext_pickup string, buildingtext_dropoff string"
    ).load(data_path).cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")

    res = spark.sql(
        "select pickup_latitude as x, pickup_longitude as y from nyc_taxi")
    res.printSchema()
    res.createOrReplaceTempView("pickup")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Transform(ST_Point(x, y), 'EPSG:4326', 'EPSG:3857') as pickup_point from pickup"
    )
    res.show(20, False)
    res.createOrReplaceTempView("project")

    res = spark.sql(
        "select Projection(pickup_point, 'POINT (4534000 -12510000)', 'POINT (4538000 -12513000)', 1024, 896) as point from project"
    )
    res.show(20, False)

    vega_point_map = VegaCircle2d(1900, 1410, 3, "#2DEF4A", 0.5)
    vega = vega_point_map.build()
    res = pointmap(res, vega)
    save_png(res, '/tmp/pointmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
def draw_point_map(spark):
    # File 0_5M_nyc_taxi_and_building.csv can be obtained from the
    # arctern-tutorial repository under the zilliztech account:
    # https://github.com/zilliztech/arctern-tutorial
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, "
        "passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, "
        "dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, "
        "total_amount double, buildingid_pickup long, buildingid_dropoff long, "
        "buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")

    res = spark.sql(
        "select pickup_latitude as x, pickup_longitude as y from nyc_taxi")
    res.printSchema()
    res.createOrReplaceTempView("pickup")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Transform(ST_Point(x, y), 'EPSG:4326', 'EPSG:3857') as pickup_point from pickup"
    )
    res.show(20, False)
    res.createOrReplaceTempView("project")

    res = spark.sql(
        "select Projection(pickup_point, 'POINT (4534000 -12510000)', 'POINT (4538000 -12513000)', 1024, 896) as point from project"
    )
    res.show(20, False)

    vega_point_map = VegaCircle2d(1900, 1410, 3, "#2DEF4A", 0.5)
    vega = vega_point_map.build()
    res = pointmap(res, vega)
    save_png(res, '/tmp/pointmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
def draw_point_map(spark):
    start_time = time.time()
    # File 0_5M_nyc_taxi_and_building.csv can be obtained from the
    # arctern-tutorial repository under the zilliztech account:
    # https://github.com/zilliztech/arctern-tutorial
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, "
        "passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, "
        "dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, "
        "total_amount double, buildingid_pickup long, buildingid_dropoff long, "
        "buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)
    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point from nyc_taxi "
        "where ST_Within(ST_Point(pickup_longitude, pickup_latitude), ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )

    vega = vega_pointmap(
        1024, 896,
        bounding_box=[-73.998427, 40.730309, -73.954348, 40.780816],
        point_size=3,
        point_color="#2DEF4A",
        opacity=0.5,
        coordinate_system="EPSG:4326")
    res = pointmap(vega, res)
    save_png(res, '/tmp/pointmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
    print("--- %s seconds ---" % (time.time() - start_time))
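# Hedged usage sketch (not part of the original examples): a minimal driver
# for draw_point_map, assuming pyspark and the arctern helpers are installed
# and the CSV above exists at /tmp. The app name and master URL are
# illustrative choices, not the project's actual entry point.
if __name__ == "__main__":
    from pyspark.sql import SparkSession

    spark_session = SparkSession.builder \
        .appName("arctern-pointmap-example") \
        .master("local[*]") \
        .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
        .getOrCreate()
    draw_point_map(spark_session)
    spark_session.stop()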
def _create_session(self):
    """Clone a new session from the current one and register the UDFs on it."""
    session = self.session.newSession()
    register_funcs(session)
    return session
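# Hedged sketch of why the clone matters: sessions produced by newSession()
# share one SparkContext but keep their own temp views and SQL config, which
# is also why the UDFs must be re-registered on each clone. `db` is a
# hypothetical instance of the class defining _create_session above.
def _demo_session_isolation(db):
    session_a = db._create_session()
    session_b = db._create_session()
    session_a.sql("select 1 as v").createOrReplaceTempView("t")
    # "t" is visible only in the session that created it
    assert "t" in [t.name for t in session_a.catalog.listTables()]
    assert "t" not in [t.name for t in session_b.catalog.listTables()]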
def draw_choropleth_map(spark):
    start_time = time.time()
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, "
        "passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, "
        "dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, "
        "total_amount double, buildingid_pickup long, buildingid_dropoff long, "
        "buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.createOrReplaceTempView("nyc_taxi")

    register_funcs(spark)
    # use the drop-off building footprint as the polygon, weighted by passenger count
    res = spark.sql(
        "select ST_GeomFromText(buildingtext_dropoff) as polygon, passenger_count as w from nyc_taxi where (buildingtext_dropoff!='')"
    )

    vega1 = vega_choroplethmap(
        1900, 1410,
        bounding_box=[-73.994092, 40.753893, -73.977588, 40.759642],
        color_gradient=["#0000FF", "#FF0000"],
        color_bound=[2.5, 5],
        opacity=1.0,
        coordinate_system='EPSG:4326')
    res1 = choroplethmap(vega1, res)
    save_png(res1, '/tmp/choroplethmap1.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
    print("--- %s seconds ---" % (time.time() - start_time))
def __init__(self, db_config):
    envs = db_config['spark'].get('envs', None)
    if envs:  # needed for spark on yarn
        self._setup_driver_envs(envs)

    import uuid
    self._db_id = str(uuid.uuid1()).replace('-', '')
    self._db_name = db_config['db_name']
    self._db_type = 'spark'
    self._table_list = []

    print("init spark begin")
    import socket
    localhost_ip = socket.gethostbyname(socket.gethostname())
    _t = SparkSession.builder \
        .appName(db_config['spark']['app_name']) \
        .master(db_config['spark']['master-addr']) \
        .config('spark.driver.host', localhost_ip) \
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")

    configs = db_config['spark'].get('configs', None)
    if configs:
        for key in configs:
            _v = configs.get(key)
            if _v:
                print("spark config: {} = {}".format(key, _v))
                _t = _t.config(key, _v)

    self.session = _t.getOrCreate()
    print("init spark done")
    register_funcs(self.session)
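# _setup_driver_envs is referenced above but not shown here. A hypothetical
# sketch of what such a helper could look like for Spark on YARN -- exporting
# the configured variables (e.g. PYSPARK_PYTHON) into the driver's environment
# before the session is built. This is an assumption, not the actual code.
def _setup_driver_envs(self, envs):
    import os
    for key, value in envs.items():
        if value:
            os.environ[key] = str(value)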
def run(sql):
    """Submit a SQL statement to Spark on a fresh session."""
    session = INSTANCE.create_session()
    register_funcs(session)
    return session.sql(sql)
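# Hedged usage sketch: because run() executes each statement on a freshly
# created session, temp views do not persist between calls, so each query
# must be self-contained. INSTANCE is assumed to be the initialized Spark
# wrapper shown above; the coordinates are illustrative only.
df = run("select ST_Point(-73.96, 40.78) as p")
df.show()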
def draw_world_include_province_weighted_point_map(spark):
    # 1
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "Province string, Country string, Longitude double, Latitude double, ConfirmedCount int,"
        "DeadCount int, CuredCount int, LastUpdateTime string").load(
            country_with_province_csv).cache()

    df.createOrReplaceTempView("COVID_country_province")
    register_funcs(spark)

    res2 = spark.sql(
        "select ST_Point(Longitude, Latitude) as point, ConfirmedCount as s from COVID_country_province "
        "where LastUpdateTime like '%03-29%'")
    res2.createOrReplaceTempView("res2")
    res2 = spark.sql("select * from res2 where point != 'POINT (nan nan)' ")

    vega2 = vega_weighted_pointmap(
        3000, 2000, [-289.095983, -73.863121, 289.095983, 73.863121],
        "#F0356D", [2, 60], [6, 60], 1.0, "EPSG:4326")
    res_png2 = weighted_pointmap(res2, vega2)
    save_png(res_png2, './COVID_country_weighted_point_map2.png')

    spark.catalog.dropTempView("COVID_country_province")
def run_curve_z(spark):
    curve_z_df = spark.read.json("/tmp/z_curve.json").cache()
    curve_z_df.createOrReplaceTempView("curve_z")
    register_funcs(spark)

    # my_plot returns the rendered image as a hex-encoded string
    hex_data = spark.sql("select my_plot(x, y) from curve_z").collect()[0][0]
    str_hex_data = str(hex_data)

    import binascii
    binary_string = binascii.unhexlify(str_hex_data)
    with open('/tmp/hex_curve_z.png', 'wb') as png:
        png.write(binary_string)
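# The hex decode above is the general pattern for persisting hex-encoded PNG
# output from a render UDF; factored into a small helper for reuse. The name
# save_hex_png is ours, not part of the codebase.
import binascii

def save_hex_png(hex_data, path):
    """Decode a hex-encoded PNG string and write the raw bytes to `path`."""
    with open(path, 'wb') as png:
        png.write(binascii.unhexlify(str(hex_data)))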
def draw_weighted_point_map(spark):
    start_time = time.time()
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, "
        "passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, "
        "dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, "
        "total_amount double, buildingid_pickup long, buildingid_dropoff long, "
        "buildingtext_pickup string, buildingtext_dropoff string"
    ).load("file:///tmp/0_5M_nyc_taxi_and_building.csv").cache()
    df.createOrReplaceTempView("nyc_taxi")
    register_funcs(spark)

    # single color and single stroke width
    res1 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )
    vega1 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        ["#87CEEB"], [0, 2], [5], 1.0, "EPSG:4326")
    res1 = weighted_pointmap(vega1, res1)
    save_png(res1, '/tmp/weighted_pointmap_0_0.png')

    # multiple color and single stroke width
    res2 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, tip_amount as c from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )
    vega2 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        ["#0000FF", "#FF0000"], [0, 2], [5], 1.0, "EPSG:4326")
    res2 = weighted_pointmap(vega2, res2)
    save_png(res2, '/tmp/weighted_pointmap_1_0.png')

    # single color and multiple stroke width
    res3 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, fare_amount as s from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )
    vega3 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        ["#87CEEB"], [0, 2], [0, 10], 1.0, "EPSG:4326")
    res3 = weighted_pointmap(vega3, res3)
    save_png(res3, '/tmp/weighted_pointmap_0_1.png')

    # multiple color and multiple stroke width
    res4 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, tip_amount as c, fare_amount as s from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), ST_GeomFromText('POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))'))"
    )
    vega4 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        ["#0000FF", "#FF0000"], [0, 2], [0, 10], 1.0, "EPSG:4326")
    res4 = weighted_pointmap(vega4, res4)
    save_png(res4, '/tmp/weighted_pointmap_1_1.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
    print("--- %s seconds ---" % (time.time() - start_time))
def __init__(self):
    self.session = SparkSession.builder \
        .appName("Arctern") \
        .master(config.INSTANCE.get("spark", "master-addr")) \
        .config("spark.executorEnv.PYSPARK_PYTHON",
                config.INSTANCE.get("spark", "executor-python")) \
        .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
        .config("spark.databricks.session.share", "false") \
        .getOrCreate()
    register_funcs(self.session)
def draw_weighted_point_map(spark):
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, "
        "passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, "
        "dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, "
        "total_amount double, buildingid_pickup long, buildingid_dropoff long, "
        "buildingtext_pickup string, buildingtext_dropoff string"
    ).load(data_path).cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")
    register_funcs(spark)

    # single color and single stroke width
    res1 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), 'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )
    vega1 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        "#87CEEB", [0, 2], [5], 1.0, "EPSG:4326")
    res1 = weighted_pointmap(res1, vega1)
    save_png(res1, '/tmp/weighted_pointmap_0_0.png')

    # multiple color and single stroke width
    res2 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, tip_amount as c from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), 'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )
    vega2 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        "blue_to_red", [0, 2], [5], 1.0, "EPSG:4326")
    res2 = weighted_pointmap(res2, vega2)
    save_png(res2, '/tmp/weighted_pointmap_1_0.png')

    # single color and multiple stroke width
    res3 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, fare_amount as s from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), 'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )
    vega3 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        "#87CEEB", [0, 2], [0, 10], 1.0, "EPSG:4326")
    res3 = weighted_pointmap(res3, vega3)
    save_png(res3, '/tmp/weighted_pointmap_0_1.png')

    # multiple color and multiple stroke width
    res4 = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, tip_amount as c, fare_amount as s from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), 'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )
    vega4 = vega_weighted_pointmap(
        1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816],
        "blue_to_red", [0, 2], [0, 10], 1.0, "EPSG:4326")
    res4 = weighted_pointmap(res4, vega4)
    save_png(res4, '/tmp/weighted_pointmap_1_1.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
def run_test_plot(spark):
    register_funcs(spark)
    raw_data = []
    raw_data.extend([(0, 'polygon((0 0,0 1,1 1,1 0,0 0))')])
    raw_data.extend([(1, 'linestring(0 0,0 1,1 1,1 0,0 0)')])
    raw_data.extend([(2, 'point(2 2)')])
    wkt_collect = "GEOMETRYCOLLECTION(" \
                  "MULTIPOLYGON (((0 0,0 1,1 1,1 0,0 0)),((1 1,1 2,2 2,2 1,1 1)))," \
                  "POLYGON((3 3,3 4,4 4,4 3,3 3))," \
                  "LINESTRING(0 8,5 5,8 0)," \
                  "POINT(4 7)," \
                  "MULTILINESTRING ((1 1,1 2),(2 4,1 9,1 8))," \
                  "MULTIPOINT (6 8,5 7)" \
                  ")"
    raw_data.extend([(3, wkt_collect)])

    raw_schema = StructType([
        StructField('idx', LongType(), False),
        StructField('geo', StringType(), False)
    ])

    df = spark.createDataFrame(data=raw_data, schema=raw_schema)
    df.createOrReplaceTempView("geoms")
    df2 = spark.sql("select st_geomfromtext(geo) from geoms")

    # run baseline
    fig1, ax1 = plt.subplots()
    plot(ax1, df2)
    ax1.grid()
    baseline_png1 = png_path + "plot_test_1.png"
    fig1.savefig(baseline_png1)

    # run plot test
    fig2, ax2 = plt.subplots()
    plot(ax2, df2)
    ax2.grid()
    plot_test1 = png_path + "test_plot_test_1.png"
    fig2.savefig(plot_test1)

    # drop the view this test actually created
    spark.catalog.dropTempView("geoms")

    assert run_diff_png(baseline_png1, plot_test1)
def draw_heat_map(spark):
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, "
        "passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, "
        "dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, "
        "total_amount double, buildingid_pickup long, buildingid_dropoff long, "
        "buildingtext_pickup string, buildingtext_dropoff string"
    ).load(data_path).cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")
    register_funcs(spark)

    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, passenger_count as w from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), 'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )

    vega = vega_heatmap(1024, 896, 10.0,
                        [-73.998427, 40.730309, -73.954348, 40.780816],
                        'EPSG:4326')
    res = heatmap(res, vega)
    save_png(res, '/tmp/heatmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
def draw_point_map(spark):
    # File 0_5M_nyc_build.csv is generated from New York taxi data and the
    # taxi zone shapefile. The data is available at
    # https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, "
        "passenger_count long, trip_distance double, pickup_longitude double, pickup_latitude double, "
        "dropoff_longitude double, dropoff_latitude double, fare_amount double, tip_amount double, "
        "total_amount double, buildingid_pickup long, buildingid_dropoff long, "
        "buildingtext_pickup string, buildingtext_dropoff string"
    ).load(data_path).cache()
    df.show(20, False)
    df.createOrReplaceTempView("nyc_taxi")
    register_funcs(spark)

    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), 'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')"
    )

    vega = vega_pointmap(1024, 896,
                         [-73.998427, 40.730309, -73.954348, 40.780816],
                         3, "#2DEF4A", 0.5, "EPSG:4326")
    res = pointmap(res, vega)
    save_png(res, '/tmp/pointmap.png')

    spark.sql("show tables").show()
    spark.catalog.dropTempView("nyc_taxi")
def run_test_heat_map(spark):
    df = spark.read.format("csv").option("header", True).option("delimiter", ",").schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, "
        "trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, "
        "dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, "
        "buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string").load(
            file_path).cache()
    df.createOrReplaceTempView("nyc_taxi")
    register_funcs(spark)
    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point, passenger_count as w from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), 'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')")

    # 1 size: 1024*896, map_scale: 10.0
    vega_1 = vega_heatmap(1024, 896, 10.0, [-73.998427, 40.730309, -73.954348, 40.780816], 'EPSG:4326')
    baseline1 = heatmap(res, vega_1)
    heat_map1_1 = heatmap(res, vega_1)
    heat_map1_2 = heatmap(res, vega_1)

    baseline_png1 = png_path + "heat_map_nyc_1.png"
    save_png(baseline1, baseline_png1)
    save_png(heat_map1_1, png_path + "test_heat_map_nyc_1-1.png")
    save_png(heat_map1_2, png_path + "test_heat_map_nyc_1-2.png")

    # 2 map_scale: 0.0
    vega_2 = vega_heatmap(1024, 896, 0.0, [-73.998427, 40.730309, -73.954348, 40.780816], 'EPSG:4326')
    baseline2 = heatmap(res, vega_2)
    heat_map2_1 = heatmap(res, vega_2)
    heat_map2_2 = heatmap(res, vega_2)

    baseline_png2 = png_path + "heat_map_nyc_2.png"
    save_png(baseline2, baseline_png2)
    save_png(heat_map2_1, png_path + "test_heat_map_nyc_2-1.png")
    save_png(heat_map2_2, png_path + "test_heat_map_nyc_2-2.png")

    # 3 map_scale: 12.0
    vega_3 = vega_heatmap(1024, 896, 12.0, [-73.998427, 40.730309, -73.954348, 40.780816], 'EPSG:4326')
    baseline3 = heatmap(res, vega_3)
    heat_map3_1 = heatmap(res, vega_3)
    heat_map3_2 = heatmap(res, vega_3)

    baseline_png3 = png_path + "heat_map_nyc_3.png"
    save_png(baseline3, baseline_png3)
    save_png(heat_map3_1, png_path + "test_heat_map_nyc_3-1.png")
    save_png(heat_map3_2, png_path + "test_heat_map_nyc_3-2.png")

    # 4 map_scale: 5.5
    vega_4 = vega_heatmap(1024, 896, 5.5, [-73.998427, 40.730309, -73.954348, 40.780816], 'EPSG:4326')
    baseline4 = heatmap(res, vega_4)
    heat_map4_1 = heatmap(res, vega_4)
    heat_map4_2 = heatmap(res, vega_4)

    baseline_png4 = png_path + "heat_map_nyc_4.png"
    save_png(baseline4, baseline_png4)
    save_png(heat_map4_1, png_path + "test_heat_map_nyc_4-1.png")
    save_png(heat_map4_2, png_path + "test_heat_map_nyc_4-2.png")

    # 5 size: 200*200
    vega_5 = vega_heatmap(200, 200, 10.0, [-73.998427, 40.730309, -73.954348, 40.780816], 'EPSG:4326')
    baseline5 = heatmap(res, vega_5)
    heat_map5_1 = heatmap(res, vega_5)
    heat_map5_2 = heatmap(res, vega_5)

    baseline_png5 = png_path + "heat_map_nyc_5.png"
    save_png(baseline5, baseline_png5)
    save_png(heat_map5_1, png_path + "test_heat_map_nyc_5-1.png")
    save_png(heat_map5_2, png_path + "test_heat_map_nyc_5-2.png")

    spark.catalog.dropTempView("nyc_taxi")

    assert run_diff_png(baseline_png1, png_path + "test_heat_map_nyc_1-1.png", 0.1)
    assert run_diff_png(baseline_png1, png_path + "test_heat_map_nyc_1-2.png", 0.1)
    assert run_diff_png(baseline_png2, png_path + "test_heat_map_nyc_2-1.png", 0.1)
    assert run_diff_png(baseline_png2, png_path + "test_heat_map_nyc_2-2.png", 0.1)
    assert run_diff_png(baseline_png3, png_path + "test_heat_map_nyc_3-1.png", 0.15)
    assert run_diff_png(baseline_png3, png_path + "test_heat_map_nyc_3-2.png", 0.15)
    assert run_diff_png(baseline_png4, png_path + "test_heat_map_nyc_4-1.png", 0.1)
    assert run_diff_png(baseline_png4, png_path + "test_heat_map_nyc_4-2.png", 0.1)
    assert run_diff_png(baseline_png5, png_path + "test_heat_map_nyc_5-1.png", 0.2)
    assert run_diff_png(baseline_png5, png_path + "test_heat_map_nyc_5-2.png", 0.2)
def run_st_intersection(spark):
    test_df = spark.read.json("/tmp/intersection.json").cache()
    test_df.createOrReplaceTempView("intersection")
    register_funcs(spark)
    spark.sql("select ST_Intersection_UDF(left, right) from intersection").show()
    # df.show()
    df.createOrReplaceTempView(table_name)
    rs = spark.sql(sql).cache()
    # rs.printSchema()
    # rs.show()
    save_result("results/%s" % table_name, rs)


if __name__ == "__main__":
    url = 'local'
    spark_session = SparkSession.builder.appName("Python zgis sample").master(url).getOrCreate()
    spark_session.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

    clear_result_dir('/tmp/results')
    register_funcs(spark_session)

    run_test_st_geomfromgeojson(spark_session)
    run_test_st_geomfromgeojson2(spark_session)
    run_test_st_curvetoline(spark_session)
    run_test_st_point(spark_session)
    run_test_envelope_aggr_1(spark_session)
    run_test_envelope_aggr_curve(spark_session)
    run_test_envelope_aggr_2(spark_session)
    run_test_union_aggr_2(spark_session)
    run_test_union_aggr_curve(spark_session)
    run_test_st_isvalid_1(spark_session)
    run_test_st_isvalid_curve(spark_session)
    run_test_st_intersection(spark_session)
    run_test_st_intersection_curve(spark_session)
    run_test_st_convexhull(spark_session)
def run_st_point(spark):
    points_df = spark.read.json("/tmp/points.json").cache()
    points_df.createOrReplaceTempView("points")
    register_funcs(spark)
    spark.sql("select ST_Point(x, y) from points").show()
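# Hedged sketch of the expected input: spark.read.json consumes one JSON
# object per line, and the ST_Point(x, y) query above implies fields named
# x and y. The coordinate values below are illustrative only.
with open("/tmp/points.json", "w") as points_json:
    points_json.write('{"x": -73.961003, "y": 40.760594}\n')
    points_json.write('{"x": -73.959908, "y": 40.776353}\n')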
def run_test_point_map(spark):
    # File 0_5M_nyc_taxi_and_building.csv can be obtained from the
    # arctern-tutorial repository under the zilliztech account:
    # https://github.com/zilliztech/arctern-tutorial
    # File 0_10000_nyc_taxi_and_building.csv contains its first 10000 lines.
    df = spark.read.format("csv").option("header", True).option("delimiter", ",").schema(
        "VendorID string, tpep_pickup_datetime timestamp, tpep_dropoff_datetime timestamp, passenger_count long, "
        "trip_distance double, pickup_longitude double, pickup_latitude double, dropoff_longitude double, "
        "dropoff_latitude double, fare_amount double, tip_amount double, total_amount double, buildingid_pickup long, "
        "buildingid_dropoff long, buildingtext_pickup string, buildingtext_dropoff string").load(
            file_path).cache()
    df.createOrReplaceTempView("nyc_taxi")
    register_funcs(spark)
    res = spark.sql(
        "select ST_Point(pickup_longitude, pickup_latitude) as point from nyc_taxi where ST_Within(ST_Point(pickup_longitude, pickup_latitude), 'POLYGON ((-73.998427 40.730309, -73.954348 40.730309, -73.954348 40.780816 ,-73.998427 40.780816, -73.998427 40.730309))')")

    # 1 size: 1024*896, point_size: 3, opacity: 0.5, color: #2DEF4A (green)
    vega_1 = vega_pointmap(1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], 3, "#2DEF4A", 0.5, "EPSG:4326")
    baseline1 = pointmap(res, vega_1)
    point_map1_1 = pointmap(res, vega_1)
    point_map1_2 = pointmap(res, vega_1)

    baseline_png1 = png_path + "point_map_nyc_1.png"
    save_png(baseline1, baseline_png1)
    save_png(point_map1_1, png_path + "test_point_map_nyc_1-1.png")
    save_png(point_map1_2, png_path + "test_point_map_nyc_1-2.png")

    # 2 point_size: 5, color: #F50404 (red)
    vega_2 = vega_pointmap(1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], 5, "#F50404", 0.5, "EPSG:4326")
    baseline2 = pointmap(res, vega_2)
    point_map2_1 = pointmap(res, vega_2)
    point_map2_2 = pointmap(res, vega_2)

    baseline_png2 = png_path + "point_map_nyc_2.png"
    save_png(baseline2, baseline_png2)
    save_png(point_map2_1, png_path + "test_point_map_nyc_2-1.png")
    save_png(point_map2_2, png_path + "test_point_map_nyc_2-2.png")

    # 3 point_size: 5, color: #1455EE (blue)
    vega_3 = vega_pointmap(1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], 5, "#1455EE", 0.5, "EPSG:4326")
    baseline3 = pointmap(res, vega_3)
    point_map3_1 = pointmap(res, vega_3)
    point_map3_2 = pointmap(res, vega_3)

    baseline_png3 = png_path + "point_map_nyc_3.png"
    save_png(baseline3, baseline_png3)
    save_png(point_map3_1, png_path + "test_point_map_nyc_3-1.png")
    save_png(point_map3_2, png_path + "test_point_map_nyc_3-2.png")

    # 4 size: 1024*896, point_size: 3, opacity: 1, color: #2DEF4A
    vega_4 = vega_pointmap(1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], 3, "#2DEF4A", 1.0, "EPSG:4326")
    baseline4 = pointmap(res, vega_4)
    point_map4_1 = pointmap(res, vega_4)
    point_map4_2 = pointmap(res, vega_4)

    baseline_png4 = png_path + "point_map_nyc_4.png"
    save_png(baseline4, baseline_png4)
    save_png(point_map4_1, png_path + "test_point_map_nyc_4-1.png")
    save_png(point_map4_2, png_path + "test_point_map_nyc_4-2.png")

    # 5 size: 1024*896, point_size: 3, opacity: 0, color: #2DEF4A
    vega_5 = vega_pointmap(1024, 896, [-73.998427, 40.730309, -73.954348, 40.780816], 3, "#2DEF4A", 0.0, "EPSG:4326")
    baseline5 = pointmap(res, vega_5)
    point_map5_1 = pointmap(res, vega_5)
    point_map5_2 = pointmap(res, vega_5)

    baseline_png5 = png_path + "point_map_nyc_5.png"
    save_png(baseline5, baseline_png5)
    save_png(point_map5_1, png_path + "test_point_map_nyc_5-1.png")
    save_png(point_map5_2, png_path + "test_point_map_nyc_5-2.png")

    # 6 size: 200*200, point_size: 3, opacity: 0.5, color: #2DEF4A
    vega_6 = vega_pointmap(200, 200, [-73.998427, 40.730309, -73.954348, 40.780816], 3, "#2DEF4A", 0.5, "EPSG:4326")
    baseline6 = pointmap(res, vega_6)
    point_map6_1 = pointmap(res, vega_6)
    point_map6_2 = pointmap(res, vega_6)

    baseline_png6 = png_path + "point_map_nyc_6.png"
    save_png(baseline6, baseline_png6)
    save_png(point_map6_1, png_path + "test_point_map_nyc_6-1.png")
    save_png(point_map6_2, png_path + "test_point_map_nyc_6-2.png")

    spark.catalog.dropTempView("nyc_taxi")

    assert run_diff_png(baseline_png1, png_path + "test_point_map_nyc_1-1.png")
    assert run_diff_png(baseline_png1, png_path + "test_point_map_nyc_1-2.png")
    assert run_diff_png(baseline_png2, png_path + "test_point_map_nyc_2-1.png")
    assert run_diff_png(baseline_png2, png_path + "test_point_map_nyc_2-2.png")
    assert run_diff_png(baseline_png3, png_path + "test_point_map_nyc_3-1.png")
    assert run_diff_png(baseline_png3, png_path + "test_point_map_nyc_3-2.png")
    assert run_diff_png(baseline_png4, png_path + "test_point_map_nyc_4-1.png")
    assert run_diff_png(baseline_png4, png_path + "test_point_map_nyc_4-2.png")
    assert run_diff_png(baseline_png5, png_path + "test_point_map_nyc_5-1.png")
    assert run_diff_png(baseline_png5, png_path + "test_point_map_nyc_5-2.png")
    assert run_diff_png(baseline_png6, png_path + "test_point_map_nyc_6-1.png")
    assert run_diff_png(baseline_png6, png_path + "test_point_map_nyc_6-2.png")
args = parse.parse_args()
source_file = args.source_file[0]
output_file = args.output_file[0]
run_times = int(args.run_times[0])
version_commit = args.version[0]

user_module = importlib.import_module(
    "test_case." + (source_file.split(".")[0]).replace("/", "."),
    "test_case/" + source_file)

spark = SparkSession \
    .builder \
    .appName("Python Arrow-in-Spark example") \
    .getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
register_funcs(spark)

all_time_info = {
    "version": version_commit.split("-")[0],
    "commit_id": version_commit.split("-")[-1],
    "func_name": user_module.func_name
}

data_df = spark.read.format("csv").option("header", False).option(
    "delimiter", "|").schema(user_module.schema).load(user_module.csv_path).cache()
data_df.createOrReplaceTempView(user_module.func_name)

if hasattr(user_module, "spark_test"):
    for times in range(run_times):
        time_info = {}
        begin_time = time.time()
def draw_china_weighted_point_map(spark):
    df = spark.read.format("csv").option("header", True).option(
        "delimiter", ","
    ).schema(
        "continent string, country string, province string, provinceLocationId string, "
        "provinceCurrentConfirmedCount int, provinceConfirmedCount int, provinceSuspectedCount int,"
        "provinceCuredCount int, provinceDeadCount int, cityName string, longitude double, latitude double,"
        "cityLocationId string, cityCurrentConfirmedCount int, cityConfirmedCount int, citySuspectedCount int,"
        "cityCuredCount int, cityDeadCount int, updateTime timestamp").load(
            china_csv).cache()

    # drop any leftover view from a previous run, then recreate it
    spark.catalog.dropTempView("COVID_china")
    df.createOrReplaceTempView("COVID_china")
    register_funcs(spark)

    # 1
    res1 = spark.sql(
        "select ST_Point(longitude, latitude) as point from COVID_china where ST_Within(ST_Point(longitude, latitude), 'POLYGON ((71.604264 17.258977, 137.319408 17.258977, 137.319408 53.808533, 71.604264 53.808533, 71.604264 17.258977))')"
    )
    res1.createOrReplaceTempView("res1")
    res1 = spark.sql("select * from res1 where point != 'POINT (nan nan)' ")

    vega1 = vega_weighted_pointmap(
        1024, 896, [71.604264, 17.258977, 137.319408, 53.808533],
        "#EEEEEE", [2, 60], [6], 1.0, "EPSG:4326")
    res_png1 = weighted_pointmap(res1, vega1)
    save_png(res_png1, './COVID_china_weighted_point_map1.png')

    # 2
    res2 = spark.sql(
        "select ST_Point(longitude, latitude) as point, provinceConfirmedCount as c from COVID_china "
        "where ST_Within(ST_Point(longitude, latitude), "
        "'POLYGON ((71.604264 17.258977, 137.319408 17.258977, 137.319408 53.808533,"
        " 71.604264 53.808533, 71.604264 17.258977))')")
    res2.createOrReplaceTempView("res2")
    res2 = spark.sql("select * from res2 where point != 'POINT (nan nan)' ")

    vega2 = vega_weighted_pointmap(
        1024, 896, [71.604264, 17.258977, 137.319408, 53.808533],
        "blue_to_red", [2, 1000], [6], 1.0, "EPSG:4326")
    res_png2 = weighted_pointmap(res2, vega2)
    save_png(res_png2, './COVID_china_weighted_point_map2.png')

    # 3
    res3 = spark.sql(
        "select ST_Point(longitude, latitude) as point, provinceConfirmedCount as c, "
        "provinceConfirmedCount as s from COVID_china "
        "where ST_Within(ST_Point(longitude, latitude), "
        "'POLYGON ((71.604264 17.258977, 137.319408 17.258977, 137.319408 53.808533,"
        " 71.604264 53.808533, 71.604264 17.258977))')")
    res3.createOrReplaceTempView("res3")
    res3 = spark.sql("select * from res3 where point != 'POINT (nan nan)' ")

    vega3 = vega_weighted_pointmap(
        3000, 2000, [71.604264, 17.258977, 137.319408, 53.808533],
        "blue_to_red", [2, 1000], [5, 1000], 1.0, "EPSG:4326")
    res_png3 = weighted_pointmap(res3, vega3)
    save_png(res_png3, './COVID_china_weighted_point_map3.png')

    spark.catalog.dropTempView("COVID_china")