def ch9_sql():
    # Import Spark SQL
    from pyspark.sql import HiveContext, Row
    # Or if you can't include the hive requirements
    from pyspark.sql import SQLContext, Row

    hiveCtx = HiveContext(sc)
    input_file = hiveCtx.read.json("testweet.json")
    # Register the input_file DataFrame as a temporary table
    input_file.registerTempTable("tweets")
    # Select tweets based on the retweetCount
    topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM tweets
                               ORDER BY retweetCount LIMIT 10""")
    topTweetText = topTweets.map(lambda row: row.text)
    topTweetText.collect()
    topTweets.schema
    hiveCtx.cacheTable("tweets")
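# Sketch (not part of the original snippet): registerTempTable was deprecated in
# Spark 2.0 and DataFrame.map has to go through .rdd in 2.x+. The same flow with
# a SparkSession, assuming the same testweet.json file is available, would read:
from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
tweets = spark.read.json("testweet.json")
tweets.createOrReplaceTempView("tweets")   # replaces registerTempTable
spark.catalog.cacheTable("tweets")         # replaces hiveCtx.cacheTable
topTweets = spark.sql(
    "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
topTweetText = topTweets.rdd.map(lambda row: row.text).collect()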
def query12_input(query_name, conf=None, output_persist=False):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)
    # web_sales_sql = "select * from web_sales"
    # web_sales = sqlContext.sql(web_sales_sql)
    # web_sales.persist()
    # web_sales.registerAsTable("web_sales")
    # item_sql = "select * from item"
    # item = sqlContext.sql(item_sql)
    # item.persist()
    # item.registerAsTable("item")
    # date_dim_sql = "select * from date_dim"
    # date_dim = sqlContext.sql(date_dim_sql)
    # date_dim.persist()
    # date_dim.registerAsTable("date_dim")
    sqlContext.cacheTable("web_sales")
    sqlContext.cacheTable("item")
    sqlContext.cacheTable("date_dim")
    # discard the first query output
    output = execute_sql(query_name, sqlContext, output_persist)
    # check the re-run statistics
    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe()
    output['describe'].show()
    sc.stop()
    return output
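# Sketch (not in the original function): cacheTable is lazy, so the in-memory
# cache only fills on the first scan. To keep the timed re-run warm, one option
# is to force materialization with a cheap action right after cacheTable:
def warm_cache(sqlContext, tables=("web_sales", "item", "date_dim")):
    # Run a trivial aggregate over each cached table so the columnar cache
    # is populated before any measured query.
    for table in tables:
        sqlContext.sql("select count(*) from {0}".format(table)).collect()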
# Functions used for the derived txn_date column below
from pyspark.sql.functions import from_unixtime, unix_timestamp, lit

# Loading order_items table
order_items = sqlContext.sql(
    """select * from {0}.order_items""".format(DB_NAME))
order_items.registerTempTable('order_items')

# Loading products table
products = sqlContext.sql("""select * from {0}.products""".format(DB_NAME))
products.registerTempTable('products')

# Joining categories, products, order_items and orders tables
cat_prod_ord_items = sqlContext.sql(
    """select c.*, o.order_customer_id as customer_id, p.product_id, oi.*
       from categories c
       join products p on c.category_id = p.product_category_id
       join order_items oi on p.product_id = oi.order_item_product_id
       join orders o on oi.order_item_order_id = o.order_id
    """
)
cat_prod_ord_items.registerTempTable('cat_prod_ord_items')
sqlContext.cacheTable('cat_prod_ord_items')

#### Category wise sales per day ####
category_wise_sales1 = sqlContext.sql(
    """select category_id, category_name,
              category_department_id as department_id,
              round(sum(order_item_subtotal), 2) as total_sales
       from cat_prod_ord_items
       group by category_id, category_name, category_department_id"""
)
category_wise_sales1.registerTempTable('category_wise_sales1')

category_wise_sales = sqlContext.sql(
    """select c.category_id, c.category_name, d.department_name, c.total_sales
       from category_wise_sales1 c
       join departments d on c.department_id = d.department_id"""
).withColumn(
    "txn_date",
    from_unixtime(unix_timestamp() - (86400 * lit(date_difference_in_days)),
                  'yyyy-MM-dd'))
category_wise_sales.registerTempTable('category_wise_sales')
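# Sketch (assumes Spark 1.5+; not part of the original job): the txn_date
# expression above amounts to "current date minus date_difference_in_days".
# date_sub/current_date state the same thing more directly, yielding a DATE
# column instead of a formatted string:
from pyspark.sql.functions import current_date, date_sub

category_wise_sales = category_wise_sales.withColumn(
    "txn_date", date_sub(current_date(), date_difference_in_days))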
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row

conf = SparkConf().setAppName("spark_sql_cache_table_extend")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

dataRDD = sc.textFile(
    "/user/hdfs/rawlog/app_weibomobile03x4ts1kl_mwb_interface/"
).map(lambda line: line.split(",")).filter(
    lambda words: len(words) >= 3
).map(lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))

sourceRDD = hc.inferSchema(dataRDD)
sourceRDD.registerAsTable("source")
hc.cacheTable("source")

hc.sql("select count(*) from source").collect()
hc.sql("select col2, max(col3) from source group by col2").collect()
hc.sql("select col3, min(col2) from source group by col3").collect()

# hc.uncacheTable("source")

sc.stop()
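# Sketch (not part of the original script): inferSchema and registerAsTable are
# the old Spark 1.0/1.1 names. From Spark 1.3 onward the same caching pattern
# reads as follows, assuming the same hc and dataRDD as above (run before sc.stop()):
sourceDF = hc.createDataFrame(dataRDD)   # replaces hc.inferSchema(dataRDD)
sourceDF.registerTempTable("source")     # replaces registerAsTable
hc.cacheTable("source")
hc.sql("select count(*) from source").collect()   # first scan fills the cache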
mytable.registerTempTable("temp_mytable")
"""

def convert(val):
    return val.upper()

hc.registerFunction("temp_convert", convert)

convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")
convertRDD.registerAsTable("temp_mytable")
hc.cacheTable("temp_mytable")

def printRows(rows):
    for row in rows:
        print row

datas = hc.sql("select * from temp_mytable").collect()
printRows(datas)

datas = hc.sql("select col1 from temp_mytable").collect()
printRows(datas)

# hc.uncacheTable("temp_mytable")
sqlContext.sql('set spark.sql.shuffle.partitions=%i' % numPartitions)

#comments = sqlContext.read.json('data/test/*/')
#comments = sqlContext.read.json('data/micro_fake.json')
comments = sqlContext.read.json('s3n://%s:%s@boazreddit/micro_fake.json' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@boazreddit/test/*/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@boazreddit/comments/2007/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@boazreddit/comments/200*/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@boazreddit/comments/*/*' % (acc, sec))

polcomments = comments.filter(comments.subreddit == 'politics')
polcomments2 = polcomments.repartition(numPartitions)
polcomments2.registerTempTable('comments')
sqlContext.cacheTable('comments')

# Removed when filtering to single subreddit
#   COLLECT_LIST(subreddit) AS subreddits,
#   COUNT(DISTINCT(subreddit)) AS total_subreddits,
user_pivot = sqlContext.sql('''SELECT author,
    MIN(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS first_post_datetime,
    MAX(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS last_post_datetime,
    COLLECT_LIST(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS post_datetimes,
    COLLECT_LIST(id) AS post_ids,
    COUNT(*) AS total_posts
    FROM comments
    GROUP BY author''')

#user_pivot2 = user_pivot.repartition(numPartitions)
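# Sketch (assumption, not in the original): embedding the AWS key/secret in the
# s3n:// URL exposes them in logs and the Spark UI. The same acc/sec values can
# be passed through the Hadoop configuration instead; `sc` is assumed to be the
# SparkContext behind sqlContext.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.awsAccessKeyId", acc)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", sec)
comments = sqlContext.read.json("s3n://boazreddit/micro_fake.json")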
def sql_hive_context_example(spark):
    # create hive context object.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1, 10, 2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1, 10, 2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable
    # newSession
    # registerFunction
    #   Deprecated in 2.3.0. Use :func:`spark.udf.register` instead
    # registerJavaFunction
    #   Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    # Exception: An error occurred while calling o26.refreshTable:
    # Method refreshTable([class java.lang.String]) does not exist

    print("Finish running HiveContext API")
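# Sketch (not part of the original walkthrough): the 2.3+ replacement for the
# deprecated registerFunction noted above is spark.udf.register, assuming the
# same `spark` session passed to the function:
from pyspark.sql.types import StringType

spark.udf.register("to_upper", lambda s: s.upper() if s else None, StringType())
spark.range(1).selectExpr("to_upper('spark') AS value").show()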
def main():
    """Run the query and print the statistics."""
    run = parse_cmd_line_arg()

    # Clear cache
    map(clear_cache, hosts)
    map(clear_local_dir, hosts)

    name = 'CS-838-Assignment2-Question2'
    sc = SparkContext(conf=get_conf(name))
    hc = HiveContext(sc)
    hc.sql('use tpcds_text_db_1_50')

    query12 = """
    select i_item_desc
          ,i_category
          ,i_class
          ,i_current_price
          ,i_item_id
          ,sum(ws_ext_sales_price) as itemrevenue
          ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
              (partition by i_class) as revenueratio
    from web_sales
        ,item
        ,date_dim
    where web_sales.ws_item_sk = item.i_item_sk
      and item.i_category in ('Jewelry', 'Sports', 'Books')
      and web_sales.ws_sold_date_sk = date_dim.d_date_sk
      and date_dim.d_date between '2001-01-12' and '2001-02-11'
    group by i_item_id
            ,i_item_desc
            ,i_category
            ,i_class
            ,i_current_price
    order by i_category
            ,i_class
            ,i_item_id
            ,i_item_desc
            ,revenueratio
    limit 100
    """

    query54 = """
    with my_customers as (
        select c_customer_sk
             , c_current_addr_sk
        from (
            select cs_sold_date_sk sold_date_sk,
                   cs_bill_customer_sk customer_sk,
                   cs_item_sk item_sk
            from catalog_sales
            union all
            select ws_sold_date_sk sold_date_sk,
                   ws_bill_customer_sk customer_sk,
                   ws_item_sk item_sk
            from web_sales
        ) cs_or_ws_sales,
        item,
        date_dim,
        customer
        where sold_date_sk = d_date_sk
          and item_sk = i_item_sk
          and i_category = 'Jewelry'
          and i_class = 'football'
          and c_customer_sk = cs_or_ws_sales.customer_sk
          and d_moy = 3
          and d_year = 2000
        group by c_customer_sk
               , c_current_addr_sk
    )
    , my_revenue as (
        select c_customer_sk,
               sum(ss_ext_sales_price) as revenue
        from my_customers,
             store_sales,
             customer_address,
             store,
             date_dim
        where c_current_addr_sk = ca_address_sk
          and ca_county = s_county
          and ca_state = s_state
          and ss_sold_date_sk = d_date_sk
          and c_customer_sk = ss_customer_sk
          and d_month_seq between (1203) and (1205)
        group by c_customer_sk
    )
    , segments as (
        select cast((revenue/50) as int) as segment
        from my_revenue
    )
    select segment, count(*) as num_customers, segment*50 as segment_base
    from segments
    group by segment
    order by segment, num_customers
    limit 100
    """

    # cache runs
    if run == 2 or run == 3:
        # tables in query12 used for collecting stats
        hc.cacheTable('web_sales')
        hc.cacheTable('item')
        hc.cacheTable('date_dim')
        # to circumvent lazy computation and force cache, we run a query
        # that involves the above cached tables
        if run == 2:
            # we will avoid running query12 now since we want to run
            # it below and collect stats
            # Instead, we run query54 which involves all the above 3
            # cached tables
            df = hc.sql(query54)
        # to force the caching of the output RDD
        elif run == 3:
            # running the same query used to collect stats: query12
            # since we want to cache the output
            df = hc.sql(query12)
            df.cache()
        df.show()
        time.sleep(120)

    # record stats before starting
    nw_before = map(get_network_bytes, hosts)
    st_before = map(get_storage_bytes, hosts)
    time_before = time.time()

    # actually run the query for collecting statistics
    hc.sql(query12).show()

    # record stats after completion
    time_after = time.time()
    nw_after = map(get_network_bytes, hosts)
    st_after = map(get_storage_bytes, hosts)

    # calculate the difference in stats
    nw_read_hosti = 0
    nw_write_hosti = 0
    st_read_hosti = 0
    st_write_hosti = 0
    for i in range(len(hosts)):
        nw_read_hosti += nw_after[i][0] - nw_before[i][0]
        nw_write_hosti += nw_after[i][1] - nw_before[i][1]
        st_read_hosti += st_after[i][0] - st_before[i][0]
        st_write_hosti += st_after[i][1] - st_before[i][1]

    # output the stats
    print time_after - time_before
    print bytes_to_mb(nw_read_hosti)
    print bytes_to_mb(nw_write_hosti)
    print bytes_to_mb(st_read_hosti)
    print bytes_to_mb(st_write_hosti)

    sc.stop()
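# Sketch (assumption, not in the original): if several measurement runs were
# made from a single driver instead of stopping the context each time, the
# cached tables would need to be released between runs:
def release_cache(hc, tables=('web_sales', 'item', 'date_dim')):
    # Drop each cached table from memory, then clear anything else left behind.
    for table in tables:
        hc.uncacheTable(table)
    hc.clearCache()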
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row
import random

conf = SparkConf().setAppName("spark_sql_cache_table")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

dataRDD = sc.textFile("hdfs://dip.cdh5.dev:8020/user/yurun/datas").map(
    lambda line: line.split("\t")).map(
    lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))

sourceRDD = hc.inferSchema(dataRDD)
sourceRDD.registerAsTable("source")

cacheRDD = hc.sql("select * from source where col1 = 'col1_50'")
cacheRDD.registerAsTable("cacheTable")
hc.cacheTable("cacheTable")

hc.sql("select col2, max(col3) from cacheTable group by col2").collect()
hc.sql("select col3, min(col2) from cacheTable group by col3").collect()

# hc.uncacheTable("cacheTable")

sc.stop()
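# Equivalent for reference (a sketch, not in the original; run before sc.stop()):
# the filtered result can also be cached directly on the DataFrame instead of
# going through the registered table name.
cacheRDD.cache()
cacheRDD.count()   # an action, so the cache is actually materialized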
sc = SparkContext(conf=conf)
# Create the SQL context
hiveCtx = HiveContext(sc)

# Read a JSON file with Spark SQL
rows = hiveCtx.jsonFile('file:///usr/local/test_data/json')
rows.registerTempTable('rows')
result = hiveCtx.sql("select * from rows")
result.first()
result_data = result.map(lambda x: x.data)  # extract the data field
result_data.collect()
result.printSchema()  # print the schema
# Cache the table
hiveCtx.cacheTable('rows')

# Read from a Hive database
score_data = hiveCtx.sql('select name,score from testdb.score')
score = score_data.map(lambda x: x[1])
score.collect()

# Read a Parquet file
parquet_data = hiveCtx.parquetFile('hdfs://192.168.0.104:9000/users')
parquet_data.first()
gender = parquet_data.map(lambda x: x.gender)
gender.collect()
parquet_data.registerTempTable('users')
male_data = hiveCtx.sql("select * from users where gender='male'")
male_data.collect()
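# Sketch (not part of the original snippet): jsonFile and parquetFile are the
# pre-1.4 reader names; the read interface supersedes them for the same paths:
rows = hiveCtx.read.json('file:///usr/local/test_data/json')
parquet_data = hiveCtx.read.parquet('hdfs://192.168.0.104:9000/users')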
import os

from pyspark.sql import HiveContext, SQLContext
from pyspark.sql.functions import lit


class SparkEngine(object):
    def __init__(self, sc, debug=False):
        self.export_path = os.environ['COOPERHEWITT_ROOT'] + "/export/"
        self.sc = sc
        # hive requires writable permissions: ~/ephemeral-hdfs/bin/hadoop fs -chmod 777 /tmp/hive
        self.hive_cxt = HiveContext(sc)
        self.sql_cxt = SQLContext(sc)
        if debug:
            print "{0}\n{1}\n{2}\n".format(sc.master, self.hive_cxt, self.sql_cxt)
            print sc._conf.getAll()
        # TBD destructor Unpersist memory

    ### functionality to query and create tables
    def _create_df_table(self, schema, frame, name):
        if schema:
            df = self.hive_cxt.createDataFrame(frame, schema=schema)
        else:
            df = self.hive_cxt.createDataFrame(frame)
        df.printSchema()
        df.registerTempTable(name)
        self.hive_cxt.cacheTable(name)
        return df

    def _query_temporal_data(self):
        # step 1. create main temporal table
        # n_obs => first join causes for each pen entry * num location entries existent (dependent on time period)
        samples_temporal_tb = self.hive_cxt.sql("""
            SELECT s.refers_to_object_id, created, visit_raw,
                   room_floor, room_id, room_name,
                   spot_id, spot_name, spot_description,
                   room_count_objects, room_count_spots, spot_count_objects,
                   abs(datediff(
                       from_utc_timestamp(from_unixtime(created, "yyyy-MM-dd"), 'US/Eastern'),
                       from_utc_timestamp(from_unixtime(visit_raw, "yyyy-MM-dd"), 'US/Eastern')
                   )) as delta
            FROM samples s
            JOIN temporal t
              ON s.refers_to_object_id = t.refers_to_object_id
            ORDER BY s.refers_to_object_id, created, delta
        """)
        samples_temporal_tb.registerTempTable('samplestemporal')
        self.hive_cxt.cacheTable('samplestemporal')
        return samples_temporal_tb

    def _minimize_query(self):
        # From the temporal table, we need to minimize the location (multiple locations)
        # to the appropriate sample timestamp
        tb_samples = self.hive_cxt.sql("""
            SELECT *
            FROM (
                SELECT *,
                       MIN(delta) OVER (PARTITION BY refers_to_object_id, created) AS min_delta,
                       row_number() OVER (PARTITION BY refers_to_object_id, created) AS ranks
                FROM samplestemporal st
                ORDER BY refers_to_object_id
            ) query
            WHERE query.ranks = 1
        """)
        tb_samples = tb_samples.withColumn("meta_store", lit(1))
        tb_samples.registerTempTable('minimizedsamples')
        self.hive_cxt.cacheTable('minimizedsamples')
        return tb_samples

    def execute_query(self,
                      (samples_schema, samples_frame, samples_name),
                      (temporal_schema, temporal_frame, temporal_name),
                      cols):
        self.df_samples = self._create_df_table(samples_schema, samples_frame, samples_name)
        self.df_temporal = self._create_df_table(temporal_schema, temporal_frame, temporal_name)
        self.tb_meta = self._query_temporal_data()
        self.tb_meta_min = self._minimize_query()
        # combine to the original pen data (meta_store indicates if we had object data to integrate)
        self.df_samplesmeta = self.df_samples.join(
            self.tb_meta_min, ['refers_to_object_id', 'created'], "left_outer")
        self.df_samplesmeta = self.df_samplesmeta.fillna({'meta_store': 0})
        self.df_samplesmeta.printSchema()
        # pickle file to pandas: alternatively we can store as a json or parquet columnar format
        dropped_cols = ['delta', 'min_delta', 'ranks'] + cols
        samplesmeta_pd = self.df_samplesmeta.toPandas()
        samplesmeta_pd = samplesmeta_pd.drop(dropped_cols, axis=1)
        samplesmeta_pd.to_pickle(self.export_path + "penmeta_spark.pkl")
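# Sketch (assumption, addressing the "TBD destructor Unpersist memory" note
# above): an explicit cleanup step that uncaches the temp tables this class
# creates. `engine` stands for a SparkEngine instance.
for name in ('samplestemporal', 'minimizedsamples'):
    engine.hive_cxt.uncacheTable(name)
engine.hive_cxt.clearCache()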
from datetime import datetime

from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("domain", StringType(), False),
    StructField("range", StringType(), False)
])

edges = sqlCtx.createDataFrame(tc, schema)
tcSQL = sqlCtx.createDataFrame(tc, schema)
tcSQL.cache()
print "****** schema created ********"
tcSQL.printSchema()

orgCount = tcSQL.count()

edges.registerAsTable("edges")
sqlCtx.cacheTable("edges")

print "******* Loop Start ********"
loop_start = datetime.now()
iteration = 0
oldCount = 0L
nextCount = tcSQL.count()
while True:
    iteration = iteration + 1
    print "****** Start iteration %i ******" % iteration
    oldCount = nextCount