# imports
from pyspark import SparkContext
from pyspark.sql import HiveContext

# start Spark and Hive SQL contexts
sc = SparkContext("local", "demo app")
hc = HiveContext(sc)

# list the tables registered in the Hive metastore
print("Printing tables in DB:")
print(hc.tableNames())

# query a sample of rows from one table
print("Printing first 10 rows in zip_neighborhood_borough_xref table.")
sqlQuery = "SELECT * FROM zip_neighborhood_borough_xref LIMIT 10"
hc.sql(sqlQuery).show()
# Walks through the main HiveContext APIs one by one.
from pyspark.sql import HiveContext


def sql_hive_context_example(spark):
    # create a HiveContext from the existing SparkContext
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1, 10, 2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1, 10, 2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable
    # newSession
    # registerFunction
    #   Deprecated in 2.3.0. Use :func:`spark.udf.register` instead.
    # registerJavaFunction
    #   Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead.

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    #   Exception: An error occurred while calling o26.refreshTable:
    #   Method refreshTable([class java.lang.String]) does not exist

    print("Finish running HiveContext API")
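# The deprecation notes above point at the SparkSession-based API that replaces
# HiveContext in Spark 2.x. Below is a minimal sketch of the equivalent flow,
# assuming Spark 2.0+ with Hive support on the classpath; the "shout" UDF and
# the table/column names are illustrative only, not part of the examples above.
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("hive context replacement sketch") \
    .enableHiveSupport() \
    .getOrCreate()

# createOrReplaceTempView replaces registerDataFrameAsTable
df = spark.createDataFrame([('Alice', 18), ('Bob', 20)], ('name', 'age'))
df.createOrReplaceTempView("table1")

# spark.udf.register replaces the deprecated registerFunction
spark.udf.register("shout", lambda s: s.upper())
spark.sql("select shout(name) as name, age from table1").show()

# catalog calls replace tableNames / cacheTable / dropTempTable
print([t.name for t in spark.catalog.listTables()])
spark.catalog.cacheTable("table1")
spark.catalog.uncacheTable("table1")
spark.catalog.dropTempView("table1")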
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from udf.pyspark.udfs import *

if __name__ == "__main__":
    sc = SparkContext(appName="SparkSQL:[demo][pysparkdemo]")
    sqlContext = HiveContext(sc)

    # DataFrame is created from the parquet session log and registered as a temp table
    df = sqlContext.read.parquet("/mvad/warehouse/session/dspan/date=2015-09-01/")
    df.registerTempTable("sessionlog")
    for table in sqlContext.tableNames():
        print(table)
    df.printSchema()

    # register the project UDF so it can be used from SQL
    sqlContext.udf.register("toNormalCookie", toNormalCookie)

    sql1 = """select toNormalCookie(cookie) as cookiestr, eventTime, eventType,
              geoInfo.country as country, geoInfo.province as province
              from sessionlog limit 10""".replace('\n', ' ')
    sample = sqlContext.sql(sql1)
    sample.show()

    sql2 = """select eventType, count(cookie) as count
              from sessionlog group by eventType""".replace('\n', ' ')
    result = sqlContext.sql(sql2)
    result.cache()
    # only show 20 records
    result.show()
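# toNormalCookie comes from the project module udf.pyspark.udfs, which is not
# shown here. Judging from the intarr2str lambda in the next example, a
# hypothetical stand-in that turns the integer-array cookie field into a plain
# string might look like this; the real implementation may differ.
def toNormalCookie(cookie_ints):
    # join the array of ints into one string; purely illustrative
    return "".join(str(i) for i in cookie_ints)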
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType

if __name__ == "__main__":
    sc = SparkContext(appName="SparkSQL:[com.mvad.spark.demo][pysparkdemo]")
    sqlContext = HiveContext(sc)

    # DataFrame is created from the parquet session log
    # (parquetFile is the older reader; sqlContext.read.parquet is the newer form)
    df = sqlContext.parquetFile("/mvad/warehouse/session/dspan/date=2015-05-01/")
    df.registerTempTable("sessionlog")
    for table in sqlContext.tableNames():
        print(table)
    df.printSchema()

    # register an inline UDF that joins the integer-array cookie into a string
    sqlContext.udf.register("intarr2str", lambda array: "".join(map(str, array)))

    sql1 = """select intarr2str(cookie) as cookiestr, eventTime, eventType,
              geoInfo.country as country, geoInfo.province as province
              from sessionlog limit 10""".replace('\n', ' ')
    sample = sqlContext.sql(sql1)
    sample.show()

    sql2 = """select eventType, count(cookie) as count
              from sessionlog group by eventType""".replace('\n', ' ')
    result = sqlContext.sql(sql2)
    result.cache()
    # only show 20 records
    result.show()
import pyspark.sql.utils

# sqlCtx is assumed to be an existing SQLContext/HiveContext, and table_6 a
# table built in earlier steps of the pipeline.
spark_df = sqlCtx.read.format('com.databricks.spark.csv') \
    .options(header='true', inferSchema='true') \
    .load("./data/clicks_test.csv")
spark_df.registerTempTable("clicks_train")

# drop table_7 if it already exists
try:
    sqlCtx.sql("drop table table_7")
except pyspark.sql.utils.AnalysisException:
    pass
except Exception:
    pass

print("*** CREATING TABLE 7 ***")
# table 7 is train but geo location needs to be coded.
# I don't know how to code it in SQL, so I will just remove it for now.
sqlCtx.sql("create table table_7 as select a.document_id, a.platform, "
           "a.traffic_source, a.display_id, a.source_id, a.publisher_id, "
           "a.category_id, b.ad_id, a.topic_id from table_6 a "
           "inner join clicks_train b on a.display_id = b.display_id")
print("*** FINISHED CREATING TABLE 7 ***")

# create train file from table_7
train_spark_df = sqlCtx.sql("select * from table_7")
train_spark_df.write.csv('./cleaned_data/test_files_from_spark')

# TODO do test file
spark_tables = sqlCtx.tableNames()
print(spark_tables)
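# A small follow-up sketch, assuming Spark 2.x where DataFrameWriter.csv is
# available: write the same output with a header row and collapsed into a
# single part file, which is often easier for downstream tools to consume.
# The output path is illustrative only.
train_spark_df.coalesce(1) \
    .write \
    .option("header", "true") \
    .mode("overwrite") \
    .csv("./cleaned_data/train_single_file")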