from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("TwitterAnalytics")
    sc = SparkContext(conf=conf)  # pass conf so the app name takes effect
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    # DESC so the most-retweeted tweets come first
    topTweets = hiveCtx.sql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount DESC LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize(
        [Row(name="ganguly", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("strong_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    # unquoted column reference; 'text' in quotes would measure the literal string
    lengthSchemaRDD = hiveCtx.sql(
        "SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
"""
Alternative approach kept for reference: build Rows from dicts on the
driver side and let inferSchema derive the table schema.

def convert(mydict):
    return Row(**mydict)

convertRDD = hc.sql(
    "select col1, col2, col3 from temp_source").map(convert)
mytable = hc.inferSchema(convertRDD)
mytable.registerTempTable("temp_mytable")
"""

def convert(val):
    return val.upper()

hc.registerFunction("temp_convert", convert)
convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")
# registerTempTable is the current name; registerAsTable is its deprecated alias
convertRDD.registerTempTable("temp_mytable")
hc.cacheTable("temp_mytable")

def printRows(rows):
    for row in rows:
        print row

datas = hc.sql("select * from temp_mytable").collect()
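# A minimal follow-on sketch: once the cached table is no longer needed,
# the cache can be released explicitly; "temp_mytable" matches the table
# cached above.
hc.uncacheTable("temp_mytable")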
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import StringType

conf = SparkConf().setAppName("spark_sql_udf")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
lines = sc.parallelize(["a", "b", "c"])
people = lines.map(lambda value: Row(name=value))
peopleSchema = hc.inferSchema(people)
peopleSchema.registerTempTable("people")

def myfunc(value):
    return value.upper()

hc.registerFunction("myfunc", myfunc, StringType())
# Row is a subclass of tuple, so this filter keeps every row
rows = hc.sql("select myfunc(name) from people").rdd.filter(
    lambda row: isinstance(row, tuple)).collect()
sc.stop()

for row in rows:
    print row, type(row[0])
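# Side note, as a sketch (assuming the 1.x API default): registerFunction's
# returnType parameter defaults to StringType, so for this string-returning
# UDF the explicit type could be omitted with the same result. It would run
# before sc.stop() above:
#
#     hc.registerFunction("myfunc_default", myfunc)
#     hc.sql("select myfunc_default(name) from people").collect()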
table = hc.applySchema(rows, schema)
table.registerTempTable("temp_table")

def parseCDN(video_cdn):
    # extract the value of the "s=" parameter, e.g. "a=1,s=cdn1,b=2" -> "cdn1"
    if not video_cdn:
        return ""
    words = video_cdn.split("s=")
    if len(words) >= 2:
        return words[1].split(",")[0]
    return ""

hc.registerFunction("parseCDN", parseCDN)

def cal_buffer_num(durations):
    # renamed from `set`/`list` to avoid shadowing the built-ins
    buffer_count = 0
    buffer_t_sum = 0
    buffer_smaller_500ms_count = 0
    buffer_bigger_2min_count = 0
    if durations is not None:
        for s in durations:
            if 500 <= s <= 120000:
                buffer_count += 1
                # assumed completion: the remaining counters follow their names
                buffer_t_sum += s
            elif s < 500:
                buffer_smaller_500ms_count += 1
            else:  # s > 120000, i.e. longer than 2 minutes
                buffer_bigger_2min_count += 1
    return (buffer_count, buffer_t_sum,
            buffer_smaller_500ms_count, buffer_bigger_2min_count)
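# Quick driver-side sanity checks for parseCDN; the sample strings below are
# invented for illustration.
assert parseCDN("t=3,s=cdn_a,r=7") == "cdn_a"
assert parseCDN("no_cdn_param_here") == ""
assert parseCDN(None) == ""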
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

def split_idc(idc):
    # keep only the first two dot-separated components, e.g. "a.b.c" -> "a.b"
    if idc is None or idc == '' or not isinstance(idc, basestring):
        return ''
    words = idc.split('.')
    if len(words) >= 2:
        return words[0] + '.' + words[1]
    return ''

hc.registerFunction("temp_split_idc", split_idc)

# --------------------------2.0 RDD-----------------------
spark_sql = '''
select '1' as job_date, cdn, province, isp, ua, idc, play_process_group,
       version, init_timetag, buffer_count,
       sum(sum_play_process) as sum_play_process,
       sum(sum_video_init_duration) as sum_video_init_duration,
       sum(sum_buffer_t_sum) as sum_buffer_t_sum,
       sum(num) as num
from (
    select cdn, province, isp, ua, play_process_group, version, init_timetag,
           buffer_count, sum_play_process, sum_video_init_duration,
           sum_buffer_t_sum, num,
           temp_split_idc(idc) as idc
    from datacubic.app_picserversweibof6vwt_wapvideodownload
    where log_dir = '20151012110000' and version >= '5.4.5'
    limit 10
) a
group by cdn, province, isp, ua, idc, play_process_group, version,
         init_timetag, buffer_count'''
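# Driver-side sanity checks for split_idc; the sample idc strings are
# invented for illustration.
assert split_idc("bj.node01.example") == "bj.node01"
assert split_idc("single") == ""
assert split_idc(None) == ""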
from __future__ import absolute_import, print_function, division, unicode_literals

import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import IntegerType

if __name__ == '__main__':
    conf = SparkConf().setAppName('Restaurants Parquet')
    sc = SparkContext(conf=conf)
    hive_ctx = HiveContext(sc)

    inputs = hive_ctx.parquetFile(sys.argv[1])
    inputs.registerTempTable('restaurants')
    hive_ctx.registerFunction("LEN", lambda s: len(s), IntegerType())

    print('### Schema ###')
    inputs.printSchema()
    print()

    print('### Restaurants in Tokyo ###')
    restaurants_in_tokyo = hive_ctx.sql("""
        SELECT r.id, r.alphabet
        FROM restaurants r
        WHERE r.pref_id = '13' AND r.alphabet <> ''
    """)
    # show a sample of the matching rows
    for row in restaurants_in_tokyo.take(10):
        print(row)
    sc.stop()
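# The LEN UDF registered above is never exercised in the fragment; a minimal
# sketch of how it could be used, assuming the same restaurants table and
# column names (it would run before sc.stop()):
#
#     longest = hive_ctx.sql(
#         "SELECT r.alphabet, LEN(r.alphabet) AS name_len "
#         "FROM restaurants r WHERE r.alphabet <> '' "
#         "ORDER BY name_len DESC LIMIT 5")
#     for row in longest.collect():
#         print(row)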
jsonRDD = hc.jsonFile("hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09")
hc.registerRDDAsTable(jsonRDD, "temp_schema")

def if_in_top_10_domain(domain):
    # top_domain_dict is assumed to be a dict defined earlier on the driver
    if domain is None or domain == "" or len(domain) < 3:
        return "no"
    if domain in top_domain_dict:
        return top_domain_dict[domain]
    return "no"

hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain)

spark_sql = """select domain, url, cast(sum(body_bytes_sent) as bigint) as flow
from (
    select domain,
           split(request, '\\\\?')[0] as url,
           body_bytes_sent
    from temp_schema
    where body_bytes_sent > 0 and temp_if_in_top_10_domain(domain) != 'no'
) A
group by domain, url
limit 100
"""

rows_temp = hc.sql(spark_sql).map(lambda row: (
    (row.domain, if_in_top_10_domain(row.domain), row.url, row.flow), None))

def partitionFunc(key):
    # the original body is truncated here; hashing the domain component of
    # the key is a plausible minimal stand-in
    return hash(key[0])
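# A minimal sketch of how the keyed RDD and partitionFunc could be combined,
# so that records for the same domain land in the same partition; the
# partition count (10) is an arbitrary assumption.
partitioned = rows_temp.partitionBy(10, partitionFunc)
print partitioned.getNumPartitions()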
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: spark_sql_hive <hive input file>")
        exit(-1)
    path = sys.argv[1]

    conf = SparkConf().setAppName("spark_sql_hive")
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)
    # create the table
    hc.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
    # load the data
    hc.sql("LOAD DATA INPATH '%s' INTO TABLE src" % path)
    # register the UDF
    hc.registerFunction("myfunc", lambda name: name.upper())
    rows = hc.sql("select key, myfunc(value) from src").take(5)
    for row in rows:
        print row
    sc.stop()
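# Optional cleanup sketch, assuming repeated runs should not keep appending
# rows via LOAD DATA; it would go before the CREATE TABLE statement above:
#
#     hc.sql("DROP TABLE IF EXISTS src")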
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)  # pass conf so the app name takes effect
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    # DESC so the most-retweeted tweets come first
    topTweets = hiveCtx.sql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount DESC LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize(
        [Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    # unquoted column reference; 'text' in quotes would measure the literal string
    lengthSchemaRDD = hiveCtx.sql(
        "SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
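# For reference, a minimal sketch of the same UDF flow on Spark 2.x+, where
# SparkSession replaces HiveContext and the SchemaRDD APIs; it reuses the
# inputFile path from above.
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("SparkSQLTwitter").getOrCreate()
tweets = spark.read.json(inputFile)
tweets.createOrReplaceTempView("tweets")
spark.udf.register("strLenPython", lambda x: len(x), IntegerType())
print(spark.sql("SELECT strLenPython(text) FROM tweets LIMIT 10").collect())
spark.stop()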
from pyspark.sql import HiveContext
from pyspark.sql.types import (StructType, StructField, StringType,
                               IntegerType, ArrayType)

# sc is assumed to be an existing SparkContext
hc = HiveContext(sc)
source = sc.parallelize([("value",)])
schema = StructType([StructField("col", StringType(), False)])
table = hc.applySchema(source, schema)
table.registerTempTable("temp_table")

def func_string():
    return "abc"

hc.registerFunction("func_string", func_string)
rows = hc.sql("select func_string() from temp_table").collect()

def func_int():
    return 123

hc.registerFunction("func_int", func_int, IntegerType())
rows = hc.sql("select func_int() from temp_table").collect()

def func_array():
    # a list or tuple maps to an ArrayType column
    return [1, 2, 3]

# assumed completion of the truncated fragment, following the pattern above
hc.registerFunction("func_array", func_array, ArrayType(IntegerType()))
rows = hc.sql("select func_array() from temp_table").collect()
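# Continuing the same pattern, a sketch for a dict-returning UDF; MapType is
# the corresponding SQL type, with the key and value types assumed here.
from pyspark.sql.types import MapType

def func_map():
    return {"a": 1, "b": 2}

hc.registerFunction("func_map", func_map, MapType(StringType(), IntegerType()))
rows = hc.sql("select func_map() from temp_table").collect()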