import uuid

import pandas as pd
from pyspark.sql import HiveContext


def read_csv(sc, file_name, sep=",", storage="hive://", header=True,
             names=None, table_name=None, infer_limit=10000):
    """Load a local CSV file into a Hive table (optionally backed by Parquet),
    inferring the column types from a pandas sample of the first rows."""
    # Hive table identifiers cannot contain hyphens, so strip them from the UUID.
    table_name = table_name if table_name is not None else "df" + str(uuid.uuid4()).replace("-", "_")
    hc = HiveContext(sc)

    # Sample the file with pandas to infer the Hive column types.
    df = pd.read_csv(file_name, sep=sep, nrows=infer_limit)
    names = df.columns if not names else names
    types = []
    for i in range(len(names)):
        tp = names[i] + " "
        if df.dtypes[i] == "O":
            tp += "STRING"
        elif df.dtypes[i] == "int64":
            tp += "INT"
        else:
            tp += "DOUBLE"
        types.append(tp)

    hc.sql('drop table if exists %s' % table_name)
    qw = """CREATE TABLE IF NOT EXISTS %s (%s)
            row format delimited fields terminated by '%s'
            LINES TERMINATED BY '\n'""" % (table_name, ','.join(types), sep)
    if header:
        qw += " tblproperties ('skip.header.line.count'='1')"
    hc.sql(qw)
    hc.sql("LOAD DATA LOCAL INPATH '%s' OVERWRITE INTO TABLE %s" % (file_name, table_name))

    rdd = hc.sql("SELECT * FROM %s" % table_name)
    ctx = hc
    if storage.startswith("parquet://"):
        # Persist the table as Parquet and re-register it from the Parquet copy.
        path = storage.replace("parquet://", "")
        rdd.saveAsParquetFile("%s/%s" % (path, table_name))
        sq = HiveContext(sc)
        rdd = sq.parquetFile("%s/%s" % (path, table_name))
        rdd.registerTempTable(table_name)
        rdd = sq.sql("select * from %s" % table_name)
        ctx = sq
    # DataFrame here is the enclosing library's own wrapper class, not pyspark's.
    return DataFrame(ctx, table_name, data=rdd, columns=names, dtype=types)
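# A minimal usage sketch for read_csv above. The SparkContext created here, the
# local file "people.csv" and the Parquet target directory are assumptions for
# illustration; none of them appear in the original snippet.
from pyspark import SparkContext

sc = SparkContext(appName="read_csv_example")
people = read_csv(sc, "people.csv", sep=",", header=True,
                  storage="parquet:///tmp/csv_tables")
# The loaded data is also registered as a Hive table under the generated table
# name, so it can be queried with HiveContext(sc).sql() as well.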
# Query the tweets table registered earlier; hiveCtx is an existing HiveContext.
topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM
    tweets ORDER BY retweetCount LIMIT 10""")
# Access the text column of the topTweets SchemaRDD in Python
topTweetText = topTweets.map(lambda row: row.text)

# Reading from Hive with Python
from pyspark.sql import HiveContext

hiveCtx = HiveContext(sc)
rows = hiveCtx.sql("SELECT key, value FROM mytable")
keys = rows.map(lambda row: row[0])

# Reading Parquet data in Python
# Load data from a Parquet file with name and favouriteAnimal fields
rows = hiveCtx.parquetFile(parquetFile)
names = rows.map(lambda row: row.name)
print "Everyone"
print names.collect()

# Querying Parquet data in Python
# Find the panda lovers
tbl = rows.registerTempTable("people")
pandaFriends = hiveCtx.sql("SELECT name FROM people WHERE favouriteAnimal = \"panda\"")
print "Panda friends"
print pandaFriends.map(lambda row: row.name).collect()

# Saving a Parquet file
pandaFriends.saveAsParquetFile("hdfs://...")
print "Starting.", datetime.now() sconf = SparkConf().set("spark.buffer.pageSize", 1024*1024).setAppName("FanDuelGame") sc = SparkContext(conf=sconf) sqlContext = HiveContext(sc) rddDir = CreateStatsRDD.rddDir (filename, dataDirectory, gameDescription, actualModel)=getCommandLine() print "start: ", datetime.now() game = FanDuelGame(sqlContext, filename, dataDirectory, gameDescription) eligiblePlayers = game.getEligiblePlayers() print "eligiblePlayers=", eligiblePlayers print "gameDate=", game.gameDate # get MLB.com players gamePlayers = sqlContext.parquetFile(rddDir + "/" + "game_players.parquet") gamePlayers.registerTempTable("game_players") gamePlayers.cache() ldf = sqlContext.sql("select distinct lookup_name, player_id from game_players where '" + str(game.gameDate) + "' >= effective_start_dt and '" + str(game.gameDate) + "' < effective_stop_dt").collect() print "ldf=", ldf pids = {} for row in ldf: x = row.asDict() pids[x['lookup_name'].upper()] = x['player_id'] print "pids=", pids with open(rddDir + "batting_encoded.json", 'r') as f: encoded = json.load(f) encodedPlayerIds = encoded['player_id'] decodedHitterPlayerIds = dict(zip(encodedPlayerIds.values(), encodedPlayerIds.keys()))
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType

if __name__ == "__main__":
    sc = SparkContext(appName="SparkSQL:[com.mvad.spark.demo][pysparkdemo]")
    sqlContext = HiveContext(sc)

    # DataFrame is created from the session-log Parquet files
    df = sqlContext.parquetFile("/mvad/warehouse/session/dspan/date=2015-05-01/")
    df.registerTempTable("sessionlog")
    for table in sqlContext.tableNames():
        print table
    df.printSchema()

    # UDF that joins an integer array into a single string
    sqlContext.udf.register("intarr2str", lambda array: "".join(map(str, array)))

    sql1 = """ select intarr2str(cookie) as cookiestr, eventTime, eventType,
               geoInfo.country as country, geoInfo.province as province
               from sessionlog limit 10 """.replace('\n', ' ')
    sample = sqlContext.sql(sql1)
    sample.show()

    sql2 = """select eventType, count(cookie) as count
              from sessionlog group by eventType """.replace('\n', ' ')
    result = sqlContext.sql(sql2)
    result.cache()
    # only show 20 records
    result.show()
appName = "NetflowReplication:QA" conf = SparkConf().setAppName(appName) conf.setExecutorEnv( 'PYTHONPATH', '/opt/spark/python:/opt/spark/python/lib/py4j-0.8.2.1-src.zip') conf.set("spark.driver.maxResultSize", "2g") sc = SparkContext(conf=conf) sqlContext = HiveContext(sc) if len(sys.argv) < 3: print "Usage: /opt/spark/bin/spark-submit " + sys.argv[ 0] + " <netflow input path> <file with list of IP addresses to filter> <output filtered netflow text directory>" sys.exit() path = sys.argv[1] input_ip = sys.argv[2] output = sys.argv[3] list = [] for line in open(input_ip): line = line.strip('\n') line = sum( [int(i) * 2**(8 * j) for i, j in zip(line.split('.'), [3, 2, 1, 0])]) list.append(line) print list df = sqlContext.parquetFile(path) df.count() df_filtered = df.where(col("IPV4_SRC_ADDR").isin(list)) df_filtered.rdd.map(lambda row: [str(c) for c in row]).saveAsTextFile(output)
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division, unicode_literals

import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, IntegerType

if __name__ == '__main__':
    conf = SparkConf().setAppName('Restaurants Parquet')
    sc = SparkContext(conf=conf)
    hive_ctx = HiveContext(sc)

    inputs = hive_ctx.parquetFile(sys.argv[1])
    inputs.registerTempTable('restaurants')
    hive_ctx.registerFunction("LEN", lambda s: len(s), IntegerType())

    print('### Schema ###')
    inputs.printSchema()
    print()

    print('### Restaurants in Tokyo ###')
    restaurants_in_tokyo = hive_ctx.sql("""
        SELECT r.id, r.alphabet FROM restaurants r WHERE
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

sc = SparkContext('yarn-cluster')
hc = HiveContext(sc)

parquetFile = hc.parquetFile("/czdataset/weather/weather_data/000000_0")
parquetFile.registerTempTable("weatherStation")
stations = hc.sql("SELECT wban_number, yearmonthday, cz_year, cz_month, dayofmonth "
                  "FROM weatherStation")
stations.write.parquet("/user/sachin/output/hive-spark")
from pyspark.sql import DataFrame

# sqlContext is assumed to be an existing HiveContext/SQLContext created
# elsewhere in the job; it is not defined in the original fragment.

path = '/data/collector/xyz/2016/01/01/00/'
binInterval = 5
startEpoch = 1481220000
sourceEpoch = 1451606400
outpath = '/data/replicated_data/'
no_of_hours = 3


def unionAll(dfs):
    return reduce(DataFrame.unionAll, dfs)


# Read one DataFrame per bin-interval subdirectory of the source hour.
interval_dfs = []
for i in range(0, 60, binInterval):
    i = "{:0>2}".format(i)
    df = sqlContext.parquetFile(path + str(i) + '/*')
    interval_dfs.append(df)

# Replicate the source hour forward, shifting FIRST_SWITCHED by the offset
# between the target epoch and the source epoch.
while no_of_hours > 0:
    OffSet = int(startEpoch) - sourceEpoch
    count = 0
    for i in interval_dfs:
        count = "{:0>2}".format(count)
        df = i.withColumn('FIRST_SWITCHED', i.FIRST_SWITCHED + OffSet)
        dfs = [[df]]
        dfs = [y for x in dfs for y in x]
        df_final = unionAll(dfs)
        df_final.coalesce(8).write.parquet(outpath + str(startEpoch) + '/' + str(count))
        count = int(count) + binInterval
    startEpoch = int(startEpoch) + 3600
    no_of_hours -= 1  # advance to the next hour so the loop terminates
result = hiveCtx.sql("select * from rows") result.first() result_data = result.map(lambda x: x.data) #获取data字段 result_data.collect() result.printSchema() #输出结构信息 #数据缓存 hiveCtx.cacheTable('rows') #读取hive数据库的数据 score_data = hiveCtx.sql('select name,score from testdb.score') score = score_data.map(lambda x: x[1]) score.collect() #读取parquet文件 parquet_data = hiveCtx.parquetFile('hdfs://192.168.0.104:9000/users') parquet_data.first() gender = parquet_data.map(lambda x: x.gender) gender.collect() parquet_data.registerTempTable('users') male_data = hiveCtx.sql("select * from users where gender='male'") male_data.collect() #将RDD转化为SchemaRDD happyPeopleRDD = sc.parallelize([Row(name='lin', age=25)]) happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD) happyPeopleSchemaRDD.registerTempTable('happyPeople') result = hiveCtx.sql('select name from happyPeople') result.collect() #用户自定义函数
from pyspark import SparkContext from pyspark.sql import HiveContext from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType if __name__ == "__main__": sc = SparkContext(appName="SparkSQL:[demo][pysparkdemo]") sqlContext = HiveContext(sc) # RDD is created from a list of rows df = sqlContext.parquetFile("/mvad/warehouse/session/dspan/date=2015-05-01/") df.registerTempTable("sessionlog") for table in sqlContext.tableNames(): print table df.printSchema() sqlContext.udf.register("intarr2str",lambda array:"".join(map(str,array)) ) sql1 = """ select intarr2str(cookie) as cookiestr,eventTime,eventType,geoInfo.country as country, geoInfo.province as province from sessionlog limit 10 """.replace('\n',' ') sample = sqlContext.sql(sql1) sample.show() sql2 = """select eventType, count(cookie) as count from sessionlog group by eventType """.replace('\n',' ') result = sqlContext.sql(sql2) result.cache() # only show 20 records result.show() result.show(100)
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import col

appName = "ParquetPyspark::Filter"
conf = SparkConf().setAppName(appName)
conf.setExecutorEnv('PYTHONPATH', '/opt/spark/python:/opt/spark/python/lib/py4j-0.8.2.1-src.zip')
conf.set("spark.driver.maxResultSize", "2g")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

if len(sys.argv) < 4:
    print "Usage: /opt/spark/bin/spark-submit --master yarn --deploy-mode client " + \
        "--executor-memory 2G --num-executors 2 --total-executor-cores 2 " + sys.argv[0] + \
        " <input parquet files directory> <file with list of IP addresses to filter>" + \
        " <output filtered text directory>"
    sys.exit()

input_path = sys.argv[1]
input_ip = sys.argv[2]
output_path = sys.argv[3]

# Read the IP addresses to filter on, one per line.
ip_list = []
for line in open(input_ip).readlines():
    line = line.strip('\n')
    ip_list.append(line)

df = sqlContext.parquetFile(input_path)
df_filtered = df.where(col("IPV4_SRC_ADDR").isin(ip_list))
df_filtered.rdd.map(lambda row: [str(field) for field in row]).saveAsTextFile(output_path)