# Read a SequenceFile
data = sc.sequenceFile(inFile,
                       "org.apache.hadoop.io.Text",
                       "org.apache.hadoop.io.IntWritable")

# Create a HiveContext and query data
from pyspark.sql import HiveContext
hiveCtx = HiveContext(sc)
rows = hiveCtx.sql("SELECT name, age FROM users")
firstRow = rows.first()
print firstRow.name

# Read JSON data with Spark SQL
tweets = hiveCtx.jsonFile("tweets.json")
tweets.registerTempTable("tweets")
results = hiveCtx.sql("SELECT user.name, text FROM tweets")

#---------------------------------------------------------------------------#
# Sample call-log record:
"""
{"address":"address here", "band":"40m","callsign":"KK6JLK","city":"SUNNYVALE",
 "contactlat":"37.384733","contactlong":"-122.032164",
 "county":"Santa Clara","dxcc":"291","fullname":"MATTHEW McPherrin",
 "id":57779,"mode":"FM","mylat":"37.751952821","mylong":"-122.4208688735",...}
"""

# Accumulators aggregate information across tasks; a common use is counting
# events that occur during job execution, for debugging (a minimal sketch follows).
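# The note above describes accumulators without code. A minimal sketch, assuming a
# hypothetical text input path; the blank-line criterion is illustrative only.
file_rdd = sc.textFile("file:///tmp/callsigns.txt")  # hypothetical input path
blank_lines = sc.accumulator(0)                      # Accumulator[Int] starting at 0

def extract_words(line):
    global blank_lines
    if line == "":
        blank_lines += 1  # incremented on the workers
    return line.split(" ")

words = file_rdd.flatMap(extract_words)
words.count()  # an action forces evaluation; only then is the accumulator value reliable
print "Blank lines: %d" % blank_lines.value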
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("TwitterAnalytics")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)

    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")

    topTweets = hiveCtx.sql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()

    # Make a happy person row
    happyPeopleRDD = sc.parallelize(
        [Row(name="ganguly", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("strong_people")

    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    # Apply it to the text column (unquoted; 'text' in quotes is just a string literal)
    lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()

    sc.stop()
# A simple demo for working with SparkSQL and Tweets
import sys

from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)

    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")

    topTweets = hiveCtx.sql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()

    # Make a happy person row
    happyPeopleRDD = sc.parallelize(
        [Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")

    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    # Apply it to the text column (unquoted; 'text' in quotes is just a string literal)
    lengthSchemaRDD = hiveCtx.sql(
        "SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()

    sc.stop()
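# The happy_people table above is registered but never queried. A minimal, illustrative
# follow-up query (not from the original script; it would need to run before sc.stop()):
happyPeople = hiveCtx.sql("SELECT name, favouriteBeverage FROM happy_people")
print happyPeople.collect()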
'file:///usr/local/test_data/test4')

# Read a CSV file
import csv
import StringIO

data = sc.textFile('file:///usr/local/test_data/score.csv')

def LoadRecord(line):
    """Parse a single CSV line into a dict with 'name' and 'score' fields."""
    input = StringIO.StringIO(line)
    reader = csv.DictReader(input, fieldnames=['name', 'score'])
    return reader.next()

result = data.map(LoadRecord)

# Read from HDFS
distfile = sc.textFile('hdfs://192.168.0.104:9000/test.txt')

# Read Hive data with Spark SQL
from pyspark.sql import HiveContext
hiveCtx = HiveContext(sc)
rows = hiveCtx.sql('select name, score from testdb.score')
first_row = rows.first()
print first_row.name

# Read a JSON file with Spark SQL
rows = hiveCtx.jsonFile('file:///usr/local/test_data/json')
rows.registerTempTable('rows')
result = hiveCtx.sql("select * from rows")
result.first()
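# Counterpart sketch for the CSV reader above: writing records back out with csv.DictWriter.
# The output path is hypothetical; csv, StringIO, data, and LoadRecord come from the snippet above.
def WriteRecords(records):
    """Serialize an iterator of {'name': ..., 'score': ...} dicts into CSV text."""
    output = StringIO.StringIO()
    writer = csv.DictWriter(output, fieldnames=['name', 'score'])
    for record in records:
        writer.writerow(record)
    return [output.getvalue()]

csv_records = data.map(LoadRecord)
csv_records.mapPartitions(WriteRecords).saveAsTextFile('file:///usr/local/test_data/score_out')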
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, SQLContext, Row
import json

conf = SparkConf().setAppName("Task")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
hc.sql("set spark.sql.shuffle.partitions=10")

artists = hc.jsonFile("artists_en.json")
movies = hc.jsonFile("movies_en.json")
movies.registerTempTable("movies")
artists.registerTempTable("artists")

# function to print data using foreach
def printx(x):
    print(x)

# Question 2 solution:
movies_clean = hc.sql(
    "select id,title,year,director,genre,country,actors from movies")

# Question 3 solution:
mUs_movies = hc.sql("select year,title from movies") \
    .map(lambda row: (row.year, row.title)).groupByKey() \
    .mapValues(lambda data: [title for title in data])

# Question 4 solution:
mUs_directors = hc.sql("select director,title from movies") \
The same caveats apply to pickle files: the pickle library can be slow, and after a
class definition changes, data files that were already written may no longer be readable."""

# 5.2.6 Hadoop input and output formats
# 1. Reading other Hadoop input formats (KeyValueTextInputFormat shown as an example)
input2 = sc.hadoopFile(inputFile,
                       "org.apache.hadoop.mapred.KeyValueTextInputFormat",
                       "org.apache.hadoop.io.Text",
                       "org.apache.hadoop.io.Text").map(lambda (x, y): (str(x), str(y)))
# 2. Saving to a Hadoop output format (SequenceFileOutputFormat shown as an example)
input2.saveAsNewAPIHadoopFile(
    inputFile,
    "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat",
    "org.apache.hadoop.io.Text",
    "org.apache.hadoop.io.Text")
# 3. to do

# 5.3 File systems
# 5.3.1 Local file system
rdd = sc.textFile(inputFile)
# 5.3.2 Amazon S3
# 5.3.3 HDFS: just point the input/output path at hdfs://master:port/path

# 5.4 Structured data with Spark SQL (see Chapter 9 for details)
# 5.4.1 Apache Hive
"""To connect Spark SQL to an existing Hive installation, you need to provide Hive's
configuration: copy the hive-site.xml file into Spark's ./conf/ directory."""
hiveCtx = HiveContext(sc)
rows = hiveCtx.sql('SELECT name, age FROM users')
firstRow = rows.first()
print firstRow.name

# 5.4.2 JSON
tweets = hiveCtx.jsonFile('tweets.json')
tweets.registerTempTable('tweets')
results = hiveCtx.sql('select user.name, text from tweets')

# 5.5 Databases
# 5.5.1 JDBC: any relational database with a JDBC driver works (MySQL, PostgreSQL, ...)
# 5.5.2 Cassandra
# 5.5.3 HBase (a sketch using the Hadoop input-format API follows)
# 5.5.4 Elasticsearch
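# Sections 5.5.3/5.5.4 above have no code; below is a minimal, illustrative sketch of
# reading from HBase through sc.newAPIHadoopRDD. The ZooKeeper host, table name, and the
# converter classes (shipped with the Spark examples jar) are assumptions, not from these notes.
hbase_conf = {
    "hbase.zookeeper.quorum": "localhost",       # assumed ZooKeeper quorum
    "hbase.mapreduce.inputtable": "tablename",   # assumed HBase table
}
hbase_rdd = sc.newAPIHadoopRDD(
    "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
    "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "org.apache.hadoop.hbase.client.Result",
    keyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
    valueConverter="org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter",
    conf=hbase_conf)
print hbase_rdd.take(1)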
u"www.backgrounds.sinaapp.com": 7, u"liukebin.sinaapp.com": 13, } """ i = 1 for domain in top_domain_list: top_domain_dict[domain[0]] = i i = i + 1 print top_domain_dict """ jsonRDD = hc.jsonFile("hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09") hc.registerRDDAsTable(jsonRDD, "temp_schema") def if_in_top_10_domain(domain): if domain == "" or domain == None or len(domain) < 3: return "no" else: if top_domain_dict.has_key(domain): return top_domain_dict[domain] else: return "no" hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain)
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext('local')
print('sc:', sc)
hiveCtx = HiveContext(sc)
rows = hiveCtx.jsonFile(
    r'E:\code\Java\workspace48\SparkFastDataAnalysis\src\main\resources\data\chapter05\FileFormats\json.txt'
)
rows.registerTempTable("test")
result = hiveCtx.sql("select date,city,data.ganmao from test")
firstRow = result.first()
# Python Row objects are accessed by field name or index (getString is the Scala/Java API)
print(firstRow[0])
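# Optional: inspect the nested schema that jsonFile inferred for this file
# (printSchema is part of the SchemaRDD/DataFrame API; the output depends on the data).
rows.printSchema()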