Example #1
# Read a SequenceFile
data = sc.sequenceFile(inFile,
	"org.apache.hadoop.io.Text", "org.apache.hadoop.io.IntWritable")


# Create a HiveContext and query data
from pyspark.sql import HiveContext

hiveCtx = HiveContext(sc)
rows = hiveCtx.sql("SELECT name, age FROM users")
firstRow = rows.first()
print firstRow.name

# Read JSON data with Spark SQL
tweets = hiveCtx.jsonFile("tweets.json")
tweets.registerTempTable("tweets")
results = hiveCtx.sql("SELECT user.name, text FROM tweets")


#---------------------------------------------------------------------------#
# Call-log example:
"""
{"address":"address here", "band":"40m","callsign":"KK6JLK","city":"SUNNYVALE",
"contactlat":"37.384733","contactlong":"-122.032164",
"county":"Santa Clara","dxcc":"291","fullname":"MATTHEW McPherrin",
"id":57779,"mode":"FM","mylat":"37.751952821","mylong":"-122.4208688735",...}
"""

# Accumulators aggregate information from worker tasks back to the driver.
# A common use is counting events that occur during job execution, e.g. while debugging.
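
# A short illustrative sketch (assumes an existing SparkContext `sc`; the file
# name callsigns.txt is made up): use an accumulator to count blank lines while
# processing a text file.
blankLines = sc.accumulator(0)

def extract_call_signs(line):
    global blankLines
    if line == "":
        blankLines += 1
    return line.split(" ")

callSigns = sc.textFile("callsigns.txt").flatMap(extract_call_signs)
callSigns.count()  # an action is needed before the accumulator value is populated
print "Blank lines: %d" % blankLines.value
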
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("TwitterAnalytics")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row : row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="ganguly", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("strong_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
# A simple demo for working with SparkSQL and Tweets
import sys

from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize(
        [Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql(
        "SELECT strLenPython('text') FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()

# Read a CSV file
import csv
import StringIO
data = sc.textFile('file:///usr/local/test_data/score.csv')


def load_record(line):
    """Parse one CSV line into a dict with 'name' and 'score' fields."""
    input = StringIO.StringIO(line)
    reader = csv.DictReader(input, fieldnames=['name', 'score'])
    return reader.next()


result = data.map(load_record)
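
# If a single CSV record can span multiple lines, one option is to read each
# file as a whole and parse its full contents in one go; a sketch, reusing the
# same name/score layout (the directory path is a placeholder).
def load_records(contents):
    input = StringIO.StringIO(contents)
    reader = csv.DictReader(input, fieldnames=['name', 'score'])
    return list(reader)


full_file_data = sc.wholeTextFiles('file:///usr/local/test_data/')
full_results = full_file_data.flatMap(lambda name_contents: load_records(name_contents[1]))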

# Read from HDFS
distfile = sc.textFile('hdfs://192.168.0.104:9000/test.txt')

# Read Hive data with Spark SQL
from pyspark.sql import HiveContext
hiveCtx = HiveContext(sc)
rows = hiveCtx.sql('select name,score from testdb.score')
first_row = rows.first()
print first_row.name

# Read a JSON file with Spark SQL
rows = hiveCtx.jsonFile('file:///usr/local/test_data/json')
rows.registerTempTable('rows')
result = hiveCtx.sql("select * from rows")
result.first()
top_domain_dict = {
    # ... (earlier entries truncated) ...
    u'www.backgrounds.sinaapp.com': 7,
    u'liukebin.sinaapp.com': 13
}
"""
i = 1

for domain in top_domain_list:
    top_domain_dict[domain[0]] = i

    i = i + 1

print top_domain_dict
"""

jsonRDD = hc.jsonFile(
    "hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09"
)

hc.registerRDDAsTable(jsonRDD, "temp_schema")


def if_in_top_10_domain(domain):
    """Return the domain's rank if it is in the top 10, otherwise 'no'."""
    if domain is None or domain == '' or len(domain) < 3:
        return 'no'
    if domain in top_domain_dict:
        return top_domain_dict[domain]
    return 'no'

Example #6
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, SQLContext, Row
import json

conf = SparkConf().setAppName("Task")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

hc.sql("set spark.sql.shuffle.partitions=10")

artists = hc.jsonFile("artists_en.json")
movies = hc.jsonFile("movies_en.json")

movies.registerTempTable("movies")
artists.registerTempTable("artists")

# Function to print one record, for use with foreach()
def printx(x):
    print(x)


# Question 2 solution:
movies_clean = hc.sql("select id,title,year,director,genre,country,actors from movies")

# Question 3 solution:
mUs_movies = hc.sql("select year,title from movies") \
.map(lambda row : (row.year,row.title)).groupByKey() \
.mapValues(lambda data : [title for title in data])

# Question 4 solution:
mUs_directors = hc.sql("select director,title from movies") \
.map(lambda row: (row.director, row.title)).groupByKey() \
.mapValues(lambda data: [title for title in data])  # assumed continuation, mirroring Question 3 above
Example #7
The same caveats apply to pickle files: the pickle library can be slow, and if a class
definition changes, previously written data files may no longer be readable."""
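
# A minimal sketch of writing and re-reading an RDD as pickle files; the output
# path is a placeholder.
records = sc.parallelize([{'name': 'Holden', 'score': 1}])
records.saveAsPickleFile('file:///tmp/pickle_out')
reloaded = sc.pickleFile('file:///tmp/pickle_out')
print reloaded.collect()
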
# 5.2.6 Hadoop input and output formats
# 1. Reading other Hadoop input formats (hadoopFile also needs the input format
#    class and the key/value classes; the ones below are illustrative)
input2 = sc.hadoopFile(inputFile, "org.apache.hadoop.mapred.TextInputFormat",
                       "org.apache.hadoop.io.LongWritable",
                       "org.apache.hadoop.io.Text").map(lambda kv: (str(kv[0]), str(kv[1])))
# 2. Saving in a Hadoop output format (an output format class is required;
#    TextOutputFormat here is only an example)
input2.saveAsNewAPIHadoopFile(
    inputFile, "org.apache.hadoop.mapreduce.lib.output.TextOutputFormat")
# 3. TODO
# 5.3 File systems
# 5.3.1 Local file system
rdd = sc.textFile(inputFile)
# 5.3.2 Amazon S3
# 5.3.3 HDFS: simply specify the input/output path as hdfs://master:port/path, for example:
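# (the namenode host and port below are placeholders)
hdfs_rdd = sc.textFile('hdfs://master:9000/path/to/input.txt')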
# 5.4 Structured data in Spark SQL (covered in detail in Chapter 9)
# 5.4.1 Apache Hive
"""要把 Spark SQL 连接到已有的 Hive 上,你需要提供 Hive 的配置文件。你需要将 hive-site.
xml 文件复制到 Spark 的 ./conf/ 目录下"""
hiveCtx = HiveContext(sc)
rows = hiveCtx.sql('SELECT name, age FROM users')
firstRow = rows.first()
print firstRow.name
# 5.4.2 JSON
tweets = hiveCtx.jsonFile('tweets.json')
tweets.registerTempTable('tweets')
results = hiveCtx.sql('select user.name, text from tweets')
# 5.5 Databases
# 5.5.1 JDBC: any relational database with a JDBC driver works (MySQL, PostgreSQL, etc.);
#       see the sketch after this list
# 5.5.2 Cassandra
# 5.5.3 HBase
# 5.5.4 Elasticsearch
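
# A rough sketch of reading a table over JDBC with the DataFrame reader (Spark 1.4+);
# the URL, table, column, and credentials are placeholders, and the JDBC driver jar
# must be on the classpath.
jdbc_df = hiveCtx.read.format("jdbc").options(
    url="jdbc:mysql://localhost:3306/testdb",
    dbtable="users",
    user="spark",
    password="secret").load()
jdbc_df.registerTempTable("jdbc_users")
print hiveCtx.sql("SELECT name FROM jdbc_users LIMIT 5").collect()
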
Example #8
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, SQLContext, Row
import json

conf = SparkConf().setAppName("Task")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

hc.sql("set spark.sql.shuffle.partitions=10")

artists = hc.jsonFile("artists_en.json")
movies = hc.jsonFile("movies_en.json")

movies.registerTempTable("movies")
artists.registerTempTable("artists")


# function to print data using foreach
def printx(x):
    print(x)


# Question 2 solution:
movies_clean = hc.sql(
    "select id,title,year,director,genre,country,actors from movies")

# Question 3 solution:
mUs_movies = hc.sql("select year,title from movies") \
.map(lambda row : (row.year,row.title)).groupByKey() \
.mapValues(lambda data : [title for title in data])
    u"www.backgrounds.sinaapp.com": 7,
    u"liukebin.sinaapp.com": 13,
}

"""
i = 1

for domain in top_domain_list:
    top_domain_dict[domain[0]] = i

    i = i + 1

print top_domain_dict
"""

jsonRDD = hc.jsonFile("hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09")

hc.registerRDDAsTable(jsonRDD, "temp_schema")


def if_in_top_10_domain(domain):
    """Return the domain's rank if it is in the top 10, otherwise 'no'."""
    if domain is None or domain == "" or len(domain) < 3:
        return "no"
    if domain in top_domain_dict:
        return top_domain_dict[domain]
    return "no"


hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain)
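
# The registered UDF can now be used from SQL against the table created above;
# the column name "domain" in temp_schema is an assumption.
ranked = hc.sql(
    "SELECT domain, temp_if_in_top_10_domain(domain) AS domain_rank FROM temp_schema LIMIT 10")
print ranked.collect()
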
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext('local')
print('sc:', sc)

hiveCtx = HiveContext(sc)
rows = hiveCtx.jsonFile(
    r'E:\code\Java\workspace48\SparkFastDataAnalysis\src\main\resources\data\chapter05\FileFormats\json.txt'
)
rows.registerTempTable("test")
result = hiveCtx.sql("select date,city,data.ganmao from test")
firstRow = result.first()
print(firstRow[0])  # Row has no getString(); access fields by index or by name