Example #1
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import StringType

conf = SparkConf().setAppName("spark_sql_udf")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

lines = sc.parallelize(["a", "b", "c"])

people = lines.map(lambda value: Row(name=value))

peopleSchema = hc.inferSchema(people)

peopleSchema.registerTempTable("people")


def myfunc(value):
    return value.upper()


hc.registerFunction("myfunc", myfunc, StringType())

rows = hc.sql("select myfunc(name) from people").rdd.filter(
    lambda row: isinstance(row, tuple)).collect()

sc.stop()

for row in rows:
    print row, type(row[0])
Example #2
# Find the panda lovers
tbl = rows.registerTempTable("people")
pandaFriends = hiveCtx.sql("SELECT name FROM people WHERE favouriteAnimal = \"panda\"")
print "Panda friends"
print pandaFriends.map(lambda row: row.name).collect()

# Save the results (here written out as a text file; see the Parquet sketch below)
pandaFriends.saveAsTextFile("hdfs://...")
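# A minimal Parquet-save sketch, since the original comment mentioned Parquet:
# saveAsParquetFile is the Spark 1.x SchemaRDD/DataFrame method, and the path
# below is the same placeholder as above, not a real location.
# pandaFriends.saveAsParquetFile("hdfs://...")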


# Read JSON data with Spark SQL in Python
input = hiveCtx.jsonFile(inputFile)

# Create a SchemaRDD in Python from Row objects (named tuples)
happyPeopleRDD = sc.parallelize([Row(name = "holden", favouriteBeverage = "coffee")])
happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
happyPeopleSchemaRDD.registerTempTable("happy_people")

# ****************  Using Beeline and the JDBC/ODBC server  ****************
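# A minimal sketch, assuming the standard Spark 1.x tooling (commands are not
# from the original): start the Thrift JDBC/ODBC server with
# sbin/start-thriftserver.sh, then connect with
# bin/beeline -u jdbc:hive2://localhost:10000 and run SQL against tables
# registered in the Hive metastore.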

# User-defined functions (UDFs)

# String length UDF
# Write a UDF that computes the length of a string
hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")


# ----------------------------------------------------------------------------

Spark Streaming uses a discretized stream, called a DStream, as its abstraction.
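As a rough illustration of that abstraction, here is a minimal DStream word-count sketch; the socket host/port and the one-second batch interval are assumptions for the example, not taken from the original.

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

conf = SparkConf().setAppName("dstream_word_count_sketch")
sc = SparkContext(conf=conf)

# Each batch interval produces one RDD inside the DStream
ssc = StreamingContext(sc, 1)
lines = ssc.socketTextStream("localhost", 9999)

counts = lines.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()
ssc.awaitTermination()
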
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("spark_sql_delimiter_infer_schema")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize(["row1_col1 row1_col2 row1_col3",
                         "row2_col1 row2_col2 row2_col3", "row3_col1 row3_col2 row3_col3"])

columns = source.map(lambda line: line.split(" ")).filter(
    lambda columns: columns and len(columns) == 3)

rows = columns.map(
    lambda columns: Row(col1=columns[0], col2=columns[1], col3=columns[2]))

table = hc.inferSchema(rows)

table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row

conf = SparkConf().setAppName("spark_sql_cache_table_extend")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

dataRDD = sc.textFile(
    "/user/hdfs/rawlog/app_weibomobile03x4ts1kl_mwb_interface/"
).map(lambda line: line.split(",")).filter(lambda words: len(words) >= 3).map(
    lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))

sourceRDD = hc.inferSchema(dataRDD)

sourceRDD.registerAsTable("source")

hc.cacheTable("source")

hc.sql("select count(*) from source").collect()

hc.sql("select col2, max(col3) from source group by col2").collect()

hc.sql("select col3, min(col2) from source group by col3").collect()

# hc.uncacheTable("source")

sc.stop()
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("TwitterAnalytics")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row : row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="ganguly", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("strong_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
import re

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

# Setup mirrors the delimiter-based example above; the app name is illustrative,
# since the original fragment omitted the imports and context creation.
conf = SparkConf().setAppName("spark_sql_regex_infer_schema")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize(["row1_col1 row1_col2 row1_col3",
                         "row2_col1 row2_col2 row2_col3", "row3_col1 row3_col2 row3_col3"])

pattern = re.compile("(.*) (.*) (.*)")


def parse(line):
    matcher = pattern.match(line)

    if matcher:
        return matcher.groups()
    else:
        return None


columns = source.map(parse).filter(
    lambda columns: columns and len(columns) == 3)

rows = columns.map(
    lambda columns: Row(col1=columns[0], col2=columns[1], col3=columns[2]))

table = hc.inferSchema(rows)

table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

if __name__ == "__main__":
	# The original fragment omitted the imports and conf; the app name is illustrative
	conf = SparkConf().setAppName("avg_sales_per_day")
	sc = SparkContext(conf=conf)
	hc = HiveContext(sc)


	hc.sql("set spark.sql.shuffle.partitions = 10")

	orderMap = sc.textFile("hdfs:///user/hive/warehouse/retaildb.db/orders")\
	.map(lambda record : record.split(","))\
	.map(lambda record : Row(orderID=int(record[0]),orderDate=record[1][:11]))
	
	itemMap  = sc.textFile("hdfs:///user/hive/warehouse/retaildb.db/order_items")\
 	.map(lambda record : record.split(","))\
	.map(lambda row: (int(row[1]),float(row[4]))).reduceByKey(lambda x,y : x+y)\
	.map(lambda record : Row(orderID=int(record[0]),Total=record[1]))

	oSchema = hc.inferSchema(orderMap)
	iSchema = hc.inferSchema(itemMap)

	oSchema.registerTempTable("orders")
	iSchema.registerTempTable("items")

	avgSalesPerDay = hc.sql(" SELECT o.orderDate, avg(i.Total) as avgSales \
                            from orders o join items i \
                            where o.orderID = i.orderID \
                            group by o.orderDate \
                            order by avgSales DESC")


	avgSalesPerDay.map(lambda row : ",".join([row.orderDate,str(row.avgSales)]))\
  	.coalesce(1) \
  	.saveAsTextFile("AvgSalesPerDay2")
Example #10
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName('spark_sql_test')
sc = SparkContext(conf=conf)

#sqlContext = SQLContext(sc);
hc = HiveContext(sc)

# Parallelize a list and convert each line to a Row
# Row(id=1, name="a", age=28)
# datas -> Spark RDD source, type = str
datas = ['1 a 28', '2 b 39', '3 c 30']
source = sc.parallelize(datas)
splits = source.map(lambda line: line.split(" "))
rows = splits.map(lambda words: Row(id=int(words[0]), name=words[1], age=int(words[2])))

# Infer the schema, and register the Schema as a table
people = hc.inferSchema(rows)
people.printSchema()

# SQL can be run over SchemaRDD that have been registered as a table
people.registerTempTable("people")
results = hc.sql('select * from people where age > 28 and age < 30')
results.printSchema()

# The results of SQL queries are SchemaRDD, so register it as a table
results.registerTempTable("people2")
results2 = hc.sql('select name from people2')
results2.printSchema()

# The SchemaRDD support all the normal RDD operations
results3 = results2.map(lambda row: row.name.upper()).collect()
for result in results3:
    print result


from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("bleh")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)
sql = """
select
distinct date_category
from
ck_membership.date_driver
LIMIT 10
"""
r = sqlContext.sql(sql)
for i in r.collect():
    print(i)
r.saveAsParquetFile("hdfs://nameservice1/data/unmanaged/datascience_ck/vish/mydata")
sqlContext.sql("CREATE EXTERNAL TABLE vsubr2.some_date_driver1 (date_category String) STORED AS PARQUET LOCATION 'hdfs://nameservice1/data/unmanaged/datascience_ck/vish/mydata'")

# Scratch note from the original: the result r can also be treated like an RDD,
# e.g. r2 = sqlContext.inferSchema(r.rdd), though the schema is already known here.

# Register an external table over the saved data (createExternalTable is a method
# of the HiveContext, not of r):
sqlContext.createExternalTable('vsubr2.some_Date_driver',
                               path='hdfs://nameservice1/data/unmanaged/datascience_ck/vish/')

# Unfinished draft of an equivalent CREATE TABLE ... AS statement, left incomplete
# in the original notes:
# create table if not exists vsubr2.some_date_driver1
# LOCATION 'hdfs://nameservice1/data/unmanaged/datascience_ck/vish/'
# as ...