def sqlType(cls):
    return StructType([
        StructField("type", ByteType(), False),
        StructField("size", IntegerType(), True),
        StructField("indices", ArrayType(IntegerType(), False), True),
        StructField("values", ArrayType(DoubleType(), False), True)
    ])
def table2():
    #####################################################
    # Import/Create Second Schema
    #
    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", "England", 120),
                                  ("Jenny", "Spain", 45),
                                  ("Sarah", "Japan", 55)])
    # Schema with three fields - first name, country and average spend
    schema = StructType([
        StructField("Person_First_Name", StringType(), False),
        StructField("Person_Location_Country", StringType(), False),
        StructField("Person_Avg_Spend", IntegerType(), False)
    ])
    # Create a SchemaRDD by applying the schema to the RDD and print the schema
    another_schemardd = sqlCtx.applySchema(another_rdd, schema)
    # Print Schema on Screen
    print('Print the Second Schema - People_Details\n')
    another_schemardd.printSchema()
    #####################################################
    # Save Data Above as a Parquet File
    # SchemaRDDs can be saved as Parquet files, maintaining the schema
    # information.
    another_schemardd.saveAsParquetFile(
        "/home/dan/Desktop/People_Details.parquet")
    # Register this SchemaRDD as a table.
    return another_schemardd.registerAsTable("People_Details")
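# Hedged follow-up sketch (not part of the original snippet): assuming table2()
# above has already been called with a live sqlCtx, the saved Parquet file can
# be read back and the registered table queried with the Spark 1.x SchemaRDD
# API. The reloaded table name is illustrative only.
reloaded = sqlCtx.parquetFile("/home/dan/Desktop/People_Details.parquet")
reloaded.registerAsTable("People_Details_Reloaded")
big_spenders = sqlCtx.sql(
    "SELECT Person_First_Name, Person_Avg_Spend "
    "FROM People_Details WHERE Person_Avg_Spend > 50")
for row in big_spenders.collect():
    print row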
import os

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row, StructType, StructField, StringType, IntegerType

sc = SparkContext(appName="PythonSQL")
sqlContext = SQLContext(sc)

# RDD is created from a list of rows
some_rdd = sc.parallelize([Row(name="John", age=19),
                           Row(name="Smith", age=23),
                           Row(name="Sarah", age=18)])
# Infer schema from the first row, create a SchemaRDD and print the schema
some_schemardd = sqlContext.inferSchema(some_rdd)
some_schemardd.printSchema()
# root
#  |-- age: integer (nullable = true)
#  |-- name: string (nullable = true)

# Another RDD is created from a list of tuples
another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
# Schema with two fields - person_name and person_age
schema = StructType([StructField("person_name", StringType(), False),
                     StructField("person_age", IntegerType(), False)])
# Create a SchemaRDD by applying the schema to the RDD and print the schema
another_schemardd = sqlContext.applySchema(another_rdd, schema)
another_schemardd.printSchema()
# root
#  |-- person_name: string (nullable = false)
#  |-- person_age: integer (nullable = false)

# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files.
path = os.path.join(os.environ['SPARK_HOME'],
                    "examples/src/main/resources/people.json")
# Create a SchemaRDD from the file(s) pointed to by path
people = sqlContext.jsonFile(path)
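# Hedged extension of the example above (not part of the original fragment):
# the SchemaRDDs can be registered as temporary tables and queried with SQL.
# The table names are illustrative; sqlContext, some_schemardd and people are
# the objects created above.
some_schemardd.registerTempTable("people_inferred")
people.registerTempTable("people_json")
teenagers = sqlContext.sql(
    "SELECT name FROM people_inferred WHERE age >= 13 AND age <= 19")
for name in teenagers.map(lambda p: p.name).collect():
    print name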
             LongType: datashape.int64,
             FloatType: datashape.float32,
             DoubleType: datashape.float64,
             StringType: datashape.string,
             BinaryType: datashape.bytes_,
             BooleanType: datashape.bool_,
             TimestampType: datashape.datetime_,
             DateType: datashape.date_,
             # sql.ArrayType: ?,
             # sql.MapType: ?,
             # sql.StructType: ?
             }

dshape_to_sparksql = {
    datashape.int16: ShortType(),
    datashape.int32: IntegerType(),
    datashape.int64: LongType(),
    datashape.float32: FloatType(),
    datashape.float64: DoubleType(),
    datashape.real: DoubleType(),
    datashape.time_: TimestampType(),
    datashape.date_: DateType(),
    datashape.datetime_: TimestampType(),
    datashape.bool_: BooleanType(),
    datashape.string: StringType()
}

ooc_types |= set([SparkDataFrame, SchemaRDD])

SQLContext = memoize(SQLContext)
HiveContext = memoize(HiveContext)
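# Hedged illustration (not part of the original module): dshape_to_sparksql is
# a plain dict, so a Spark SQL schema can be assembled by looking up each
# datashape measure. The field list is illustrative, and StructType/StructField
# are assumed to be imported from pyspark.sql alongside the type classes above.
fields = [("name", datashape.string), ("amount", datashape.int64)]
schema = StructType([StructField(name, dshape_to_sparksql[measure], True)
                     for name, measure in fields])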
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, ArrayType, IntegerType

conf = SparkConf().setAppName("spark_sql_datatype_array")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([([1, 2, 3], )])

schema = StructType(
    [StructField("array", ArrayType(IntegerType(), False), False)])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql("select array[0], array[1], array[2] from temp_table").collect()

sc.stop()

for row in rows:
    print row
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, IntegerType, FloatType, StringType

conf = SparkConf().setAppName("spark_sql_datatype_struct")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([((1, 2.0, "3.0"), )])

schema = StructType([
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql(
    "select struct.first, struct.second, struct.third from temp_table"
).collect()

sc.stop()
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([
    (int("127"), int("32767"), int("2147483647"),
     long("9223372036854775807"), float("1.1"), float("2.2"),
     Decimal("3.3"), "str", bool(0),
     datetime(2015, 9, 22, 9, 39, 45), date(2015, 9, 22),
     [1, 2, 3], {"key": "value"}, (1, 2.0, "3.0"))
])

schema = StructType([
    StructField("byte", ByteType(), False),
    StructField("short", ShortType(), False),
    StructField("int", IntegerType(), False),
    StructField("long", LongType(), False),
    StructField("float", FloatType(), False),
    StructField("double", DoubleType(), False),
    StructField("decimal", DecimalType(), False),
    StructField("string", StringType(), False),
    StructField("boolean", BooleanType(), False),
    StructField("timestamp", TimestampType(), False),
    StructField("date", DateType(), False),
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), StringType(), False), False),
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
def func_string():
    return "abc"

hc.registerFunction("func_string", func_string)
rows = hc.sql("select func_string() from temp_table").collect()


def func_int():
    return 123

hc.registerFunction("func_int", func_int, IntegerType())
rows = hc.sql("select func_int() from temp_table").collect()


def func_array():
    # list or tuple
    return [1, 2, 3]

hc.registerFunction("func_array", func_array, ArrayType(IntegerType()))
rows = hc.sql(
    "select val[0], val[1], val[2] from (select func_array() as val from temp_table) t"
).collect()
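# Hedged addition (not in the original snippet): UDFs can also take column
# arguments. The column name col_str is an assumption about temp_table, and
# the return type falls back to the default StringType.
def func_suffix(s):
    return s + "_suffix"

hc.registerFunction("func_suffix", func_suffix)
rows = hc.sql("select func_suffix(col_str) from temp_table").collect()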
    if isinstance(ds, DataShape) and not isdimension(ds[0]):
        return deoption(ds[0])
    if isinstance(ds, Option):
        return ds.ty
    else:
        return ds


if pyspark:
    if not issubclass(SQLContext, object):
        raise ImportError("This version of SparkSQL uses old-style classes. "
                          "Please update to newer version of Spark")

    types = {datashape.int16: ShortType(),
             datashape.int32: IntegerType(),
             datashape.int64: IntegerType(),
             datashape.float32: FloatType(),
             datashape.float64: DoubleType(),
             datashape.real: DoubleType(),
             datashape.time_: TimestampType(),
             datashape.date_: TimestampType(),
             datashape.datetime_: TimestampType(),
             datashape.bool_: BooleanType(),
             datashape.string: StringType()}

    rev_types = {IntegerType(): datashape.int64,
                 ShortType(): datashape.int32,
                 LongType(): datashape.int64,
                 FloatType(): datashape.float32,
                 DoubleType(): datashape.float64,
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, MapType, StringType, IntegerType

conf = SparkConf().setAppName("spark_sql_datatype_map")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([({"key1": 1, "key2": 2}, )])

schema = StructType([
    StructField("col_map", MapType(StringType(), IntegerType(), False), False)
])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql(
    "select col_map['key1'], col_map['key2'] from temp_table").collect()

sc.stop()

for row in rows:
    print row
# A simple demo for working with SparkSQL and Tweets
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row, IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize(
        [Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")
    # Make a UDF to tell us how long some text is, and apply it to the text column
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql(
        "SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()