Example 1
    @classmethod
    def sqlType(cls):
        # Underlying SQL storage schema for this user-defined type
        return StructType([
            StructField("type", ByteType(), False),
            StructField("size", IntegerType(), True),
            StructField("indices", ArrayType(IntegerType(), False), True),
            StructField("values", ArrayType(DoubleType(), False), True)
        ])
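This fragment is the sqlType classmethod of a PySpark user-defined type (the field layout matches the one used by MLlib's VectorUDT). As a rough orientation only, a minimal UDT skeleton around such a method might look like the sketch below; the class name SparseValueUDT and the serialize/deserialize bodies are illustrative assumptions, not part of the original source.

from pyspark.sql.types import (UserDefinedType, StructType, StructField,
                               ByteType, IntegerType, ArrayType, DoubleType)


class SparseValueUDT(UserDefinedType):
    # Hypothetical UDT storing a sparse-vector-like object in the struct below

    @classmethod
    def sqlType(cls):
        # SQL storage schema, as in the fragment above
        return StructType([
            StructField("type", ByteType(), False),
            StructField("size", IntegerType(), True),
            StructField("indices", ArrayType(IntegerType(), False), True),
            StructField("values", ArrayType(DoubleType(), False), True)
        ])

    @classmethod
    def module(cls):
        # Python module where this class can be found when deserializing
        return "__main__"

    def serialize(self, obj):
        # Illustrative only: pack the object into the struct fields
        return (obj.type, obj.size, list(obj.indices), list(obj.values))

    def deserialize(self, datum):
        # Illustrative only: return the stored struct as-is
        return datum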
Example 2
def table2():
    #####################################################
    # Import/Create Second Schema
    #
    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", "England", 120),
                                  ("Jenny", "Spain", 45),
                                  ("Sarah", "Japan", 55)])
    # Schema with three fields - Person_First_Name, Person_Location_Country
    # and Person_Avg_Spend
    schema = StructType([
        StructField("Person_First_Name", StringType(), False),
        StructField("Person_Location_Country", StringType(), False),
        StructField("Person_Avg_Spend", IntegerType(), False)
    ])
    # Create a SchemaRDD by applying the schema to the RDD and print the schema
    another_schemardd = sqlCtx.applySchema(another_rdd, schema)
    # Print Schema on Screen
    print('Print the Second Schema - People_Details\n')
    another_schemardd.printSchema()
    #####################################################
    # Save Data Above as a Parquet File
    # SchemaRDDs can be saved as Parquet files, maintaining the schema
    # information.
    another_schemardd.saveAsParquetFile(
        "/home/dan/Desktop/People_Details.parquet")
    # Register this SchemaRDD as a table.
    return another_schemardd.registerAsTable("People_Details")
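As a follow-up sketch (assuming the same sqlCtx used by table2(); the variable names and the table name People_Details_From_Parquet are illustrative), the saved Parquet file can be loaded back with its schema intact and queried:

people_details = sqlCtx.parquetFile(
    "/home/dan/Desktop/People_Details.parquet")
people_details.printSchema()
people_details.registerAsTable("People_Details_From_Parquet")
big_spenders = sqlCtx.sql(
    "SELECT Person_First_Name FROM People_Details_From_Parquet "
    "WHERE Person_Avg_Spend > 50")
print(big_spenders.collect())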
Example 3
    sc = SparkContext(appName="PythonSQL")
    sqlContext = SQLContext(sc)

    # RDD is created from a list of rows
    some_rdd = sc.parallelize([Row(name="John", age=19),
                              Row(name="Smith", age=23),
                              Row(name="Sarah", age=18)])
    # Infer schema from the first row, create a SchemaRDD and print the schema
    some_schemardd = sqlContext.inferSchema(some_rdd)
    some_schemardd.printSchema()

    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
    # Schema with two fields - person_name and person_age
    schema = StructType([StructField("person_name", StringType(), False),
                        StructField("person_age", IntegerType(), False)])
    # Create a SchemaRDD by applying the schema to the RDD and print the schema
    another_schemardd = sqlContext.applySchema(another_rdd, schema)
    another_schemardd.printSchema()
    # root
    #  |-- person_name: string (nullable = false)
    #  |-- person_age: integer (nullable = false)

    # A JSON dataset is pointed to by path.
    # The path can be either a single text file or a directory storing text files.
    path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
    # Create a SchemaRDD from the file(s) pointed to by path and print the
    # schema inferred from the JSON records (name and age in people.json)
    people = sqlContext.jsonFile(path)
    people.printSchema()
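inferSchema, applySchema and jsonFile belong to the Spark 1.x SchemaRDD API. For orientation, a roughly equivalent sketch against the current DataFrame API (assuming Spark 2.x or later; the SparkSession here replaces the SQLContext above) would be:

from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.appName("PythonSQL").getOrCreate()

# Schema inferred from Row objects
some_df = spark.createDataFrame([Row(name="John", age=19),
                                 Row(name="Smith", age=23),
                                 Row(name="Sarah", age=18)])
some_df.printSchema()

# Explicit schema applied to plain tuples
schema = StructType([StructField("person_name", StringType(), False),
                     StructField("person_age", IntegerType(), False)])
another_df = spark.createDataFrame([("John", 19), ("Smith", 23), ("Sarah", 18)],
                                   schema)
another_df.printSchema()

# JSON files go through the DataFrameReader (reusing the path variable above)
people = spark.read.json(path)
people.printSchema()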
Example 4
    LongType: datashape.int64,
    FloatType: datashape.float32,
    DoubleType: datashape.float64,
    StringType: datashape.string,
    BinaryType: datashape.bytes_,
    BooleanType: datashape.bool_,
    TimestampType: datashape.datetime_,
    DateType: datashape.date_,
    # sql.ArrayType: ?,
    # sql.MapType: ?,
    # sql.StructType: ?
}

dshape_to_sparksql = {
    datashape.int16: ShortType(),
    datashape.int32: IntegerType(),
    datashape.int64: LongType(),
    datashape.float32: FloatType(),
    datashape.float64: DoubleType(),
    datashape.real: DoubleType(),
    datashape.time_: TimestampType(),
    datashape.date_: DateType(),
    datashape.datetime_: TimestampType(),
    datashape.bool_: BooleanType(),
    datashape.string: StringType()
}

ooc_types |= set([SparkDataFrame, SchemaRDD])

SQLContext = memoize(SQLContext)
HiveContext = memoize(HiveContext)
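A small usage sketch of the dshape_to_sparksql table above (the field names and the fields list are illustrative, and the datashape and Spark type imports are assumed to be in scope as in the surrounding module): the lookup table can drive construction of a SparkSQL schema from datashape measures.

from pyspark.sql.types import StructType, StructField

# Hypothetical (column name, datashape measure) pairs for a record
fields = [("id", datashape.int64),
          ("score", datashape.float64),
          ("label", datashape.string)]

schema = StructType([StructField(name, dshape_to_sparksql[ds], True)
                     for name, ds in fields])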
Example 5
    if isinstance(ds, DataShape) and not isdimension(ds[0]):
        return deoption(ds[0])
    if isinstance(ds, Option):
        return ds.ty
    else:
        return ds


if pyspark:
    if not issubclass(SQLContext, object):
        raise ImportError("This version of SparkSQL uses old-style classes. "
                          "Please update to newer version of Spark")

    types = {
        datashape.int16: ShortType(),
        datashape.int32: IntegerType(),
        datashape.int64: LongType(),
        datashape.float32: FloatType(),
        datashape.float64: DoubleType(),
        datashape.real: DoubleType(),
        datashape.time_: TimestampType(),
        datashape.date_: TimestampType(),
        datashape.datetime_: TimestampType(),
        datashape.bool_: BooleanType(),
        datashape.string: StringType()
    }

    rev_types = {
        IntegerType(): datashape.int64,
        ShortType(): datashape.int32,
        LongType(): datashape.int64,
Example 6
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, ArrayType, IntegerType

conf = SparkConf().setAppName("spark_sql_datatype_array")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([([1, 2, 3], )])

schema = StructType(
    [StructField("array", ArrayType(IntegerType(), False), False)])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql("select array[0], array[1], array[2] from temp_table").collect()

sc.stop()

for row in rows:
    print row
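The element access written as a SQL string above can also be expressed with the DataFrame column API in newer Spark releases; this is a sketch assuming Spark 2.x or later with a SparkSession, not the HiveContext/applySchema API used in the example.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, ArrayType, IntegerType

spark = SparkSession.builder.appName("spark_sql_datatype_array").getOrCreate()

schema = StructType(
    [StructField("array", ArrayType(IntegerType(), False), False)])
df = spark.createDataFrame([([1, 2, 3],)], schema)

# Indexing a column (Column.getItem) addresses individual array elements
rows = df.select(df["array"][0], df["array"][1], df["array"][2]).collect()
for row in rows:
    print(row)

spark.stop()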
Example 7
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, IntegerType, FloatType, StringType

conf = SparkConf().setAppName("spark_sql_datatype_struct")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([((1, 2.0, "3.0"), )])

schema = StructType([
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select struct.first, struct.second, struct.third from temp_table"
).collect()

sc.stop()
Example 8
sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([
    (int("127"), int("32767"), int("2147483647"), long("9223372036854775807"),
     float("1.1"), float("2.2"), Decimal("3.3"), "str", bool(0),
     datetime(2015, 9, 22, 9, 39, 45), date(2015, 9, 22), [1, 2, 3], {
         "key": "value"
     }, (1, 2.0, "3.0"))
])

schema = StructType([
    StructField("byte", ByteType(), False),
    StructField("short", ShortType(), False),
    StructField("int", IntegerType(), False),
    StructField("long", LongType(), False),
    StructField("float", FloatType(), False),
    StructField("double", DoubleType(), False),
    StructField("decimal", DecimalType(), False),
    StructField("string", StringType(), False),
    StructField("boolean", BooleanType(), False),
    StructField("timestamp", TimestampType(), False),
    StructField("date", DateType(), False),
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), StringType(), False), False),
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
Example 9

def func_string():
    return "abc"


hc.registerFunction("func_string", func_string)

rows = hc.sql("select func_string() from temp_table").collect()


def func_int():
    return 123


hc.registerFunction("func_int", func_int, IntegerType())

rows = hc.sql("select func_int() from temp_table").collect()


def func_array():
    # list or tuple
    return [1, 2, 3]


hc.registerFunction("func_array", func_array, ArrayType(IntegerType()))

rows = hc.sql(
    "select val[0], val[1], val[2] from (select func_array() as val from temp_table) t"
).collect()
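registerFunction is the Spark 1.x HiveContext API for SQL-callable UDFs. A hedged sketch of the same idea in current PySpark, assuming a SparkSession named spark and an existing temp view temp_table as in the surrounding example:

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, ArrayType

# SQL-callable registration (modern counterpart of hc.registerFunction)
spark.udf.register("func_int", lambda: 123, IntegerType())
spark.udf.register("func_array", lambda: [1, 2, 3], ArrayType(IntegerType()))

rows = spark.sql(
    "select val[0], val[1], val[2] "
    "from (select func_array() as val from temp_table) t").collect()

# The same logic can also be used directly as a DataFrame column expression
func_int_col = udf(lambda: 123, IntegerType())
df = spark.table("temp_table").select(func_int_col().alias("val"))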
Example 10
    if isinstance(ds, DataShape) and not isdimension(ds[0]):
        return deoption(ds[0])
    if isinstance(ds, Option):
        return ds.ty
    else:
        return ds


if pyspark:
    if not issubclass(SQLContext, object):
        raise ImportError("This version of SparkSQL uses old-style classes. "
                "Please update to newer version of Spark")


    types = {datashape.int16: ShortType(),
             datashape.int32: IntegerType(),
             datashape.int64: LongType(),
             datashape.float32: FloatType(),
             datashape.float64: DoubleType(),
             datashape.real: DoubleType(),
             datashape.time_: TimestampType(),
             datashape.date_: TimestampType(),
             datashape.datetime_: TimestampType(),
             datashape.bool_: BooleanType(),
             datashape.string: StringType()}

    rev_types = {IntegerType(): datashape.int64,
                 ShortType(): datashape.int32,
                 LongType(): datashape.int64,
                 FloatType(): datashape.float32,
                 DoubleType(): datashape.float64,
Example 11
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, MapType, StringType, IntegerType

conf = SparkConf().setAppName("spark_sql_datatype_map")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([({"key1": 1, "key2": 2}, )])

schema = StructType([
    StructField("col_map", MapType(StringType(), IntegerType(), False), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select col_map['key1'], col_map['key2'] from temp_table").collect()

sc.stop()

for row in rows:
    print row
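Map lookups can also be done without a SQL string in newer PySpark versions; the sketch below assumes Spark 2.x or later with a SparkSession and uses Column.getItem plus explode (from pyspark.sql.functions) on the same data.

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.types import StructType, StructField, MapType, StringType, IntegerType

spark = SparkSession.builder.appName("spark_sql_datatype_map").getOrCreate()

schema = StructType([
    StructField("col_map", MapType(StringType(), IntegerType(), False), False)
])
df = spark.createDataFrame([({"key1": 1, "key2": 2},)], schema)

# getItem mirrors col_map['key1'] / col_map['key2'] in the SQL above
df.select(df["col_map"].getItem("key1"),
          df["col_map"].getItem("key2")).show()

# explode turns each map entry into a separate (key, value) row
df.select(explode(df["col_map"])).show()

spark.stop()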
Example 12
# A simple demo for working with SparkSQL and Tweets
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row, IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize(
        [Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql(
        "SELECT strLenPython('text') FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()