Example #1
 def sqlType(cls):
     return StructType([
         StructField("type", ByteType(), False),
         StructField("size", IntegerType(), True),
         StructField("indices", ArrayType(IntegerType(), False), True),
         StructField("values", ArrayType(DoubleType(), False), True)
     ])
 def run(self):
     sc = SparkContext("local", "gender")
     sqlContext = SQLContext(sc)
     #StringType =(str, unicode)
     _out = self.output().open('w')
     #lines = sc.textFile("myUser.csv")
     #fobj = self.input().open("r")
     #lines = sc.textFile(fobj.name)
     print(type(self.required_tasks['insert_source'].output()))
     print(self.required_tasks['insert_source'])
     #print(self.input()['insert_source'].input())
     lines = sc.textFile("myUser.csv")
     parts = lines.map(lambda l: l.split(","))
     users = parts.map(lambda p: (p[0], p[1], p[2], p[3], p[4], p[5], p[6],
                                  p[7], p[8], p[9], p[10], p[11], p[12], p[13],
                                  p[14], p[15], p[16], p[17], p[18], p[19]))
     schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId"
     print(schemaString)
     _out.write(schemaString)
     fields = [
         StructField(field_name, StringType(), True)
         for field_name in schemaString.split()
     ]
     schema = StructType(fields)
     #schemaUser = sqlContext.createDataFrame(users, schema)
     schemaUser = sqlContext.applySchema(users, schema)
     schemaUser.registerTempTable("users")
     results = sqlContext.sql("SELECT gender FROM users")
     genders = results.map(lambda p: (p, 1))
     counts = genders.reduceByKey(
         lambda a, b: a + b
     )  #.map(lambda t : ("Gender " + t(0) + " No " + t(1))).collect()
     for name in counts.collect():
         _out.write(str(name))
     _out.close()
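The commented-out createDataFrame call above is the newer replacement for applySchema; a minimal sketch of the same gender count under that assumption (Spark 1.3 or later), reusing the users RDD and schema built in run():

# Equivalent schema application on Spark 1.3+, where applySchema is deprecated
schemaUser = sqlContext.createDataFrame(users, schema)
schemaUser.registerTempTable("users")
gender_counts = (sqlContext.sql("SELECT gender FROM users")
                 .map(lambda row: (row.gender, 1))
                 .reduceByKey(lambda a, b: a + b))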
Example #3
def table2():
    #####################################################
    # Import/Create Second Schema
    #
    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", "England", 120),
                                  ("Jenny", "Spain", 45),
                                  ("Sarah", "Japan", 55)])
    # Schema with three fields - Person_First_Name, Person_Location_Country and Person_Avg_Spend
    schema = StructType([
        StructField("Person_First_Name", StringType(), False),
        StructField("Person_Location_Country", StringType(), False),
        StructField("Person_Avg_Spend", IntegerType(), False)
    ])
    # Create a SchemaRDD by applying the schema to the RDD and print the schema
    another_schemardd = sqlCtx.applySchema(another_rdd, schema)
    # Print Schema on Screen
    print('Print the Second Schema - People_Details\n')
    another_schemardd.printSchema()
    #####################################################
    # Save Data Above as a Parquet File
    # SchemaRDDs can be saved as Parquet files, maintaining the schema
    # information.
    another_schemardd.saveAsParquetFile(
        "/home/dan/Desktop/People_Details.parquet")
    # Register this SchemaRDD as a table.
    return another_schemardd.registerAsTable("People_Details")
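Since table2() writes a Parquet file, here is a companion sketch (not part of the original) showing how that file could be read back and queried with the same Spark 1.x sqlCtx API; the table name and query are illustrative:

def table2_from_parquet():
    # Load the Parquet file written above; the schema is stored with the data
    people_details = sqlCtx.parquetFile("/home/dan/Desktop/People_Details.parquet")
    people_details.registerAsTable("People_Details_Parquet")
    # Illustrative query against the re-registered table
    return sqlCtx.sql("SELECT Person_First_Name, Person_Avg_Spend "
                      "FROM People_Details_Parquet WHERE Person_Avg_Spend > 50")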
Example #4
def dshape_to_schema(ds):
    """Convert datashape to SparkSQL type system.

    Examples
    --------
    >>> print(dshape_to_schema('int32'))  # doctest: +SKIP
    IntegerType
    >>> print(dshape_to_schema('5 * int32'))  # doctest: +SKIP
    ArrayType(IntegerType,false)
    >>> print(dshape_to_schema('5 * ?int32'))  # doctest: +SKIP
    ArrayType(IntegerType,true)
    >>> print(dshape_to_schema('{name: string, amount: int32}'))  # doctest: +SKIP
    StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,false)))
    >>> print(dshape_to_schema('10 * {name: string, amount: ?int32}'))  # doctest: +SKIP
    ArrayType(StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,true))),false)
    """
    if isinstance(ds, str):
        return dshape_to_schema(dshape(ds))
    if isinstance(ds, Tuple):
        raise TypeError('Please provide a Record dshape for these column '
                        'types: %s' % (ds.dshapes, ))
    if isinstance(ds, Record):
        return StructType([
            StructField(name, dshape_to_schema(deoption(typ)),
                        isinstance(typ, datashape.Option))
            for name, typ in ds.fields
        ])
    if isinstance(ds, DataShape):
        if isdimension(ds[0]):
            elem = ds.subshape[0]
            if isinstance(elem, DataShape) and len(elem) == 1:
                elem = elem[0]
            return ArrayType(dshape_to_schema(deoption(elem)),
                             isinstance(elem, Option))
        else:
            return dshape_to_schema(ds[0])
    if ds in dshape_to_sparksql:
        return dshape_to_sparksql[ds]
    raise NotImplementedError()
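A small usage sketch of dshape_to_schema, assuming the datashape package and the dshape_to_sparksql mapping referenced above are importable:

if __name__ == '__main__':
    # Records become StructType; Option (?) types become nullable fields
    print(dshape_to_schema('{name: string, amount: ?int32}'))
    # A dimensioned dshape becomes an ArrayType of the element type
    print(dshape_to_schema('10 * int32'))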
sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([
    "row1_col1 row1_col2 row1_col3", "row2_col1 row2_col2 row3_col3",
    "row3_col1 row3_col2 row3_col3"
])

columns = source.map(lambda line: line.split(" ")).filter(
    lambda columns: columns and len(columns) == 3)

rows = columns.map(lambda columns: (columns[0], columns[1], columns[2]))

schema = StructType([
    StructField("col1", StringType(), False),
    StructField("col2", StringType(), False),
    StructField("col3", StringType(), False)
])

table = hc.applySchema(rows, schema)

table.registerAsTable("temp_mytable")

rdd = hc.sql("select count(*) from temp_mytable where col1 = '' group by col2")

print rdd.toDebugString()

sc.stop()
Example #6
sc = SparkContext(conf=conf)

hc = HiveContext(sc)

datas = ["1 a 28", "2 b 29", "3 c 30"]

source = sc.parallelize(datas)

splits = source.map(lambda line: line.split(" "))

rows = splits.map(lambda words: (int(words[0]), words[1], int(words[2])))

fields = []

fields.append(StructField("id", IntegerType(). True))
fields.append(StructField("name", StringType(). True))
fields.append(StructField("age", IntegerType(). True))

schema = StructType(fields)

people = hc.applySchema(rows, schema)

people.registerTempTable("people")

results = hc.sql("select * from people where age>28 and age<30").collect()

sc.stop()

for result in results:
    print("id: %s, name: %s, age: %s" % (result.id, result.name, result.age))
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, ArrayType, MapType, IntegerType, StringType, FloatType

conf = SparkConf().setAppName("spark_sql_datatype_complex")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([([1, 2, 3], {"key1": 1, "key2": 2}, (1, 2.0, "3.0"))])

schema = StructType([
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), IntegerType(), False), False),
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select array[0], array[1], array[2], col_map['key1'], col_map['key2'], struct.first, struct.second, struct.third from temp_table"
).collect()

sc.stop()
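The collected rows are never printed in this snippet; a short follow-up loop in the style of the neighbouring examples (added for illustration):

for row in rows:
    print row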
Example #8
import os

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row, StructField, StructType, StringType, IntegerType

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSQL")
    sqlContext = SQLContext(sc)

    # RDD is created from a list of rows
    some_rdd = sc.parallelize([Row(name="John", age=19),
                              Row(name="Smith", age=23),
                              Row(name="Sarah", age=18)])
    # Infer schema from the first row, create a SchemaRDD and print the schema
    some_schemardd = sqlContext.inferSchema(some_rdd)
    some_schemardd.printSchema()
    # root
    #  |-- age: integer (nullable = true)
    #  |-- name: string (nullable = true)

    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
    # Schema with two fields - person_name and person_age
    schema = StructType([StructField("person_name", StringType(), False),
                        StructField("person_age", IntegerType(), False)])
    # Create a SchemaRDD by applying the schema to the RDD and print the schema
    another_schemardd = sqlContext.applySchema(another_rdd, schema)
    another_schemardd.printSchema()
    # root
    #  |-- person_name: string (nullable = false)
    #  |-- person_age: integer (nullable = false)

    # A JSON dataset is pointed to by path.
    # The path can be either a single text file or a directory storing text files.
    path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
    # Create a SchemaRDD from the file(s) pointed to by path;
    # its schema is inferred from the JSON records
    people = sqlContext.jsonFile(path)
    people.printSchema()
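    # Added sketch, not in the original snippet: register the JSON-backed
    # SchemaRDD and query it through the SQLContext; the table name and the
    # query below are illustrative.
    people.registerTempTable("people")
    teenagers = sqlContext.sql(
        "SELECT name FROM people WHERE age >= 13 AND age <= 19")
    for teen in teenagers.collect():
        print(teen.name)

    sc.stop()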
Example #9
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from datetime import datetime, date
from pyspark.sql import StructType, StructField, DateType, TimestampType

conf = SparkConf().setAppName("spark_sql_datatype_date_or_datetime")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([(date(2015, 9, 22), datetime(2015, 9, 22, 9, 39,
                                                      45))])

schema = StructType([
    StructField("date", DateType(), False),
    StructField("timestamp", TimestampType(), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql("select date, timestamp from temp_table").collect()

sc.stop()

for row in rows:
    print row
Example #10
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, StringType, BooleanType, BinaryType, NullType

conf = SparkConf().setAppName("spark_sql_datatype_str_bool_binary_none")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([("str", False, bytearray(range(0, 256)), None)])

schema = StructType([
    StructField("str", StringType(), False),
    StructField("bool", BooleanType(), False),
    StructField("bytes", BinaryType(), False),
    StructField("none", NullType())
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql("select str, bool, bytes, none from temp_table").collect()

sc.stop()

for row in rows:
    print row
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, ArrayType, IntegerType

conf = SparkConf().setAppName("spark_sql_datatype_array")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([([1, 2, 3], )])

schema = StructType(
    [StructField("array", ArrayType(IntegerType(), False), False)])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql("select array[0], array[1], array[2] from temp_table").collect()

sc.stop()

for row in rows:
    print row
Example #12
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, IntegerType, FloatType, StringType

conf = SparkConf().setAppName("spark_sql_datatype_struct")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([((1, 2.0, "3.0"), )])

schema = StructType([
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select struct.first, struct.second, struct.third from temp_table"
).collect()

sc.stop()
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, ByteType, ShortType, IntegerType, \
    LongType, FloatType, DoubleType, DecimalType, StringType, BooleanType, \
    TimestampType, DateType, ArrayType, MapType
from datetime import datetime, date
from decimal import Decimal

# NOTE: the head of this snippet was cut off in the listing; the imports, the
# context setup and the leading literal values below are reconstructed so that
# the row matches the schema that follows. The app name is assumed.
conf = SparkConf().setAppName("spark_sql_datatype")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([
    (1, 2, 3, 4, 5.0, 6.0, Decimal("7.0"), "str", True,
     datetime(2015, 9, 22, 9, 39, 45), date(2015, 9, 22), [1, 2, 3], {
         "key": "value"
     }, (1, 2.0, "3.0"))
])

schema = StructType([
    StructField("byte", ByteType(), False),
    StructField("short", ShortType(), False),
    StructField("int", IntegerType(), False),
    StructField("long", LongType(), False),
    StructField("float", FloatType(), False),
    StructField("double", DoubleType(), False),
    StructField("decimal", DecimalType(), False),
    StructField("string", StringType(), False),
    StructField("boolean", BooleanType(), False),
    StructField("timestamp", TimestampType(), False),
    StructField("date", DateType(), False),
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), StringType(), False), False),
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")
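As in the other datatype snippets, a query and print loop could complete this example; the following is an illustrative addition mirroring the queries used above:

rows = hc.sql(
    "select array[0], array[1], array[2], col_map['key'], struct.first, struct.second, struct.third from temp_table"
).collect()

sc.stop()

for row in rows:
    print row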
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, MapType, StringType, IntegerType

conf = SparkConf().setAppName("spark_sql_datatype_map")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([({"key1": 1, "key2": 2}, )])

schema = StructType([
    StructField("col_map", MapType(StringType(), IntegerType(), False), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select col_map['key1'], col_map['key2'] from temp_table").collect()

sc.stop()

for row in rows:
    print row