Example #1
def sqlType(cls):
    return StructType([
        StructField("type", ByteType(), False),
        StructField("size", IntegerType(), True),
        StructField("indices", ArrayType(IntegerType(), False), True),
        StructField("values", ArrayType(DoubleType(), False), True)
    ])
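This sqlType() is the SQL representation of a user-defined type; below is a minimal sketch of how it fits into a UserDefinedType subclass, assuming Spark's pyspark.sql.types API. The class name is illustrative, and a real UDT must also implement serialize(), deserialize(), and module().

from pyspark.sql.types import (UserDefinedType, StructType, StructField,
                               ByteType, IntegerType, ArrayType, DoubleType)


class ExampleVectorUDT(UserDefinedType):
    """Illustrative UDT whose SQL representation is the struct above."""

    @classmethod
    def sqlType(cls):
        # Dense/sparse flag, vector size, plus index and value arrays.
        return StructType([
            StructField("type", ByteType(), False),
            StructField("size", IntegerType(), True),
            StructField("indices", ArrayType(IntegerType(), False), True),
            StructField("values", ArrayType(DoubleType(), False), True)
        ])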
Example #2
def table2():
    #####################################################
    # Import/Create Second Schema
    #
    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", "England", 120),
                                  ("Jenny", "Spain", 45),
                                  ("Sarah", "Japan", 55)])
    # Schema with three fields - Person_First_Name, Person_Location_Country and Person_Avg_Spend
    schema = StructType([
        StructField("Person_First_Name", StringType(), False),
        StructField("Person_Location_Country", StringType(), False),
        StructField("Person_Avg_Spend", IntegerType(), False)
    ])
    # Create a SchemaRDD by applying the schema to the RDD and print the schema
    another_schemardd = sqlCtx.applySchema(another_rdd, schema)
    # Print Schema on Screen
    print('Print the Second Schema - People_Details\n')
    another_schemardd.printSchema()
    #####################################################
    # Save Data Above as a Parquet File
    # SchemaRDDs can be saved as Parquet files, maintaining the schema
    # information.
    another_schemardd.saveAsParquetFile(
        "/home/dan/Desktop/People_Details.parquet")
    # Register this SchemaRDD as a table.
    return another_schemardd.registerAsTable("People_Details")
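As a follow-up sketch (not part of the original), the saved Parquet file can be read back and queried with the same Spark 1.x SQLContext API; sqlCtx and the path above are assumed.

# Read the Parquet file written by table2() back into a SchemaRDD and query it.
people_details = sqlCtx.parquetFile("/home/dan/Desktop/People_Details.parquet")
people_details.registerTempTable("People_Details_Parquet")
big_spenders = sqlCtx.sql(
    "SELECT Person_First_Name FROM People_Details_Parquet "
    "WHERE Person_Avg_Spend > 50")
for row in big_spenders.collect():
    print(row)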
 def run(self):
     sc = SparkContext("local", "gender")
     sqlContext = SQLContext(sc)
     #StringType =(str, unicode)
     _out = self.output().open('w')
     #lines = sc.textFile("myUser.csv")
     #fobj = self.input().open("r")
     #lines = sc.textFile(fobj.name)
     print(type(self.required_tasks['insert_source'].output()))
     print(self.required_tasks['insert_source'])
     #print(self.input()['insert_source'].input())
     lines = sc.textFile("myUser.csv")
     parts = lines.map(lambda l: l.split(","))
     users = parts.map(
         lambda p: (p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8],
                    p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[16],
                    p[17], p[18], p[19]))
     schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId"
     print(schemaString)
     _out.write(schemaString)
     fields = [
         StructField(field_name, StringType(), True)
         for field_name in schemaString.split()
     ]
     schema = StructType(fields)
     #schemaUser = sqlContext.createDataFrame(users, schema)
     schemaUser = sqlContext.applySchema(users, schema)
     schemaUser.registerTempTable("users")
     results = sqlContext.sql("SELECT gender FROM users")
     genders = results.map(lambda p: (p, 1))
     counts = genders.reduceByKey(
         lambda a, b: a + b
     )  #.map(lambda t : ("Gender " + t(0) + " No " + t(1))).collect()
     for name in counts.collect():
         _out.write(str(name))
     _out.close()
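An alternative sketch, assuming the "users" temp table registered above: the same gender count can be expressed as a single SQL aggregation instead of map/reduceByKey.

# Equivalent aggregation in SQL (sqlContext and the "users" table as above).
gender_counts = sqlContext.sql(
    "SELECT gender, COUNT(*) AS cnt FROM users GROUP BY gender")
for row in gender_counts.collect():
    print("%s %d" % (row.gender, row.cnt))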
Example #4
def get_channel_mapping(spark: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    """
    Creates the channel mapping dataframe from the hard-coded values

    Parameters
    ----------
    spark : pyspark.sql.SparkSession
        Spark session used to create the dataframe

    Returns
    -------
    pyspark.sql.DataFrame
        PySpark dataframe with channel mapping data
    """

    channel_mapping = spark.createDataFrame(
        [
            ("01", "Distribution Channel 01"),
            ("10", "Other"),
            ("11", "DSD Bis Intercompany"),
            ("12", "DSD Pizza Intercomp"),
            ("20", """Warehouse/Exports"""),
            ("30", "Foodservice"),
            ("40", "DSD Pizza"),
            ("45", "DSD"),
            ("50", "KFI"),
            ("55", "Plant Ingredient"),
            ("60", "Imports"),
            ("65", "Bulk FS - Specialty"),
        ],
        StructType([
            StructField("bic_zdistr_ch", StringType(), True),
            StructField("channel_desc", StringType(), True),
        ]),  # add your columns label here
    )

    return channel_mapping
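A usage sketch (not from the original), assuming an active SparkSession named spark:

channel_mapping_df = get_channel_mapping(spark)
channel_mapping_df.show(truncate=False)  # lists the twelve hard-coded channels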
Example #5
def dshape_to_schema(ds):
    """Convert datashape to SparkSQL type system.

    Examples
    --------
    >>> print(dshape_to_schema('int32'))  # doctest: +SKIP
    IntegerType
    >>> print(dshape_to_schema('5 * int32'))  # doctest: +SKIP
    ArrayType(IntegerType,false)
    >>> print(dshape_to_schema('5 * ?int32'))  # doctest: +SKIP
    ArrayType(IntegerType,true)
    >>> print(dshape_to_schema('{name: string, amount: int32}'))  # doctest: +SKIP
    StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,false)))
    >>> print(dshape_to_schema('10 * {name: string, amount: ?int32}'))  # doctest: +SKIP
    ArrayType(StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,true))),false)
    """
    if isinstance(ds, str):
        return dshape_to_schema(dshape(ds))
    if isinstance(ds, Tuple):
        raise TypeError('Please provide a Record dshape for these column '
                        'types: %s' % (ds.dshapes, ))
    if isinstance(ds, Record):
        return StructType([
            StructField(name, dshape_to_schema(deoption(typ)),
                        isinstance(typ, datashape.Option))
            for name, typ in ds.fields
        ])
    if isinstance(ds, DataShape):
        if isdimension(ds[0]):
            elem = ds.subshape[0]
            if isinstance(elem, DataShape) and len(elem) == 1:
                elem = elem[0]
            return ArrayType(dshape_to_schema(deoption(elem)),
                             isinstance(elem, Option))
        else:
            return dshape_to_schema(ds[0])
    if ds in dshape_to_sparksql:
        return dshape_to_sparksql[ds]
    raise NotImplementedError()
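The final branch falls back to a lookup table named dshape_to_sparksql defined alongside this function. The sketch below only illustrates what such a mapping might contain; it is not the actual table from the source module.

import datashape
from pyspark.sql.types import (IntegerType, LongType, DoubleType, StringType,
                               BooleanType, DateType, TimestampType)

# Illustrative simple-type mapping; the real one may cover more types.
dshape_to_sparksql = {
    datashape.int32: IntegerType(),
    datashape.int64: LongType(),
    datashape.float64: DoubleType(),
    datashape.string: StringType(),
    datashape.bool_: BooleanType(),
    datashape.date_: DateType(),
    datashape.datetime_: TimestampType(),
}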
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, ArrayType, IntegerType

conf = SparkConf().setAppName("spark_sql_datatype_array")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([([1, 2, 3], )])

schema = StructType(
    [StructField("array", ArrayType(IntegerType(), False), False)])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql("select array[0], array[1], array[2] from temp_table").collect()

sc.stop()

for row in rows:
    print row
# The SparkConf/import preamble below is reconstructed; the original snippet
# starts at SparkContext(conf=conf), and the app name is illustrative.
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, StringType

conf = SparkConf().setAppName("spark_sql_filter_example")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([
    "row1_col1 row1_col2 row1_col3", "row2_col1 row2_col2 row3_col3",
    "row3_col1 row3_col2 row3_col3"
])

columns = source.map(lambda line: line.split(" ")).filter(
    lambda columns: columns and len(columns) == 3)

rows = columns.map(lambda columns: (columns[0], columns[1], columns[2]))

schema = StructType([
    StructField("col1", StringType(), False),
    StructField("col2", StringType(), False),
    StructField("col3", StringType(), False)
])

table = hc.applySchema(rows, schema)

table.registerAsTable("temp_mytable")

rdd = hc.sql("select count(*) from temp_mytable where col1 = '' group by col2")

print rdd.toDebugString()

sc.stop()
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import (StructType, StructField, ArrayType, MapType,
                         IntegerType, StringType, FloatType)

conf = SparkConf().setAppName("spark_sql_datatype_complex")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([([1, 2, 3], {"key1": 1, "key2": 2}, (1, 2.0, "3.0"))])

schema = StructType([
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), IntegerType(), False), False),
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
Example #9
# Imports needed by this excerpt (Spark 1.x SQL API)
import os

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row, StructType, StructField, StringType, IntegerType

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSQL")
    sqlContext = SQLContext(sc)

    # RDD is created from a list of rows
    some_rdd = sc.parallelize([Row(name="John", age=19),
                              Row(name="Smith", age=23),
                              Row(name="Sarah", age=18)])
    # Infer schema from the first row, create a SchemaRDD and print the schema
    some_schemardd = sqlContext.inferSchema(some_rdd)
    some_schemardd.printSchema()

    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
    # Schema with two fields - person_name and person_age
    schema = StructType([StructField("person_name", StringType(), False),
                        StructField("person_age", IntegerType(), False)])
    # Create a SchemaRDD by applying the schema to the RDD and print the schema
    another_schemardd = sqlContext.applySchema(another_rdd, schema)
    another_schemardd.printSchema()
    # root
    #  |-- person_name: string (nullable = false)
    #  |-- person_age: integer (nullable = false)

    # A JSON dataset is pointed to by path.
    # The path can be either a single text file or a directory storing text files.
    path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
    # Create a SchemaRDD from the file(s) pointed to by path
    people = sqlContext.jsonFile(path)
    # printSchema() on people would show the schema inferred from the JSON
    # records (a numeric "age" field and a string "name" field)
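A hedged continuation sketch: the JSON-derived SchemaRDD can be registered and queried like the others (the table name and query are illustrative).

    people.registerTempTable("people")
    teenagers = sqlContext.sql(
        "SELECT name FROM people WHERE age >= 13 AND age <= 19")
    for each in teenagers.collect():
        print(each)

    sc.stop()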
Example #10
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from datetime import datetime, date
from pyspark.sql import StructType, StructField, DateType, TimestampType

conf = SparkConf().setAppName("spark_sql_datatype_date_or_datetime")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([(date(2015, 9, 22),
                          datetime(2015, 9, 22, 9, 39, 45))])

schema = StructType([
    StructField("date", DateType(), False),
    StructField("timestamp", TimestampType(), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql("select date, timestamp from temp_table").collect()

sc.stop()

for row in rows:
    print row
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, LongType

conf = SparkConf().setAppName("spark_sql_datatype_extend2")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([(85070591730234615847396907784232501249,
                          85070591730234615847396907784232501249)])

schema = StructType([
    StructField("col1", LongType(), False),
    StructField("col2", LongType(), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql("select * from temp_table").collect()

sc.stop()

for row in rows:
    print row
"""
# java.lang.ClassCastException: java.math.BigInteger cannot be cast to
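The truncated comment above records the failure: these values do not fit a 64-bit LongType column. A hedged workaround sketch using DecimalType instead (run before sc.stop(); behaviour may vary across Spark 1.x versions).

from decimal import Decimal
from pyspark.sql import DecimalType

# Arbitrary-precision values go into a decimal column rather than a long one.
wide_source = sc.parallelize(
    [(Decimal(85070591730234615847396907784232501249), )])
wide_schema = StructType([StructField("wide", DecimalType(), False)])
wide_table = hc.applySchema(wide_source, wide_schema)
wide_table.registerAsTable("temp_wide_table")
print hc.sql("select wide from temp_wide_table").collect()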
Example #12
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, StringType, BooleanType, BinaryType, NullType

conf = SparkConf().setAppName("spark_sql_datatype_str_bool_binary_none")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([("str", False, bytearray(range(0, 256)), None)])

schema = StructType([
    StructField("str", StringType(), False),
    StructField("bool", BooleanType(), False),
    StructField("bytes", BinaryType(), False),
    StructField("none", NullType())
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql("select str, bool, bytes, none from temp_table").collect()

sc.stop()

for row in rows:
    print row
Example #13
def lineFilter(columns):
    # The excerpt begins mid-script: sc, hc, source, and the successLines /
    # errorLines accumulators are defined earlier in the original.
    if columns and len(columns) == 3:
        successLines.add(1)

        return True
    else:
        errorLines.add(1)

        return False


columns = source.map(lambda line: line.split(" ")).filter(lineFilter)

rows = columns.map(lambda columns: (columns[0], columns[1], columns[2]))

schema = StructType([
    StructField("col1", StringType()),
    StructField("col2", StringType()),
    StructField("col3", StringType())
])

table = hc.applySchema(rows, schema)

table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data
Example #14
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, IntegerType, FloatType, StringType

conf = SparkConf().setAppName("spark_sql_datatype_struct")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([((1, 2.0, "3.0"), )])

schema = StructType([
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select struct.first, struct.second, struct.third from temp_table"
).collect()

sc.stop()
Example #15
conf = SparkConf().setAppName("spark_sql_datatype")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([
    (int("127"), int("32767"), int("2147483647"), long("9223372036854775807"),
     float("1.1"), float("2.2"), Decimal("3.3"), "str", bool(0),
     datetime(2015, 9, 22, 9, 39, 45), date(2015, 9, 22), [1, 2, 3], {
         "key": "value"
     }, (1, 2.0, "3.0"))
])

schema = StructType([
    StructField("byte", ByteType(), False),
    StructField("short", ShortType(), False),
    StructField("int", IntegerType(), False),
    StructField("long", LongType(), False),
    StructField("float", FloatType(), False),
    StructField("double", DoubleType(), False),
    StructField("decimal", DecimalType(), False),
    StructField("string", StringType(), False),
    StructField("boolean", BooleanType(), False),
    StructField("timestamp", TimestampType(), False),
    StructField("date", DateType(), False),
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), StringType(), False), False),
    StructField(
        "struct",
        StructType([
Example #16
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, StructType, StructField, StringType, IntegerType, ArrayType, FloatType, MapType

conf = SparkConf().setAppName("spark_sql_udf")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([("value", )])

schema = StructType([StructField("col", StringType(), False)])

table = hc.applySchema(source, schema)

table.registerTempTable("temp_table")


def func_string():
    return "abc"


hc.registerFunction("func_string", func_string)

rows = hc.sql("select func_string() from temp_table").collect()


def func_int():
    return 123

# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, MapType, StringType, IntegerType

conf = SparkConf().setAppName("spark_sql_datatype_map")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([({"key1": 1, "key2": 2}, )])

schema = StructType([
    StructField("col_map", MapType(StringType(), IntegerType(), False), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select col_map['key1'], col_map['key2'] from temp_table").collect()

sc.stop()

for row in rows:
    print row
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
import decimal
from datetime import datetime, date
from pyspark.sql import StructType, StructField, LongType

conf = SparkConf().setAppName("spark_sql_datatype_long")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize(
    [(9223372036854775807, 9223372036854775807)])

schema = StructType([StructField("col1", LongType(), False),
                     StructField("col2", LongType(), False)])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

"""
rows = hc.sql("select col1 + col2 from temp_table").collect()
"""

"""
rows = hc.sql(
    "select cast(col1 as bigint) + cast(col2 as bigint) from temp_table").collect()
"""