Example #1
        StructField("timestamp", TimestampType(), False),
        StructField("date", DateType(), False),
        StructField("array", ArrayType(IntegerType(), False), False),
        StructField("col_map", MapType(StringType(), StringType(), False), False),
        StructField(
            "struct",
            StructType(
                [
                    StructField("first", IntegerType(), False),
                    StructField("second", FloatType(), False),
                    StructField("third", StringType(), False),
                ]
            ),
            False,
        ),
    ]
)

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select byte, short, int, long, float, double, decimal, string, boolean, timestamp, date, array[0], array[1], array[2], col_map['key'], struct.first, struct.second, struct.third from temp_table"
).collect()

sc.stop()

for row in rows:
    print(row)
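
Note: applySchema and registerAsTable are Spark 1.x HiveContext calls; Spark 2.0 replaced them with createDataFrame and createOrReplaceTempView. A minimal sketch of the same flow on the modern API, assuming a SparkSession named spark and the same source and schema as above:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("complex_types").getOrCreate()

# createDataFrame replaces HiveContext.applySchema
table = spark.createDataFrame(source, schema)

# createOrReplaceTempView replaces registerAsTable
table.createOrReplaceTempView("temp_table")

rows = spark.sql(
    "select array[0], col_map['key'], struct.first from temp_table"
).collect()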
Example #2
sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([
    "row1_col1 row1_col2 row1_col3", "row2_col1 row2_col2 row3_col3",
    "row3_col1 row3_col2 row3_col3"
])

columns = source.map(lambda line: line.split(" ")).filter(
    lambda columns: columns and len(columns) == 3)

rows = columns.map(lambda columns: (columns[0], columns[1], columns[2]))

schema = StructType([
    StructField("col1", StringType(), False),
    StructField("col2", StringType(), False),
    StructField("col3", StringType(), False)
])

table = hc.applySchema(rows, schema)

table.registerAsTable("temp_mytable")

rdd = hc.sql("select count(*) from temp_mytable where col1 = '' group by col2")

print(rdd.toDebugString())

sc.stop()
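
Note: hc.sql builds the query lazily, so toDebugString prints the RDD lineage without running the job. On the Spark 2.x DataFrame API the analogous inspection call is explain(); a sketch assuming a SparkSession named spark with temp_mytable already registered:

df = spark.sql("select count(*) from temp_mytable where col1 = '' group by col2")
df.explain(True)  # prints the parsed, analyzed, optimized, and physical plans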
Example #3
sc = SparkContext(conf=conf)

hc = HiveContext(sc)

datas = ["1 a 28", "2 b 29", "3 c 30"]

source = sc.parallelize(datas)

splits = source.map(lambda line: line.split(" "))

rows = splits.map(lambda words: (int(words[0]), words[1], int(words[2])))

fields = []

fields.append(StructField("id", IntegerType(). True))
fields.append(StructField("name", StringType(). True))
fields.append(StructField("age", IntegerType(). True))

schema = StructType(fields)

people = hc.applySchema(rows, schema)

people.registerTempTable("people")

results = hc.sql("select * from people where age>28 and age<30").collect()

sc.stop()

for result in results:
    print("id: %s, name: %s, age: %s" % (result.id, result.name, result.age))
Example #4

def parse(line):
    matcher = pattern.match(line)

    if matcher:
        return matcher.groups()
    else:
        return None

columns = source.map(parse).filter(
    lambda columns: columns and len(columns) == 3)

rows = columns.map(lambda columns: (columns[0], columns[1], columns[2]))

schema = StructType([
    StructField("col1", StringType(), False),
    StructField("col2", StringType(), False),
    StructField("col3", StringType(), False)
])

table = hc.applySchema(rows, schema)

table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print(data)
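
Note: the parse function above matches against a pattern that is never shown; it is assumed to be a compiled regular expression defined earlier. A hypothetical definition that would satisfy the len(columns) == 3 filter for space-separated input:

import re

# hypothetical pattern: captures three whitespace-separated columns per line
pattern = re.compile(r"^(\S+)\s+(\S+)\s+(\S+)$")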
Example #5

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([([1, 2, 3], {"key1": 1, "key2": 2}, (1, 2.0, "3.0"))])

schema = StructType([
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), IntegerType(), False), False),
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select array[0], array[1], array[2], col_map['key1'], col_map['key2'], struct.first, struct.second, struct.third from temp_table"
).collect()

sc.stop()

for row in rows:
    print(row)
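
Note: the array, map, and struct elements can also be selected without SQL through column expressions; a sketch on the Spark 1.3+/2.x DataFrame API, using the table built above:

from pyspark.sql.functions import col

rows = table.select(
    col("array")[0],         # array element by index
    col("col_map")["key1"],  # map value by key
    col("struct.first"),     # struct field by dotted path
).collect()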