Example #1
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import StringType

conf = SparkConf().setAppName("spark_sql_udf")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

lines = sc.parallelize(["a", "b", "c"])

people = lines.map(lambda value: Row(name=value))

peopleSchema = hc.inferSchema(people)

peopleSchema.registerTempTable("people")


def myfunc(value):
    return value.upper()


hc.registerFunction("myfunc", myfunc, StringType())

rows = hc.sql("select myfunc(name) from people").rdd.filter(
    lambda row: isinstance(row, tuple)).collect()

sc.stop()

for row in rows:
    print row, type(row[0])
Example #2
# Find the panda lovers
tbl = rows.registerTempTable("people")
pandaFriends = hiveCtx.sql("SELECT name FROM people WHERE favouriteAnimal = \"panda\"")
print "Panda friends"
print pandaFriends.map(lambda row: row.name).collect()

# Save the results (here written out as a text file; see the Parquet sketch below)
pandaFriends.saveAsTextFile("hdfs://...")
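# A minimal Parquet-save sketch, since the original comment mentioned Parquet:
# saveAsParquetFile is the Spark 1.x SchemaRDD/DataFrame method, and the path
# below is the same placeholder as above, not a real location.
# pandaFriends.saveAsParquetFile("hdfs://...")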


# Read JSON data with Spark SQL in Python
input = hiveCtx.jsonFile(inputFile)

# Create a SchemaRDD in Python from Row objects (named tuples)
happyPeopleRDD = sc.parallelize([Row(name = "holden", favouriteBeverage = "coffee")])
happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
happyPeopleSchemaRDD.registerTempTable("happy_people")

# ****************  Using Beeline and the JDBC/ODBC server  ****************
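# A minimal sketch, assuming the standard Spark 1.x tooling (commands are not
# from the original): start the Thrift JDBC/ODBC server with
# sbin/start-thriftserver.sh, then connect with
# bin/beeline -u jdbc:hive2://localhost:10000 and run SQL against tables
# registered in the Hive metastore.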

# User-defined functions (UDFs)

# String length UDF
# Write a UDF that computes the length of a string
hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")


# ----------------------------------------------------------------------------

Spark Streaming uses a discretized stream, called a DStream, as its abstraction.
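As a rough illustration of that abstraction, here is a minimal DStream word-count sketch; the socket host/port and the one-second batch interval are assumptions for the example, not taken from the original.

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

conf = SparkConf().setAppName("dstream_word_count_sketch")
sc = SparkContext(conf=conf)

# Each batch interval produces one RDD inside the DStream
ssc = StreamingContext(sc, 1)
lines = ssc.socketTextStream("localhost", 9999)

counts = lines.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()
ssc.awaitTermination()
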
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("spark_sql_delimiter_infer_schema")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize(["row1_col1 row1_col2 row1_col3",
                         "row2_col1 row2_col2 row2_col3", "row3_col1 row3_col2 row3_col3"])

columns = source.map(lambda line: line.split(" ")).filter(
    lambda columns: columns and len(columns) == 3)

rows = columns.map(
    lambda columns: Row(col1=columns[0], col2=columns[1], col3=columns[2]))

table = hc.inferSchema(rows)

table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row

conf = SparkConf().setAppName("spark_sql_cache_table_extend")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

dataRDD = sc.textFile(
    "/user/hdfs/rawlog/app_weibomobile03x4ts1kl_mwb_interface/"
).map(lambda line: line.split(",")).filter(lambda words: len(words) >= 3).map(
    lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))

sourceRDD = hc.inferSchema(dataRDD)

sourceRDD.registerAsTable("source")

hc.cacheTable("source")

hc.sql("select count(*) from source").collect()

hc.sql("select col2, max(col3) from source group by col2").collect()

hc.sql("select col3, min(col2) from source group by col3").collect()

# hc.uncacheTable("source")

sc.stop()
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("TwitterAnalytics")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row : row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="ganguly", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("strong_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
import re

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

# Setup mirrors the delimiter-based example above; the app name is illustrative,
# since the original fragment omitted the imports and context creation.
conf = SparkConf().setAppName("spark_sql_regex_infer_schema")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize(["row1_col1 row1_col2 row1_col3",
                         "row2_col1 row2_col2 row2_col3", "row3_col1 row3_col2 row3_col3"])

pattern = re.compile("(.*) (.*) (.*)")


def parse(line):
    matcher = pattern.match(line)

    if matcher:
        return matcher.groups()
    else:
        return None


columns = source.map(parse).filter(
    lambda columns: columns and len(columns) == 3)

rows = columns.map(
    lambda columns: Row(col1=columns[0], col2=columns[1], col3=columns[2]))

table = hc.inferSchema(rows)

table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

if __name__ == "__main__":
	# The original fragment omitted the imports and conf; the app name is illustrative
	conf = SparkConf().setAppName("avg_sales_per_day")
	sc = SparkContext(conf=conf)
	hc = HiveContext(sc)


	hc.sql("set spark.sql.shuffle.partitions = 10")

	orderMap = sc.textFile("hdfs:///user/hive/warehouse/retaildb.db/orders")\
	.map(lambda record : record.split(","))\
	.map(lambda record : Row(orderID=int(record[0]),orderDate=record[1][:11]))
	
	itemMap  = sc.textFile("hdfs:///user/hive/warehouse/retaildb.db/order_items")\
 	.map(lambda record : record.split(","))\
	.map(lambda row: (int(row[1]),float(row[4]))).reduceByKey(lambda x,y : x+y)\
	.map(lambda record : Row(orderID=int(record[0]),Total=record[1]))

	oSchema = hc.inferSchema(orderMap)
	iSchema = hc.inferSchema(itemMap)

	oSchema.registerTempTable("orders")
	iSchema.registerTempTable("items")

	avgSalesPerDay = hc.sql(" SELECT o.orderDate, avg(i.Total) as avgSales \
                            from orders o join items i \
                            where o.orderID = i.orderID \
                            group by o.orderDate \
                            order by avgSales DESC")


	avgSalesPerDay.map(lambda row : ",".join([row.orderDate,str(row.avgSales)]))\
  	.coalesce(1) \
  	.saveAsTextFile("AvgSalesPerDay2")
Example #10
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName('spark_sql_test')
sc = SparkContext(conf=conf)

#sqlContext = SQLContext(sc);
hc = HiveContext(sc)

# Parallelize a list and convert each line to a Row
# Row(id=1, name="a", age=28)
# datas -> Spark RDD source, type = str
datas = ['1 a 28', '2 b 39', '3 c 30']
source = sc.parallelize(datas)
splits = source.map(lambda line: line.split(" "))
rows = splits.map(lambda words: Row(id=int(words[0]), name=words[1], age=int(words[2])))

# Infer the schema, and register the Schema as a table
people = hc.inferSchema(rows)
people.printSchema()

# SQL can be run over SchemaRDD that have been registered as a table
people.registerTempTable("people")
results = hc.sql('select * from people where age > 28 and age < 30')
results.printSchema()

# The results of SQL queries are SchemaRDD, so register it as a table
results.registerTempTable("people2")
results2 = hc.sql('select name from people2')
results2.printSchema()

# The SchemaRDD support all the normal RDD operations
results3 = results2.map(lambda row: row.name.upper()).collect()
for result in results3:
    print result


from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("bleh")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)
sql = """
select
distinct date_category
from
ck_membership.date_driver
LIMIT 10
"""
r = sqlContext.sql(sql)
for i in r.collect():
    print(i)
r.saveAsParquetFile("hdfs://nameservice1/data/unmanaged/datascience_ck/vish/mydata")
sqlContext.sql("CREATE EXTERNAL TABLE vsubr2.some_date_driver1 (date_category String) STORED AS PARQUET LOCATION 'hdfs://nameservice1/data/unmanaged/datascience_ck/vish/mydata'")

# Scratch note from the original: the result r can also be treated like an RDD,
# e.g. r2 = sqlContext.inferSchema(r.rdd), though the schema is already known here.

# Register an external table over the saved data (createExternalTable is a method
# of the HiveContext, not of r):
sqlContext.createExternalTable('vsubr2.some_Date_driver',
                               path='hdfs://nameservice1/data/unmanaged/datascience_ck/vish/')

# Unfinished draft of an equivalent CREATE TABLE ... AS statement, left incomplete
# in the original notes:
# create table if not exists vsubr2.some_date_driver1
# LOCATION 'hdfs://nameservice1/data/unmanaged/datascience_ck/vish/'
# as ...