StructField('postPrice', FloatType(), True), \
StructField('userNick', StringType(), True), \
StructField('categoryId', StringType(), True), \
StructField('categoryName', StringType(), True), \
StructField('fishPoolId', StringType(), True), \
StructField('fishpoolName', StringType(), True), \
StructField('bar', StringType(), True), \
StructField('barInfo', StringType(), True), \
StructField('abbr', StringType(), True), \
StructField('shiren', StringType(), True), \
StructField('zhima', StringType(), True), \
StructField('ts', StringType(), True)
])
# Build a DataFrame from `data` with the schema above and expose it to
# Spark SQL as the temporary table "xianyu_iteminfo".
hc = HiveContext(sc)
df = hc.createDataFrame(data, schema)
hc.registerDataFrameAsTable(df, "xianyu_iteminfo")
# Overwrite yesterday's partition of the target Hive table. Each output
# column prefers t1's value and falls back to t2's when t1.itemid is NULL —
# presumably t1/t2 come from a join defined later in this statement (the
# FROM clause is outside this chunk; verify there).
# NOTE(review): `lastday` is string-concatenated into the SQL text — assumes
# it is an internally generated date string, not user input; confirm upstream.
hc.sql(
    "insert OVERWRITE table wl_base.`t_base_ec_xianyu_iteminfo_parquet` PARTITION(ds = '" + lastday + "') "
    "select "
    "case when t1.itemid is null then t2.itemid else t1.itemid end, "
    "case when t1.itemid is null then t2.userid else t1.userid end, "
    "case when t1.itemid is null then t2.phone else t1.phone end, "
    "case when t1.itemid is null then t2.contacts else t1.contacts end, "
    "case when t1.itemid is null then t2.title else t1.title end, "
    "case when t1.itemid is null then t2.province else t1.province end, "
    "case when t1.itemid is null then t2.city else t1.city end, "
    "case when t1.itemid is null then t2.area else t1.area end, "
    "case when t1.itemid is null then t2.auctionType else t1.auctionType end, "
    "case when t1.itemid is null then t2.description else t1.description end, "
    "case when t1.itemid is null then t2.detailFrom else t1.detailFrom end, "
#Importing the Youtube data file using the schema build for analysis df = sqlContext.read.format('com.databricks.spark.csv') \ .options(header='true').schema(schema_youtube)\ .load('file:///home/cloudera/Downloads/YouTubeData.csv') #Checking that the data is loaded properly print("*********************VIEWING THE IMPORTED DATASET****************************") df.show(n=10) #Selecting only the columns from the dataframe that we are interested in #which are video_id , title , category_id , views , likes, dislikes, comment_count, country sqlContext.registerDataFrameAsTable(df,"table1") youtube_df = sqlContext.sql("SELECT video_id,title,category_id,views,likes,dislikes,comment_count,country from table1") youtube_df.show() youtube_df.registerTempTable('DataTable') youtube_df1 = sqlContext.sql("SELECT * FROM DataTable") print("*********************VIEWING YOUTUBE DATASET****************************") youtube_df1.show(n=10) #Importing the Youtube Category details from the text file youtube_categories = sqlContext.read.format('com.databricks.spark.csv') \ .options(header='true', inferschema='true') \ .load('file:///home/cloudera/Downloads/Categories.csv')
for word in words:
    # Tail of a sentiment-scoring function whose `def` line is above this
    # chunk: look up each word in the AFINN lexicon, scoring 0 for words
    # not in the lexicon.
    # NOTE(review): dict.has_key() is Python 2 only (this file also uses
    # py2 tuple-parameter lambdas below) — use `word in afinn` if porting.
    if afinn.has_key(word):
        sentiments.append(int(afinn[word]))
    else:
        sentiments.append(0)
# Normalized score: sum of per-word scores scaled by 1/sqrt(word count);
# `sentiments` is empty only when `words` is, hence the 0 fallback.
if sentiments:
    sentiment = float(sum(sentiments)) / math.sqrt(len(sentiments))
else:
    sentiment = 0
return sentiment

# Score each tweet's text and pair scores with r.name, then sum per key.
# NOTE(review): flatMapValues(lambda x: x) flattens the value — if r.name is
# a plain string this iterates its characters, which looks unintended;
# presumably r.name is a collection of candidate names. Confirm the schema.
sentimentTuple = tweets.rdd.map(lambda r: [r.id, r.text, r.name]) \
    .map(lambda r: [sentiment(r[1]),r[2]]) \
    .flatMapValues(lambda x: x) \
    .map(lambda y: (y[1],y[0])) \
    .reduceByKey(lambda x, y: x+y) \
    .sortByKey(ascending=True)
# Join with the `candidates` RDD, re-key by candidate, total the scores,
# and overwrite the Hive table candidate_score via a temp table.
scoreDF = sentimentTuple.join(candidates) \
    .map(lambda (x, y): (y[1],y[0])) \
    .reduceByKey(lambda a, b: a + b) \
    .toDF()
scoreRenameDF = scoreDF.withColumnRenamed("_1", "Candidate").withColumnRenamed( "_2", "Score")
sqlCtx.registerDataFrameAsTable(scoreRenameDF, "SCORE_TEMP")
sqlCtx.sql(
    "INSERT OVERWRITE TABLE candidate_score SELECT Candidate, Score FROM SCORE_TEMP"
)
# En este caso, tenemos tablas ya guardadas en Hive.Podemos verlas con el comando: hiveContext.tables().show() # Cargamos la información de la tabla tweets tweets = hiveContext.table("tweets28a") print("\nLos datos cargados incluyen {} tweets\n".format(tweets.count())) tweets.printSchema() # Analizamos los tweets que están geolocalizados hiveContext.sql('DROP TABLE IF EXISTS tweets') hiveContext.registerDataFrameAsTable(tweets, "tweets") tweets_place = hiveContext.sql( "SELECT place.name, COUNT(text) AS tweets FROM tweets WHERE place IS NOT NULL GROUP BY place.name ORDER BY tweets DESC" ) tweets_place.limit(10).show() # Podemos hacer el mismo análisis a través de RDDs tweets_geo = hiveContext.sql( "SELECT place.name FROM tweets WHERE place IS NOT NULL") tweets_place_rdd = tweets_geo.rdd tweets_place = tweets_place_rdd.toDF() tweets_place = tweets_place.withColumn("tweets", lit(1)) tweets_place = tweets_place.groupBy("name")\