Example #1
from pyspark.sql import HiveContext
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Schema for the item-info records; this snippet assumes `sc` (a SparkContext)
# and a `data` RDD are defined earlier, and the leading StructField entries
# are truncated in the source.
schema = StructType([ \
                     StructField('postPrice', FloatType(), True), \
                     StructField('userNick', StringType(), True), \
                     StructField('categoryId', StringType(), True), \
                     StructField('categoryName', StringType(), True), \
                     StructField('fishPoolId', StringType(), True), \
                     StructField('fishpoolName', StringType(), True), \
                     StructField('bar', StringType(), True), \
                     StructField('barInfo', StringType(), True), \
                     StructField('abbr', StringType(), True), \
                     StructField('shiren', StringType(), True), \
                     StructField('zhima', StringType(), True), \
                     StructField('ts', StringType(), True)
                     ])
hc = HiveContext(sc)
df = hc.createDataFrame(data, schema)
hc.registerDataFrameAsTable(df, "xianyu_iteminfo")
hc.sql(
    "insert OVERWRITE table  wl_base.`t_base_ec_xianyu_iteminfo_parquet` PARTITION(ds = '"
    + lastday + "') "
    "select "
    "case when t1.itemid is null then t2.itemid else t1.itemid end, "
    "case when t1.itemid is null then t2.userid else t1.userid end, "
    "case when t1.itemid is null then t2.phone else t1.phone end, "
    "case when t1.itemid is null then t2.contacts else t1.contacts end, "
    "case when t1.itemid is null then t2.title else t1.title end, "
    "case when t1.itemid is null then t2.province else t1.province end, "
    "case when t1.itemid is null then t2.city else t1.city end, "
    "case when t1.itemid is null then t2.area else t1.area end, "
    "case when t1.itemid is null then t2.auctionType else t1.auctionType end, "
    "case when t1.itemid is null then t2.description else t1.description end, "
    "case when t1.itemid is null then t2.detailFrom else t1.detailFrom end, "
Example #2

#Importing the YouTube data file using the schema built for this analysis
df = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true').schema(schema_youtube)\
    .load('file:///home/cloudera/Downloads/YouTubeData.csv')
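The load above assumes schema_youtube was defined earlier in the script. For reference, a minimal sketch of such a schema, inferred from the columns queried below (the types are assumptions):

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema_youtube = StructType([
    StructField('video_id', StringType(), True),
    StructField('title', StringType(), True),
    StructField('category_id', StringType(), True),
    StructField('views', IntegerType(), True),
    StructField('likes', IntegerType(), True),
    StructField('dislikes', IntegerType(), True),
    StructField('comment_count', IntegerType(), True),
    StructField('country', StringType(), True)
])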


#Checking that the data is loaded properly
print("*********************VIEWING THE IMPORTED DATASET****************************")
df.show(n=10)



#Selecting only the columns from the dataframe that we are interested in,
#which are video_id, title, category_id, views, likes, dislikes, comment_count, country
sqlContext.registerDataFrameAsTable(df,"table1")
youtube_df = sqlContext.sql("SELECT video_id,title,category_id,views,likes,dislikes,comment_count,country from table1")
youtube_df.show()
youtube_df.registerTempTable('DataTable')

youtube_df1 = sqlContext.sql("SELECT * FROM DataTable")

print("*********************VIEWING YOUTUBE DATASET****************************")
youtube_df1.show(n=10)
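The same projection can be written with the DataFrame API instead of SQL; a sketch equivalent to the SELECT above:

#Equivalent column selection through the DataFrame API
youtube_df_api = df.select('video_id', 'title', 'category_id', 'views',
                           'likes', 'dislikes', 'comment_count', 'country')
youtube_df_api.show(n=10)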



#Importing the YouTube category details from the CSV file
youtube_categories = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', inferschema='true') \
    .load('file:///home/cloudera/Downloads/Categories.csv')
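A typical next step, not shown in this snippet, is joining the videos with their category names. A sketch assuming Categories.csv exposes category_id and category_name columns (hypothetical names):

#Hypothetical join of videos with their category names
sqlContext.registerDataFrameAsTable(youtube_categories, "categories")
joined_df = sqlContext.sql(
    "SELECT v.video_id, v.title, c.category_name, v.views "
    "FROM DataTable v JOIN categories c ON v.category_id = c.category_id")
joined_df.show(n=10)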
Example #3

import math

# AFINN sentiment scorer: the function header and the loading of the `afinn`
# word-to-score dictionary are truncated in the source snippet, so the first
# lines here are a reconstruction.
def sentiment(text):
    words = text.split()
    sentiments = []
    for word in words:
        if word in afinn:
            sentiments.append(int(afinn[word]))
        else:
            sentiments.append(0)
    if sentiments:
        sentiment = float(sum(sentiments)) / math.sqrt(len(sentiments))
    else:
        sentiment = 0
    return sentiment
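A quick sanity check of the scorer, using a tiny hypothetical subset of the AFINN dictionary:

afinn = {'good': 3, 'bad': -3}        # hypothetical subset of AFINN-111
print(sentiment("good good bad"))     # (3 + 3 - 3) / sqrt(3) = 1.73...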

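# Pair each tweet's sentiment score with its name field, fan the value out
# to one (name, score) record per entry, then total the scores per name
# (the `tweets` DataFrame and `candidates` RDD are assumed loaded earlier)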
sentimentTuple = tweets.rdd.map(lambda r: [r.id, r.text, r.name]) \
               .map(lambda r: [sentiment(r[1]),r[2]]) \
               .flatMapValues(lambda x: x) \
               .map(lambda y: (y[1],y[0])) \
               .reduceByKey(lambda x, y: x+y) \
               .sortByKey(ascending=True)

scoreDF = sentimentTuple.join(candidates) \
            .map(lambda kv: (kv[1][1], kv[1][0])) \
            .reduceByKey(lambda a, b: a + b) \
            .toDF()

scoreRenameDF = scoreDF.withColumnRenamed("_1", "Candidate").withColumnRenamed(
    "_2", "Score")

sqlCtx.registerDataFrameAsTable(scoreRenameDF, "SCORE_TEMP")

sqlCtx.sql(
    "INSERT OVERWRITE TABLE candidate_score SELECT Candidate, Score FROM SCORE_TEMP"
)
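Instead of registering a temp table and issuing SQL, the same write can be done directly through the DataFrameWriter API (assuming the column order matches the candidate_score table):

# Equivalent direct write into the Hive table
scoreRenameDF.write.insertInto("candidate_score", overwrite=True)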
Example #4
# In this case the tables are already stored in Hive. We can list them with:

hiveContext.tables().show()

# Load the data from the tweets table

tweets = hiveContext.table("tweets28a")
print("\nLos datos cargados incluyen {} tweets\n".format(tweets.count()))

tweets.printSchema()

# Analyze the tweets that are geolocated

hiveContext.sql('DROP TABLE IF EXISTS tweets')
hiveContext.registerDataFrameAsTable(tweets, "tweets")
tweets_place = hiveContext.sql(
    "SELECT place.name, COUNT(text) AS tweets FROM tweets WHERE place IS NOT NULL GROUP BY place.name ORDER BY tweets DESC"
)
tweets_place.limit(10).show()
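The same aggregation can also be expressed with the DataFrame API; a sketch equivalent to the query above:

from pyspark.sql.functions import col, count

tweets_place_api = tweets.where(col("place").isNotNull()) \
    .groupBy("place.name") \
    .agg(count("text").alias("tweets")) \
    .orderBy("tweets", ascending=False)
tweets_place_api.limit(10).show()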

# We can do the same analysis using RDDs

tweets_geo = hiveContext.sql(
    "SELECT place.name FROM tweets WHERE place IS NOT NULL")
tweets_place_rdd = tweets_geo.rdd
tweets_place = tweets_place_rdd.toDF()

from pyspark.sql.functions import lit

tweets_place = tweets_place.withColumn("tweets", lit(1))

tweets_place = tweets_place.groupBy("name")\