def test_avg(self):
    data = [('Benny', 86), ('Jenny', 77), ('Oscar', 55), ('Scarlett', 89)]
    df = self.spark.createDataFrame(data)
    df = df.withColumnRenamed('_1', 'name').withColumnRenamed('_2', 'marks')
    # data is returned as a list of tuples
    self.assertEqual(76.75, df.select(avg(df.marks)).collect()[0][0])
def process(time, rdd):
    print("========= %s =========" % str(time))
    try:
        # Get the singleton instance of SQLContext
        sqlContext = getSqlContextInstance(rdd.context)
        # Convert RDD[String] to RDD[Row] to DataFrame
        parts = rdd.map(lambda line: line.split(","))
        delays_rdd = parts.map(lambda w: Row(carrier=w[0], origin=w[1], delay=float(w[2])))
        delays = sqlContext.createDataFrame(delays_rdd, samplingRatio=1)
        avg_delays = delays.groupBy("origin", "carrier").agg(F.avg(delays.delay).alias('average'))
        avg_delays.write.format("org.apache.spark.sql.cassandra").\
            options(table="task2_part2_group2_1", keyspace="mykeyspace").\
            save(mode="append")
        # Register as table
        #dataFrame.registerTempTable("origin_carrier_delays")
        # Do word count on table using SQL and print it
        #carrier_delays_df = \
        #    sqlContext.sql("SELECT origin, carrier, avg(delay) AS average FROM origin_carrier_delays GROUP BY origin, carrier")
        #carrier_delays_df.registerTempTable("origin_carrier_avg_delays")
        #carrier_avg_delays_df = \
        #    sqlContext.sql("SELECT origin, carrier, avg_delay FROM origin_carrier_avg_delays GROUP BY origin ORDER BY avg_delay LIMIT 10")
        #for i in carrier_delays_df.rdd.takeOrderedByKey(10, sortValue=lambda x: x[2], reverse=False).map(lambda x: x[1]).collect():
        #    print(i)
        #dataFrame.select("origin", "carrier", "delay").write \
        #carrier_delays_df.write \
        #    .format("org.apache.spark.sql.cassandra") \
        #    .options(table="task2_part2_group2_1", keyspace="mykeyspace") \
        #    .save(mode="append")
        #carrier_delays_df.show()
    except Exception as e:
        print(e)
def getValueFieldValueLists(self, handlerId, keyFields, valueFields):
    df = self.entity.groupBy(keyFields)
    agg = self.options.get("aggregation", self.getDefaultAggregation(handlerId))
    maxRows = int(self.options.get("rowCount", "100"))
    numRows = min(maxRows, df.count())
    valueLists = []
    for valueField in valueFields:
        valueDf = None
        if agg == "SUM":
            valueDf = df.agg(F.sum(valueField).alias("agg"))
        elif agg == "AVG":
            valueDf = df.agg(F.avg(valueField).alias("agg"))
        elif agg == "MIN":
            valueDf = df.agg(F.min(valueField).alias("agg"))
        elif agg == "MAX":
            valueDf = df.agg(F.max(valueField).alias("agg"))
        else:
            valueDf = df.agg(F.count(valueField).alias("agg"))
        for keyField in keyFields:
            valueDf = valueDf.sort(F.col(keyField).asc())
        valueDf = valueDf.dropna()
        rows = valueDf.select("agg").take(numRows)
        valueList = []
        for row in rows:
            valueList.append(row["agg"])
        valueLists.append(valueList)
    return valueLists
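# A table-driven variant of the if/elif ladder above keeps the aggregation
# mapping in one place; a minimal sketch (AGG_FUNCS and agg_column are
# illustrative names, not part of the original code), falling back to COUNT
# like the else branch above:
import pyspark.sql.functions as F

AGG_FUNCS = {"SUM": F.sum, "AVG": F.avg, "MIN": F.min, "MAX": F.max}

def agg_column(grouped, agg_name, value_field):
    # Unknown aggregation names fall back to COUNT, mirroring the else branch.
    fn = AGG_FUNCS.get(agg_name, F.count)
    return grouped.agg(fn(value_field).alias("agg"))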
def process_ratings(time, rdd):
    print "============== %s ============" % str(time)
    # # ts = now()
    # print "TIME AS now(): {}".format(ts)
    local_sql = getSqlContextInstance(rdd.context)
    from datetime import datetime
    ts = datetime.now()
    # from pyspark.sql.types import *
    # schema = StructType([
    #     StructField("user_id", IntegerType(), True),
    #     StructField("movie_id", IntegerType(), True),
    #     StructField("rating", FloatType(), True),
    #     StructField("timestamp")
    # ]
    # )
    ratings = rdd.map(lambda line: line.split("::"))
    row_rdd = ratings.map(lambda (user_id, movie_id, rating, timestamp):
                          Row(movie_id=int(movie_id), user_id=int(user_id),
                              rating=float(rating), ts=ts))
    ratings = local_sql.createDataFrame(row_rdd, samplingRatio=1)
    # ratings.show()
    # df.registerTempTable("ratings")
    # I want to get the average rating, and count of the number of ratings
    # for each movie and persist it to cassandra
    from pyspark.sql import functions as F
    # movie_ids = ratings.select("movie_id").distinct()
    # movie_ids.show()
    # create table movie_ratings_time_series ( movie_id int, ts timeuuid, rating float, primary key (movie_id, ts) );
    avg_ratings = ratings.groupBy("movie_id", "ts").agg(F.avg(ratings.rating).alias('rating'))
    avg_ratings.write.format("org.apache.spark.sql.cassandra").\
        options(table="movie_ratings_time_series", keyspace="training").\
        save(mode="append")
    # writer("movie_ratings_time_series", avg_ratings)
    # movie_to_ts = local_sql.sql("select distinct movie_id, ts from ratings")
    # movie_to_ts.registerTempTable("movie_ts")
    # going to join this against itself
    # agg = local_sql.sql("SELECT movie_id, avg(rating) as a, count(rating) as c from ratings group by movie_id")
    # agg.registerTempTable("movie_aggregates")
    # matched = local_sql.sql("select a.movie_id, b.ts, a.a, a.c from movie_aggregates a join movie_ts b on a.movie_id = b.movie_id ")
    # writer(matched, "movie_stream_ratings")
    print "========== DONE WRITING ============== "
def process_ratings(time, rdd):
    if rdd.isEmpty():
        print "============== RDD Is Empty. Give it a few moments to get the stream. You started the stream right?"
        return
    print "============== %s ============" % str(time)
    # Note: this import rebinds the local name `time`, shadowing the
    # batch-time parameter used in the print above.
    import time
    ts = time.time()
    ratings = rdd.map(lambda line: line.split("::"))
    row_rdd = ratings.map(
        lambda (user_id, movie_id, rating, timestamp): Row(
            movie_id=int(movie_id),
            user_id=int(user_id),
            rating=float(rating),
            timestamp=int(timestamp)
        )
    )
    local_sql = getSqlContextInstance(rdd.context)
    ratings = local_sql.createDataFrame(row_rdd, samplingRatio=1)
    ratings.show()
    # Save dataFrame to rating_by_movie
    #writer('rating_by_movie', ratings)
    #ratings.registerTempTable("ratings")
    # I want to get the average rating, and count of the number of ratings
    # for each movie and persist it to cassandra
    #movie_ids = ratings.select("movie_id").distinct()
    #movie_ids.show()
    # create table movie_ratings_time_series ( movie_id int, ts timeuuid, rating float, primary key (movie_id, ts) );
    from pyspark.sql import functions as F
    avg_ratings = ratings.groupBy("movie_id", "timestamp").agg(F.avg(ratings.rating).alias('rating'))
    avg_ratings.show()
    #avg_ratings.write.format("org.apache.spark.sql.cassandra").\
    #    options(table="movie_ratings_time_series", keyspace="training").\
    #    save(mode="append")
    # writer("movie_ratings_time_series", avg_ratings)
    # movie_to_ts = local_sql.sql("select distinct movie_id, ts from ratings")
    # movie_to_ts.registerTempTable("movie_ts")
    # going to join this against itself
    # agg = local_sql.sql("SELECT movie_id, avg(rating) as a, count(rating) as c from ratings group by movie_id")
    # agg.registerTempTable("movie_aggregates")
    # matched = local_sql.sql("select a.movie_id, b.ts, a.a, a.c from movie_aggregates a join movie_ts b on a.movie_id = b.movie_id ")
    # writer(matched, "movie_stream_ratings")
    print "============== DONE WRITING ============== "
def handleUIOptions(self, displayColName):
    agg = self.options.get("aggregation")
    valFields = self.options.get("valueFields")
    if agg == 'COUNT':
        return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
    elif agg == 'SUM':
        return self.entity.groupBy(displayColName).agg(F.sum(valFields).alias("agg")).toPandas()
    elif agg == 'AVG':
        return self.entity.groupBy(displayColName).agg(F.avg(valFields).alias("agg")).toPandas()
    elif agg == 'MIN':
        return self.entity.groupBy(displayColName).agg(F.min(valFields).alias("agg")).toPandas()
    elif agg == 'MAX':
        return self.entity.groupBy(displayColName).agg(F.max(valFields).alias("agg")).toPandas()
    elif agg == 'MEAN':
        return self.entity.groupBy(displayColName).agg(F.mean(valFields).alias("agg")).toPandas()
    else:
        return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
def match_accidents_with_roads(spark, road_df, accident_df, use_cache=True):
    cache_path = workdir + 'data/matches_accident-road.parquet'
    if isdir(cache_path) and use_cache:
        print('Reading accident-road matches from cache...')
        return spark.read.parquet(cache_path)

    nb_top_road_center_preselected = 5
    max_distance_accepted = 10  # in meters

    # Compute distance between accident and road centers to identify the
    # top nb_top_road_center_preselected closest roads
    road_centers = (road_df
                    .select(['street_id', 'center_long', 'center_lat'])
                    .drop_duplicates())
    acc_window = (Window.partitionBy("accident_id").orderBy("distance_measure"))
    accidents_top_k_roads = \
        (accident_df
         .select('loc_lat', 'loc_long', 'accident_id')
         .crossJoin(road_centers)
         .withColumn('distance_inter',
                     distance_intermediate_formula('loc_lat', 'loc_long',
                                                   'center_lat', 'center_long'))
         .withColumn('distance_measure', distance_measure())
         .select('accident_id', 'street_id', 'distance_measure',
                 'loc_lat', 'loc_long',
                 rank().over(acc_window).alias('distance_rank'))
         .filter(col('distance_rank') <= nb_top_road_center_preselected)
         .drop('distance_measure', 'distance_rank')
         .persist())

    # For each accident identify the closest road point
    accidents_roads_first_match = \
        (accidents_top_k_roads
         .join(road_df, 'street_id')
         .withColumn('distance_inter',
                     distance_intermediate_formula('loc_lat', 'loc_long',
                                                   'coord_lat', 'coord_long'))
         .withColumn('distance_measure', distance_measure())
         .select('accident_id', 'loc_lat', 'loc_long', 'coord_lat',
                 'coord_long', 'street_id', 'street_name',
                 row_number().over(acc_window).alias('distance_rank'),
                 'distance_measure')
         .filter(col('distance_rank') == 1)
         .withColumn('distance', col('distance_measure') * (6371 * 2 * 1000))
         .drop('distance_rank', 'distance_measure', 'coord_lat', 'coord_long')
         .persist())

    # If the distance is lower than max_distance_accepted we keep the
    # accident/street matches
    accidents_road_correct_match = \
        (accidents_roads_first_match
         .filter(col('distance') < max_distance_accepted)
         .select('accident_id', 'street_id'))

    # If not, we try to get a better match by adding intermediate points on
    # the preselected streets.
    # For unsatisfying matches, recompute the k closest roads
    # (recomputing is probably faster than reading from disk
    # cache + joining on accident_ids)
    accidents_close_streets_coords = \
        (accidents_roads_first_match
         .filter(col('distance') >= max_distance_accepted)
         .select('accident_id', 'loc_lat', 'loc_long')
         .crossJoin(road_centers)
         .withColumn('distance_inter',
                     distance_intermediate_formula('loc_lat', 'loc_long',
                                                   'center_lat', 'center_long'))
         .withColumn('distance_measure', distance_measure())
         .select('accident_id', 'street_id', 'distance_measure',
                 'loc_lat', 'loc_long',
                 rank().over(acc_window).alias('distance_rank'))
         .filter(col('distance_rank') <= nb_top_road_center_preselected)
         .drop('distance_measure', 'distance_rank')
         .join(road_df.select('street_id', 'coord_lat', 'coord_long'),
               'street_id'))

    # Add the intermediate points
    street_rolling_window = \
        (Window.partitionBy('street_id').orderBy("coord_long").rowsBetween(0, +1))
    accidents_close_streets_with_additional_coords = \
        (accidents_close_streets_coords
         .select('accident_id', 'street_id', 'loc_lat', 'loc_long',
                 avg('coord_long').over(street_rolling_window).alias('coord_long'),
                 avg('coord_lat').over(street_rolling_window).alias('coord_lat'))
         .union(accidents_close_streets_coords)
         .dropDuplicates())
    accidents_close_streets_coords.unpersist()

    # Recompute distances between accident and new set of points
    # and use closest point to identify street
    accidents_roads_first_match_with_additional_coords = \
        (accidents_close_streets_with_additional_coords
         .withColumn('distance_inter',
                     distance_intermediate_formula('loc_lat', 'loc_long',
                                                   'coord_lat', 'coord_long'))
         .withColumn('distance_measure', distance_measure())
         .select('accident_id', 'street_id', 'loc_lat', 'loc_long',
                 'coord_lat', 'coord_long',
                 row_number().over(acc_window).alias('distance_rank'))
         .filter(col('distance_rank') == 1)
         .drop('distance_rank', 'loc_lat', 'loc_long', 'coord_lat', 'coord_long'))

    # Union accidents matched correctly with the first method with the
    # accidents for which we used more street points
    final_match = (accidents_road_correct_match.union(
        accidents_roads_first_match_with_additional_coords))

    # Make sure there is only one road per accident
    final_match = \
        (final_match
         .join(road_centers, 'street_id')
         .join(accident_df.select('loc_lat', 'loc_long', 'accident_id'),
               'accident_id')
         .withColumn('distance_inter',
                     distance_intermediate_formula('loc_lat', 'loc_long',
                                                   'center_lat', 'center_long'))
         .withColumn('distance_measure', distance_measure())
         .withColumn('dist_rank', row_number().over(acc_window))
         .filter(col('dist_rank') == 1)
         .select('accident_id', 'street_id'))

    return final_match
def start_stream(args):
    validate_params(args)
    _, brokers, topic = args
    spark = create_spark_session()

    json = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .load()
    json.printSchema()

    # Explicitly set schema
    schema = StructType([StructField("symbol", StringType(), False),
                         StructField("timestamp", TimestampType(), False),
                         StructField("price", DoubleType(), False)])
    json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm'Z'"}
    stocks_json = json \
        .select(from_json(F.col("value").cast("string"), schema, json_options).alias("content"))
    stocks_json.printSchema()

    stocks = stocks_json.select("content.*")

    ####################################
    # Stream to Parquet
    ####################################
    # Note: 'startingOffsets' is normally a Kafka source option; setting it
    # on the writer likely has no effect.
    query = stocks \
        .withColumn('year', year(F.col('timestamp'))) \
        .withColumn('month', month(F.col('timestamp'))) \
        .withColumn('day', dayofmonth(F.col('timestamp'))) \
        .withColumn('hour', hour(F.col('timestamp'))) \
        .withColumn('minute', minute(F.col('timestamp'))) \
        .writeStream \
        .format('parquet') \
        .partitionBy('year', 'month', 'day', 'hour', 'minute') \
        .option('startingOffsets', 'earliest') \
        .option('checkpointLocation', '/dataset/checkpoint') \
        .option('path', '/dataset/streaming.parquet') \
        .trigger(processingTime='30 seconds') \
        .start()

    avg_pricing = stocks \
        .groupBy(F.col("symbol")) \
        .agg(F.avg(F.col("price")).alias("avg_price"))

    ####################################
    # Console Output
    ####################################
    query2 = avg_pricing.writeStream \
        .outputMode('complete') \
        .format("console") \
        .trigger(processingTime="10 seconds") \
        .start()

    ####################################
    # Table in Memory
    ####################################
    # query3 = avg_pricing \
    #     .writeStream \
    #     .queryName("avgPricing") \
    #     .outputMode("complete") \
    #     .format("memory") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()
    #
    # while True:
    #     print('\n' + '_' * 30)
    #     # interactively query in-memory table
    #     spark.sql('SELECT * FROM avgPricing').show()
    #     print(query3.lastProgress)
    #     sleep(10)

    query2.awaitTermination()
extended_trips = trip_data \
    .withColumn("pick_date", f.to_date(trip_data["lpep_pickup_datetime"])) \
    .withColumn("pick_hour", f.hour(trip_data["lpep_pickup_datetime"])) \
    .withColumn("drop_date", f.to_date(trip_data["lpep_dropoff_datetime"])) \
    .withColumn("drop_hour", f.hour(trip_data["lpep_dropoff_datetime"])) \
    .withColumn("duration", f.unix_timestamp(trip_data["lpep_dropoff_datetime"]) -
                f.unix_timestamp(trip_data["lpep_pickup_datetime"]))

extended_trips = extended_trips.filter(trip_data["lpep_pickup_datetime"] > '2020-01-01 00:00:00')

hourly_taxi_trips = extended_trips \
    .groupBy("pick_date", "pick_hour").agg(
        f.count(extended_trips["fare_amount"]).alias("trip_count"),
        f.sum(extended_trips["passenger_count"]).alias("passenger_count"),
        f.sum(extended_trips["fare_amount"]).alias("fare_amount"),
        f.sum(extended_trips["tip_amount"]).alias("tip_amount"),
        f.sum(extended_trips["total_amount"]).alias("total_amount"),
        f.avg(extended_trips["duration"]).alias("avg_duration")
    )

# hourly_taxi_trips.write.mode("overwrite").parquet("./values/taxi-trips-hourly")

hourly_taxi_trips_drop = extended_trips \
    .groupBy("drop_date", "drop_hour").agg(
        f.count(extended_trips["fare_amount"]).alias("trip_count"),
        f.sum(extended_trips["passenger_count"]).alias("passenger_count"),
        f.sum(extended_trips["fare_amount"]).alias("fare_amount"),
        f.sum(extended_trips["tip_amount"]).alias("tip_amount"),
        f.sum(extended_trips["total_amount"]).alias("total_amount"),
        f.avg(extended_trips["duration"]).alias("avg_duration")
    )

daily_taxi_trips = hourly_taxi_trips.groupBy("pick_date").agg(
    f.sum(hourly_taxi_trips["trip_count"]).alias("trip_count"),
import os

app = MainApp()
app.init()
app.loadData()
app.createCheckInDataPerUser()
df_userLocs = app.user_locations
df_businessLocs = app.df_business

df_userFeatures = sqlContext.read.json(os.environ['WORKDIR'] + "user_features.json")
df_reviews = sqlContext.read.json(os.environ['WORKDIR'] + "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json")
df_business = sqlContext.read.json(os.environ['WORKDIR'] + "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json")

df_finalBusiness = df_business.join(df_businessLocs, df_business.business_id == df_businessLocs.business_id) \
    .select(df_business.business_id, df_business.stars)
df_joinBusinessLocsAndReviews = df_businessLocs.join(df_reviews, df_businessLocs.business_id == df_reviews.business_id) \
    .select(df_reviews.user_id, df_reviews.business_id, df_reviews.stars)
df_finalUsersBusinessRating = df_joinBusinessLocsAndReviews \
    .join(df_userLocs, df_userLocs.user_id == df_joinBusinessLocsAndReviews.user_id) \
    .select(df_joinBusinessLocsAndReviews.business_id, df_joinBusinessLocsAndReviews.stars) \
    .groupBy("business_id").agg(func.avg("stars").alias('avg_rating'))
df = df_finalUsersBusinessRating.join(df_finalBusiness, df_finalUsersBusinessRating.business_id == df_finalBusiness.business_id) \
    .select(df_finalUsersBusinessRating.business_id, "stars", "avg_rating")

pdf = df.toPandas()
pdf.plot(x='business_id', y='avg_rating', color='y', label='avg_rating_by_users')
pdf.plot(x='business_id', y='stars', color='r', label='business_rating')
plt.legend(loc='lower left', fontsize=20)
pylab.show()
a = python_ratings.join(python_users, python_ratings['user_id'] == python_users['user_id'], 'inner')\
    .drop(python_users['user_id'])
movie_lens_joined = a.join(python_movie_data, a['movie_id'] == python_movie_data['movie_id'], 'inner')\
    .drop(python_movie_data['movie_id'])

#First actual output (record count of the new joined dataframe)
print "Record Count of New Movie Lens (Joined) table is: ", movie_lens_joined.count()
print '\n'

#Analysis Piece #3 - Aggregation of ratings to rescale them by occupation
#Extra imports
from pyspark.sql import functions as spfun

#avg rating computation
avgs = movie_lens_joined.groupby('user_id').agg(spfun.avg('rating')\
    .alias('avg_rating'))

#join again with the initial dataframe
final_avgs = movie_lens_joined.join(avgs, movie_lens_joined['user_id'] == avgs['user_id'])\
    .drop(avgs['user_id'])

#final column for new rescaled ratings by occupation
df = final_avgs.withColumn('rescaled_rating', final_avgs['rating'] - final_avgs['avg_rating'])

#Analysis Piece #4 - Plot rescaled ratings by occupation
matplotlib.style.use('ggplot')

#Spark Dataframe
stats = df.groupby('occupation').avg('rescaled_rating').toPandas()

#Pandas Dataframe
stats.columns = ['occupation', 'rescaled_rating']
# does the array contain a given value? array_contains()

# first(), last()
from pyspark.sql.functions import first, last
df.select(first("name"), last("salary"))

# min(), max()
from pyspark.sql.functions import min, max
df.select(min("salary"), max("salary")).show()

from pyspark.sql.functions import sum
df.select(sum("salary")).show()
#sumDistinct()

from pyspark.sql.functions import avg
df.select(avg("salary")).show()

"""User-defined functions"""
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def toFormat(s):
    return str(s).split(",")[0].replace("[", "").replace("'", "")

toFormat = udf(toFormat, StringType())
df.withColumn('words', toFormat('keywords')).select("words").show()

"""cache"""
DF1.cache()
DF2 = DF1.groupBy("DEST_COUNTRY_NAME").count().collect()
DF3 = DF1.groupBy("ORIGIN_COUNTRY_NAME").count().collect()
DF4 = DF1.groupBy("count").count().collect()
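# The same UDF can also be declared with the decorator form of
# pyspark.sql.functions.udf; a minimal sketch (to_format is an illustrative
# name for the function above):
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def to_format(s):
    # Keep the first element of a stringified list, e.g. "['a', 'b']" -> "a"
    return str(s).split(",")[0].replace("[", "").replace("'", "")

# df.withColumn('words', to_format('keywords')).select("words").show()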
trunc_df = yelp_df.filter("review_count >= 10 and open = 'True'").groupBy("state").count()
trunc_df.orderBy(desc("count")).collect()

###################
# /usr/lib/hue/apps/search/examples/collections/solr_configs_log_analytics_demo/index_data.csv
logs_df = sqlCtx.load(source="com.databricks.spark.csv", header='true', inferSchema='true', path='index_data_http.csv')
sc._jsc.hadoopConfiguration().set('textinputformat.record.delimiter', '\r\n')

from pyspark.sql.functions import asc, desc
logs_df.groupBy("code").count().orderBy(desc("count")).show()
logs_df.groupBy("code").avg("bytes").show()

import pyspark.sql.functions as F
logs_df.groupBy("code").agg(logs_df.code, F.avg(logs_df.bytes), F.min(logs_df.bytes), F.max(logs_df.bytes)).show()

###########################################
yelp_df = sqlCtx.load(source='com.databricks.spark.csv', header='true', inferSchema='true', path='index_data.csv')
yelp_df.registerTempTable("yelp")
filtered_yelp = sqlCtx.sql("SELECT * FROM yelp WHERE useful >= 1")
filtered_yelp.count()
sqlCtx.sql("SELECT MAX(useful) AS max_useful FROM yelp").collect()
useful_perc_data.join(yelp_df, yelp_df.id == useful_perc_data.uid, "inner").select(useful_perc_data.uid, "useful_perc", "review_count")
useful_perc_data.registerTempTable("useful_perc_data")
sqlCtx.sql(
    """SELECT useful_perc_data.uid, useful_perc, review_count
    FROM useful_perc_data
#import SQLContext and pyspark SQL functions
from pyspark.sql import SQLContext, Row
import pyspark.sql.functions as func

sqlContext = SQLContext(sc)
inputRDD = sc.textFile("/user/pravat/auctiondata.csv").map(lambda l: l.split(","))
auctions = inputRDD.map(lambda p: Row(auctionid=p[0], bid=float(p[1]),
                                      bidtime=float(p[2]), bidder=p[3],
                                      bidrate=int(p[4]), openbid=float(p[5]),
                                      price=float(p[6]), itemtype=p[7],
                                      dtl=int(p[8])))

# Infer the schema, and register the DataFrame as a table.
auctiondf = sqlContext.createDataFrame(auctions)
auctiondf.registerTempTable("auctions")
auctiondf.show()
auctiondf.printSchema()

totbids = auctiondf.count()
print totbids

totalauctions = auctiondf.select("auctionid").distinct().count()
print totalauctions

itemtypes = auctiondf.select("itemtype").distinct().count()
print itemtypes

auctiondf.groupBy("itemtype", "auctionid").count().show()
auctiondf.groupBy("itemtype", "auctionid").count().agg(func.min("count"), func.max("count"), func.avg("count")).show()
auctiondf.groupBy("itemtype", "auctionid").agg(func.min("bid"), func.max("bid"), func.avg("bid")).show()
auctiondf.filter(auctiondf.price > 200).count()
xboxes = sqlContext.sql("SELECT auctionid, itemtype, bid, price, openbid FROM auctions WHERE itemtype = 'xbox'").show()
def loadData(self):
    category_list = self.sc.textFile(os.environ['WORKDIR'] + "yelp_dataset_challenge_academic_dataset/cat_subcat.csv") \
        .map(lambda line: (line.split(',')[0], line.split(',')))
    category_schema = StructType([
        StructField("category", StringType(), True),
        StructField("sub_category", ArrayType(StringType()), True)
    ])
    # self.category_list.registerTempTable("categories_list")
    # subcat = self.sqlContext.sql("SELECT sub_category FROM categories_list WHERE category = \"{0}\" LIMIT 1".format(self.category))
    category_list = self.sqlContext.createDataFrame(category_list, category_schema)
    subcat = category_list.where(category_list.category == self.category).first().sub_category

    self.df_business = self.sqlContext.read.json(os.environ['WORKDIR'] + "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json")
    # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache()
    self.df_business = self.df_business.select("business_id", "name", "stars", "latitude", "longitude", "categories")
    filter_business = partial(isBusinessLocalAndRelevant, latitude=self.loc_lat, longitude=self.loc_long, sub_categories=subcat)
    self.df_business = self.df_business.rdd.filter(filter_business)
    self.df_business = self.sqlContext.createDataFrame(self.df_business)
    self.df_business = self.df_business.select("business_id", "name", "stars")
    self.df_business.registerTempTable("business")

    schema_2 = StructType([
        StructField("latitude", FloatType(), True),
        StructField("longitude", FloatType(), True)
    ])
    schema = StructType([
        StructField("cluster_centers", ArrayType(schema_2), True),
        StructField("user_id", StringType(), True)
    ])
    self.df_user_locations = self.sqlContext.read.json(os.environ['WORKDIR'] + "clustering_models/center.json/dbscan", schema)
    filter_users = partial(isUserlocal, latitude=self.loc_lat, longitude=self.loc_long)
    self.df_user_locations = self.df_user_locations.rdd.filter(filter_users)
    self.df_user_locations = self.sqlContext.createDataFrame(self.df_user_locations)
    self.df_user_locations = self.df_user_locations.select("user_id")
    self.df_user_locations.registerTempTable("user")
    #print "user locations: ", self.df_user_locations.count()

    self.df_review = self.sqlContext.read.json(os.environ['WORKDIR'] + "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json")
    self.df_review = self.df_review.select("business_id", "user_id", "stars")
    self.df_review.registerTempTable("review")
    #print "reviews: ", self.df_review.count()

    self.df_joined = self.sqlContext.sql("SELECT r.user_id AS user_id, r.business_id AS business_id, first(b.name) AS business_name, first(b.stars) as business_stars, avg(r.stars) AS avg_rev_stars FROM review r, business b, user u WHERE r.business_id = b.business_id AND r.user_id = u.user_id GROUP BY r.user_id, r.business_id")
    self.df_joined.registerTempTable("joined")
    self.df_business.unpersist()
    self.df_user_locations.unpersist()
    self.df_review.unpersist()

    self.df_category_pred = self.loadEliteScorePredictionsForCategory()
    self.df_category_pred.registerTempTable("prediction")
    self.df_joined = self.sqlContext.sql("SELECT j.*, p.prediction AS elite_score, (j.avg_rev_stars*p.prediction) AS w_score FROM joined j, prediction p WHERE j.user_id = p.user_id")
    #print "joined: ", self.df_joined.count()
    #self.df_joined.show()
    self.df_category_pred.unpersist()

    df_grouped = self.df_joined.groupBy("business_id", "business_name", "business_stars").agg(F.avg("w_score").alias("rank"))
    df_grouped = df_grouped.sort("rank", ascending=False)
    print df_grouped.count()
    df_grouped.show()
    self.df_joined.unpersist()
    return df_grouped
d1 = spark.read.option("header", "true") \
    .option("sep", ",").option("inferSchema", True) \
    .option("mode", "DROPMALFORMED") \
    .csv("file:///Users/beginspark/Temp/data2.csv")

d2 = d1.toDF("year", "month", "road", "avr_traffic_month", "avr_velo_month",
             "mon", "tue", "wed", "thu", "fri", "sat", "sun")

# inspect the data
d2.printSchema()

# drop null values
d3 = d2.where("avr_velo_month is not null")

# average speed per road
d4 = d3.groupBy("road").agg(functions.round(functions.avg("avr_velo_month"), 1).alias("avr_velo_total"))
d5 = d3.join(d4, ["road"])

# assign the label
d6 = d5.withColumn("label", label(d5.avr_velo_month, d5.avr_velo_total).cast("double"))
d6.select("road", "avr_velo_month", "avr_velo_total", "label").show(5, False)
d6.groupBy("label").count().show(truncate=False)

dataArr = d6.randomSplit([0.7, 0.3])
train = dataArr[0]
test = dataArr[1]

indexer = StringIndexer(inputCol="road", outputCol="roadcode")
assembler = VectorAssembler(inputCols=["roadcode", "mon", "tue", "wed", "thu", "fri", "sat", "sun"],
                            outputCol="features")
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

def getSparkSession():
    return SparkSession.builder.appName('API-Test')\
        .config(conf=SparkConf()).getOrCreate()

if __name__ == '__main__':
    spark = getSparkSession()
    data = [('Benny', 86), ('Jenny', 77), ('Oscar', 55), ('Scarlett', 89)]
    df = spark.createDataFrame(data)
    df = df.withColumnRenamed('_1', 'name').withColumnRenamed('_2', 'marks')
    print('Average marks: ', df.select(avg(df.marks)).collect())
    print('Marks between 80 & 90: ', df.filter(df.marks.between(80, 90)).collect())
    print('Marks between 80 & 90 and name starts with \'S\': ',
          df.filter(df.marks.between(80, 90) & df.name.startswith('S')).collect())
    print('Names having \'y\': ', df.filter(df.name.like('%y%')).collect())
    names_with_y = df.filter(df.name.like('%y%'))
    print('Avg. of names having \'y\': ',
          names_with_y.select(avg(names_with_y.marks)).collect())
from pyspark.sql import SQLContext, Row
from pyspark.sql import functions

sqlCtx = SQLContext(sc)
lines = sc.parallelize(["m1,d1,1", "m1,d2,2", "m2,d1,1", "m2,d2,2"])
record = lines.map(lambda line: line.split(",")).map(
    lambda columns: Row(machine=columns[0], domain=columns[1], request=columns[2]))
recordSchema = sqlCtx.createDataFrame(record)

recordSchema.groupBy().agg({"*": "count"}).show()
# The dict form allows only one aggregate per column; use the functions API
# when several aggregates of the same column are needed.
recordSchema.groupBy("machine", recordSchema["domain"]).agg(
    {"domain": "max", "request": "min"}).show()
recordSchema.groupBy("machine", recordSchema.domain).agg(
    functions.count("*"), functions.max(recordSchema.request),
    functions.min(recordSchema["request"]), functions.sum(recordSchema["request"]),
    functions.avg(recordSchema["request"])).show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int")).groupBy("machine").count().show()
recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").max("request").show()
recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").min("request").show()
recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").sum("request").show()
recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").avg("request").show()
def cal_mat_window(sc, sqlContext, dfSC, window):
    # Moving average of `close` per symbol; note the frame offsets are
    # interpreted against the orderBy column's values (rangeBetween).
    windowSpec = Window.partitionBy("symbol").orderBy("date").rangeBetween(-1 * window + 1, 1)
    mat = func.avg("close").over(windowSpec)
    dfSC = dfSC.select(dfSC.symbol, dfSC.date, dfSC.close, mat)
    print dfSC.collect()
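# rangeBetween defines the frame in terms of the orderBy column's values
# rather than row counts. If a fixed N-row moving average is intended,
# rowsBetween is the usual choice; a minimal sketch under the same
# symbol/date/close column names (moving_avg_rows is an illustrative helper):
from pyspark.sql import Window
from pyspark.sql import functions as func

def moving_avg_rows(df, n):
    # Frame spanning the current row and the (n - 1) rows before it, per symbol.
    spec = Window.partitionBy("symbol").orderBy("date").rowsBetween(-(n - 1), 0)
    return df.withColumn("mavg_close", func.avg("close").over(spec))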
def doRenderMpld3(self, handlerId, figure, axes, keyFields, keyFieldValues, keyFieldLabels, valueFields, valueFieldValues):
    allNumericCols = self.getNumericalFieldNames()
    if len(allNumericCols) == 0:
        self._addHTML("Unable to find a numerical column in the dataframe")
        return

    keyFields = self.options.get("keyFields")
    valueField = self.options.get("valueFields")
    if keyFields is None and valueField is None:
        keyFields = self.getFirstStringColInfo()
        valueField = self.getFirstNumericalColInfo()
    else:
        keyFields = keyFields.split(',')
        valueField = valueField.split(',')
        if len(valueField) > 1:
            self._addHTML("You can only enter one value field for Bar Charts (2-D): " + str(len(valueField)))
            return
        keyFields = keyFields[0]
        valueField = valueField[0]

    #init
    fig = figure
    ax = axes
    #fig, ax = plt.subplots()
    #fig = plt.figure()
    params = plt.gcf()
    plSize = params.get_size_inches()
    params.set_size_inches((plSize[0] * 2, plSize[1] * 2))

    agg = self.options.get("aggregation")
    groupByCol = self.options.get("groupByCol")
    if agg == "None" or agg is None:
        colLabel = keyFields
        y = self.entity.select(valueField).toPandas()[valueField].dropna().tolist()
        x_intv = np.arange(len(y))
        labels = self.entity.select(keyFields).toPandas()[keyFields].dropna().tolist()
        plt.xticks(x_intv, labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel(valueField, fontsize=18)
    elif agg == 'AVG':
        y1 = self.entity.groupBy(keyFields).agg(F.avg(valueField).alias("avg")).toPandas().sort_values(by=keyFields)
        y = y1["avg"].dropna().tolist()
        x_intv = np.arange(len(y))
        labels = y1[keyFields].dropna().tolist()
        plt.xticks(x_intv, labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel("Average " + valueField, fontsize=18)
    elif agg == 'SUM':
        y1 = self.entity.groupBy(keyFields).agg(F.sum(valueField).alias("sum")).toPandas().sort_values(by=keyFields)
        y = y1["sum"].dropna().tolist()
        x_intv = np.arange(len(y))
        labels = y1[keyFields].dropna().tolist()
        plt.xticks(x_intv, labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel("sum " + valueField, fontsize=18)
    elif agg == 'MAX':
        y1 = self.entity.groupBy(keyFields).agg(F.max(valueField).alias("max")).toPandas().sort_values(by=keyFields)
        y = y1["max"].dropna().tolist()
        x_intv = np.arange(len(y))
        labels = y1[keyFields].dropna().tolist()
        plt.xticks(x_intv, labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel("max " + valueField, fontsize=18)
    elif agg == 'MIN':
        y1 = self.entity.groupBy(keyFields).agg(F.min(valueField).alias("min")).toPandas().sort_values(by=keyFields)
        y = y1["min"].dropna().tolist()
        x_intv = np.arange(len(y))
        labels = y1[keyFields].dropna().tolist()
        plt.xticks(x_intv, labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel("min " + valueField, fontsize=18)
    elif agg == 'COUNT':
        y1 = self.entity.groupBy(keyFields).agg(F.count(valueField).alias("count")).toPandas().sort_values(by=keyFields)
        y = y1["count"].dropna().tolist()
        x_intv = np.arange(len(y))
        labels = y1[keyFields].dropna().tolist()
        plt.xticks(x_intv, labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel("count " + valueField, fontsize=18)

    mpld3.enable_notebook()
    plt.bar(x_intv, y, color="blue", alpha=0.5)
    ax_fmt = BarChart(labels)
    mpld3.plugins.connect(fig, ax_fmt)
simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600), ("Robert", "Sales", 4100), ("Maria", "Finance", 3000), ("James", "Sales", 3000), ("Scott", "Finance", 3300), ("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000), ("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100)] schema = ["employee_name", "department", "salary"] df = spark.createDataFrame(data=simpleData, schema=schema) df.printSchema() df.show(truncate=False) print("approx_count_distinct: " + \ str(df.select(approx_count_distinct("salary")).collect()[0][0])) print("avg: " + str(df.select(avg("salary")).collect()[0][0])) df.select(collect_list("salary")).show(truncate=False) df.select(collect_set("salary")).show(truncate=False) df2 = df.select(countDistinct("department", "salary")) df2.show(truncate=False) print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0])) print("count: " + str(df.select(count("salary")).collect()[0])) df.select(first("salary")).show(truncate=False) df.select(last("salary")).show(truncate=False) df.select(kurtosis("salary")).show(truncate=False) df.select(max("salary")).show(truncate=False) df.select(min("salary")).show(truncate=False)
from pyspark import SparkContext, SparkConf, SQLContext
import pyspark.sql.functions as f

conf = (SparkConf().setMaster("local[20]")
        .setAppName("sample app for reading files")
        .set("spark.executor.memory", "2g"))
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

df = sqlContext.read.load("ratings.csv",
                          format='com.databricks.spark.csv',
                          header='true',
                          inferSchema='true')
df.groupby('movieId').agg(f.avg('rating').alias('movie_rating')) \
    .orderBy('movieId', ascending=True) \
    .coalesce(1) \
    .write.format("com.databricks.spark.csv") \
    .save("WowResultsnew.csv")
)
movies_df_schema = StructType(
    [StructField('ID', IntegerType()),
     StructField('title', StringType())]
)

spark = SparkSession.builder.appName('ALS Movie Predictions').getOrCreate()
movies_raw_df = spark.read.format("csv").option("header", "true") \
    .load("/home/ragesh/Data/Movie_Ratings/movies.csv", schema=movies_df_schema)
ratings_raw_df = spark.read.format("csv").option("header", "true") \
    .load("/home/ragesh/Data/Movie_Ratings/ratings.csv", schema=ratings_df_schema)
movies_raw_df.cache()
ratings_raw_df.cache()

# Movies with Highest Average Ratings
movie_ids_with_avg_ratings_df = ratings_raw_df.groupBy('movieId').agg(
    F.count(ratings_raw_df.rating).alias("count"),
    F.avg(ratings_raw_df.rating).alias("average"))
# print('movie_ids_with_avg_ratings_df:')
# movie_ids_with_avg_ratings_df.show(3, truncate=False)

movie_names_with_avg_ratings_df = movie_ids_with_avg_ratings_df \
    .join(movies_raw_df, movies_raw_df.ID == movie_ids_with_avg_ratings_df.movieId) \
    .select(movie_ids_with_avg_ratings_df['average'], movies_raw_df.title,
            movie_ids_with_avg_ratings_df['count'], movie_ids_with_avg_ratings_df.movieId)
# print('movie_names_with_avg_ratings_df:')
# movie_names_with_avg_ratings_df.show(3, truncate=False)

# Movies with Highest Average Ratings and at least 500 reviews
movies_with_500_ratings_or_more = movie_names_with_avg_ratings_df \
    .filter(movie_names_with_avg_ratings_df['count'] >= 500) \
    .sort(movie_names_with_avg_ratings_df['average'].desc())
# print('Movies with highest ratings:')
# movies_with_500_ratings_or_more.show(20, truncate=False)
if __name__ == "__main__": conf = get_spark_app_config() spark = SparkSession.builder \ .config(conf=conf) \ .getOrCreate() logger = Log4j(spark) logger.info("Starting the pyspark application") invoice_df = load_invoice_df(spark) invoice_df.select( f.countDistinct(col("InvoiceNo")).alias("Count_Of_Invoice"), f.sum(col("Quantity")).alias("Sum_Of_Quantity"), f.avg(col("UnitPrice")).alias("Avg_Of_UnitPrice"), f.count("*").alias("Total_Count"), ).show() summary_df = invoice_df \ .groupBy(col("Country"),col("InvoiceNo")) \ .agg(f.sum(col("Quantity")).alias("Total_Quantity"), f.round(f.sum(col("Quantity")*col("UnitPrice")),2).alias("InvoiceValue") ).show() weekly_summary_df = invoice_df \ .withColumn("WeekNumber", f.weekofyear(f.to_date(f.substring(col("InvoiceDate"), 1, 10), "M-d-yyyy")) ) \ .groupBy(col("Country"), col("WeekNumber")) \ .agg( f.countDistinct(col("InvoiceNo")).alias("NumInvoices"),
    .withColumn('x_centers', x_udf('centers'))
    .withColumn('y_centers', y_udf('centers'))
    .withColumn('group_sizes', group_udf('centers'))
    .withColumn('num_groups', count_udf('group_sizes'))
    .withColumn('velocities', velocity_udf('pair_centers'))
    .withColumn('num_velocities', count_udf('velocities'))
    .withColumn('sum_velocities', sum_udf('velocities')))

"""# Aggregate each 5 minute window to compute:
- average number of people detected
- average group size
- average velocity
"""

window_str = '{} minutes'.format(window_minutes)
agg_df = (df.groupBy(window('timestamp', windowDuration=window_str, slideDuration=window_str))
          .agg(F.sum('num_people'), F.sum('num_groups'),
               F.sum('sum_velocities'), F.sum('num_velocities'),
               avg('num_people'),
               collect_list('x_centers'), collect_list('y_centers'))
          .withColumn('x_centers', flatten_udf('collect_list(x_centers)'))
          .withColumn('y_centers', flatten_udf('collect_list(y_centers)'))
          .drop('collect_list(x_centers)')
          .drop('collect_list(y_centers)')
          .orderBy('window'))
agg_df.show()
from pyspark.sql import functions as F

#Creating data frame from list
data = [('John', 'Smith', 47), ('Jane', 'Smith', 22), ('Frank', 'Jones', 28)]
schema = ['fname', 'lname', 'age']
df = sqlContext.createDataFrame(data, schema)
df

#Retrieving contents of data frame
df.printSchema()
df.show()
df.first()
df.count()

#Adding columns
df = df.withColumn('salary', F.lit(0))
df.show()
df.withColumn('salary2', df['age'] * 100).show()

#Filtering and subsetting
df.filter(df['age'] > 30).select('fname', 'age').show()
df.select(F.max('age').alias('max-age')).show()

#Grouped aggregations
df.groupBy('lname').max('age').show()
df.groupBy('lname').agg(F.avg('age').alias('avg-age'), F.min('age'), F.max('age')).show()
def totalAvg(list):
    totalSal = 0
    count = 0
    for x in list:
        count = count + 1
        totalSal = totalSal + x[0]
    return totalSal / count

avgSal = totalAvg(salList)
newEmpDt_df = empDt_df.withColumn("avgSalary", Func.lit(avgSal))

# another approach using a window function
from pyspark.sql.window import Window
window = Window.partitionBy(empDt_df.address).orderBy(empDt_df.address.desc())
empDt1_df = empDt_df.withColumn("Avg_salary_country_wise", Func.avg(empDt_df.salary).over(window))
"""
+------+--------+---------+------+-----------------------+
|emp_id|emp_name|  address|salary|Avg_salary_country_wise|
+------+--------+---------+------+-----------------------+
|     4|   Tanya|   Russia|  7500|                23750.0|
|     7|   Jerry|   Russia| 40000|                23750.0|
|     1|     Tim|       US|  4800|                 5750.0|
|     2|  George|       US|  3200|                 5750.0|
|     8|   Cathy|       US|  5000|                 5750.0|
|    10|   Peter|       US| 10000|                 5750.0|
|     3|    Mary|       UK|  8000|                 6700.0|
|     6|     Jim|       UK|  5400|                 6700.0|
|     5|    Rose|Australia|  7000|                13500.0|
|     9|    Andy|Australia| 20000|                13500.0|
split_col = F.split(df_modified["location"], ',')
df_modified = df_modified.withColumn("name", split_col.getItem(0))\
    .withColumn("highway", split_col.getItem(1))\
    .withColumn("lanes", split_col.getItem(2))\
    .withColumn("bridge", split_col.getItem(3))\
    .withColumn("lit", split_col.getItem(4))\
    .withColumn("id", split_col.getItem(5))\
    .withColumn("unique_id", split_col.getItem(6))

# Calculation of the density of vehicles divided by section and type of vehicle
query = df_modified \
    .withWatermark("timestamp_millisecond", "2 minutes") \
    .groupBy("timestamp_millisecond", "name", "unique_id", "id_object")\
    .agg(
        F.count("*").alias("count"),
        F.avg("speed").alias("average_speed"),
    )

############### Writing the modified stream to Parquet #################
query.writeStream\
    .queryName("streamingOutput")\
    .format("parquet")\
    .option("path", os.path.join(os.getcwd(), 'sink', 'sink_stream_modified'))\
    .option("checkpointLocation", os.path.join(os.getcwd(), 'checkpoint', 'checkpoint_stream_modified'))\
    .start()

##################### Aggregation 1 minute ####################
# Load the previously modified stream
df_read = spark \
    .readStream \
    .schema(query.schema) \
phy_df = spark.createDataFrame(phy)
phy_df = phy_df\
    .withColumnRenamed('_1', 'id')\
    .withColumnRenamed('_2', 'name')\
    .withColumnRenamed('_3', 'marks')

chem_df = spark.createDataFrame(chem)
chem_df = chem_df.withColumnRenamed('_1', 'id')\
    .withColumnRenamed('_2', 'name')\
    .withColumnRenamed('_3', 'marks')

phy_df.printSchema()
chem_df.printSchema()

phy_df.agg(avg(phy_df.marks)).show()
chem_df.groupBy().avg('marks').show()  # alternate API

#inner join
phy_df\
    .join(chem_df, phy_df.id == chem_df.id)\
    .select(phy_df.name, phy_df.marks, chem_df.marks).show()

#with SQL statements
phy_df.createOrReplaceTempView('phy')
chem_df.createOrReplaceTempView('chem')
sql_str = 'select phy.name, phy.marks, chem.marks from phy, chem where phy.id = chem.id'
spark.sql(sql_str).show()
sql_str = 'select phy.name, phy.marks, chem.marks from phy full outer join chem on phy.id = chem.id'
def fill_with_mean(df, include=set()):
    stats = df.agg(*(fn.avg(c).alias(c) for c in df.columns if c in include))
    return df.fillna(stats.first().asDict())
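# A quick usage sketch for fill_with_mean, assuming a SparkSession named
# `spark`, pyspark.sql.functions imported as `fn`, and made-up example data:
df = spark.createDataFrame(
    [(1, None, 4.0), (2, 3.0, None), (3, 5.0, 6.0)],
    ["id", "a", "b"])

# Only the columns listed in `include` are mean-imputed; others keep their nulls.
fill_with_mean(df, include={"a", "b"}).show()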
import pyspark.sql.functions as F

df = sqlContext.createDataFrame([('a', 1), ('b', 2), ('a', 3)], ["key", "value"])
df2 = df.withColumn('key', F.upper(df.key))
df2.groupBy('key').agg(F.avg(df.value)).collect()
    return (
        row[0],  # commentBody
        row[1],  # commentID
        row[2],  # createDate
        row[3],  # articleID
        str(nltk_sentiment(row[0])))

df = df.select('commentBody', 'commentId', 'createDate', 'articleID')
df = df.na.drop()
df = df.rdd.map(callback)\
    .toDF(['commentBody', 'commentId', 'createDate', 'articleID', 'sentimentScore'])
df.show()
df.write.csv("hdfs://cm:9000/uhadoop2019/dpi/test2")
df.cache()

# get avg score per article
article_df = spark.read.option('header', 'true') \
    .option("delimiter", ",") \
    .option('quote', '"') \
    .option('multiLine', 'true') \
    .option('parserLib', 'univocity') \
    .csv("hdfs://cm:9000/uhadoop2019/dpi/ArticlesApril2017.csv.gz")

joined_df = article_df.join(df, 'articleID')
joined_df = joined_df.groupBy('articleID').agg(
    avg('SentimentScore').alias('avg_score'))
joined_df.write.csv("hdfs://cm:9000/uhadoop2019/dpi/avg_score_article")
print(joined_df.show())
    'file:///usr/lib/hue/apps/search/examples/collections/solr_configs_log_analytics_demo/index_data.csv')
logs_df.count()

# count by different code type
logs_df.groupBy("code").count().show()

# rank by counts
from pyspark.sql.functions import asc, desc
logs_df.groupBy('code').count().orderBy(desc('count')).show()

# calculate average size of different code
logs_df.groupBy("code").avg("bytes").show()

# more calculation by code - average, min, max
import pyspark.sql.functions as F
logs_df.groupBy("code").agg(
    logs_df.code,
    F.avg(logs_df.bytes),
    F.min(logs_df.bytes),
    F.max(logs_df.bytes)
).show()

# homework
# 1
yelp_df.select("cool").agg({"cool": "mean"}).collect()
# 2
import pyspark.sql.functions as F
yelp_df.filter('review_count >= 10').groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show()
# 3
yelp_df.filter((yelp_df.review_count >= 10) & (yelp_df.open == 'True')).groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show()
# 4
from pyspark.sql.functions import asc, desc
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
import os

spark = SparkSession.builder.appName("SparkSQLDataframes").getOrCreate()

curwd = os.getcwd()
people = spark.read.option("header", "true").option(
    "inferSchema", "true").csv(f"file:///{curwd}/fakefriends-header.csv")

print("log: check our inferred schema: ")
people.printSchema()

print("showing only age and friends columns from data")
people.select("age", "friends").show(10)

print("group by age with average of friends")
people.select("age", "friends").groupBy("age").avg("friends").sort("age").show(5)

print("doing the same with agg and with round: ")
people.select("age", "friends") \
    .groupBy("age") \
    .agg(func.round(func.avg("friends"), 2)
         .alias("n_friends_avg")) \
    .sort("age") \
    .show(5)

spark.stop()
# MAGIC +-------+-----------------------------+-----+-------+
# MAGIC |5.0    |Ella Lola, a la Trilby (1898)|1    |94431  |
# MAGIC |5.0    |Serving Life (2011)          |1    |129034 |
# MAGIC |5.0    |Diplomatic Immunity (2009? ) |1    |107434 |
# MAGIC +-------+-----------------------------+-----+-------+
# MAGIC only showing top 3 rows
# MAGIC ```

# COMMAND ----------

# TODO: Replace <FILL_IN> with appropriate code
from pyspark.sql import functions as F

# From ratingsDF, create a movie_ids_with_avg_ratings_df that combines the two DataFrames
ratings_df.show(3)
movie_ids_with_avg_ratings_df = ratings_df.groupBy('movieId').agg(
    F.count(ratings_df.rating).alias("count"),
    F.avg(ratings_df.rating).alias("average"))
print 'movie_ids_with_avg_ratings_df:'
movie_ids_with_avg_ratings_df.show(3, truncate=False)

# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(
    movies_df, movie_ids_with_avg_ratings_df["movieId"] == movies_df["Id"])
movie_names_with_avg_ratings_df = movie_names_df.drop("Id")
print 'movie_names_with_avg_ratings_df:'
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(movie_ids_with_avg_ratings_df.count(), 26744,
# Location of raw historian data in ADLS
adls = os.environ["ADLS_PATH"]
testDataDir = os.path.join(adls, os.environ['PROC_HIST_DATA_PATH'], runDay, 'clean/test')
predDataDir = os.path.join(adls, os.environ['PROC_HIST_DATA_PATH'], runDay, 'clean/predictions')
modelDevDir = os.path.join(adls, os.environ["PROC_HIST_DATA_PATH"], runDay, 'models')
modelName = 'spark-lrPipelineModel-v3'

testData = spark.read.load(testDataDir, format="parquet")

# # Data Feature Preparation
# This model requires data to be stored 'hourly' and padded for missing hours with previous values
hourlyData = testData.withColumn('recordInterval',
                                 (F.round(F.unix_timestamp('recordTime') / 3600) * 3600).cast('timestamp'))\
    .groupBy('recordInterval', 'deviceID').agg(F.avg('value').alias('reading'))

# 'Pad' the data by filling in any missing hours using the last good (ie. non-null) value from the device
paddedHourlyData = util.padData(hourlyData, spark)

# We need the following features:
# * Previous hour's reading
# * Average of last 5 hours reading
window = Window.partitionBy("deviceID").orderBy("recordInterval")
modelData = paddedHourlyData\
    .withColumn('lastReading', F.avg('reading').over(window.rowsBetween(-1, -1)))\
    .withColumn('avgReadings5hr', F.avg('reading').over(window.rowsBetween(-5, -1)))\
    .filter('deviceID=="85WIC2703_COR"')\
    .na.drop()

# # Model Testing
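# util.padData is project-specific. A common building block for this kind of
# forward-fill, once the hourly rows exist, is last() with ignorenulls over an
# unbounded-preceding window; a hedged sketch, not the actual util.padData
# implementation:
from pyspark.sql import Window
from pyspark.sql import functions as F

ffill_window = (Window.partitionBy("deviceID")
                .orderBy("recordInterval")
                .rowsBetween(Window.unboundedPreceding, 0))

# Replace null readings with the most recent non-null reading per device.
filled = hourlyData.withColumn(
    "reading", F.last("reading", ignorenulls=True).over(ffill_window))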
# COMMAND ----------

from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show()  # 29310

# COMMAND ----------

from pyspark.sql.functions import sum, count, avg, expr

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()

# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
          stddev_pop("Quantity"), stddev_samp("Quantity")).show()
def summary_df(df, fn):  # ,max_date):
    # drop null ad_click values
    df = df.na.drop(subset=["ad_click"])

    # Remove non search sessions
    df = df[df['ad_click'] > 0]

    # sum ad_click
    sum_search_clients_daily = df.groupBy("client_id", "country", "submission_date_s3", "activity_date")\
        .agg(F.sum("ad_click").alias("ad_click"))

    # read revenue_by_country
    rev_by_country_s3_path = "s3://net-mozaws-prod-us-west-2-pipeline-analysis/nawong/revenue_by_country.csv"
    rev_by_country = sqlContext.read.csv(rev_by_country_s3_path, header=True)
    rev_by_country = rev_by_country.withColumn("rev_per_search_float", F.col("rev_per_search").cast("double"))\
        .withColumn("yyyyMM_timestamp", F.to_timestamp(F.col("yyyymm"), "yyyyMM"))\
        .withColumn("country_code", F.upper(F.col("country_code")))

    # add country field and revenue table - need transform to calculate transaction-level monetary value
    tbl = sum_search_clients_daily.join(rev_by_country,
                                        sum_search_clients_daily.country == rev_by_country.country_code,
                                        how='left_outer')
    # NOTE partition includes country because a client may change country over time
    spec = Window.partitionBy("client_id", "country", "submission_date_s3").orderBy(F.col("yyyyMM_timestamp").desc())
    no_country = (
        tbl
        .where(F.isnull(F.col("yyyymm")))
        .withColumn("rev_per_search_float", F.lit(.005))
    )
    has_country = (
        tbl
        .na.drop(subset=["yyyymm"])
        .where("yyyyMM_timestamp <= activity_date")
        .withColumn('rank', F.row_number().over(spec))
        .where("rank = 1")
        .drop('rank')
    )
    tbl2 = (no_country.union(has_country))

    # drop first purchase to calculate revenue
    spec2 = Window.partitionBy("client_id").orderBy(F.col("activity_date").asc())  # earliest date has row #1
    search_rev = (tbl2
                  .withColumn("rank", F.row_number().over(spec2))
                  .where("rank > 1")
                  ).groupBy("client_id").agg(F.avg(F.col('rev_per_search_float') * F.col('ad_click')).alias("monetary_value"))

    # compute the final dataset for the BG/NBD model
    dataset = (
        tbl2
        .groupBy("client_id")
        .agg(F.datediff(F.max('activity_date'), F.min("activity_date")).alias("recency"),
             (F.countDistinct('activity_date') - 1).alias("frequency"),
             (F.datediff(F.lit(end_date.strftime("%Y-%m-%d")).cast("date"), F.min("activity_date"))).alias("T"),
             F.sum("ad_click").alias("historical_searches"),
             F.sum(F.col('rev_per_search_float') * F.col('ad_click')).alias("historical_clv"))
        .join(search_rev, "client_id", how="left")
        .where("frequency >= 0 AND recency >= 0 AND T >= 0")
        .select("client_id",
                (F.crc32("client_id") % 100).alias("sample_id"),
                "frequency", "recency", "T", "monetary_value",
                "historical_searches", "historical_clv")
    ).fillna(0, subset=['monetary_value'])

    # anonymize client_id
    dataset = dataset.withColumn('client_id', F.sha1(dataset.client_id))

    # write dataset recency, freq, age, revenue table per client
    #dataset.write.partitionBy("sample_id").format("parquet").mode("overwrite").save(fn)
    duplicated = dataset.withColumn("sample_id_dupe", dataset["sample_id"])
    duplicated.write.partitionBy("sample_id_dupe").format("parquet").mode("append").save(fn)
import time
from pyspark.sql import functions as F

schema1 = ['id1', 'name', 'math']
schema2 = ['id2', 'physics', 'biology']
df1 = rdd1.toDF(schema1)
df2 = rdd2.toDF(schema2)

print("Time taken for joining 2 DFs:")
start = time.time()
joined_df = df1.join(df2, (df1.id1 == df2.id2))
print(joined_df.show())
end = time.time()
print(end - start)

##Appending 2 DFs:
#df_appended = df1.unionAll(df2)
df_appended = joined_df.unionAll(joined_df)

##Groupby:
df_groupby_id = df_appended.groupBy('id1').agg(F.avg(df_appended.math))
print(df_groupby_id.show())

#df filter
filtered_data = joined_df.filter(joined_df.math > 95)
print(filtered_data.show())

#df sort
df_sortby_name = joined_df.sort(joined_df.name.asc())
print(df_sortby_name.show())

##################### filter #####################
filtered_data = counted_grped_data.filter(lambda x: x[0] == 'hi')
df2 = df2.withColumn("Media_por_org_sup_diaria", udf_to_value(df2["Valor diárias"])) df2 = df2.withColumn("Min_por_org_sup_diaria", udf_to_value(df2["Valor diárias"])) df2 = df2.withColumn("Total_por_org_sup_diaria", udf_to_value(df2["Valor diárias"])) # In[ ]: from pyspark.sql import functions as F # In[ ]: df2.groupBy("Nome do órgão superior").agg(F.max("Max_por_org_sup"), F.avg("Media_por_org_sup"), F.min("Min_por_org_sup"), F.sum("Total_por_org_sup")).sort('Nome do órgão superior').show( truncate=True) # In[ ]: df2.groupBy("Destinos").agg(F.max("Max_por_destinos"), F.avg("Media_por_destinos"), F.min("Min_por_destinos"), F.sum("Total_por_destinos")).sort('Destinos').show( truncate=True) # In[ ]:
def _add_average(self, df, low_col, high_col, enddate, compute_term):
    avgDf = df.agg(
        fn.avg(low_col).alias(low_col),
        fn.avg(high_col).alias(high_col),
        fn.avg("avg_chg_market_3d").alias("avg_chg_market_3d"),
        fn.avg("avg_chg_market_5d").alias("avg_chg_market_5d"),
        fn.avg("avg_chg_market_10d").alias("avg_chg_market_10d"),
        fn.avg("avg_chg_industry_3d").alias("avg_chg_industry_3d"),
        fn.avg("avg_chg_industry_5d").alias("avg_chg_industry_5d"),
        fn.avg("avg_chg_industry_10d").alias("avg_chg_industry_10d"),
        fn.avg("top_ind_perc_3d").alias("top_ind_perc_3d"),
        fn.avg("top_ind_perc_5d").alias("top_ind_perc_5d"),
        fn.avg("top_ind_perc_10d").alias("top_ind_perc_10d"),
        fn.avg("avg_chg_stock_3d").alias("avg_chg_stock_3d"),
        fn.avg("avg_chg_stock_5d").alias("avg_chg_stock_5d"),
        fn.avg("avg_chg_stock_10d").alias("avg_chg_stock_10d"),
        fn.avg("top_stock_perc_3d").alias("top_stock_perc_3d"),
        fn.avg("top_stock_perc_5d").alias("top_stock_perc_5d"),
        fn.avg("top_stock_perc_10d").alias("top_stock_perc_10d")
    ).withColumn("busi_date", fn.lit(enddate))\
        .withColumn("compute_term", fn.lit(compute_term))\
        .withColumn("trade_id", fn.lit("average"))
    df = df.union(avgDf)
    return df
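# Since every metric column gets the same avg-and-alias treatment, the
# repetition above can be collapsed with a comprehension; a sketch of that
# refactor under the same fn alias (METRIC_COLS is an illustrative name):
METRIC_COLS = [
    "avg_chg_market_3d", "avg_chg_market_5d", "avg_chg_market_10d",
    "avg_chg_industry_3d", "avg_chg_industry_5d", "avg_chg_industry_10d",
    "top_ind_perc_3d", "top_ind_perc_5d", "top_ind_perc_10d",
    "avg_chg_stock_3d", "avg_chg_stock_5d", "avg_chg_stock_10d",
    "top_stock_perc_3d", "top_stock_perc_5d", "top_stock_perc_10d",
]

def _add_average(self, df, low_col, high_col, enddate, compute_term):
    # Average every column while keeping its original name.
    aggs = [fn.avg(c).alias(c) for c in [low_col, high_col] + METRIC_COLS]
    avgDf = (df.agg(*aggs)
             .withColumn("busi_date", fn.lit(enddate))
             .withColumn("compute_term", fn.lit(compute_term))
             .withColumn("trade_id", fn.lit("average")))
    return df.union(avgDf)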
# Get Number of Rows of a DataFrame
df_title_basics.count()

# In[6]:

# Groups and Counts: Get column titleType values with counts, ordered descending
from pyspark.sql.functions import desc
df_title_basics.groupBy("titleType").count().orderBy(desc("count")).show()

# In[7]:

# Calculate average Movie length in minutes
from pyspark.sql.functions import avg, col
df_title_basics.where(col('titleType') == 'movie').agg(
    avg('runtimeMinutes')).show()

# In[8]:

# Save Dataframe back to HDFS (partitioned) as Parquet files
df_title_basics.repartition('startYear').write.format("parquet").mode(
    "overwrite").partitionBy('startYear').save(
    '/user/hadoop/imdb/title_basics_partitioned_files')

# In[9]:

# Save Dataframe back to HDFS (partitioned) as EXTERNAL TABLE and Parquet files
df_title_basics.repartition('startYear').write.format(
    "parquet").mode("overwrite").option(
    'path', '/user/hadoop/imdb/title_basics_partitioned_table').partitionBy(
spark.sql("""SELECT first(StockCode) as first, last(StockCode) as last, min(Quantity) as minQty, max(Quantity) as maxQty FROM dfTable""").show() # ---------------------------------------------------------- # Example 3 - sum, sumDistinct, avg # ---------------------------------------------------------- from pyspark.sql.functions import sum, sumDistinct, avg df.select(sum("Quantity")).show() df.select(sumDistinct("Quantity")).show() df.select(avg("Quantity")).show() spark.sql("""SELECT sum(Quantity) as sumQty, mean(Quantity) as mean FROM dfTable""").show() from pyspark.sql.functions import mean, expr df.select( count("Quantity").alias("total_transactions"), sum("Quantity").alias("total_purchases"), avg("Quantity").alias("avg_purchases"), expr("mean(Quantity)").alias("mean_purchases")) \ .selectExpr( "total_purchases/total_transactions", "avg_purchases", "mean_purchases").show()
    .getOrCreate()

df = spark.read.format("csv").option("header", "true").load(DATA_FILE)
print('Source data frame:')
df.show()

df = df.withColumn("temperatureTmp", df.temperature.cast('float')) \
    .drop("temperature") \
    .withColumnRenamed("temperatureTmp", "temperature")
df = df.withColumn('month', F.month('date'))

print('Min, max and avg temperature for each month for each city:')
temp_df = df.groupBy('city', 'month').agg(
    F.max(F.col('temperature')).alias('max_temperature'),
    F.min(F.col('temperature')).alias('min_temperature'),
    F.avg(F.col('temperature')).alias('avg_temperature')).orderBy('month', 'city')
temp_df.show()
if SAVE_RESULTS:
    temp_df.write.csv('data/agg_city_month.csv')

N = 255  # min examples to take
print('Min, max and avg temperature for each month for each city having at least {} records:'.format(N))
temp_df = df.groupBy('city', 'month').agg(
    F.max(F.col('temperature')).alias('max_temperature'),
    F.min(F.col('temperature')).alias('min_temperature'),
    F.avg(F.col('temperature')).alias('avg_temperature'),
    F.count(F.col('temperature')).alias('count_temperature')) \
    .orderBy('month', 'city')
.drop("value", "sensor_id", "id") # Group all streaming measurements by location and timestamp. # Group them by 1 minutes and calculate avg metrics for air quality # one-minute intervals. # Also, group them with 5 minutes delay for rewriting previous metrics # for last 5 minutes if they occurs. w = df4\ .withWatermark("timestamp", "5 minutes") \ .groupBy(window("timestamp", "1 minutes"), \ col("location_id"), \ col("latitude"), \ col("longitude"), \ col("country")) \ .agg( \ avg("temperature").alias("temperature"), \ avg("humidity").alias("humidity"), \ avg("pressure").alias("pressure"), \ avg("P1").alias("P1"), \ avg("P2").alias("P2")) \ .withColumn("timestamp", col("window").end).drop("window") # Create new DataFrame with only one column 'value' # that contains all row as csv row delimited by comma output = w.select( concat( col("timestamp"), lit(","), \ col("location_id"), lit(","), \ col("latitude"), lit(","), col("longitude"), lit(","), \ col("country"), lit(","), \ col("temperature"), lit(","), col("humidity"), lit(","), \
marginal_y="violin", marginal_x="box", trendline="ols", template="seaborn") fig.update_layout(yaxis_tickformat="$.0f" if iscash else ".2%", xaxis_tickformat=".2%") return fig scurve(df_PVDE_F_P, 'BeckPVDE', iscash=True) # COMMAND ---------- # DBTITLE 1,3D Surface Graph pivot_Tbl = dfr.groupby("runid", "sourcevarname", "LineNo").pivot("Month").agg( avg("variablevalue")).orderBy(*('runid', 'LineNo'), ascending=True) pivot_Tbl_Pandas = pivot_Tbl.toPandas() c = pivot_Tbl_Pandas.shape[1] - 3 x = np.linspace(0, 594, c) df = pivot_Tbl_Pandas.drop(['runid', 'sourcevarname', 'LineNo'], axis=1) fig_3dsurface = go.Figure( data=[go.Surface(z=df.values, x=x, y=StatInc_Tbl_Pandas.LineNo)]) fig_3dsurface.show() # COMMAND ---------- vw_mv_get_df.unpersist() # COMMAND ----------
# Create a schema for the dataframe.
df_schema = StructType([
    StructField("user_id", IntegerType(), False),
    StructField("name", StringType(), False),
    StructField("age", IntegerType(), False),
    StructField("friends_count", IntegerType(), True)
])

dataset_df = spark.read.schema(df_schema).csv(
    'dataset/fakefriends-header.csv', header=True, sep=',',
    inferSchema=False).cache()

# Select only the required columns and discard the useless ones as soon as
# possible to avoid wasting cluster resources.
age_friends_count_df = dataset_df.select('age', 'friends_count')

# Simple average
# avg_friends_count_by_age = age_friends_count_df.groupBy('age').avg('friends_count')

# Round the average and rename the aggregation column.
# agg() is needed so avg() can be wrapped in other functions and aliased.
avg_friends_count_by_age = age_friends_count_df.groupBy('age').\
    agg(func.round(func.avg('friends_count'), 2).alias('avg_friends_count')).\
    orderBy('avg_friends_count')
avg_friends_count_by_age.show()

spark.stop()
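# For comparison, a minimal sketch (not the original author's code) of the
# groupBy().avg() shortcut route: it auto-names the column 'avg(friends_count)',
# which then needs a second pass to round and rename, which is why agg() is
# preferred above. Assumes the same `func` alias for pyspark.sql.functions.
shortcut_df = age_friends_count_df.groupBy('age').avg('friends_count')
shortcut_df = shortcut_df.withColumn(
    'avg_friends_count', func.round(func.col('avg(friends_count)'), 2)) \
    .drop('avg(friends_count)')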
    '*',
    u_parse_time(cleaned_df['timestamp']).cast('timestamp').alias(
        'time')).drop('timestamp')
logs_df.cache()

# Content Size statistics
# -----------------------
content_size_summary_df = logs_df.describe(['content_size'])
# content_size_summary_df.show()

# Alternate method: aggregate over logs_df directly (the original aggregated
# over the describe() output, whose string-typed summary rows give meaningless
# results)
content_size_stats = logs_df.agg(
    min(logs_df['content_size']),
    avg(logs_df['content_size']),
    max(logs_df['content_size'])).first()
# print(content_size_stats[1])

# HTTP Status analysis
# --------------------
status_by_count_df = logs_df.groupby('status').count().sort('status')
# status_by_count_df.show()

# Frequent hosts
# --------------
host_sum_df = logs_df.groupBy("host").count()
# host_sum_df.show(10, truncate=False)

host_more_than_10_df = host_sum_df.filter(host_sum_df['count'] > 10).select(
# MAGIC ### Baseline Model
# MAGIC
# MAGIC A **baseline model** offers an educated best guess to improve upon as different models are trained and evaluated. It represents the simplest model we can create, generally the center of the data. In the case of regression, this could mean predicting the average of the outcome regardless of the features it sees. In the case of classification, the center of the data is the mode, the most common class.
# MAGIC
# MAGIC A baseline model could also be a random value or a preexisting model. With each new model, we can track improvements with respect to this baseline.

# COMMAND ----------

# MAGIC %md
# MAGIC Create a baseline model by calculating the most common Survival status (the rounded average) in the training dataset.

# COMMAND ----------

from pyspark.sql.functions import avg

trainAvg = trainDF.select(avg("Survived")).first()[0]
trainAvg = float(round(trainAvg))

print("Common Survival Status: {}".format(trainAvg))

# COMMAND ----------

# MAGIC %md
# MAGIC Take the average calculated on the training dataset and append it as the column `prediction` on the test dataset.

# COMMAND ----------

from pyspark.sql.functions import lit

testPredictionDF = testDF.withColumn("prediction", lit(trainAvg))
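# COMMAND ----------

# A minimal sketch (not in the original notebook) of scoring the baseline:
# since the mode is appended as a constant prediction, baseline accuracy is
# just the share of test rows whose Survived label equals trainAvg. Assumes
# testDF carries the Survived column.
from pyspark.sql.functions import col, when

baselineAcc = (testPredictionDF
               .select(when(col("Survived") == col("prediction"), 1)
                       .otherwise(0).alias("correct"))
               .agg(avg("correct"))
               .first()[0])
print("Baseline accuracy: {:.3f}".format(baselineAcc))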
def calculate_avg_node_score(self):
    # TODO Assumes that only one nodes file exists; needs to be fixed for link data
    # Create a SparkSession
    # Note: if it's run on Windows and generates errors, use (the tmp folder must exist):
    # spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp") \
    #     .appName("Postprocessing").getOrCreate()
    spark = SparkSession.builder.appName(
        "Calculate_Controversy_Score_Nodes").getOrCreate()

    nodes_source = spark.sparkContext.textFile(
        os.path.join(os.getcwd(), self.data_path, self.nodes_files[0]))
    nodes = nodes_source.map(self.mapper_nodes)
    nodes_df = spark.createDataFrame(nodes).cache()
    nodes_df.createOrReplaceTempView("nodes")

    results_file = os.path.join(self.data_path, self.nodes_files[0])
    tmp_results_file = os.path.join(self.data_path, 'tmp_' + self.nodes_files[0])
    spark_results_path = os.path.join(self.data_path, self.nodes_files[0][:-4])

    for file in self.events_files:
        events_source = spark.sparkContext.textFile(
            os.path.join(self.data_path, file))
        events = events_source.map(self.mapper_events)
        events_df = spark.createDataFrame(events).cache()
        events_df.createOrReplaceTempView("events")

        # Events reference nodes both as source and as target, so collect
        # cscores from both sides before averaging per node
        source_df = spark.sql('SELECT source as node, cscore FROM events')
        target_df = spark.sql('SELECT target as node, cscore FROM events')
        node_cscores_df = source_df.union(target_df)
        avg_node_cscores_df = node_cscores_df.groupby('node').agg(
            avg('cscore').alias('avg_cscore'))
        avg_node_cscores_df.createOrReplaceTempView("cscore_nodes")

        nodes = spark.sql(
            "SELECT n.id, n.title, n.ns, c.avg_cscore as cscore "
            "FROM nodes n LEFT OUTER JOIN cscore_nodes c ON n.id = c.node")
        nodes.write.format('com.databricks.spark.csv').option(
            'header', 'false').option('delimiter', '\t').save(spark_results_path)

        self.assemble_spark_results(spark_results_path, tmp_results_file)
        os.remove(os.path.join(self.data_path, self.nodes_files[0]))
        os.rename(tmp_results_file, results_file)
        print('results assembled')

    # HANDLE Null Values in CSCORE: Replace NULL WITH ZERO, Option 1
    print('Handle Cscore Null Values for Nodes')
    nodes = pd.read_csv(results_file, header=None, delimiter='\t',
                        names=['id', 'title', 'ns', 'cscore'],
                        skip_blank_lines=True, na_filter=False,
                        error_bad_lines=False, warn_bad_lines=True)
    print('Number of nodes without cscore')
    print(len(nodes.loc[nodes['cscore'] == ""]))
    nodes.loc[nodes['cscore'] == "", 'cscore'] = 0.0
    nodes.to_csv(results_file, sep='\t', index=False, header=False, mode='w')
    del spark
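# An alternative sketch (an assumption, not the author's code): the NULL
# cscores produced by the LEFT OUTER JOIN could be zero-filled in Spark
# before writing, which would make the pandas post-processing pass in this
# method (and in calculate_avg_edge_score below) unnecessary.
def fill_missing_cscores(df):
    # DataFrame.fillna with a dict targets only the named column
    return df.fillna({'cscore': 0.0})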
def calculate_avg_edge_score(self):
    # TODO Assumes that only one edges file exists; needs to be fixed for link data
    # Create a SparkSession
    # Note: if it's run on Windows and generates errors, use (the tmp folder must exist):
    # spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp") \
    #     .appName("Postprocessing").getOrCreate()
    spark = SparkSession.builder.appName(
        "Calculate_Controversy_Score_Edges").getOrCreate()

    edges_source = spark.sparkContext.textFile(
        os.path.join(self.data_path, self.edges_files[0]))
    edges = edges_source.map(self.mapper_edges)
    edges_df = spark.createDataFrame(edges).cache()
    edges_df.createOrReplaceTempView("edges")

    results_file = os.path.join(self.data_path, self.edges_files[0])
    tmp_results_file = os.path.join(self.data_path, 'tmp_' + self.edges_files[0])
    spark_results_path = os.path.join(self.data_path, self.edges_files[0][:-4])

    for file in self.events_files:
        events_source = spark.sparkContext.textFile(
            os.path.join(self.data_path, file))
        events = events_source.map(self.mapper_events)
        events_df = spark.createDataFrame(events).cache()
        events_df.createOrReplaceTempView("events")

        avg_edge_cscores_df = events_df.groupby('source', 'target').agg(
            avg('cscore').alias('avg_cscore'))
        avg_edge_cscores_df.createOrReplaceTempView("cscore_edges")

        edges = spark.sql(
            "SELECT e.source, e.target, e.etype, c.avg_cscore as cscore "
            "FROM edges e LEFT OUTER JOIN cscore_edges c "
            "ON e.source = c.source AND e.target = c.target")
        edges.write.format('com.databricks.spark.csv').option(
            'header', 'false').option('delimiter', '\t').save(spark_results_path)

        self.assemble_spark_results(spark_results_path, tmp_results_file)
        os.remove(os.path.join(self.data_path, self.edges_files[0]))
        os.rename(tmp_results_file, results_file)

    # HANDLE Null Values in CSCORE: Replace NULL WITH ZERO
    print('Handle Cscore Null Values for edges.')
    edges = pd.read_csv(results_file, header=None, delimiter='\t',
                        names=['source', 'target', 'type', 'cscore'],
                        skip_blank_lines=True, na_filter=False,
                        error_bad_lines=False, warn_bad_lines=True)
    print('Number of edges without cscore')
    print(len(edges.loc[edges['cscore'] == ""]))
    edges.loc[edges['cscore'] == "", 'cscore'] = 0.0
    edges.to_csv(results_file, sep='\t', index=False, header=False, mode='w')
    del spark
# In[ ]:

sqlCtx.sql("select program, avg(age) AS AverageAge FROM st GROUP BY program").show()

# In[ ]:

from pyspark.sql import functions as funcs
AvgMax = students.groupBy('program').agg(
    funcs.avg('age').alias('AverageAge'),
    funcs.max('age').alias('MaximumAge'))
AvgMax.show()

# In[ ]:

# # How the queries are optimized

# In[ ]:

sqlCtx.sql("select name, program FROM st").explain()
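# In[ ]:

# A small extension (an assumption, not in the original notebook): passing
# extended=True prints the parsed, analyzed, and optimized logical plans in
# addition to the physical plan, which makes Catalyst's optimizations visible.
sqlCtx.sql("select name, program FROM st").explain(True)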
(new_fire_df.select("ResponseDelayedinMins").where(
    col("ResponseDelayedinMins") > 5).show(5, False))

# Convert to more usable timestamp formats
fire_ts_df = (new_fire_df
    .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
    .drop("CallDate")
    .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
    .drop("WatchDate")
    .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"), "MM/dd/yyyy hh:mm:ss a"))
    .drop("AvailableDtTm"))

# Select the converted columns
fire_ts_df.select("IncidentDate", "OnWatchDate", "AvailableDtTS").show(5, False)

fire_ts_df.select(year('IncidentDate')).distinct().orderBy(
    year("IncidentDate")).show(10, False)

# The most common types of fire calls
(fire_ts_df.select("CallType")
    .where(col("CallType").isNotNull()).groupBy("CallType")
    .count().orderBy("count", ascending=False)
    .show(n=10, truncate=False))

# Some computations
(fire_ts_df
    .select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"),
            F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))
    .show())

# ----------------------------------------------------------------------------
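# A small variant of the aggregation above (an assumption, not from the
# original source): aliasing the aggregates gives readable column headers in
# the output instead of the auto-generated names like avg(ResponseDelayedinMins).
(fire_ts_df
    .select(F.sum("NumAlarms").alias("total_alarms"),
            F.avg("ResponseDelayedinMins").alias("avg_delay_min"),
            F.min("ResponseDelayedinMins").alias("min_delay_min"),
            F.max("ResponseDelayedinMins").alias("max_delay_min"))
    .show())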
content_size_summary_df.show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Alternatively, we can use SQL functions to calculate these statistics directly. You can explore the many useful functions within the `pyspark.sql.functions` module in the [documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions).
# MAGIC
# MAGIC After we apply the `.agg()` function, we call `.first()` to extract the first value, which is equivalent to `.take(1)[0]`.

# COMMAND ----------

from pyspark.sql import functions as sqlFunctions

contentSizeStats = (logs_df
                    .agg(sqlFunctions.min(logs_df['content_size']),
                         sqlFunctions.avg(logs_df['content_size']),
                         sqlFunctions.max(logs_df['content_size']))
                    .first())

print('Using SQL functions:')
print('Content Size Avg: {1:,.2f}; Min: {0:.2f}; Max: {2:,.0f}'.format(*contentSizeStats))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (3b) Example: HTTP Status Analysis
# MAGIC
# MAGIC Next, let's look at the status values that appear in the log. We want to know which status values appear in the data and how many times. We again start with `logs_df`, group by the `status` column, apply the `.count()` aggregation, and sort by `status`.

# COMMAND ----------
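# COMMAND ----------

# A minimal variant (not in the original lab): aliasing the aggregates lets
# us read the statistics from the Row by name instead of relying on their
# positional order in the format string above.
namedStats = (logs_df
              .agg(sqlFunctions.min('content_size').alias('min_cs'),
                   sqlFunctions.avg('content_size').alias('avg_cs'),
                   sqlFunctions.max('content_size').alias('max_cs'))
              .first())
print('Avg: {0:,.2f}; Min: {1:.2f}; Max: {2:,.0f}'.format(
    namedStats['avg_cs'], namedStats['min_cs'], namedStats['max_cs']))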
sqlContext = SQLContext(sc)

# Set time variables for date filtering
time = datetime.datetime.now()
epochtime = int(time.strftime("%s"))
start_time = epochtime - 86400
compare_time = datetime.datetime.fromtimestamp(start_time)

# Create a dataframe from the raw metrics
rawmetrics = sqlContext.read.format("org.apache.spark.sql.cassandra").options(
    table="raw_metrics", keyspace="metrics").load()

# Filter metrics to those in the last 24 hours
last_day = rawmetrics.where(rawmetrics.metric_time > compare_time)

# Aggregates
averages = last_day.groupby('device_id').agg(func.avg('metric_value').alias('metric_avg'))
maximums = last_day.groupby('device_id').agg(func.max('metric_value').alias('metric_max'))
minimums = last_day.groupby('device_id').agg(func.min('metric_value').alias('metric_min'))

# Rename id columns for uniqueness
averages_a = averages.withColumnRenamed("device_id", "id")
maximums_a = maximums.withColumnRenamed("device_id", "maxid")
minimums_a = minimums.withColumnRenamed("device_id", "minid")

# Join the tables above
temp = averages_a.join(maximums_a, averages_a.id == maximums_a.maxid)
aggs = temp.join(minimums, temp.id == minimums.device_id).select(
    'id', 'metric_min', 'metric_max', 'metric_avg')

# Add columns to format for cassandra
addday = aggs.withColumn("metric_day", lit(time))
addname = addday.withColumn("metric_name", lit("KWH"))
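# A simpler equivalent (a sketch, not the original author's code): a single
# groupBy with all three aggregates produces the same `aggs` columns while
# avoiding the column renames and the two joins.
aggs_single = last_day.groupby('device_id').agg(
    func.min('metric_value').alias('metric_min'),
    func.max('metric_value').alias('metric_max'),
    func.avg('metric_value').alias('metric_avg')) \
    .withColumnRenamed('device_id', 'id')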