from pyspark.sql import functions as F


def __account_new(jdbc_df_2):
    """Summarize newly added member / membership-package information."""
    jdbc_df_2.createOrReplaceTempView("v_df")
    # Purchased memberships & gifted memberships ("赠送" = gifted)
    sql = """
        select fsk_account_name,
               fsk_package_provide channelName,
               (case when fsk_order_type = 1 then "赠送" else fsk_package_id end) as packageName
        from v_df
    """
    spark.sql("show databases")
    spark.sql("use sharp")
    df_ = spark.sql(sql)
    # Summarize members by each membership type
    df = df_.cube(F.col("channelName"), F.col("packageName")).agg(
        F.grouping_id().alias("id_1"),
        F.countDistinct("fsk_account_name").alias("totalUserCount"))
    return df
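# A hedged usage sketch (not from the original job; it assumes jdbc_df_2 has
# already been loaded and a SparkSession named spark exists): build the member
# cube and keep only the per-channel totals, i.e. the rows where packageName
# was aggregated away (grouping id 1).
member_cube = __account_new(jdbc_df_2)
member_cube.filter(F.col("id_1") == 1) \
    .select("channelName", "totalUserCount") \
    .show()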
dpt1    0       3635
dpt2    0       2583
dpt2    1       25

2- rollup('dept', 'cat_flg').agg(sum('cnt'), grouping_id().alias('gflg'))
Result:
(null)  (null)  6243    3   # summary over all dept values (all categories of the first rollup column)
dpt2    (null)  2608    1   # standalone subtotal for dpt2 (one category of the first rollup column)
dpt1    (null)  3635    1   # standalone subtotal for dpt1 (one category of the first rollup column)
------- the three rows below are what a plain group by would already produce
dpt1    0       3635    0
dpt2    0       2583    0
dpt2    1       25      0
"""

# Remove the grand-total level rows (gflg == 3)
with_rollup = sales.rollup('city', 'year')\
    .agg(sum('amount').alias('amount'), grouping_id().alias('gflg'))\
    .filter(col('gflg') != 3)
with_rollup = with_rollup.sort(with_rollup.city.desc_nulls_last(), with_rollup.year.asc_nulls_last())\
    .select('city', 'year', 'amount', 'gflg')
print("# with_rollup:")
with_rollup.show()

## SQL
sales.createOrReplaceTempView('sales_tbl')
sales_SQL = spark.sql("""SELECT city, year, sum(amount) as amount, GROUPING_ID(city, year) GFLG
    FROM sales_tbl
    GROUP BY ROLLUP(city, year)
    HAVING 3 != GROUPING_ID(city, year)
    ORDER BY city DESC NULLS LAST, year ASC NULLS LAST
    """)
print("# sales_SQL:")
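# A complementary sketch (not from the original snippet): decode the flag with the
# per-column grouping() function instead of grouping_id() -- grouping('year') is 1
# exactly on the per-city subtotal rows, and both flags are 1 on the grand total.
from pyspark.sql.functions import sum, grouping, grouping_id

sales.rollup('city', 'year')\
    .agg(sum('amount').alias('amount'),
         grouping('city').alias('city_rolled'),
         grouping('year').alias('year_rolled'),
         grouping_id().alias('gflg'))\
    .show()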
# Rows where one of the grouping columns is null are the subtotal rows
rolledUpDF.where("Country is NULL").show()
rolledUpDF.where("Date is NULL").show()

# COMMAND ----------

# DBTITLE 1,Cube
# Cube takes the rollup a level deeper.
dfNoNull.cube("Date", "Country").sum("Quantity").orderBy("Date").show()

# COMMAND ----------

# DBTITLE 1,Grouping Metadata
from pyspark.sql.functions import grouping_id, sum, desc, col

dfNoNull.cube("CustomerId", "StockCode")\
  .agg(grouping_id(), sum(col("Quantity")))\
  .orderBy(desc("grouping_id()")).show()

# COMMAND ----------

# DBTITLE 1,Pivot
pivoted = dfWithDate.groupBy("date").pivot("Country").sum()
display(pivoted)

# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
from pyspark.sql.window import Window

window = Window\
  .partitionBy("date", "Country")\
# trying to answer.  We highlight a few common approaches below.

# ### N-way summary tables

# We can use grouping and aggregate functions in Spark to produce summaries.

# **Example:** Three categorical variables
rides_sdf \
  .groupby("rider_student", "rider_gender", "service") \
  .count() \
  .show()

# **Example:** Two categorical variables and one continuous variable
rides_sdf \
  .cube("rider_student", "rider_gender") \
  .agg(F.grouping_id(), F.mean("distance"), F.stddev("distance")) \
  .orderBy("rider_student", "rider_gender") \
  .show()

# **Example:** Two categorical variables and two continuous variables
rides_sdf \
  .groupBy("rider_student", "rider_gender") \
  .agg(F.corr("distance", "duration")) \
  .orderBy("rider_student") \
  .show()

# ### Faceted plots

# Generally, carefully crafted visualizations are more enlightening.  Before we
# produce more visualizations, let us fill in the missing values for rider_gender
# using pandas functionality:
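# A minimal sketch of that fill-in step, assuming the Spark DataFrame is pulled
# into pandas first (the variable name rides_pdf is illustrative, not from the
# original notebook):
rides_pdf = rides_sdf.toPandas()
rides_pdf["rider_gender"] = rides_pdf["rider_gender"].fillna("missing")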
# Use the `rollup` method to get subtotals (the start of this chained call is
# assumed; the original excerpt begins mid-expression):
from pyspark.sql.functions import grouping, count, mean, stddev

rides \
  .rollup("rider_student", "service") \
  .agg(grouping("rider_student"), grouping("service"),
       count("*"), count("distance"), mean("distance"), stddev("distance")) \
  .orderBy("rider_student", "service") \
  .show()

# Use the `cube` method to get all subtotals:
rides \
  .cube("rider_student", "service") \
  .agg(count("*"), count("distance"), mean("distance"), stddev("distance")) \
  .orderBy("rider_student", "service") \
  .show()

# Use the `grouping_id` function to identify grouped rows:
from pyspark.sql.functions import grouping_id

rides \
  .cube("rider_student", "service") \
  .agg(grouping_id("rider_student", "service"),
       count("*"), count("distance"), mean("distance"), stddev("distance")) \
  .orderBy("rider_student", "service") \
  .show()

# ## Pivoting data

# The following use case is common:
rides.groupBy("rider_student", "service").count().orderBy("rider_student", "service").show()

# The `crosstab` method can be used to present this result in a pivot table:
rides.crosstab("rider_student", "service").show()

# We can also use the `pivot` method to produce a cross-tabulation:
rides.groupBy("rider_student").pivot("service").count().show()
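# A small additional sketch (not in the original notebook): `pivot` also works
# with aggregates other than count, for example the mean ride distance per service:
from pyspark.sql.functions import mean

rides.groupBy("rider_student").pivot("service").agg(mean("distance")).show()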
# An easy way to create a summary table
from pyspark.sql.functions import sum, col

dfNoNull.cube("Date", "Country").sum("Quantity")\
  .select("Date", "Country", "sum(Quantity)")\
  .orderBy("Date")\
  .show()

# COMMAND ----------

# Denote the grouping level of each aggregation using grouping_id()
from pyspark.sql.functions import grouping_id, sum, desc

# Cube
cubeGroupDF = dfNoNull.cube("CustomerID", "StockCode")\
  .agg(grouping_id().alias("grouped"), sum(col("Quantity")))\
  .orderBy(desc("grouped"))
cubeGroupDF.show(5)

# Rollup
rollGroupDF = dfNoNull.rollup("CustomerID", "StockCode")\
  .agg(grouping_id().alias("grouped"), sum(col("Quantity")))\
  .orderBy(desc("grouped"))
rollGroupDF.show(5)

# COMMAND ----------

# Check the levels of the variable grouped.
cubeGroupDF.groupBy("grouped").count().show()
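# A complementary check (not in the original notebook): with two grouping columns,
# the rollup output only contains levels 0, 1 and 3 -- unlike cube, rollup never
# aggregates away CustomerID while keeping StockCode (level 2).
rollGroupDF.groupBy("grouped").count().show()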
from pyspark.sql.functions import sum, col

dfNoNull.cube("Date", "Country").agg(sum(col("Quantity")))\
  .select("Date", "Country", "sum(Quantity)").orderBy("Date").show(5)

# This is a quick and easily accessible summary of nearly all of the information in our table,
# and it's a great way to create a quick summary table that others can use later on.

# COMMAND ----------

# Sometimes when using cubes and rollups, you want to be able to query the aggregation levels
# so that you can filter them down accordingly.  We can do this with grouping_id, which gives
# us a column specifying the level of aggregation in our result set.
# The query in the example that follows returns four distinct grouping IDs.
from pyspark.sql.functions import grouping_id, sum, expr, col

dfNoNull.cube("customerId", "stockCode").agg(
    grouping_id(),
    sum("Quantity").alias("total_quantity"))\
  .orderBy(col("total_quantity").desc()).show(3)

# COMMAND ----------

# Pivots make it possible for you to convert a row into a column.  For example, in our current
# data we have a Country column.  With a pivot, we can aggregate according to some function for
# each of those given countries and display them in an easy-to-query way:
pivoted = dfWithDate.groupBy("date").pivot("Country").sum()

# COMMAND ----------

# This DataFrame will now have a column for every combination of country, numeric variable,
# and a column specifying the date.  For example, for USA we have the following columns:
# USA_sum(Quantity), USA_sum(UnitPrice), USA_sum(CustomerID).
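# A hedged sketch of querying the pivoted columns (the date filter is illustrative;
# the column name comes from the description above -- backticks are needed because
# of the parentheses in the generated name):
pivoted.where("date > '2011-12-05'").select("date", "`USA_sum(Quantity)`").show()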
# Summary of string columns
from pyspark.sql.functions import countDistinct

ab_schema.select("neighbourhood").distinct().show()
ab_schema.select("neighbourhood_group").distinct().show()
ab_schema.select(countDistinct("neighbourhood_group")).show()
ab_schema.select("room_type").distinct().show()
ab_schema.select(countDistinct("name")).show()

# COMMAND ----------

# Q3: Calculate the average price and its standard deviation by room_type and per
# neighbourhood_group and neighbourhood.
# Create a statement using SQL, DF, and a grouping sets function.
from pyspark.sql.functions import grouping_id, avg, stddev, col, desc

cubeGroupAB = ab_schema.cube("neighbourhood", "neighbourhood_group", "room_type")\
  .agg(grouping_id().alias("grouped"), avg(col("price")), stddev(col("price")))\
  .orderBy(desc("grouped"))
cubeGroupAB.show(5)

# COMMAND ----------

# Q4: Handle the missing values in the dataset as follows: for neighbourhood and
# neighbourhood_group, replace the missing values with 'NA'; for availability_365,
# replace the missing values with 0.
fill_cols_vals = {
    "neighbourhood": "NA",
    "neighbourhood_group": "NA",
    "availability_365": 0
}
ab_schema.na.fill(fill_cols_vals).show(2)
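# Returning to Q3: a hedged sketch of the equivalent SQL statement, assuming the
# DataFrame is registered as a temp view named ab_tbl (the view name is illustrative):
ab_schema.createOrReplaceTempView("ab_tbl")
spark.sql("""
    SELECT neighbourhood, neighbourhood_group, room_type,
           GROUPING_ID(neighbourhood, neighbourhood_group, room_type) AS grouped,
           AVG(price) AS avg_price,
           STDDEV(price) AS std_price
    FROM ab_tbl
    GROUP BY CUBE(neighbourhood, neighbourhood_group, room_type)
    ORDER BY grouped DESC
""").show(5)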
from pyspark.sql.functions import col, avg, stddev, round

hotels\
  .groupBy(col("neighbourhood_group"), col("neighbourhood"))\
  .agg(round(avg(col("price")), 2).alias("avg_price"),
       round(stddev(col("price")), 2).alias("std_price"))\
  .na.drop("all")\
  .orderBy("std_price")\
  .show(3)

# COMMAND ----------

# Using a DF grouping sets function
from pyspark.sql.functions import grouping_id

hotelsNoNull = hotels.na.drop("all")

rolledUpDF = hotelsNoNull\
  .rollup("neighbourhood_group", "neighbourhood")\
  .agg(grouping_id().alias("level"),
       round(avg(col("price")), 2).alias("avg_price"),
       round(stddev(col("price")), 2).alias("std_price"))\
  .orderBy("std_price")
rolledUpDF.show()

# COMMAND ----------

# DBTITLE 1,Question 4B: Handle missing values
fill_cols_vals = {
    "neighbourhood": "NA",
    "neighbourhood_group": "NA",
    "availability_365": 0
}
hotels_imp = hotels.na.fill(fill_cols_vals).orderBy("availability_365")
hotels_imp.show(2)