Code example #1
def __account_new(jdbc_df_2):
    """Summarize newly added members' membership-package information.

    Assumes a global SparkSession named `spark` and
    `from pyspark.sql import functions as F`.
    """
    jdbc_df_2.createOrReplaceTempView("v_df")
    # Purchased memberships & gifted memberships:
    # fsk_order_type = 1 marks a gift order (the literal "赠送" means "gifted")
    sql = """ select fsk_account_name, fsk_package_provide channelName, (case when fsk_order_type = 1 then "赠送" else fsk_package_id end) as packageName from v_df """
    spark.sql("show databases")
    spark.sql("use sharp")
    df_ = spark.sql(sql)
    # Summarize each membership type: cube over channel and package,
    # counting distinct accounts at every grouping level
    df = df_.cube(F.col("channelName"), F.col("packageName")).agg(
        F.grouping_id().alias("id_1"),
        F.countDistinct("fsk_account_name").alias("totalUserCount"))
    return df
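# A hedged illustration (not from the original project) of the cube + grouping_id
# pattern used above, run on a tiny in-memory DataFrame; the data values and app
# name below are made up.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.appName("cube_demo").getOrCreate()
demo_df = spark.createDataFrame(
    [("alice", "web", "P10"), ("bob", "app", "P20"), ("bob", "web", "P10")],
    ["fsk_account_name", "channelName", "packageName"])
summary = demo_df.cube(F.col("channelName"), F.col("packageName")).agg(
    F.grouping_id().alias("id_1"),
    F.countDistinct("fsk_account_name").alias("totalUserCount"))
summary.orderBy("id_1").show()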
Code example #2
File: df_creation2.py  Project: scchy/My_Learn
     dpt1	0	3635
     dpt2	0	2583
     dpt2	1	25 
 2- rollup('dept', 'cat_flg').agg(sum('cnt'), grouping_id().alias('gflg'))
 Result:
     (null)	(null)	6243  3 # grand total across all dept values (every value of the first rollup column aggregated)
     dpt2	(null)	2608  1 # subtotal for dpt2 (one value of the first rollup column)
     dpt1	(null)	3635  1 # subtotal for dpt1 (one value of the first rollup column)
     ------- the three rows below are what a plain group by would already produce
     dpt1	0	3635   0
     dpt2	0	2583   0
     dpt2	1	25     0
 """
 # Drop the grand-total rows (both grouping columns aggregated, i.e. gflg == 3)
 with_rollup = sales.rollup('city', 'year')\
     .agg(sum('amount').alias('amount'), grouping_id().alias('gflg'))\
     .filter(col('gflg') != 3)
 with_rollup = with_rollup.sort(with_rollup.city.desc_nulls_last(), with_rollup.year.asc_nulls_last())\
     .select('city', 'year', 'amount', 'gflg')
 print("# with_rollup:")
 with_rollup.show()
 ## SQL
 sales.createOrReplaceTempView('sales_tbl')
 sales_SQL = spark.sql("""SELECT   city, year, sum(amount) as amount 
                                   ,GROUPING_ID(city, year) GFLG
                            FROM   sales_tbl
                        GROUP BY   ROLLUP(city, year)
                          HAVING   3 != GROUPING_ID(city, year) 
                        ORDER BY   city DESC NULLS LAST, year ASC NULLS LAST
                     """)
 print("# sales_SQL:")
Code example #3
#rows where one of the grouping columns is NULL are subtotal rows
rolledUpDF.where("Country is NULL").show()
rolledUpDF.where("Date is NULL").show()

# COMMAND ----------

# DBTITLE 1,Cube
#Cube takes the rollup to a level deeper.
dfNoNull.cube("Date", "Country").sum("Quantity").orderBy("Date").show()

# COMMAND ----------

# DBTITLE 1,Grouping Metadata
from pyspark.sql.functions import grouping_id, sum, desc, col
#"grouping_id()" below is the auto-generated column name of the un-aliased aggregate
dfNoNull.cube("CustomerId","StockCode")\
        .agg(grouping_id(), sum(col("Quantity")))\
        .orderBy(desc("grouping_id()")).show()

# COMMAND ----------

# DBTITLE 1,Pivot
pivoted = dfWithDate.groupBy("date").pivot("Country").sum()
display(pivoted)

# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
from pyspark.sql.window import Window

window = Window\
                   .partitionBy("date","Country")\
Code example #4
# trying to answer.  We highlight a few common approaches below.

# ### N-way summary tables

# We can use grouping and aggregate functions in Spark to produce summaries.

# **Example:** Three categorical variables
rides_sdf \
  .groupby("rider_student", "rider_gender", "service") \
  .count() \
  .show()

# **Example:** Two categorical variables and one continuous variable
from pyspark.sql import functions as F
rides_sdf \
  .cube("rider_student", "rider_gender") \
  .agg(F.grouping_id(), F.mean("distance"), F.stddev("distance")) \
  .orderBy("rider_student", "rider_gender") \
  .show()

# **Example:** Two categorical variables and two continuous variables
rides_sdf \
  .groupBy("rider_student", "rider_gender") \
  .agg(F.corr("distance", "duration")) \
  .orderBy("rider_student") \
  .show()
  
# ### Faceted plots

# Generally, carefully crafted visualizations are more enlightening.  Before we
# produce more visualizations, let us fill in the missing values for rider_gender
# using pandas functionality:
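# A hedged sketch of that step (the original code is truncated here): convert to a
# pandas DataFrame and replace missing rider_gender values with an explicit label.
# The toPandas() conversion and the "missing" label are assumptions, not the
# original author's exact code.
rides_pdf = rides_sdf.toPandas()
rides_pdf["rider_gender"] = rides_pdf["rider_gender"].fillna("missing")
rides_pdf["rider_gender"].value_counts(dropna=False)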
Code example #5
  .agg(grouping("rider_student"), grouping("service"), count("*"), count("distance"), mean("distance"), stddev("distance")) \
  .orderBy("rider_student", "service") \
  .show()

# Use the `cube` method to get all subtotals:
rides \
  .cube("rider_student", "service") \
  .agg(count("*"), count("distance"), mean("distance"), stddev("distance")) \
  .orderBy("rider_student", "service") \
  .show()

# Use the `grouping_id` function to identify grouped rows:
from pyspark.sql.functions import grouping_id
rides \
  .cube("rider_student", "service") \
  .agg(grouping_id("rider_student", "service"), count("*"), count("distance"), mean("distance"), stddev("distance")) \
  .orderBy("rider_student", "service") \
  .show()

# ## Pivoting data

# The following use case is common:
rides.groupBy("rider_student",
              "service").count().orderBy("rider_student", "service").show()

# The `crosstab` method can be used to present this result in a pivot table:
rides.crosstab("rider_student", "service").show()

# We can also use the `pivot` method to produce a cross-tabulation:
rides.groupBy("rider_student").pivot("service").count().show()
Code example #6
#Easy way to create a summary table
from pyspark.sql.functions import sum

# keep show() out of the assignment chain: it returns None
cubeDF = dfNoNull.cube("Date", "Country").sum("Quantity")\
  .select("Date", "Country", "sum(Quantity)")\
  .orderBy("Date")
cubeDF.show()

# COMMAND ----------

#Denote the grouping level of each aggregated row using grouping_id()
from pyspark.sql.functions import grouping_id, sum, desc, col

#Cube
cubeGroupDF = dfNoNull.cube("CustomerID", "StockCode")\
  .agg(grouping_id().alias("grouped"), sum(col("Quantity")))\
  .orderBy(desc("grouped"))
cubeGroupDF.show(5)

#Rollup
rollGroupDF = dfNoNull.rollup("CustomerID", "StockCode")\
  .agg(grouping_id().alias("grouped"), sum(col("Quantity")))\
  .orderBy(desc("grouped"))
rollGroupDF.show(5)

# COMMAND ----------

#Check the levels of the variable grouped.
cubeGroupDF.groupBy("grouped").count().show()
Code example #7
from pyspark.sql.functions import sum, col

dfNoNull.cube("Date", "Country").agg(sum(col("Quantity")))\
  .select("Date", "Country", "sum(Quantity)").orderBy("Date").show(5)

# This is a quick and easily accessible summary of nearly all of the information in our table, and it’s a great way to create a quick summary table that others can use later on.
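# A hedged sketch of "use later on" (not part of the original snippet): persist the
# cube summary so other users and queries can read it; the table name is hypothetical.
summary = dfNoNull.cube("Date", "Country").agg(sum(col("Quantity")).alias("total_quantity"))
summary.write.mode("overwrite").saveAsTable("retail_quantity_summary")  # hypothetical name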

# COMMAND ----------

# Sometimes when using cubes and rollups, you want to be able to query the aggregation levels so that you can easily filter them down accordingly.
# We can do this by using the grouping_id, which gives us a column specifying the level of aggregation that we have in our result set.
# The query in the example that follows returns four distinct grouping IDs.

from pyspark.sql.functions import grouping_id, sum, expr, col
dfNoNull.cube("customerId", "stockCode").agg(
    grouping_id(),
    sum("Quantity").alias("total_quantity")).orderBy(
        col("total_quantity").desc()).show(3)

# COMMAND ----------

# Pivots make it possible for you to convert a row into a column. For example, in our current data we have a Country column.
# With a pivot, we can aggregate according to some function for each of those given countries and display them in an easy-to-query way:

pivoted = dfWithDate.groupBy("date").pivot("Country").sum()

# COMMAND ----------

# This DataFrame will now have a column for every combination of country, numeric variable, and a column specifying the date.
# For example, for USA we have the following columns: USA_sum(Quantity), USA_sum(UnitPrice), USA_sum(CustomerID).
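# A hedged follow-up (not in the original snippet): the pivoted column names contain
# parentheses, so they need backticks when referenced as strings.
pivoted.select("date", "`USA_sum(Quantity)`").orderBy("date").show(5)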
Code example #8
#summary of string columns
from pyspark.sql.functions import countDistinct
ab_schema.select("neighbourhood").distinct().show()
ab_schema.select("neighbourhood_group").distinct().show()
ab_schema.select(countDistinct("neighbourhood_group")).show()
ab_schema.select("room_type").distinct().show()
ab_schema.select(countDistinct("name")).show()

# COMMAND ----------

#Q3: Calculate the average price and its standard deviation by room_type and per neighborhood group and neighborhood.
#Create a statement using SQL, DF, and a grouping sets function.
from pyspark.sql.functions import grouping_id, avg, stddev, col, desc
cubeGroupAB = ab_schema.cube("neighbourhood", "neighbourhood_group", "room_type")\
  .agg(grouping_id().alias("grouped"),
       avg(col("price")),
       stddev(col("price")))\
  .orderBy(desc("grouped"))
cubeGroupAB.show(5)
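# The question also asks for a SQL statement; a hedged sketch using CUBE in Spark SQL.
# The temp-view name "ab_tbl" is hypothetical, not from the original notebook.
ab_schema.createOrReplaceTempView("ab_tbl")
spark.sql("""SELECT   neighbourhood, neighbourhood_group, room_type
                      ,GROUPING_ID(neighbourhood, neighbourhood_group, room_type) AS grouped
                      ,AVG(price) AS avg_price, STDDEV(price) AS std_price
                FROM  ab_tbl
            GROUP BY  neighbourhood, neighbourhood_group, room_type WITH CUBE
            ORDER BY  grouped DESC
          """).show(5)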

# COMMAND ----------

#Q4: Handle the missing values in the dataset as follows: for neighbourhood and neighbourhood_group, replace the missing values with ‘NA’; for “availability_365”, replace the missing values with 0.
fill_cols_vals = {
    "neighbourhood": "NA",
    "neighbourhood_group": "NA",
    "availability_365": 0
}
ab_schema.na.fill(fill_cols_vals).show(2)
Code example #9
from pyspark.sql.functions import col, round, avg, stddev

hotels\
  .groupBy(col("neighbourhood_group"), col("neighbourhood"))\
  .agg(round(avg(col("price")),2).alias("avg_price"), round(stddev(col("price")),2).alias("std_price"))\
  .na.drop("all")\
  .orderBy("std_price")\
  .show(3)

# COMMAND ----------

#Using a DF grouping sets function
from pyspark.sql.functions import grouping_id

hotelsNoNull = hotels.na.drop("all")
rolledUpDF = hotelsNoNull\
  .rollup("neighbourhood_group", "neighbourhood")\
  .agg(grouping_id().alias("level"), round(avg(col("price")),2).alias("avg_price"), round(stddev(col("price")),2).alias("std_price"))\
  .orderBy("std_price")

rolledUpDF.show()

# COMMAND ----------

# DBTITLE 1,Question 4B: Handle missing values
fill_cols_vals = {
    "neighbourhood": "NA",
    "neighbourhood_group": "NA",
    "availability_365": 0
}
hotels_imp = hotels.na.fill(fill_cols_vals).orderBy("availability_365")
hotels_imp.show(2)