def test_hash_multiple_mode_query(data_gen, conf):
    print_params(data_gen)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, data_gen, length=100)
            .groupby('a')
            .agg(f.count('a'),
                 f.avg('b'),
                 f.avg('a'),
                 f.countDistinct('b'),
                 f.sum('a'),
                 f.min('a'),
                 f.max('a'),
                 f.sumDistinct('b'),
                 f.countDistinct('c')),
        conf=conf)
def learn3():
    df1 = ss.read.csv('F:/Research/data/ccFraud.csv', header=True, inferSchema=True)
    df1.show()
    # Group the DataFrame by the gender column and count the rows in each group
    df2 = df1.groupby('gender').count()
    df2.show()
    df3 = df1.describe(['balance', 'numTrans', 'numIntlTrans'])
    df3.show()
    # Check the skewness of the balance column
    df1.agg({'balance': 'skewness'}).show()
    df1.agg(
        functions.max('balance').alias('max'),
        functions.avg('balance').alias('avg'),
        functions.mean('balance').alias('mean'),
        functions.stddev('balance').alias('stddev'),
        functions.sum('balance').alias('sum'),
        functions.skewness('balance').alias('skewness'),
        functions.variance('balance').alias('variance'),
        functions.sumDistinct('balance').alias('sumDistinct')).show()
    corr1 = df1.corr('balance', 'numTrans')
    print(corr1)
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))
print("avg: " + str(df.select(avg("salary")).collect()[0][0]))
df.select(collect_list("salary")).show(truncate=False)
df.select(collect_set("salary")).show(truncate=False)
df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))
print("count: " + str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
          stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
    .show(truncate=False)
def test_sum_distinct(self):
    self.spark.range(10).select(
        assert_true(sum_distinct(col("id")) == sumDistinct(col("id")))).collect()
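# The test above checks that sum_distinct and sumDistinct agree; a minimal
# standalone sketch of that equivalence, assuming PySpark 3.2+ (where
# sum_distinct was introduced and sumDistinct became a deprecated alias):
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum_distinct, sumDistinct

spark = SparkSession.builder.getOrCreate()
# Both expressions sum each distinct id exactly once: 0 + 1 + ... + 9 = 45
spark.range(10).select(
    sum_distinct(col("id")).alias("new_name"),
    sumDistinct(col("id")).alias("deprecated_alias"),
).show()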
df.select(first("StockCode"), last("StockCode")).show()

# COMMAND ----------

from pyspark.sql.functions import min, max
df.select(min("Quantity"), max("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import sum
df.select(sum("Quantity")).show()  # 5176450

# COMMAND ----------

from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show()  # 29310

# COMMAND ----------

from pyspark.sql.functions import sum, count, avg, expr
df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()
def run_pipeline(self):
    try:
        logging.info(
            "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/")
        # check collect_list and collect_set
        # collect_set() returns all values from an input column with duplicate values eliminated.
        # collect_list() returns all values from an input column, including duplicates.
        logging.info(
            'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/')
        simpleData = [("James", "Sales", 3000),
                      ("Michael", "Sales", 4600),
                      ("Robert", "Sales", 4100),
                      ("Maria", "Finance", 3000),
                      ("James", "Sales", 3000),
                      ("Scott", "Finance", 3300),
                      ("Jen", "Finance", 3900),
                      ("Jeff", "Marketing", 3000),
                      ("Kumar", "Marketing", 2000),
                      ("Saif", "Sales", 4100)]
        schema = ["employee_name", "department", "salary"]
        df = self.spark.createDataFrame(data=simpleData, schema=schema).cache()
        df.show(truncate=False)

        from pyspark.sql.functions import approx_count_distinct, collect_list
        from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
        from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
        from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
        from pyspark.sql.functions import variance, var_samp, var_pop

        df.printSchema()
        df.show(truncate=False)
        print("approx_count_distinct: " + \
              str(df.select(approx_count_distinct("salary")).collect()[0][0]))
        print("avg: " + str(df.select(avg("salary")).collect()[0][0]))
        df.select(collect_list("salary")).show(truncate=False)
        df.select(collect_set("salary")).show(truncate=False)
        df2 = df.select(countDistinct("department", "salary"))
        df2.show(truncate=False)
        print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))
        print("count: " + str(df.select(count("salary")).collect()[0]))
        dffirst = df.select(first("salary"))
        dffirst.show(truncate=False)
        df.select(last("salary")).show(truncate=False)
        df.select(kurtosis("salary")).show(truncate=False)
        df.select(max("salary")).show(truncate=False)
        df.select(min("salary")).show(truncate=False)
        df.select(mean("salary")).show(truncate=False)
        df.select(skewness("salary")).show(truncate=False)
        df.select(stddev("salary"), stddev_samp("salary"), \
                  stddev_pop("salary")).show(truncate=False)
        df.select(sum("salary")).show(truncate=False)
        df.select(sumDistinct("salary")).show(truncate=False)
        df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
            .show(truncate=False)
        logging.info('run_pipeline method ended')
    except Exception as exp:
        logging.error("An error occurred while running the pipeline > " + str(exp))
        # send email notification
        # log error to database
        sys.exit(1)
    return
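# The comments above describe the collect_list / collect_set difference; a
# minimal, self-contained sketch of it (the tiny DataFrame below is illustrative,
# not part of the original pipeline):
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, collect_set

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("Sales", 3000), ("Sales", 3000), ("Sales", 4100)],
                             ["department", "salary"])
demo.groupBy("department").agg(
    collect_list("salary").alias("with_duplicates"),    # [3000, 3000, 4100]
    collect_set("salary").alias("duplicates_removed"),  # e.g. [3000, 4100] (order not guaranteed)
).show(truncate=False)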
# In[ ]:

# Df for firearm incidents.
df_firearm = df_joined.groupBy('LSOA_6').agg(
    cnt_cond(F.col('Crime_type').isin(['Possession of weapons',
                                       'Public disorder and weapons'])).alias('y_cnt'))
df_firearm.show(3)

# In[ ]:

# Grouping the population by LSOA
df_population = df_joined.groupby("LSOA_6").agg(
    func.sumDistinct("Number_of_People").alias("total_population"))

# In[ ]:

# Look at the population data and cross-check it against internet sources
df_population.show(3)
# Population data is validated against internet information

# In[ ]:

# Joining population and firearms dataframes.
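# In[ ]:

# sumDistinct is used above so that a population figure repeated across the
# joined crime rows is counted only once per LSOA. A minimal sketch of that
# effect; the values and the LSOA code below are made up for illustration:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.getOrCreate()
# One LSOA with population 500 appears on three joined crime rows.
joined_demo = spark.createDataFrame(
    [("E01000", 500), ("E01000", 500), ("E01000", 500)],
    ["LSOA_6", "Number_of_People"])
joined_demo.groupby("LSOA_6").agg(
    func.sum("Number_of_People").alias("double_counted"),            # 1500
    func.sumDistinct("Number_of_People").alias("total_population"),  # 500
).show()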
# COMMAND ----------

from pyspark.sql.functions import min, max
df.select(min("Quantity"), max("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import sum
df.select(sum("Quantity")).show()  # 5176450

# COMMAND ----------

from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show()  # 29310

# COMMAND ----------

from pyspark.sql.functions import sum, count, avg, expr
df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()
# Compute various column counts:
from pyspark.sql.functions import count, countDistinct, approx_count_distinct
rides.select(count("*"), count("distance"), countDistinct("distance"),
             approx_count_distinct("distance")).show()

# **Note:** The `count` function returns the number of rows with non-null values.
# **Note:** Use `count(lit(1))` rather than `count(1)` as an alternative to `count("*")`.

# The `agg` method returns the same results and can be applied to grouped data:
rides.agg(count("*"), count("distance"), countDistinct("distance"),
          approx_count_distinct("distance")).show()

# Use the `sum` and `sumDistinct` functions to compute various column sums:
from pyspark.sql.functions import sum, sumDistinct
rides.agg(sum("distance"), sumDistinct("distance")).show()

# **Question:** When would one use the `sumDistinct` function?

# Spark SQL provides a number of summary statistics:
from pyspark.sql.functions import mean, stddev, variance, skewness, kurtosis
rides.agg(mean("distance"), stddev("distance"), variance("distance"),
          skewness("distance"), kurtosis("distance")).show()

# **Note:** `mean` is an alias for `avg`, `stddev` is an alias for the sample
# standard deviation `stddev_samp`, and `variance` is an alias for the sample
# variance `var_samp`. The population standard deviation and population
# variance are available via `stddev_pop` and `var_pop`, respectively.

# Use the `min` and `max` functions to compute the minimum and maximum, respectively:
from pyspark.sql.functions import min, max
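# One answer to the question above, as a hedged sketch: `sumDistinct` sums each
# distinct value exactly once, so it only differs from `sum` when values repeat
# (the tiny DataFrame below is an assumption, not the rides data):
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, sumDistinct

spark = SparkSession.builder.getOrCreate()
distances = spark.createDataFrame([(5.0,), (5.0,), (7.5,)], ["distance"])
distances.agg(
    sum("distance").alias("sum"),                   # 17.5: every row contributes
    sumDistinct("distance").alias("sum_distinct"),  # 12.5: the repeated 5.0 counts once
).show()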
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(agg["input"], aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"], input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {"collect_set_int", "collect_set_float", "collect_set_string"}:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected
    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
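# Each branch above returns an aliased Column expression meant to be passed to
# DataFrame.agg. A minimal sketch of that pattern outside Cortex's ctx /
# populate_values machinery (the DataFrame and column names here are hypothetical):
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 10.0), (1, 10.0), (2, 5.0)], ["k", "v"])

# The kind of Column an aggregator such as "sum_distinct_float" resolves to:
col_expr = F.sumDistinct("v").alias("v_sum_distinct")
df.agg(col_expr).show()  # 15.0: the duplicated 10.0 is counted once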
# Count distinct (approximated)
from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show()
# The second argument is the maximum relative standard deviation allowed:
# the lower the value, the smaller the estimation error is allowed to be.
df.select(approx_count_distinct("StockCode", 0.01)).show()

# COMMAND ----------

# Exercise: Replicate the previous results using SQL code.

# COMMAND ----------

# Sum a column
from pyspark.sql.functions import sum, sumDistinct
df.select(sum("Quantity"), sumDistinct("Quantity")).show()
spark.sql("select sum(quantity), sum(distinct quantity) from df").show()

# COMMAND ----------

# Get the first and last row values of a column
from pyspark.sql.functions import first, last
df.select(first("StockCode"), last("StockCode")).show()

# COMMAND ----------

# Cross table
person = spark.createDataFrame([
    (0, "Bill Chambers", 0, [100]),
    (1, "Matei Zaharia", 1, [500, 250, 100]),