Code Example #1
def test_hash_multiple_mode_query(data_gen, conf):
    print_params(data_gen)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, data_gen, length=100).groupby('a').agg(
            f.count('a'), f.avg('b'), f.avg('a'), f.countDistinct('b'),
            f.sum('a'), f.min('a'), f.max('a'), f.sumDistinct('b'),
            f.countDistinct('c')),
        conf=conf)
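# The test above relies on spark-rapids test helpers (print_params, gen_df,
# assert_gpu_and_cpu_are_equal_collect) and on `f` being pyspark.sql.functions.
# A plain-PySpark sketch of the same aggregation, assuming a DataFrame `df`
# with columns a, b and c (the name `df` is an assumption):
from pyspark.sql import functions as f

df.groupby('a').agg(
    f.count('a'), f.avg('b'), f.avg('a'), f.countDistinct('b'), f.sum('a'),
    f.min('a'), f.max('a'), f.sumDistinct('b'), f.countDistinct('c')).show()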
Code Example #2
def learn3():
    df1 = ss.read.csv('F:/Research/data/ccFraud.csv',
                      header=True,
                      inferSchema=True)
    df1.show()
    # Group df1 by the gender column and count the rows in each group
    df2 = df1.groupby('gender').count()
    df2.show()
    df3 = df1.describe(['balance', 'numTrans', 'numIntlTrans'])
    df3.show()
    # Check the skewness of the balance column
    df1.agg({'balance': 'skewness'}).show()
    df1.agg(
        functions.max('balance').alias('max'),
        functions.avg('balance').alias('avg'),
        functions.mean('balance').alias('mean'),
        functions.stddev('balance').alias('stddev'),
        functions.sum('balance').alias('sum'),
        functions.skewness('balance').alias('skewness'),
        functions.variance('balance').alias('variance'),
        functions.sumDistinct('balance').alias('sumDistinct')).show()
    corr1 = df1.corr('balance', 'numTrans')
    print(corr1)
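# The function above assumes `ss` is an existing SparkSession and `functions` is
# pyspark.sql.functions; a minimal setup sketch (the app name is an assumption):
from pyspark.sql import SparkSession, functions

ss = SparkSession.builder.appName('ccFraud-aggregations').getOrCreate()
learn3()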
Code Example #3
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: "+str(df2.collect()[0][0]))

print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
    stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"),var_samp("salary"),var_pop("salary")) \
  .show(truncate=False)
Code Example #4
File: test_functions.py  Project: yliou/spark
def test_sum_distinct(self):
    self.spark.range(10).select(
        assert_true(
            sum_distinct(col("id")) == sumDistinct(col("id")))).collect()
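# A standalone sketch of the same check, assuming an active SparkSession named
# `spark` and Spark 3.2+, where sumDistinct is a deprecated alias of sum_distinct:
from pyspark.sql.functions import assert_true, col, sum_distinct, sumDistinct

spark.range(10).select(
    assert_true(sum_distinct(col("id")) == sumDistinct(col("id")))).collect()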
Code Example #5
df.select(first("StockCode"), last("StockCode")).show()

# COMMAND ----------

from pyspark.sql.functions import min, max
df.select(min("Quantity"), max("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import sum
df.select(sum("Quantity")).show()  # 5176450

# COMMAND ----------

from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show()  # 29310

# COMMAND ----------

from pyspark.sql.functions import sum, count, avg, expr

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()
Code Example #6
    def run_pipeline(self):
        try:
            logging.info(
                "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/"
            )
            # check collect_list and collect_set
            #collect_set() function returns all values from an input column with duplicate values eliminated.
            #collect_list() function returns all values from an input column with duplicates

            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
            )
            simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                          ("Robert", "Sales", 4100),
                          ("Maria", "Finance", 3000), ("James", "Sales", 3000),
                          ("Scott", "Finance", 3300), ("Jen", "Finance", 3900),
                          ("Jeff", "Marketing", 3000),
                          ("Kumar", "Marketing", 2000),
                          ("Saif", "Sales", 4100)]
            schema = ["employee_name", "department", "salary"]

            df = self.spark.createDataFrame(data=simpleData,
                                            schema=schema).cache()
            df.show(truncate=False)

            from pyspark.sql.functions import approx_count_distinct, collect_list
            from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
            from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
            from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
            from pyspark.sql.functions import variance, var_samp, var_pop
            df.printSchema()
            df.show(truncate=False)

            print("approx_count_distinct: " + \
                  str(df.select(approx_count_distinct("salary")).collect()[0][0]))

            print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

            df.select(collect_list("salary")).show(truncate=False)

            df.select(collect_set("salary")).show(truncate=False)

            df2 = df.select(countDistinct("department", "salary"))
            df2.show(truncate=False)
            print("Distinct Count of Department & Salary: " +
                  str(df2.collect()[0][0]))

            print("count: " + str(df.select(count("salary")).collect()[0]))
            dffirst = df.select(first("salary"))
            dffirst.show(truncate=False)
            df.select(last("salary")).show(truncate=False)
            df.select(kurtosis("salary")).show(truncate=False)
            df.select(max("salary")).show(truncate=False)
            df.select(min("salary")).show(truncate=False)
            df.select(mean("salary")).show(truncate=False)
            df.select(skewness("salary")).show(truncate=False)
            df.select(stddev("salary"), stddev_samp("salary"), \
                      stddev_pop("salary")).show(truncate=False)
            df.select(sum("salary")).show(truncate=False)
            df.select(sumDistinct("salary")).show(truncate=False)
            df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occured while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return
Code Example #7
# In[ ]:


# Df for firearm incidents.
df_firearm=df_joined.groupBy('LSOA_6').agg(
    cnt_cond(F.col('Crime_type').isin(['Possession of weapons', 'Public disorder and weapons'])).alias('y_cnt'))
df_firearm.show(3)
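# `cnt_cond`, `F` and `func` are not defined in this snippet; F and func are
# assumed to be pyspark.sql.functions, and cnt_cond is assumed to be a
# conditional-count helper along these lines:
from pyspark.sql import functions as F
func = F

def cnt_cond(cond):
    # count the rows for which the boolean condition holds
    return F.sum(F.when(cond, 1).otherwise(0))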


# In[ ]:


# Grouping the population by LSOA
df_population = df_joined.groupby("LSOA_6").agg(
    func.sumDistinct("Number_of_People").alias("total_population")
)


# In[ ]:


# look at the population data and cross check on internet
df_population.show(3)
# Population data is validated through internet information


# In[ ]:


# Joining population and firearms dataframes.
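# A sketch of the join this comment describes; the column name follows the
# snippet above, and the result name df_lsoa is an assumption:
df_lsoa = df_population.join(df_firearm, on="LSOA_6", how="inner")
df_lsoa.show(3)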
Code Example #9
# compute various column counts:
from pyspark.sql.functions import count, countDistinct, approx_count_distinct
rides.select(count("*"), count("distance"), countDistinct("distance"),
             approx_count_distinct("distance")).show()

# **Note:** The `count` function returns the number of rows with non-null values.

# **Note:** Use `count(lit(1))` rather than `count(1)` as an alternative to `count("*")`.
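
# For example (an added illustration; `lit` must be imported):
from pyspark.sql.functions import lit
rides.select(count(lit(1))).show()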

# The `agg` method returns the same results and can be applied to grouped data:
rides.agg(count("*"), count("distance"), countDistinct("distance"),
          approx_count_distinct("distance")).show()

# Use the `sum` and `sumDistinct` functions to compute various column sums:
from pyspark.sql.functions import sum, sumDistinct
rides.agg(sum("distance"), sumDistinct("distance")).show()

# **Question:** When would one use the `sumDistinct` function?
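
# One illustration (an added sketch, assuming the same `spark` session): with
# duplicate values, `sum` adds every row while `sumDistinct` adds each distinct
# value once. The name `dup_df` is hypothetical.
dup_df = spark.createDataFrame([(1,), (1,), (2,)], ["x"])
dup_df.agg(sum("x"), sumDistinct("x")).show()  # sum(x) = 4, sum(DISTINCT x) = 3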

# Spark SQL provides a number of summary statistics:
from pyspark.sql.functions import mean, stddev, variance, skewness, kurtosis
rides.agg(mean("distance"), stddev("distance"), variance("distance"),
          skewness("distance"), kurtosis("distance")).show()

# **Note:** `mean` is an alias for `avg`, `stddev` is an alias for the sample
# standard deviation `stddev_samp`, and `variance` is an alias for the sample
# variance `var_samp`.  The population standard deviation and population
# variance are available via `stddev_pop` and `var_pop`, respectively.
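
# For instance, the population variants can be requested explicitly (an added sketch):
from pyspark.sql.functions import stddev_pop, var_pop
rides.agg(stddev_pop("distance"), var_pop("distance")).show()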

# Use the `min` and `max` functions to compute the minimum and maximum, respectively:
from pyspark.sql.functions import min, max
Code Example #10
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(agg["input"],
                                        aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"],
                                         input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {
                "collect_set_int", "collect_set_float", "collect_set_string"
        }:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"],
                                input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected

    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
Code Example #11
#Count distinct (approximated)
from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show()

#The second argument (rsd) is the maximum relative standard deviation allowed;
#a lower value permits less estimation error.
df.select(approx_count_distinct("StockCode", 0.01)).show()

# COMMAND ----------

#Exercise: Replicate the previous results using SQL code.

# COMMAND ----------

#Sum a column
from pyspark.sql.functions import sum, sumDistinct
df.select(sum("Quantity"), sumDistinct("Quantity")).show()

spark.sql("select sum(quantity), sum(distinct quantity) from df").show()

# COMMAND ----------

#Get first and last row value of a column
from pyspark.sql.functions import first, last
df.select(first("StockCode"), last("StockCode")).show()

# COMMAND ----------

#Cross table
person = spark.createDataFrame([
    (0, "Bill Chambers", 0, [100]),
    (1, "Matei Zaharia", 1, [500, 250, 100]),