from pyspark.sql.functions import (var_pop, var_samp, stddev_pop, stddev_samp,
                                   skewness, kurtosis, corr, covar_pop, covar_samp)

# standard deviation and variance
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"), stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
dailyActivitiesDF.select(skewness("CaloriesBurned"), kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and correlation
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Multiple languages in one notebook
# MAGIC
# MAGIC - One cool thing about Databricks is that we can combine languages within a single notebook
# MAGIC - For example, our data scientists can prototype in Python, and our data engineers can then optimise that code in Scala

# COMMAND ----------

# MAGIC %sql
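# MAGIC -- A hypothetical sketch of the mixed-language idea; it assumes the
# MAGIC -- DataFrame was first exposed to SQL as a temp view, e.g. via
# MAGIC -- dailyActivitiesDF.createOrReplaceTempView("daily_activities")
# MAGIC SELECT stddev_samp(CaloriesBurned) AS stddev_calories,
# MAGIC        var_samp(CaloriesBurned)    AS var_calories
# MAGIC FROM daily_activities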
# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
          stddev_pop("Quantity"), stddev_samp("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()

# COMMAND ----------

from pyspark.sql.functions import count, expr
df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()

# COMMAND ----------
from pyspark.sql.functions import min, max
rides.agg(min("distance"), max("distance")).show()

# Use the `first` and `last` functions to compute the first and last values,
# respectively:
from pyspark.sql.functions import first, last
rides \
  .orderBy("distance") \
  .agg(first("distance", ignorenulls=False), last("distance", ignorenulls=False)) \
  .show()

# **Note:** Null values sort before valid numerical values. (A sketch at the
# end of this snippet shows how to skip them with `ignorenulls=True`.)

# Use the `corr`, `covar_samp`, or `covar_pop` functions to measure the linear
# association between two columns:
from pyspark.sql.functions import corr, covar_samp, covar_pop
rides \
  .agg(corr("distance", "duration"),
       covar_samp("distance", "duration"),
       covar_pop("distance", "duration")) \
  .show()

# The `collect_list` and `collect_set` functions return a column of array type:
from pyspark.sql.functions import collect_list, collect_set
rides.agg(collect_set("service")).show(truncate=False)

# **Note:** `collect_list` does not remove duplicates and will return a very
# long array in this case.

# ## Grouping data

# Use the `agg` method with the `groupBy` (or `groupby`) method to refine your
# analysis:
from pyspark.sql.functions import count
rides \
  .groupBy("rider_student") \
  .agg(count("distance")) \
  .show()
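# As noted above, `first` and `last` with `ignorenulls=False` can return null
# because null values sort first. A sketch (not in the original, assuming the
# same `rides` DataFrame): pass `ignorenulls=True` to skip the nulls instead:
rides \
  .orderBy("distance") \
  .agg(first("distance", ignorenulls=True), last("distance", ignorenulls=True)) \
  .show()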
def get_builtin_aggregator_column(agg, ctx):
    """Map a builtin aggregator spec to the matching pyspark.sql.functions
    Column expression, aliased to the aggregate's name."""
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(
                agg["input"], aggregator["input"], preserve_column_refs=False
            )
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"], input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {"collect_set_int", "collect_set_float", "collect_set_string"}:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected
    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
from pyspark.sql.functions import var_samp, covar_samp, count, sqrt

def simple_regression(x, y, link):
    # First, join everything together
    joined_data = x.join(link, 'SNP', 'inner') \
                   .join(y, ['GENE', 'GT_SAMPLE_NAME', 'STUDY', 'TN'], 'inner')

    # Per (SNP, GENE, STUDY, TN) group, compute the pieces needed for ordinary
    # least squares: ss_xx = var(x), ss_yy = var(y), ss_xy = cov(x, y), and n
    df = joined_data.groupBy('SNP', 'GENE', 'STUDY', 'TN') \
        .agg(var_samp('GT_dosage').alias('ss_xx'),
             var_samp('ADJ_EXP').alias('ss_yy'),
             covar_samp('GT_dosage', 'ADJ_EXP').alias('ss_xy'),
             count('GT_dosage').alias('n'))

    # Slope BETA = ss_xy / ss_xx; its standard error uses the residual variance
    # with n - 2 degrees of freedom
    return df.select('SNP', 'GENE', 'STUDY', 'TN',
                     (df.ss_xy / df.ss_xx).alias('BETA'),
                     (sqrt((df.ss_yy - (df.ss_xy * df.ss_xy / df.ss_xx)) / (df.n - 2.0))
                      / sqrt(df.ss_xx)).alias('SE_BETA'),
                     'n').na.drop()
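# A minimal sketch (not from the original source) illustrating the identity the
# function relies on: the OLS slope is covar_samp(x, y) / var_samp(x). Assumes
# an active SparkSession named `spark`; the toy data is made up.
from pyspark.sql.functions import covar_samp, var_samp

toy = spark.createDataFrame([(1.0, 2.1), (2.0, 3.9), (3.0, 6.2)], ["x", "y"])
toy.select((covar_samp("x", "y") / var_samp("x")).alias("BETA")).show()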
min("StockCode"), max("StockCode")).show(2) #sum,sumDistinct, avg from pyspark.sql.functions import sum, sumDistinct, avg df.select(sum("Quantity"), sumDistinct("Quantity"), avg("Quantity")).show(2) #표본분산 , 표본표준편차 from pyspark.sql.functions import var_samp, stddev_samp df.select(var_samp("Quantity"), stddev_samp("Quantity")).show(2) #모분산, 모표본편차 from pyspark.sql.functions import var_pop, stddev_pop df.select(var_pop("Quantity"), stddev_pop("Quantity")).show(2) #비대칭도, 척도 from pyspark.sql.functions import skewness, kurtosis df.select(skewness("Quantity"), kurtosis("Quantity")).show(2) #공분산과 상관관계 from pyspark.sql.functions import corr, covar_pop, covar_samp df.select(corr("InvoiceNo", "Quantity"), covar_pop("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity")).show(2) #복합데이터 타입의 집계 from pyspark.sql.functions import collect_set, collect_list df.agg(collect_set("Country"), collect_list("Country")).show(2) # COMMAND ---------- # COMMAND ----------