def describe_float_1d(df, column, current_result, nrows):
    # Note: `bins` and the `df_min`/`df_max`/`df_sum`/`df_abs` aliases are assumed
    # to be defined at module level (aliases for pyspark.sql.functions min/max/sum/abs).
    stats_df = df.select(column).na.drop().agg(
        mean(col(column)).alias("mean"),
        df_min(col(column)).alias("min"),
        df_max(col(column)).alias("max"),
        variance(col(column)).alias("variance"),
        kurtosis(col(column)).alias("kurtosis"),
        stddev(col(column)).alias("std"),
        skewness(col(column)).alias("skewness"),
        df_sum(col(column)).alias("sum")).toPandas()
    for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
        stats_df[pretty_name(x)] = (df.select(column).na.drop().selectExpr(
            "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
                col=column, n=x)).toPandas().iloc[:, 0])
    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    stats["mad"] = (df.select(column).na.drop().select(
        df_abs(col(column) - stats["mean"]).alias("delta")).agg(
            df_sum(col("delta"))).toPandas().iloc[0, 0] /
        float(current_result["count"]))
    stats["type"] = "NUM"
    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)
    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
    return stats
def dataStatistics(self, categoricalFeatures, numericalFeatures):
    # self.dataTranform()
    self.categoricalFeatures = None if categoricalFeatures is None else categoricalFeatures
    self.numericalFeatures = None if numericalFeatures is None else numericalFeatures
    summaryList = ['mean', 'stddev', 'min', 'max']
    summaryDict = {}
    dataset = self.dataset
    import pyspark.sql.functions as F
    import builtins
    round = getattr(builtins, 'round')
    for colm in self.numericalFeatures:
        summaryListTemp = []
        for value in summaryList:
            summ = list(dataset.select(colm).summary(value).toPandas()[colm])
            summaryListSubTemp = []
            for val in summ:
                summaryListSubTemp.append(round(float(val), 4))
            summaryListTemp.append(summaryListSubTemp)
        summaryDict[colm] = summaryListTemp
    summaryList.extend(['skewness', 'kurtosis', 'variance'])
    summaryDict['summaryName'] = summaryList
    summaryDict['categoricalColumn'] = self.categoricalFeatures
    skewnessList = []
    kurtosisList = []
    varianceList = []
    skewKurtVarDict = {}
    for colm in self.numericalFeatures:
        skewness = dataset.select(F.skewness(dataset[colm])).toPandas()
        # row.items() replaces the pandas iteritems() API, which has been removed
        for i, row in skewness.iterrows():
            for j, column in row.items():
                skewnessList.append(round(column, 4))
        kurtosis = dataset.select(F.kurtosis(dataset[colm])).toPandas()
        for i, row in kurtosis.iterrows():
            for j, column in row.items():
                kurtosisList.append(round(column, 4))
        variance = dataset.select(F.variance(dataset[colm])).toPandas()
        for i, row in variance.iterrows():
            for j, column in row.items():
                varianceList.append(round(column, 4))
    for skew, kurt, var, colm in zip(skewnessList, kurtosisList, varianceList,
                                     self.numericalFeatures):
        print(skew, kurt, var)
        skewKurtVarList = [skew, kurt, var]
        skewKurtVarDict[colm] = skewKurtVarList
    for (keyOne, valueOne), (keyTwo, valueTwo) in zip(summaryDict.items(),
                                                      skewKurtVarDict.items()):
        print(keyOne, valueOne, keyTwo, valueTwo)
        if keyOne == keyTwo:
            valueOne.extend(valueTwo)
            summaryDict[keyOne] = valueOne
    return summaryDict
def __calc_stats(self, df, resolution):
    """
    Calculates statistics for every column in the Spark DF and returns a separate DF with the results.
    Statistics: sum, min, max, count, mean, kurtosis, skewness, stddev, variance.

    :param df: DF containing the columns that you want to run your statistics calculations on
    :param resolution: int resolution in milli or microseconds OR string '5m'/'1h'/'1d'
    :return: aggregation dataframe containing statistics
    """
    if type(resolution) is str:
        # resolution to microseconds
        res_dict = {'5m': 300000000, '1h': 3600000000, '1d': 86400000000}
        agg_interval = res_dict[resolution]
    elif type(resolution) is int:
        if len(str(resolution)) < 16:
            resolution = int(str(resolution).ljust(16, '0'))
        agg_interval = resolution
    ts_col = F.col('timestamp')
    df_ori_cols = list(set(df.columns) - set(['timestamp']))
    df = df.withColumn('interval_start',
                       F.floor(ts_col / agg_interval) * agg_interval)
    # .withColumn('interval_stop', F.ceil(ts_col / agg_interval) * agg_interval)
    # .orderBy(F.col('interval_start'))
    agg_df = df.groupBy('interval_start').agg(F.max(ts_col).alias('max_ts'))
    # TODO Column type checking: string columns are automatically ignored and parse as NaN,
    # TODO so drop NaN columns?
    # TODO: interval_stop ignore, as well as drop max_ts
    # TODO: filter out NaN columns
    # TODO: question: run the statistics job as a separate job without having to make a udf script
    stat_cols = df_ori_cols
    # [c for c in df_ori_cols if c not in ['interval_start', 'interval_stop', 'timestamp', 'max_ts']]
    for column in stat_cols:
        grouped_df = df.groupBy('interval_start') \
            .agg(F.sum(column).alias('sum_%s' % column),
                 F.min(column).alias('min_%s' % column),
                 F.max(column).alias('max_%s' % column),
                 F.count(column).alias('count_%s' % column),
                 F.kurtosis(column).alias('kurtosis_%s' % column),
                 F.mean(column).alias('mean_%s' % column),
                 F.skewness(column).alias('skewness_%s' % column),
                 F.stddev(column).alias('stddev_%s' % column),
                 F.variance(column).alias('var_%s' % column))
        agg_df = grouped_df.join(agg_df, on='interval_start')
    # agg_df = agg_df.drop('max_ts').drop(F.when(F.col('*').isna())).dropna(how='all').drop_duplicates()
    return agg_df
def describe_numeric_1d(df, bins, column, current_result, nrows, k=2,
                        dtype='int'):
    stats_df = df.select(column).na.drop().agg(
        mean(col(column)).alias('mean'),
        min(col(column)).alias('min'),
        max(col(column)).alias('max'),
        variance(col(column)).alias('variance'),
        kurtosis(col(column)).alias('kurtosis'),
        stddev(col(column)).alias('std'),
        skewness(col(column)).alias('skewness'),
        sum(col(column)).alias('sum')).toPandas()
    if dtype.lower() == 'int':
        select_expr = 'percentile({c},CAST({p} AS DOUBLE))'
    else:
        select_expr = 'percentile_approx({c},CAST({p} AS DOUBLE))'
    for p in [0.05, 0.25, 0.5, 0.75, 0.95]:
        stats_df[pretty_name(p)] = (df.select(column).na.drop().selectExpr(
            select_expr.format(c=column, p=p)).toPandas().iloc[:, 0])
    # pandas removed DataFrame.ix; iloc is used for positional indexing instead
    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats['range'] = stats['max'] - stats['min']
    q3, q1 = stats[pretty_name(0.75)], stats[pretty_name(0.25)]
    stats['iqr'] = q3 - q1
    stats['cv'] = stats['std'] / float(stats['mean'])
    stats['mad'] = (df.select(column).na.drop().select(
        abs(col(column) - stats['mean']).alias('delta')).agg(
            sum(col('delta'))).toPandas().iloc[0, 0] /
        float(current_result['count']))
    stats['type'] = 'NUM'
    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)
    stats['high_idx'] = df.select(column).where(
        col(column) > q3 + k * (q3 - q1)).count()
    stats['low_idx'] = df.select(column).where(
        col(column) < q1 - k * (q3 - q1)).count()
    # generate histograms
    hist_data = generate_hist_data(df, column, stats['min'], stats['max'], bins)
    stats['histogram'] = complete_histogram(hist_data)
    stats['mini_histogram'] = mini_histogram(hist_data)
    return stats
def preprocess_file(input_file_name, spark, sample=True):
    ratings = []
    with open(input_file_name, "r") as fp:
        line = fp.readline()
        while line:
            (user_id, num_ratings) = line.split("|")
            num_ratings = int(num_ratings)
            for i in range(0, num_ratings):
                line = fp.readline()
                (item_id, rating, time, time2) = line.split()
                ratings.append((int(user_id), int(item_id), int(rating)))
            line = fp.readline()
    df = spark.createDataFrame(ratings, ["user_id", "item_id", "rating"])
    # skewness
    # skewness = df.agg(f.skewness("rating"))
    # skewness.show()
    rdd1 = df.rdd.map(map_ratings)
    df2 = spark.createDataFrame(rdd1)
    if sample is True:
        sampled = df2.sampleBy(
            "rating",
            fractions={1: 0.334, 2: 1, 3: 0.5930, 4: 0.30258, 5: 0.0899145},
            seed=0)
        # skewness
        skewness = sampled.agg(f.skewness("rating"))
        skewness.show()
        # +--------------------+
        # |    skewness(rating)|
        # +--------------------+
        # |-8.24123249452558E-5|
        return sampled
    else:
        return df2
def learn3():
    df1 = ss.read.csv('F:/Research/data/ccFraud.csv', header=True,
                      inferSchema=True)
    df1.show()
    # Group the DataFrame by the gender column and count the rows in each group
    df2 = df1.groupby('gender').count()
    df2.show()
    df3 = df1.describe(['balance', 'numTrans', 'numIntlTrans'])
    df3.show()
    # Check skewness
    df1.agg({'balance': 'skewness'}).show()
    df1.agg(
        functions.max('balance').alias('max'),
        functions.avg('balance').alias('avg'),
        functions.mean('balance').alias('mean'),
        functions.stddev('balance').alias('stddev'),
        functions.sum('balance').alias('sum'),
        functions.skewness('balance').alias('skewness'),
        functions.variance('balance').alias('variance'),
        functions.sumDistinct('balance').alias('sumDistinct')).show()
    corr1 = df1.corr('balance', 'numTrans')
    print(corr1)
train_data.printSchema()

# COMMAND ----------

# Summary of all the features in the file
train_data.describe().show()

# COMMAND ----------

# Summary of the numerical data available in the dataset
train_data.describe().select("summary", "outcome", "char_38").show()

# COMMAND ----------

# Finding skewness and kurtosis for the char_38 column
train_data.select(skewness('char_38'), kurtosis('char_38')).show()

# COMMAND ----------

# Checking whether any null values exist in the data
train_data.where(
    reduce(lambda x, y: x | y,
           (func.col(x).isNull() for x in train_data.columns))).show()

# COMMAND ----------

# Finding the count for each category of activity
train_data.groupBy('activity_category').count().sort(func.desc('count')).show()

# COMMAND ----------
meta_data = {f: {} for f in origin_dtypes.keys()}
for f, v in origin_dtypes.items():
    tmp = df.withColumn(f'{f}_to_float', df[f'{f}'].cast('float'))
    if tmp.filter(fn.isnull(f'{f}_to_float')).count() < df_length * 0.4:
        num_value = tmp.groupBy(f).count().count()
        if num_value <= 10:
            col_type[f] = 'C'
            meta_data[f]['count'] = num_value
        else:
            col_type[f] = 'N'
            mi, _5, _25, median, _75, _95, mx = tmp.approxQuantile(
                f'{f}_to_float', [0.0, 0.005, 0.25, 0.5, 0.75, 0.995, 1.0], 0)
            # Alias the aggregates explicitly so the dictionary keys looked up below
            # do not depend on Spark's auto-generated column names, which include the
            # `_to_float` suffix and vary between versions.
            others = tmp.select(
                fn.mean(f'{f}_to_float').alias(f'avg({f})'),
                fn.stddev(f'{f}_to_float').alias(f'stddev_samp({f})'),
                fn.skewness(f'{f}_to_float').alias(f'skewness({f})')
            ).take(1)[-1].asDict()
            skew = others.get(f'skewness({f})', 0)
            meta_data[f]['max'] = mx
            meta_data[f]['min'] = mi
            meta_data[f]['median'] = median
            meta_data[f]['mean'] = others.get(f'avg({f})', 0)
            meta_data[f]['std'] = others.get(f'stddev_samp({f})', 0)
            meta_data[f]['skew'] = skew
            iqr = _75 - _25
            if skew > 2.5:
                low = _5
                high = mx + 1.5 * iqr
            elif skew < -2.5:
                low = mi - 1.5 * iqr
                high = _95
    percs['summary'] = [str(p) + '%' for p in percentiles]
    spark_describe = df_in.describe().toPandas()
    new_df = pd.concat([spark_describe, percs], ignore_index=True)
    new_df = new_df.round(2)
    return new_df[['summary'] + columns]

# Describe
selected = [s for s in df.columns if 'var_' in s]
print(selected)
df.select(selected).describe().show()

selected = ['var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5']
describe_pd(df, selected)
describe_pd(df, selected, deciles=True)

# Skewness and kurtosis
df.select(skewness('var_0'), kurtosis('var_0')).show()

# Plot histogram
var = ['var_108']
bins = np.arange(0, 105, 5.0)
df[var].describe().show()
plt.figure(figsize=(10, 8))
plt.hist(df_new[var].astype(float), alpha=0.8, histtype='bar', ec='black')

df.dtypes
df[df.var_100 < 14.0]

# Correlation matrix
selected = ['target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5']
features = df.select(selected).rdd.map(lambda row: row[0:])
        return True
    elif np.isnan(p):
        return False
    else:
        return False

# Create UDF funcs
get_pval_udf = F.udf(lambda vars: get_normal_pval(vars), FloatType())
if_norm_udf = F.udf(lambda p: if_norm(p), BooleanType())

# COMMAND ----------

eventsDataAll = eventsData.select('ActionGeo_FullName', 'wERA_3d', 'wERA_60d', 'nArticles') \
    .groupBy('ActionGeo_FullName') \
    .agg(
        F.skewness('wERA_3d'),
        F.kurtosis('wERA_3d'),
        F.stddev('wERA_3d'),
        F.variance('wERA_3d'),
        F.collect_list('wERA_3d').alias('list_wERA_3d'),
        F.skewness('wERA_60d'),
        F.kurtosis('wERA_60d'),
        F.stddev('wERA_60d'),
        F.variance('wERA_60d'),
        F.collect_list('wERA_60d').alias('list_wERA_60d'),
        F.sum('nArticles').alias('nArticles'),
        F.count(F.lit(1)).alias('n_observations')
    )

# get p-value and define normalcy
eventsDataAll = eventsDataAll.withColumn(
    .selectExpr(
        "total_purchases/total_transactions",
        "avg_purchases",
        "mean_purchases").show()

# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp

df.select(var_pop("Quantity"), var_samp("Quantity"),
          stddev_pop("Quantity"), stddev_samp("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis

df.select(skewness("Quantity"), kurtosis("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp

df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list

df.agg(collect_set("Country"), collect_list("Country")).show()

# COMMAND ----------

from pyspark.sql.functions import count
def run_pipeline(self):
    try:
        logging.info(
            "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/")
        # check collect_list and collect_set
        # collect_set() returns all values from an input column with duplicate values eliminated.
        # collect_list() returns all values from an input column with duplicates.
        logging.info(
            'run_pipeline method started --> '
            'https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/')
        simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                      ("Robert", "Sales", 4100), ("Maria", "Finance", 3000),
                      ("James", "Sales", 3000), ("Scott", "Finance", 3300),
                      ("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000),
                      ("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100)]
        schema = ["employee_name", "department", "salary"]
        df = self.spark.createDataFrame(data=simpleData, schema=schema).cache()
        df.show(truncate=False)

        from pyspark.sql.functions import approx_count_distinct, collect_list
        from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
        from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
        from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
        from pyspark.sql.functions import variance, var_samp, var_pop

        df.printSchema()
        df.show(truncate=False)
        print("approx_count_distinct: " +
              str(df.select(approx_count_distinct("salary")).collect()[0][0]))
        print("avg: " + str(df.select(avg("salary")).collect()[0][0]))
        df.select(collect_list("salary")).show(truncate=False)
        df.select(collect_set("salary")).show(truncate=False)
        df2 = df.select(countDistinct("department", "salary"))
        df2.show(truncate=False)
        print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))
        print("count: " + str(df.select(count("salary")).collect()[0]))
        dffirst = df.select(first("salary"))
        dffirst.show(truncate=False)
        df.select(last("salary")).show(truncate=False)
        df.select(kurtosis("salary")).show(truncate=False)
        df.select(max("salary")).show(truncate=False)
        df.select(min("salary")).show(truncate=False)
        df.select(mean("salary")).show(truncate=False)
        df.select(skewness("salary")).show(truncate=False)
        df.select(stddev("salary"), stddev_samp("salary"),
                  stddev_pop("salary")).show(truncate=False)
        df.select(sum("salary")).show(truncate=False)
        df.select(sumDistinct("salary")).show(truncate=False)
        df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
            .show(truncate=False)
        logging.info('run_pipeline method ended')
    except Exception as exp:
        logging.error("An error occurred while running the pipeline > " + str(exp))
        # send email notification
        # log error to database
        sys.exit(1)
    return
def skewTemperature(df, spark):
    return df.select(F.skewness('temperature')).first()[0]
# COMMAND ----------

# MAGIC %md
# MAGIC **Skewness**: a measure of symmetry, or more precisely, the lack of symmetry. A distribution, or data set, is symmetric if it looks the same to the left and right of the center point.
# MAGIC
# MAGIC **Kurtosis**: a measure of whether the data are heavy-tailed or light-tailed relative to a normal distribution. That is, data sets with high kurtosis tend to have heavy tails or outliers. Data sets with low kurtosis tend to have light tails or a lack of outliers.
# MAGIC
# MAGIC **Standard Deviation**: a statistical measure of the dispersion of the data relative to its mean. It is calculated as the square root of the variance. A low standard deviation indicates that the values tend to be close to the mean of the dataset, while a high standard deviation indicates that the values are spread out over a wider range.
# MAGIC
# MAGIC **Variance**: a measure of variability. It is calculated by taking the average of squared deviations from the mean. Variance tells you the degree of spread in your data set. The more spread out the data, the larger the variance is in relation to the mean.

# COMMAND ----------

goldsteinDataPartitioned = goldsteinData.select('ActionGeo_FullName', 'if_conflict', 'wGRA_1d', 'wGRA_60d', 'nArticles') \
    .groupBy('ActionGeo_FullName', 'if_conflict') \
    .agg(
        F.skewness('wGRA_1d'),
        F.kurtosis('wGRA_1d'),
        F.stddev('wGRA_1d'),
        F.variance('wGRA_1d'),
        F.collect_list('wGRA_1d').alias('list_wGRA_1d'),
        F.skewness('wGRA_60d'),
        F.kurtosis('wGRA_60d'),
        F.stddev('wGRA_60d'),
        F.variance('wGRA_60d'),
        F.collect_list('wGRA_60d').alias('list_wGRA_60d'),
        F.sum('nArticles').alias('nArticles'),
        F.count(F.lit(1)).alias('n_observations')
    )

goldsteinDataPartitioned.limit(4).toPandas()
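# The markdown cell above defines skewness, kurtosis, standard deviation, and
# variance; the grouped aggregation that follows computes them per location.
# Below is a minimal, self-contained sketch of the same four functions on a
# toy DataFrame. The `spark` session, the data, and the column name `x` are
# assumptions for illustration only, not part of the notebook above.
from pyspark.sql import functions as F

toy = spark.createDataFrame([(1.0,), (2.0,), (2.0,), (3.0,), (10.0,)], ["x"])
toy.agg(
    F.skewness("x").alias("skewness"),   # positive here: the long tail is on the right
    F.kurtosis("x").alias("kurtosis"),   # excess kurtosis relative to a normal distribution
    F.stddev("x").alias("stddev"),       # sample standard deviation (stddev_samp)
    F.variance("x").alias("variance"),   # sample variance (var_samp)
).show()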
# **Note:** Use `count(lit(1))` rather than `count(1)` as an alternative to `count("*")`.

# The `agg` method returns the same results and can be applied to grouped data:
rides.agg(count("*"), count("distance"), countDistinct("distance"),
          approx_count_distinct("distance")).show()

# Use the `sum` and `sumDistinct` functions to compute various column sums:
from pyspark.sql.functions import sum, sumDistinct
rides.agg(sum("distance"), sumDistinct("distance")).show()

# **Question:** When would one use the `sumDistinct` function?

# Spark SQL provides a number of summary statistics:
from pyspark.sql.functions import mean, stddev, variance, skewness, kurtosis
rides.agg(mean("distance"), stddev("distance"), variance("distance"),
          skewness("distance"), kurtosis("distance")).show()

# **Note:** `mean` is an alias for `avg`, `stddev` is an alias for the sample
# standard deviation `stddev_samp`, and `variance` is an alias for the sample
# variance `var_samp`. The population standard deviation and population
# variance are available via `stddev_pop` and `var_pop`, respectively.

# Use the `min` and `max` functions to compute the minimum and maximum, respectively:
from pyspark.sql.functions import min, max
rides.agg(min("distance"), max("distance")).show()

# Use the `first` and `last` functions to compute the first and last values, respectively:
from pyspark.sql.functions import first, last
rides \
    .orderBy("distance") \
    .agg(first("distance", ignorenulls=False),
         last("distance", ignorenulls=False)) \
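# To make the notes above concrete (count("*") vs count(col) vs count(lit(1)),
# and the sample vs. population aliases), here is a small hedged sketch. The
# `spark` session and the toy data are assumptions for illustration; the
# `rides` table above is not used.
from pyspark.sql.functions import count, lit, stddev, stddev_pop, variance, var_pop

toy = spark.createDataFrame([(1.0,), (2.0,), (None,), (4.0,)], "d double")
toy.agg(
    count("*").alias("rows"),             # 4: counts every row
    count("d").alias("non_null"),         # 3: nulls are skipped
    count(lit(1)).alias("lit_rows"),      # 4: same result as count("*")
    stddev("d").alias("stddev_samp"),     # sample: divides by n - 1
    stddev_pop("d").alias("stddev_pop"),  # population: divides by n
    variance("d").alias("var_samp"),
    var_pop("d").alias("var_pop"),
).show()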
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]
        try:
            input = ctx.populate_values(agg["input"], aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"], input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {"collect_set_int", "collect_set_float", "collect_set_string"}:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected
    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
    spark.read                          # Our DataFrameReader
    .option("header", "true")           # Let Spark know we have a header
    .option("inferSchema", "false")     # Inferring the schema (it is a small dataset)
    .format("com.databricks.spark.csv").csv(
        "/FileStore/tables/telecomData/churn_bigml_20-55239.csv",
        schema=schema, nullValue='NA')  # Enforce the schema
    .cache()                            # Mark the DataFrame as cached.
)

testDF.printSchema()
testDF.count()

# Data skew computed using pyspark functions; display as a pie chart and add to the dashboard
trainSet.select(f.skewness(trainSet['total_international_charge']),
                f.skewness(trainSet['total_day_charge']),
                f.skewness(trainSet['total_evening_charge']),
                f.skewness(trainSet['total_night_charge']))

# churn is related to the total international call charges:
trainSet.groupBy("churn").sum("total_international_charge").show()

# churn is related to the total international num of calls:
trainSet.groupBy("churn").sum("total_international_num_calls").show()

# Use Spark SQL to analyze the data
# create a temp view for persistence for this session
trainSet.createOrReplaceTempView("UserAccount")

# create a catalog as an interface that can be used to create, drop, alter, or query underlying databases, tables, functions
# MAGIC
# MAGIC ## Statistical functions
# MAGIC
# MAGIC - We can also apply some basic statistical functions using the Spark API

# COMMAND ----------

# standard deviation and variance
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"),
                         stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
dailyActivitiesDF.select(skewness("CaloriesBurned"),
                         kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and correlation
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Multiple languages in one notebook
# MAGIC
def describe_float_1d(df, column, current_result, nrows):
    if spark_version == "1.6+":
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum")).toPandas()
    else:
        # Older Spark: compute the higher moments with custom UDF-based aggregations
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            df_sum(col(column)).alias("sum")).toPandas()
        stats_df["variance"] = df.select(column).na.drop().agg(
            variance_custom(col(column), stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]
        stats_df["std"] = np.sqrt(stats_df["variance"])
        stats_df["skewness"] = df.select(column).na.drop().agg(
            skewness_custom(col(column), stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]
        stats_df["kurtosis"] = df.select(column).na.drop().agg(
            kurtosis_custom(col(column), stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]

    for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
        stats_df[pretty_name(x)] = (df.select(column).na.drop().selectExpr(
            "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
                col=column, n=x)).toPandas().iloc[:, 0])

    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    stats["mad"] = (df.select(column).na.drop().select(
        df_abs(col(column) - stats["mean"]).alias("delta")).agg(
            df_sum(col("delta"))).toPandas().iloc[0, 0] /
        float(current_result["count"]))
    stats["type"] = "NUM"
    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    # Large histogram
    imgdata = BytesIO()
    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
    figure = plt.figure(figsize=(6, 4))
    plot = plt.subplot()
    plt.bar(hist_data["left_edge"], hist_data["count"],
            width=hist_data["width"], facecolor='#337ab7')
    plot.set_ylabel("Frequency")
    plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1,
                                wspace=0, hspace=0)
    plot.figure.savefig(imgdata)
    imgdata.seek(0)
    stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
    # TODO Think about writing this to disk instead of caching them in strings
    plt.close(plot.figure)
    stats['mini_histogram'] = mini_histogram(hist_data)
    return stats
        np.percentile(df_in.select(x).collect(), percentiles)
        for x in columns
    ])
    percs = pd.DataFrame(percs, columns=columns)
    percs['summary'] = [str(p) + '%' for p in percentiles]
    spark_describe = df_in.describe().toPandas()
    new_df = pd.concat([spark_describe, percs], ignore_index=True)
    new_df = new_df.round(2)
    return new_df[['summary'] + columns]

describe_pd(ds, ['Score'])

# skewness and kurtosis
from pyspark.sql.functions import skewness, kurtosis
var = 'Score'
ds.select(skewness(var), kurtosis(var)).show()

# histogram
import matplotlib.pyplot as plt

var = 'Score'
plot_data = ds.select(var).toPandas()
x = plot_data[var]
bins = [0, 3.6, 3.8, 3.9, 4]
hist, bin_edges = np.histogram(x, bins,
                               weights=np.zeros_like(x) + 100. / x.size)

# make the histogram
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1)
# Plot the histogram heights against integers on the x axis
ax.bar(range(len(hist)), hist, width=1, alpha=0.8, ec='black', color='gold')
def describe_float_1d(df, column, current_result, nrows):
    # count(col == 0.0) would count every non-null row (the boolean expression is
    # never null after na.drop()), so the zero count is obtained by summing the
    # comparison cast to an integer instead.
    n_zeros_agg = df_sum((col(column) == 0.0).cast("integer")).alias("n_zeros")
    if spark_version == "1.6+":
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum"),
            n_zeros_agg).toPandas()
    else:
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            df_sum(col(column)).alias("sum"),
            n_zeros_agg).toPandas()
        stats_df["variance"] = df.select(column).na.drop().agg(
            variance_custom(col(column), stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]
        stats_df["std"] = np.sqrt(stats_df["variance"])
        stats_df["skewness"] = df.select(column).na.drop().agg(
            skewness_custom(col(column), stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]
        stats_df["kurtosis"] = df.select(column).na.drop().agg(
            kurtosis_custom(col(column), stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]

    for x in [0.05, 0.25, 0.5, 0.75, 0.95]:
        stats_df[pretty_name(x)] = (df.select(column).na.drop().selectExpr(
            "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
                col=column, n=x)).toPandas().iloc[:, 0])

    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    stats["mad"] = (df.select(column).na.drop().select(
        df_abs(col(column) - stats["mean"]).alias("delta")).agg(
            df_sum(col("delta"))).toPandas().iloc[0, 0] /
        float(current_result["count"]))
    stats["type"] = "NUM"
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    # Large histogram
    imgdata = BytesIO()
    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
    figure = plt.figure(figsize=(6, 4))
    plot = plt.subplot()
    plt.bar(hist_data["left_edge"], hist_data["count"],
            width=hist_data["width"], facecolor='#337ab7')
    plot.set_ylabel("Frequency")
    plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1,
                                wspace=0, hspace=0)
    plot.figure.savefig(imgdata)
    imgdata.seek(0)
    stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
    # TODO Think about writing this to disk instead of caching them in strings
    plt.close(plot.figure)
    stats['mini_histogram'] = mini_histogram(hist_data)
    return stats
def numStats(dataframe, field):
    '''
    This function works with both pandas and Spark dataframes.
    Parameter field must be a string, and it must reference a single column name.
    Parameter field must reference a numerical variable.
    This function does not consider null values in its calculations.
    '''
    if isinstance(dataframe, pd.DataFrame):
        df = dataframe[field].dropna()  # Selecting the column and dropping null values
        # Count of values
        count = df.count()
        countNullValues = dataframe.shape[0] - count
        # Central tendency
        mean_value = df.sum() / count
        median_value = df.median()
        # Min, max, and percentiles
        min_value = df.min()
        max_value = df.max()
        percentile_25 = df.quantile(0.25)
        percentile_75 = df.quantile(0.75)
        # Variation
        stddev_value = df.std()
        range_value = max_value - min_value
        IQR_value = percentile_75 - percentile_25
        # Shape
        skewness_value = df.skew()
        kurtosis_value = df.kurt()
    elif isinstance(dataframe, DataFrame):
        df = dataframe.select(field).dropna(how='any')
        # Count of values
        count = df.count()
        countNullValues = dataframe.select(field).count() - count
        # Central tendency
        # agg() returns a Spark dataframe; collect() returns a list of Rows, so
        # [0] is the first Row and [columns[0]] is the aggregated column's name.
        mean_process = df.agg(avg(col(field)))
        mean_value = mean_process.collect()[0][mean_process.columns[0]]
        # approxQuantile returns a list; [0] is the first element
        median_value = df.approxQuantile(col=field, probabilities=[0.5],
                                         relativeError=0.05)[0]
        # Min, max, and percentiles
        min_process = df.agg(min(col(field)))
        min_value = min_process.collect()[0][min_process.columns[0]]
        max_process = df.agg(max(col(field)))
        max_value = max_process.collect()[0][max_process.columns[0]]
        percentile_25 = df.approxQuantile(col=field, probabilities=[0.25],
                                          relativeError=0.05)[0]
        percentile_75 = df.approxQuantile(col=field, probabilities=[0.75],
                                          relativeError=0.05)[0]
        # Variation
        stddev_process = df.agg(stddev(col(field)))
        stddev_value = stddev_process.collect()[0][stddev_process.columns[0]]
        range_value = max_value - min_value          # Range of values
        IQR_value = percentile_75 - percentile_25    # Interquartile range
        # Shape
        skewness_process = df.agg(skewness(col(field)))
        skewness_value = skewness_process.collect()[0][skewness_process.columns[0]]
        kurtosis_process = df.agg(kurtosis(col(field)))
        kurtosis_value = round(kurtosis_process.collect()[0][kurtosis_process.columns[0]], 2)

    # Printing summary of statistics
    print('Summary of Descriptive Statistics - ', field)
    print('**********************************************************')
    print('Count of values           : ', count)
    print('Count of Null values      : ', countNullValues)
    print('Central tendency:-----------------------------------------')
    print('Mean(Average)             : ', round(mean_value, 2))
    print('Median(Percentile 50)     : ', round(median_value, 2))
    print('Min, Max, and Percentiles:--------------------------------')
    print('Minimum                   : ', round(min_value, 2))
    print('Maximum                   : ', round(max_value, 2))
    print('Percentile 25 (Q1)        : ', round(percentile_25, 2))
    print('Percentile 75 (Q3)        : ', round(percentile_75, 2))
    print('Variation:------------------------------------------------')
    print('Standard Deviation        : ', round(stddev_value, 2))
    print('Range                     : ', round(range_value, 2))
    print('Interquartile Range (IQR) : ', round(IQR_value, 2))
    print('Shape:----------------------------------------------------')
    print('Skewness                  : ', round(skewness_value, 2))
    print('Kurtosis                  : ', round(kurtosis_value, 2))
    print('**********************************************************')

    # Creating a dictionary with descriptive statistics
    data = {'Statistic': ['count', 'Count Null Values', 'mean', 'median', 'min',
                          'max', 'percentile25', 'percentile75', 'stddev',
                          'range', 'IQR', 'skewness', 'kurtosis'],
            'Values': [count, countNullValues, mean_value, median_value,
                       min_value, max_value, percentile_25, percentile_75,
                       stddev_value, range_value, IQR_value, skewness_value,
                       kurtosis_value]}
    # Creating a pandas dataframe
    summary_stats = pd.DataFrame(data)
    return summary_stats  # This function returns a pandas dataframe
               END AS minDist
        FROM distPOI0
        """
distPOI2 = spark.sql(query)
distPOI2.registerTempTable("distPOI2")
distPOI2.show()

# Stage 2 Analysis
# grouping data by POI
by_POI = distPOI2.groupBy("POI")
by_POI.avg("minDist").show()
by_POI.agg(F.stddev("minDist")).show()
by_POI.min("minDist").show()
by_POI.max("minDist").show()
by_POI.agg(F.skewness("minDist")).show()
by_POI.agg(F.kurtosis("minDist")).show()

query = """SELECT COUNT(_ID) Requests,
                  POI,
                  AVG(minDist) AS Mean,
                  percentile_approx(minDist, 0.5) AS Median,
                  MAX(minDist) AS poiRadius_km,
                  COUNT(_ID)/(3.14159*POWER(MAX(minDist),2)) AS Density_Requests_by_km2
           FROM distPOI2
           GROUP BY POI
        """
spark.sql(query).show()
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list,
                     relation):
    try:
        dataset = spark.read.parquet(dataset_add)
        label = ''
        for y in label_colm:
            label = y
        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if str(x.dataType) == "StringType":
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(dataset.select(colm).summary(value).toPandas()[colm])
                summaryListSubTemp = []
                for val in summ:
                    summaryListSubTemp.append(round(float(val), 4))
                # print(summaryListSubTemp)
                summaryListTemp.append(summaryListSubTemp)
            # varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
            # summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        # summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures

        skewnessList = []
        kurtosisList = []
        varianceList = []
        skewKurtVarDict = {}
        for colm in numericalFeatures:
            skewness = dataset.select(F.skewness(dataset[colm])).toPandas()
            for i, row in skewness.iterrows():
                for j, column in row.items():
                    skewnessList.append(round(column, 4))
            kurtosis = dataset.select(F.kurtosis(dataset[colm])).toPandas()
            for i, row in kurtosis.iterrows():
                for j, column in row.items():
                    kurtosisList.append(round(column, 4))
            variance = dataset.select(F.variance(dataset[colm])).toPandas()
            for i, row in variance.iterrows():
                for j, column in row.items():
                    varianceList.append(round(column, 4))
        for skew, kurt, var, colm in zip(skewnessList, kurtosisList,
                                         varianceList, numericalFeatures):
            print(skew, kurt, var)
            skewKurtVarList = [skew, kurt, var]
            skewKurtVarDict[colm] = skewKurtVarList
        for (keyOne, valueOne), (keyTwo, valueTwo) in zip(summaryDict.items(),
                                                          skewKurtVarDict.items()):
            print(keyOne, valueOne, keyTwo, valueTwo)
            if keyOne == keyTwo:
                valueOne.extend(valueTwo)
                summaryDict[keyOne] = valueOne
        print(summaryDict)
        # list.extend() returns None, so extend first and then print the list
        summaryList.extend(['skewness', 'kurtosis', 'variance'])
        print(summaryList)
        print(summaryDict)
        # for colm in numericalFeatures:
        #     skewness = (dataset.select(F.skewness(dataset[colm])).alias('skewness_' + colm))
        #     kurtosis = (dataset.select(F.kurtosis(dataset[colm])).alias('kurtosis_' + colm))
        #     variance = (dataset.select(F.variance(dataset[colm]).alias('kurtosis_' + colm)))

        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)
        dataset.show()

        for x in Schema:
            if str(x.dataType) == "StringType" and x.name == label:
                for labelkey in label_colm:
                    label_indexer = StringIndexer(
                        inputCol=label, outputCol='indexed_' + label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features

        response_chi_test = chi_square_test(dataset=dataset,
                                            features=indexed_features,
                                            label_col=label,
                                            stringFeatures=stringFeatures)

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)
        dataset.show()
        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features),
               ", ".join(str(k) for k in categorical_features.keys())))
        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()
        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40)
        rf = RandomForestClassifier(labelCol=label,
                                    featuresCol='vec_indexed_features',
                                    numTrees=10)
        model = rf.fit(train_data)
        predictions = model.transform(test_data)
        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        # feature_importance = [round(x, 4) for x in feature_importance]
        featureImportance = []
        for x in feature_importance:
            featureImportance.append(round(x, 4))
        print(featureImportance)
        features_column_for_user = numericalFeatures + stringFeatures
        feature_imp = {'feature_importance': featureImportance,
                       "feature_column": features_column_for_user}
        response_dict = {'feature_importance': feature_imp,
                         'ChiSquareTestData': response_chi_test,
                         'summaryDict': summaryDict}
        return response_dict
    except Exception as e:
        print("exception is = " + str(e))
"avg_purchases", "mean_purchases").show() # COMMAND ---------- from pyspark.sql.functions import var_pop, stddev_pop from pyspark.sql.functions import var_samp, stddev_samp df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"), stddev_samp("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import skewness, kurtosis df.select(skewness("Quantity"), kurtosis("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import corr, covar_pop, covar_samp df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"), covar_pop("InvoiceNo", "Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import collect_set, collect_list df.agg(collect_set("Country"), collect_list("Country")).show()
def skew(data_frame, measure_column_name):
    return data_frame.select(
        FN.skewness(measure_column_name)).collect()[0][0]
.option("inferSchema", "true") # Infering the schema (it is a small dataset) .csv(fileName) # Location of our data .cache() # Mark the DataFrame as cached. ) trainDF.count() # Materialize the cache trainDF.printSchema() testDF = (spark.read.option("header", "true").option( "inferSchema", "true").format("com.databricks.spark.csv").load( "/FileStore/tables/test.csv").cache()) testDF.printSchema() testDF.count() #skewness trainDF.select(f.skewness(trainDF['cont1']), f.skewness(trainDF['cont2']), f.skewness(trainDF['cont3']), f.skewness(trainDF['cont10'])) #show high correlation observed from scatter plot trainDF.stat.corr("cont12", "cont11") trainDF.stat.corr("cont1", "cont9") trainDF.stat.corr("cont14", "loss") trainDF.createOrReplaceTempView("insurance") spark.sql("SELECT avg(insurance.loss) as AVG_LOSS FROM insurance").show() spark.sql("SELECT min(insurance.loss) as MIN_LOSS FROM insurance").show() spark.sql("SELECT max(insurance.loss) as MAX_LOSS FROM insurance").show() #rename loss to label
def get_data(self):
    """ Returns statistics about attributes in a data frame """
    from pyspark.sql import functions

    # Correlation pairs
    corr_pairs = list(
        chunks(list(itertools.product(self.attrs, self.attrs)),
               len(self.attrs)))
    # Cache data
    self.data.cache()
    df_count = self.data.count()
    # TODO: Implement median using df.approxQuantile('col', [.5], .25)
    stats = []
    for i, name in enumerate(self.attrs):
        df_col = functions.col(name)
        stats.append(functions.lit(name))
        stats.append(functions.max(df_col).alias('max_{}'.format(name)))
        stats.append(functions.min(df_col).alias('min_{}'.format(name)))
        if name in self.numeric_attrs:
            stats.append(
                functions.round(functions.stddev(df_col),
                                4).alias('stddev_{}'.format(name)))
        else:
            stats.append(functions.lit('-'))
        stats.append(functions.count(df_col).alias('count_{}'.format(name)))
        if name in self.numeric_attrs:
            stats.append(
                functions.round(functions.avg(df_col),
                                4).alias('avg_{}'.format(name)))
        else:
            stats.append(functions.lit('-'))
        stats.append(
            functions.approx_count_distinct(df_col).alias(
                'distinct_{}'.format(name)))
        stats.append((df_count - functions.count(df_col)).alias(
            'missing_{}'.format(name)))
        if name in self.numeric_attrs:
            stats.append(
                functions.round(functions.skewness(df_col),
                                2).alias('skewness_{}'.format(name)))
            stats.append(
                functions.round(functions.kurtosis(df_col),
                                2).alias('kurtosis_{}'.format(name)))
        else:
            stats.append(functions.lit('-'))
            stats.append(functions.lit('-'))
        if self.params['correlation']:
            for pair in corr_pairs[i]:
                if all([pair[0] in self.numeric_attrs,
                        pair[1] in self.numeric_attrs]):
                    stats.append(
                        functions.round(functions.corr(*pair),
                                        4).alias('corr_{}'.format(i)))
                else:
                    stats.append(functions.lit('-'))

    self.data = self.data.agg(*stats)
    aggregated = self.data.take(1)[0]
    n = len(self.names)
    rows = [aggregated[i:i + n] for i in range(0, len(aggregated), n)]
    return {"rows": rows,
            "attributes": self.get_column_names().split(',')}
if p < alpha: # if norm return True elif np.isnan(p) == True: return False else: return False # Create UDF funcs get_pval_udf = F.udf(lambda vars: get_normal_pval(vars), FloatType()) if_norm_udf = F.udf(lambda p: if_norm(p), BooleanType()) # COMMAND ---------- toneDataAll = toneData.select('ActionGeo_FullName', 'wTRA_1d', 'wTRA_60d', 'nArticles') \ .groupBy('ActionGeo_FullName') \ .agg( F.skewness('wTRA_1d'), F.kurtosis('wTRA_1d'), F.stddev('wTRA_1d'), F.variance('wTRA_1d'), F.collect_list('wTRA_1d').alias('list_wTRA_1d'), F.skewness('wTRA_60d'), F.kurtosis('wTRA_60d'), F.stddev('wTRA_60d'), F.variance('wTRA_60d'), F.collect_list('wTRA_60d').alias('list_wTRA_60d'), F.sum('nArticles').alias('nArticles'), F.count(F.lit(1)).alias('n_observations') ) # get p-value and define normalcy toneDataAll = toneDataAll.withColumn('p_value_1d', get_pval_udf(toneDataAll.list_wTRA_1d))
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " +
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))
print("avg: " + str(df.select(avg("salary")).collect()[0][0]))
df.select(collect_list("salary")).show(truncate=False)
df.select(collect_set("salary")).show(truncate=False)
df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))
print("count: " + str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"),
          stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
    .show(truncate=False)
def desc_stats(self, dataframe, field):
    # Parameters:
    #   dataframe: must be a Spark dataframe
    #   field: string value; it must match a single column name
    # About null values: this function does not consider null values in its calculations

    # Importing libraries and modules
    from pyspark.sql.functions import col, avg, min, max, stddev, skewness, kurtosis

    # Selecting the column and dropping null values
    df = dataframe.select(field).dropna(how='any')

    # Count of values
    count = df.count()

    # Central tendency
    # agg() returns a Spark dataframe; collect() returns a list of Rows, so
    # [0] is the first Row and [columns[0]] is the aggregated column's name.
    mean_process = df.agg(avg(col(field)))
    mean_value = mean_process.collect()[0][mean_process.columns[0]]
    # approxQuantile returns a list; [0] is the first element
    median_value = df.approxQuantile(col=field, probabilities=[0.5],
                                     relativeError=0.05)[0]

    # Min, max, and percentiles
    min_process = df.agg(min(col(field)))
    min_value = min_process.collect()[0][min_process.columns[0]]
    max_process = df.agg(max(col(field)))
    max_value = max_process.collect()[0][max_process.columns[0]]
    percentile_25 = df.approxQuantile(col=field, probabilities=[0.25],
                                      relativeError=0.05)[0]
    percentile_75 = df.approxQuantile(col=field, probabilities=[0.75],
                                      relativeError=0.05)[0]

    # Variation
    stddev_process = df.agg(stddev(col(field)))
    stddev_value = stddev_process.collect()[0][stddev_process.columns[0]]
    range_value = max_value - min_value          # Range of values
    IQR_value = percentile_75 - percentile_25    # Interquartile range

    # Shape
    skewness_process = df.agg(skewness(col(field)))
    skewness_value = skewness_process.collect()[0][skewness_process.columns[0]]
    kurtosis_process = df.agg(kurtosis(col(field)))
    kurtosis_value = round(kurtosis_process.collect()[0][kurtosis_process.columns[0]], 2)

    # Printing summary of statistics
    print('Summary of Descriptive Statistics - ', field)
    print('**********************************************************')
    print('Count of values           : ', count)
    print('Central tendency:-----------------------------------------')
    print('Mean(Average)             : ', round(mean_value, 2))
    print('Median(Percentile 50)     : ', round(median_value, 2))
    print('Min, Max, and Percentiles:--------------------------------')
    print('Minimum                   : ', round(min_value, 2))
    print('Maximum                   : ', round(max_value, 2))
    print('Percentile 25 (Q1)        : ', round(percentile_25, 2))
    print('Percentile 75 (Q3)        : ', round(percentile_75, 2))
    print('Variation:------------------------------------------------')
    print('Standard Deviation        : ', round(stddev_value, 2))
    print('Range                     : ', round(range_value, 2))
    print('Interquartile Range (IQR) : ', round(IQR_value, 2))
    print('Shape:----------------------------------------------------')
    print('Skewness                  : ', round(skewness_value, 2))
    print('Kurtosis                  : ', round(kurtosis_value, 2))
    print('**********************************************************')

    # Creating a dictionary with descriptive statistics
    data = {
        'Statistic': ['count', 'mean', 'median', 'min', 'max', 'percentile25',
                      'percentile75', 'stddev', 'range', 'iqr', 'skewness',
                      'kurtosis'],
        'Values': [count, mean_value, median_value, min_value, max_value,
                   percentile_25, percentile_75, stddev_value, range_value,
                   IQR_value, skewness_value, kurtosis_value]
    }
    # Creating a pandas dataframe
    summary_stats = pd.DataFrame(data)
    return summary_stats  # This function returns a pandas dataframe