import re
from itertools import chain

from pyspark.sql.functions import col, lit, percentile_approx


# Test method from a unittest.TestCase; checks the string form of the generated
# Column expressions, including the default accuracy of 10000.
def test_percentile_approx(self):
    actual = list(
        chain.from_iterable(
            [
                re.findall("(percentile_approx\\(.*\\))", str(x))
                for x in [
                    percentile_approx(col("foo"), lit(0.5)),
                    percentile_approx(col("bar"), 0.25, 42),
                    percentile_approx(col("bar"), [0.25, 0.5, 0.75]),
                    percentile_approx(col("foo"), (0.05, 0.95), 100),
                    percentile_approx("foo", 0.5),
                    percentile_approx("bar", [0.1, 0.9], lit(10)),
                ]
            ]
        )
    )
    expected = [
        "percentile_approx(foo, 0.5, 10000)",
        "percentile_approx(bar, 0.25, 42)",
        "percentile_approx(bar, array(0.25, 0.5, 0.75), 10000)",
        "percentile_approx(foo, array(0.05, 0.95), 100)",
        "percentile_approx(foo, 0.5, 10000)",
        "percentile_approx(bar, array(0.1, 0.9), 10)",
    ]
    self.assertListEqual(actual, expected)
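# A minimal runnable sketch (not from the test suite) of the call forms the test
# asserts on, assuming a local SparkSession. A single percentage yields a scalar
# column, a list yields an array column, and accuracy defaults to 10000.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, percentile_approx

spark = SparkSession.builder.master("local[1]").getOrCreate()
demo_df = spark.createDataFrame([(float(i),) for i in range(1, 101)], ["foo"])
demo_df.select(
    percentile_approx("foo", 0.5).alias("p50"),                   # scalar
    percentile_approx(col("foo"), [0.25, 0.75], 100).alias("q"),  # array
).show()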
from pyspark.sql.functions import percentile_approx


def median(self, sparkDataFrame, columnNames):
    '''
    https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.functions.percentile_approx.html
    Note: percentile_approx always returns a value present in the data; with an
    even number of rows it does not interpolate between the two middle values.
    '''
    self.logger.warn("median BEGIN")
    retDF = sparkDataFrame.select(
        *(percentile_approx(c, 0.5, accuracy=1000000) for c in columnNames))
    self.logger.warn("median retDF numRows:{} numCols:{}".format(
        retDF.count(), len(retDF.columns)))
    self.logger.warn("median END\n")
    return retDF
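# A hypothetical caller sketch illustrating the docstring's caveat: on an even
# number of rows percentile_approx returns one of the two middle data values
# (here 2.0), not the interpolated midpoint 2.5. Assumes a SparkSession `spark`.
from pyspark.sql.functions import percentile_approx

even_df = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (4.0,)], ["a"])
even_df.select(percentile_approx("a", 0.5).alias("median_a")).show()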
from pyspark.sql.functions import col, lit, percentile_approx


def calcular_metrica_percentil(viajes_didier_df, percentil):
    # Exclude records whose kilometros or precio_kilometro is 0, negative, or null
    viajes_didier_df = viajes_didier_df.filter(viajes_didier_df.kilometros > 0)
    viajes_didier_df = viajes_didier_df.filter(
        viajes_didier_df.precio_kilometro > 0)
    viajes_didier_df = viajes_didier_df.withColumn(
        "Ingreso_por_Viaje", col("kilometros") * col("precio_kilometro"))

    # Total income per person, ordered from lowest to highest income
    personas_ingresos_df = viajes_didier_df.groupBy("identificador").sum(
        "Ingreso_por_Viaje")
    personas_ingresos_df = personas_ingresos_df.orderBy(
        col("sum(Ingreso_por_Viaje)").asc(), col("identificador").asc())

    # Clamp the requested percentile to the valid range [0, 100]
    if percentil < 0:
        percentil = 0
    if percentil > 100:
        percentil = 100

    metrica = "percentil_" + str(percentil)

    # Compute the requested percentile
    valor_percentil_df = personas_ingresos_df.select(
        percentile_approx("sum(Ingreso_por_Viaje)",
                          [percentil / 100])[0].alias("Valor"))
    valor_percentil_df = valor_percentil_df.withColumn("Tipo_de_Metrica",
                                                       lit(metrica))
    valor_percentil_df = valor_percentil_df.select(col("Tipo_de_Metrica"),
                                                   col("Valor"))
    return valor_percentil_df
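# A hypothetical usage sketch for calcular_metrica_percentil, assuming a
# SparkSession named `spark`; the column names match those the function filters on.
viajes_demo = spark.createDataFrame(
    [("p1", 10.0, 2.0), ("p1", 5.0, 3.0), ("p2", 8.0, 1.5), ("p3", 0.0, 4.0)],
    ["identificador", "kilometros", "precio_kilometro"],
)
# Returns a one-row DataFrame with Tipo_de_Metrica = "percentil_50" and Valor =
# the 50th percentile of total income per person (the p3 row is filtered out
# because kilometros is 0).
calcular_metrica_percentil(viajes_demo, 50).show()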
from pyspark.sql import functions as F


def compute_multicol_stats(data, colnames, whis, precision):
    # Compute mean, median, Q1 and Q3 per column with percentile_approx,
    # using 1/precision as the accuracy parameter.
    scol = []
    for colname in colnames:
        scol.append(
            F.percentile_approx(
                "`%s`" % colname, [0.25, 0.50, 0.75], int(1.0 / precision)
            ).alias("{}_percentiles".format(colname))
        )
        scol.append(F.mean("`%s`" % colname).alias("{}_mean".format(colname)))

    # Example result layout:
    #      a_percentiles  a_mean    b_percentiles  b_mean
    # 0  [3.0, 3.2, 3.2]    3.18  [5.1, 5.9, 6.4]    5.86
    pdf = data._internal.resolved_copy.spark_frame.select(*scol).toPandas()

    # Columns alternate percentiles/mean per input column, so step through in pairs.
    i = 0
    multicol_stats = {}
    for colname in colnames:
        q1, med, q3 = pdf.iloc[0, i]
        iqr = q3 - q1
        lfence = q1 - whis * iqr
        ufence = q3 + whis * iqr
        i += 1
        mean = pdf.iloc[0, i]
        i += 1
        multicol_stats[colname] = {
            "mean": mean,
            "med": med,
            "q1": q1,
            "q3": q3,
            "lfence": lfence,
            "ufence": ufence,
        }
    return multicol_stats
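# A standalone sketch of the same whisker-fence arithmetic for one column,
# assuming a SparkSession named `spark`; whis=1.5 and precision=0.01 are
# illustrative values, not taken from the snippet above.
from pyspark.sql import functions as F

box_df = spark.createDataFrame(
    [(float(v),) for v in [1, 2, 4, 7, 11, 16, 22]], ["a"])
row = box_df.select(
    F.percentile_approx("a", [0.25, 0.50, 0.75], int(1.0 / 0.01)).alias("pcts"),
    F.mean("a").alias("mean"),
).first()
q1, med, q3 = row["pcts"]
iqr = q3 - q1
print({"mean": row["mean"], "med": med, "q1": q1, "q3": q3,
       "lfence": q1 - 1.5 * iqr, "ufence": q3 + 1.5 * iqr})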
import warnings
from typing import List

from pyspark.sql import DataFrame
# These intentionally shadow the Python builtins min/max/sum with the Spark
# aggregate functions used below.
from pyspark.sql.functions import expr, max, min, percentile_approx, sum


# checkAllowableFreq, freq_dict and ResampleWarning are defined elsewhere in
# this module.
def calculate_time_horizon(df: DataFrame, ts_col: str, freq: str,
                           partition_cols: List[str]):
    # Convert frequency using the resample dictionary
    parsed_freq = checkAllowableFreq(freq)
    freq = f"{parsed_freq[0]} {freq_dict[parsed_freq[1]]}"

    # Get max and min timestamp per partition
    partitioned_df: DataFrame = df.groupBy(*partition_cols).agg(
        max(ts_col).alias("max_ts"),
        min(ts_col).alias("min_ts"),
    )

    # Generate upscale metrics
    normalized_time_df: DataFrame = (
        partitioned_df
        .withColumn("min_epoch_ms", expr("unix_millis(min_ts)"))
        .withColumn("max_epoch_ms", expr("unix_millis(max_ts)"))
        .withColumn(
            "interval_ms",
            expr(
                f"unix_millis(cast('1970-01-01 00:00:00.000+0000' as TIMESTAMP) + INTERVAL {freq})"
            ),
        )
        .withColumn("rounded_min_epoch",
                    expr("min_epoch_ms - (min_epoch_ms % interval_ms)"))
        .withColumn("rounded_max_epoch",
                    expr("max_epoch_ms - (max_epoch_ms % interval_ms)"))
        .withColumn("diff_ms", expr("rounded_max_epoch - rounded_min_epoch"))
        .withColumn("num_values", expr("(diff_ms/interval_ms) + 1"))
    )

    (
        min_ts,
        max_ts,
        min_value_partition,
        max_value_partition,
        p25_value_partition,
        p50_value_partition,
        p75_value_partition,
        total_values,
    ) = normalized_time_df.select(
        min("min_ts"),
        max("max_ts"),
        min("num_values"),
        max("num_values"),
        percentile_approx("num_values", 0.25),
        percentile_approx("num_values", 0.5),
        percentile_approx("num_values", 0.75),
        sum("num_values"),
    ).first()

    warnings.simplefilter("always", ResampleWarning)
    warnings.warn(
        f"""
        Resample Metrics Warning:
            Earliest Timestamp: {min_ts}
            Latest Timestamp: {max_ts}
            No. of Unique Partitions: {normalized_time_df.count()}
            Resampled Min No. Values in a Single Partition: {min_value_partition}
            Resampled Max No. Values in a Single Partition: {max_value_partition}
            Resampled P25 No. Values in a Single Partition: {p25_value_partition}
            Resampled P50 No. Values in a Single Partition: {p50_value_partition}
            Resampled P75 No. Values in a Single Partition: {p75_value_partition}
            Resampled Total No. Values Across All Partitions: {total_values}
        """,
        ResampleWarning,
    )
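# A worked sketch (plain Python, hypothetical numbers) of the per-partition
# arithmetic above for freq = "1 minutes", i.e. interval_ms = 60000: both epochs
# are floored to the interval boundary before counting expected values.
min_epoch_ms, max_epoch_ms, interval_ms = 65_000, 310_000, 60_000
rounded_min_epoch = min_epoch_ms - (min_epoch_ms % interval_ms)  # 60000
rounded_max_epoch = max_epoch_ms - (max_epoch_ms % interval_ms)  # 300000
num_values = (rounded_max_epoch - rounded_min_epoch) / interval_ms + 1
print(num_values)  # 5.0 resampled values expected for this partition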
from pyspark.sql import functions as F
from pyspark.sql import types as T

# `exlodeVisits` (defined elsewhere) expands a date_range_start / visits_by_day
# pair into a map of date -> visit count.
udfExpand = F.udf(exlodeVisits, T.MapType(T.DateType(), T.IntegerType()))

df = spark.read.csv('nyc_restaurant_pattern.csv', header=True, escape='"') \
    .select("placekey", "safegraph_place_id",
            F.explode(udfExpand('date_range_start', 'visits_by_day'))
             .alias('date', 'visits'))
# .where(f"date=='{date}'")

# Credit to the professor: I leverage this piece of code from class
categories = ["big_box_grocers", "convenience_stores", "drinking_places",
              "full_service_restaurants", "limited_service_restaurants",
              "pharmacies_and_drug_stores", "snack_and_bakeries",
              "specialty_food_stores", "supermarkets_except_convenience_stores"]

# Write one CSV directory per category under OUTPUT_PREFIX.
for c in categories:
    df.join(filteredCorePlaces, ["placekey"], "inner").groupBy("date", "file_name") \
        .agg(F.percentile_approx("visits", 0.5).alias('median'),
             F.round(F.stddev("visits")).cast("integer").alias('std')) \
        .withColumn("low", F.when(F.col("std") > F.col("median"), 0)
                            .otherwise(F.col("median") - F.col("std"))) \
        .withColumn("high", F.col("median") + F.col("std")) \
        .withColumn("year", F.year("date")) \
        .withColumn("project_date", F.add_months(F.col("date"), 12)) \
        .sort(F.col("year"), F.col("project_date")) \
        .where((F.col("year").isin(2019, 2020)) & (F.col("file_name") == c)) \
        .select(F.col("year"), F.col("project_date").alias('date'),
                F.col("median"), F.col("low"), F.col("high")) \
        .coalesce(1).write.mode("overwrite").option("header", True) \
        .format("csv").save("{}/{}".format(OUTPUT_PREFIX, c))
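# A minimal sketch of the per-date aggregation in isolation, assuming a
# SparkSession named `spark`; the low bound is clamped at 0 when the standard
# deviation exceeds the median, exactly as in the pipeline above.
from pyspark.sql import functions as F

visits_demo = spark.createDataFrame(
    [("2019-03-01", 10), ("2019-03-01", 14), ("2019-03-01", 30)],
    ["date", "visits"],
)
(visits_demo.groupBy("date")
    .agg(F.percentile_approx("visits", 0.5).alias("median"),
         F.round(F.stddev("visits")).cast("integer").alias("std"))
    .withColumn("low", F.when(F.col("std") > F.col("median"), 0)
                        .otherwise(F.col("median") - F.col("std")))
    .withColumn("high", F.col("median") + F.col("std"))
    .show())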