def add_send_order_column(df):
    # Rank each party's rows by earliest slot date, then by the number of
    # available slots, breaking remaining ties with the "rank" column.
    window = (Window.partitionBy(F.col("prim_party_id"))
              .orderBy(F.col("min_slot_date").asc(),
                       F.col("num_avail_slots").asc(),
                       F.col("rank").asc()))
    return (df
            .withColumn("min_slot_date", F.array_min("action_slots"))
            .withColumn("num_avail_slots", F.size("action_slots"))
            .select("*", F.dense_rank().over(window).alias("send_order")))
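# Minimal usage sketch: the imports below are what the function above assumes
# to be in scope, and df is any DataFrame with "prim_party_id", "rank", and an
# array column "action_slots" (names taken from the function, df is assumed).
from pyspark.sql import Window
from pyspark.sql import functions as F

ordered = add_send_order_column(df)
ordered.select("prim_party_id", "min_slot_date", "send_order").show()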
def parse_col(name):
    col = f.col(name)
    # Treat the "-" placeholder as a null value.
    col = f.when(col == '-', None).otherwise(col)
    if name in dtypes:
        dtype = dtypes[name]
        # Array-typed columns arrive as comma-separated strings.
        if isinstance(dtype, t.ArrayType):
            col = f.split(col, ",")
        col = col.cast(dtype)
    if name in needsMinVal:
        col = f.array_min(col)
    return col.alias(name)
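# A sketch of how parse_col is likely applied; dtypes and needsMinVal are
# module-level lookups the function assumes (the values here are illustrative):
from pyspark.sql import functions as f, types as t

dtypes = {"scores": t.ArrayType(t.DoubleType())}
needsMinVal = {"scores"}

parsed = df.select([parse_col(c) for c in df.columns])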
import time

import pyspark
from pyspark.sql import SparkSession
# note: importing sum shadows the builtin, as the code below expects
from pyspark.sql.functions import (array, array_min, col, collect_set, concat,
                                   explode, least, size, sum)


def Calculate_CCF(graph):
    iteration = 0
    done = False
    while not done:
        iteration += 1
        startPair = newPair.value

        # CCF-Iterate MAP: emit each edge in both directions.
        ccf_iterate_map = graph.union(
            graph.select(col("value").alias("key"), col("key").alias("value")))

        # CCF-Iterate REDUCE: for each key, take the minimum of the key and
        # its adjacency list, keeping only keys that are not their own minimum.
        ccf_iterate_reduce_pair = (ccf_iterate_map
            .groupBy(col("key"))
            .agg(collect_set("value").alias("value"))
            .withColumn("min", least(col("key"), array_min("value")))
            .filter(col("key") != col("min")))

        # Count the new pairs produced in this iteration.
        newPair += (ccf_iterate_reduce_pair
                    .withColumn("count", size("value") - 1)
                    .select(sum("count"))
                    .collect()[0][0])

        ccf_iterate_reduce = (ccf_iterate_reduce_pair
            .select(col("min").alias("a_min"),
                    concat(array(col("key")), col("value")).alias("valueList"))
            .withColumn("valueList", explode("valueList"))
            .filter(col("a_min") != col("valueList"))
            .select(col("a_min").alias("key"), col("valueList").alias("value")))

        # CCF-Dedup MAP & REDUCE: drop duplicate edges.
        ccf_dedup_reduce = ccf_iterate_reduce.distinct()
        graph = ccf_dedup_reduce

        # Converged when no new pair was produced in this iteration.
        if startPair == newPair.value:
            done = True
        print("Iteration:", iteration, "Number of newPair:", newPair.value)
    return graph


# MAIN #
if __name__ == "__main__":
    sc = pyspark.SparkContext(appName="Spark_RDD")
    spark = SparkSession.builder.getOrCreate()
    newPair = sc.accumulator(0)

    dataset_path = "/user/user335/dataset/ccf"
    dataset = sc.textFile(dataset_path + "/web-Google.txt", use_unicode=False)
    graph = prepare_dataset(dataset)

    t1 = time.perf_counter()
    graph = Calculate_CCF(graph)
    t2 = time.perf_counter()
    print("calculation time (s):", t2 - t1)
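# prepare_dataset is not shown above; a plausible sketch, assuming the SNAP
# web-Google.txt edge-list format (tab-separated node pairs, "#" comment
# lines). In the actual script this would precede the main block.
def prepare_dataset(dataset):
    edges = (dataset
             .filter(lambda line: not line.startswith("#"))
             .map(lambda line: line.split("\t"))
             .map(lambda pair: (int(pair[0]), int(pair[1]))))
    return spark.createDataFrame(edges, ["key", "value"])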
def min_element_column(parquetFiles):
    parquetFiles = parquetFiles.withColumn('bars_confidence_min', F.array_min(col('bars_confidence')))
    parquetFiles = parquetFiles.withColumn('bars_start_min', F.array_min(col('bars_start')))
    parquetFiles = parquetFiles.withColumn('beats_confidence_min', F.array_min(col('beats_confidence')))
    parquetFiles = parquetFiles.withColumn('segments_confidence_min', F.array_min(col('segments_confidence')))
    parquetFiles = parquetFiles.withColumn('segments_loudness_max_time_min', F.array_min(col('segments_loudness_max_time')))
    parquetFiles = parquetFiles.withColumn('tatums_confidence_min', F.array_min(col('tatums_confidence')))
    return parquetFiles
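# The same transformation written as a loop over the column names used above;
# behavior is identical, and the withColumn boilerplate is not repeated
# (the _compact name is only for illustration):
def min_element_column_compact(parquetFiles):
    for c in ['bars_confidence', 'bars_start', 'beats_confidence',
              'segments_confidence', 'segments_loudness_max_time',
              'tatums_confidence']:
        parquetFiles = parquetFiles.withColumn(c + '_min', F.array_min(col(c)))
    return parquetFiles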
def assoc_fn(df: DataFrame, group_by_cols):
    gbc = [col(x) for x in group_by_cols]
    h_fn = partial(harmonic_fn,
                   partition_cols=group_by_cols,
                   over_col="evs_score",
                   output_col=harmonic_col)
    assoc_df = (df
        # Scale evidence_score to [0, 1], capping at 1.0.
        .withColumn("evs_score",
                    array_min(array(col("evidence_score") / 10.0, lit(1.0))))
        .transform(h_fn)
        .groupBy(*gbc)
        .agg(countDistinct(col("pmid")).alias("f"),
             mean(col("evidence_score")).alias("mean"),
             stddev(col("evidence_score")).alias("std"),
             max(col("evidence_score")).alias("max"),
             min(col("evidence_score")).alias("min"),
             expr("approx_percentile(evidence_score, array(0.25, 0.5, 0.75))").alias("q"),
             count(col("pmid")).alias("N"),
             first(col(harmonic_col)).alias(harmonic_col))
        .withColumn("median", element_at(col("q"), 2))
        .withColumn("q1", element_at(col("q"), 1))
        .withColumn("q3", element_at(col("q"), 3))
        .drop("q"))
    return assoc_df
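# harmonic_fn is defined elsewhere; below is a sketch of a harmonic-sum scorer
# that matches the call signature above. This is an assumption about its
# behavior, not the original implementation:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, pow as pow_, sum as sum_

def harmonic_fn(df, partition_cols, over_col, output_col):
    w = Window.partitionBy(*partition_cols).orderBy(col(over_col).desc())
    return (df
            .withColumn("_rank", row_number().over(w))
            # Harmonic sum: each score is damped by the square of its rank.
            .withColumn(output_col,
                        sum_(col(over_col) / pow_(col("_rank"), 2))
                        .over(Window.partitionBy(*partition_cols)))
            .drop("_rank"))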
def filter_slots_group_adjacency_days(df):
    udf_action_slots = SpaceEmails.udf_action_slots()
    return (df
        .withColumn("min_slot_date", F.array_min("action_slots"))
        .withColumn("last_group_adjacency_days",
                    F.datediff(F.col("min_slot_date"),
                               F.col("last_group_delivery_date")))
        .withColumn("days_to_add",
                    F.col("group_adjacency_days") - F.col("last_group_adjacency_days"))
        .withColumn("p_start_group_adjacency_days",
                    F.expr("date_add(min_slot_date, days_to_add)").cast(T.TimestampType()))
        .withColumn("action_slots_",
                    udf_action_slots(F.col("p_start_group_adjacency_days"),
                                     F.col("p_end_date"),
                                     F.col("action_slots")))
        .drop("days_to_add", "action_slots",
              "p_start_group_adjacency_days", "min_slot_date")
        .withColumnRenamed("action_slots_", "action_slots"))
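# SpaceEmails.udf_action_slots() is external; judging from the call site it
# returns a UDF that trims the slot array to a start/end window. A purely
# hypothetical sketch of such a UDF:
@F.udf(returnType=T.ArrayType(T.TimestampType()))
def udf_action_slots(start, end, slots):
    if slots is None:
        return None
    # Keep only slots on or after the adjusted start and on or before the end.
    return [s for s in slots
            if (start is None or s >= start) and (end is None or s <= end)]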
def main(args):
    sparkConf = (SparkConf()
                 .set("spark.driver.memory", "10g")
                 .set("spark.executor.memory", "10g")
                 .set("spark.driver.maxResultSize", "0")
                 .set("spark.debug.maxToStringFields", "2000")
                 .set("spark.sql.execution.arrow.maxRecordsPerBatch", "500000"))
    if args.local:
        spark = (SparkSession.builder
                 .config(conf=sparkConf)
                 .master('local[*]')
                 .getOrCreate())
    else:
        spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

    print('args: ', args)
    print('Spark version: ', spark.version)
    start_time = time()

    # Load co-occurrences from the parquet dataset at the given path.
    coocs = spark.read.parquet(args.in_cooccurrences)

    # Not all data is ready to be used; per the filter below, each group must
    # have at least 12 data points spread over at least 3 distinct years, with
    # data reaching 2019 or later.
    w2 = Window.partitionBy(*predictions_grouped_keys)

    # Curry the association function with the keys to group by.
    tfn = partial(assoc_fn, group_by_cols=grouped_keys)

    aggregated = (coocs
        .withColumn("year", year(coocs.pubDate))
        .withColumn("month", month(coocs.pubDate))
        .withColumn("day", lit(1))
        .filter((coocs.isMapped == True)
                & (coocs.type == "GP-DS")
                & col("year").isNotNull()
                & col("month").isNotNull())
        .selectExpr(*coocs_columns)
        .transform(tfn)
        .withColumn("ds", to_date(concat_ws("-", col("year"), col("month"), col("day"))))
        .withColumn("y", col(harmonic_col))
        .dropna(subset=predictions_selection_keys)
        .withColumn("years", collect_set(col("year")).over(w2))
        .withColumn("nYears", array_size(col("years")))
        .withColumn("minYear", array_min(col("years")))
        .withColumn("maxYear", array_max(col("years")))
        .withColumn("dtCount", count(col("y")).over(w2))
        .withColumn("dtMaxYear", max(col("year")).over(w2))
        .filter((col("maxYear") >= 2019)
                & (col("nYears") >= 3)
                & (col("dtCount") >= 12))
        .select(*predictions_selection_keys)
        .repartition(*predictions_grouped_keys)
        .persist())

    aggregated.write.parquet(f"{args.out_prefix}/associationsFromCoocsTS")
    print('Completed aggregated data in {:.1f} secs'.format(time() - start_time))

    # Generate the models.
    start_time = time()
    fbp = (aggregated
           .groupBy(*predictions_grouped_keys)
           .applyInPandas(make_predictions, prediction_schema)
           .persist())
    # fbp.show(20, False)
    fbp.write.parquet(f"{args.out_prefix}/associationsFromCoocsTSPredictions")
    print('Completed TS analysis (FB Prophet) data in {:.1f} secs'.format(
        time() - start_time))

    # Clean up, just in case.
    spark.stop()
    return 0
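# make_predictions and prediction_schema are defined elsewhere; a hypothetical
# sketch of a per-group Prophet forecaster matching the applyInPandas call.
# The "ds"/"y" column names come from the pipeline above; the horizon, output
# columns, and the prophet package itself are assumptions:
import pandas as pd
from prophet import Prophet

def make_predictions(pdf: pd.DataFrame) -> pd.DataFrame:
    model = Prophet()
    model.fit(pdf[["ds", "y"]])
    # Forecast 12 months past the observed data.
    future = model.make_future_dataframe(periods=12, freq="MS")
    forecast = model.predict(future)[["ds", "yhat", "yhat_lower", "yhat_upper"]]
    # Re-attach the (constant) group keys so the output matches prediction_schema.
    for k in predictions_grouped_keys:
        forecast[k] = pdf[k].iloc[0]
    return forecast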
# Okay, now we have a list of outages, restore_times, locations, core_ids.
# First let's calculate some high-level metrics.

# Size of outages.
pw_finalized_outages = pw_finalized_outages.withColumn(
    "cluster_size", F.size(F.array_distinct("core_id")))

# Standard deviation of outage times.
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_stddev", F.explode("outage_times"))
# This expression essentially takes the first value of each column
# (which should all be the same after the explode).
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns
         if x != 'outage_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"), *exprs)

# Range of outage times.
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_range",
    F.array_max("outage_times") - F.array_min("outage_times"))

# Standard deviation and range of restore times.
pw_finalized_outages = pw_finalized_outages.withColumn(
    "restore_times", col("restore_time"))
pw_finalized_outages = pw_finalized_outages.withColumn(
    "restore_time", F.explode("restore_time"))
# Again, take the first value of each column not touched by the explode.
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns
         if x != 'restore_time' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.avg("restore_time").alias("restore_times_mean"), *exprs)

pw_finalized_outages = pw_finalized_outages.withColumn(
    "restore_times_stddev", F.explode("restore_times"))
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns
         if x != 'restore_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.stddev_pop("restore_times_stddev").alias("restore_times_stddev"), *exprs)

pw_finalized_outages = pw_finalized_outages.withColumn(
    "restore_times_range",
    F.array_max("restore_times") - F.array_min("restore_times"))
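# A sketch of an alternative that computes the same population stddev without
# the explode / groupBy round trip, using Spark SQL higher-order functions
# (Spark >= 2.4); the helper column names are illustrative only:
pw_finalized_outages = (pw_finalized_outages
    .withColumn("_n", F.size("outage_times"))
    .withColumn("_mean",
                F.expr("aggregate(outage_times, 0D, (acc, x) -> acc + x)") / F.col("_n"))
    .withColumn("outage_times_stddev2",
                F.sqrt(F.expr("aggregate(outage_times, 0D,"
                              " (acc, x) -> acc + pow(x - _mean, 2))") / F.col("_n")))
    .drop("_n", "_mean"))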
import pickle

from pyspark.sql import SparkSession
from pyspark.sql.functions import (array_max, array_min, element_at, split,
                                   udf)
from pyspark.sql.types import ArrayType, FloatType, MapType, StringType


def foreach_jdbc_writer(df, epoch_id):
    df.write.jdbc(
        url="jdbc:mysql://localhost/world",
        table="amazon_products",
        mode='append',
        properties={"driver": "com.mysql.cj.jdbc.Driver", "user": "******"})


spark = SparkSession.builder.master('local[2]').appName('StreamingDemo').getOrCreate()

df = (spark.readStream.format('kafka')
      .option('kafka.bootstrap.servers', 'localhost:9092')
      .option('subscribe', 'amazon')
      .load())

# Deserialize the pickled Kafka message value into a string-to-string map.
deser = udf(lambda x: pickle.loads(x), MapType(StringType(), StringType()))
deserializedDF = df.withColumn('map', deser(df['value']))

parsedDF = (deserializedDF
    .withColumn('title', element_at('map', 'productTitle'))
    .withColumn('Categories', element_at('map', 'productCategories'))
    .withColumn('Rating', element_at('map', 'productRating'))
    .withColumn('Description', element_at('map', 'productDescription'))
    .withColumn('Prices', element_at('map', 'productPrices'))
    # Split the price string on "#...$" separators and keep the extremes.
    .withColumn('Min_Price',
                array_min(split(element_at('map', 'productPrices'), r'#*\$')
                          .cast(ArrayType(FloatType()))))
    .withColumn('Max_Price',
                array_max(split(element_at('map', 'productPrices'), r'#*\$')
                          .cast(ArrayType(FloatType())))))

projectedDF = parsedDF.select('title', 'Categories', 'Rating', 'Prices',
                              'Min_Price', 'Max_Price')

result = projectedDF.writeStream.foreachBatch(foreach_jdbc_writer).start()
result.awaitTermination()
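# A restart-safe variant of the sink above would normally set a checkpoint
# location before start(), so the stream can recover its Kafka offsets after
# a failure; the path below is illustrative only:
result = (projectedDF.writeStream
          .option("checkpointLocation", "/tmp/amazon_products_ckpt")
          .foreachBatch(foreach_jdbc_writer)
          .start())
result.awaitTermination()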
# Standard deviation of outage times.
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_stddev", F.explode("outage_times"))
# This expression essentially takes the first value of each column
# (which should all be the same after the explode).
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns
         if x != 'outage_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"), *exprs)

# Range of outage times.
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_range",
    F.array_max("outage_times") - F.array_min("outage_times"))

# Okay, now to effectively calculate SAIDI/SAIFI we need to know the sensor
# population. Join the number of sensors reporting (the metric above) with our
# outage groupings; then we can calculate the relative SAIDI/SAIFI
# contribution of each outage.
pw_finalized_outages = pw_finalized_outages.join(
    pw_distinct_user_id,
    F.date_trunc("day", F.from_unixtime(pw_finalized_outages["outage_time"]))
    == F.date_trunc("day", pw_distinct_user_id["window_mid_point"]))

pw_finalized_outages = pw_finalized_outages.select(
    "outage_time", "cluster_size", "phones_reporting", "user_id",
    "outage_times", "outage_times_range", "outage_times_stddev")

pw_finalized_outages = pw_finalized_outages.withColumn(
    "relative_cluster_size",
    # The original snippet is truncated here; given the comment above, the
    # cluster size relative to the reporting population is presumably:
    F.col("cluster_size") / F.col("phones_reporting"))