Example #1
def add_send_order_column(df):
    send_order_window = (Window.partitionBy(F.col("prim_party_id"))
                         .orderBy(F.col("prim_party_id"),
                                  F.col("min_slot_date").asc(),
                                  F.col("num_avail_slots").asc(),
                                  F.col("rank").asc()))
    return (df.withColumn("min_slot_date", F.array_min("action_slots"))
            .withColumn("num_avail_slots", F.size("action_slots"))
            .select("*", F.dense_rank().over(send_order_window)
                    .alias("send_order")))
Example #2
def parse_col(name):
    col = f.col(name)
    col = f.when(col == '-', None).otherwise(col)
    if name in dtypes:
        dtype = dtypes[name]
        if isinstance(dtype, t.ArrayType):
            col = f.split(col, ",")
        col = col.cast(dtypes[name])
    if name in needsMinVal:
        col = f.array_min(col)
    
    return col.alias(name)
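parse_col depends on two module-level lookups that are not shown in this example: dtypes, a column-name-to-DataType mapping, and needsMinVal, the set of columns to collapse to their array minimum. A hypothetical setup (the column names here are made up for illustration):

from pyspark.sql import functions as f, types as t

dtypes = {
    "scores": t.ArrayType(t.IntegerType()),  # '-' becomes null, then split + cast
    "age": t.IntegerType(),
}
needsMinVal = {"scores"}  # columns reduced to their smallest element

# Applied to every column of a DataFrame df:
# parsed = df.select([parse_col(c) for c in df.columns])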
Example #3
import time

import pyspark
from pyspark.sql import SparkSession
# Note: pyspark's sum deliberately shadows the built-in sum; it is used below.
from pyspark.sql.functions import (array, array_min, col, collect_set, concat,
                                   explode, least, size, sum)


def Calculate_CCF(graph):
    iteration = 0
    done = False

    while not done:

        iteration += 1
        startPair = newPair.value

        # CCF-Iterate MAP
        ccf_iterate_map = graph.union(graph.select(col("value").alias("key"), col("key").alias("value")))

        # CCF-Iterate REDUCE
        ccf_iterate_reduce_pair = ccf_iterate_map.groupBy(col("key")).agg(collect_set("value").alias("value"))\
                                            .withColumn("min", least(col("key"), array_min("value")))\
                                            .filter((col('key')!=col('min')))

        newPair += ccf_iterate_reduce_pair.withColumn("count", size("value")-1).select(sum("count")).collect()[0][0]

        ccf_iterate_reduce = ccf_iterate_reduce_pair.select(col("min").alias("a_min"), concat(array(col("key")), col("value")).alias("valueList"))\
                                                    .withColumn("valueList", explode("valueList"))\
                                                    .filter((col('a_min')!=col('valueList')))\
                                                    .select(col('a_min').alias("key"), col('valueList').alias("value"))

        # CCF-Dedup MAP & REDUCE
        ccf_dedup_reduce = ccf_iterate_reduce.distinct()

        graph = ccf_dedup_reduce

        if startPair == newPair.value:
            done = True

        print("Iteration:", iteration, "Number of newPair:", newPair.value)
    
    return graph

# MAIN #  
if __name__ == "__main__":

    sc = pyspark.SparkContext(appName="Spark_RDD")
    spark = SparkSession.builder.getOrCreate()
    newPair = sc.accumulator(0)
    
    dataset_path = "/user/user335/dataset/ccf"
    dataset = sc.textFile(dataset_path + "/web-Google.txt", use_unicode=False)

    graph = prepare_dataset(dataset)

    t1 = time.perf_counter()
    graph = Calculate_CCF(graph)
    t2 = time.perf_counter()

    print("calculation time (s) :", t2 - t1)
Example #4
def min_element_column(parquetFiles):
    parquetFiles = parquetFiles.withColumn('bars_confidence_min', F.array_min(col('bars_confidence')))
    parquetFiles = parquetFiles.withColumn('bars_start_min', F.array_min(col('bars_start')))
    parquetFiles = parquetFiles.withColumn('beats_confidence_min', F.array_min(col('beats_confidence')))
    parquetFiles = parquetFiles.withColumn('segments_confidence_min', F.array_min(col('segments_confidence')))
    parquetFiles = parquetFiles.withColumn('segments_loudness_max_time_min',
                                           F.array_min(col('segments_loudness_max_time')))
    parquetFiles = parquetFiles.withColumn('tatums_confidence_min', F.array_min(col('tatums_confidence')))

    return parquetFiles
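The repeated withColumn calls above could equally be driven from a list of column names; a compact equivalent sketch, assuming the same F/col imports as the original snippet:

array_cols = ["bars_confidence", "bars_start", "beats_confidence",
              "segments_confidence", "segments_loudness_max_time",
              "tatums_confidence"]
for c in array_cols:
    # Same behavior: add a <name>_min column holding the array minimum.
    parquetFiles = parquetFiles.withColumn(c + "_min", F.array_min(col(c)))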
Example #5
def assoc_fn(df: DataFrame, group_by_cols):
    gbc = [col(x) for x in group_by_cols]
    h_fn = partial(harmonic_fn,
                   partition_cols=group_by_cols,
                   over_col="evs_score",
                   output_col=harmonic_col)
    assoc_df = (
        df.withColumn("evs_score",
                      array_min(array(col("evidence_score") / 10.0, lit(1.0))))
        .transform(h_fn)
        .groupBy(*gbc)
        .agg(countDistinct(col("pmid")).alias("f"),
             mean(col("evidence_score")).alias("mean"),
             stddev(col("evidence_score")).alias("std"),
             max(col("evidence_score")).alias("max"),
             min(col("evidence_score")).alias("min"),
             expr("approx_percentile(evidence_score, array(0.25, 0.5, 0.75))")
             .alias("q"),
             count(col("pmid")).alias("N"),
             first(col(harmonic_col)).alias(harmonic_col))
        .withColumn("median", element_at(col("q"), 2))
        .withColumn("q1", element_at(col("q"), 1))
        .withColumn("q3", element_at(col("q"), 3))
        .drop("q"))

    return assoc_df
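The array_min(array(...)) expression in assoc_fn is a clamping idiom: it builds a two-element array of evidence_score / 10.0 and 1.0 and keeps the smaller, so evs_score never exceeds 1.0. A standalone sketch of just that idiom, with toy values that are not from the original pipeline:

from pyspark.sql import SparkSession
from pyspark.sql.functions import array, array_min, col, lit

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(3.0,), (25.0,)], ["evidence_score"])
toy.withColumn(
    "evs_score",
    array_min(array(col("evidence_score") / 10.0, lit(1.0)))).show()
# evidence_score = 3.0  -> evs_score = 0.3
# evidence_score = 25.0 -> evs_score = 1.0 (clamped)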
Example #6
def filter_slots_group_adjacency_days(df):
    udf_action_slots = SpaceEmails.udf_action_slots()
    return (
        df.withColumn("min_slot_date", F.array_min("action_slots"))
        .withColumn(
            "last_group_adjacency_days",
            F.datediff(F.col("min_slot_date"),
                       F.col("last_group_delivery_date")))
        .withColumn(
            "days_to_add",
            F.col("group_adjacency_days") - F.col("last_group_adjacency_days"))
        .withColumn(
            "p_start_group_adjacency_days",
            F.expr("date_add(min_slot_date, days_to_add)").cast(
                T.TimestampType()))
        .withColumn(
            "action_slots_",
            udf_action_slots(F.col("p_start_group_adjacency_days"),
                             F.col("p_end_date"), F.col("action_slots")))
        .drop("days_to_add", "action_slots",
              "p_start_group_adjacency_days", "min_slot_date")
        .withColumnRenamed("action_slots_", "action_slots"))
Example #7
def main(args):
    sparkConf = (SparkConf()
                 .set("spark.driver.memory", "10g")
                 .set("spark.executor.memory", "10g")
                 .set("spark.driver.maxResultSize", "0")
                 .set("spark.debug.maxToStringFields", "2000")
                 .set("spark.sql.execution.arrow.maxRecordsPerBatch", "500000"))

    if args.local:
        spark = (SparkSession.builder.config(
            conf=sparkConf).master('local[*]').getOrCreate())
    else:
        spark = (SparkSession.builder.config(conf=sparkConf).getOrCreate())

    print('args: ', args)
    print('Spark version: ', spark.version)
    start_time = time()

    # load co-occurrences from parquet dataset coming from path
    coocs = (spark.read.parquet(args.in_cooccurrences))

    # we need some filtering; not all data is ready to be used
    # 1. at least 2 data points per month
    # 2. there must be data for the year 2020
    w2 = Window.partitionBy(*predictions_grouped_keys)

    # curry function to pass to transform with the keys to group by
    tfn = partial(assoc_fn, group_by_cols=grouped_keys)
    aggregated = (
        coocs.withColumn("year", year(coocs.pubDate))
        .withColumn("month", month(coocs.pubDate))
        .withColumn("day", lit(1))
        .filter((coocs.isMapped == True) & (coocs.type == "GP-DS")
                & col("year").isNotNull()
                & col("month").isNotNull())
        .selectExpr(*coocs_columns)
        .transform(tfn)
        .withColumn("ds",
                    to_date(concat_ws("-", col("year"), col("month"),
                                      col("day"))))
        .withColumn("y", col(harmonic_col))
        .dropna(subset=predictions_selection_keys)
        .withColumn("years", collect_set(col("year")).over(w2))
        .withColumn("nYears", array_size(col("years")))
        .withColumn("minYear", array_min(col("years")))
        .withColumn("maxYear", array_max(col("years")))
        .withColumn("dtCount", count(col("y")).over(w2))
        .withColumn("dtMaxYear", max(col("year")).over(w2))
        .filter((col("maxYear") >= 2019) & (col("nYears") >= 3)
                & (col("dtCount") >= 12))
        .select(*predictions_selection_keys)
        .repartition(*predictions_grouped_keys)
        .persist())

    aggregated.write.parquet(f"{args.out_prefix}/associationsFromCoocsTS")
    print('Completed aggregated data in {:.1f} secs'.format(time() -
                                                            start_time))

    # generate the models
    start_time = time()

    fbp = (aggregated.groupBy(*predictions_grouped_keys).applyInPandas(
        make_predictions, prediction_schema).persist())

    # fbp.show(20, False)

    fbp.write.parquet(f"{args.out_prefix}/associationsFromCoocsTSPredictions")
    print('Completed TS analysis (FB Prophet) data in {:.1f} secs'.format(
        time() - start_time))

    # clean all up just in case
    spark.stop()
    return 0
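main expects an argparse-style namespace exposing at least local, in_cooccurrences and out_prefix, the attributes referenced above. A hypothetical entry point wiring those arguments up (not part of the original module):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in_cooccurrences", required=True)
    parser.add_argument("--out_prefix", required=True)
    parser.add_argument("--local", action="store_true")
    main(parser.parse_args())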
Example #8
#Okay now we have a list of outages, restore_times, locations, core_ids
#First let's calculate some high level metrics

#size of outages
pw_finalized_outages = pw_finalized_outages.withColumn("cluster_size", F.size(F.array_distinct("core_id")))

#standard deviation outage times
pw_finalized_outages = pw_finalized_outages.withColumn("outage_times_stddev", F.explode("outage_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'outage_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"),*exprs)

#range of outage times
pw_finalized_outages = pw_finalized_outages.withColumn("outage_times_range", F.array_max("outage_times") - F.array_min("outage_times"))

#standard deviation and range of restore times
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times", col("restore_time"))
pw_finalized_outages = pw_finalized_outages.withColumn("restore_time", F.explode("restore_time"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'restore_time' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.avg("restore_time").alias("restore_times_mean"),*exprs)

pw_finalized_outages = pw_finalized_outages.withColumn("restore_times_stddev", F.explode("restore_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'restore_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.stddev_pop("restore_times_stddev").alias("restore_times_stddev"),*exprs)
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times_range", F.array_max("restore_times") - F.array_min("restore_times"))
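The explode / groupBy / F.first pattern above exists because there is no built-in array standard deviation: each array is unpacked into rows, stddev_pop is taken per outage_time, and F.first recovers the other columns, which are constant within a group. A toy check of the core step, assuming a SparkSession named spark and the same F alias as the snippet:

toy = spark.createDataFrame([(1, [10.0, 12.0, 14.0])],
                            ["outage_time", "outage_times"])
(toy.withColumn("v", F.explode("outage_times"))
    .groupBy("outage_time")
    .agg(F.stddev_pop("v").alias("outage_times_stddev"))
    .show())
# population stddev of [10, 12, 14] is about 1.63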
Example #9
import pickle

from pyspark.sql import SparkSession
from pyspark.sql.functions import array_max, array_min, element_at, split, udf
from pyspark.sql.types import ArrayType, FloatType, MapType, StringType


def foreach_jdbc_writer(df, epoch_id):
    df.write.jdbc(url="jdbc:mysql://localhost/world",
                  table="amazon_products",
                  mode='append',
                  properties={"driver": "com.mysql.cj.jdbc.Driver",
                              "user": "******"})


spark = SparkSession.builder.master('local[2]').appName(
    'StreamingDemo').getOrCreate()

df = spark.readStream.format('kafka')\
    .option('kafka.bootstrap.servers','localhost:9092')\
    .option('subscribe','amazon')\
    .load()

deser = udf(lambda x: pickle.loads(x), MapType(StringType(), StringType()))

deserializedDF = df.withColumn('map', deser(df['value']))
parsedDF = deserializedDF.withColumn('title',element_at('map','productTitle'))\
    .withColumn('Categories',element_at('map','productCategories'))\
    .withColumn('Rating',element_at('map','productRating'))\
    .withColumn('Description',element_at('map','productDescription'))\
    .withColumn('Prices',element_at('map','productPrices'))\
    .withColumn('Min_Price',array_min(split(element_at('map','productPrices'),r'#*\$').cast(ArrayType(FloatType()))))\
    .withColumn('Max_Price',array_max(split(element_at('map','productPrices'),r'#*\$').cast(ArrayType(FloatType()))))

projectedDF = parsedDF.select('title', 'Categories', 'Rating', 'Prices',
                              'Min_Price', 'Max_Price')

result = projectedDF.writeStream.foreachBatch(foreach_jdbc_writer).start()

result.awaitTermination()
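The Min_Price / Max_Price columns work by splitting the raw productPrices string on '$' (optionally preceded by '#') and casting the pieces to floats; the leading empty piece becomes null and is ignored by array_min / array_max. A standalone sketch with a made-up price string:

from pyspark.sql import SparkSession
from pyspark.sql.functions import array_max, array_min, split
from pyspark.sql.types import ArrayType, FloatType

spark = SparkSession.builder.getOrCreate()
prices = spark.createDataFrame([("$12.99#$9.49",)], ["productPrices"])
(prices
 .withColumn("parts",
             split("productPrices", r'#*\$').cast(ArrayType(FloatType())))
 .select(array_min("parts").alias("Min_Price"),
         array_max("parts").alias("Max_Price"))
 .show())
# "$12.99#$9.49" -> [null, 12.99, 9.49] -> Min_Price 9.49, Max_Price 12.99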
Example #10
#standard deviation outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_stddev", F.explode("outage_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [
    F.first(x).alias(x) for x in pw_finalized_outages.columns
    if x != 'outage_times_stddev' and x != 'outage_time'
]
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"), *exprs)

#range of outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_range",
    F.array_max("outage_times") - F.array_min("outage_times"))

#Okay now to effectively calculate SAIDI/SAIFI we need to know the sensor population
#join the number of sensors reporting metric above with our outage groupings
#then we can calculate the relative SAIDI/SAIFI contribution of each outage
pw_finalized_outages = pw_finalized_outages.join(
    pw_distinct_user_id,
    F.date_trunc("day", F.from_unixtime(
        pw_finalized_outages["outage_time"])) == F.date_trunc(
            "day", pw_distinct_user_id["window_mid_point"]))

pw_finalized_outages = pw_finalized_outages.select(
    "outage_time", "cluster_size", "phones_reporting", "user_id",
    "outage_times", "outage_times_range", "outage_times_stddev")
pw_finalized_outages = pw_finalized_outages.withColumn(
    "relative_cluster_size",