Example #1
def run_spark_job(spark):

   
    # Create the Kafka source with a max offset of 200 per trigger,
    # the correct bootstrap server and port, and "test" as the topic

    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "test") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("maxRatePerPartition", 100) \    
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS")) \
        .select("SERVICE_CALLS.*")

    distinct_table = service_table \
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    
    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
            ).count()

   
    # counts_df no longer carries the raw timestamp, so the udf conversion
    # is applied to distinct_table instead
    converted_df = distinct_table.withColumn(
        "call_date_time", udf_convert_time(distinct_table.call_datetime))

    
    calls_per_2_days = converted_df \
        .groupBy(
        psf.window(converted_df.call_date_time, "2 day")
    ).agg(psf.count("crime_id").alias("calls_per_2_day")).select("calls_per_2_day")

    # TODO write output stream
    query = counts_df \
        .writeStream \
        .outputMode('Complete') \
        .format('console') \
        .start()

    # TODO attach a ProgressReporter
    query.awaitTermination()
Example #2
def runSpark(spark):

    df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "service-calls") \
    .option("maxOffsetPerTrigger", "250") \
    .option("startingOffsets", "earliest") \
    .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")
    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table\
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))


    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
            ).count()

    # counts_df is a Spark DataFrame, so pandas-style .apply() is not available;
    # convert the timestamp with the udf on the column that actually exists
    converted_df = distinct_table.withColumn(
        "call_date_time", udf_convert_time(distinct_table.call_datetime))

    query1 = counts_df \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()



    calls_per_2_days = distinct_table \
        .withWatermark("call_datetime", "2880 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "60 minutes", "30 minutes"),
            distinct_table.original_crime_type_name
            ).count()

    query2 = calls_per_2_days \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    query1.awaitTermination()
    query2.awaitTermination()
Example #3
def run_spark_job(spark):

    # TODO Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "service-calls") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table\
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    # TODO get different types of original_crime_type_name in 60 minutes interval
    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
            ).count()

    # TODO use udf to convert timestamp to right format on a call_date_time column

    converted_df = distinct_table.withColumn(
        "call_date_time", udf_convert_time(distinct_table.call_datetime))

    # TODO apply aggregations using windows function to see how many calls occurred in 2 day span
    calls_per_2_days = converted_df \
        .groupBy(
        psf.window(converted_df.call_date_time, "2 day")
    ).agg(psf.count("crime_id").alias("calls_per_2_day")).select("calls_per_2_day")

    # TODO write output stream
    query = counts_df \
        .writeStream \
        .outputMode('Complete') \
        .format('console') \
        .start()

    # TODO attach a ProgressReporter
    query.awaitTermination()
Example #4
def run_spark_job(spark):

    df = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("subscribe", "service-calls") \
        .option("startingOffsets", "earliest") \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table\
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('address_type'),
                psf.col('disposition'))

    counts_df = distinct_table.withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime,
                       "10 minutes", "10 minutes"),
            distinct_table.original_crime_type_name
    ).count()

    converted_df = distinct_table.withColumn(
        "call_date_time", udf_convert_time(distinct_table.call_datetime))
    # converted_df.printSchema()

    # apply aggregations using windows function to see how many calls occurred in 2 day span
    calls_per_2_days = converted_df \
        .withWatermark("call_datetime", "2 day") \
        .groupBy(
        psf.window(converted_df.call_date_time, "2 day")
    ).agg(psf.count("crime_id").alias("calls_per_2_day")).select("calls_per_2_day")
    # calls_per_2_days.printSchema()


    query = calls_per_2_days.writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    query.awaitTermination()
Example #5
    def hexercise_in_streaming(self):
        static_name = 'retail_data'

        self.static_data_frame.createOrReplaceTempView(static_name)
        static_schema = self.static_data_frame.schema

        self.static_data_frame \
            .selectExpr(
            "CustomerId",
            "(UnitPrice * Quantity) as total_cost",
            "InvoiceDate") \
            .groupBy(
            functions.col("CustomerId"), functions.window(functions.col("InvoiceDate"), "1 day")) \
            .sum("total_cost") \
            .show(5)

        streaming_data_frame = self.spark.readStream \
            .schema(static_schema) \
            .option("maxFilesPerTrigger", 1) \
            .format("csv") \
            .option("header", "true") \
            .option("inferSchema", "true") \
            .load("/Spark-The-Definitive-Guide/data/retail-data/by-day/*.csv")

        purchase_by_customer_per_hour = streaming_data_frame \
            .selectExpr(
            "CustomerId",
            "(UnitPrice * Quantity) as total_cost",
            "InvoiceDate") \
            .groupBy(functions.col("CustomerId"), functions.window(functions.col("InvoiceDate"), "1 day")) \
            .sum("total_cost")

        stream = purchase_by_customer_per_hour.writeStream \
            .format("memory") \
            .queryName("customer_purchases") \
            .outputMode("complete") \
            .start()

        # check every X sec
        # values in range and sleep need to be adjusted per machine
        for _ in range(0, 6):
            self.spark.sql("""
             SELECT *
             FROM customer_purchases
             ORDER BY `sum(total_cost)` DESC
             """) \
                .show(5)

            sleep(4)

        # stop the stream gracefully
        stream.stop()
Example #6
def run_spark_job(spark):

    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "com.udacity.police.calls.new") \
        .option("startingOffset", "earliest") \
        .option("maxRatePerPartition", 10000) \
        .option("maxOffsetPerTrigger", 200) \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)",
                             "CAST(timestamp AS timestamp)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"), psf.col('timestamp'))\
        .select("timestamp", "DF.*")

    service_table.printSchema()

    distinct_table = service_table.select(
        'original_crime_type_name', 'disposition',
        psf.to_timestamp('timestamp').alias('call_date_time2'))
    #.dropDuplicates()

    agg_df = distinct_table.select('call_date_time2', 'original_crime_type_name', 'disposition') \
    .withWatermark('call_date_time2', '10 minutes') \
    .groupBy(
             psf.window('call_date_time2', "5 minutes", "2 minutes"),
             #'call_date_time2',
             'original_crime_type_name',
             'disposition') \
    .count()
#    .orderBy(psf.desc('count'))

    #    query = agg_df \
    #    .writeStream \
    #    .outputMode("complete") \
    #    .format("console") \
    #    .start()
    #    query.awaitTermination()

    radio_code_json_filepath = "/home/workspace/radio_code.json"
    radio_code_df = spark.read.option("multiLine",
                                      True).json(radio_code_json_filepath)

    radio_code_df = radio_code_df.withColumnRenamed("disposition_code",
                                                    "disposition")

    # Join on the disposition column; several join types (inner, left_outer, ...) were tried here
    join_query = agg_df.join(radio_code_df,
                             radio_code_df.disposition == agg_df.disposition,
                             "left_outer")

    query2 = join_query.writeStream.outputMode("complete").format(
        "console").start()
    query2.awaitTermination()
Example #7
def track_model_quality(real, predicted):
    quality_compare = predicted.join(real, "pid")
    quality_compare = quality_compare.withColumn(
        'accurate_prediction',
        F.when((F.col('quality')==F.col('predicted_quality')), 1)\
        .otherwise(0)
    )

    accurate_prediction_summary = (
        quality_compare
        .groupBy(
            F.window(F.col('process_time'), '1 day').alias('window'),
            F.col('accurate_prediction'))
        .count()
        .withColumn('window_day', F.expr('to_date(window.start)'))
        .withColumn('total',
                    F.sum(F.col('count')).over(Window.partitionBy('window_day')))
        .withColumn('ratio', F.col('count') * 100 / F.col('total'))
        .select('window_day', 'accurate_prediction', 'count', 'total', 'ratio')
        .withColumn(
            'accurate_prediction',
            F.when(F.col('accurate_prediction') == 1, 'Accurate')
            .otherwise('Inaccurate'))
        .orderBy('window_day'))
    return accurate_prediction_summary
Example #8
    def test_grouped_over_window(self):

        data = [(0, 1, "2018-03-10T00:00:00+00:00", [0]),
                (1, 2, "2018-03-11T00:00:00+00:00", [0]),
                (2, 2, "2018-03-12T00:00:00+00:00", [0]),
                (3, 3, "2018-03-15T00:00:00+00:00", [0]),
                (4, 3, "2018-03-16T00:00:00+00:00", [0]),
                (5, 3, "2018-03-17T00:00:00+00:00", [0]),
                (6, 3, "2018-03-21T00:00:00+00:00", [0])]

        expected = {0: [0],
                    1: [1, 2],
                    2: [1, 2],
                    3: [3, 4, 5],
                    4: [3, 4, 5],
                    5: [3, 4, 5],
                    6: [6]}

        df = self.spark.createDataFrame(data, ['id', 'group', 'ts', 'result'])
        df = df.select(col('id'), col('group'), col('ts').cast('timestamp'), col('result'))

        def f(pdf):
            # Assign each result element the ids of the windowed group
            pdf['result'] = [pdf['id']] * len(pdf)
            return pdf

        result = df.groupby('group', window('ts', '5 days')).applyInPandas(f, df.schema)\
            .select('id', 'result').collect()
        for r in result:
            self.assertListEqual(expected[r[0]], r[1])
Example #9
def task_a_2_step_1_final(spark):
    a2_struct = T.StructType([
        T.StructField("datetime_start", T.TimestampType()),
        T.StructField("datetime_end", T.TimestampType()),
        T.StructField("map_topics", T.MapType(
            T.StringType(),
            T.ArrayType(T.StringType())
        ))
    ])

    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "topics-by-state_step-0").parse_json(a2_struct) \
        .withWatermark("datetime_end", "1 minute").groupBy(
        F.window("datetime_end", "3 hour", "1 hour")
    ) \
        .agg(
        F.first("window.start").alias("timestamp_start"),
        F.first("window.end").alias("timestamp_end"),
        F.collect_list("map_topics").alias("statistics")
    ) \
        .select(
        F.struct(
            F.concat(F.hour('timestamp_start'), lit(":"), F.minute('timestamp_start')).alias("time_start"),
            F.concat(F.hour('timestamp_end'), lit(":"), F.minute('timestamp_end')).alias("time_end"),
            concat_maps_udf(col('statistics')).alias("statistics")
        ).alias("res")
    ).send_to_kafka(config.BOOTSTRAP_SERVERS, "topics-by-state", config.LOG_PREFIX)

    return result
Example #10
    def stream_and_process_message(self):
        try:
            input_df = self.get_kafka_consumer()
            flat_message_df = self.get_parsed_message(input_df)
            cleaned_message_df = self.get_cleaned_message(flat_message_df)
            grouped_message_df = cleaned_message_df.select(
                'id',
                window(cleaned_message_df.timestamp, self.kafka_window_time),
                'country', 'email')
            grouped_message_output_df = self.get_output_df(grouped_message_df)

            # publish the grouped data onto new topic for further measure calculation
            df_write_stream = grouped_message_output_df \
                .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
                .writeStream \
                .format("kafka") \
                .option("kafka.bootstrap.servers", self.kafka_bootstrap_servers) \
                .option("topic", self.kafka_output_topic_name) \
                .outputMode("append") \
                .option("checkpointLocation", self.checkpoint_file_dir + "output_topic_checkpoint") \
                .start()

            # save the cleansed data in parquet file format for future use
            cleaned_message_df \
                .writeStream \
                .format("parquet") \
                .option("startingOffsets", "earliest") \
                .option("path", self.output_file_dir ) \
                .option("checkpointLocation", self.checkpoint_file_dir + "saved_parquet_checkpoint") \
                .start()

            df_write_stream.awaitTermination()
        except Exception as e:
            self.logger.error(e)
Example #11
def main():
    spark = SparkSession.builder \
        .master('local') \
        .appName('nyc-taxi') \
        .config('spark.executor.memory', '1gb') \
        .getOrCreate()
    sc: SparkContext = spark.sparkContext
    sc.setLogLevel('WARN')
    logger.info("app_id = {}".format(sc.applicationId))

    df_line = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', 'word-count') \
        .option('startingOffsets', 'latest') \
        .load() \
        .selectExpr('CAST(value AS STRING)')

    df_word = df_line \
        .select(funs.explode(funs.split(df_line.value, " ")).alias("word"))

    df_word = df_word \
        .withColumn('word', funs.regexp_replace('word', '[^a-zA-Z0-9]', '')) \
        .filter(df_word['word'] != '') \
        .selectExpr('LOWER(word) AS word') \
        .withColumn('process_time', funs.current_timestamp())

    df_grouped = df_word.groupBy(
        funs.window('process_time', '20 seconds', '10 seconds'),
        'word').count()

    write_stream(df_grouped)
Example #12
    def process_stream(self, rdd):
        """Downsample the records in an incoming RDD and write them to TimescaleDB.

        :param rdd: input RDD of records with id, ts and val fields
        :rtype: None
        """
        if rdd.isEmpty():
            print("RDD is empty")
        else:
            df = rdd.toDF()
            # downsample data
            df2 = df.withColumn("timestamp", df.ts.cast("timestamp"))
            downsampled_df = df2.groupBy(
                'id',
                window("timestamp", "1 second").alias("ds_ts")).agg(
                    F.round(F.avg("val"), 2).alias('downsample_avg'))
            final_df = downsampled_df.select(
                "id", downsampled_df['ds_ts'].start.alias("start_ts"),
                "downsample_avg").orderBy('start_ts', ascending=True)

            # write to timescale
            try:
                connector = pgConnector.PostgresConnector(
                    "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb",
                    "datanode", "password")
                connector.write(final_df, "downsampled_table", "append")

            except Exception as e:
                print(e)
                pass
Example #13
def test_sliding_window(data_gen):
    row_gen = StructGen([['ts', _restricted_ts_gen], ['data', data_gen]],
                        nullable=False)
    w = Window.partitionBy(f.window('ts', '5 hour', '1 hour'))
    assert_gpu_and_cpu_are_equal_collect(lambda spark: gen_df(spark, row_gen).
                                         withColumn('rolling_max',
                                                    f.max("data").over(w)))
Example #14
    def bin_keep_df(self, df, wind="5",):

        binned = df.groupBy(df['src_ip'], window(df["timestamp"], wind + " minutes")) \
            .agg(collect_list('dest_port')) \
            .orderBy("window")
            #  .sum('num_pkts_in', 'num_pkts_out', 'bytes_in', 'bytes_out') \
        return binned
Example #15
def run_spark_job(spark):
    df = (
        spark.readStream.format("kafka")
        .option("subscribe", "org.sf.police.calls")
        .option("startingOffsets", "earliest")
        .option("maxOffsetsPerTrigger", 200)
        .option("kafka.bootstrap.servers", "localhost:9092")
        .load()
    )
    # Show schema for the incoming resources for checks
    df.printSchema()

    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)", "timestamp")

    service_table = kafka_df.select(
        func.from_json(func.col("value"), schema).alias("DF"), "timestamp"
    ).select("DF.*", "timestamp")

    # count the number of original crime type
    agg_df = (
        service_table
        .withWatermark("timestamp", "12 hours")
        .groupBy(
            func.window("call_date_time", "1 day"),
            "original_crime_type_name",
            "disposition",
        ).count()
    )

    # Q1. Submit a screen shot of a batch ingestion of the aggregation
    # write output stream
    query = (
        agg_df.writeStream
        .outputMode("update")
        .format("console")
        .start()
    )

    # get the right radio code json path
    radio_code_json_filepath = "radio_code.json"
    radio_code_df = spark.read.option("multiline", "true").json(
        radio_code_json_filepath
    )

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code
    # rename disposition_code column to disposition
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    # join on disposition column
    join_query = (
        agg_df.join(radio_code_df, on="disposition")
        .writeStream.format("console")
        .outputMode("update")
        .start()
    )
    # attach a ProgressReporter
    join_query.awaitTermination()
    query.awaitTermination()
Example #16
def run_spark_job(spark):

    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "demo") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df.select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table.select(
        psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
        psf.col('address'), psf.col('disposition'), psf.col('crime_id'),
        psf.col('original_crime_type_name'))

    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes").groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
            ).count()

    query = counts_df.writeStream.outputMode('Complete').format(
        'console').start()

    query.awaitTermination()
Example #17
    def transform_input(self, df):
        config = self.config
        watermark = config["watermark"]

        group_by = config["group_by"]
        time_window = config["time_window"]
        weektime_format = config["weektime_format"]

        def foreach_adapter(df: DataFrame, _):
            print("foreach adapter started")
            if not df.rdd.isEmpty():
                anomaly_cum_average = get_udf("anomaly_cum_average",
                                              self.config)
                df_anomaly = df.withColumn(
                    "is_anomaly",
                    anomaly_cum_average("count", "weektime", *self.group_by))
                df_result = df_anomaly.filter("is_anomaly == False")
                self.write_to_es(df_result)

                print("finished writing")

        grouped_df = df.withWatermark("timestamp", watermark) \
            .groupBy(*group_by, window("timestamp", time_window)) \
            .count().withColumn("weektime", date_format('window.end', weektime_format)) \
            .withColumn("@timestamp", date_format('window.end', constants.TIMESTAMP_FORMAT))

        return grouped_df.writeStream \
            .foreachBatch(foreach_adapter).start()
Example #18
def sliding_window_hotspots_by_time(df,
                                    window_size="2",
                                    check_before=2,
                                    check_after=2,
                                    multiplier=2,
                                    type="title"):
    df_s_windowed = df.groupBy(
        col(type), window(col("timestamp"),
                          str(window_size) + " weeks")).count()
    df_s_windowed.cache()
    df_s_windowed.show()
    windowspec = Window.partitionBy(col(type)).orderBy(
        col("window")).rowsBetween(-check_before, check_after)

    print("calculate moving average of revisions (-" + str(check_before) +
          ", " + str(check_after) + ")")
    df_s_avg = df_s_windowed.withColumn("moving_avg",
                                        avg(col("count")).over(windowspec))
    df_s_avg.show()

    #title|window|hotspot|count
    df_with_hotspots = df_s_avg.withColumn("hotspot", when(col("count") > multiplier * col("moving_avg"), 1).otherwise(0))\
        .select("hotspot", "window", type, col("count").alias("rev_count"))
    df_with_hotspots.show()
    #title|window|rev_count
    df_hotspots = df_with_hotspots.where(col("hotspot") == 1)\
        .select(type, "window", "rev_count")
    return df_hotspots
Example #19
    def compute(self, udfName, windowDuration: int = None, slideDuration: int = None,
                      groupByColumnName: List[str] = [], startTime=None):
        """
        Run an algorithm. This method supports running an udf method on windowed data

        Args:
            udfName: Name of the algorithm
            windowDuration (int): duration of a window in seconds
            slideDuration (int): slide duration of a window
            groupByColumnName List[str]: groupby column names, for example, groupby user, col1, col2
            startTime (datetime): The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start window intervals. For example, in order to have hourly tumbling windows that start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide startTime as 15 minutes. First time of data will be used as startTime if none is provided
        Returns:
            DataStream: this will return a new datastream object with blank metadata

        """
        if slideDuration:
            slideDuration = str(slideDuration) + " seconds"
            
        if 'custom_window' in self._data.columns:
            data = self._data.groupby('user', 'custom_window').apply(udfName)
        else:
            groupbycols = ["user", "version"]
        
            if windowDuration:
                windowDuration = str(windowDuration) + " seconds"
                win = F.window("timestamp", windowDuration=windowDuration, slideDuration=slideDuration, startTime=startTime)
                groupbycols.append(win)

            if len(groupByColumnName) > 0:
                groupbycols.extend(groupByColumnName)

            data = self._data.groupBy(groupbycols).apply(udfName)

        return DataStream(data=data, metadata=Metadata())
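For reference, the startTime behaviour described in the docstring can be reproduced with a plain windowed groupBy; the following is a minimal sketch using made-up data and column names (user, version, timestamp), assuming only pyspark itself rather than the surrounding DataStream class.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("window-starttime-sketch").getOrCreate()

rows = [("u1", 1, "2021-01-01 12:20:00"),
        ("u1", 1, "2021-01-01 13:10:00")]
df = spark.createDataFrame(rows, ["user", "version", "timestamp"]) \
    .withColumn("timestamp", F.col("timestamp").cast("timestamp"))

# Hourly tumbling windows that start 15 minutes past the hour (12:15-13:15, ...),
# as in the docstring's startTime example; both rows land in the 12:15-13:15 window.
windowed = df.groupBy(
    "user", "version",
    F.window("timestamp", windowDuration="60 minutes", startTime="15 minutes")
).count()
windowed.show(truncate=False)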
Example #20
    def test_grouped_over_window(self):

        data = [
            (0, 1, "2018-03-10T00:00:00+00:00", [0]),
            (1, 2, "2018-03-11T00:00:00+00:00", [0]),
            (2, 2, "2018-03-12T00:00:00+00:00", [0]),
            (3, 3, "2018-03-15T00:00:00+00:00", [0]),
            (4, 3, "2018-03-16T00:00:00+00:00", [0]),
            (5, 3, "2018-03-17T00:00:00+00:00", [0]),
            (6, 3, "2018-03-21T00:00:00+00:00", [0]),
        ]

        expected = {0: [0], 1: [1, 2], 2: [1, 2], 3: [3, 4, 5], 4: [3, 4, 5], 5: [3, 4, 5], 6: [6]}

        df = self.spark.createDataFrame(data, ["id", "group", "ts", "result"])
        df = df.select(col("id"), col("group"), col("ts").cast("timestamp"), col("result"))

        def f(pdf):
            # Assign each result element the ids of the windowed group
            pdf["result"] = [pdf["id"]] * len(pdf)
            return pdf

        result = (
            df.groupby("group", window("ts", "5 days"))
            .applyInPandas(f, df.schema)
            .select("id", "result")
            .collect()
        )
        for r in result:
            self.assertListEqual(expected[r[0]], r[1])
Example #21
def stream_windowed_top_urls(logs_df: DataFrame, window_duration: str,
                             slide_duration: str) -> None:
    """
    Stream the most requested URLs within window_duration slot, every slide_duration
    based on the timestamp column of rows in dataset.
    :param logs_df:
    :param window_duration:
    :param slide_duration:
    :return: None
    """
    # Aggregate based on a time window and URL (endpoint)
    # Split the window column to see the dates clearly.
    windowed_top_urls_df = logs_df.groupBy(
        func.window(func.col('timestamp'), windowDuration=window_duration, slideDuration=slide_duration),
        func.col('endpoint')).agg(func.count('*').alias('url_count')).\
        orderBy(func.desc('url_count')).\
        select(func.col('window.*'), func.col('endpoint').alias('url'), func.col('url_count'))

    query_stream = windowed_top_urls_df.writeStream.\
        outputMode('complete').\
        queryName('windowed_top_urls').\
        format('console').\
        start()

    query_stream.awaitTermination()
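A minimal, hypothetical driver for stream_windowed_top_urls; the log schema, the input path, and the window/slide durations below are assumptions, since the snippet above only shows the aggregation and the console sink.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

spark = SparkSession.builder.appName("windowed-top-urls-demo").getOrCreate()

# Pre-parsed access-log records with an event timestamp and the requested endpoint
log_schema = StructType([
    StructField("timestamp", TimestampType()),
    StructField("endpoint", StringType()),
])
logs_df = spark.readStream.schema(log_schema).json("/tmp/parsed_logs")

# Most requested URLs over a 1-hour window, re-evaluated every 10 minutes
stream_windowed_top_urls(logs_df, "1 hour", "10 minutes")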
Example #22
def test_grouped_sliding_window_array(data_gen):
    row_gen = StructGen(
        [['ts', _restricted_ts_gen], ['data', ArrayGen(data_gen)]],
        nullable=False)
    assert_gpu_and_cpu_are_equal_collect(lambda spark: gen_df(
        spark, row_gen).groupBy(f.window('ts', '5 hour', '1 hour')).agg(
            f.max(f.col("data")[3]).alias("max_data")))
Example #23
def langCountQuery(df, colName):
    return df \
        .withWatermark("timestamp", "2 minutes") \
        .groupBy(
            window(col("timestamp"), "2 minutes", "1 minutes"),
            col(colName)
        ).count() \
        .select(colName, "count", to_json(struct(colName, "count")).alias("value"))
Example #24
def wordCountQuery(df, colName):
    return df \
        .withWatermark("timestamp", "10 seconds") \
        .withColumn('word', explode(split(col(colName), ' '))) \
        .groupBy(window(col("timestamp"), "10 seconds", "5 seconds"),
                 col('word')
                 ).count() \
        .select("word", "count", to_json(struct("word", "count")).alias("value"))
Example #25
def datetime_filter(df,
                    param_name,
                    param_value,
                    datetime_name,
                    time_w=90,
                    step=90 * 60):
    """
    Remove rows in DataFrame which match a condition within a given time interval.

    :param df:
        Spark DataFrame object with timestamp data
    :param param_name:
        parameter name
    :param param_value:
        parameter value for conditional statement
    :param datetime_name:
        column with timestamp data
    :param time_w:
        window duration (in minutes)
    :param step:
        slide interval (in seconds)
    :return:
        tuple of the matched-interval DataFrame and the filtered input DataFrame
    """

    ## window duration
    tw = str(time_w) + ' minutes'

    ## slide interval
    sw = str(step) + ' seconds'

    ## offset (in seconds)
    offset = str(0) + ' seconds'

    intervals_df = df.groupBy(F.window(datetime_name, '{}'.format(tw), '{}'.format(sw), '{}'.format(offset))) \
        .avg(param_name) \
        .sort('window.start') \
        .filter(F.col('avg({})'.format(param_name)) == param_value) \
        .select('window') \
        .withColumn('start', F.col('window').start) \
        .withColumn('end', F.col('window').end) \
        .drop('window')
    """
        schema of internal_df:
        
        root
         |-- start: timestamp (nullable = true)
         |-- end: timestamp (nullable = true)
    """

    ## transform dataframe into list of pyspark.sql.types.Row objects
    intervals_list = intervals_df.collect()

    ## filter dataframe excluding the selected intervals
    for row in intervals_list:
        df = df.filter(~F.col(datetime_name).between(row[0], row[1]))

    return intervals_df, df
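A short, hypothetical call to datetime_filter; the input DataFrame df and the column names are assumptions. With step equal to time_w * 60 the windows are tumbling, while a smaller step makes them overlap.

# Drop every row that falls inside a 90-minute interval whose average
# "state" value equals 0 (df, "state" and "event_time" are assumptions).
intervals_df, filtered_df = datetime_filter(
    df,
    param_name="state",
    param_value=0,
    datetime_name="event_time",
    time_w=90,       # 90-minute windows
    step=90 * 60,    # 5400-second slide = 90 minutes, i.e. tumbling windows
)
intervals_df.show()
print(filtered_df.count())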
Example #26
def run_spark_job(spark):
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark.readStream.format("kafka") \
        .option("kafka.bootstrap.servers","localhost:9092") \
        .option("subscribe","police-department-calls-sf") \
        .option("startingOffsets","earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("maxRatePerPartition",100) \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value as STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"))\
        .select("DF.*")

    # Select original_crime_type_name and disposition
    distinct_table = service_table.\
        select("original_crime_type_name", "call_date_time", "disposition").\
        withWatermark("call_date_time", "60 minutes")

    # count the number of original crime type
    agg_df = distinct_table.\
        groupBy("original_crime_type_name", psf.window("call_date_time", "60 minutes")).\
        count()

    # TODO Q1. Submit a screen shot of a batch ingestion of the aggregation
    # write output stream
    query = agg_df\
        .writeStream\
        .queryName("agg_query_writer")\
        .outputMode("Complete")\
        .format("console")\
        .start()


    # Get the right radio code json path
    radio_code_json_filepath = "radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code

    # Rename disposition_code column to disposition
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code",
                                                    "disposition")

    # Join on disposition column and stream the joined result to the console
    join_query = agg_df.join(radio_code_df, "disposition") \
        .writeStream \
        .outputMode("complete") \
        .format("console") \
        .start()

    # Attach a ProgressReporter
    query.awaitTermination()
    join_query.awaitTermination()
Example #27
def run_spark_job(spark):

    # TODO Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "udacity.police.department.calls") \
        .option("startingOffsets", "earliest") \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr(" cast (value as STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"))\
        .select("DF.*")

    distinct_table = service_table.select("call_date_time",
                                          "original_crime_type_name",
                                          "disposition")
    agg_df = distinct_table \
            .withWatermark("call_date_time", "60 minutes") \
            .groupBy(psf.window(distinct_table.call_date_time, "10 minutes", "5 minutes"), psf.col("original_crime_type_name"))\
            .count()


    query = agg_df \
            .writeStream \
            .outputMode('complete') \
            .format('console') \
            .start()


    radio_code_json_filepath = "radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code

    radio_code_df = radio_code_df.withColumnRenamed("disposition_code",
                                                    "disposition")

    join_query = agg_df.join(radio_code_df, "disposition") \
        .writeStream \
        .format("console") \
        .queryName("join") \
        .start()

    # TODO attach a ProgressReporter
    query.awaitTermination()
    join_query.awaitTermination()
Example #28
def main():
    spark = (
        SparkSession
            .builder
            .appName('Streaming - Python')
            .master('local[*]')
            .getOrCreate()
    )

    spark.sparkContext.setJobGroup('ReadSocket', 'Reading input from nc command')
    spark.sparkContext.setLogLevel('ERROR')

    df = (
        spark.readStream.format('socket')
        .option('host', '127.0.0.1')
        .option('port', 9999)
        .load()
    )

    # Dynamic
    # ===================
    expected_columns = ['sqltimestamp'] + [f'col{x}' for x in range(1,6)]

    # Static
    # ===================
    #expected_columns = ['sqltimestamp'] + 'col1,col2,col3,col4,col5'.split(',')

    transformation_1 = (
        df
            # Column structure
            # ================================================
            .withColumn('raw_columns', F.split('value', ','))
            .drop('value')
            .select(*[F.col('raw_columns')[k].alias(v) for k,v in enumerate(expected_columns)])
            # Casting
            .withColumn('timestamp', F.from_unixtime('sqltimestamp').cast('timestamp'))
            .drop('sqltimestamp')
            # Watermarking
            .withWatermark('timestamp', '5 seconds')
            # Aggregation
            .groupby(F.window(F.col('timestamp'), '5 minute'))
            .agg(
                F.sum(F.when(F.col('col1') == 'cat1', 1).otherwise(0)).alias('count_cat1'), 
                F.count(F.col('col1')).alias('total')
            )
            .withColumn('mean_cat1', F.expr('count_cat1 / total'))
    )
    
    run_query = (
        transformation_1
            .writeStream
            .queryName('SocketStream')
            .format('console')
            .option('truncate', 'false')
            .outputMode('update')
    ).start()
    run_query.awaitTermination()
Example #29
def global_revision_hotspots_by_time(df, weeks_per_bin=4, multiplier=4):
    df_windowed = df.groupBy(window("timestamp",
                                    str(weeks_per_bin) + " weeks")).count()
    df_avg = df_windowed.select(avg("count"))
    df_windowed = df_windowed.withColumn(
        "avg", lit(float(df_avg.first()["avg(count)"])))
    df_windowed.show()
    df_hotspots = df_windowed.where(col("count") >= (multiplier * col("avg")))
    return df_hotspots
Example #30
def single_revision_hotspots_by_time(df, weeks_per_bin=4, multiplier=4):
    df_windowed = df.groupBy(
        col("title"), window("timestamp",
                             str(weeks_per_bin) + " weeks")).count()
    df_avg = df_windowed.groupBy(col("title")).avg()
    df_joined = join_by_column(df_windowed, df_avg, "title")
    df_hotspots = df_joined.where(
        col("count") >= (multiplier * col("avg(count)")))
    return df_hotspots
Example #31
    lines = spark\
        .readStream\
        .format('socket')\
        .option('host', host)\
        .option('port', port)\
        .option('includeTimestamp', 'true')\
        .load()

    # Split the lines into words, retaining timestamps
    # split() splits each line into an array, and explode() turns the array into multiple rows
    words = lines.select(
        explode(split(lines.value, ' ')).alias('word'),
        lines.timestamp
    )

    # Group the data by window and word and compute the count of each group
    windowedCounts = words.groupBy(
        window(words.timestamp, windowDuration, slideDuration),
        words.word
    ).count().orderBy('window')

    # Start running the query that prints the windowed word counts to the console
    query = windowedCounts\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .option('truncate', 'false')\
        .start()

    query.awaitTermination()
  .load("/data/retail-data/by-day/*.csv")

staticDataFrame.createOrReplaceTempView("retail_data")
staticSchema = staticDataFrame.schema


# COMMAND ----------

from pyspark.sql.functions import window, column, desc, col
staticDataFrame\
  .selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate")\
  .groupBy(
    col("CustomerId"), window(col("InvoiceDate"), "1 day"))\
  .sum("total_cost")\
  .show(5)


# COMMAND ----------

streamingDataFrame = spark.readStream\
    .schema(staticSchema)\
    .option("maxFilesPerTrigger", 1)\
    .format("csv")\
    .option("header", "true")\
    .load("/data/retail-data/by-day/*.csv")


# COMMAND ----------
Example #33
    spark = SparkSession\
        .builder\
        .appName("CS-838-Assignment2-PartB-1")\
        .config("spark.driver.memeory","1g")\
        .config("spark.executor.cores","4")\
        .config("spark.executor.memory","4g")\
        .config("spark.task.cpus","1")\
        .config("spark.eventLog.enabled","true")\
        .config("spark.eventLog.dir","file:///tmp/spark-events")\
        .getOrCreate()


    tweetSchema = StructType().add("userA", StringType()).add("userB", StringType()).add("timestamp", TimestampType()).add("interaction", StringType())
    lines = spark.readStream.option("sep", ",").schema(tweetSchema).csv(sys.argv[1])  # e.g. "/stream-monitoring"

    windowedCounts = lines.groupBy(window(lines.timestamp, '60 minutes', '30 minutes'),lines.interaction).count().orderBy('window')

    #print (windowedCounts.count())

    query = windowedCounts\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .option('truncate', 'false')\
        .option('numRows', 1000000000)\
        .start()
    

    query.awaitTermination()