def run_spark_job(spark):
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    # "test" being my topic
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "test") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("maxRatePerPartition", 100) \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Take only the value column and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS")) \
        .select("SERVICE_CALLS.*")

    distinct_table = service_table \
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    # Count original_crime_type_name values per sliding window
    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    # Convert the timestamp column with the helper UDF; the conversion runs on
    # distinct_table, since counts_df no longer carries a timestamp column
    converted_df = distinct_table.withColumn(
        "call_date_time", udf_convert_time(distinct_table.call_datetime))

    # How many calls occurred in a 2 day span
    calls_per_2_days = converted_df \
        .groupBy(
            psf.window(converted_df.call_date_time, "2 day")
        ).agg(psf.count("crime_id").alias("calls_per_2_day")).select("calls_per_2_day")

    # TODO write output stream
    query = counts_df \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    # TODO attach a ProgressReporter
    query.awaitTermination()
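# The police-calls examples above and below assume a JSON `schema` and a
# `udf_convert_time` helper that are defined elsewhere in those projects.
# A minimal, hypothetical sketch of what they might look like (field list,
# types, and the conversion format are assumptions for illustration only):
from pyspark.sql import functions as psf
from pyspark.sql import types as pst


schema = pst.StructType([
    pst.StructField("crime_id", pst.StringType(), True),
    pst.StructField("original_crime_type_name", pst.StringType(), True),
    pst.StructField("call_date_time", pst.StringType(), True),
    pst.StructField("address", pst.StringType(), True),
    pst.StructField("disposition", pst.StringType(), True),
])


@psf.udf(pst.StringType())
def udf_convert_time(timestamp):
    # Render the incoming timestamp as a plain string; a real project would
    # apply whatever format its downstream consumers expect.
    return str(timestamp)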
def runSpark(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "service-calls") \
        .option("maxOffsetsPerTrigger", "250") \
        .option("startingOffsets", "earliest") \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Take only the value column and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table\
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    # Convert the window start time with the helper UDF; counts_df has no
    # call_date_time column, so the window struct's start field is used instead
    counts_df = counts_df.withColumn(
        "call_date_time", udf_convert_time(counts_df.window.start))

    query = counts_df \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    query.awaitTermination()

    # Note: this second query only starts once the first one terminates
    calls_per_2_days = distinct_table \
        .withWatermark("call_datetime", "2880 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "60 minutes", "30 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    query = calls_per_2_days \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    query.awaitTermination()
def run_spark_job(spark):
    # TODO Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "service-calls") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table\
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    # TODO get different types of original_crime_type_name in 60 minutes interval
    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    # TODO use udf to convert timestamp to right format on a call_date_time column
    converted_df = distinct_table.withColumn(
        "call_date_time", udf_convert_time(distinct_table.call_datetime))

    # TODO apply aggregations using windows function to see how many calls occurred in 2 day span
    calls_per_2_days = converted_df \
        .groupBy(
            psf.window(converted_df.call_date_time, "2 day")
        ).agg(psf.count("crime_id").alias("calls_per_2_day")).select("calls_per_2_day")

    # TODO write output stream
    query = counts_df \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    # TODO attach a ProgressReporter
    query.awaitTermination()
def run_spark_job(spark):
    df = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("subscribe", "service-calls") \
        .option("startingOffsets", "earliest") \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table\
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('address_type'),
                psf.col('disposition'))

    counts_df = distinct_table.withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "10 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    converted_df = distinct_table.withColumn(
        "call_date_time", udf_convert_time(distinct_table.call_datetime))
    # converted_df.printSchema()

    # apply aggregations using windows function to see how many calls occurred in 2 day span
    calls_per_2_days = converted_df \
        .withWatermark("call_datetime", "2 day") \
        .groupBy(
            psf.window(converted_df.call_date_time, "2 day")
        ).agg(psf.count("crime_id").alias("calls_per_2_day")).select("calls_per_2_day")
    # calls_per_2_days.printSchema()

    query = calls_per_2_days.writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    query.awaitTermination()
def hexercise_in_streaming(self):
    static_name = 'retail_data'
    self.static_data_frame.createOrReplaceTempView(static_name)
    static_schema = self.static_data_frame.schema

    self.static_data_frame \
        .selectExpr(
            "CustomerId",
            "(UnitPrice * Quantity) as total_cost",
            "InvoiceDate") \
        .groupBy(
            functions.col("CustomerId"),
            functions.window(functions.col("InvoiceDate"), "1 day")) \
        .sum("total_cost") \
        .show(5)

    streaming_data_frame = self.spark.readStream \
        .schema(static_schema) \
        .option("maxFilesPerTrigger", 1) \
        .format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load("/Spark-The-Definitive-Guide/data/retail-data/by-day/*.csv")

    purchase_by_customer_per_hour = streaming_data_frame \
        .selectExpr(
            "CustomerId",
            "(UnitPrice * Quantity) as total_cost",
            "InvoiceDate") \
        .groupBy(functions.col("CustomerId"),
                 functions.window(functions.col("InvoiceDate"), "1 day")) \
        .sum("total_cost")

    stream = purchase_by_customer_per_hour.writeStream \
        .format("memory") \
        .queryName("customer_purchases") \
        .outputMode("complete") \
        .start()

    # check every X sec
    # values in range and sleep need to be adjusted per machine
    for _ in range(0, 6):
        self.spark.sql("""
            SELECT * FROM customer_purchases
            ORDER BY `sum(total_cost)` DESC
            """) \
            .show(5)
        sleep(4)

    # stop the stream gracefully
    stream.stop()
def run_spark_job(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "com.udacity.police.calls.new") \
        .option("startingOffsets", "earliest") \
        .option("maxRatePerPartition", 10000) \
        .option("maxOffsetsPerTrigger", 200) \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS timestamp)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"),
                psf.col('timestamp'))\
        .select("timestamp", "DF.*")

    service_table.printSchema()

    distinct_table = service_table.select(
        'original_crime_type_name',
        'disposition',
        psf.to_timestamp('timestamp').alias('call_date_time2'))  # .dropDuplicates()

    agg_df = distinct_table.select('call_date_time2', 'original_crime_type_name', 'disposition') \
        .withWatermark('call_date_time2', '10 minutes') \
        .groupBy(
            psf.window('call_date_time2', "5 minutes", "2 minutes"),
            # 'call_date_time2',
            'original_crime_type_name',
            'disposition') \
        .count()
    # agg_df = agg_df.orderBy(psf.desc('count'))

    # query = agg_df \
    #     .writeStream \
    #     .outputMode("complete") \
    #     .format("console") \
    #     .start()
    # query.awaitTermination()

    radio_code_json_filepath = "/home/workspace/radio_code.json"
    radio_code_df = spark.read.option("multiLine", True).json(radio_code_json_filepath)
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    # join on disposition column, I was experimenting with different types of joins inner, left_outer...
    join_query = agg_df.join(radio_code_df,
                             radio_code_df.disposition == agg_df.disposition,
                             "left_outer")

    query2 = join_query.writeStream.outputMode("complete").format(
        "console").start()

    query2.awaitTermination()
def track_model_quality(real, predicted):
    quality_compare = predicted.join(real, "pid")
    quality_compare = quality_compare.withColumn(
        'accurate_prediction',
        F.when((F.col('quality') == F.col('predicted_quality')), 1)
         .otherwise(0)
    )

    accurate_prediction_summary = (
        quality_compare
        .groupBy(F.window(F.col('process_time'), '1 day').alias('window'),
                 F.col('accurate_prediction'))
        .count()
        .withColumn('window_day', F.expr('to_date(window.start)'))
        .withColumn('total', F.sum(F.col('count')).over(Window.partitionBy('window_day')))
        .withColumn('ratio', F.col('count') * 100 / F.col('total'))
        .select('window_day', 'accurate_prediction', 'count', 'total', 'ratio')
        .withColumn('accurate_prediction',
                    F.when(F.col('accurate_prediction') == 1, 'Accurate')
                     .otherwise('Inaccurate'))
        .orderBy('window_day')
    )

    return accurate_prediction_summary
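# Hypothetical usage of track_model_quality above, assuming two DataFrames
# that share a "pid" key and carry the "quality", "predicted_quality" and
# "process_time" columns the function expects (names are illustrative only):
# summary_df = track_model_quality(real=labels_df, predicted=scored_df)
# summary_df.show()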
def test_grouped_over_window(self):
    data = [(0, 1, "2018-03-10T00:00:00+00:00", [0]),
            (1, 2, "2018-03-11T00:00:00+00:00", [0]),
            (2, 2, "2018-03-12T00:00:00+00:00", [0]),
            (3, 3, "2018-03-15T00:00:00+00:00", [0]),
            (4, 3, "2018-03-16T00:00:00+00:00", [0]),
            (5, 3, "2018-03-17T00:00:00+00:00", [0]),
            (6, 3, "2018-03-21T00:00:00+00:00", [0])]

    expected = {0: [0], 1: [1, 2], 2: [1, 2], 3: [3, 4, 5],
                4: [3, 4, 5], 5: [3, 4, 5], 6: [6]}

    df = self.spark.createDataFrame(data, ['id', 'group', 'ts', 'result'])
    df = df.select(col('id'), col('group'), col('ts').cast('timestamp'), col('result'))

    def f(pdf):
        # Assign each result element the ids of the windowed group
        pdf['result'] = [pdf['id']] * len(pdf)
        return pdf

    result = df.groupby('group', window('ts', '5 days')).applyInPandas(f, df.schema)\
        .select('id', 'result').collect()

    for r in result:
        self.assertListEqual(expected[r[0]], r[1])
def task_a_2_step_1_final(spark):
    a2_struct = T.StructType([
        T.StructField("datetime_start", T.TimestampType()),
        T.StructField("datetime_end", T.TimestampType()),
        T.StructField("map_topics", T.MapType(
            T.StringType(),
            T.ArrayType(T.StringType())
        ))
    ])

    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "topics-by-state_step-0").parse_json(a2_struct) \
        .withWatermark("datetime_end", "1 minute").groupBy(
            F.window("datetime_end", "3 hour", "1 hour")
        ) \
        .agg(
            F.first("window.start").alias("timestamp_start"),
            F.first("window.end").alias("timestamp_end"),
            F.collect_list("map_topics").alias("statistics")
        ) \
        .select(
            F.struct(
                F.concat(F.hour('timestamp_start'), lit(":"), F.minute('timestamp_start')).alias("time_start"),
                F.concat(F.hour('timestamp_end'), lit(":"), F.minute('timestamp_end')).alias("time_end"),
                concat_maps_udf(col('statistics')).alias("statistics")
            ).alias("res")
        ).send_to_kafka(config.BOOTSTRAP_SERVERS, "topics-by-state", config.LOG_PREFIX)

    return result
def stream_and_process_message(self):
    try:
        input_df = self.get_kafka_consumer()
        flat_message_df = self.get_parsed_message(input_df)
        cleaned_message_df = self.get_cleaned_message(flat_message_df)
        grouped_message_df = cleaned_message_df.select(
            'id',
            window(cleaned_message_df.timestamp, self.kafka_window_time),
            'country',
            'email')
        grouped_message_output_df = self.get_output_df(grouped_message_df)

        # publish the grouped data onto new topic for further measure calculation
        df_write_stream = grouped_message_output_df \
            .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
            .writeStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", self.kafka_bootstrap_servers) \
            .option("topic", self.kafka_output_topic_name) \
            .outputMode("append") \
            .option("checkpointLocation", self.checkpoint_file_dir + "output_topic_checkpoint") \
            .start()

        # save the cleansed data in parquet file format for future use
        cleaned_message_df \
            .writeStream \
            .format("parquet") \
            .option("startingOffsets", "earliest") \
            .option("path", self.output_file_dir) \
            .option("checkpointLocation", self.checkpoint_file_dir + "saved_parquet_checkpoint") \
            .start()

        df_write_stream.awaitTermination()
    except Exception as e:
        self.logger.error(e)
def main():
    spark = SparkSession.builder \
        .master('local') \
        .appName('nyc-taxi') \
        .config('spark.executor.memory', '1gb') \
        .getOrCreate()
    sc: SparkContext = spark.sparkContext
    sc.setLogLevel('WARN')
    logger.info("app_id = {}".format(sc.applicationId))

    df_line = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', 'word-count') \
        .option('startingOffsets', 'latest') \
        .load() \
        .selectExpr('CAST(value AS STRING)')

    df_word = df_line \
        .select(funs.explode(funs.split(df_line.value, " ")).alias("word"))
    df_word = df_word \
        .withColumn('word', funs.regexp_replace('word', '[^a-zA-Z0-9]', '')) \
        .filter(df_word['word'] != '') \
        .selectExpr('LOWER(word) AS word') \
        .withColumn('process_time', funs.current_timestamp())

    df_grouped = df_word.groupBy(
        funs.window('process_time', '20 seconds', '10 seconds'),
        'word').count()

    write_stream(df_grouped)
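# `write_stream` above is defined elsewhere in that project; a minimal sketch
# of a console writer it might resemble (output mode and truncate option are
# assumptions, not taken from the original file):
def write_stream(df_grouped):
    query = df_grouped.writeStream \
        .outputMode('complete') \
        .format('console') \
        .option('truncate', 'false') \
        .start()
    query.awaitTermination()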
def process_stream(self, rdd):
    """
    Args
        rdd: rdd
    :rtype: None
    """
    if rdd.isEmpty():
        print("RDD is empty")
    else:
        df = rdd.toDF()
        # downsample data
        df2 = df.withColumn("timestamp", df.ts.cast("timestamp"))
        downsampled_df = df2.groupBy(
            'id',
            window("timestamp", "1 second").alias("ds_ts")).agg(
                F.round(F.avg("val"), 2).alias('downsample_avg'))
        final_df = downsampled_df.select(
            "id",
            downsampled_df['ds_ts'].start.alias("start_ts"),
            "downsample_avg").orderBy('start_ts', ascending=True)

        # write to timescale
        try:
            connector = pgConnector.PostgresConnector(
                "ec2-3-94-71-208.compute-1.amazonaws.com",
                "datanodedb", "datanode", "password")
            connector.write(final_df, "downsampled_table", "append")
        except Exception as e:
            print(e)
            pass
def test_sliding_window(data_gen):
    row_gen = StructGen([['ts', _restricted_ts_gen], ['data', data_gen]], nullable=False)
    w = Window.partitionBy(f.window('ts', '5 hour', '1 hour'))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, row_gen).withColumn('rolling_max', f.max("data").over(w)))
def bin_keep_df(self, df, wind="5"):
    binned = df.groupBy(df['src_ip'], window(df["timestamp"], wind + " minutes")) \
        .agg(collect_list('dest_port')) \
        .orderBy("window")
    # .sum('num_pkts_in', 'num_pkts_out', 'bytes_in', 'bytes_out')
    return binned
def run_spark_job(spark):
    df = (
        spark.readStream.format("kafka")
        .option("subscribe", "org.sf.police.calls")
        .option("startingOffsets", "earliest")
        .option("maxOffsetsPerTrigger", 200)
        .option("kafka.bootstrap.servers", "localhost:9092")
        .load()
    )

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)", "timestamp")

    service_table = kafka_df.select(
        func.from_json(func.col("value"), schema).alias("DF"), "timestamp"
    ).select("DF.*", "timestamp")

    # count the number of original crime type
    agg_df = (
        service_table
        .withWatermark("timestamp", "12 hours")
        .groupBy(
            func.window("call_date_time", "1 day"),
            "original_crime_type_name",
            "disposition",
        ).count()
    )

    # Q1. Submit a screen shot of a batch ingestion of the aggregation
    # write output stream
    query = (
        agg_df.writeStream
        .outputMode("update")
        .format("console")
        .start()
    )

    # get the right radio code json path
    radio_code_json_filepath = "radio_code.json"
    radio_code_df = spark.read.option("multiline", "true").json(
        radio_code_json_filepath
    )

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code
    # rename disposition_code column to disposition
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    # join on disposition column
    join_query = (
        agg_df.join(radio_code_df, on="disposition")
        .writeStream.format("console")
        .outputMode("update")
        .start()
    )

    # attach a ProgressReporter
    join_query.awaitTermination()
    query.awaitTermination()
def run_spark_job(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "demo") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df.select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS"))\
        .select("SERVICE_CALLS.*")

    distinct_table = service_table.select(
        psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
        psf.col('address'),
        psf.col('disposition'),
        psf.col('crime_id'),
        psf.col('original_crime_type_name'))

    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes").groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    query = counts_df.writeStream.outputMode('complete').format(
        'console').start()

    query.awaitTermination()
def transform_input(self, df):
    config = self.config
    watermark = config["watermark"]
    group_by = config["group_by"]
    time_window = config["time_window"]
    weektime_format = config["weektime_format"]

    def foreach_adapter(df: DataFrame, _):
        print("foreach adapter started")
        if not df.rdd.isEmpty():
            anomaly_cum_average = get_udf("anomaly_cum_average", self.config)
            df_anomaly = df.withColumn(
                "is_anomaly",
                anomaly_cum_average("count", "weektime", *self.group_by))
            df_result = df_anomaly.filter("is_anomaly == False")
            self.write_to_es(df_result)
            print("finished writing")

    grouped_df = df.withWatermark("timestamp", watermark) \
        .groupBy(*group_by, window("timestamp", time_window)) \
        .count().withColumn("weektime", date_format('window.end', weektime_format)) \
        .withColumn("@timestamp", date_format('window.end', constants.TIMESTAMP_FORMAT))

    return grouped_df.writeStream \
        .foreachBatch(foreach_adapter).start()
def sliding_window_hotspots_by_time(df, window_size="2", check_before=2, check_after=2,
                                    multiplier=2, type="title"):
    df_s_windowed = df.groupBy(
        col(type),
        window(col("timestamp"), str(window_size) + " weeks")).count()
    df_s_windowed.cache()
    df_s_windowed.show()

    windowspec = Window.partitionBy(col(type)).orderBy(
        col("window")).rowsBetween(-check_before, check_after)

    print("calculate moving average of revisions (-" + str(check_before) + ", " + str(check_after) + ")")
    df_s_avg = df_s_windowed.withColumn("moving_avg", avg(col("count")).over(windowspec))
    df_s_avg.show()

    # title|window|hotspot|count
    df_with_hotspots = df_s_avg.withColumn(
        "hotspot",
        when(col("count") > multiplier * col("moving_avg"), 1).otherwise(0))\
        .select("hotspot", "window", type, col("count").alias("rev_count"))
    df_with_hotspots.show()

    # title|window|rev_count
    df_hotspots = df_with_hotspots.where(col("hotspot") == 1)\
        .select(type, "window", "rev_count")

    return df_hotspots
def compute(self, udfName, windowDuration: int = None, slideDuration: int = None,
            groupByColumnName: List[str] = [], startTime=None):
    """
    Run an algorithm. This method supports running an udf method on windowed data

    Args:
        udfName: Name of the algorithm
        windowDuration (int): duration of a window in seconds
        slideDuration (int): slide duration of a window
        groupByColumnName List[str]: groupby column names, for example, groupby user, col1, col2
        startTime (datetime): The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which
            to start window intervals. For example, in order to have hourly tumbling windows that start 15 minutes
            past the hour, e.g. 12:15-13:15, 13:15-14:15... provide startTime as 15 minutes. First time of data
            will be used as startTime if none is provided

    Returns:
        DataStream: this will return a new datastream object with blank metadata
    """
    if slideDuration:
        slideDuration = str(slideDuration) + " seconds"

    if 'custom_window' in self._data.columns:
        data = self._data.groupby('user', 'custom_window').apply(udfName)
    else:
        groupbycols = ["user", "version"]

        if windowDuration:
            windowDuration = str(windowDuration) + " seconds"
            win = F.window("timestamp", windowDuration=windowDuration,
                           slideDuration=slideDuration, startTime=startTime)
            groupbycols.append(win)

        if len(groupByColumnName) > 0:
            groupbycols.extend(groupByColumnName)

        data = self._data.groupBy(groupbycols).apply(udfName)

    return DataStream(data=data, metadata=Metadata())
def test_grouped_over_window(self):
    data = [
        (0, 1, "2018-03-10T00:00:00+00:00", [0]),
        (1, 2, "2018-03-11T00:00:00+00:00", [0]),
        (2, 2, "2018-03-12T00:00:00+00:00", [0]),
        (3, 3, "2018-03-15T00:00:00+00:00", [0]),
        (4, 3, "2018-03-16T00:00:00+00:00", [0]),
        (5, 3, "2018-03-17T00:00:00+00:00", [0]),
        (6, 3, "2018-03-21T00:00:00+00:00", [0]),
    ]

    expected = {0: [0], 1: [1, 2], 2: [1, 2], 3: [3, 4, 5], 4: [3, 4, 5], 5: [3, 4, 5], 6: [6]}

    df = self.spark.createDataFrame(data, ["id", "group", "ts", "result"])
    df = df.select(col("id"), col("group"), col("ts").cast("timestamp"), col("result"))

    def f(pdf):
        # Assign each result element the ids of the windowed group
        pdf["result"] = [pdf["id"]] * len(pdf)
        return pdf

    result = (
        df.groupby("group", window("ts", "5 days"))
        .applyInPandas(f, df.schema)
        .select("id", "result")
        .collect()
    )

    for r in result:
        self.assertListEqual(expected[r[0]], r[1])
def stream_windowed_top_urls(logs_df: DataFrame, window_duration: str, slide_duration: str) -> None:
    """
    Stream the most requested URLs within window_duration slot, every slide_duration
    based on the timestamp column of rows in dataset.

    :param logs_df:
    :param window_duration:
    :param slide_duration:
    :return: None
    """
    # Aggregate based on a time window and URL (endpoint)
    # Split the window column to see the dates clearly.
    windowed_top_urls_df = logs_df.groupBy(
        func.window(func.col('timestamp'),
                    windowDuration=window_duration,
                    slideDuration=slide_duration),
        func.col('endpoint')).agg(func.count('*').alias('url_count')).\
        orderBy(func.desc('url_count')).\
        select(func.col('window.*'),
               func.col('endpoint').alias('url'),
               func.col('url_count'))

    query_stream = windowed_top_urls_df.writeStream.\
        outputMode('complete').\
        queryName('windowed_top_urls').\
        format('console').\
        start()

    query_stream.awaitTermination()
def test_grouped_sliding_window_array(data_gen):
    row_gen = StructGen(
        [['ts', _restricted_ts_gen], ['data', ArrayGen(data_gen)]], nullable=False)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, row_gen)
        .groupBy(f.window('ts', '5 hour', '1 hour'))
        .agg(f.max(f.col("data")[3]).alias("max_data")))
def langCountQuery(df, colName):
    return df \
        .withWatermark("timestamp", "2 minutes") \
        .groupBy(
            window(col("timestamp"), "2 minutes", "1 minutes"),
            col(colName)
        ).count() \
        .select(colName, "count", to_json(struct(colName, "count")).alias("value"))
def wordCountQuery(df, colName):
    return df \
        .withWatermark("timestamp", "10 seconds") \
        .withColumn('word', explode(split(col(colName), ' '))) \
        .groupBy(
            window(col("timestamp"), "10 seconds", "5 seconds"),
            col('word')
        ).count() \
        .select("word", "count", to_json(struct("word", "count")).alias("value"))
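# The two query builders above return frames with a JSON "value" column, so
# they can feed a Kafka sink. A hedged sketch of how such a frame might be
# written back to Kafka (topic name, servers and checkpoint path are
# assumptions for illustration only):
def write_counts_to_kafka(counts_df):
    return counts_df \
        .selectExpr("CAST(value AS STRING)") \
        .writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("topic", "word-counts") \
        .option("checkpointLocation", "/tmp/word-counts-checkpoint") \
        .outputMode("update") \
        .start()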
def datetime_filter(df, param_name, param_value, datetime_name, time_w=90, step=90 * 60):
    """
    Remove rows in DataFrame which match a condition within a given time interval.

    :param df: Spark DataFrame object with timestamp data
    :param param_name: parameter name
    :param param_value: parameter value for conditional statement
    :param datetime_name: column with timestamp data
    :param time_w: tumbling window duration (in minutes)
    :param step: sliding interval (in seconds)
    :return: Spark DataFrame object with timestamp data
    """
    ## tumbling window size
    tw = str(time_w) + ' minutes'
    ## sliding window size
    sw = str(step) + ' seconds'
    ## offset (in seconds)
    offset = str(0) + ' seconds'

    intervals_df = df.groupBy(F.window(datetime_name, '{}'.format(tw), '{}'.format(sw), '{}'.format(offset))) \
        .avg(param_name) \
        .sort('window.start') \
        .filter(F.col('avg({})'.format(param_name)) == param_value) \
        .select('window') \
        .withColumn('start', F.col('window').start) \
        .withColumn('end', F.col('window').end) \
        .drop('window')

    """
    schema of intervals_df:
    root
     |-- start: timestamp (nullable = true)
     |-- end: timestamp (nullable = true)
    """

    ## transform dataframe into list of pyspark.sql.types.Row objects
    intervals_list = intervals_df.collect()

    ## filter dataframe excluding the selected intervals
    for row in intervals_list:
        df = df.filter(~F.col(datetime_name).between(row[0], row[1]))

    return intervals_df, df
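# Hypothetical call to datetime_filter above, assuming a DataFrame
# `sensor_df` with a numeric "status" column and an "event_time" timestamp
# column (both names are illustrative only):
# intervals_df, filtered_df = datetime_filter(
#     sensor_df, param_name="status", param_value=0,
#     datetime_name="event_time", time_w=90, step=90 * 60)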
def run_spark_job(spark):
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark.readStream.format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "police-department-calls-sf") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("maxRatePerPartition", 100) \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value as STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"))\
        .select("DF.*")

    # Select original_crime_type_name and disposition
    distinct_table = service_table.\
        select("original_crime_type_name", "call_date_time", "disposition").\
        withWatermark("call_date_time", "60 minutes")

    # count the number of original crime type
    agg_df = distinct_table.\
        groupBy("original_crime_type_name", psf.window("call_date_time", "60 minutes")).\
        count()

    # TODO Q1. Submit a screen shot of a batch ingestion of the aggregation
    # write output stream
    query = agg_df\
        .writeStream\
        .queryName("agg_query_writer")\
        .outputMode("complete")\
        .format("console")\
        .start()

    # Attach a ProgressReporter
    query.awaitTermination()

    # Get the right radio code json path
    radio_code_json_filepath = "radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code

    # Rename disposition_code column to disposition
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    # Join on disposition column and write the joined stream; awaitTermination
    # has to be called on a streaming query, not on the joined DataFrame itself
    join_query = agg_df\
        .join(radio_code_df, "disposition")\
        .writeStream\
        .outputMode("complete")\
        .format("console")\
        .start()

    join_query.awaitTermination()
def run_spark_job(spark):
    # TODO Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "udacity.police.department.calls") \
        .option("startingOffsets", "earliest") \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"))\
        .select("DF.*")

    distinct_table = service_table.select("call_date_time", "original_crime_type_name", "disposition")

    agg_df = distinct_table \
        .withWatermark("call_date_time", "60 minutes") \
        .groupBy(psf.window(distinct_table.call_date_time, "10 minutes", "5 minutes"),
                 psf.col("original_crime_type_name"))\
        .count()

    query = agg_df \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    # TODO attach a ProgressReporter
    query.awaitTermination()

    radio_code_json_filepath = "radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    join_query = agg_df.join(radio_code_df, "disposition") \
        .writeStream \
        .format("console") \
        .queryName("join") \
        .start()

    join_query.awaitTermination()
def main():
    spark = (
        SparkSession
        .builder
        .appName('Streaming - Python')
        .master('local[*]')
        .getOrCreate()
    )
    spark.sparkContext.setJobGroup('ReadSocket', 'Reading input from nc command')
    spark.sparkContext.setLogLevel('ERROR')

    df = (
        spark.readStream.format('socket')
        .option('host', '127.0.0.1')
        .option('port', 9999)
        .load()
    )

    # Dynamic
    # ===================
    expected_columns = ['sqltimestamp'] + [f'col{x}' for x in range(1, 6)]

    # Static
    # ===================
    # expected_columns = ['sqltimestamp'] + 'col1,col2,col3,col4,col5'.split(',')

    transformation_1 = (
        df
        # Column structure
        # ================================================
        .withColumn('raw_columns', F.split('value', ','))
        .drop('value')
        .select(*[F.col('raw_columns')[k].alias(v) for k, v in enumerate(expected_columns)])
        # Casting
        .withColumn('timestamp', F.from_unixtime('sqltimestamp').cast('timestamp'))
        .drop('sqltimestamp')
        # Watermarking
        .withWatermark('timestamp', '5 seconds')
        # Aggregation
        .groupby(F.window(F.col('timestamp'), '5 minute'))
        .agg(
            F.sum(F.when(F.col('col1') == 'cat1', 1).otherwise(0)).alias('count_cat1'),
            F.count(F.col('col1')).alias('total')
        )
        .withColumn('mean_cat1', F.expr('count_cat1 / total'))
    )

    run_query = (
        transformation_1
        .writeStream
        .queryName('SocketStream')
        .format('console')
        .option('truncate', 'false')
        .outputMode('update')
    ).start()

    run_query.awaitTermination()
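# To try the socket examples locally, the source is usually fed with netcat
# on the configured host and port, e.g.:
#   $ nc -lk 9999
# each line typed into that shell then arrives as a new streaming row.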
def global_revision_hotspots_by_time(df, weeks_per_bin=4, multiplier=4):
    df_windowed = df.groupBy(window("timestamp", str(weeks_per_bin) + " weeks")).count()
    df_avg = df_windowed.select(avg("count"))
    df_windowed = df_windowed.withColumn(
        "avg", lit(float(df_avg.first()["avg(count)"])))
    df_windowed.show()
    df_hotspots = df_windowed.where(col("count") >= (multiplier * col("avg")))
    return df_hotspots
def single_revision_hotspots_by_time(df, weeks_per_bin=4, multiplier=4):
    df_windowed = df.groupBy(
        col("title"),
        window("timestamp", str(weeks_per_bin) + " weeks")).count()
    df_avg = df_windowed.groupBy(col("title")).avg()
    df_joined = join_by_column(df_windowed, df_avg, "title")
    df_hotspots = df_joined.where(
        col("count") >= (multiplier * col("avg(count)")))
    return df_hotspots
lines = spark\
    .readStream\
    .format('socket')\
    .option('host', host)\
    .option('port', port)\
    .option('includeTimestamp', 'true')\
    .load()

# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(
    explode(split(lines.value, ' ')).alias('word'),
    lines.timestamp
)

# Group the data by window and word and compute the count of each group
windowedCounts = words.groupBy(
    window(words.timestamp, windowDuration, slideDuration),
    words.word
).count().orderBy('window')

# Start running the query that prints the windowed word counts to the console
query = windowedCounts\
    .writeStream\
    .outputMode('complete')\
    .format('console')\
    .option('truncate', 'false')\
    .start()

query.awaitTermination()
.load("/data/retail-data/by-day/*.csv") staticDataFrame.createOrReplaceTempView("retail_data") staticSchema = staticDataFrame.schema # COMMAND ---------- from pyspark.sql.functions import window, column, desc, col staticDataFrame\ .selectExpr( "CustomerId", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")\ .groupBy( col("CustomerId"), window(col("InvoiceDate"), "1 day"))\ .sum("total_cost")\ .show(5) # COMMAND ---------- streamingDataFrame = spark.readStream\ .schema(staticSchema)\ .option("maxFilesPerTrigger", 1)\ .format("csv")\ .option("header", "true")\ .load("/data/retail-data/by-day/*.csv") # COMMAND ----------
spark = SparkSession\
    .builder\
    .appName("CS-838-Assignment2-PartB-1")\
    .config("spark.driver.memory", "1g")\
    .config("spark.executor.cores", "4")\
    .config("spark.executor.memory", "4g")\
    .config("spark.task.cpus", "1")\
    .config("spark.eventLog.enabled", "true")\
    .config("spark.eventLog.dir", "file:///tmp/spark-events")\
    .getOrCreate()

tweetSchema = StructType()\
    .add("userA", StringType())\
    .add("userB", StringType())\
    .add("timestamp", TimestampType())\
    .add("interaction", StringType())

lines = spark.readStream.option("sep", ",").schema(tweetSchema).csv(sys.argv[1])  # ("/stream-monitoring")

windowedCounts = lines.groupBy(
    window(lines.timestamp, '60 minutes', '30 minutes'),
    lines.interaction
).count().orderBy('window')
# print(windowedCounts.count())

query = windowedCounts\
    .writeStream\
    .outputMode('complete')\
    .format('console')\
    .option('truncate', 'false')\
    .option('numRows', 1000000000)\
    .start()

query.awaitTermination()