def doKafkaTest(spark, dataDir):
    # {"id":1,"firstname":"James ","middlename":"","lastname":"Smith","dob_year":2018,"dob_month":1,"gender":"M","salary":3000}
    schema = StructType() \
        .add("id", IntegerType()) \
        .add("firstname", StringType()) \
        .add("middlename", StringType()) \
        .add("lastname", StringType()) \
        .add("dob_year", IntegerType()) \
        .add("dob_month", IntegerType()) \
        .add("gender", StringType()) \
        .add("salary", IntegerType())

    brokers = "kafka.kd-confluent.svc.cluster.local:9071"
    topic = "json_spark"

    # +---+---------+----------+--------+--------+---------+------+------+---------+--------------------+
    # | id|firstname|middlename|lastname|dob_year|dob_month|gender|salary|msgoffset|             msgtime|
    # +---+---------+----------+--------+--------+---------+------+------+---------+--------------------+
    # |  1|   James |          |   Smith|    2018|        1|     M|  3000|        0|2021-01-22 05:53:...|
    # |  2| Michael |      Rose|        |    2010|        3|     M|  4000|        1|2021-01-22 05:53:...|
    # |  3|  Robert |          |Williams|    2010|        3|     M|  4000|        2|2021-01-22 07:32:...|
    # +---+---------+----------+--------+--------+---------+------+------+---------+--------------------+
    kdf = util.readKafkaJson(spark, brokers, topic, schema, offset=0)
    util.showStream(kdf)

    util.writeKafkaJson(brokers, topic, kdf, "id", dataDir + "/checkpoint_kafka")
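# util.readKafkaJson is defined elsewhere in this repo. The following is a rough, hedged sketch
# of the pattern it presumably wraps (the helper name and body below are assumptions, not the
# repository's actual implementation; only the Spark Kafka-source options and from_json are
# standard Spark APIs):
from pyspark.sql.functions import col, from_json

def readKafkaJsonSketch(spark, brokers, topic, schema, offset=0):
    # Subscribe to the topic. The real helper presumably maps the `offset` argument to a
    # per-partition startingOffsets JSON spec; "earliest" is used here as a simplification.
    raw = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .option("startingOffsets", "earliest") \
        .load()
    # Kafka delivers key/value as binary; parse the JSON value with the caller's schema and
    # carry the Kafka offset/timestamp along, matching the msgoffset/msgtime columns above.
    return raw.select(from_json(col("value").cast("string"), schema).alias("data"),
                      col("offset").alias("msgoffset"),
                      col("timestamp").alias("msgtime")) \
              .select("data.*", "msgoffset", "msgtime")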
def getShowDemographic(showByUserDf, userDf):
    # DataFrames on both sides of the join need an event-timestamp field with a watermark,
    # so that late-arriving data can be discarded, preventing unbounded waiting.
    # The user DataFrame has no time column, so current time can be added as an
    # artificial column (kept commented out here).
    showByUserDf = showByUserDf.withWatermark("start_ts", "1 minutes")
    # userDf = userDf \
    #     .withColumn("current_timestamp", current_timestamp()) \
    #     .withWatermark("current_timestamp", "1 minutes")

    showByUserDf = showByUserDf.withColumnRenamed("user_id", "show_user_id")
    joinDf = showByUserDf.join(userDf,
                               (showByUserDf.show_user_id == userDf.user_id),
                               "inner")

    joinDf = joinDf.select("program_id", "channel_id", "program", "channel",
                           "user_id", "device_id", "name", "age", "gender",
                           "start_ts", "end_ts")

    # +----------+----------+-------------+-------+-------+---------+------+---+------+-------------------+-------------------+
    # |program_id|channel_id|      program|channel|user_id|device_id|  name|age|gender|           start_ts|             end_ts|
    # +----------+----------+-------------+-------+-------+---------+------+---+------+-------------------+-------------------+
    # |        13|        57|Living Planet|    BBC|     45|       14| ketan| 55|     M|2021-02-01 07:45:00|2021-02-01 08:19:35|
    # |        12|        57|Animal Planet|    BBC|     45|       14| ketan| 55|     M|2021-02-01 07:12:35|2021-02-01 07:45:00|
    # |        14|        57| Blue Kingdom|    BBC|     45|       15| ketan| 55|     M|2021-02-02 07:19:35|2021-02-02 07:57:35|
    # |        13|        57|Living Planet|    BBC|     46|       17|vihaan| 18|     M|2021-02-01 09:20:16|2021-02-01 09:36:56|
    # |        14|        58|         Dune|    HBO|     46|       17|vihaan| 18|     M|2021-02-01 09:10:00|2021-02-01 09:20:16|
    # |        13|        58| Wonder Woman|    HBO|     46|       17|vihaan| 18|     M|2021-02-01 08:32:51|2021-02-01 09:10:00|
    # |        13|        58| Wonder Woman|    HBO|     46|       16|vihaan| 18|     M|2021-02-01 07:30:00|2021-02-01 09:05:17|
    # |        12|        58|    Westworld|    HBO|     46|       16|vihaan| 18|     M|2021-02-01 07:00:35|2021-02-01 07:30:00|
    # +----------+----------+-------------+-------+-------+---------+------+---+------+-------------------+-------------------+
    util.showStream(joinDf)
    return joinDf
def getAdDemographic(adByUserDf, userDf):
    # DataFrames on both sides of the join need an event-timestamp field with a watermark,
    # so that late-arriving data can be discarded, preventing unbounded waiting.
    # The user DataFrame has no time column, so current time can be added as an
    # artificial column (kept commented out here).
    adByUserDf = adByUserDf.withWatermark("start_ts", "1 minutes")
    # userDf = userDf \
    #     .withColumn("current_timestamp", current_timestamp()) \
    #     .withWatermark("current_timestamp", "1 minutes")

    adByUserDf = adByUserDf.withColumnRenamed("user_id", "ad_user_id")
    joinDf = adByUserDf.join(userDf,
                             (adByUserDf.ad_user_id == userDf.user_id),
                             "inner")

    # +-----+----------+-------+---------+-------------------+-------------------+-------+------+---+------+--------------------+
    # |ad_id|channel_id|user_id|device_id|           start_ts|             end_ts|user_id|  name|age|gender|   current_timestamp|
    # +-----+----------+-------+---------+-------------------+-------------------+-------+------+---+------+--------------------+
    # |   17|        57|     45|       14|2021-02-01 07:16:24|2021-02-01 07:43:51|     45| ketan| 55|     M|2021-02-25 06:07:...|
    # |   17|        57|     45|       15|2021-02-02 07:22:27|2021-02-02 07:24:38|     45| ketan| 55|     M|2021-02-25 06:07:...|
    # |   13|        58|     46|       17|2021-02-01 08:32:51|2021-02-01 08:43:18|     46|vihaan| 18|     M|2021-02-25 06:07:...|
    # |   14|        58|     46|       17|2021-02-01 09:07:03|2021-02-01 09:20:16|     46|vihaan| 18|     M|2021-02-25 06:07:...|
    # |   13|        58|     46|       16|2021-02-01 08:19:26|2021-02-01 08:43:18|     46|vihaan| 18|     M|2021-02-25 06:07:...|
    # +-----+----------+-------+---------+-------------------+-------------------+-------+------+---+------+--------------------+

    # Use only the relevant fields
    joinDf = joinDf.select("ad_id", "channel_id", "user_id", "device_id",
                           "name", "age", "gender", "start_ts", "end_ts")
    util.showStream(joinDf)
    return joinDf
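# util.showStream is used throughout to display these intermediate streaming DataFrames; the
# sample output tables above suggest a console sink. A minimal hedged sketch of such a helper
# (the name showStreamSketch and the output-mode choice are illustrative assumptions, not the
# repo's actual code):
def showStreamSketch(df):
    # Write each micro-batch to the console so intermediate results can be inspected.
    return df.writeStream \
        .format("console") \
        .outputMode("append") \
        .start()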
def getShowOverlap(showDf, sessionDf):
    # Alternate syntax for joins:
    #
    # overlapDf = sessionDf.join(df1,
    #     expr("""
    #         show_channel_id = channel_id AND
    #         start_ts <= show_end_ts AND
    #         end_ts >= show_start_ts
    #         """
    #     ))

    # Rename columns that have the same name in both DataFrames
    showDf = showDf.withColumnRenamed("channel_id", "show_channel_id") \
                   .withColumnRenamed("start_ts", "show_start_ts") \
                   .withColumnRenamed("end_ts", "show_end_ts")

    # Rename columns that have the same name in both DataFrames
    sessionDf = sessionDf.withColumnRenamed("channel_id", "session_channel_id") \
                         .withColumnRenamed("start_ts", "session_start_ts") \
                         .withColumnRenamed("end_ts", "session_end_ts")

    # Join the two DataFrames on the channel, such that the session watching start/end times
    # overlap with the show start/end times. This is a static-stream join. We would like to
    # use a left outer join, so that shows with no watching sessions also get included, but
    # that is not supported.
    overlapDf = sessionDf.join(
        showDf,
        (sessionDf.session_channel_id == showDf.show_channel_id) &
        (sessionDf.session_start_ts <= showDf.show_end_ts) &
        (sessionDf.session_end_ts >= showDf.show_start_ts),
        "inner")

    # From the overlap, compute the start and end time that the user watched each show.
    overlapDf = overlapDf.withColumn("over_start_ts", greatest(overlapDf.session_start_ts, overlapDf.show_start_ts)) \
                         .withColumn("over_end_ts", least(overlapDf.session_end_ts, overlapDf.show_end_ts))

    # +-------+---------+------------------+-------------------+-------------------+----------+-------------+---------------+-------+----------+-------------------+-------------------+-------------------+-------------------+
    # |user_id|device_id|session_channel_id|   session_start_ts|     session_end_ts|program_id|      program|show_channel_id|channel|  day_date|      show_start_ts|        show_end_ts|      over_start_ts|        over_end_ts|
    # +-------+---------+------------------+-------------------+-------------------+----------+-------------+---------------+-------+----------+-------------------+-------------------+-------------------+-------------------+
    # |     46|       17|                57|2021-02-01 09:20:16|2021-02-01 09:36:56|        13|Living Planet|             57|    BBC|2021-02-01|2021-02-01 07:45:00|2021-02-02 00:00:00|2021-02-01 09:20:16|2021-02-01 09:36:56|
    # |     46|       17|                58|2021-02-01 08:32:51|2021-02-01 09:20:16|        14|         Dune|             58|    HBO|2021-02-01|2021-02-01 09:10:00|2021-02-02 00:00:00|2021-02-01 09:10:00|2021-02-01 09:20:16|
    # |     46|       17|                58|2021-02-01 08:32:51|2021-02-01 09:20:16|        13| Wonder Woman|             58|    HBO|2021-02-01|2021-02-01 07:30:00|2021-02-01 09:10:00|2021-02-01 08:32:51|2021-02-01 09:10:00|
    # |     46|       16|                58|2021-02-01 07:00:35|2021-02-01 09:05:17|        13| Wonder Woman|             58|    HBO|2021-02-01|2021-02-01 07:30:00|2021-02-01 09:10:00|2021-02-01 07:30:00|2021-02-01 09:05:17|
    # |     46|       16|                58|2021-02-01 07:00:35|2021-02-01 09:05:17|        12|    Westworld|             58|    HBO|2021-02-01|2021-02-01 07:00:00|2021-02-01 07:30:00|2021-02-01 07:00:35|2021-02-01 07:30:00|
    # |     45|       14|                57|2021-02-01 07:12:35|2021-02-01 08:19:35|        13|Living Planet|             57|    BBC|2021-02-01|2021-02-01 07:45:00|2021-02-02 00:00:00|2021-02-01 07:45:00|2021-02-01 08:19:35|
    # |     45|       14|                57|2021-02-01 07:12:35|2021-02-01 08:19:35|        12|Animal Planet|             57|    BBC|2021-02-01|2021-02-01 07:00:00|2021-02-01 07:45:00|2021-02-01 07:12:35|2021-02-01 07:45:00|
    # |     45|       15|                57|2021-02-02 07:19:35|2021-02-02 07:57:35|        14| Blue Kingdom|             57|    BBC|2021-02-02|2021-02-02 07:00:00|2021-02-02 08:30:00|2021-02-02 07:19:35|2021-02-02 07:57:35|
    # +-------+---------+------------------+-------------------+-------------------+----------+-------------+---------------+-------+----------+-------------------+-------------------+-------------------+-------------------+

    # overlapDf.printSchema()

    # Use only the relevant columns and rename them as needed
    overlapDf = overlapDf.select("program_id", "program",
                                 col("show_channel_id").alias("channel_id"), "channel",
                                 "user_id", "device_id",
                                 col("over_start_ts").alias("start_ts"),
                                 col("over_end_ts").alias("end_ts"))
    util.showStream(overlapDf)
    return overlapDf
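# The overlap logic above is easiest to see on a tiny batch example: two intervals overlap when
# each one starts before the other ends, and the watched window runs from the latest start to
# the earliest end. A self-contained sketch (the single session/show rows below are made up for
# illustration, echoing one of the sample rows above):
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import greatest, least

def overlapDemo():
    spark = SparkSession.builder.master("local[1]").appName("overlap-demo").getOrCreate()
    sessions = spark.createDataFrame(
        [(45, 57, datetime(2021, 2, 1, 7, 12, 35), datetime(2021, 2, 1, 8, 19, 35))],
        ["user_id", "session_channel_id", "session_start_ts", "session_end_ts"])
    shows = spark.createDataFrame(
        [(12, "Animal Planet", 57, datetime(2021, 2, 1, 7, 0, 0), datetime(2021, 2, 1, 7, 45, 0))],
        ["program_id", "program", "show_channel_id", "show_start_ts", "show_end_ts"])
    demo = sessions.join(
        shows,
        (sessions.session_channel_id == shows.show_channel_id) &
        (sessions.session_start_ts <= shows.show_end_ts) &
        (sessions.session_end_ts >= shows.show_start_ts))
    # Watched window: 07:12:35 (later start) to 07:45:00 (earlier end), matching the sample output.
    demo = demo.withColumn("over_start_ts", greatest("session_start_ts", "show_start_ts")) \
               .withColumn("over_end_ts", least("session_end_ts", "show_end_ts"))
    demo.show(truncate=False)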
def main(spark, dataDir, fromKafka=False, toKafka=False):
    sessionDf = session.doSession(spark, dataDir, brokers, sessionTopic, offset=0, fromKafka=fromKafka)
    channelDf = channel.doChannel(spark)

    # Process Shows and Ads
    showDf = show.doShow(channelDf)
    adDf = ad.doAd(spark, dataDir, brokers, adTopic, offset=0, fromKafka=fromKafka)

    # Get user demographics and device locations
    userDf = user.doUser(spark, dataDir, brokers, userTopic, offset=0, fromKafka=fromKafka)
    deviceLocDf = device_loc.doDeviceLoc(spark, dataDir, brokers, deviceLocTopic, offset=10, fromKafka=fromKafka)

    # Get Show by User Sessions, enriched with Demographic and Device Location
    showDemographicLocDf = show.doShowDemographicLoc(showDf, sessionDf, userDf, deviceLocDf)
    util.showStream(showDemographicLocDf)

    # Get Ad by User Sessions, enriched with Demographic and Device Location
    adDemographicLocDf = ad.doAdDemographicLoc(adDf, sessionDf, userDf, deviceLocDf)
    util.showStream(adDemographicLocDf)

    if toKafka:
        util.writeKafkaJson(brokers, showOutTopic, showDemographicLocDf, None, dataDir + "/checkpoint_show")
        util.writeKafkaJson(brokers, adOutTopic, adDemographicLocDf, None, dataDir + "/checkpoint_ad")

    # Obsolete
    if False:
        # Save intermediate state to Kafka as a shortcut for use by downstream processing
        # saveShowDemographicKafka(showDemographicDf, dataDir)
        # saveAdDemographicKafka(adDemographicDf, dataDir)
        # showDemographicDf = loadShowDemographicKafka()
        # util.showStream(showDemographicDf)
        pass
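# main() refers to module-level names (brokers, sessionTopic, adTopic, userTopic, deviceLocTopic,
# showOutTopic, adOutTopic) defined elsewhere in the repo. A hedged sketch of how the pipeline
# might be launched; the SparkSession settings, the /tmp/tv-data path, and the
# awaitAnyTermination call are assumptions about the surrounding driver script:
if __name__ == "__main__":
    from pyspark.sql import SparkSession

    spark = SparkSession.builder \
        .appName("tv-analytics") \
        .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    # Run the whole pipeline reading from and writing to Kafka (dataDir is a hypothetical path).
    main(spark, "/tmp/tv-data", fromKafka=True, toKafka=True)

    # Block until one of the started streaming queries terminates.
    spark.streams.awaitAnyTermination()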
def getAdOverlap(adDf, sessionDf):
    # Rename columns that have the same name in both DataFrames
    adDf = adDf.withColumnRenamed("channel_id", "ad_channel_id") \
               .withColumnRenamed("start_ts", "ad_start_ts") \
               .withColumnRenamed("end_ts", "ad_end_ts")

    # Rename columns that have the same name in both DataFrames
    sessionDf = sessionDf.withColumnRenamed("channel_id", "session_channel_id") \
                         .withColumnRenamed("start_ts", "session_start_ts") \
                         .withColumnRenamed("end_ts", "session_end_ts")

    # DataFrames on both sides of the join need an event-timestamp field with a watermark,
    # so that late-arriving data can be discarded, preventing unbounded waiting.
    adWithWatermark = adDf.withWatermark("ad_start_ts", "1 minutes")
    sessionWithWatermark = sessionDf.withWatermark("session_start_ts", "1 minutes")

    # Join the two DataFrames on the channel, such that the session watching start/end times
    # overlap with the ad start/end times. This is a stream-stream join. Ideally a left outer
    # join would be used, so that ads with no viewing sessions also get included (the sketch
    # after this function shows that variant); the join here is inner.
    overlapDf = adWithWatermark.join(
        sessionWithWatermark,
        (sessionWithWatermark.session_channel_id == adWithWatermark.ad_channel_id) &
        (sessionWithWatermark.session_start_ts <= adWithWatermark.ad_end_ts) &
        (sessionWithWatermark.session_end_ts >= adWithWatermark.ad_start_ts),
        "inner")

    overlapDf = overlapDf.withColumn("over_start_ts", greatest(overlapDf.session_start_ts, overlapDf.ad_start_ts)) \
                         .withColumn("over_end_ts", least(overlapDf.session_end_ts, overlapDf.ad_end_ts))

    # NB: With the left outer variant, the outer NULL results (for Ad ID 18) were missing from
    # this output. From the docs, they are emitted only after a delay, once the next batch of
    # stream data arrives and advances the watermark.
    #
    # +-----+-------------+-------------------+-------------------+-------+---------+------------------+-------------------+-------------------+-------------------+-------------------+
    # |ad_id|ad_channel_id|        ad_start_ts|          ad_end_ts|user_id|device_id|session_channel_id|   session_start_ts|     session_end_ts|      over_start_ts|        over_end_ts|
    # +-----+-------------+-------------------+-------------------+-------+---------+------------------+-------------------+-------------------+-------------------+-------------------+
    # |   17|           57|2021-02-01 07:16:24|2021-02-01 07:43:51|     45|       14|                57|2021-02-01 07:12:35|2021-02-01 08:19:35|2021-02-01 07:16:24|2021-02-01 07:43:51|
    # |   17|           57|2021-02-02 07:22:27|2021-02-02 07:24:38|     45|       15|                57|2021-02-02 07:19:35|2021-02-02 07:57:35|2021-02-02 07:22:27|2021-02-02 07:24:38|
    # |   13|           58|2021-02-01 08:19:26|2021-02-01 08:43:18|     46|       17|                58|2021-02-01 08:32:51|2021-02-01 09:20:16|2021-02-01 08:32:51|2021-02-01 08:43:18|
    # |   14|           58|2021-02-01 09:07:03|2021-02-01 09:24:41|     46|       17|                58|2021-02-01 08:32:51|2021-02-01 09:20:16|2021-02-01 09:07:03|2021-02-01 09:20:16|
    # |   13|           58|2021-02-01 08:19:26|2021-02-01 08:43:18|     46|       16|                58|2021-02-01 07:00:35|2021-02-01 09:05:17|2021-02-01 08:19:26|2021-02-01 08:43:18|
    # +-----+-------------+-------------------+-------------------+-------+---------+------------------+-------------------+-------------------+-------------------+-------------------+

    # overlapDf.printSchema()

    # Use only the relevant columns and rename them as needed
    overlapDf = overlapDf.select("ad_id",
                                 col("ad_channel_id").alias("channel_id"),
                                 "user_id", "device_id",
                                 col("over_start_ts").alias("start_ts"),
                                 col("over_end_ts").alias("end_ts"))
    util.showStream(overlapDf)
    return overlapDf
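# For reference, a hedged sketch of the left-outer variant discussed in the comments above.
# For outer stream-stream joins, Spark requires watermarks on both inputs plus event-time
# constraints in the join condition, and NULL-padded rows for unmatched ads are emitted only
# after the watermark has passed (i.e. once later stream data arrives). This sketch keeps the
# same overlap condition as the inner join; Spark may additionally require a tighter explicit
# time-range bound for outer-join state cleanup. It is a sketch of the documented pattern,
# not the repository's code.
from pyspark.sql.functions import expr

def getAdOverlapLeftOuterSketch(adWithWatermark, sessionWithWatermark):
    # Both inputs are assumed to already carry watermarks, as in getAdOverlap above.
    return adWithWatermark.join(
        sessionWithWatermark,
        expr("""
            session_channel_id = ad_channel_id AND
            session_start_ts <= ad_end_ts AND
            session_end_ts >= ad_start_ts
        """),
        "leftOuter")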
def readKafkaAction(spark, brokers, topic, offset):
    schema = StructType() \
        .add("user", StringType()) \
        .add("user_id", IntegerType()) \
        .add("channel_id", IntegerType()) \
        .add("device_id", IntegerType()) \
        .add("action", IntegerType()) \
        .add("action_ts", TimestampType())

    actionDf = util.readKafkaJson(spark, brokers, topic, schema, offset=offset)
    util.showStream(actionDf)
    return actionDf
def readKafkaStream(spark, brokers, topic, offset):
    schema = StructType() \
        .add("ID", IntegerType()) \
        .add("user_name", StringType()) \
        .add("age", IntegerType()) \
        .add("gender", StringType())

    userDf = util.readKafkaJson(spark, brokers, topic, schema, offset=offset)

    userDf = userDf.withColumnRenamed("ID", "user_id") \
                   .withColumnRenamed("user_name", "name")

    # Use just the relevant fields
    userDf = userDf.select("user_id", "name", "age", "gender")
    util.showStream(userDf)
    return userDf
def readFileStream(spark, dataDir):
    userSchema = StructType() \
        .add("user_id", IntegerType()) \
        .add("name", StringType()) \
        .add("age", IntegerType()) \
        .add("gender", StringType())

    inputPath = dataDir + "/user*.json"
    userDf = util.getFileStream(spark, userSchema, inputPath)
    util.showStream(userDf)
    # +-------+-------+---+------+
    # |user_id|   name|age|gender|
    # +-------+-------+---+------+
    # |     45|  ketan| 55|     M|
    # |     46| vihaan| 18|     M|
    # |     47|meghana| 51|     F|
    # +-------+-------+---+------+
    return userDf
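# util.getFileStream is defined in the repo's util module; a plausible hedged sketch, assuming
# it simply wraps Spark's streaming JSON file source with the supplied schema (the helper body
# and the maxFilesPerTrigger setting are assumptions, not the actual implementation):
def getFileStreamSketch(spark, schema, inputPath):
    # A schema is mandatory for streaming file sources; read matching JSON files as they appear.
    return spark.readStream \
        .schema(schema) \
        .option("maxFilesPerTrigger", 1) \
        .json(inputPath)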
def readKafkaStream(spark, brokers, topic, offset):
    schema = StructType() \
        .add("user_id", IntegerType()) \
        .add("device_id", IntegerType()) \
        .add("channel_id", IntegerType()) \
        .add("start", StringType()) \
        .add("end", StringType())

    # {"user_id":45,"device_id":15,"channel_id":57,"start":"2021-02-02T07:19:35.000Z","end":"2021-02-02T07:57:35.000Z"}
    # {"user_id":45,"device_id":14,"channel_id":57,"start":"2021-02-01T07:12:35.000Z","end":"2021-02-01T08:19:35.000Z"}
    # {"user_id":46,"device_id":16,"channel_id":58,"start":"2021-02-01T07:00:35.000Z","end":"2021-02-01T09:05:17.000Z"}
    # {"user_id":46,"device_id":17,"channel_id":58,"start":"2021-02-01T08:32:51.000Z","end":"2021-02-01T09:20:16.000Z"}
    # {"user_id":46,"device_id":17,"channel_id":57,"start":"2021-02-01T09:20:16.000Z","end":"2021-02-01T09:36:56.000Z"}
    sessionDf = util.readKafkaJson(spark, brokers, topic, schema, offset=offset)

    # Convert the ISO-8601 string columns 'start'/'end' into proper timestamp columns.
    fmt = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
    sessionDf = sessionDf.transform(
        partial(util.StrToTimestamp, strColName="start", tsColName="start_ts", fmt=fmt))
    sessionDf = sessionDf.transform(
        partial(util.StrToTimestamp, strColName="end", tsColName="end_ts", fmt=fmt))

    # Use just the relevant fields
    sessionDf = sessionDf.select("user_id", "device_id", "channel_id", "start_ts", "end_ts")
    util.showStream(sessionDf)
    return sessionDf
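# util.StrToTimestamp is a repo helper invoked via DataFrame.transform/functools.partial above,
# so its first argument must be the DataFrame itself. A hedged sketch of what it presumably
# does, namely parse a string column into a new timestamp column with the given pattern (the
# helper body is an assumption; to_timestamp is standard Spark):
from pyspark.sql.functions import to_timestamp

def strToTimestampSketch(df, strColName, tsColName, fmt):
    # Add a parsed timestamp column alongside the original string column.
    return df.withColumn(tsColName, to_timestamp(df[strColName], fmt))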
def readKafkaStream(spark, brokers, topic, offset):
    schema = StructType() \
        .add("ad_id", IntegerType()) \
        .add("channel_id", IntegerType()) \
        .add("start_ts", TimestampType()) \
        .add("duration_secs", IntegerType())

    # {"ad_id":13,"channel_id":57,"start_ts":1612224166,"duration_secs":5}
    # {"ad_id":16,"channel_id":57,"start_ts":1612223933,"duration_secs":16}
    # {"ad_id":15,"channel_id":57,"start_ts":1612224213,"duration_secs":7}
    # {"ad_id":15,"channel_id":57,"start_ts":1612224269,"duration_secs":8}
    # {"ad_id":16,"channel_id":57,"start_ts":1612224227,"duration_secs":17}
    adDf = util.readKafkaJson(spark, brokers, topic, schema, offset=offset)

    # Get 'end_ts' by adding 'duration_secs' to 'start_ts'. Since 'start_ts' is a timestamp
    # column, cast it to integer (epoch seconds), add the duration, and cast back to timestamp.
    adDf = adDf.withColumn("end_ts", (col("start_ts").cast("integer") + col("duration_secs")).cast("timestamp"))

    # Use just the relevant fields
    adDf = adDf.select("ad_id", "channel_id", "start_ts", "end_ts")
    util.showStream(adDf)
    return adDf