Code Example #1
from pyspark.sql.types import StructType, StringType, IntegerType

def doKafkaTest(spark, dataDir):
    # {"id":1,"firstname":"James ","middlename":"","lastname":"Smith","dob_year":2018,"dob_month":1,"gender":"M","salary":3000}
    schema = StructType() \
            .add("id", IntegerType()) \
            .add("firstname", StringType()) \
            .add("middlename", StringType()) \
            .add("lastname", StringType()) \
            .add("dob_year", IntegerType()) \
            .add("dob_month", IntegerType()) \
            .add("gender", StringType()) \
            .add("salary", IntegerType())

    brokers = "kafka.kd-confluent.svc.cluster.local:9071"
    topic = "json_spark"

    # +---+---------+----------+--------+--------+---------+------+------+---------+--------------------+
    # | id|firstname|middlename|lastname|dob_year|dob_month|gender|salary|msgoffset|             msgtime|
    # +---+---------+----------+--------+--------+---------+------+------+---------+--------------------+
    # |  1|   James |          |   Smith|    2018|        1|     M|  3000|        0|2021-01-22 05:53:...|
    # |  2| Michael |      Rose|        |    2010|        3|     M|  4000|        1|2021-01-22 05:53:...|
    # |  3|  Robert |          |Williams|    2010|        3|     M|  4000|        2|2021-01-22 07:32:...|
    # +---+---------+----------+--------+--------+---------+------+------+---------+--------------------+

    kdf = util.readKafkaJson(spark, brokers, topic, schema, offset=0)
    util.showStream(kdf)
    util.writeKafkaJson(brokers, topic, kdf, "id",
                        dataDir + "/checkpoint_kafka")
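
# util is the project's helper module and its source is not shown on this page.
# The following is a minimal sketch of what util.readKafkaJson might look like,
# reconstructed from its call sites and from the msgoffset/msgtime columns in
# the expected output above. Treat it as an assumption, not the project's code.
from pyspark.sql.functions import col, from_json

def readKafkaJson(spark, brokers, topic, schema, offset=0):
    # Subscribe to the topic, reading from the requested offset
    # (assuming a single partition 0 here)
    rawDf = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .option("startingOffsets", '{"%s":{"0":%d}}' % (topic, offset)) \
        .load()

    # Parse the JSON payload and keep the Kafka offset/timestamp metadata
    return rawDf \
        .select(from_json(col("value").cast("string"), schema).alias("data"),
                col("offset").alias("msgoffset"),
                col("timestamp").alias("msgtime")) \
        .select("data.*", "msgoffset", "msgtime")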
Code Example #2
def getShowDemographic(showByUserDf, userDf):

    # Dataframes on both sides of the join need an event-time field with a watermark,
    # so that late-arriving data can be discarded and the join state does not grow
    # (and wait) unboundedly.
    # The user Dataframe has no time column; the commented-out workaround below
    # adds the current time as an artificial column.
    showByUserDf = showByUserDf.withWatermark("start_ts", "1 minutes")
    # userDf = userDf \
    #             .withColumn("current_timestamp", current_timestamp()) \
    #             .withWatermark("current_timestamp", "1 minutes")

    showByUserDf = showByUserDf.withColumnRenamed("user_id", "show_user_id")
    joinDf = showByUserDf.join(userDf,
                               (showByUserDf.show_user_id == userDf.user_id),
                               "inner")
    joinDf = joinDf.select("program_id", "channel_id", "program", "channel",
                           "user_id", "device_id", "name", "age", "gender",
                           "start_ts", "end_ts")

    # +----------+----------+-------------+-------+-------+---------+------+---+------+-------------------+-------------------+
    # |program_id|channel_id|      program|channel|user_id|device_id|  name|age|gender|           start_ts|             end_ts|
    # +----------+----------+-------------+-------+-------+---------+------+---+------+-------------------+-------------------+
    # |        13|        57|Living Planet|    BBC|     45|       14| ketan| 55|     M|2021-02-01 07:45:00|2021-02-01 08:19:35|
    # |        12|        57|Animal Planet|    BBC|     45|       14| ketan| 55|     M|2021-02-01 07:12:35|2021-02-01 07:45:00|
    # |        14|        57| Blue Kingdom|    BBC|     45|       15| ketan| 55|     M|2021-02-02 07:19:35|2021-02-02 07:57:35|
    # |        13|        57|Living Planet|    BBC|     46|       17|vihaan| 18|     M|2021-02-01 09:20:16|2021-02-01 09:36:56|
    # |        14|        58|         Dune|    HBO|     46|       17|vihaan| 18|     M|2021-02-01 09:10:00|2021-02-01 09:20:16|
    # |        13|        58| Wonder Woman|    HBO|     46|       17|vihaan| 18|     M|2021-02-01 08:32:51|2021-02-01 09:10:00|
    # |        13|        58| Wonder Woman|    HBO|     46|       16|vihaan| 18|     M|2021-02-01 07:30:00|2021-02-01 09:05:17|
    # |        12|        58|    Westworld|    HBO|     46|       16|vihaan| 18|     M|2021-02-01 07:00:35|2021-02-01 07:30:00|
    # +----------+----------+-------------+-------+-------+---------+------+---+------+-------------------+-------------------+

    util.showStream(joinDf)
    return joinDf
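
# util.showStream is used throughout these examples to dump a streaming
# Dataframe for debugging. A plausible minimal version (an assumption, since
# the helper's source is not shown here) writes each micro-batch to the
# console sink, which is how the tables in the comments were captured.
def showStream(df):
    df.writeStream \
      .format("console") \
      .outputMode("append") \
      .option("truncate", False) \
      .start()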
Code Example #3
def getAdDemographic(adByUserDf, userDf):

    # Dataframes on both sides of the join need an event-time field with a watermark,
    # so that late-arriving data can be discarded and the join state does not grow
    # (and wait) unboundedly.
    # The user Dataframe has no time column; the commented-out workaround below
    # adds the current time as an artificial column.
    adByUserDf = adByUserDf.withWatermark("start_ts", "1 minutes")
    # userDf = userDf \
    #             .withColumn("current_timestamp", current_timestamp()) \
    #             .withWatermark("current_timestamp", "1 minutes")

    adByUserDf = adByUserDf.withColumnRenamed("user_id", "ad_user_id")
    joinDf = adByUserDf.join(userDf, (adByUserDf.ad_user_id == userDf.user_id),
                             "inner")

    # +-----+----------+-------+---------+-------------------+-------------------+-------+------+---+------+--------------------+
    # |ad_id|channel_id|user_id|device_id|           start_ts|             end_ts|user_id|  name|age|gender|   current_timestamp|
    # +-----+----------+-------+---------+-------------------+-------------------+-------+------+---+------+--------------------+
    # |   17|        57|     45|       14|2021-02-01 07:16:24|2021-02-01 07:43:51|     45| ketan| 55|     M|2021-02-25 06:07:...|
    # |   17|        57|     45|       15|2021-02-02 07:22:27|2021-02-02 07:24:38|     45| ketan| 55|     M|2021-02-25 06:07:...|
    # |   13|        58|     46|       17|2021-02-01 08:32:51|2021-02-01 08:43:18|     46|vihaan| 18|     M|2021-02-25 06:07:...|
    # |   14|        58|     46|       17|2021-02-01 09:07:03|2021-02-01 09:20:16|     46|vihaan| 18|     M|2021-02-25 06:07:...|
    # |   13|        58|     46|       16|2021-02-01 08:19:26|2021-02-01 08:43:18|     46|vihaan| 18|     M|2021-02-25 06:07:...|
    # +-----+----------+-------+---------+-------------------+-------------------+-------+------+---+------+--------------------+

    # Use only the relevant fields. (The debug output above appears to have been
    # captured from an earlier run, before the 'user_id' rename and with the
    # commented-out current_timestamp variant.)
    joinDf = joinDf.select("ad_id", "channel_id", "user_id", "device_id",
                           "name", "age", "gender", "start_ts", "end_ts")
    util.showStream(joinDf)
    return joinDf
Code Example #4
from pyspark.sql.functions import col, greatest, least

def getShowOverlap(showDf, sessionDf):
    # Alternate syntax for the join below, expressed as a SQL string:
    #
    # overlapDf = sessionDf.join(showDf,
    #     expr("""
    #         show_channel_id = channel_id AND
    #         start_ts <= show_end_ts AND
    #         end_ts >= show_start_ts
    #       """
    #     ))

    # Rename columns that have the same name in both Dataframes
    showDf = showDf.withColumnRenamed("channel_id", "show_channel_id") \
               .withColumnRenamed("start_ts", "show_start_ts") \
               .withColumnRenamed("end_ts", "show_end_ts")

    # Rename columns that have the same name in both Dataframes
    sessionDf = sessionDf.withColumnRenamed("channel_id", "session_channel_id") \
                         .withColumnRenamed("start_ts", "session_start_ts") \
                         .withColumnRenamed("end_ts", "session_end_ts")

    # Join the two Dataframes on the Channel, such that the Session watching
    # start/end times overlap with the Show start/end times. This is a static-stream
    # join. We would like a Left Outer join, so that Shows with no watching
    # Sessions also get included, but an outer join that preserves the static side
    # is not supported.
    overlapDf = sessionDf.join(
        showDf, (sessionDf.session_channel_id == showDf.show_channel_id) &
        (sessionDf.session_start_ts <= showDf.show_end_ts) &
        (sessionDf.session_end_ts >= showDf.show_start_ts), "inner")

    # From the overlap, compute the start and end time that the user watched each show.
    overlapDf = overlapDf.withColumn("over_start_ts", greatest(overlapDf.session_start_ts, overlapDf.show_start_ts)) \
                         .withColumn("over_end_ts", least(overlapDf.session_end_ts, overlapDf.show_end_ts))

    # +-------+---------+------------------+-------------------+-------------------+----------+-------------+---------------+-------+----------+-------------------+-------------------+-------------------+-------------------+
    # |user_id|device_id|session_channel_id|   session_start_ts|     session_end_ts|program_id|      program|show_channel_id|channel|  day_date|      show_start_ts|        show_end_ts|      over_start_ts|        over_end_ts|
    # +-------+---------+------------------+-------------------+-------------------+----------+-------------+---------------+-------+----------+-------------------+-------------------+-------------------+-------------------+
    # |     46|       17|                57|2021-02-01 09:20:16|2021-02-01 09:36:56|        13|Living Planet|             57|    BBC|2021-02-01|2021-02-01 07:45:00|2021-02-02 00:00:00|2021-02-01 09:20:16|2021-02-01 09:36:56|
    # |     46|       17|                58|2021-02-01 08:32:51|2021-02-01 09:20:16|        14|         Dune|             58|    HBO|2021-02-01|2021-02-01 09:10:00|2021-02-02 00:00:00|2021-02-01 09:10:00|2021-02-01 09:20:16|
    # |     46|       17|                58|2021-02-01 08:32:51|2021-02-01 09:20:16|        13| Wonder Woman|             58|    HBO|2021-02-01|2021-02-01 07:30:00|2021-02-01 09:10:00|2021-02-01 08:32:51|2021-02-01 09:10:00|
    # |     46|       16|                58|2021-02-01 07:00:35|2021-02-01 09:05:17|        13| Wonder Woman|             58|    HBO|2021-02-01|2021-02-01 07:30:00|2021-02-01 09:10:00|2021-02-01 07:30:00|2021-02-01 09:05:17|
    # |     46|       16|                58|2021-02-01 07:00:35|2021-02-01 09:05:17|        12|    Westworld|             58|    HBO|2021-02-01|2021-02-01 07:00:00|2021-02-01 07:30:00|2021-02-01 07:00:35|2021-02-01 07:30:00|
    # |     45|       14|                57|2021-02-01 07:12:35|2021-02-01 08:19:35|        13|Living Planet|             57|    BBC|2021-02-01|2021-02-01 07:45:00|2021-02-02 00:00:00|2021-02-01 07:45:00|2021-02-01 08:19:35|
    # |     45|       14|                57|2021-02-01 07:12:35|2021-02-01 08:19:35|        12|Animal Planet|             57|    BBC|2021-02-01|2021-02-01 07:00:00|2021-02-01 07:45:00|2021-02-01 07:12:35|2021-02-01 07:45:00|
    # |     45|       15|                57|2021-02-02 07:19:35|2021-02-02 07:57:35|        14| Blue Kingdom|             57|    BBC|2021-02-02|2021-02-02 07:00:00|2021-02-02 08:30:00|2021-02-02 07:19:35|2021-02-02 07:57:35|
    # +-------+---------+------------------+-------------------+-------------------+----------+-------------+---------------+-------+----------+-------------------+-------------------+-------------------+-------------------+

    # overlapDf.printSchema()

    # Use only the relevant columns and rename them as needed
    overlapDf = overlapDf.select("program_id", "program",
                                 col("show_channel_id").alias("channel_id"),
                                 "channel", "user_id", "device_id",
                                 col("over_start_ts").alias("start_ts"),
                                 col("over_end_ts").alias("end_ts"))

    util.showStream(overlapDf)
    return overlapDf
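
# The join condition above is the standard interval-overlap test: ranges
# [s1, e1] and [s2, e2] overlap iff s1 <= e2 and e1 >= s2, and the overlap is
# [max(s1, s2), min(e1, e2)] -- which is what greatest()/least() compute.
# A quick plain-Python check against the first output row above:
from datetime import datetime

session = (datetime(2021, 2, 1, 9, 20, 16), datetime(2021, 2, 1, 9, 36, 56))
show = (datetime(2021, 2, 1, 7, 45, 0), datetime(2021, 2, 2, 0, 0, 0))

assert session[0] <= show[1] and session[1] >= show[0]  # the intervals overlap
over_start = max(session[0], show[0])   # 2021-02-01 09:20:16
over_end = min(session[1], show[1])     # 2021-02-01 09:36:56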
Code Example #5
def main(spark, dataDir, fromKafka=False, toKafka=False):
    # brokers and the *Topic variables used below are module-level constants
    sessionDf = session.doSession(spark,
                                  dataDir,
                                  brokers,
                                  sessionTopic,
                                  offset=0,
                                  fromKafka=fromKafka)
    channelDf = channel.doChannel(spark)

    # Process Shows and Ads
    showDf = show.doShow(channelDf)
    adDf = ad.doAd(spark,
                   dataDir,
                   brokers,
                   adTopic,
                   offset=0,
                   fromKafka=fromKafka)

    # Get user demographics and device locations
    userDf = user.doUser(spark,
                         dataDir,
                         brokers,
                         userTopic,
                         offset=0,
                         fromKafka=fromKafka)
    deviceLocDf = device_loc.doDeviceLoc(spark,
                                         dataDir,
                                         brokers,
                                         deviceLocTopic,
                                         offset=10,
                                         fromKafka=fromKafka)

    # Get Show by User Sessions, enriched with Demographic and Device Location
    showDemographicLocDf = show.doShowDemographicLoc(showDf, sessionDf, userDf,
                                                     deviceLocDf)
    util.showStream(showDemographicLocDf)

    # Get Ad by User Sessions, enriched with Demographic and Device Location
    adDemographicLocDf = ad.doAdDemographicLoc(adDf, sessionDf, userDf,
                                               deviceLocDf)
    util.showStream(adDemographicLocDf)

    if toKafka:
        util.writeKafkaJson(brokers, showOutTopic, showDemographicLocDf, None,
                            dataDir + "/checkpoint_show")
        util.writeKafkaJson(brokers, adOutTopic, adDemographicLocDf, None,
                            dataDir + "/checkpoint_ad")

    # Obsolete
    if False:
        # Save intermediate state to Kafka as a shortcut for use by downstream processing
        # saveShowDemographicKafka(showDemographicDf, dataDir)
        # saveAdDemographicKafka(adDemographicDf, dataDir)
        # showDemographicDf = loadShowDemographicKafka()
        # util.showStream(showDemographicDf)
        pass
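
# util.writeKafkaJson is called both here and in doKafkaTest with a broker
# list, topic, Dataframe, optional key column, and checkpoint directory. A
# hedged sketch of what such a helper could look like (an assumption; the
# project's real implementation may differ):
from pyspark.sql.functions import col, struct, to_json

def writeKafkaJson(brokers, topic, df, keyCol, checkpointDir):
    # Serialize each row to a JSON 'value'; use keyCol as the Kafka key if given
    if keyCol is not None:
        outDf = df.select(col(keyCol).cast("string").alias("key"),
                          to_json(struct("*")).alias("value"))
    else:
        outDf = df.select(to_json(struct("*")).alias("value"))

    outDf.writeStream \
         .format("kafka") \
         .option("kafka.bootstrap.servers", brokers) \
         .option("topic", topic) \
         .option("checkpointLocation", checkpointDir) \
         .start()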
Code Example #6
from pyspark.sql.functions import col, greatest, least

def getAdOverlap(adDf, sessionDf):
    # Rename columns that have the same name in both Dataframes
    adDf = adDf.withColumnRenamed("channel_id", "ad_channel_id") \
               .withColumnRenamed("start_ts", "ad_start_ts") \
               .withColumnRenamed("end_ts", "ad_end_ts")

    # Rename columns that have the same name in both Dataframes
    sessionDf = sessionDf.withColumnRenamed("channel_id", "session_channel_id") \
                         .withColumnRenamed("start_ts", "session_start_ts") \
                         .withColumnRenamed("end_ts", "session_end_ts")

    # Dataframes on both sides of the join need an event-time field with a watermark,
    # so that late-arriving data can be discarded and the join state does not grow
    # (and wait) unboundedly.
    adWithWatermark = adDf.withWatermark("ad_start_ts", "1 minutes")
    sessionWithWatermark = sessionDf.withWatermark("session_start_ts",
                                                   "1 minutes")

    # Join the two Dataframes based on the Channel, such that the Session watching
    # start/end times overlap with the Ad start/end times. This is a stream-stream
    # join. We use Left Outer join, so that Ads with no viewing Sessions also get
    # included.
    overlapDf = adWithWatermark.join(
        sessionWithWatermark, (sessionWithWatermark.session_channel_id
                               == adWithWatermark.ad_channel_id) &
        (sessionWithWatermark.session_start_ts <= adWithWatermark.ad_end_ts) &
        (sessionWithWatermark.session_end_ts >= adWithWatermark.ad_start_ts),
        "inner")

    overlapDf = overlapDf.withColumn("over_start_ts", greatest(overlapDf.session_start_ts, overlapDf.ad_start_ts)) \
                         .withColumn("over_end_ts", least(overlapDf.session_end_ts, overlapDf.ad_end_ts))

    # NB: The Outer NULL results (for Ad ID 18) should be here but are missing!
    # From the docs, my feeling is that they will be output after a delay, once
    # the next batch of stream data comes in.
    #
    # +-----+-------------+-------------------+-------------------+-------+---------+------------------+-------------------+-------------------+-------------------+-------------------+
    # |ad_id|ad_channel_id|        ad_start_ts|          ad_end_ts|user_id|device_id|session_channel_id|   session_start_ts|     session_end_ts|      over_start_ts|        over_end_ts|
    # +-----+-------------+-------------------+-------------------+-------+---------+------------------+-------------------+-------------------+-------------------+-------------------+
    # |   17|           57|2021-02-01 07:16:24|2021-02-01 07:43:51|     45|       14|                57|2021-02-01 07:12:35|2021-02-01 08:19:35|2021-02-01 07:16:24|2021-02-01 07:43:51|
    # |   17|           57|2021-02-02 07:22:27|2021-02-02 07:24:38|     45|       15|                57|2021-02-02 07:19:35|2021-02-02 07:57:35|2021-02-02 07:22:27|2021-02-02 07:24:38|
    # |   13|           58|2021-02-01 08:19:26|2021-02-01 08:43:18|     46|       17|                58|2021-02-01 08:32:51|2021-02-01 09:20:16|2021-02-01 08:32:51|2021-02-01 08:43:18|
    # |   14|           58|2021-02-01 09:07:03|2021-02-01 09:24:41|     46|       17|                58|2021-02-01 08:32:51|2021-02-01 09:20:16|2021-02-01 09:07:03|2021-02-01 09:20:16|
    # |   13|           58|2021-02-01 08:19:26|2021-02-01 08:43:18|     46|       16|                58|2021-02-01 07:00:35|2021-02-01 09:05:17|2021-02-01 08:19:26|2021-02-01 08:43:18|
    # +-----+-------------+-------------------+-------------------+-------+---------+------------------+-------------------+-------------------+-------------------+-------------------+

    # overlapDf.printSchema()

    # Use only the relevant columns and rename them as needed
    overlapDf = overlapDf.select("ad_id",
                                 col("ad_channel_id").alias("channel_id"),
                                 "user_id", "device_id",
                                 col("over_start_ts").alias("start_ts"),
                                 col("over_end_ts").alias("end_ts"))
    util.showStream(overlapDf)
    return overlapDf
Code Example #7
File: session.py Project: ketanhdoshi/bd
from pyspark.sql.types import StructType, StringType, IntegerType, TimestampType

def readKafkaAction(spark, brokers, topic, offset):
    # Schema of the incoming JSON user-action events
    schema = StructType() \
            .add("user", StringType()) \
            .add("user_id", IntegerType()) \
            .add("channel_id", IntegerType()) \
            .add("device_id", IntegerType()) \
            .add("action", IntegerType()) \
            .add("action_ts", TimestampType())

    actionDf = util.readKafkaJson(spark, brokers, topic, schema, offset=offset)

    util.showStream(actionDf)
    return actionDf
Code Example #8
File: user.py Project: ketanhdoshi/bd
from pyspark.sql.types import StructType, StringType, IntegerType

def readKafkaStream(spark, brokers, topic, offset):
    # Schema of the incoming JSON user records
    schema = StructType() \
            .add("ID", IntegerType()) \
            .add("user_name", StringType()) \
            .add("age", IntegerType()) \
            .add("gender", StringType())

    userDf = util.readKafkaJson(spark, brokers, topic, schema, offset=offset)
    userDf = userDf.withColumnRenamed("ID", "user_id") \
                   .withColumnRenamed("user_name", "name")

    # Use just the relevant fields
    userDf = userDf.select("user_id", "name", "age", "gender")
    util.showStream(userDf)
    return userDf
Code Example #9
File: user.py Project: ketanhdoshi/bd
from pyspark.sql.types import StructType, StringType, IntegerType

def readFileStream(spark, dataDir):
    userSchema = StructType() \
            .add("user_id", IntegerType()) \
            .add("name", StringType()) \
            .add("age", IntegerType()) \
            .add("gender", StringType())

    inputPath = dataDir + "/user*.json"
    userDf = util.getFileStream(spark, userSchema, inputPath)
    util.showStream(userDf)

    # +-------+-------+---+------+
    # |user_id|   name|age|gender|
    # +-------+-------+---+------+
    # |     45|  ketan| 55|     M|
    # |     46| vihaan| 18|     M|
    # |     47|meghana| 51|     F|
    # +-------+-------+---+------+

    return userDf
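
# util.getFileStream reads files as a stream with an explicit schema. A
# plausible one-liner version (assumed, not the project's actual code) for
# the newline-delimited JSON input used here:
def getFileStream(spark, schema, inputPath):
    return spark.readStream.schema(schema).json(inputPath)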
Code Example #10
File: session.py Project: ketanhdoshi/bd
from functools import partial

from pyspark.sql.types import StructType, StringType, IntegerType

def readKafkaStream(spark, brokers, topic, offset):
    # Schema of the incoming JSON session records; start/end arrive as ISO strings
    schema = StructType() \
            .add("user_id", IntegerType()) \
            .add("device_id", IntegerType()) \
            .add("channel_id", IntegerType()) \
            .add("start", StringType()) \
            .add("end", StringType())

    # {"user_id":45,"device_id":15,"channel_id":57,"start":"2021-02-02T07:19:35.000Z","end":"2021-02-02T07:57:35.000Z"}
    # {"user_id":45,"device_id":14,"channel_id":57,"start":"2021-02-01T07:12:35.000Z","end":"2021-02-01T08:19:35.000Z"}
    # {"user_id":46,"device_id":16,"channel_id":58,"start":"2021-02-01T07:00:35.000Z","end":"2021-02-01T09:05:17.000Z"}
    # {"user_id":46,"device_id":17,"channel_id":58,"start":"2021-02-01T08:32:51.000Z","end":"2021-02-01T09:20:16.000Z"}
    # {"user_id":46,"device_id":17,"channel_id":57,"start":"2021-02-01T09:20:16.000Z","end":"2021-02-01T09:36:56.000Z"}

    sessionDf = util.readKafkaJson(spark,
                                   brokers,
                                   topic,
                                   schema,
                                   offset=offset)

    fmt = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
    sessionDf = sessionDf.transform(
        partial(util.StrToTimestamp,
                strColName="start",
                tsColName="start_ts",
                fmt=fmt))
    sessionDf = sessionDf.transform(
        partial(util.StrToTimestamp,
                strColName="end",
                tsColName="end_ts",
                fmt=fmt))

    # Use just the relevant fields
    sessionDf = sessionDf.select("user_id", "device_id", "channel_id",
                                 "start_ts", "end_ts")
    util.showStream(sessionDf)
    return sessionDf
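
# util.StrToTimestamp is applied through transform()/partial() above. A
# minimal sketch consistent with those call sites (an assumption, since the
# helper's source is not shown): parse a string column with the given format
# into a new timestamp column.
from pyspark.sql.functions import col, to_timestamp

def StrToTimestamp(df, strColName, tsColName, fmt):
    return df.withColumn(tsColName, to_timestamp(col(strColName), fmt))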
Code Example #11
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, IntegerType, TimestampType

def readKafkaStream(spark, brokers, topic, offset):
    # Schema of the incoming JSON ad records
    schema = StructType() \
            .add("ad_id", IntegerType()) \
            .add("channel_id", IntegerType()) \
            .add("start_ts", TimestampType()) \
            .add("duration_secs", IntegerType())

    # {"ad_id":13,"channel_id":57,"start_ts":1612224166,"duration_secs":5}
    # {"ad_id":16,"channel_id":57,"start_ts":1612223933,"duration_secs":16}
    # {"ad_id":15,"channel_id":57,"start_ts":1612224213,"duration_secs":7}
    # {"ad_id":15,"channel_id":57,"start_ts":1612224269,"duration_secs":8}
    # {"ad_id":16,"channel_id":57,"start_ts":1612224227,"duration_secs":17}

    adDf = util.readKafkaJson(spark, brokers, topic, schema, offset=offset)
    # Derive 'end_ts' by adding 'duration_secs' to 'start_ts'. Since 'start_ts' is a
    # timestamp column, cast it to integer (epoch seconds), add the duration, and
    # cast the sum back to a timestamp.
    adDf = adDf.withColumn("end_ts", (col("start_ts").cast("integer") +
                                      col("duration_secs")).cast("timestamp"))

    # Use just the relevant fields
    adDf = adDf.select("ad_id", "channel_id", "start_ts", "end_ts")
    util.showStream(adDf)
    return adDf
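
# An equivalent way to derive 'end_ts', using unix_timestamp() to get epoch
# seconds directly instead of the integer cast (a standalone illustration,
# not part of the project code):
from pyspark.sql.functions import col, unix_timestamp

def addEndTs(adDf):
    return adDf.withColumn(
        "end_ts",
        (unix_timestamp(col("start_ts")) + col("duration_secs")).cast("timestamp"))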