Code Example #1
def on_different_type_names(match, state):
    """It happens when there isn't TypeObject and type names are different."""
    topic = get_topic_name(match[0], state)
    type1 = get_type_name(match[1], state)
    type2 = get_type_name(match[2], state)
    log_error(
        "[LP-18] Cannot match remote entity in topic '%s': " % (topic) +
        "Different type names found ('%s', '%s')" % (type1, type2), state)
Code Example #2
def run_kafka_server():
    f_name = config.INPUT_FILE_NAME
    input_file = utils.prepare_input_file(f_name)
    topic_name = utils.get_topic_name(f_name)

    producer = producer_server.ProducerServer(
        input_file=input_file,
        topic=topic_name,
        bootstrap_servers=config.BOOTSTRAP_SERVERS)
    return producer
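
Examples #2, #9, and #10 all derive the Kafka topic from the input file name
via utils.get_topic_name. A minimal sketch of such a helper, assuming the
topic is simply the file's base name without its extension (an assumption for
illustration, not the project's actual code):

from pathlib import Path

def get_topic_name(file_name):
    # Assumed mapping: "data/police-calls.json" -> "police-calls"
    return Path(file_name).stem

The producer returned by run_kafka_server would then be started by the caller,
for example through a hypothetical producer.generate_data() method that reads
the input file and sends each record to the topic.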
Code Example #3
def on_duplicate_topic_name_error(match, state):
    """It happens when there is a topic name duplication."""
    topic = get_topic_name(match[0], state)
    log_error("[LP-2] Topic name already in use by another topic: %s" % topic,
              state)
Code Example #4
def on_delete_reader(match, state):
    """It happens for deleted DataReaders."""
    topic = get_topic_name(match[0], state)
    log_event("Deleted reader for topic '%s'" % topic, state)
Code Example #5
def on_create_reader(match, state):
    """It happens for new DataReader."""
    topic = get_topic_name(match[0], state)
    log_event("Created reader for topic '%s'" % topic, state)
Code Example #6
def on_create_writer(match, state):
    """It happens for new DataWriters."""
    topic = get_topic_name(match[0], state)
    log_event("Created writer for topic '%s'" % topic, state)
Code Example #7
def on_delete_topic(match, state):
    """It happens for deleted topics."""
    topic = get_topic_name(match[0], state)
    typ = get_type_name(match[1], state)
    log_event("Deleted topic, name: '%s', type: '%s'" % (topic, typ), state, 1)
Code Example #8
def on_create_cft(match, state):
    """It happens for new CFT."""
    topic = get_topic_name(match[0], state)
    log_event("Created ContentFilteredTopic, name: '%s'" % topic, state)
Code Example #9
def run_spark_job(spark):
    # Create the Kafka source; maxOffsetsPerTrigger caps each micro-batch at
    # 100 offsets, and the bootstrap server comes from the project config.
    # (psf is pyspark.sql.functions; schema, radio_code_schema and
    # udf_to_timestamp are assumed to be defined at module level.)
    topic_name = utils.get_topic_name(config.INPUT_FILE_NAME)
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", config.BOOTSTRAP_SERVERS) \
        .option("subscribe", topic_name) \
        .option("startingOffsets", "earliest") \
        .option("maxRatePerPartition", 100) \
        .option("maxOffsetsPerTrigger", 100) \
        .option("stopGracefullyOnShutdown", "true") \
        .load()

    spark.udf.register("udf_to_timestamp", udf_to_timestamp)

    # Print the schema of the incoming Kafka records as a sanity check
    df.printSchema()

    # Keep only the message value and cast it to a string
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_DF")) \
        .select("SERVICE_DF.*")

    distinct_table = service_table \
        .select(
            udf_to_timestamp(psf.col("call_date_time")).alias("call_date_time"),
            psf.col("original_crime_type_name"),
            psf.col("disposition")
        ).distinct()

    # Count each original crime type over a sliding 60-minute window
    # that advances in 10-minute steps
    agg_df = distinct_table \
        .withWatermark("call_date_time", "60 minutes") \
        .groupBy(
            psf.window(psf.col("call_date_time"), "60 minutes", "10 minutes"),
            psf.col("original_crime_type_name"),
            psf.col("disposition")
        ) \
        .count() \
        .orderBy("count", ascending=False)

    query = agg_df \
        .writeStream \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .queryName("Query 1 - Aggregate query") \
        .start()

    # Awaiting here would block forever and prevent the join query below from
    # ever starting, so termination is awaited once at the end instead.

    radio_code_json_filepath = f"{Path(__file__).parents[0]}/radio_code.json"
    radio_code_df = spark \
        .read \
        .option("multiline", "true") \
        .schema(radio_code_schema) \
        .json(radio_code_json_filepath)

    # Rename the radio-code column so radio_code_df and agg_df share the
    # 'disposition' column used as the join key

    radio_code_df = radio_code_df.withColumnRenamed("disposition_code",
                                                    "disposition")

    join_query = agg_df \
        .join(radio_code_df, "disposition") \
        .writeStream \
        .format("console") \
        .outputMode("complete") \
        .option("truncate", "false") \
        .queryName("Query 2 - Join query") \
        .start()

    # Block until either of the two streaming queries terminates
    spark.streams.awaitAnyTermination()
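
A minimal driver sketch for this job, assuming local execution; the master
setting and application name are illustrative, not taken from the original
project:

from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Local development session; master and appName are placeholder values
    spark = SparkSession \
        .builder \
        .master("local[*]") \
        .appName("KafkaSparkStructuredStreaming") \
        .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    run_spark_job(spark)

    spark.stop()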
Code Example #10
import json

from kafka import KafkaConsumer

import utils
import config

if __name__ == "__main__":
    topic_name = utils.get_topic_name(config.INPUT_FILE_NAME)
    consumer = KafkaConsumer(
        topic_name,
        bootstrap_servers=config.BOOTSTRAP_SERVERS,
        group_id="0",
        auto_offset_reset="earliest",
        value_deserializer=lambda x: json.loads(x.decode('utf-8')))
    for message in consumer:
        print(
            f"Consumed message: topic= {message.topic}, key={message.key} value={message.value}"
        )
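
All three Kafka examples (#2, #9, and #10) read BOOTSTRAP_SERVERS and
INPUT_FILE_NAME from a shared config module. A minimal sketch of what such a
module might contain; both values are placeholders for local development, not
the project's real settings:

# config.py (hypothetical values)
BOOTSTRAP_SERVERS = "localhost:9092"
INPUT_FILE_NAME = "police-department-calls-for-service.json"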