def on_different_type_names(match, state):
    """It happens when there isn't a TypeObject and the type names are different."""
    topic = get_topic_name(match[0], state)
    type1 = get_type_name(match[1], state)
    type2 = get_type_name(match[2], state)
    log_error(
        "[LP-18] Cannot match remote entity in topic '%s': " % (topic) +
        "Different type names found ('%s', '%s')" % (type1, type2),
        state)
def run_kafka_server():
    f_name = config.INPUT_FILE_NAME
    input_file = utils.prepare_input_file(f_name)
    topic_name = utils.get_topic_name(f_name)

    producer = producer_server.ProducerServer(
        input_file=input_file,
        topic=topic_name,
        bootstrap_servers=config.BOOTSTRAP_SERVERS)

    return producer
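# Hedged sketch of an entry point for the producer above. It assumes the
# ProducerServer class exposes a generate_data() method that reads the input
# file and publishes each record to the Kafka topic; that method name is an
# assumption and is not shown in this excerpt.
def feed():
    producer = run_kafka_server()
    producer.generate_data()


if __name__ == "__main__":
    feed()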
def on_duplicate_topic_name_error(match, state):
    """It happens when there is a topic name duplication."""
    topic = get_topic_name(match[0], state)
    log_error("[LP-2] Topic name already in use by another topic: %s" % topic,
              state)
def on_delete_reader(match, state):
    """It happens for deleted DataReaders."""
    topic = get_topic_name(match[0], state)
    log_event("Deleted reader for topic '%s'" % topic, state)
def on_create_reader(match, state):
    """It happens for new DataReaders."""
    topic = get_topic_name(match[0], state)
    log_event("Created reader for topic '%s'" % topic, state)
def on_create_writer(match, state):
    """It happens for new DataWriters."""
    topic = get_topic_name(match[0], state)
    log_event("Created writer for topic '%s'" % topic, state)
def on_delete_topic(match, state):
    """It happens for deleted topics."""
    topic = get_topic_name(match[0], state)
    typ = get_type_name(match[1], state)
    log_event("Deleted topic, name: '%s', type: '%s'" % (topic, typ),
              state, 1)
def on_create_cft(match, state):
    """It happens for new CFT."""
    topic = get_topic_name(match[0], state)
    log_event("Created ContentFilteredTopic, name: '%s'" % topic, state)
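# Note (an assumption based on how these functions are used, not stated in this
# excerpt): the on_* handlers above appear to be callbacks for a DDS log parser.
# Each receives 'match', the regex capture groups extracted from a matched log
# line, and 'state', a shared parser-state dict consumed by the helpers
# get_topic_name, get_type_name, log_event and log_error. The "[LP-n]" prefixes
# tag the specific log-parser warnings and errors being reported.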
def run_spark_job(spark):
    # Read from Kafka with the correct bootstrap server and port,
    # limiting each trigger to 100 offsets
    topic_name = utils.get_topic_name(config.INPUT_FILE_NAME)

    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", config.BOOTSTRAP_SERVERS) \
        .option("subscribe", topic_name) \
        .option("startingOffsets", "earliest") \
        .option("maxRatePerPartition", 100) \
        .option("maxOffsetsPerTrigger", 100) \
        .option("stopGracefullyOnShutdown", "true") \
        .load()

    spark.udf.register("udf_to_timestamp", udf_to_timestamp)

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Take only the value and convert it to a string
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_DF")) \
        .select("SERVICE_DF.*")

    distinct_table = service_table \
        .select(
            udf_to_timestamp(psf.col("call_date_time")).alias("call_date_time"),
            psf.col("original_crime_type_name"),
            psf.col("disposition")
        ).distinct()

    # Count the occurrences of each original crime type per window
    agg_df = distinct_table \
        .withWatermark("call_date_time", "60 minutes") \
        .groupBy(
            psf.window(psf.col("call_date_time"), "60 minutes", "10 minutes"),
            psf.col("original_crime_type_name"),
            psf.col("disposition")
        ) \
        .count() \
        .orderBy("count", ascending=False)

    query = agg_df \
        .writeStream \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .queryName("Query 1 - Aggregate query") \
        .start()

    radio_code_json_filepath = f"{Path(__file__).parents[0]}/radio_code.json"
    radio_code_df = spark \
        .read \
        .option("multiline", "true") \
        .schema(radio_code_schema) \
        .json(radio_code_json_filepath)

    # Clean up the data so that the column names match on radio_code_df and
    # agg_df; we want to join on the disposition code
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    join_query = agg_df \
        .join(radio_code_df, "disposition") \
        .writeStream \
        .format("console") \
        .outputMode("complete") \
        .option("truncate", "false") \
        .queryName("Query 2 - Join query") \
        .start()

    # Wait on both queries only after both have started; awaiting the first
    # query before starting the second would block and the join would never run
    query.awaitTermination()
    join_query.awaitTermination()
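# Hedged sketch of a main entry point that builds the SparkSession and hands it
# to run_spark_job(). The master URL and application name are assumptions for
# illustration, not values taken from this excerpt.
import logging

from pyspark.sql import SparkSession

if __name__ == "__main__":
    logger = logging.getLogger(__name__)

    spark = SparkSession \
        .builder \
        .master("local[*]") \
        .appName("KafkaSparkStructuredStreaming") \
        .getOrCreate()

    logger.info("Spark started")

    run_spark_job(spark)

    spark.stop()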
import json

from kafka import KafkaConsumer

import utils
import config


if __name__ == "__main__":
    topic_name = utils.get_topic_name(config.INPUT_FILE_NAME)

    consumer = KafkaConsumer(
        topic_name,
        bootstrap_servers=config.BOOTSTRAP_SERVERS,
        group_id="0",
        auto_offset_reset="earliest",
        value_deserializer=lambda x: json.loads(x.decode('utf-8')))

    for message in consumer:
        print(
            f"Consumed message: topic={message.topic}, key={message.key}, value={message.value}"
        )