def main(): """main """ spark = SparkSession \ .builder \ .appName("ExtractEventsJob") \ .enableHiveSupport() \ .getOrCreate() raw_events = spark \ .readStream \ .format("kafka") \ .option("kafka.bootstrap.servers", "kafka:29092") \ .option("subscribe", "events") \ .load() sword_purchases = raw_events \ .filter(is_sword_purchase(raw_events.value.cast('string'))) \ .select(raw_events.value.cast('string').alias('raw_event'), raw_events.timestamp.cast('string'), from_json(raw_events.value.cast('string'), purchase_sword_event_schema()).alias('json')) \ .select('raw_event', 'timestamp', 'json.*') spark.sql("drop table if exists sword_purchases") sql_string = """ create external table if not exists sword_purchases ( raw_event string, timestamp string, Accept string, Host string, `User-Agent` string, event_type string ) stored as parquet location '/tmp/sword_purchases' tblproperties ("parquet.compress"="SNAPPY") """ spark.sql(sql_string) sink = sword_purchases \ .writeStream \ .format("parquet") \ .option("checkpointLocation", "/tmp/checkpoints_for_sword_purchases") \ .option("path", "/tmp/sword_purchases") \ .trigger(processingTime="10 seconds") \ .start() sink.awaitTermination()
def run_spark_job(spark):
    spark.sparkContext.setLogLevel("WARN")

    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers",
                "kafka01-vn00c1.vn.infra:9092,kafka02-vn00c1.vn.infra:9092,kafka03-vn00c1.vn.infra:9092") \
        .option("subscribe", "streaming.itbi.topic_test_02") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("maxRatePerPartition", 10) \
        .load()

    df.printSchema()

    schema = StructType([
        StructField("crime_id", StringType(), True),
        StructField("original_crime_type_name", StringType(), True),
        StructField("report_date", StringType(), True),
        StructField("call_date", StringType(), True),
        StructField("offense_date", StringType(), True),
        StructField("call_time", StringType(), True),
        StructField("call_date_time", StringType(), True),
        StructField("disposition", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("agency_id", StringType(), True),
        StructField("address_type", StringType(), True),
        StructField("common_location", StringType(), True)
    ])

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("DF")) \
        .select("DF.*")

    # TODO select original_crime_type_name and disposition
    # distinct_table = (service_table
    #                   .select("original_crime_type_name", "disposition")
    #                   .distinct()
    #                   )

    query = service_table.writeStream.trigger(
        processingTime="10 seconds").format("console").option(
        "truncate", "false").start()

    query.awaitTermination()
def run_spark_job(spark):
    # Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "service-calls") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS")) \
        .select("SERVICE_CALLS.*")

    distinct_table = service_table \
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    # Get different types of original_crime_type_name in 60 minutes interval
    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    # Write output stream
    query = counts_df \
        .writeStream \
        .outputMode('Complete') \
        .format('console') \
        .start()

    # Attach a ProgressReporter
    query.awaitTermination()
def clean_MES(df):
    df_decoded = df.withColumn('Body', decode(unbase64(df.Body), 'utf-8'))
    return flatten_df(
        flatten_df(
            df_decoded.withColumn(
                'Body',
                from_json(
                    col('Body'),
                    StructType([
                        StructField("dataItemType", StringType(), True),
                        StructField("assetId", StringType(), True),
                        StructField("value", StringType(), True)
                    ]))))
        .drop(col('SystemProperties'))
        .withColumn(
            'SystemProperties_connectionAuthMethod',
            from_json(
                col('SystemProperties_connectionAuthMethod'),
                StructType([
                    StructField("scope", StringType(), True),
                    StructField("type", StringType(), True),
                    StructField("issuer", StringType(), True),
                    StructField("acceptingIpFilterRule", StringType(), True)
                ])))
        .withColumn(
            'Body_Value',
            from_json(
                col('Body_Value'),
                StructType([
                    StructField("eventId", StringType(), True),
                    StructField("assetId", StringType(), True),
                    StructField("telemetryValue", StringType(), True),
                    StructField("description", StringType(), True),
                    StructField("dateTime", StringType(), True),
                    StructField("componentName", StringType(), True),
                    StructField("status", StringType(), True)
                ]))))
def _parse_text(self, vep_transformed_df: f.DataFrame):
    """
    Parses json-formatted VEP output string

    Args:
        vep_transformed_df: output of `self._transform()`

    Returns:
        Spark DataFrame with the schema as defined by `self.output_schema`
    """
    vep_df = (vep_transformed_df.withColumn(
        'data',
        f.from_json('text', self.output_schema)).select(
            f.expr("data.*")).drop("input"))
    return vep_df
def run_spark_job(spark):
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    # (broker, topic, and file-path values below follow the other examples in this collection)
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "service-calls") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("DF")) \
        .select("DF.*")

    # Select original_crime_type_name and disposition
    distinct_table = service_table \
        .select('original_crime_type_name', 'disposition')

    # Count the number of original crime types
    # (disposition is kept in the grouping so the join below can resolve it)
    agg_df = distinct_table \
        .groupBy('original_crime_type_name', 'disposition') \
        .count()

    # Q1. Submit a screen shot of a batch ingestion of the aggregation
    # Write output stream
    query = agg_df \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    # Attach a ProgressReporter
    query.awaitTermination()

    # Radio code lookup table
    radio_code_json_filepath = "./radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code

    # rename disposition_code column to disposition
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    # join on disposition column
    join_query = agg_df.join(radio_code_df, "disposition", "left_outer")

    join_query.awaitTermination()
def main():
    spark = SparkSession \
        .builder \
        .appName("ExtractEventsJob") \
        .enableHiveSupport() \
        .getOrCreate()

    # batch - load raw events
    raw_events = spark \
        .read \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka:29092") \
        .option("subscribe", "events") \
        .option("startingOffsets", "earliest") \
        .option("endingOffsets", "latest") \
        .load()

    # # streaming - load raw events
    # raw_events = spark \
    #     .readStream \
    #     .format("kafka") \
    #     .option("kafka.bootstrap.servers", "kafka:29092") \
    #     .option("subscribe", "events") \
    #     .load()

    # filter for purchase_sword events
    purchase_events = raw_events \
        .filter(is_purchase(raw_events.value.cast('string'))) \
        .select(raw_events.value.cast('string').alias('raw_event'),
                raw_events.timestamp.cast('string'),
                from_json(raw_events.value.cast('string'),
                          purchase_sword_event_schema()).alias('json')) \
        .select('raw_event', 'timestamp', 'json.*')

    # register extracted_purchase_events to temp table
    purchase_events.registerTempTable("sword_purchase_events")

    # write to parquet file
    spark.sql("""
        create external table sword_purchase_events
        stored as parquet
        location '/tmp/sword_purchase_events'
        as
        select * from sword_purchase_events
    """)

    purchase_events.write.mode('overwrite').parquet('/tmp/sword_purchase_events')
def transform(self) -> DataFrame:
    with_recommendations = (
        self.dataFrame.where(~F.col("top_candidates").isNull())
        .withColumn(
            "data",
            F.explode(
                F.from_json("top_candidates", RawDataset.top_candidates_schema)
            ),
        )
        .select("*", "data.image", "data.rating", "data.note")
        .withColumnRenamed("wiki_db", "wiki")
        .withColumnRenamed("image", "image_id")
        .withColumn("confidence_rating", self.confidence_rating)
        .withColumn("source", self.source)
        .withColumn("found_on", self.found_on)
        .select(
            "wiki",
            "page_id",
            "page_title",
            "image_id",
            "confidence_rating",
            "source",
            "instance_of",
            "found_on",
        )
    )
    without_recommendations = (
        self.dataFrame.where(F.col("top_candidates").isNull())
        .withColumnRenamed("wiki_db", "wiki")
        .withColumn("image_id", F.lit(None))
        .withColumn("confidence_rating", F.lit(None))
        .withColumn("source", F.lit(None))
        .withColumn("found_on", F.lit(None))
        .select(
            "wiki",
            "page_id",
            "page_title",
            "image_id",
            "confidence_rating",
            "source",
            "instance_of",
            "found_on",
        )
    )
    return with_recommendations.union(without_recommendations) \
        .withColumn("instance_of", self.instance_of) \
        .withColumn("is_article_page", self.is_article_page)
def main():
    spark = SparkSession \
        .builder \
        .appName("Twitchatter") \
        .getOrCreate()

    # Create DataFrame representing the stream of input lines from connection to localhost:9999
    # Subscribe to 1 topic
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", config.ip_address) \
        .option("subscribe", config.topic) \
        .option("startingOffsets", "earliest") \
        .load()

    # Kafka streams from source are as "key":"value"..etc.
    df.printSchema()

    # Select key:value and discard others
    # value schema: {"username":"******","message":"xxx","channel":"xxx","time":"xxx"}
    schema = StructType().add("username", StringType()).add(
        "message", StringType()).add("channel", StringType()).add("time", StringType())

    ds = df.selectExpr("CAST(value AS STRING)") \
        .select(f.from_json("value", schema).alias("message")) \
        .select("message.*")
    ds.printSchema()

    # uncomment to see data flowing in
    query = ds.writeStream.outputMode("append").format("console").start()
    query.awaitTermination()

    # write is not available for streaming data; we use create_table to create table and keyspaces
    # ds.write.format("org.apache.spark.sql.cassandra").options(table="rawtable", keyspace="test").save(mode="append")

    # again structured stream does not provide savetocassandra functionality
    # ds.saveToCassandra("test","rawtable")

    # dump data to parquet files
    # query = ds.writeStream \
    #     .format("parquet") \
    #     .option("startingOffsets", "earliest") \
    #     .option("checkpointLocation", "/home/ubuntu/twitchatter/test/check/") \
    #     .option("path", "s3a://mypqrquet/") \
    #     .start()
    # query.awaitTermination()

    # Do some simple count: unique user count per channel
    user_count = ds.groupBy("channel", "username").count()
def stream(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "TweetsTopic") \
        .load() \
        .select(from_json(col("value").cast("String"), SCHEMA).alias("data"), col("timestamp")) \
        .select("data.*", "timestamp") \
        .select("text", "tweet_time", "source", "timestamp") \
        .writeStream \
        .outputMode("update") \
        .trigger(processingTime='2 seconds') \
        .foreachBatch(save_to_hbase_mysql) \
        .start()
    return df
def _parse_device_product_associations_payload(
        cls, df, payload_column_name='payload'):
    """
    Parses the JSON payload column containing the product association information.

    :param DataFrame df: The input DataFrame
    :param str payload_column_name: (Optional) The name of the JSON payload column - defaults to `payload`
    :rtype DataFrame
    """
    payload_schema = cls._get_device_product_association_payload_schema()

    return df \
        .withColumn(payload_column_name,
                    from_json(payload_column_name, schema=payload_schema)) \
        .select('*', '{}.*'.format(payload_column_name)) \
        .drop(payload_column_name)
def flattenDf(df):
    ht_schema = StructType([
        StructField('text', StringType(), True),
        StructField('indices', StringType(), True)
    ])

    flatDf = df \
        .withColumn("ht", f.from_json("hashtags", ht_schema)) \
        .withColumn("twtdate", f.from_unixtime(f.col("twttimestamp").substr(1, 10), format='yyyy-MM-dd')) \
        .drop("hashtags") \
        .select(f.col("twtid"),
                f.col("twtdate"),
                f.col("twtlang"),
                f.col("twtsource"),
                f.col("ht.text").alias("twthashtag"))
    return flatDf
def run_spark_job(spark):
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = (spark.readStream
          .format("kafka")                                      # set data ingestion format as Kafka
          .option("subscribe", "service-calls")                 # This is required although it says option.
          .option("kafka.bootstrap.servers", "localhost:9092")  # You will also need the url and port of the bootstrap server
          .option("startingOffsets", "earliest")
          .option("maxOffsetsPerTrigger", 200)
          .load())

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value as STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS")) \
        .select("SERVICE_CALLS.*")

    distinct_table = service_table \
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    # Get different types of original_crime_type_name in 60 minutes interval
    # (filled in following the completed examples in this collection)
    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(psf.window(distinct_table.call_datetime, "60 minutes"),
                 distinct_table.original_crime_type_name) \
        .count()

    # Use udf to convert timestamp to right format on a call_date_time column
    converted_df = distinct_table.withColumn("DateColconv",
                                             udf_convert_time('call_date_time'))

    # Apply aggregations using windows function to see how many calls occurred in 2 day span
    calls_per_2_days = distinct_table \
        .groupBy(psf.window(distinct_table.call_datetime, "2 days")) \
        .count()

    # Write output stream
    query = calls_per_2_days \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    # Attach a ProgressReporter
    query.awaitTermination()
def run_spark_job(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "org.sf.police.service-calls") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 200) \
        .option("stopGracefullyOnShutdown", "true") \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("DF")) \
        .select("DF.*")

    distinct_table = service_table \
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.col('disposition')).distinct()

    agg_df = distinct_table \
        .groupBy(distinct_table.original_crime_type_name).count()

    query = agg_df \
        .writeStream \
        .trigger(processingTime="10 seconds") \
        .outputMode('complete') \
        .format('console') \
        .start()

    query.awaitTermination()

    radio_code_json_filepath = "./radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    join_query = agg_df.join(radio_code_df,
                             agg_df.disposition == radio_code_df.disposition,
                             "left_outer")

    join_query.awaitTermination()
def startSpark(self):
    spark = SparkSession \
        .builder \
        .appName("twitter") \
        .master("local[*]") \
        .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4') \
        .getOrCreate()

    hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
    hadoop_conf.set("fs.s3n.awsAccessKeyId", self.aws_key)
    hadoop_conf.set("fs.s3n.awsSecretAccessKey", self.aws_secret)

    conf = spark.sparkContext._conf.setAll(
        [('spark.delta.logStore.class', 'org.apache.spark.sql.delta.storage.S3SingleDriverLogStore')])
    spark.sparkContext._conf.getAll()

    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "demo-3-twitter") \
        .option("startingOffsets", "latest") \
        .load()

    onlyValue = df.selectExpr("CAST(value AS STRING)").writeStream.format("memory").queryName("onlyValue").start()
    onlyValue.awaitTermination(timeout=10)

    sample = spark.table("onlyValue")
    kafka_schema = json.loads(sample.head().asDict()["value"])["schema"]
    spark_schema = self.transform_schema_kafka_to_spark(kafka_schema)
    schema = StructType().add("payload", spark_schema).add("schema", StructType())

    data = df.select(fn.from_json(df["value"].cast("string"), schema).alias("value")).alias("data")
    data.printSchema()

    hashtags = data.select(fn.explode("data.value.payload.entities.hashtags").alias("hashtag"),
                           fn.col("data.value.payload.coordinates").alias("coordinates"),
                           fn.col("data.value.payload.user.location").alias("location_name"),
                           fn.col("data.value.payload.text").alias("text"),
                           fn.to_timestamp("data.value.payload.created_at").alias("created_at")) \
        .select(fn.lower(fn.col("hashtag.text")).alias("hashtag"), "created_at")

    hashtagCount = hashtags \
        .groupBy(fn.window(hashtags["created_at"], "10 minutes", "5 minutes"), "hashtag") \
        .count() \
        .orderBy(["window", "count"], ascending=[False, False])

    query = hashtagCount.writeStream \
        .outputMode("append") \
        .format("delta") \
        .trigger(processingTime="300 seconds") \
        .option('checkpointLocation', checkpoint_location) \
        .start()

    sleep(600)
    query.stop()
def load(self, name, path, infer_schema=False):
    if name in ['ml_features_train', 'ml_features_test']:
        data = self.spark.read.parquet(path)
    else:
        schema = self.schema[name] if not infer_schema else None
        data = self.spark.read.csv(path, schema=schema, escape='"', quote='"',
                                   inferSchema=infer_schema, header=True)
    if name == 'product':
        for column, column_schema in self.metadata_schema.items():
            if column in data.columns:
                data = data.withColumn(
                    column, F.from_json(F.col(column), column_schema))
    return data
def main(spark: pyspark.sql.SparkSession):
    try:
        sdf = spark.readStream.format('kafka') \
            .option('kafka.bootstrap.servers', 'localhost:9092') \
            .option('subscribe', 'lcr-events') \
            .option("startingOffsets", "earliest") \
            .load()

        sdf.select(from_json(col('value').cast("string"), schema=LCR_RESULT_SCHEMA).alias('data')) \
            .select('data.*') \
            .writeStream \
            .format('console') \
            .start()

        spark.streams.awaitAnyTermination()
    finally:
        spark.stop()
def __init__(self, topic, spark_ip="local[2]", kafka_ip="localhost:9092"):
    self.topic = topic
    self.kafka_ip = kafka_ip
    sc = SparkContext(spark_ip, appName="WikiStream")
    self.spark = pyspark.SQLContext(sc)

    self.df = self.spark \
        .read \
        .format("kafka") \
        .option("kafka.bootstrap.servers", self.kafka_ip) \
        .option("subscribe", self.topic) \
        .option("startingOffsets", "earliest") \
        .load()

    self.events = self.df.withColumn("value", F.col("value").cast(StringType())) \
        .withColumn("value", F.from_json("value", MAIN_SCHEMA)) \
        .select("value.data.*") \
        .withColumn("time", F.col("meta.dt").cast("timestamp"))
def test1(spark):
    """
    This demonstrates reading large images from Pravega and detecting defects.
    The data field contains a base-64 encoded PNG image file.
    It uses chunked encoding to support events of 2 GiB.
    This runs out of memory because the non-Pandas runner uses fixed batches of 100.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'
    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    df = (spark.readStream.format("pravega").option(
        "controller", controller).option("scope", scope).option(
        "stream", "video").option("encoding", "chunked_v1").load())
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*', from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    # df = df.withWatermark('timestamp', '60 second')

    @udf(returnType=DoubleType())
    def defect_probability(data):
        """Calculate the probability of a defect."""
        # Decode the image.
        rgb = cv2.imdecode(np.array(data), -1)
        # Perform a computation on the image to determine the probability of a defect.
        # For now, we just calculate the mean pixel value.
        # We can use any Python library, including NumPy and TensorFlow.
        p = rgb.mean() / 255.0
        return float(p)

    df = df.select('*', defect_probability('data').alias('defect_probability'))
    df = df.drop('raw_event', 'event_string', 'event', 'data')
    df.printSchema()

    if True:
        (df.writeStream.trigger(processingTime='3 seconds')  # limit trigger rate
            .outputMode('append').format('console').option(
            'truncate', 'false').start().awaitTermination())
def run_spark_job(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "com.udacity.streams.purchases9999") \
        .option("startingOffsets", "earliest") \
        .option("maxOffsetsPerTrigger", 10) \
        .option("stopGracefullyOnShutdown", "true") \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    """
    Once received from Kafka, the source will have the following schema:
    key[binary]
    value[binary]
    topic[string]
    partition[int]
    offset[long]
    timestamp[long]
    timestampType[int]
    """
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    # we can run this if we don't care about schema and representation
    # query = kafka_df.writeStream.outputMode("append").format("console").start()
    # query.awaitTermination()

    # build a schema
    jsonSchema = StructType([
        StructField("username", StringType(), True),
        StructField("currency", StringType(), True),
        StructField("amount", IntegerType(), True)
    ])

    # unpack columns from the json "value"
    json_df = kafka_df \
        .select(psf.from_json(psf.col("value"), jsonSchema).alias("JSON_Topic")) \
        .select("JSON_Topic.*")

    query = json_df.writeStream.outputMode("append").format("console").start()

    query.awaitTermination()
def get_read_stream(spark):
    args = parser.parse_args()
    PERSON_EVENTS = get_schema()

    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_url) \
        .option("subscribe", args.topic) \
        .option("startingOffsets", "earliest") \
        .load() \
        .selectExpr("CAST(value AS STRING) as json") \
        .select(from_json(col("json").cast("string"), PERSON_EVENTS).alias("parsed_value")) \
        .select("parsed_value.*")

    df = normalize_fields(df)
    # df.printSchema()
    return df
def get_layer_skus_df(product_lists_df):
    """
    Take all the product lists defining the bundles, and get all unique oms_ids
    from those lists with the associated product type of each.
    :param product_lists_df:
    :return: df with two columns: (oms_id, product_type)
    """
    return product_lists_df.select(
        F.explode(F.from_json('product_list_json', ArrayType(
            StructType([
                StructField("id", StringType()),
                StructField("productType", StringType())
            ])
        ))).alias('product_object')) \
        .select(F.col('product_object.id').alias('oms_id'),
                F.col('product_object.productType').alias('productType')) \
        .drop_duplicates()
def job(self) -> None:
    # define transformations for the input data stream and write structured data in parquet format
    query = self.stream \
        .select(F.col('value').cast('string').alias('json')) \
        .select(F.from_json('json', schema).alias('data')) \
        .select('data.*') \
        .withColumn('event_date', F.to_date('event_time')) \
        .writeStream \
        .trigger(processingTime='1 minute') \
        .outputMode("append") \
        .partitionBy('event_date') \
        .option("checkpointLocation", self.checkpoint_dir) \
        .option("startingOffsets", "earliest") \
        .option("truncate", "false") \
        .format("parquet") \
        .option("path", self.target_dir) \
        .start()

    query.awaitTermination()
def run_spark_job(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "service-calls") \
        .option("maxOffsetsPerTrigger", "200") \
        .option("startingOffsets", "earliest") \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS")) \
        .select("SERVICE_CALLS.*")

    distinct_table = service_table \
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    counts_df = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    query = counts_df \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    query.awaitTermination()

    countDataFrame = distinct_table \
        .withWatermark("call_datetime", "60 minutes") \
        .groupBy(
            psf.window(distinct_table.call_datetime, "10 minutes", "5 minutes"),
            distinct_table.original_crime_type_name
        ).count()

    query = countDataFrame \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .start()

    query.awaitTermination()
def main():
    # Read from kafka
    spark = SparkSession.builder.appName("Driver GPS Streaming").getOrCreate()

    # log4j
    # log4jLogger = spark._jvm.org.apache.log4j.Logger
    # logger = log4jLogger.getLogger(__name__)
    log4jLogger = spark._jvm.org.apache.log4j
    logger = log4jLogger.LogManager.getLogger("spark_app_logs")
    # logger = log4jLogger.LogManager.getRootLogger("catfish_logs")
    logger.setLevel(log4jLogger.Level.DEBUG)
    logger.info("pyspark script logger initialized")

    sdf_locations = (spark.readStream.format("kafka").option(
        "kafka.bootstrap.servers", brokerAddresses).option(
        "subscribe", TOPIC_NAME).option(
        "startingOffsets", "earliest").load().selectExpr("CAST(value AS STRING)"))

    sdf_locations_schema = StructType([
        StructField("driverId", StringType(), True),
        StructField("lat", StringType(), True),
        StructField("lng", StringType(), True),
        StructField("time", StringType(), True),
    ])

    sdf_locations_data = sdf_locations.select(
        from_json("value", sdf_locations_schema).alias("a")).select("a.*")

    sdf_locations_data = apply_location_requirement(sdf_locations_data)
    sdf_locations_data.printSchema()
    # sdf_locations_data.writeStream.format("console").start().awaitTermination()

    sdf_locations_data.writeStream.outputMode("append").foreachBatch(
        partial(postgres_sink, table_name="locations")).trigger(
        processingTime="15 minute").start()

    spark.streams.awaitAnyTermination()
def run_spark_job(spark):
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", BOOTSTRAP_SERVER) \
        .option("subscribe", TOPIC_NAME) \
        .option("maxOffsetsPerTrigger", 200) \
        .option("maxRatePerPartition", 100) \
        .option("startingOffsets", "earliest") \
        .load()

    df.printSchema()

    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("DF")) \
        .select("DF.*")

    distinct_table = service_table.select(
        "call_date_time",
        "disposition",
        "original_crime_type_name"
    )

    watermarked_distinct_table = distinct_table.withWatermark("call_date_time", "20 minutes")

    agg_df = watermarked_distinct_table.groupBy(
        psf.window("call_date_time", "20 minutes"),
        "original_crime_type_name"
    ).count().orderBy("count")

    query = agg_df.writeStream.outputMode("complete").format("console").start()

    query.awaitTermination()

    radio_code_json_filepath = "./radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition")

    join_query = agg_df.join(radio_code_df, "disposition")

    join_query.awaitTermination()
def parseJSONCols(spark, df, sanitize=True, *cols):
    """Auto infer the schema of a json column and parse into a struct.

    https://stackoverflow.com/questions/41107835/pyspark-parse-a-column-of-json-strings

    rdd-based schema inference works if you have well-formatted JSON,
    like ``{"key": "value", ...}``, but breaks if your 'JSON' is just a
    string (``"data"``) or is an array (``[1, 2, 3]``). In those cases you
    can fix everything by wrapping the data in another JSON object
    (``{"key": [1, 2, 3]}``). The ``sanitize`` option (default True)
    automatically performs the wrapping and unwrapping.

    The schema inference is based on this
    `SO Post <https://stackoverflow.com/a/45880574/>`_.

    Parameters
    ----------
    df : pyspark dataframe
        Dataframe containing the JSON cols.
    *cols : string(s)
        Names of the columns containing JSON.
    sanitize : boolean
        Flag indicating whether you'd like to sanitize your records
        by wrapping and unwrapping them in another JSON object layer.

    Returns
    -------
    pyspark dataframe
        A dataframe with the decoded columns.
    """
    res = df
    for i in cols:
        # sanitize if requested.
        if sanitize:
            res = (res.withColumn(
                i,
                psf.concat(psf.lit('{"data": '), i, psf.lit('}'))))
        # infer schema and apply it
        schema = spark.read.json(res.rdd.map(lambda x: x[i])).schema
        res = res.withColumn(i, psf.from_json(psf.col(i), schema))

        # unpack the wrapped object if needed
        if sanitize:
            res = res.withColumn(i, psf.col(i).data)
    return res
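# A minimal usage sketch for parseJSONCols, not part of the original snippet.
# The DataFrame and the 'payload' column name below are assumptions for illustration;
# note that `sanitize` is passed positionally because it precedes *cols in the signature.
#
#   raw_df = spark.createDataFrame([('{"a": 1}',), ('[1, 2, 3]',)], ['payload'])
#   parsed = parseJSONCols(spark, raw_df, True, 'payload')
#   parsed.printSchema()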
def run_spark_job(spark):
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark.readStream.format("kafka").option(
        "kafka.bootstrap.servers", "localhost:9092").option("subscribe", "service-calls").option(
        "maxOffsetsPerTrigger", 200).option("startingOffsets", "earliest").load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # Extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df \
        .select(psf.from_json(psf.col('value'), schema).alias("SERVICE_CALLS")) \
        .select("SERVICE_CALLS.*")

    distinct_table = service_table \
        .select(psf.col('crime_id'),
                psf.col('original_crime_type_name'),
                psf.to_timestamp(psf.col('call_date_time')).alias('call_datetime'),
                psf.col('address'),
                psf.col('disposition'))

    # Count the different original_crime_type_name values within a 60-minute watermark
    counts_df = distinct_table.withWatermark(
        "call_datetime", "60 minutes").groupBy('original_crime_type_name').count()

    converted_df = distinct_table.select(
        psf.col('crime_id'),
        psf.col('original_crime_type_name'),
        udf_convert_time(psf.col('call_datetime')))

    # How many calls occurred over a 2-day window
    calls_per_2_days = distinct_table.groupBy(
        psf.window(distinct_table.call_datetime, "2 days")).count()

    query = converted_df.writeStream.outputMode("append").format(
        "console").start()

    query.awaitTermination()
def consume_transactions(input_df, checkpoint_path):
    # The Below Section is for Flight Transactions transformation and loading
    new_df = input_df.withColumn('new_val', F.regexp_replace(input_df['value'], '\\\\', '')).drop('value')
    new_df = new_df.withColumn('value', F.regexp_replace(new_df['new_val'], '""', "'")).drop('new_val')
    new_df = new_df.withColumn('new_val', F.regexp_replace(new_df['value'], '}n', "}")).drop('value')
    new_df = new_df.withColumn('new_val', F.regexp_replace(new_df['new_val'], "'", ""))
    # new_df = new_df.withColumn('struct_val', F.struct(new_df['new_val'])).drop('new_val')

    transaction_schema = StructType((
        StructField("DestinationAirportCode", StringType()),
        StructField("Itinerary", StringType()),
        StructField("OneWayOrReturn", StringType()),
        StructField("OriginAirportCode", StringType()),
        StructField("Segment", ArrayType(StructType([
            StructField("ArrivalAirportCode", StringType()),
            StructField("DepartureAirportCode", StringType()),
            StructField("LegNumber", StringType()),
            StructField("NumberOfPassengers", StringType()),
            StructField("SegmentNumber", StringType())
        ]))),
        StructField("TransactionDateUTC", StringType()),
        StructField("UniqueId", StringType())))

    json_df = new_df.select(F.from_json(F.col("new_val"), transaction_schema).alias("value")).selectExpr('value.*')

    json_df = json_df.withColumn('Segment', F.explode_outer(json_df['Segment']))

    json_df = json_df.withColumn('DepartureAirportCode', F.col('Segment')['DepartureAirportCode']) \
        .withColumn('ArrivalAirportCode', F.col('Segment')['ArrivalAirportCode']) \
        .withColumn('LegNumber', F.col('Segment')['LegNumber']) \
        .withColumn('NumberOfPassengers', F.col('Segment')['NumberOfPassengers']) \
        .withColumn('SegmentNumber', F.col('Segment')['SegmentNumber']) \
        .drop('Segment')

    # Stream the data, from a Kafka topic to a Spark in-memory table
    query = json_df \
        .writeStream \
        .format("memory") \
        .queryName("TransactionTable") \
        .outputMode("append") \
        .option("checkpoint", checkpoint_path) \
        .start()

    query.awaitTermination(5)
    # Let it Fill up the table
    sleep(10)
def Analytics1():
    while (1):
        global Flags
        if Flags == True:
            spark = SparkSession \
                .builder \
                .config("spark.jars",
                        "spark-streaming-kafka-0-10_2.12-3.0.0.jar,spark-sql-kafka-0-10_2.12-3.0.0.jar,kafka-clients-2.5.0.jar,commons-pool2-2.8.0.jar,spark-token-provider-kafka-0-10_2.12-3.0.0.jar") \
                .appName("Q1") \
                .getOrCreate()

            data_spark_schema = StructType([
                StructField("classification", StringType(), True),
                StructField("status", StringType(), True),
                StructField("initial_firm_notification", StringType(), True)
            ])

            streamingInputDF = spark \
                .readStream \
                .format("kafka") \
                .option("kafka.bootstrap.servers", "192.168.56.1:9092") \
                .option("subscribe", "devices") \
                .load()

            stream_records = streamingInputDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING) as devices") \
                .select(psf.from_json("devices", data_spark_schema).alias("devices"))
            stream_records.printSchema()
            stream_records = stream_records.select("devices.*").drop("devices")

            z = stream_records.writeStream.outputMode("append").format(
                "memory").queryName("view").trigger(
                processingTime='2 seconds').start()

            while (z.isActive):
                t.sleep(15)
                x = spark.sql(
                    """select classification, count(classification) from view group by classification"""
                )
                x = x.toPandas()
                x.plot(x='classification', y='count(classification)', kind='bar')
                plt.savefig(
                    'C:\\Users\\balla\\PycharmProjects\\ORIM1\\untitled\\static\\output.png'
                )
                break
from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")))


# COMMAND ----------

from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType((
  StructField("InvoiceNo", StringType(), True),
  StructField("Description", StringType(), True)))
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")).alias("newJSON"))\
  .select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)


# COMMAND ----------

udfExampleDF = spark.range(5).toDF("num")
def power3(double_value):
  return double_value ** 3
power3(2.0)


# COMMAND ----------

from pyspark.sql.functions import udf
power3udf = udf(power3)
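# A minimal usage sketch (not part of the excerpt above): applying the
# power3udf just registered to the udfExampleDF defined earlier.
# power3udf returns a string-typed column here because no return type was declared.
#
#   udfExampleDF.select(power3udf(col("num"))).show(2)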
# COMMAND ----------

from pyspark.sql.types import *

trxSchema = StructType().add("transaction", StructType().add("rewardsTransactionId", StringType()).add("sourceTransactionId", StringType()).add("transactionNumber", StringType()).add("voidedTransactionId", StringType()).add("userId", StringType()).add("membershipId", StringType()).add("memberId", StringType()).add("membershipLevel", StringType()).add("loyaltyPartner", StringType()).add("programId", StringType()).add("programName", StringType()).add("transactionStatus", StringType()).add("sourceTransactionType", StringType()).add("rewardsTransactionType", StringType()).add("rewardsTransactionSubType", StringType()).add("merchantId", StringType()).add("storeNumber", StringType()).add("cardNumber", StringType()).add("checkNumber", StringType()).add("receiptNumber", StringType()).add("terminalLocalDateTime", StringType()).add("terminalNumber", StringType()).add("redemptionItemRewardCode", StringType()).add("redemptionItemProuductSKU", StringType()).add("channelType", StringType()).add("subChannelType", StringType()).add("brandCode", StringType()).add("countryCode", StringType()).add("exchangeRate", StringType()).add("transactionSource", StringType()).add("channelSource", StringType()).add("description", StringType()).add("rewardsActivityId", StringType()).add("rewardsActivityName", StringType()).add("ticketTotalAmount", StringType()).add("ticketAccruableAmount", IntegerType()).add("cardTotalAmount", StringType()).add("cardAccruableAmount", StringType()).add("localCurrencyCode", StringType()).add("baseCurrencyCode", StringType()).add("transactionDate", StringType()).add("transactionProcessedDate", StringType()).add("utcOffset", StringType()).add("createdDate", StringType()).add("modifiedDate", StringType()).add("correlationId", StringType()).add("eventId", StringType()))

accrualSchema = StructType().add("accrual", StructType().add("rewardsAccrualId", StringType()).add("rewardsTransactionId", StringType()).add("sourceTransactionId", StringType()).add("transactionNumber", StringType()).add("userId", StringType()).add("membershipId", StringType()).add("memberId", StringType()).add("membershipLevel", StringType()).add("programId", StringType()).add("programName", StringType()).add("promotionId", StringType()).add("promotionName", StringType()).add("productSKU", StringType()).add("rewardsActivityDescription", StringType()).add("rewardsTransactionSubType", StringType()).add("pointSubType", StringType()).add("pointsAccrued", StringType()).add("pointsUsed", StringType()).add("pointTypeId", StringType()).add("pointType", StringType()).add("pointExpirationDate", StringType()).add("createdDate", StringType()).add("modifiedDate", StringType()).add("correlationId", StringType()).add("eventId", StringType()))

redemptionSchema = StructType().add("redemption", StructType().add("rewardsRedemptionId", StringType()).add("rewardsTransactionId", StringType()).add("originalTransactionId", StringType()).add("userId", StringType()).add("membershipId", StringType()).add("memberId", StringType()).add("programId", StringType()).add("programName", StringType()).add("rewardsActivityDescription", StringType()).add("typeCode", StringType()).add("pointTypeId", StringType()).add("pointType", StringType()).add("pointsRedeemed", StringType()).add("createdDate", StringType()).add("modifiedDate", StringType()).add("rewardsAccrualId", StringType()).add("accrualSourceTransactionId", StringType()).add("accrualTransactionNumber", StringType()).add("accrualMerchantId", StringType()).add("accrualStoreNumber", StringType()).add("accrualBrandCode", StringType()).add("accrualTransactionSource", StringType()).add("accrualChannelSource", StringType()).add("accrualRewardsActivityName", StringType()).add("accrualRewardsActivityDescription", StringType()).add("accrualCreatedDate", StringType()).add("accrualExpirationDate", StringType()).add("correlationId", StringType()).add("eventId", StringType()))

memberBalancesSchema = StructType().add("membershipPointBalance", StructType().add("membershipId", StringType()).add("memberId", StringType()).add("userId", StringType()).add("programName", StringType()).add("currentRewardPoints", StringType()).add("currentLevelPoints", StringType()).add("lifetimeRewardPoints", StringType()).add("lifetimeLevelPoints", StringType()).add("lifetimeRedeemedPoints", StringType()).add("modifiedDate", StringType()).add("lastAccrualTransactionDate", StringType()).add("lastRedemptionTransactionDate", StringType()).add("lastTransactionDate", StringType()).add("createdDate", StringType()).add("correlationId", StringType()).add("eventId", StringType()))

# COMMAND ----------

from pyspark.sql.functions import from_json

trxDf = validDecryptedTrxDf.select(validDecryptedTrxDf.EnqueuedDateTimeUTC, from_json(validDecryptedTrxDf.Body, trxSchema).alias("jsonBody"))
accDf = validDecryptedAccDf.select(validDecryptedAccDf.EnqueuedDateTimeUTC, from_json(validDecryptedAccDf.Body, accrualSchema).alias("jsonBody"))
redDf = validDecryptedRedDf.select(validDecryptedRedDf.EnqueuedDateTimeUTC, from_json(validDecryptedRedDf.Body, redemptionSchema).alias("jsonBody"))
memBalDf = validDecryptedMemBalDf.select(validDecryptedMemBalDf.EnqueuedDateTimeUTC, from_json(validDecryptedMemBalDf.Body, memberBalancesSchema).alias("jsonBody"))

# COMMAND ----------

# TAKES 15 MINUTES TO WRITE --> Do we need this???
# trxDf.write.mode("Overwrite").parquet('/mnt/transactions/raw_data/' + now_pt_str_hhmmss)
# redDf.write.mode("Overwrite").parquet('/mnt/redemptions/raw_data/' + now_pt_str_hhmmss)
# accDf.write.mode("Overwrite").parquet('/mnt/accruals/raw_data/' + now_pt_str_hhmmss)
#
## which can later be accessed using the below
# from pyspark.sql import SQLContext
# sqlContext = SQLContext(sc)
## Read in data by pointing to its table name in Glue Data Catalog
schema = StructType() \
    .add('source', StringType()) \
    .add('type', StringType()) \
    .add('data', StringType()) \
    .add('ts', StringType())

src = spark.read.load('s3://demoanalyticsapp-output/*/*/*/*/',
                      format='parquet',
                      schema=schema)

## Build out some new columns with data extracted from the JSON string
df = src \
    .withColumn('n_ts', F.unix_timestamp('ts', "yyyy-MM-dd'T'hh:mm:ss").cast('timestamp')) \
    .withColumn('agerange', \
        F.from_json( \
            F.get_json_object('data', '$.facedetails[*].agerange'), \
            StructType().add('high', IntegerType()).add('low', IntegerType()) \
        ) \
    ) \
    .withColumn('smile', \
        F.from_json( \
            F.get_json_object('data', '$.facedetails[*].smile'), \
            StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
        ) \
    ) \
    .withColumn('eyeglasses', \
        F.from_json( \
            F.get_json_object('data', '$.facedetails[*].eyeglasses'), \
            StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
        ) \
    ) \
    .withColumn('sunglasses', \