def test_aggregate(spark, root, args):
    raw = spark.read.json(str(root / "server_a" / "raw"))
    internal = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify2")
    )
    external = spark.read.json(
        str(root / "server_a" / "intermediate" / "external" / "verify2")
    )
    aggregates = (
        raw.select("id", F.unbase64("payload").alias("shares"))
        .join(internal.select("id", F.unbase64("payload").alias("internal")), on="id")
        .join(external.select("id", F.unbase64("payload").alias("external")), on="id")
        .groupBy()
        .applyInPandas(
            lambda pdf: udf.aggregate(
                args.batch_id,
                args.n_data,
                args.server_id,
                args.private_key_hex,
                args.shared_secret,
                args.public_key_hex_internal,
                args.public_key_hex_external,
                pdf,
            ),
            schema="payload binary, error int, total int",
        )
    )
    aggregates.show()
    rows = aggregates.collect()
    assert len(rows) == 1
    assert rows[0].total == 5
    assert rows[0].error == 0
def test_total_share(spark, root, args):
    raw = spark.read.json(str(root / "server_a" / "raw"))
    internal = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify2")
    )
    external = spark.read.json(
        str(root / "server_a" / "intermediate" / "external" / "verify2")
    )
    aggregates = (
        raw.select("id", F.unbase64("payload").alias("shares"))
        .join(internal.select("id", F.unbase64("payload").alias("internal")), on="id")
        .join(external.select("id", F.unbase64("payload").alias("external")), on="id")
        .repartition(2)
        .withColumn("pid", F.spark_partition_id())
        .groupBy("pid")
        .applyInPandas(
            lambda pdf: udf.aggregate(
                args.batch_id,
                args.n_data,
                args.server_id,
                args.private_key_hex,
                args.shared_secret,
                args.public_key_hex_internal,
                args.public_key_hex_external,
                pdf,
            ),
            schema="payload binary, error int, total int",
        )
    )
    aggregates.show()
    rows = aggregates.collect()
    assert len(rows) == 2
    assert {2, 3} == set(r.total for r in rows)
    assert all(r.error == 0 for r in rows)

    total_share = aggregates.groupBy().applyInPandas(
        lambda pdf: udf.total_share(
            args.batch_id,
            args.n_data,
            args.server_id,
            args.private_key_hex,
            args.shared_secret,
            args.public_key_hex_internal,
            args.public_key_hex_external,
            pdf,
        ),
        schema="payload binary, error int, total int",
    )
    total_share.show()
    rows = total_share.collect()
    assert len(rows) == 1
    assert len(rows[0].payload) > 0
    assert rows[0].total == 5
    assert rows[0].error == 0
def top_k_businesses(spark: SparkSession, k: int = 3, ascending: bool = False):
    df = spark.read.csv(l_file[1], header=True, sep="\t")

    # decode the base64-encoded review text
    df = df.withColumn("review_text", unbase64(df["review_text"]).cast("string"))

    # tokenize the review text
    tokenizer = Tokenizer(inputCol="review_text", outputCol="temp")
    df = tokenizer.transform(df)
    df = df.drop("review_text")

    # remove stopwords
    stopwords = get_stopword_list()
    remover = StopWordsRemover(inputCol="temp", outputCol="words", stopWords=stopwords)
    df = remover.transform(df)
    df = df.drop("temp")
    df = df.select(df.business_id, explode(df.words))

    # strip punctuation from each word
    commaRep = udf(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    df = df.withColumn("col", commaRep(df["col"]))

    # map each word to its AFINN sentiment score
    afinn = get_sentiment_dict()
    sentimentRep = udf(lambda x: afinn.get(x, 0))
    df = df.withColumn("col", sentimentRep(df["col"]))

    # sum the sentiment per business
    df = df.groupBy("business_id")
    result = df.agg({'col': 'sum'}).orderBy("sum(col)", ascending=ascending).take(k)
    return result
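# A minimal usage sketch for top_k_businesses. It assumes an active SparkSession
# named `spark` and that `l_file`, get_stopword_list(), and get_sentiment_dict()
# are defined elsewhere in this module (they are not shown here); the printed
# column name "sum(col)" comes from the agg({'col': 'sum'}) call above.
if __name__ == "__main__":
    top_positive = top_k_businesses(spark, k=3, ascending=False)
    for row in top_positive:
        print(row["business_id"], row["sum(col)"])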
def verify1(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    output,
):
    """Decode a batch of shares"""
    click.echo("Running verify1")
    spark = spark_session()
    df = spark.read.json(input).select(
        "id",
        F.pandas_udf(
            partial(
                udf.verify1,
                batch_id,
                n_data,
                server_id,
                private_key_hex,
                b64decode(shared_secret),
                public_key_hex_internal,
                public_key_hex_external,
            ),
            returnType="binary",
        )(F.unbase64("payload")).alias("payload"),
    )
    valid = df.where("payload is not null")
    # NOTE: the invalid set could be written out as well, but that may require
    # the decoding work to be done twice
    valid.write.json(output, mode="overwrite")
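# A hedged sketch (not part of the original command) of how the NOTE above could
# be handled: cache the decoded frame so writing both the valid and the invalid
# subsets does not run the pandas UDF twice. The helper name and the "-invalid"
# output suffix are illustrative assumptions.
def _write_valid_and_invalid(df, output):
    df = df.cache()
    df.where("payload is not null").write.json(output, mode="overwrite")
    df.where("payload is null").write.json(output + "-invalid", mode="overwrite")
    df.unpersist()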
def test_verify1_bad_data(spark, root, args):
    # we can use server b's data
    a = spark.read.json(str(root / "server_a" / "raw"))
    b = spark.read.json(str(root / "server_b" / "raw"))
    df = spark.createDataFrame(a.union(b).collect() + [Row(id=None, payload=None)])
    actual = df.select(
        "id",
        F.pandas_udf(
            partial(
                udf.verify1,
                args.batch_id,
                args.n_data,
                args.server_id,
                args.private_key_hex,
                args.shared_secret,
                args.public_key_hex_internal,
                args.public_key_hex_external,
            ),
            returnType="binary",
        )(F.unbase64("payload")).alias("expected_payload"),
    )
    actual.show()
    assert actual.where("expected_payload IS NOT NULL").count() == a.count()
    assert actual.where("expected_payload IS NULL").count() == b.count() + 1
def detokenize(column):
    """
    Detokenize a column using unbase64.

    :param column: the column to detokenize
    :return: the detokenized value cast to a string
    """
    return fn.unbase64(column).cast("string")
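# A minimal usage sketch for detokenize, assuming `fn` is pyspark.sql.functions
# as imported by the helper above. The DataFrame and column names here are
# illustrative, not taken from the original code.
from pyspark.sql import SparkSession

def _detokenize_example():
    spark = SparkSession.builder.getOrCreate()
    # "eyJhIjogMX0=" is the base64 encoding of '{"a": 1}'
    df = spark.createDataFrame([("eyJhIjogMX0=",)], ["payload"])
    return df.withColumn("payload", detokenize("payload"))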
def verify2(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    input_internal,
    input_external,
    output,
):
    """Verify a batch of SNIPs"""
    click.echo("Running verify2")
    spark = spark_session()
    shares = spark.read.json(input)
    internal = spark.read.json(input_internal)
    external = spark.read.json(input_external)
    df = (
        shares.select("id", F.unbase64("payload").alias("shares"))
        .join(internal.select("id", F.unbase64("payload").alias("internal")), on="id")
        .join(external.select("id", F.unbase64("payload").alias("external")), on="id")
        .select(
            "id",
            F.pandas_udf(
                partial(
                    udf.verify2,
                    batch_id,
                    n_data,
                    server_id,
                    private_key_hex,
                    b64decode(shared_secret),
                    public_key_hex_internal,
                    public_key_hex_external,
                ),
                returnType="binary",
            )("shares", "internal", "external").alias("payload"),
        )
    )
    valid = df.where("payload is not null")
    valid.write.json(output, mode="overwrite")
def aggregate(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    input_internal,
    input_external,
    output,
):
    """Generate an aggregate share from a batch of verified SNIPs"""
    click.echo("Running aggregate")
    spark = spark_session()
    shares = spark.read.json(input)
    internal = spark.read.json(input_internal)
    external = spark.read.json(input_external)
    args = [
        batch_id,
        n_data,
        server_id,
        private_key_hex,
        b64decode(shared_secret),
        public_key_hex_internal,
        public_key_hex_external,
    ]
    (
        shares.join(internal.withColumnRenamed("payload", "internal"), on="id")
        .join(external.withColumnRenamed("payload", "external"), on="id")
        .select(
            F.unbase64("payload").alias("shares"),
            F.unbase64("internal").alias("internal"),
            F.unbase64("external").alias("external"),
            F.spark_partition_id().alias("pid"),
        )
        .groupBy("pid")
        .applyInPandas(
            lambda pdf: udf.aggregate(*args, pdf),
            schema="payload: binary, error: int, total: int",
        )
        .groupBy()
        .applyInPandas(
            lambda pdf: udf.total_share(*args, pdf),
            schema="payload: binary, error: int, total: int",
        )
        .withColumn("payload", F.base64("payload"))
    ).write.json(output, mode="overwrite")
def generate_frames(display_loaded=False, outputs=False):
    # Read csv files, specify column types and generate frames
    business_schema = StructType()\
        .add("business_id", StringType(), True)\
        .add("name", StringType(), True)\
        .add("address", StringType(), True)\
        .add("city", StringType(), True)\
        .add("state", StringType(), True)\
        .add("postal_code", StringType(), True)\
        .add("latitude", FloatType(), True)\
        .add("longitude", FloatType(), True)\
        .add("stars", FloatType(), True)\
        .add("review_count", IntegerType(), True)\
        .add("categories", StringType(), True)

    business_frame = spark.read.format('csv')\
        .options(header='true', delimiter=" ")\
        .schema(business_schema)\
        .load('./yelp_businesses.csv')

    top_review_schema = StructType() \
        .add("review_id", StringType()) \
        .add("user_id", StringType()) \
        .add("business_id", StringType()) \
        .add("review_text", StringType()) \
        .add("review_date", StringType())  # Will be converted to TimestampType

    top_review_frame = spark.read.format('csv')\
        .options(header='true', delimiter=" ") \
        .schema(top_review_schema) \
        .load('./yelp_top_reviewers_with_reviews.csv')

    # Decode review_text, convert review_date to TimestampType
    top_review_frame = top_review_frame\
        .withColumn('review_text', unbase64(top_review_frame.review_text).cast(StringType()))\
        .withColumn('review_date', from_unixtime(top_review_frame.review_date).cast(TimestampType()))

    friendship_graph_frame = spark.read.format('csv')\
        .options(header='true', inferSchema='true')\
        .load('./yelp_top_users_friendship_graph.csv')

    if outputs:
        to_console = "5a:\nData frame objects, showing the columns and types: \n"
        to_console += "Yelp Businesses: " + str(business_frame) + "\n"
        to_console += "Yelp Top Reviewers: " + str(top_review_frame) + "\n"
        to_console += "Yelp Top Users Friendship Graph: " + str(
            friendship_graph_frame) + "\n"
        print(to_console)

    # Display frames (for testing)
    if display_loaded:
        business_frame.show()
        top_review_frame.show()
        friendship_graph_frame.show()

    return business_frame, top_review_frame, friendship_graph_frame
def read_df(uid):
    server_root = root / f"server_{uid}"
    raw = spark.read.json(str(server_root / "raw"))
    internal = spark.read.json(
        str(server_root / "intermediate" / "internal" / "verify2")
    )
    external = spark.read.json(
        str(server_root / "intermediate" / "external" / "verify2")
    )
    return (
        raw.select("id", F.unbase64("payload").alias("shares"))
        .join(
            internal.select("id", F.unbase64("payload").alias("internal")), on="id"
        )
        .join(
            external.select("id", F.unbase64("payload").alias("external")), on="id"
        )
    )
def test_verify2(spark, root, args):
    raw = spark.read.json(str(root / "server_a" / "raw"))
    internal = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify1")
    )
    external = spark.read.json(
        str(root / "server_a" / "intermediate" / "external" / "verify1")
    )
    actual = (
        raw.select("id", F.unbase64("payload").alias("shares"))
        .join(internal.select("id", F.unbase64("payload").alias("internal")), on="id")
        .join(external.select("id", F.unbase64("payload").alias("external")), on="id")
        .select(
            "id",
            F.base64(
                F.pandas_udf(
                    partial(
                        udf.verify2,
                        args.batch_id,
                        args.n_data,
                        args.server_id,
                        args.private_key_hex,
                        args.shared_secret,
                        args.public_key_hex_internal,
                        args.public_key_hex_external,
                    ),
                    returnType="binary",
                )("shares", "internal", "external")
            ).alias("expected_payload"),
        )
    )
    expected = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify2")
    )
    joined = actual.join(expected, on="id")
    assert joined.where("length(expected_payload) <> length(payload)").count() == 0
def publish(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input_internal,
    input_external,
    output,
):
    """Generate a final aggregate."""
    click.echo("Running publish")
    spark = spark_session()
    (
        spark.read.json(input_internal)
        .withColumn("server", F.lit("internal"))
        .union(
            spark.read.json(input_external).withColumn("server", F.lit("external"))
        )
        .withColumn("payload", F.unbase64("payload"))
        .groupBy()
        .pivot("server", ["internal", "external"])
        .agg(*[F.min(c).alias(c) for c in ["payload", "error", "total"]])
        .select(
            F.udf(lambda: str(uuid4()), returnType="string")().alias("id"),
            F.lit(datetime.utcnow().isoformat()).alias("timestamp"),
            F.pandas_udf(
                partial(
                    udf.publish,
                    batch_id,
                    n_data,
                    server_id,
                    private_key_hex,
                    b64decode(shared_secret),
                    public_key_hex_internal,
                    public_key_hex_external,
                ),
                returnType="array<int>",
            )("internal_payload", "external_payload").alias("payload"),
            F.col("internal_error").alias("error"),
            F.col("internal_total").alias("total"),
        )
        .write.json(output, mode="overwrite")
    )
def test_publish(spark, tmp_path, root, args):
    expect = json.loads(next((root / "server_a" / "processed").glob("*")).read_text())
    internal = root / "server_a" / "intermediate" / "internal" / "aggregate"
    external = root / "server_a" / "intermediate" / "external" / "aggregate"
    df = (
        spark.read.json(str(internal))
        .withColumn("server", F.lit("internal"))
        .union(spark.read.json(str(external)).withColumn("server", F.lit("external")))
        .withColumn("payload", F.unbase64("payload"))
        .groupBy()
        .pivot("server", ["internal", "external"])
        .agg(
            F.min("payload").alias("payload"),
            F.min("error").alias("error"),
            F.min("total").alias("total"),
        )
        .select(
            F.pandas_udf(
                partial(
                    udf.publish,
                    args.batch_id,
                    args.n_data,
                    args.server_id,
                    args.private_key_hex,
                    args.shared_secret,
                    args.public_key_hex_internal,
                    args.public_key_hex_external,
                ),
                returnType="array<int>",
            )("internal_payload", "external_payload").alias("payload"),
            F.col("internal_error").alias("error"),
            F.col("internal_total").alias("total"),
        )
    )
    assert df.count() == 1
    row = df.first()
    assert row.payload == expect["payload"]
    assert row.error == 0
    assert row.total == 5
def test_verify1(spark, root, args):
    df = spark.read.json(str(root / "server_a" / "raw"))
    df.show(vertical=True, truncate=100)
    actual = df.select(
        "id",
        F.base64(
            F.pandas_udf(
                partial(
                    udf.verify1,
                    args.batch_id,
                    args.n_data,
                    args.server_id,
                    args.private_key_hex,
                    args.shared_secret,
                    args.public_key_hex_internal,
                    args.public_key_hex_external,
                ),
                returnType="binary",
            )(F.unbase64("payload"))
        ).alias("expected_payload"),
    )
    expected = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify1")
    )
    joined = actual.join(expected, on="id")
    joined.show(vertical=True, truncate=100)
    # NOTE: Payloads are only the same if they are processed in a deterministic
    # order using the same context due to the pseudorandom seed. The CLI
    # application assumes the same server context across all of the rows in a
    # partition. However, the UDF approach will generate a new server context
    # for every row.
    assert joined.where("length(expected_payload) <> length(payload)").count() == 0
def clean_MES(df):
    df_decoded = df.withColumn('Body', decode(unbase64(df.Body), 'utf-8'))
    return flatten_df(
        flatten_df(
            df_decoded.withColumn(
                'Body',
                from_json(
                    col('Body'),
                    StructType([
                        StructField("dataItemType", StringType(), True),
                        StructField("assetId", StringType(), True),
                        StructField("value", StringType(), True),
                    ]))))
        .drop(col('SystemProperties'))
        .withColumn(
            'SystemProperties_connectionAuthMethod',
            from_json(
                col('SystemProperties_connectionAuthMethod'),
                StructType([
                    StructField("scope", StringType(), True),
                    StructField("type", StringType(), True),
                    StructField("issuer", StringType(), True),
                    StructField("acceptingIpFilterRule", StringType(), True),
                ])))
        .withColumn(
            'Body_Value',
            from_json(
                col('Body_Value'),
                StructType([
                    StructField("eventId", StringType(), True),
                    StructField("assetId", StringType(), True),
                    StructField("telemetryValue", StringType(), True),
                    StructField("description", StringType(), True),
                    StructField("dateTime", StringType(), True),
                    StructField("componentName", StringType(), True),
                    StructField("status", StringType(), True),
                ]))))
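# clean_MES above relies on a flatten_df helper that is not shown here. This is a
# hedged sketch of one common implementation, assumed to flatten one level of
# struct columns into top-level columns named <parent>_<child> (which matches the
# SystemProperties_connectionAuthMethod and Body_Value names used above); the
# original project's flatten_df may differ.
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.types import StructType

def flatten_df(df: DataFrame) -> DataFrame:
    # keep non-struct columns as-is
    flat_fields = [f for f in df.schema.fields if not isinstance(f.dataType, StructType)]
    struct_fields = [f for f in df.schema.fields if isinstance(f.dataType, StructType)]
    # expand each struct column into <parent>_<child> columns
    expanded = [
        col(f"{parent.name}.{child.name}").alias(f"{parent.name}_{child.name}")
        for parent in struct_fields
        for child in parent.dataType.fields
    ]
    return df.select([col(f.name) for f in flat_fields] + expanded)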
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+
# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
zSetEntriesDecodedStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    'encodedCustomer',
    unbase64(zSetEntriesEncodedStreamingDF.encodedCustomer).cast('string'))

# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords
zSetEntriesDecodedStreamingDF.withColumn(
    'encodedCustomer',
    from_json('encodedCustomer', customerMessageSchema)).select(
        col('encodedCustomer.*')).createOrReplaceTempView('CustomerRecords')

# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null, as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    'select customerName, email, phone, birthDay from CustomerRecords where birthDay is not null'
)

# TO-DO: from the emailAndBirthDayStreamingDF dataframe select the email and the birth year (using the split function)
# TO-DO: Split the birth year as a separate field from the birthday
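# A hedged sketch completing the two TO-DOs above, using the same split() idiom
# that the other exercises in this file use; the output variable name follows the
# emailAndBirthYearStreamingDF convention seen elsewhere and is an assumption,
# as is the availability of pyspark.sql.functions.split in this script.
emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF.select(
    'email',
    split(emailAndBirthDayStreamingDF.birthDay, "-").getItem(0).alias('birthYear'))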
redisServerStreamingDF = redisServerRawStreamingDF.selectExpr(
    "cast(key as string) key", "cast(value as string) value")

# this creates a temporary streaming view based on the streaming dataframe
# it can later be queried with spark.sql, we will cover that in the next section
redisServerStreamingDF.withColumn("value", from_json("value", redisMessageSchema))\
    .select(col('value.*')) \
    .createOrReplaceTempView("RedisData")

# Using spark.sql we can select any valid select statement from the spark view
zSetEntriesEncodedStreamingDF = spark.sql(
    "select key, zSetEntries[0].element as customer from RedisData")

zSetDecodedEntriesStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    "customer",
    unbase64(zSetEntriesEncodedStreamingDF.customer).cast("string"))

zSetDecodedEntriesStreamingDF\
    .withColumn("customer", from_json("customer", customerJSONSchema))\
    .select(col('customer.*'))\
    .createOrReplaceTempView("Customer")

customerStreamingDF = spark.sql(
    "select accountNumber, location, birthDay from Customer where birthDay is not null"
)

relevantCustomerFieldsStreamingDF = customerStreamingDF.select(
    'accountNumber', 'location',
    split(customerStreamingDF.birthDay, "-").getItem(0).alias("birthYear"))

# this takes the stream and "sinks" it to the console as it is updated one message at a time (null means the JSON parsing didn't match the fields in the schema):
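# A hedged sketch of the console sink described in the comment above; the output
# mode and lack of trigger options are assumptions, not taken from the original
# script.
relevantCustomerFieldsStreamingDF.writeStream\
    .outputMode("append")\
    .format("console")\
    .start()\
    .awaitTermination()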
redis_server_raw_df = spark\
    .readStream\
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:19092") \
    .option("subscribe", "redis-server") \
    .option("startingOffsets", "earliest") \
    .load()

redis_server_raw_df = redis_server_raw_df.selectExpr(
    "cast(key as string) key", "cast(value as string) value")

redis_server_raw_df\
    .withColumn("value", from_json("value", redis_server_schema))\
    .select(col("value.*"))\
    .createOrReplaceTempView("RedisSortedSet")

customers_df = spark.sql(
    "select key, zSetEntries[0].element as encodedCustomer from RedisSortedSet")

customers_df = customers_df.withColumn(
    "encodedCustomer", unbase64(customers_df.encodedCustomer).cast("string"))

customers_df\
    .withColumn("encodedCustomer", from_json("encodedCustomer", customer_schema))\
    .select(col("encodedCustomer.*"))\
    .createOrReplaceTempView("CustomerRecords")

emailAndBirthDayStreamingDF = spark.sql(
    "select email, birthDay from CustomerRecords "
    "where email is not null and birthDay is not null")

emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF\
    .select("email", split(col("birthDay"), "-").getItem(0).alias("birthYear"))

stedi_events_raw_df = spark\
    .readStream\
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:19092") \
redisServerStreamingDF = redisServerRawStreamingDF.selectExpr(
    "cast(key as string) key", "cast(value as string) value")

# this creates a temporary streaming view based on the streaming dataframe
# it can later be queried with spark.sql, we will cover that in the next section
redisServerStreamingDF.withColumn("value", from_json("value", redisMessageSchema))\
    .select(col('value.*')) \
    .createOrReplaceTempView("RedisData")

# Using spark.sql we can select any valid select statement from the spark view
zSetEntriesEncodedStreamingDF = spark.sql(
    "select key, zSetEntries[0].element as payment from RedisData")

zSetDecodedEntriesStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    "payment",
    unbase64(zSetEntriesEncodedStreamingDF.payment).cast("string"))

zSetDecodedEntriesStreamingDF\
    .withColumn("payment", from_json("payment", paymentJSONSchema))\
    .select(col('payment.*'))\
    .createOrReplaceTempView("Payment")

paymentStreamingDF = spark.sql("select reservationId, amount from Payment")

# this takes the stream and "sinks" it to the console as it is updated one message at a time (null means the JSON parsing didn't match the fields in the schema):
# {"reservationId":"9856743232","amount":"946.88"}
paymentStreamingDF.selectExpr("CAST(reservationId AS STRING) AS key",
                              "to_json(struct(*)) AS value")\
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092")\
# cast the value column in the streaming dataframe as a STRING
redis_server_streaming_df = redis_server_raw_streaming_df.selectExpr(
    "cast(key as string) key", "cast(value as string) value")

redis_server_streaming_df.withColumn(
    "value", from_json("value", redis_server_schema)).select(
        col('value.*')).createOrReplaceTempView("RedisSortedSet")

# execute a sql statement against the temporary view; it takes the element field from the 0th element in the array of structs and creates a column called customer
# the reason we do it this way is that the select syntax available against a view differs from a dataframe, and it makes it easy to select the nth element of an array in a sql column
encoded_streaming_df = spark.sql(
    "select key, zSetEntries[0].element as customer from RedisSortedSet")

decoded_streaming_df = encoded_streaming_df.withColumn(
    "customer", unbase64(encoded_streaming_df.customer).cast("string"))

# parse the JSON in the Customer record and store in a temporary view called CustomerRecords
decoded_streaming_df.withColumn(
    "customer", from_json("customer", customer_json_schema)).select(
        col('customer.*')).createOrReplaceTempView("CustomerRecords")

# JSON parsing will set non-existent fields to null, so select just the fields we want, where they are not null, as a new dataframe called emailAndBirthDayStreamingDF
email_bday_streaming_df = spark.sql(
    "select email, birthDay from CustomerRecords where birthDay is not null")

# Select only the birth year and email fields as a new streaming data frame called emailAndBirthYearStreamingDF
email_birth_year_streaming_df = email_bday_streaming_df.select(
    'email',
    split(email_bday_streaming_df.birthDay, "-").getItem(0).alias("birthYear"))
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+
# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
encodedCustomerDF = encodedCustomerDF.withColumn(
    "encodedCustomer",
    unbase64(encodedCustomerDF.encodedCustomer).cast("string"))

# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords
encodedCustomerDF\
    .withColumn("encodedCustomer", from_json("encodedCustomer", redisCustomerMessageSchema))\
    .select(col('encodedCustomer.*'))\
    .createOrReplaceTempView("CustomerRecords")

# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null, as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    "select email, birthDay from CustomerRecords where email is not null and birthDay is not null")

# TO-DO: Split the birth year as a separate field from the birthday
emailAndBirthDayStreamingDF = emailAndBirthDayStreamingDF.withColumn(
    "birthYear",
    split("birthDay", "-").getItem(0))
# TO-DO: take the encodedCustomer column which is base64 encoded at first like this:
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+
# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
df = encDf.withColumn("customer", unbase64(encDf.customer).cast("string"))

# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords
df.withColumn("customer", from_json("customer", customersSchema)).select(
    col("customer.*")).createOrReplaceTempView("CustomerRecords")

# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null, as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    "select email, birthDay from CustomerRecords where email is not null and birthDay is not null"
)

# TO-DO: Split the birth year as a separate field from the birthday
# TO-DO: Select only the birth year and email fields as a new streaming data frame called emailAndBirthYearStreamingDF
emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF.select(
    "email",
    split(emailAndBirthDayStreamingDF.birthDay, "-").getItem(0).alias("birthYear"))
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+
# and converting it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
decodedCustomer = encodedCustomer.withColumn(
    "customer", unbase64(encodedCustomer.customer).cast("string"))

# Parsing the JSON in the Customer record and storing it in a temporary view called CustomerRecords
decodedCustomer\
    .withColumn("customer", from_json("customer", customerInfoJSONschema))\
    .select(col("customer.*"))\
    .createOrReplaceTempView("CustomerRecords")

# JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null, as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    "select email, birthDay from CustomerRecords where email is not null")

# from the emailAndBirthDayStreamingDF dataframe select the email and the birth year (using the split function)
# Splitting the birth year as a separate field from the birthday
# Selecting only the birth year and email fields as a new streaming data frame called emailAndBirthYearStreamingDF
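# A hedged sketch completing the three comments above, following the split()
# idiom used in the other exercises in this file; the variable name
# emailAndBirthYearStreamingDF comes from the comment, the rest is an assumption.
emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF.select(
    "email",
    split(emailAndBirthDayStreamingDF.birthDay, "-").getItem(0).alias("birthYear"))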
# TO-DO: using the redisMessageSchema StructType, deserialize the JSON from the streaming dataframe
redisServerStreamingDF = redisServerStreamingDF.withColumn('value', from_json('value', redisMessageSchema)) \
    .select(col('value.*'))

# TO-DO: create a temporary streaming view called "RedisData" based on the streaming dataframe
# it can later be queried with spark.sql
redisServerStreamingDF.createOrReplaceTempView('RedisData')

# TO-DO: using spark.sql, select key, zSetEntries[0].element as customerLocation from RedisData
zSetEntriesEncodedStreamingDF = spark.sql(
    'select key, zSetEntries[0].element as customerLocation from RedisData')

# TO-DO: from the dataframe use the unbase64 function to select a column called customerLocation with the base64 decoded JSON, and cast it to a string
zSetEntriesDecodedStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    'customerLocation',
    unbase64(zSetEntriesEncodedStreamingDF.customerLocation).cast('string'))

# TO-DO: using the customer location StructType, deserialize the JSON from the streaming dataframe, selecting column customerLocation.* as a temporary view called CustomerLocation
zSetEntriesDecodedStreamingDF\
    .withColumn('customerLocation', from_json('customerLocation', customerLocationSchema)) \
    .select(col('customerLocation.*')) \
    .createOrReplaceTempView('CustomerLocation')

# TO-DO: using spark.sql select * from CustomerLocation
customerLocationStreamingDF = spark.sql('select * from CustomerLocation')

# TO-DO: write the stream to the console, and configure it to run indefinitely, the console output will look something like this:
# +-------------+---------+
# |accountNumber| location|
# +-------------+---------+
# |         null|     null|
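# A hedged sketch of the console sink described in the TO-DO above; the output
# mode and the absence of trigger options are assumptions, not taken from the
# original exercise solution.
customerLocationStreamingDF.writeStream\
    .outputMode("append")\
    .format("console")\
    .start()\
    .awaitTermination()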
# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format:
# {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
customerEncodedEntriesStreamingDF = spark.sql("""
    select zSetEntries[0].element as encodedCustomer from RedisSortedSet
""")

customerDecodedEntriesStreamingDF = customerEncodedEntriesStreamingDF \
    .withColumn("customer", F.unbase64(customerEncodedEntriesStreamingDF.encodedCustomer).cast("string"))

customerDecodedEntriesStreamingDF.withColumn("customer", F.from_json("customer", redisCustomerSchema)) \
    .select(F.col("customer.*")) \
    .createOrReplaceTempView("CustomerRecords")

emailAndBirthDayStreamingDF = spark.sql("""
    select * from CustomerRecords where email is not null AND birthDay is not null
""")

# parse the JSON in the Customer record and store in a temporary view called CustomerRecords
# JSON parsing will set non-existent fields to null, so let's
# select just the fields we want, where they are not null, as a new dataframe called emailAndBirthDayStreamingDF
customerEncodedEntriesStreamingDF = spark.sql("""
.option("startingOffsets", "earliest") \ .load() redis_server_raw_df = redis_server_raw_df.selectExpr( "cast(key as string) key", "cast(value as string) value") redis_server_raw_df\ .withColumn("value", from_json("value", redis_server_schema))\ .select(col("value.*"))\ .createOrReplaceTempView("RedisSortedSet") customers_df = spark.sql( "select key, zSetEntries[0].element as encodedCustomer from RedisSortedSet" ) customers_df = customers_df.withColumn( "encodedCustomer", unbase64(customers_df.encodedCustomer).cast("string")) customers_df\ .withColumn("encodedCustomer", from_json("encodedCustomer", customer_schema))\ .select(col("encodedCustomer.*"))\ .createOrReplaceTempView("CustomerRecords") emailAndBirthDayStreamingDF = spark.sql( "select email, birthDay from CustomerRecords " "where email is not null and birthDay is not null") emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF\ .select("email", split(col("birthDay"), "-").getItem(0).alias("birthYear")) emailAndBirthYearStreamingDF.writeStream.format("console").outputMode( "append").start().awaitTermination()
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+
# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
zSetEntriesDecodedStreamingDF = zSetEntriesEncodedStreamingDF\
    .withColumn("encodedCustomer", unbase64(zSetEntriesEncodedStreamingDF.encodedCustomer).cast("string"))

# Parse the JSON in the Customer record and store it in a temporary view called CustomerRecords
zSetEntriesDecodedStreamingDF.withColumn(
    "decodedCustomer", from_json("encodedCustomer", customerJSONSchema)).select(
        col("decodedCustomer.*")).createOrReplaceTempView("CustomerRecords")

# JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null, as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    "SELECT customerName, email, birthDay FROM CustomerRecords WHERE birthDay IS NOT NULL"
)

# Split the birth year as a separate field from the birthday
relevantEmailAndBirthDayStreamingDF = emailAndBirthDayStreamingDF.select(
    'customerName', 'email', 'birthDay',
    split(emailAndBirthDayStreamingDF.birthDay, "-").getItem(0).alias("birthYear"))
# TO-DO: take the encodedCustomer column which is base64 encoded at first like this:
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+
# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
zSetEntriesDecodedStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    "customer", unbase64(zSetEntriesEncodedStreamingDF.customer).cast("string"))

# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords
zSetEntriesDecodedStreamingDF.withColumn(
    "customer", from_json("customer", customerJSONSchema)).select(
        col("customer.*")).createOrReplaceTempView("CustomerRecords")

# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null, as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    "select email, birthDay from CustomerRecords where email is not null and birthDay is not null")

# TO-DO: Split the birth year as a separate field from the birthday
# TO-DO: Select only the birth year and email fields as a new streaming data frame called emailAndBirthYearStreamingDF
emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF.select(
    "email", split("birthDay", "-").getItem(0).alias("birthYear"))

# TO-DO: using the spark application object, read a streaming dataframe from the Kafka topic stedi-events as the source
# Be sure to specify the option that reads all the events from the topic including those that were published before you started the spark stream
stediEventsRawStreamingDF = spark.readStream.format("kafka")\
    .option("kafka.bootstrap.servers", "kafka:19092")\
    .option("subscribe", "stedi-events")\
    .option("startingOffsets", "earliest")\
    .load()

# TO-DO: cast the value column in the streaming dataframe as a STRING
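# A hedged sketch for the TO-DO above, mirroring the selectExpr cast applied to
# the redis-server stream earlier in this file; the variable name
# stediEventsStreamingDF is an assumption.
stediEventsStreamingDF = stediEventsRawStreamingDF.selectExpr(
    "cast(key as string) key", "cast(value as string) value")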
.options("subscribe", "redis-server") .options("startingOffsets", "earliest")\ .load() # TO-DO: cast the value column in the streaming dataframe as a STRING RedisServerStreamingDF = redisServerRawStreamingDF.selectExpr("cast(key as String) key", "cast(value as string) value") RedisServerStreamingDF.withColumn("value",from_json("value", redisMessageSchema))\ .select("value.*")\ .createOrReplaceTempView("RedisData") zSetEntriesEncodedStreamingDF = spark.sql("select key, zSetEntries[0].element as redisEvent from RedisData") zSetDecodedEntriesStreamingDF1 = zSetEntriesEncodedStreamingDF.withColumn("redisEvent", unbase64(zSetEntriesEncodedStreamingDF.redisEvent).cast("string")) # TO-DO:; parse the single column "value" with a json object in it, like this: # +------------+ # | value | # +------------+ # |{"key":"Q3..| # +------------+ # # with this JSON format: {"key":"Q3VzdG9tZXI=", # "existType":"NONE", # "Ch":false, # "Incr":false, # "zSetEntries":[{ # "element":"eyJjdXN0b21lck5hbWUiOiJTYW0gVGVzdCIsImVtYWlsIjoic2FtLnRlc3RAdGVzdC5jb20iLCJwaG9uZSI6IjgwMTU1NTEyMTIiLCJiaXJ0aERheSI6IjIwMDEtMDEtMDMifQ==",
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+
# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
decoded = unbase64(zSetEntriesEncodedStreamingDF.encodedCustomer).cast("string")
zSetDecodedEntriesStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    "encodedCustomer", decoded)

# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords
zSetDecodedEntriesStreamingDF\
    .withColumn("encodedCustomer", from_json("encodedCustomer", customerSchema))\
    .select(col('encodedCustomer.*'))\
    .createOrReplaceTempView("CustomerRecords")

# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null, as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    "select customerName, email, birthDay from CustomerRecords where email IS NOT NULL and birthDay IS NOT NULL"
)