Example #1
df_json.select("First_Name").show()
df_json.select(df_json["First_Name"], df_json["dob"],
               ((df_json["dob"] - 1).cast(
                   IntegerType())).alias("changed_dob")).show()
#+----------+----+-----------+
#|First_Name| dob|changed_dob|
#+----------+----+-----------+
#|     DAVID|2013|       2012|
#|    JAYDEN|2013|       2012|
#|      RUBY|2014|       2013|
#|     MOSHE|2012|       2011|
#|     ETHAN|2015|       2014|
#|     EDDIE|2012|       2011|
#|    RACHEL|2014|       2013|
#|     ELENA|2014|       2013|
#|    MIGUEL|2013|       2012|
#|      ROSY|2015|       2014|
#+----------+----+-----------+
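
The same derived column can also be produced with withColumn, which can read more cleanly when only one expression is added; a small sketch assuming the same df_json DataFrame:

from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

#add the derived column, then project only the columns of interest
df_json.withColumn("changed_dob", (col("dob") - 1).cast(IntegerType()))\
       .select("First_Name", "dob", "changed_dob")\
       .show()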

#filter rows on a column condition
df_json.filter(df_json["FATHER_INCOME"] > 3750).show()
#group by DOB and count the rows in each group
df_json.groupBy("DOB").count().show()

#global temp view
df_json.createOrReplaceGlobalTempView("baby_gv")
#sql operation on global temp view
spark.sql("select race from global_temp.baby_gv").show()
#query the global temp view from a new session
spark.newSession().sql("select state from global_temp.baby_gv").show()
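
For contrast, a regular temporary view is scoped to the session that creates it and is queried without the global_temp prefix; the view name baby_tv below is illustrative:

#session-scoped temp view - not visible from other sessions
df_json.createOrReplaceTempView("baby_tv")
spark.sql("select race from baby_tv").show()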
Example #2
     .option("kafka.bootstrap.servers", "192.168.1.100:9092")\
     .option("subscribe", "json_topic")\
     .option("startingOffsets", "earliest")\
     .load()
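
The value column of a Kafka source is binary, so the JSON payload is normally parsed into typed columns before it can be referenced by name (for example the id used below). A minimal sketch, assuming a hypothetical two-field schema for the messages in json_topic:

from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

#hypothetical schema for the JSON messages; adjust to the real payload
json_schema = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType())
])

#cast the binary value to a string, parse it, and expand the struct into columns
df = df.selectExpr("CAST(value AS STRING) AS json_str")\
       .select(from_json(col("json_str"), json_schema).alias("data"))\
       .select("data.*")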

#serialize all columns back to JSON and write the stream to another Kafka topic
#(the Kafka sink requires a checkpoint location; the path below is a placeholder)
df.selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value")\
    .writeStream\
    .format("kafka")\
    .outputMode("append")\
    .option("kafka.bootstrap.servers", "192.168.1.100:9092")\
    .option("topic", "json_data_topic")\
    .option("checkpointLocation", "/tmp/kafka_checkpoint")\
    .start()\
    .awaitTermination()
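
For quick inspection during development, the same stream can be written to the console sink instead of Kafka; this is an alternative to the query above, not an additional step, and no checkpoint location is needed here:

#debug sink: print each micro-batch to the console
df.writeStream\
    .format("console")\
    .outputMode("append")\
    .start()\
    .awaitTermination()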


#newSession() - creates an additional session from an existing SparkSession instance
spark_session = spark.newSession()
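
A new session shares the underlying SparkContext with the original one, so global temporary views such as baby_gv from Example #1 remain reachable, while session-scoped temp views and SQL configuration do not carry over. A minimal check, assuming the spark session and the baby_gv view created earlier:

#the SparkContext is shared between the two sessions
print(spark_session.sparkContext is spark.sparkContext)  # True
#global temp views stay visible through the global_temp database
spark_session.sql("select race from global_temp.baby_gv").show()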