def clean_close(session: SparkSession) -> None:
    """Clean up a SparkSession's working artifacts and stop it.

    :param session: the active SparkSession to clean up and stop
    :return: None
    """
    clean_spark_session(session)
    clean_spark_dir()
    session.stop()
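# A minimal usage sketch (an assumption, not from the original module): run a
# short job on a local session, then release everything through clean_close.
# The master and app name below are hypothetical test values.
def _demo_clean_close() -> None:
    session = (SparkSession.builder
               .master("local[2]")
               .appName("clean-close-demo")
               .getOrCreate())
    try:
        session.createDataFrame([(1, "a")], ["id", "val"]).count()
    finally:
        # clean_spark_session and clean_spark_dir are this module's own
        # helpers; clean_close wraps them and stops the session.
        clean_close(session)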
StructField("Lang", StringType(), True), StructField("Date", StringType(), True), StructField("Source", StringType(), True), StructField("Len", StringType(), True), StructField("Likes", StringType(), True), StructField("RTs", StringType(), True), StructField("Hashtags", StringType(), True), StructField("UserMentionNames", StringType(), True), StructField("UserMentionID", StringType(), True), StructField("name", StringType(), True), StructField("Place", StringType(), True), StructField("Followers", StringType(), True), StructField("Friends", StringType(), True) ]) inputDF = spark.readStream.schema(schema).option("delimiter", ";").option( "maxFilesPerTrigger", 1).csv(inputPath) #query1 =inputDF.select(explode(split("Hashtags", ",")).alias("Hashtags")).groupBy("Hashtags").count().orderBy('count', ascending=False) query2 = inputDF.withColumn( "Ratio", inputDF.Followers / inputDF.Friends).groupBy('name').agg( max('Ratio').alias('FRRatio')).orderBy('FRRatio', ascending=False) #query1.writeStream.outputMode("complete").format("console").option("numRows",5).start().awaitTermination(100) query2.writeStream.outputMode("complete").format("console").option( "numRows", 1).start().awaitTermination(60) #query2.stop() spark.stop()