""" To run ./pyspark.submit.sh spark-streaming-foreachRDD-and-foreachPartition.py """ from pyspark import SparkContext, SparkConf from pyspark.streaming import StreamingContext from quiet_logs import quiet_logs if __name__ == "__main__": conf = SparkConf().setAppName("Reading files from a directory") sc = SparkContext(conf=conf) ssc = StreamingContext(sc, 2) quiet_logs(sc) lines = ssc.textFileStream('./streamingData') # Split each line into words words = lines.flatMap(lambda line: line.split(" ")) # Count each word in each batch pairs = words.map(lambda word: (word, 1)) wordCounts = pairs.reduceByKey(lambda x, y: x + y) # Print the first ten elements of each RDD generated in this DStream to the console wordCounts.pprint() ssc.start() # Start the computation
"""
To run: first start a data server with netcat,
    $ nc -lk 9998
and then run the example
    $ ./pyspark.submit.sh spark-streaming-listening-to-a-tcp-port.py
"""
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

from quiet_logs import quiet_logs

if __name__ == "__main__":
    conf = SparkConf().setAppName("Listening to a tcp port")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)

    quiet_logs(sc)

    # Create a DStream that connects to localhost:9998
    lines = ssc.socketTextStream("localhost", 9998)

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))
    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate