from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark_cassandra import CassandraSparkContext

# Helpers read_config(), preprocess_data(), aggregate_list(),
# compute_stock_tending_in_window() and send_alert_to_kafka() are assumed
# to be defined elsewhere in this project.


def streaming_logic():
    """Initialize the Spark context and run all the streaming logic.

    :return: None
    """
    # - read configuration from file
    spark_config, kafka_config, cassandra_config = read_config()

    # - initialize spark context
    conf = SparkConf() \
        .setMaster(spark_config['master']) \
        .setAppName(spark_config['app_name']) \
        .set('spark.cassandra.connection.host', cassandra_config['cluster'])
    csc = CassandraSparkContext(conf=conf)
    csc.setLogLevel(spark_config['log_level'])
    ssc = StreamingContext(sparkContext=csc, batchDuration=spark_config['time_window'])

    # - create the Kafka direct stream
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, [kafka_config['topic_in']],
        {'metadata.broker.list': kafka_config['cluster']})

    # - start to process data
    # - output data structure: Metadata
    structured_stock_data = directKafkaStream.map(lambda data: preprocess_data(data=data))
    structured_stock_data.pprint(20)

    stock_data_list = structured_stock_data.reduceByKey(lambda a, b: aggregate_list(a, b))
    stock_data_list.pprint(20)

    # - get history data from cassandra
    alert_user_data = stock_data_list.mapValues(
        lambda dictlist: compute_stock_tending_in_window(dict_list=dictlist))
    alert_user_data.pprint(20)

    # - send alert to user
    alert_user_data.foreachRDD(
        lambda rdd: rdd.foreachPartition(
            lambda iter: send_alert_to_kafka(iterator=iter, kafka_config=kafka_config)))

    ssc.start()
    ssc.awaitTermination()
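
# read_config() is not shown in this snippet. A minimal sketch of what it
# might look like, assuming a JSON config file; the file name and key layout
# below are hypothetical:
import json


def read_config(path='config.json'):
    """Load settings and return them as (spark, kafka, cassandra) dicts."""
    with open(path) as f:
        config = json.load(f)
    return config['spark'], config['kafka'], config['cassandra']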
def main(): pwords = load_wordlist("./Dataset/positive.txt") nwords = load_wordlist("./Dataset/negative.txt") conf = SparkConf().\ setMaster("local[2]").\ setAppName("TweeStreamer").\ set("spark.cassandra.connection.host",\ "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84") sc = CassandraSparkContext(conf=conf) sc.setLogLevel("WARN") sql = SQLContext(sc) # Creating a streaming context with batch interval of 1 sec ssc = StreamingContext(sc, 10) ssc.checkpoint("checkpoint") kstream = KafkaUtils.createDirectStream( ssc, topics=['twitter-topic1'], kafkaParams={"metadata.broker.list": 'localhost:9092'}) #tweets = kstream.map(lambda x: json.loads( x[1].decode('utf-8'))) tweets = kstream.map(lambda x: json.loads(x[1])) tweetsUsentiment = tweets.map( lambda tweet: tweetwithSentiment(tweet, pwords, nwords)) #searchTermSentiment = tweetsUsentiment.pprint() tweetsUsentiment.saveToCassandra("tweetdb", "tweettable") ssc.start() ssc.awaitTerminationOrTimeout(100) ssc.stop(stopGraceFully=True)
def main(): pwords = load_wordlist("../Dataset/positive.txt") nwords = load_wordlist("../Dataset/negative.txt") sterms = load_wordlist("../Dataset/keyWords.txt") conf = SparkConf().\ setMaster("local[2]").\ setAppName("TweeStreamer").\ set("spark.cassandra.connection.host",\ "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84") sc = CassandraSparkContext(conf=conf) sc.setLogLevel("WARN") # Creating a streaming context with batch interval of 10 sec ssc = StreamingContext(sc, 10) ssc.checkpoint("checkpoint") kstream = KafkaUtils.createDirectStream( ssc, topics=['twitter-topic1'], kafkaParams={"metadata.broker.list": 'localhost:9092'}) tweets = kstream.map(lambda x: json.loads(x[1])) tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint() tweetsUsentiment = tweets.map( lambda tweet: tweetwithSentiment(tweet, pwords, nwords, sterms)) searchTermUsentiment = tweetsUsentiment.flatMap( lambda tweet: searchTermFunction(tweet, sterms)).reduceByKey( lambda a, b: a + b) searchTermUsentiment = searchTermUsentiment.map( lambda (key, value): { "searchterm": "_" + key, "insertion_time": datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S'), "sentiment": value }) searchTermUsentiment.pprint() searchTermUsentiment.saveToCassandra("tweetdb", "searchtermtable") # searchTermSentiment = tweetsUsentiment.map(lambda tweet: searchTermFunction(tweet,sterms)) ssc.start() ssc.awaitTerminationOrTimeout(1000) ssc.stop(stopGraceFully=True)
from datetime import datetime

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark_cassandra import CassandraSparkContext


def transfer_time(text):
    # return "2018-06-25"
    return datetime.today().strftime("%Y-%m-%d %H:%M:%S")


# withColumn() expects a Column, so wrap transfer_time() in a UDF rather
# than calling it directly on a column.
transfer_time_udf = udf(transfer_time, StringType())


def process(rdd):
    spark = getSparkSessionInstance(rdd.context.getConf())
    tweetsDataFrame = spark.read.json(rdd)
    # func() is assumed to be a UDF defined elsewhere, presumably extracting
    # the hashtag from the tweet text.
    df = tweetsDataFrame.withColumn('hashtag', func(tweetsDataFrame.text))
    df = df.withColumn('time', transfer_time_udf(tweetsDataFrame.time))
    df.createOrReplaceTempView("historicaltweets")
    df = spark.sql(
        "SELECT MAX(time) AS time, hashtag, count(*) AS count "
        "FROM historicaltweets WHERE hashtag IS NOT NULL "
        "GROUP BY hashtag ORDER BY count DESC")
    rdd = df.rdd.map(tuple)
    rdd.saveToCassandra("twitter", "tweet")
    df.show()


if __name__ == "__main__":
    sc = CassandraSparkContext(appName="tweet")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 600)
    topic_name = "twitter"
    streamFromKafka = KafkaUtils.createDirectStream(
        ssc, [topic_name], {"metadata.broker.list": '*'})
    lines = streamFromKafka.map(lambda x: x[1])
    lines.count().pprint()
    lines.foreachRDD(process)
    # text_counts = lines.map(lambda tweet: (tweet['hashtag'], 1)).reduceByKey(lambda x, y: x + y)
    ssc.start()
    ssc.awaitTermination()
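
# getSparkSessionInstance() is not defined in this snippet. A sketch
# following the lazily instantiated singleton pattern from the Spark
# Streaming programming guide, assuming that is the helper intended here
# (in the actual script it must be defined before process() is called):
from pyspark.sql import SparkSession


def getSparkSessionInstance(sparkConf):
    """Return a process-wide SparkSession, creating it on first use."""
    if 'sparkSessionSingletonInstance' not in globals():
        globals()['sparkSessionSingletonInstance'] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']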
import json
import sys

from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: consumer.py <kafka-host> <topic-name> <seconds>")
        sys.exit(-1)

    kafka_host = sys.argv[1]
    topic_name = sys.argv[2]
    seconds = int(sys.argv[3])

    conf = SparkConf() \
        .setAppName("data_challenge")

    from pyspark_cassandra import CassandraSparkContext
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, seconds)
    ssc.checkpoint('./output')

    d = dict()
    d['bootstrap.servers'] = kafka_host
    d['group.id'] = 'test-id'
    d['enable.auto.commit'] = 'false'
    kafka_stream = KafkaUtils.createDirectStream(ssc, [topic_name], d)

    # Parse messages as json
    tweets = kafka_stream.map(lambda v: json.loads(v[1]))
    # The messages were already parsed above, so index the dict directly
    # instead of calling json.loads() a second time.
    tweets_text = tweets.map(
        lambda tweet: tweet['text'].encode('ascii', 'ignore'))
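    # The snippet ends without starting the streaming context; a minimal,
    # assumed completion so the job actually runs (the original is
    # truncated here):
    tweets_text.pprint()
    ssc.start()
    ssc.awaitTermination()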