from pyspark import SparkContext
from pyspark.streaming import StreamingContext
# (imports of the Tweet model, analysis helpers, and config values are omitted in this excerpt)


def createStreamingContext():
    # Create a StreamingContext against the standalone master with a 2-second batch interval
    sc = SparkContext("spark://%s:7077" % MASTER_NAME,
                      appName="GlutenTweet", pyFiles=PYFILES)
    ssc = StreamingContext(sc, 2)

    # Create a DStream of raw data from the socket feed
    raw = ssc.socketTextStream(MASTER_IP, 9999)

    # Convert into models
    tweets = raw.map(lambda r: Tweet(raw_json=r))

    # Store models
    tweets.foreachRDD(storeTweetsRDD)

    # Sliding window analysis: 20-minute window, sliding every 30 seconds
    window = tweets.window(20 * 60, 30)
    hashtagCounts = analysisHahtagCount(window)
    streamTop(hashtagCounts).pprint()

    # Keyword extraction - note tweets is immutable
    tweetsKeyword = tweets.map(lambda t: keywordExtraction(t))

    # Update models
    tweetsKeyword.foreachRDD(updateTweetsRDD)

    # Sliding window analysis over the keyword-enriched stream
    window2 = tweetsKeyword.window(20 * 60, 30)
    keywordCounts = analysisKeywordCount(window2)
    streamTop(keywordCounts).pprint()

    ssc.checkpoint(CHECKPOINT_DIR)
    return ssc
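# A minimal driver sketch (not part of the original listing) showing how a
# createStreamingContext factory is typically consumed: StreamingContext.getOrCreate
# recovers the context from the checkpoint directory if one exists, otherwise it
# calls the factory. CHECKPOINT_DIR is assumed to come from config, as above.
if __name__ == "__main__":
    ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createStreamingContext)
    ssc.start()              # start receiving data on the socket stream
    ssc.awaitTermination()   # block until the streaming job is stopped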
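# A hedged sketch of the streamTop helper used above; the real helper is defined
# elsewhere in the streaming module and is not shown here. It assumes the input is
# a DStream of (key, count) pairs and uses DStream.transform to keep only the top
# n pairs of each micro-batch, so that .pprint() prints a short leaderboard.
def streamTopSketch(counts, n=10):
    return counts.transform(
        lambda rdd: rdd.context.parallelize(rdd.top(n, key=lambda p: p[1])))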
from pyspark import SparkContext

from models import Tweet
from database import db_session
from analysis import keywordExtraction, analysisHahtagCount, analysisKeywordCount
from config import *

PYFILES = ['batch.py'] + PYFILES

# Create a SparkContext against the standalone master for the batch job
# sc = SparkContext("spark://%s:7077" % MASTER, "GlutenTweetBatch", pyFiles=PYFILES)
sc = SparkContext("spark://%s:7077" % 'hadoop-m-unoa',
                  appName="GlutenTweetBatch", pyFiles=PYFILES)

# Load all stored tweets from the database and distribute them as an RDD
dbTweets = db_session.query(Tweet).all()
tweets = sc.parallelize(dbTweets)

# Hashtag analysis
hashtagCounts = analysisHahtagCount(tweets)
print(hashtagCounts.top(10, key=lambda p: p[1]))

# Keyword extraction - note tweets is immutable
tweetsKeyword = tweets.map(lambda t: keywordExtraction(t))

# Update models
# (foreachRDD is a DStream operation, so it is not applicable to a plain RDD here)
# tweetsKeyword.foreachRDD(updateTweetsRDD)

# Keyword analysis
keywordCounts = analysisKeywordCount(tweetsKeyword)
print(keywordCounts.top(10, key=lambda p: p[1]))
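# A hedged sketch of what analysisHahtagCount could look like; the real
# implementation lives in analysis.py and is not shown here. It assumes a
# hypothetical Tweet.hashtags() helper returning a list of hashtag strings,
# and yields (hashtag, count) pairs so that .top(10, key=lambda p: p[1]) works
# on the result, matching its use in both the batch and streaming jobs.
from operator import add

def analysisHahtagCountSketch(tweets):
    return (tweets.flatMap(lambda t: t.hashtags())    # one record per hashtag
                  .map(lambda tag: (tag.lower(), 1))  # normalise case, pair with 1
                  .reduceByKey(add))                  # sum occurrences per hashtag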