Beispiel #1
0
def createStreamingContext():
    """Build (but do not start) the GlutenTweet streaming pipeline.

    Connects to the Spark master at ``spark://MASTER_NAME:7077``, reads text
    lines from a socket on ``MASTER_IP:9999`` with a batch interval of 2, and
    registers the following steps in this order:

    1. each line is wrapped in a ``Tweet`` and passed to ``storeTweetsRDD``;
    2. hashtag counts over a sliding window are printed through ``streamTop``;
    3. each tweet goes through ``keywordExtraction`` and the result is passed
       to ``updateTweetsRDD``;
    4. keyword counts over a sliding window of the same size are printed.

    Returns:
        The configured ``StreamingContext``, with checkpointing pointed at
        ``CHECKPOINT_DIR``.
    """
    # Cluster connection; PYFILES are handed to Spark together with the app.
    sparkCtx = SparkContext(
        "spark://%s:7077" % MASTER_NAME,
        appName="GlutenTweet",
        pyFiles=PYFILES,
    )
    batchInterval = 2
    streamingCtx = StreamingContext(sparkCtx, batchInterval)

    # Both windowed analyses share one geometry: 20 * 60 long, sliding by 30.
    windowLength = 20 * 60
    slideInterval = 30

    # Raw socket lines -> Tweet models, stored as they arrive.
    rawLines = streamingCtx.socketTextStream(MASTER_IP, 9999)
    tweetStream = rawLines.map(lambda line: Tweet(raw_json=line))
    tweetStream.foreachRDD(storeTweetsRDD)

    # Hashtag ranking over the sliding window.
    hashtagWindow = tweetStream.window(windowLength, slideInterval)
    streamTop(analysisHahtagCount(hashtagWindow)).pprint()

    # map() yields a new stream; tweetStream itself is left untouched.
    keywordStream = tweetStream.map(lambda tweet: keywordExtraction(tweet))
    keywordStream.foreachRDD(updateTweetsRDD)

    # Keyword ranking over the sliding window.
    keywordWindow = keywordStream.window(windowLength, slideInterval)
    streamTop(analysisKeywordCount(keywordWindow)).pprint()

    streamingCtx.checkpoint(CHECKPOINT_DIR)
    return streamingCtx
Beispiel #2
0
def createStreamingContext():
    """Assemble the GlutenTweet streaming job and return its context.

    The job talks to the Spark master at ``spark://MASTER_NAME:7077`` (not a
    local master), uses a batch interval of 2, and consumes text lines from
    the socket ``MASTER_IP:9999``. The returned ``StreamingContext`` is fully
    wired but not started; its checkpoint directory is ``CHECKPOINT_DIR``.

    Output operations are registered in this order: store tweets, print top
    hashtags, update tweets with keywords, print top keywords.
    """
    masterUrl = "spark://%s:7077" % MASTER_NAME
    context = SparkContext(masterUrl, appName="GlutenTweet", pyFiles=PYFILES)
    streaming = StreamingContext(context, 2)

    # Window length and slide interval used by both analyses below.
    windowSpec = (20 * 60, 30)

    # Source: one Tweet model per line received on the socket.
    lines = streaming.socketTextStream(MASTER_IP, 9999)
    tweetModels = lines.map(lambda rawJson: Tweet(raw_json=rawJson))

    # Persist the freshly parsed models.
    tweetModels.foreachRDD(storeTweetsRDD)

    # Windowed hashtag counts, printed after ranking by streamTop.
    hashtagCounts = analysisHahtagCount(tweetModels.window(*windowSpec))
    topHashtags = streamTop(hashtagCounts)
    topHashtags.pprint()

    # Derive a second stream carrying the keyword-extraction results;
    # tweetModels is not modified by map().
    withKeywords = tweetModels.map(lambda model: keywordExtraction(model))

    # Write the keyword-extraction results back to the stored models.
    withKeywords.foreachRDD(updateTweetsRDD)

    # Windowed keyword counts, printed after ranking by streamTop.
    keywordCounts = analysisKeywordCount(withKeywords.window(*windowSpec))
    topKeywords = streamTop(keywordCounts)
    topKeywords.pprint()

    streaming.checkpoint(CHECKPOINT_DIR)
    return streaming
Beispiel #3
0
from models import Tweet
from database import db_session
from analysis import keywordExtraction, analysisHahtagCount, analysisKeywordCount
from config import *

# Put 'batch.py' in front of the file list coming from config.
PYFILES = ['batch.py'] + PYFILES

# Batch (non-streaming) run: connect to the Spark master on port 7077.
# The host name is hard-coded; the earlier config-driven call was:
# sc = SparkContext("spark://%s:7077" % MASTER, "GlutenTweetBatch", pyFiles=PYFILES)
# NOTE(review): SparkContext is not imported explicitly in this snippet;
# presumably it arrives via `from config import *` - confirm.
sc = SparkContext(
    master="spark://%s:7077" % 'hadoop-m-unoa',
    appName="GlutenTweetBatch",
    pyFiles=PYFILES,
)

# Fetch every Tweet via db_session and distribute the list as an RDD.
dbTweets = db_session.query(Tweet).all()
tweets = sc.parallelize(dbTweets)

# Hashtag analysis: print the 10 entries with the largest second element.
hashtagCounts = analysisHahtagCount(tweets)
print(hashtagCounts.top(10, key=lambda pair: pair[1]))

# Keyword extraction: map() produces a new RDD, `tweets` stays as it was.
tweetsKeyword = tweets.map(lambda tweet: keywordExtraction(tweet))

# Writing the updated models back is disabled:
# tweetsKeyword.foreachRDD(updateTweetsRDD)
# NOTE(review): foreachRDD is a DStream method; tweetsKeyword is an RDD here.

# Keyword analysis: print the 10 entries with the largest second element.
keywordCounts = analysisKeywordCount(tweetsKeyword)
print(keywordCounts.top(10, key=lambda pair: pair[1]))
Beispiel #4
0
from pyspark import SparkContext

from models import Tweet
from database import db_session
from analysis import keywordExtraction, analysisHahtagCount, analysisKeywordCount
from config import *

# Put 'batch.py' in front of the file list coming from config.
PYFILES = ['batch.py'] + PYFILES

# Plain SparkContext for a batch run (no StreamingContext is involved).
# The master host is hard-coded; the previous config-driven variant was:
# sc = SparkContext("spark://%s:7077" % MASTER, "GlutenTweetBatch", pyFiles=PYFILES)
sc = SparkContext(
    master="spark://%s:7077" % 'hadoop-m-unoa',
    appName="GlutenTweetBatch",
    pyFiles=PYFILES,
)

# Read all Tweet objects through db_session, then hand them to Spark.
dbTweets = db_session.query(Tweet).all()
tweets = sc.parallelize(dbTweets)

# Hashtag analysis - show the top 10, ordered by element [1] of each entry.
hashtagCounts = analysisHahtagCount(tweets)
print(hashtagCounts.top(10, key=lambda entry: entry[1]))

# Keyword extraction - the result is a new RDD; `tweets` is not modified.
tweetsKeyword = tweets.map(lambda item: keywordExtraction(item))

# Model update step, currently switched off:
# tweetsKeyword.foreachRDD(updateTweetsRDD)
# NOTE(review): foreachRDD is a DStream method; tweetsKeyword is an RDD here.

# Keyword analysis - show the top 10, ordered by element [1] of each entry.
keywordCounts = analysisKeywordCount(tweetsKeyword)
print(keywordCounts.top(10, key=lambda entry: entry[1]))