/
spark.py
159 lines (118 loc) · 4.88 KB
/
spark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from geopy.geocoders import Nominatim
from textblob import TextBlob
from elasticsearch import Elasticsearch
import socket
import re
# Host used for both socket connections made by this script
TCP_IP = 'localhost'
# Port of the tweet text stream; getHashtagData connects to TCP_PORT + 1 instead
TCP_PORT = 9001
# Translate a polarity score into a sentiment label
def getSentiment(polarity):
    """Return the sentiment label that corresponds to ``polarity``.

    A score below zero is "negative", a score equal to zero is "neutral",
    and any other score is "positive".
    """
    if polarity < 0:
        label = "negative"
    elif polarity == 0:
        label = "neutral"
    else:
        label = "positive"
    return label
# Reset Elasticsearch for a fresh run and create the index with a geo_point field
def initES(hashtag):
    """Delete the indices of a previous run and recreate the ``hashtag`` index.

    The index named ``hashtag`` and the literal pattern "hashtag-*" are both
    deleted, with statuses 400 and 404 passed as ``ignore``. The ``hashtag``
    index is then created with its "location" property typed as "geo_point".
    """
    print("Initializing Elasticsearch index " + hashtag)

    client = Elasticsearch()

    # Remove whatever the previous run left behind
    for staleIndex in (hashtag, "hashtag-*"):
        client.indices.delete(index=staleIndex, ignore=[400,404])

    # Recreate the index so that "location" is stored as a geo_point
    locationField = {"type": "geo_point"}
    indexBody = {"mappings": {"properties": {"location": locationField}}}
    client.indices.create(index=hashtag, body=indexBody)
# Leading run of ASCII letters, digits, spaces and underscores that contains
# at least one letter or digit; used to strip special characters from a
# country name.
_COUNTRY_PATTERN = re.compile(r"^[A-Za-z0-9 _]*[A-Za-z0-9][A-Za-z0-9 _]*")


def processTweet(tweet):
    """Score, geolocate and index a single tweet record.

    Args:
        tweet: A string of the form ``rawLocation::text::hashtag``. Records
            with fewer than three ``::``-separated fields are ignored.

    Returns:
        None. When every step succeeds, one document (coordinates, tweet
        text, raw location, country and sentiment) is indexed in
        Elasticsearch under an index name derived from the hashtag. The
        record is skipped when the location cannot be geocoded, when the
        geocoder reports an error, or when no usable country name is found.
    """
    tweetData = tweet.split("::")
    if len(tweetData) <= 2:
        return

    rawLocation = tweetData[0]
    text = tweetData[1]
    hashtag = tweetData[2]

    # (i) Sentiment analysis: polarity of the tweet text
    score = TextBlob(text).sentiment.polarity

    # (ii) Geolocation. Imported here (after the malformed-record guard) so
    # the exception type is in scope without touching the file's import block.
    from geopy.exc import GeopyError

    geolocator = Nominatim(user_agent="spark")
    try:
        location = geolocator.geocode(rawLocation)
    except GeopyError as err:
        # A geocoder timeout or outage must not fail the whole Spark task
        print("Geocoding failed: ", err)
        location = None

    print("=========================\ntweet: ", str(bytes(text, "utf-8")))
    print("hashtag:", hashtag)
    print("Raw location from tweet status: ", str(bytes(rawLocation, "utf-8")))

    if location is None:
        return

    try:
        rawLoc = geolocator.reverse([location.latitude, location.longitude])
    except GeopyError as err:
        print("Reverse geocoding failed: ", err)
        return
    # reverse() yields None when the service has no result for the point
    if rawLoc is None:
        return

    address = rawLoc.raw.get("address") or {}
    if "country" not in address:
        return

    # Remove special characters: keep only the leading ASCII part of the name.
    # NOTE(review): a non-ASCII name is cut at its first non-ASCII character
    # (e.g. "España" becomes "Espa"); requesting English names from the
    # geocoder may be preferable - confirm before changing the indexed values.
    countryMatch = _COUNTRY_PATTERN.match(address["country"])
    if countryMatch is None:
        return
    country = countryMatch.group(0).strip()

    print(country)
    print("Coords: " + str(location.latitude) + ", " + str(location.longitude))
    sentiment = getSentiment(score)
    print(sentiment)

    # (iii) Index the document in Elasticsearch
    json_data = {
        "latitude": location.latitude,
        "longitude": location.longitude,
        "tweet": text,
        "raw_location": rawLocation,
        # "lat,lon" string form for the geo_point-typed field
        "location": str(location.latitude) + "," + str(location.longitude),
        "country": country,
        "sentiment": sentiment,
    }

    # Build the index name from the hashtag, dropping ":" and ","
    index = hashtag.replace("#", "hashtag-").lower()
    for c in [":", ","]:
        index = index.replace(c, "")

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    es.index(index=index, body=json_data)
def getHashtagData():
    """Read the hashtag sent over the side-channel socket and return it as an index name.

    Connects to ``TCP_IP`` on ``TCP_PORT + 1``, reads up to 1024 bytes,
    decodes them as UTF-8, and returns the text lower-cased with every "#"
    replaced by "hashtag-".

    Returns:
        str: The index name derived from the received data.
    """
    # The context manager closes the socket even if connect() or recv() raises
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.connect((TCP_IP, TCP_PORT + 1))
        data = sock.recv(1024)

    print("Got data from stream.py")
    print(data)
    hashtag = data.decode("utf-8")
    print(hashtag)
    return hashtag.replace("#", "hashtag-").lower()
# Fetch the index name for the tracked hashtag and reset Elasticsearch for it
hashtagIndex = getHashtagData()
initES(hashtagIndex)

# Spark configuration: application 'TwitterApp' on the 'local[2]' master
conf = SparkConf().setAppName('TwitterApp').setMaster('local[2]')

# Spark context built from that configuration
sc = SparkContext(conf=conf)

# Accepted log levels: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
sc.setLogLevel("ERROR")

# Streaming context with a batch interval of 4 seconds, checkpointed on disk
ssc = StreamingContext(sc, 4)
ssc.checkpoint("checkpoint_TwitterApp")

# Read the text stream from TCP_IP:TCP_PORT and run processTweet on every record
dataStream = ssc.socketTextStream(TCP_IP, TCP_PORT)
dataStream.foreachRDD(lambda batch: batch.foreach(processTweet))

# Start streaming and block until the job terminates
ssc.start()
ssc.awaitTermination()