#!/usr/bin/python3
#########################################################
# DES: Reducer script to find insights regarding sentiment analysis scores for each date.
# On a follower-weighted basis, as well as a tweets-per-day basis, finds insights
# such as average sentiment scores per date, and correlations between sentiment scores and favourites/RTs.
# Please note:
# Because standard deviation requires at least 2 values per date, and because some dates have only 1 tweet,
# this reducer seeds every list with a 0 so each has at least 2 values. The assumption is that for dates with
# so few tweets these statistics are meaningless anyway, while for all other dates the extra 0 has negligible impact.
# Similarly, when only 1 tweet is counted per date, SciPy prints a warning when calculating the correlation
# coefficient and returns 'nan' for such dates, which is why warnings are disabled.
# Results are manually copied from HDFS onto the Ubuntu EC2 machine using 'hdfs dfs -copyToLocal'.
# BY: Tiernan Barry, x19141840 - NCI.
#########################################################
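#########################################################
# Usage sketch (for context only; the jar path, mapper script name and HDFS
# paths below are assumptions, not taken from this repo):
#   hadoop jar /usr/lib/hadoop/hadoop-streaming.jar \
#       -files mapper_sentiment.py,reducer_sentiment.py \
#       -mapper mapper_sentiment.py \
#       -reducer reducer_sentiment.py \
#       -input /user/ubuntu/tweets_in \
#       -output /user/ubuntu/sentiment_out
#   hdfs dfs -copyToLocal /user/ubuntu/sentiment_out .
#########################################################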
#########################################################
# Libraries and source scripts:
#########################################################
import sys
from sortedcontainers import SortedList
import statistics as stats
import csv
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings("ignore")  # pearsonr warns (and returns nan) for dates with too few tweets
#########################################################
# Reducer:
#########################################################
last_date_key = None
count_per_date = 0
favs_per_dt = 0
rt_per_dt = 0
aggregate_sentiment = 0
aggregate_sentiment_rnd = 0 # new variable for categorical sentiment
sent_list_sort = SortedList()
list_sentiment = []
list_sentiment_rnd = []
favs_to_follower = []
rt_to_follower = []
# covid count:
aggregate_covid_count = 0
# Seed all lists with a 0, so every list has at least 2 values once the first tweet arrives.
# Enables correlation and standard deviation for dates with fewer than 2 tweets (not meaningful anyway).
sent_list_sort.add(0)
list_sentiment.append(0)
list_sentiment_rnd.append(0)
favs_to_follower.append(0)
rt_to_follower.append(0)
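# Why the padding is needed (documented behaviour of the libraries, not
# specific to this project): statistics.stdev() raises StatisticsError for
# fewer than two data points, and scipy.stats.pearsonr() warns and returns
# nan when an input is constant, e.g.:
#   stats.stdev([0.5])          # -> StatisticsError
#   pearsonr([0, 0], [1, 2])    # -> (nan, nan) plus a constant-input warning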
# Input: (date, "MEDIA_TWITTER_ACC", fav_count, rt_count, followers, login_device, sentiment, sentiment_rnd, covid_count)
# Print column headings for output in CSV format:
print("DATE_TIME,SOURCE,MEAN_SENT_POLARITY,MEAN_SENT_CATG,STND_DEV_SENT,MEDIAN_SENT,MIN_SENT,MAX_SENT,FAVS_PER_TWEETS,RT_PER_TWEET,"
"CORR_FAV_SENT,CORR_RT_SENT,TWEETS_PER_HOUR,COVID_COUNT")
# Reduce by date:
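# Hadoop streaming delivers reducer input sorted by key, so all tweets for a
# given date arrive consecutively; comparing each key with the previous one is
# enough to detect group boundaries.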
for key_value in csv.reader(sys.stdin):
    this_date_key = key_value[0]
    source = str(key_value[1])
    fav = int(key_value[2])
    rt = int(key_value[3])
    follower = int(key_value[4])  # key_value[5] is login_device, unused here
    sentiment_value = float(key_value[6])
    sentiment_rnd = int(key_value[7])
    covid = int(key_value[8])
    if last_date_key == this_date_key:
        count_per_date += 1
        aggregate_sentiment += sentiment_value
        aggregate_sentiment_rnd += sentiment_rnd
        favs_per_dt += fav  # add favs per date
        rt_per_dt += rt
        aggregate_covid_count += covid
        sent_list_sort.add(sentiment_value)
        list_sentiment.append(sentiment_value)
        list_sentiment_rnd.append(sentiment_rnd)  # keep categorical list in step with the others
        favs_to_follower.append(fav / follower if follower else 0.0)  # guard against 0 followers
        rt_to_follower.append(rt / follower if follower else 0.0)
    else:
        if last_date_key:
            print('%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s' %
                  (last_date_key,
                   source,
                   aggregate_sentiment / count_per_date,      # avg polarity
                   aggregate_sentiment_rnd / count_per_date,  # avg categorical sentiment
                   stats.stdev(sent_list_sort),               # standard deviation
                   stats.median(sent_list_sort),              # median (includes the seeded 0)
                   sent_list_sort[0],                         # min
                   sent_list_sort[-1],                        # max
                   favs_per_dt / count_per_date,              # favourites per tweet
                   rt_per_dt / count_per_date,                # retweets per tweet
                   pearsonr(list_sentiment, favs_to_follower)[0],
                   pearsonr(list_sentiment, rt_to_follower)[0],
                   count_per_date,
                   aggregate_covid_count))
        # Restart the accumulators for the new date group:
        aggregate_sentiment = sentiment_value
        aggregate_sentiment_rnd = sentiment_rnd
        last_date_key = this_date_key
        favs_per_dt = fav
        rt_per_dt = rt
        count_per_date = 1
        aggregate_covid_count = covid
        sent_list_sort = SortedList()
        list_sentiment = []
        list_sentiment_rnd = []
        favs_to_follower = []
        rt_to_follower = []
        # Seed all lists with a 0, so each has at least 2 values:
        sent_list_sort.add(0)
        list_sentiment.append(0)
        list_sentiment_rnd.append(0)
        favs_to_follower.append(0)
        rt_to_follower.append(0)
        # Add the actual data:
        sent_list_sort.add(sentiment_value)
        list_sentiment.append(sentiment_value)
        list_sentiment_rnd.append(sentiment_rnd)
        favs_to_follower.append(fav / follower if follower else 0.0)
        rt_to_follower.append(rt / follower if follower else 0.0)
# Output summary stats for the final date group (the truthiness check also
# guards against empty input, where this_date_key would be undefined):
if last_date_key:
    print('%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s' %
          (last_date_key,
           source,
           aggregate_sentiment / count_per_date,      # avg polarity
           aggregate_sentiment_rnd / count_per_date,  # avg categorical sentiment
           stats.stdev(sent_list_sort),               # standard deviation
           stats.median(sent_list_sort),              # median (includes the seeded 0)
           sent_list_sort[0],                         # min
           sent_list_sort[-1],                        # max
           favs_per_dt / count_per_date,              # favourites per tweet
           rt_per_dt / count_per_date,                # retweets per tweet
           pearsonr(list_sentiment, favs_to_follower)[0],
           pearsonr(list_sentiment, rt_to_follower)[0],
           count_per_date,
           aggregate_covid_count))
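#########################################################
# Local smoke test (a sketch; the mapper and sample file names are
# assumptions, not taken from this repo). Piping through sort emulates the
# Hadoop shuffle, grouping rows by their date key:
#   cat tweets_sample.csv | python3 mapper_sentiment.py | sort | python3 reducer_sentiment.py
#########################################################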