/
stats_analysis.py
208 lines (178 loc) · 8.25 KB
/
stats_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""CSC110 Project: Climate Change Sentiment on Twitter: Stats analysis
Module Description:
====================
The module contains the functions that perform statistical operations on the data
and then graph it using plotly.
"""
from typing import List, Dict, Tuple
import statistics
import plotly.graph_objects as go
from data_formatting import Tweet
def compare_frequency_vader(sorted_tweets: Dict[int, List[Tweet]]) -> None:
    """Show a grouped bar chart of sentiment shares for each of the four opinions.

    The x-axis lists the opinions (does not support, neutral, support, news). Every
    opinion gets three bars: the share of its tweets that are positive, neutral and
    negative, taken from the mapping returned by frequency.
    """
    shares = frequency(sorted_tweets)
    opinion_keys = (-1, 0, 1, 2)
    opinion_labels = ['does not support', 'neutral', 'support', 'news']  # x-axis labels

    # (legend name, key in the frequency mapping, bar colour) for each sentiment,
    # listed in the order the bars appear within a group.
    bar_specs = (
        ('Positive', 'pos', 'green'),
        ('Neutral', 'neu', 'blue'),
        ('Negative', 'neg', 'red'),
    )
    bars = [
        go.Bar(name=legend,
               x=opinion_labels,
               y=[shares[opinion][sentiment] for opinion in opinion_keys],
               marker={'color': colour})
        for legend, sentiment, colour in bar_specs
    ]

    fig = go.Figure(data=bars)
    fig.update_layout(barmode='group',
                      title='Percentage of tweets of different sentiments within each opinion',
                      yaxis_title='Percentage of tweets',
                      xaxis_title='Opinion')
    fig.show()
def normal_histogram(tweets: List[Tweet]) -> None:
    """Show a plotly histogram of the compound sentiment value of every tweet in tweets.

    The mean, median, standard deviation and range of the compound values (as
    computed by summary) are shown in a text box underneath the histogram.

    Precondition:
        - all(t.sentiment is not None for t in tweets)
    """
    scores = [t.sentiment['compound'] for t in tweets]
    stats = summary(scores)

    # One caption line per statistic: (label text, key in the summary mapping).
    caption_parts = (
        ('Mean :', 'mean'),
        ('Median :', 'median'),
        ('Standard Deviation :', 'stdev'),
        ('Range :', 'range'),
    )
    # Annotation text is HTML, so '<br>' is what separates the lines.
    caption = '<br>'.join(label + str(stats[key]) for label, key in caption_parts)

    # Text box holding the summary statistics. xref/yref='paper' positions it relative
    # to the whole figure rather than the axes; y=0.4 with yanchor='top' puts its top
    # edge under the plot, whose y-axis domain is restricted to [0.5, 1] below.
    stats_box = go.layout.Annotation(
        bordercolor='black',
        align='left',
        yanchor='top',
        text=caption,
        showarrow=False,
        width=650,
        xref='paper',
        yref='paper',
        font={'family': 'Courier'},
        x=0,
        y=0.4
    )
    layout = go.Layout(
        height=800,
        width=800,
        yaxis=go.layout.YAxis(domain=[0.5, 1]),
        annotations=[stats_box]
    )

    # Build the histogram on top of the prepared layout, then fix the bin width.
    histogram = go.Histogram(x=scores, histnorm='probability')
    fig = go.Figure(data=[histogram], layout=layout)
    fig.update_traces(xbins_size=0.01, selector=dict(type='histogram'))
    fig.update_layout(title='Percentage of frequency of tweets against compound value range',
                      xaxis_title='Compound value range',
                      yaxis_title='Percentage of occurrence')
    fig.show()
def summary(data: List[float]) -> Dict[str, float]:
    """Return a dictionary of summary statistics for a list of numbers.

    Mappings:
        - 'mean': mean of the data
        - 'median': median of the data
        - 'stdev': sample standard deviation of the data; 0.0 when data holds a
          single value, for which the sample standard deviation is undefined
        - 'range': (statistical) range of the data

    Preconditions:
        - len(data) > 0

    >>> summary([1.0, 2.0, 3.0]) == {'mean': 2.0, 'median': 2.0, 'stdev': 1.0, 'range': 2.0}
    True
    >>> summary([4.0]) == {'mean': 4.0, 'median': 4.0, 'stdev': 0.0, 'range': 0.0}
    True
    """
    # statistics.stdev raises StatisticsError when given fewer than two values, yet
    # the precondition allows a single data point, so that case is reported as
    # "no spread" instead of crashing.
    if len(data) > 1:
        spread = statistics.stdev(data)
    else:
        spread = 0.0

    return {
        'mean': statistics.mean(data),
        'median': statistics.median(data),
        'stdev': spread,
        'range': max(data) - min(data)
    }
def plot_pos_neg(tweets: List[Tweet]) -> None:
    """Show a scatter plot with one marker per tweet in tweets.

    A tweet's negative score is its x-coordinate and its positive score is its
    y-coordinate.

    Precondition:
        - all(t.sentiment is not None for t in tweets)
    """
    # Negative scores go along the x-axis, positive scores along the y-axis.
    neg_scores = [t.sentiment['neg'] for t in tweets]
    pos_scores = [t.sentiment['pos'] for t in tweets]

    # mode='markers' draws individual points only, with no connecting lines.
    points = go.Scatter(x=neg_scores, y=pos_scores, mode='markers')
    fig = go.Figure(data=points)

    axis_text = {
        'title': 'Scatter plot of positive against negative polarity score',
        'xaxis_title': 'Negative polarity score',
        'yaxis_title': 'Positive polarity score',
    }
    fig.update_layout(**axis_text)
    fig.show()
def plot_compound(sorted_tweets: Dict[int, List[Tweet]]) -> None:
    """Show one horizontal box plot of compound sentiment values per opinion.

    sorted_tweets is read at the keys -1, 0, 1 and 2, and the 'compound' entry of
    every tweet's sentiment mapping is plotted.
    """
    # (opinion key in sorted_tweets, legend name of its box), in drawing order.
    opinion_boxes = (
        (-1, 'Against Climate Change'),
        (0, 'Neutral'),
        (1, 'In Support Of Climate Change'),
        (2, 'News'),
    )

    # Collect the compound values of every opinion before any drawing happens.
    series = [
        (label, [t.sentiment['compound'] for t in sorted_tweets[opinion]])
        for opinion, label in opinion_boxes
    ]

    fig = go.Figure()
    for label, compounds in series:
        # Passing the data as x (not y) lays the boxes out horizontally.
        fig.add_trace(go.Box(x=compounds, name=label))

    fig.update_layout(title='Box plot for each opinion',
                      xaxis_title='Compound value',
                      yaxis_title='Opinion')
    fig.show()
#############################################################################
# Helper functions:
#############################################################################
def min_max_values(data: List[float]) -> Tuple[float, float]:
    """Return the smallest and largest number in data as the tuple (min, max).

    Preconditions:
        - data != []
    """
    smallest = min(data)
    largest = max(data)
    return smallest, largest
def frequency(sorted_tweets: Dict[int, List[Tweet]]) -> Dict[int, Dict[str, float]]:
    """Return the share of positive, neutral and negative tweets for each opinion.

    The result maps each opinion key of sorted_tweets to a dictionary with the keys
    'pos', 'neu' and 'neg'. Each value is the fraction (between 0.0 and 1.0) of that
    opinion's tweets falling in the category, judged by the tweet's compound score:
    negative if compound <= -0.05, positive if compound >= 0.05, neutral otherwise.

    An opinion with no tweets gets 0.0 for all three categories.

    Precondition:
        - list(sorted_tweets.keys()) == [-1, 0, 1, 2]
    """
    freq_dict = {-1: {}, 0: {}, 1: {}, 2: {}}

    for key, tweets in sorted_tweets.items():
        counts = {'pos': 0, 'neu': 0, 'neg': 0}
        for tweet in tweets:
            compound = tweet.sentiment['compound']
            if compound <= -0.05:
                # a negative string according to vaderSentiment
                counts['neg'] += 1
            elif compound >= 0.05:
                # a positive string according to vaderSentiment
                counts['pos'] += 1
            else:
                # a neutral string according to vaderSentiment
                counts['neu'] += 1

        total = len(tweets)
        # Guard against dividing by zero when an opinion has no tweets at all.
        freq_dict[key] = {
            category: (count / total if total else 0.0)
            for category, count in counts.items()
        }

    return freq_dict
if __name__ == "__main__":
    import python_ta
    import python_ta.contracts
    import doctest

    # PythonTA settings: modules this file may import, the line-length limit, and
    # the checks that are switched off (R1705 no-else-return, C0200
    # consider-using-enumerate).
    ta_config = {
        'extra-imports': ['python_ta.contracts', 'data_formatting', 'plotly.graph_objects',
                          'statistics'],
        'max-line-length': 100,
        'disable': ['R1705', 'C0200']
    }
    python_ta.check_all(config=ta_config)

    # Enable checking of the docstring preconditions and type annotations in this
    # module. NOTE(review): DEBUG_CONTRACTS = False presumably silences PythonTA's
    # contract debug output - confirm against the installed version.
    python_ta.contracts.DEBUG_CONTRACTS = False
    python_ta.contracts.check_all_contracts()

    # Run every doctest found in this module and report each one.
    doctest.testmod(verbose=True)