votecount_beta.py
import re
import json
import urllib.request
import pandas as pd
import formatter
from datetime import datetime
from requests_futures.sessions import FuturesSession
from bs4 import BeautifulSoup, SoupStrainer
############################################
#### GLOBAL VARIABLES
############################################
#Era Base URLs
era_url = 'https://www.resetera.com/'
base_thread_url = era_url+'threads/'
#Outer Mafia Base URLs
om_url = 'https://outermafia.com/'
om_thread_url = om_url+'index.php?threads/'
#Vote Tool Base URLs
vt_url = 'https://vote.fireblend.com/'
#Commands
command_player_list = "!player_list"
command_dead = "((.+) has died!)"
command_won = "((.+) has won the game!)"
command_lost = "((.+) has lost the game!)"
command_vote = "vote:"
command_doublevote = "double:"
command_triplevote = "triple:"
command_unvote = "unvote"
command_day_ends = "(day (.+) ends)"
command_day_begins = "(day (.+) begins)"
command_reset = "votes have been reset"
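#Illustrative examples (assumptions, not taken from any real thread): a colored "Day 2 begins"
#line in a mod post matches command_day_begins with group(2) == "2", and a colored
#"vote: alice" line in a player post registers a vote for "alice" via command_vote.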
# Dataframes
players_df = pd.DataFrame()
phases_df = pd.DataFrame()
votes_df = pd.DataFrame()
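#Shape of the data as built below: players_df has one row per player (name plus one
#post_count_<phase> column per phase), phases_df has one row per day phase (start/end
#link, post number, page and timestamp), and votes_df has one row per vote together
#with its unvote info (see addActiveVote).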
############################################
#### SOME USEFUL FUNCTIONS
############################################
#This strainer acts as a filter for the parser. We only care about div/header elements whose classes are any of these:
message_list_strainer = SoupStrainer(["div", "header"], {"class" : ["bbWrapper", "message-userDetails", "message-attribution-opposite", "message-attribution"]})
#This is the same thing, except for OuterMafia, since some divs have different names due to the theme difference:
mo_message_list_strainer = SoupStrainer(["div", "span"], {"class" : ["messageContent", "messageUserInfo", "messageDetails", "DateTime"]})
# Returns a soup object from a URL
def getSoup(url, isMessage=False, isOM=False):
#Load the URL using urllib.request
req = urllib.request.Request(
url,
data=None,
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
)
f = urllib.request.urlopen(req)
#Transform it into a Soup object
result = getSoupFromText(f, isMessage, isOM)
f.close()
return result
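#Example usage (a sketch; the thread slug is a placeholder):
#  soup = getSoup(base_thread_url + "example-game.12345/page-2", isMessage=True)
#With isMessage=True, only the post/user/link/timestamp elements are parsed, thanks to the strainer above.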
# Returns a soup object from text
def getSoupFromText(f, isMessage=False, isOM=False):
#If it's a thread page (isMessage=True), we use the strainer we defined to only parse
#what we care about. If not, we parse the entire page.
#Use the OuterMafia strainer if this is an OM page.
if isMessage and isOM:
result = BeautifulSoup(f, 'lxml', parse_only=mo_message_list_strainer)
#If not, use the regular strainer.
elif isMessage:
result = BeautifulSoup(f, 'lxml', parse_only=message_list_strainer)
else:
result = BeautifulSoup(f, 'lxml')
return result
# Marks a vote as inactive
def removeActiveVote(user, day, link, post_num, timestamp):
global votes_df
if(len(votes_df) == 0):
return
mask = (votes_df.sender_name==user) & votes_df.active & (votes_df.day == day)
votes_df.loc[mask, "active"] = False
votes_df.loc[mask, "unvote_timestamp"] = timestamp
votes_df.loc[mask, "unvote_link"] = link
votes_df.loc[mask, "unvote_num"] = post_num
# Adds a new vote
def addActiveVote(user, target, day, link, post_num, value, timestamp):
global votes_df
toAppend = {'sender_name': user,
'target_name': target,
'day': day,
'active': True,
'vote_link': link,
'unvote_link':None,
'vote_num': post_num,
'unvote_num':None,
'vote_timestamp':timestamp,
'unvote_timestamp':None,
'value':value }
votes_df = votes_df.append(toAppend, ignore_index = True)
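#A vote row stays active until the same sender unvotes or casts a new vote (removeActiveVote()
#then records the unvote link, post number and timestamp), or until the day's votes are reset,
#which drops that day's rows entirely.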
# This function runs in the background for each page that is loaded asynchronously.
def getSoupInBackground(sess, resp, isOM):
# Loads the page into soup
era_page = getSoupFromText(resp.text, True, isOM)
#These are the posts
posts = era_page.find_all("div", {"class" : "bbWrapper"})
#These are the users
users = era_page.find_all("div", {"class" : "message-userDetails"})
#These are the links
links = era_page.find_all("div", {"class" : "message-attribution-opposite"})
#These are the timestamps
timestamps = era_page.find_all("header", {"class" : "message-attribution"})
#We use alternative element names for OM, since its theme names them differently.
if(isOM):
posts = era_page.find_all("div", {"class" : "messageContent"})
users = era_page.find_all("div", {"class" : "messageUserInfo"})
links = era_page.find_all("div", {"class" : "messageDetails"})
timestamps = era_page.find_all("span", {"class" : "DateTime"})
#Readies the data for this page in the background
resp.data = {"posts":posts, "users":users, "links":links, "timestamps":timestamps}
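#Each completed response therefore carries a .data dict with parallel lists of posts, users,
#links and timestamps, which scrapeThread() below reads back in page order.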
############################################
#### MAIN SCRAPING FUNCTION
############################################
def scrapeThread(thread_id, om=False):
global players_df
global votes_df
global phases_df
# Store page in variable
thread_url = base_thread_url+thread_id
if om:
thread_url = om_thread_url+thread_id
era_page = getSoup(thread_url, False)
# Find out how many pages there are
numPages = 1
if om:
pages = era_page.find("span", {"class" : "pageNavHeader"})
if(pages != None):
nav = pages.contents[0].split(" ")
numPages = int(nav[3])
else:
aList = era_page.find_all('a', {'class':'pageNavSimple-el pageNavSimple-el--current'})
for a in aList:
numPages = int(a.get_text(strip=True).split(" of ")[1])
break
print("lastPage is "+str(numPages))
#Banner
banner_url = None
#By default, the scraper should start scanning on page 1, and have no
#reference to the last day end post scanned.
lastPage = 1
lastPost = None
#Check if there's a file corresponding to this game already
#if so, we load all game info and set variables so the scraper knows
#which page and post to start scraping from.
try:
players_df = pd.read_csv("gamecache_2.0/"+str(thread_id).replace("/","")+"_players.csv")
phases_df = pd.read_csv("gamecache_2.0/"+str(thread_id).replace("/","")+"_phases.csv")
votes_df = pd.read_csv("gamecache_2.0/"+str(thread_id).replace("/","")+"_votes.csv")
#banner_url = data["banner_url"]
#We find out the last day end page and post numbers, so we can start scraping from that point.
lastPage = phases_df.phase_end_page.max()
lastPost = phases_df.phase_end_number.max()
except Exception as e:
print("No file found, or error loading file: ")
print (e)
# Load pages asynchronously, I'm a mad scientist
session = FuturesSession(max_workers=10)
requests = []
for p in range(int(lastPage), numPages + 1):
# Each page request gets added to the session, along with the getSoupInBackground
# callback, which lets us do some additional parsing in the background
page_url = thread_url + "page-" + str(p)
requests.append(session.get(page_url, background_callback=lambda sess, resp: getSoupInBackground(sess, resp, om)))
# For each page:
for p in range(0, len(requests)):
print("Page "+str(p))
#Wait if needed for the request to complete. By the time it's done we should have
#access to the posts, users, links and timestamps as parsed by the getSoupInBackground function
pageData = requests[p].result().data
#These are the posts
posts = pageData["posts"]
#These are the users
users = pageData["users"]
#These are the links
links = pageData["links"]
#These are the timestamps
timestamps = pageData["timestamps"]
if(not om):
i = 0
while(i != len(links)):
lstr = links[i].find("a")['href'].partition("/permalink")[0]
if("#" in lstr or "threadmarks" in lstr):
links.pop(i)
i = i - 1
i = i+1
# If there are no active phases yet, grab banner url
if (len(phases_df) == 0):
img = posts[0].find("img")
if(img != None and img.has_attr('src')):
banner_url = img["src"]
#Strings are immutable, so strip a trailing slash by slicing rather than assigning to the last character
if '/' == banner_url[-1]:
banner_url = banner_url[:-1]
print(banner_url)
#For each post in this page:
for i in range(0, len(posts)):
nextPost = False
#Get the current post's content, the user, the link, timestamp and the post number
currentPost = posts[i]
currentUser = users[i].find("a", {"class": "username"}).get_text(strip=True).lower()
currentLink = era_url+links[i].find("a")['href'].partition("/permalink")[0]
currentTimestamp = timestamps[i].find("time")['datetime']
if (om):
currentLink = om_url+links[i].find("a")['data-href'].partition("/permalink")[0]
currentTimestamp = timestamps[i].find("span")['title']
currentPostNum = links[i].find("a").string
try:
current_phase_info = phases_df.loc[phases_df.phase_number.idxmax()]
phaseNum = phases_df.phase_number.max()
except Exception as e:
current_phase_info = pd.DataFrame()
phaseNum = 0
# Increment post count only if the latest phase is active
if(len(current_phase_info) > 0 and pd.isnull(phases_df.loc[phases_df.phase_number.idxmax(), "phase_end_link"])):
if(len(players_df) > 0 and len(players_df[players_df.name == currentUser]) > 0):
players_df.loc[players_df.name == currentUser, "post_count_"+str(current_phase_info.phase_number)] += 1
else:
players_df = players_df.append({"name":currentUser, "post_count_"+str(current_phase_info.phase_number):1}, ignore_index = True)
currentPostInt = int(currentPostNum.replace("#", "").replace(",", "").strip())
# If we set a last day end post, meaning we loaded some previous game data,
# skip all posts until the one after it, by comparing post numbers.
if (lastPost != None):
#Ignore the post if its number is lower than last post
if(currentPostInt <= lastPost):
continue
#Mark last post as none so we don't have to make this comparison for future posts
else:
lastPost = None
#Extract quotes so we don't accidentally count stuff in quotes
hasQuote = currentPost.findAll("div", {"class": " bbCodeBlock bbCodeBlock--expandable bbCodeBlock--quote"})
if(om):
hasQuote = currentPost.findAll("div", {"class": "bbCodeBlock bbCodeQuote"})
for quote in hasQuote: # Skips quoted posts
quote.extract()
#Find all potential "actions"
action_list = currentPost.find_all("span")
if(om):
action_list = currentPost.find_all("strong")
if len(action_list) > 0:
for action in action_list:
if nextPost:
break
#Check for color tags
if (action.has_attr('style') and 'color' in action['style']) or (action.has_attr('class') and 'bbHighlight' in action['class']):
#I'm removing bold tags here to simplify the command matching procedure
for match in action.findAll('b'):
match.replaceWithChildren()
#Check for valid commands
for line in str(action).lower().splitlines():
if nextPost:
break
#If the day is starting, set the current day variable to a new day
if(bool(re.search(command_day_begins, line, re.IGNORECASE))):
print("New day begins on post "+currentPostNum+"("+currentLink+")")
#Capture the day identifier so it can be used as the title
#of this phase when the data is displayed.
m = re.search(command_day_begins, line, re.IGNORECASE)
current_day_name = m.group(2)
phaseNum = 0
if(len(current_phase_info) > 0):
phaseNum = current_phase_info.phase_number+1
#float("nan") is used for the empty end-of-phase fields instead of the deprecated pd.np accessor
new_day_info = {"phase_name": current_day_name,
"phase_start_link": currentLink,
"phase_start_number": currentPostInt,
"phase_start_page": p+lastPage,
"phase_start_timestamp": currentTimestamp,
"phase_number": phaseNum,
"phase_end_link": float("nan"),
"phase_end_number": float("nan"),
"phase_end_page": float("nan"),
"phase_end_timestamp": float("nan")}
phases_df = phases_df.append(new_day_info, ignore_index = True)
current_phase_info = phases_df.loc[phases_df.phase_number.idxmax()]
players_df["post_count_"+str(current_phase_info.phase_number)] = 0
nextPost = True
break
#If the day has ended, record the end post info for the current phase and update the cache files
if(bool(re.search(command_day_ends, line, re.IGNORECASE))):
if len(current_phase_info) == 0:
continue
print("Day ends on "+currentPostNum)
phases_df.loc[phases_df.phase_number.idxmax(), "phase_end_link"] = currentLink
phases_df.loc[phases_df.phase_number.idxmax(), "phase_end_number"] = currentPostInt
phases_df.loc[phases_df.phase_number.idxmax(), "phase_end_page"] = p+lastPage
phases_df.loc[phases_df.phase_number.idxmax(), "phase_end_timestamp"] = currentTimestamp
#Update this game's cache files with day info
phases_df.to_csv("gamecache_2.0/"+str(thread_id).replace("/","")+"_phases.csv",index=False)
players_df.to_csv("gamecache_2.0/"+str(thread_id).replace("/","")+"_players.csv",index=False)
votes_df.to_csv("gamecache_2.0/"+str(thread_id).replace("/","")+"_votes.csv",index=False)
break
#Handle vote reset command
elif(command_reset in line):
if len(current_phase_info) == 0:
continue
votes_df = votes_df.drop(votes_df[votes_df.day == current_phase_info.phase_number].index)
print("Votes have been reset!")
nextPost = True
break
#Handle unvote command
elif(command_unvote in line):
if len(current_phase_info) == 0:
continue
print(currentUser+" UNVOTED")
removeActiveVote(currentUser, phaseNum, currentLink, currentPostInt, currentTimestamp)
#Handle vote command
elif(command_vote in line):
if len(current_phase_info) == 0:
continue
target = str(line).lower().partition(command_vote)[2].partition('<')[0].strip()
print(currentUser+" -> "+ target)
removeActiveVote(currentUser, phaseNum, currentLink, currentPostInt, currentTimestamp)
addActiveVote(currentUser, target, phaseNum, currentLink, currentPostInt, 1, currentTimestamp)
#Handle doublevote command
elif(command_doublevote in line):
if len(current_phase_info) == 0:
continue
target = str(line).lower().partition(command_doublevote)[2].partition('<')[0].strip()
print(currentUser+" ->> "+ target)
removeActiveVote(currentUser, phaseNum, currentLink, currentPostInt, currentTimestamp)
addActiveVote(currentUser, target, phaseNum, currentLink, currentPostInt, 2, currentTimestamp)
#Handle triple vote command
elif(command_triplevote in line):
if len(current_phase_info) == 0:
continue
target = str(line).lower().partition(command_triplevote)[2].partition('<')[0].strip()
print(currentUser+" ->>> "+ target)
removeActiveVote(currentUser, phaseNum, currentLink, currentPostInt, currentTimestamp)
addActiveVote(currentUser, target, phaseNum, currentLink, currentPostInt, 3, currentTimestamp)
return formatter.format(votes_df, players_df, phases_df)
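#Minimal usage sketch (assumptions: the thread slug below is a placeholder, a
#"gamecache_2.0" directory exists next to this script, and formatter.format()
#returns something printable).
if __name__ == "__main__":
    result = scrapeThread("example-mafia-game.12345/", om=False)
    print(result)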