# scraper.py
# Forked from REMitchell/jeopardy-scraper.
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup
import requests
import re
from answer import Answer
from question import Question
from category import Category
from database import Database
from player import Player
from score import Score
from game import Game
class JeopardyScraper:
    """Crawl j-archive.com season pages and store each game via ``Database``.

    For every game the scraper saves the game itself, its three players,
    their scores at several checkpoints, the categories, the clues and who
    answered each clue right or wrong.
    """

    def __init__(self):
        self.db = Database()
        # Resume marker: games are skipped until the game with this id is
        # reached; scrapeGame then sets the attribute to False and every
        # later game is scraped.
        self.waitingOnGame = 5869

    def safeGet(self, url, jsonObj=False):
        """Fetch *url* and return its parsed body, or None on any failure.

        Returns the decoded JSON value when *jsonObj* is true, otherwise a
        BeautifulSoup tree built with the lxml parser. Failures are printed
        and reported as None rather than raised.
        """
        # Local import: the module header does not import json.
        import json

        headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36"}
        try:
            # requests raises its own exception hierarchy (not urllib's), and
            # without a timeout a stalled server would hang the crawl forever.
            r = requests.get(url, headers=headers, timeout=30)
        except requests.exceptions.RequestException as e:
            print("Request failed for URL "+url+": "+str(e))
            return None
        if r.text is None:
            print("None result at URL "+url)
            return None
        if jsonObj:
            try:
                return json.loads(r.text)
            except ValueError:
                print("Error parsing JSON:")
                print(r.text)
                return None
        return BeautifulSoup(r.text, "lxml")

    def formatDollars(self, stringDollars):
        """Return the digits of *stringDollars* as an int, e.g. "$1,200" -> 1200.

        Every non-digit character is dropped, so a minus sign is lost too.
        Raises ValueError when the string contains no digit at all.
        """
        return int(re.sub(r"\D", "", stringDollars))

    def byShortname(self, players, nameStr):
        """Return the first player whose shortname equals *nameStr*, ignoring case.

        Returns None when no player matches.
        """
        wanted = nameStr.lower()
        for player in players:
            if player.shortname.lower() == wanted:
                return player
        return None

    def extractClue(self, clue, game, categories, i):
        """Save one clue and the right/wrong responses recorded for it.

        clue       -- BeautifulSoup element holding the clue (for the final
                      round, the whole final-round table)
        game       -- Game the clue belongs to; game.players is searched to
                      resolve the names of responding players
        categories -- list of the current round's saved categories, or the
                      single final-round category when i == 3
        i          -- round number: 1, 2, or 3 for the final round

        Prints a notice and saves nothing when no correct response can be
        found in the clue's markup.
        """
        clueDiv = clue.find("div")
        clueTextCell = clue.find("td", {"class":"clue_text"})
        clueText = clueTextCell.get_text()
        if i == 3:
            # The final round has one category and no board position.
            category = categories
            order = 61
            row = 0
        else:
            # The cell id is split on "_"; field 2 is used as the 1-based
            # category column and field 3 as the row.
            coords = clueTextCell.attrs["id"].split('_')
            category = categories[int(coords[2])-1]
            row = coords[3]
            order = clueDiv.find("td", {"class":"clue_order_number"}).get_text()

        # The response markup is embedded in the onmouseover handler.
        # Presumably the handler is toggle('id', 'id_stuck', '<html>'): the
        # slice drops 7 leading characters and the last one -- verify against
        # the live page markup.
        handlerArgs = clueDiv.attrs["onmouseover"][7:-1].split("', '")
        # Drop the closing quote and the JavaScript backslash escapes.
        answerMarkup = handlerArgs[2][:-1].replace("\\", "")
        answerObj = BeautifulSoup(str(answerMarkup), "lxml")

        # Some clues carry no correct response; those are skipped.
        correctResponse = answerObj.find("em", {"class":"correct_response"})
        if correctResponse is None:
            print("NO ANSWER FOUND!")
            return
        answerText = correctResponse.get_text()

        notes = None
        if i == 3:
            # Final Jeopardy clues have no fixed dollar value.
            amount = 0
        else:
            dailyDouble = clue.find("td", {"class":"clue_value_daily_double"})
            if dailyDouble is not None:
                notes = "DD"
                amount = self.formatDollars(dailyDouble.get_text())
            else:
                amount = self.formatDollars(clue.find("td", {"class":"clue_value"}).get_text())

        question = Question(None, game, i, category, row, order, clueText, answerText, amount, notes)
        question = question.save(self.db)

        # Record who answered correctly, if anyone did.
        rightCell = answerObj.find("td", {"class":"right"})
        if rightCell is not None:
            rightPlayer = self.byShortname(game.players, rightCell.get_text())
            Answer(question, rightPlayer, "true").save(self.db)

        # "Triple Stumper" marks a clue nobody got; it is not a player name.
        for wrongCell in answerObj.findAll("td", {"class":"wrong"}):
            wrongName = wrongCell.get_text()
            if wrongName.lower() != "triple stumper":
                wrongPlayer = self.byShortname(game.players, wrongName)
                Answer(question, wrongPlayer, "false").save(self.db)

    def getQuestions(self, pageObj, game):
        """Save categories, clues and responses from the game boards on *pageObj*.

        Nothing is saved unless the page has exactly two round tables. The
        final round is processed only when its table is present.
        """
        rounds = pageObj.findAll("table", {"class":"round"})
        if len(rounds) != 2:
            return
        for i in range(1, 3):
            print("ROUND "+str(i))
            roundTable = rounds[i-1]
            # Start a fresh list for every round: clues look their category
            # up by column index, so leftovers from round 1 would make every
            # round-2 clue resolve to a round-1 category.
            categories = []
            for categoryElem in roundTable.findAll("td", {"class":"category_name"}):
                categoryObj = Category(None, game, i, categoryElem.get_text())
                categories.append(categoryObj.save(self.db))
            for clue in roundTable.findAll("td", {"class":"clue"}):
                # Cells without clue text are skipped.
                if clue.find("td", {"class":"clue_text"}) is not None:
                    self.extractClue(clue, game, categories, i)

        final = pageObj.find("table", {"class":"final_round"})
        if final is None:
            return
        finalCategoryName = final.find("td", {"class":"category_name"}).get_text()
        finalCategory = Category(None, game, 3, finalCategoryName)
        finalCategory = finalCategory.save(self.db)
        self.extractClue(final, game, finalCategory, 3)

    def getTableScores(self, pageObj, title, coryat=False):
        """Return the scores from the table that follows a score heading.

        The heading is the <h3> whose text matches the regular expression
        *title*, or, when *coryat* is true, the <h3> enclosing the
        "Coryat scores" link. The second row of the next table is read, with
        "$" and "," stripped before converting each cell to int.

        Returns [None, None, None] when the heading, the table or its second
        row cannot be found. Raises ValueError when a cell is not an integer.
        """
        if coryat:
            link = pageObj.find("a", text="Coryat scores")
            titleTag = link.findParent("h3") if link is not None else None
        else:
            titleTag = pageObj.find("h3", text=re.compile(title))
        if titleTag is None:
            return [None, None, None]

        table = titleTag.findNext("table")
        if table is None:
            return [None, None, None]

        rows = table.findAll("tr")
        if len(rows) < 2:
            return [None, None, None]
        return [
            int(cell.get_text().replace("$", "").replace(",", ""))
            for cell in rows[1].findAll("td")
        ]

    def saveResults(self, pageObj, game, players):
        """Save one Score row per player from the score tables on *pageObj*.

        *players* must hold three players in the order the score columns
        appear. A score table that is missing yields None for that field.
        """
        commercialBreak = self.getTableScores(pageObj, "Scores at the first commercial break")
        round1 = self.getTableScores(pageObj, "Scores at the end of the Jeopardy")
        round2 = self.getTableScores(pageObj, "Scores at the end of the Double Jeopardy")
        final = self.getTableScores(pageObj, "Final scores:")
        coryat = self.getTableScores(pageObj, "Coryat scores:", True)
        for i in range(0, 3):
            score = Score(None, game, players[i], commercialBreak[i], round1[i], round2[i], final[i], coryat[i])
            score.save(self.db)

    def scrapeGame(self, url):
        """Scrape and save the single game page at *url*.

        *url* must contain "game_id="; a relative "showgame..." link is
        resolved against http://www.j-archive.com/. Games are skipped while
        self.waitingOnGame still holds a game id that has not been reached.
        Problems are printed and the game is abandoned; nothing is raised for
        a failed download or an unexpected number of contestants.
        """
        if "game_id=" not in url:
            print("Invalid URL "+url)
            return
        if url.startswith("showgame"):
            url = "http://www.j-archive.com/"+url
        gameId = url.split("game_id=")[1]

        # Reaching the resume marker switches skipping off for good.
        if int(gameId) == self.waitingOnGame:
            self.waitingOnGame = False
        if self.waitingOnGame:
            print("SKIPPING "+gameId)
            return

        bsObj = self.safeGet(url)
        if bsObj is None:
            print("Could not fetch game "+gameId)
            return

        # Keep the part of the page heading that follows the first ", ".
        date = bsObj.h1.get_text()
        date = date[date.index(',')+2:]
        game = Game(gameId, date)
        game.save(self.db)

        contestants = bsObj.findAll("p", {"class":"contestants"})
        if len(contestants) != 3:
            print("Weirdness going on with contestants in game "+str(game.id))
            return

        players = []
        for contestant in contestants:
            # The link looks like
            # "http://www.j-archive.com/showplayer.php?player_id=9121"
            playerUrl = contestant.a.attrs['href']
            playerId = playerUrl.split("player_id=")[1]
            name = contestant.a.get_text()
            # The description is the paragraph text with the name and the two
            # characters after it removed, cut at the first "(".
            description = contestant.get_text().replace(name, "")
            description = description[2:].split("(")[0]
            # Important to use "insert" and not append -- players are listed
            # at the top in the opposite order that the scores appear in.
            players.insert(0, Player(playerId, name, description))

        nicknames = bsObj.findAll("td", {"class":"score_player_nickname"})
        if len(nicknames) < 3:
            print("Weirdness going on with contestant nicknames in game "+str(game.id))
            return
        # The first three nickname cells belong to the three players; zip
        # stops after them because players has exactly three entries.
        for player, nickname in zip(players, nicknames):
            player.setShortname(nickname.get_text())
            player.save(self.db)

        game.setPlayers(players)
        self.saveResults(bsObj, game, players)
        self.getQuestions(bsObj, game)
        print("Done with "+str(game.id))

    def getGames(self, firstSeason=30, lastSeason=35):
        """Scrape every game linked from the season pages in the given range.

        firstSeason -- first season number to crawl (inclusive)
        lastSeason  -- last season number to crawl (inclusive)

        A season page that cannot be fetched, or that has no table, is
        skipped. Links that are not game links are rejected by scrapeGame.
        """
        for season in range(firstSeason, lastSeason+1):
            # Go through safeGet so season pages get the same headers and
            # error handling as game pages.
            bsObj = self.safeGet("http://j-archive.com/showseason.php?season="+str(season))
            if bsObj is None:
                continue
            table = bsObj.find("table")
            if table is None:
                continue
            for link in table.findAll("a", href=True):
                print(link['href'])
                self.scrapeGame(link['href'])
# Script entry point: build a scraper and crawl every season that getGames
# iterates over. These statements sit at module level, so they also run when
# the module is imported, not only when it is executed directly.
scraper = JeopardyScraper()
scraper.getGames()