# Scraped-page residue removed: GitHub UI header and line-number gutter.
# Origin: scrape.py (280 lines) from an archived fork of
# mikemccllstr/dominionstats (repository archived read-only on Jun 5, 2022).
#!/usr/bin/python
# taken from
# http://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
import datetime
import glob
import logging
import shutil
import os
import os.path
import subprocess
import sys
import tempfile
import time
import urllib
import utils
import re
import tarfile
# if the size of the game log is less than this assume we got an error page
SMALL_FILE_SIZE = 5000
# Earliest date for which game-log archives exist; scraping starts here.
default_startdate = datetime.date(2010, 10, 15)
DEBUG = True
# Status codes returned by scrape_date():
GOOD = 0        # a sufficiently large bundle already exists locally
MISSING = 1     # not present and not downloaded (passive mode)
ERROR = 2       # every download source failed
DOWNLOADED = 3  # bundle was freshly downloaded
REPACKAGED = 4  # bundle was converted to the .bz2.tar serving format
# Identifiers for the download sources tried by download_date(), in
# priority order: councilroom mirror, then goko, then isotropic.
CR_SOURCE = 5
GOKO_SOURCE = 6
ISO_SOURCE = 7
# Matches quoted per-game log filenames ("log.<x>.<y>.txt") in a goko
# directory listing page.
GOKO_LOG_RE = re.compile('"(log.\w+.\w+.txt)"', re.MULTILINE)
# URL path templates, filled in with year/month/day by FormatDate().
# Councilroom format is more similar to old isotropic format.
GOKO_FORMAT = '%(year)d%(month)02d%(day)02d/'
ISOTROPIC_FORMAT = '%(year)d%(month)02d/%(day)02d/all.tar.bz2'
COUNCILROOM_FORMAT = '%(year)d%(month)02d%(day)02d/%(year)d%(month)02d%(day)02d.all.tar.bz2'
def FormatDate(fmt, cur_date):
    """Fill a %-style date template with cur_date's year, month and day."""
    fields = {
        'year': cur_date.year,
        'month': cur_date.month,
        'day': cur_date.day,
    }
    return fmt % fields
def IsotropicGamesCollectionUrl(cur_date):
    """Return the isotropic.org URL of the day's bundled game logs."""
    path = FormatDate(ISOTROPIC_FORMAT, cur_date)
    return 'http://dominion.isotropic.org/gamelog/' + path
def GokoGamesCollectionUrl(cur_date):
    """Return the goko archive URL listing the day's individual game logs."""
    path = FormatDate(GOKO_FORMAT, cur_date)
    return 'http://archive-dominionlogs.goko.com/' + path
def GokoSingleGameUrl(cur_date, cur_game):
    """Return the full URL of one goko game log for the given day."""
    base = GokoGamesCollectionUrl(cur_date)
    return base + cur_game
def CouncilroomGamesCollectionUrl(cur_date):
    """Return the councilroom.com mirror URL of the day's bundle."""
    path = FormatDate(COUNCILROOM_FORMAT, cur_date)
    return 'http://councilroom.com/static/scrape_data/' + path
def RemoveSmallFileIfExists(fn):
if (os.path.exists(fn) and
os.stat(fn).st_size <= SMALL_FILE_SIZE):
print 'removing small existing file', fn
os.unlink(fn)
def download_date(str_date, cur_date, saved_games_bundle):
    """Try each archive source in priority order and save the day's games
    to saved_games_bundle.

    Returns True on success, False if no source yielded a file larger
    than SMALL_FILE_SIZE.
    """
    # Sources in preference order: councilroom mirror first, then goko,
    # then the original isotropic archive.
    urls_by_priority = [
        (CR_SOURCE, CouncilroomGamesCollectionUrl(cur_date)),
        (GOKO_SOURCE, GokoGamesCollectionUrl(cur_date)),
        (ISO_SOURCE, IsotropicGamesCollectionUrl(cur_date))
    ]
    for (source, url) in urls_by_priority:
        if DEBUG:
            print 'getting', saved_games_bundle, 'at', url
        try:
            contents = urllib.urlopen(url).read()
        except IOError:
            # Network failure: substitute a tiny placeholder so the size
            # check below treats this source as a miss.
            contents = "0"
        if len(contents) > SMALL_FILE_SIZE:
            if DEBUG:
                print 'yay, success from', url, 'no more requests for', \
                    str_date, 'needed'
            if source == CR_SOURCE or source == ISO_SOURCE:
                # These sources serve a ready-made .tar.bz2 bundle;
                # write it to disk unchanged.
                open(saved_games_bundle, 'w').write(contents)
            elif source == GOKO_SOURCE:
                # Goko serves a directory listing instead of a bundle;
                # extract the per-game log names and bundle them ourselves.
                games = re.findall(GOKO_LOG_RE, contents)
                bundle_goko_games(cur_date, games, saved_games_bundle)
            return True
        elif DEBUG:
            print 'request to', url, 'failed to find large file'
    return False
def bundle_goko_games(cur_date, games, saved_games_bundle):
progressfile = tempfile.mktemp()
bundle = tarfile.open(progressfile,'w:bz2')
orig_dir = os.getcwd()
directory_name = tempfile.mkdtemp()
os.chdir(directory_name)
if DEBUG:
print len(games), " games to download..."
for cur_game in games:
url = GokoSingleGameUrl(cur_date, cur_game)
retries_remaining = 3
while retries_remaining > 0:
try:
game_text = urllib.urlopen(url).read()
if '<title>403 Forbidden' not in game_text and '<title>404 Not Found' not in game_text:
game = open(cur_game,'a')
game.write(game_text)
game.close()
bundle.add(cur_game)
break
else:
retries_remaining = retries_remaining - 1
if(DEBUG):
print "Failed to download game: ", cur_game
except:
retries_remaining = retries_remaining - 1
if(DEBUG):
print "Failed to download game: ", cur_game
bundle.close();
os.chdir(orig_dir)
os.rename(progressfile, saved_games_bundle)
shutil.rmtree(directory_name)
def unzip_date(directory, filename):
os.chdir(directory)
cmd = 'tar -xjvf %s >/dev/null 2>/dev/null'%filename
if DEBUG:
print cmd
ret = os.system(cmd)
if ret==0:
os.system('chmod -R 755 .')
code = True
else:
code = False
os.chdir('..')
return code
def repackage_filename(orig_archive_filename):
    """Map a day's .all.tar.bz2 archive name to its repackaged .bz2.tar name."""
    old_suffix = ".all.tar.bz2"
    new_suffix = ".bz2.tar"
    return orig_archive_filename.replace(old_suffix, new_suffix)
def repackage_archive(filename):
    """ Converts a .tar.bz2 file into a .bz2.tar file in the same directory.
    Game archives are distributed as .tar.bz2 (a bzip2-compressed tar
    archive). For speed of serving, we repackage them as .bz2.tar (a
    tar archive of bzip2-compressed HTML or text files). The .bz2.tar file is
    a good bit larger, but an individual file can be extracted,
    decompressed, and served to a client in tenths of a second instead
    of tens of seconds. At the same time, storage space is still
    dramatically smaller than a raw folder of uncompressed (or even
    compressed) HTML or text files.
    """
    orig_dir = os.getcwd()
    # Extract the existing file into a temporary folder
    directory_name = tempfile.mkdtemp()
    # Absolute path, because we chdir below and `filename` may be relative.
    source_filename = os.path.abspath(filename)
    try:
        subprocess.check_call(["tar", "--auto-compress", "-C", directory_name,
                               "-xf", source_filename])
    except subprocess.CalledProcessError, e:
        # Not handling this yet, just re-raise
        logging.warning("Unexpected return from tar >>{msg}<<".format(msg=e.output))
        raise
    # Compress all the game*.html log*txt files.
    # Individually, so the argument list doesn't explode from huge numbers of
    # goko logs.
    os.chdir(directory_name)
    game_files = glob.glob("game*.html")+glob.glob("log*.txt")
    # The repackaged archive lives next to the source (same directory,
    # .bz2.tar suffix) and is a plain uncompressed tar ('w') of .bz2 members.
    dest_filename = repackage_filename(source_filename)
    repackaged_archive = tarfile.open(dest_filename,'w')
    for cur_game in game_files:
        try:
            # bzip2 replaces cur_game with cur_game.bz2 in place.
            subprocess.check_call(["bzip2", cur_game])
        except subprocess.CalledProcessError, e: #(retcode, cmd, output=output)
            # Not handling this yet, just re-raise
            logging.warning("Unexpected return from bzip or tar >>{msg}<<".format(msg=e.output))
            # Clean up the temp tree before propagating the failure.
            os.chdir(orig_dir)
            shutil.rmtree(directory_name)
            raise
        repackaged_archive.add(cur_game+'.bz2')
    repackaged_archive.close();
    os.chdir(orig_dir)
    shutil.rmtree(directory_name)
def scrape_date(str_date, cur_date, passive=False):
    """Ensure the day's game bundle exists locally (downloading it unless
    passive=True) and repackage it for serving if needed.

    Returns one of the module status codes: GOOD, MISSING, ERROR,
    DOWNLOADED or REPACKAGED.
    """
    #directory = str_date
    games_short_name = str_date + '.all.tar.bz2'
    saved_games_bundle = games_short_name
    return_code = ERROR
    if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE):
        # A plausible (large enough) bundle is already on disk.
        if DEBUG:
            print 'skipping because exists', str_date, saved_games_bundle, \
                'and not small (size=', os.stat(saved_games_bundle).st_size, ')'
        return_code = GOOD
    else:
        # Anything small enough to be an error page is discarded first.
        RemoveSmallFileIfExists(saved_games_bundle)
        if passive:
            return_code = MISSING
        elif not download_date(str_date, cur_date, saved_games_bundle):
            return_code = ERROR
        else:
            return_code = DOWNLOADED
    # Repackage an existing file, if found
    if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE) and \
            not os.path.exists(repackage_filename(saved_games_bundle)):
        repackage_archive(saved_games_bundle)
        return_code = REPACKAGED
    return return_code
def scrape_games():
    """Top-level driver: walk the date range (newest first) under
    static/scrape_data, scraping each day and printing a one-character
    progress code per day."""
    parser = utils.incremental_date_range_cmd_line_parser()
    utils.ensure_exists('static/scrape_data')
    os.chdir('static/scrape_data')
    args = parser.parse_args()
    last_month = ''
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    #Goko updates logs in real time; wait a day so the list is finalized.
    for cur_date in utils.daterange(default_startdate, yesterday, reverse=True):
        str_date = time.strftime("%Y%m%d", cur_date.timetuple())
        if not utils.includes_day(args, str_date):
            if DEBUG:
                print 'skipping', str_date, 'because not in cmd line arg daterange'
            continue
        mon = time.strftime("%b%y", cur_date.timetuple())
        if mon != last_month:
            # New month: start a fresh progress row, padded with one space
            # per day so columns line up across month rows.
            print
            print mon, cur_date.day*" ",
            sys.stdout.flush()
            last_month = mon
        ret = scrape_date(str_date, cur_date, passive=args.passive)
        # Progress codes: o=downloaded, O=repackaged, !=error,
        # _=missing (passive), .=already present.
        if ret==DOWNLOADED:
            print 'o',
        elif ret==REPACKAGED:
            print 'O',
        elif ret==ERROR:
            print '!',
        elif ret==MISSING:
            print '_',
        else:
            print '.',
        sys.stdout.flush()
    print
    os.chdir('../..')
# Script entry point: run the scraper when invoked directly.
if __name__ == '__main__':
    scrape_games()