/
loader.py
662 lines (547 loc) · 24.3 KB
/
loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
import os
import subprocess
import re
import logging
import urlparse
import requests
import signal
import pprint
import traceback
import numpy
from time import sleep
from collections import defaultdict
TCPDUMP = '/usr/sbin/tcpdump'
SCREENSHOT = 'scrot'
################################################################################
# #
# UTILITIES #
# #
################################################################################
class TimeoutError(Exception):
    '''Raised by :class:`Timeout` when a wrapped call exceeds its deadline.'''
class Timeout:
    '''Context manager that aborts its ``with`` body after a deadline.

    Raises :class:`TimeoutError` if the body runs longer than ``seconds``.
    Implemented with SIGALRM, so it only works on Unix and only in the main
    thread.

    :param seconds: alarm delay in whole seconds
    :param error_message: message carried by the raised :class:`TimeoutError`
    '''
    def __init__(self, seconds=10, error_message='Timeout'):
        self.seconds = seconds
        self.error_message = error_message
    def handle_timeout(self, signum, frame):
        '''SIGALRM handler; invoked positionally by the signal module.'''
        raise TimeoutError(self.error_message)
    def __enter__(self):
        # BUGFIX: remember the previous SIGALRM handler so we can restore it
        # on exit (the original left our handler installed forever).
        self._old_handler = signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)
    def __exit__(self, exc_type, exc_value, tb):
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, self._old_handler)
################################################################################
# #
# RESULTS #
# #
################################################################################
class LoadResult(object):
    '''Status and statistics for a single load (i.e., one trial) of one URL.

    :param status: outcome of the page load (one of the status constants)
    :param url: the URL originally requested
    :param final_url: the URL ultimately loaded (differs after redirects)
    :param time: page load time, in seconds
    :param size: size of the object if a single object was fetched; total
        size if a full page was loaded
    :param har: path to the HAR file captured during the load
    :param img: path to a screenshot of the loaded page
    :param raw: raw output from the underlying load command
    :param server: web server software name
    :param tcp_fast_open_supported: True if TCP fast open was used
        successfully; False otherwise or unknown
    :param tls_false_start_supported: True if TLS false start succeeded
    :param tls_session_resumption_supported: True if TLS session resumption
        succeeded
    '''
    # Status constants
    SUCCESS = 'SUCCESS'  #: page load was successful
    FAILURE_TIMEOUT = 'FAILURE_TIMEOUT'  #: page load timed out
    FAILURE_UNKNOWN = 'FAILURE_UNKNOWN'  #: unknown failure occurred
    FAILURE_NO_200 = 'FAILURE_NO_200'  #: HTTP status code was not 200
    FAILURE_UNSET = 'FAILURE_UNSET'  #: status has not been set
    def __init__(self, status, url, final_url=None, time=None, size=None,
                 har=None, img=None, raw=None, server=None,
                 tcp_fast_open_supported=False, tls_false_start_supported=False,
                 tls_session_resumption_supported=False):
        # NOTE: keep attribute assignment order -- __str__ dumps __dict__,
        # so reordering would change the repr.
        self._status = status
        self._url = url                # the URL we initially asked for
        self._final_url = final_url    # may differ if we were redirected
        self._time = time              # load time in seconds
        self._size = size              # bytes fetched
        self._har_path = har
        self._image_path = img
        self._raw = raw
        self._server = server
        self._tcp_fast_open_supported = tcp_fast_open_supported
        self._tls_false_start_supported = tls_false_start_supported
        self._tls_session_resumption_supported = tls_session_resumption_supported
    @property
    def status(self):
        '''Outcome of this page load (one of the status constants).'''
        return self._status
    @property
    def url(self):
        '''The URL originally requested.'''
        return self._url
    @property
    def final_url(self):
        '''The URL ultimately loaded (differs from ``url`` after redirects).'''
        return self._final_url
    @property
    def time(self):
        '''Page load time, in seconds.'''
        return self._time
    @property
    def size(self):
        '''Bytes fetched: object size for a single object, total for a page.'''
        return self._size
    @property
    def har_path(self):
        '''Path to the HAR file captured during this load.'''
        return self._har_path
    @property
    def image_path(self):
        '''Path to a screenshot of the loaded page.'''
        return self._image_path
    @property
    def raw(self):
        '''Raw output from the underlying load command.'''
        return self._raw
    @property
    def server(self):
        '''Web server software name.'''
        return self._server
    @property
    def tcp_fast_open_supported(self):
        '''True if TCP fast open succeeded for this connection.'''
        return self._tcp_fast_open_supported
    @property
    def tls_false_start_supported(self):
        '''True if TLS false start succeeded for this connection.'''
        return self._tls_false_start_supported
    @property
    def tls_session_resumption_supported(self):
        '''True if TLS session resumption succeeded for this connection.'''
        return self._tls_session_resumption_supported
    def __str__(self):
        return 'LoadResult (%s): %s' % (self._status,
                                        pprint.saferepr(self.__dict__))
    def __repr__(self):
        return str(self)
class PageResult(object):
    '''Aggregated status and statistics for one URL across all trials.

    :param url: the original URL
    :param status: overall status; if given, it overrides the status computed
        from ``load_results``
    :param load_results: list of individual :class:`LoadResult` objects
    '''
    # Status constants
    SUCCESS = 'SUCCESS'  #: all trials were successful
    PARTIAL_SUCCESS = 'PARTIAL_SUCCESS'  #: some trials were successful
    FAILURE_NOT_ACCESSIBLE = 'FAILURE_NOT_ACCESSIBLE'  #: page not loadable with the specified protocol
    FAILURE_UNKNOWN = 'FAILURE_UNKNOWN'  #: an unknown failure occurred
    FAILURE_UNSET = 'FAILURE_UNSET'  #: status has not been set
    def __init__(self, url, status=None, load_results=None):
        # NOTE: keep attribute assignment order -- __str__ dumps __dict__.
        self._status = PageResult.FAILURE_UNSET
        self._url = url
        self._load_statuses = []
        self._times = []
        self._sizes = []
        self._server = 'UNKNOWN'
        self._tcp_fast_open_support_statuses = []
        self._tls_false_start_support_statuses = []
        self._tls_session_resumption_support_statuses = []
        if load_results:
            any_success = False
            any_failure = False
            for lr in load_results:
                self._load_statuses.append(lr.status)
                if lr.server:
                    self._server = lr.server
                if lr.status != PageResult.SUCCESS:
                    any_failure = True
                    continue
                any_success = True
                # only successful loads contribute times/sizes/TLS stats
                if lr.time:
                    self._times.append(lr.time)
                if lr.size:
                    self._sizes.append(lr.size)
                self._tcp_fast_open_support_statuses.append(
                    lr.tcp_fast_open_supported)
                self._tls_false_start_support_statuses.append(
                    lr.tls_false_start_supported)
                self._tls_session_resumption_support_statuses.append(
                    lr.tls_session_resumption_supported)
            if any_success and any_failure:
                self._status = PageResult.PARTIAL_SUCCESS
            elif any_success:
                self._status = PageResult.SUCCESS
            else:
                self._status = PageResult.FAILURE_UNKNOWN
        if status:
            # explicit status wins over the computed one
            self._status = status
    @property
    def status(self):
        '''Overall status across all trials.'''
        return self._status
    @property
    def url(self):
        '''The URL.'''
        return self._url
    @property
    def load_statuses(self):
        '''List of statuses, one per individual trial.'''
        return self._load_statuses
    @property
    def times(self):
        '''List of load times from successful trials.'''
        return self._times
    @property
    def sizes(self):
        '''List of page sizes from successful trials.'''
        return self._sizes
    @property
    def server(self):
        '''Web server software name.'''
        return self._server
    @property
    def tcp_fast_open_support_statuses(self):
        '''List of bools: did TCP fast open succeed for each load?'''
        return self._tcp_fast_open_support_statuses
    @property
    def tls_false_start_support_statuses(self):
        '''List of bools: did TLS false start succeed for each load?'''
        return self._tls_false_start_support_statuses
    @property
    def tls_session_resumption_support_statuses(self):
        '''List of bools: did TLS session resumption succeed for each load?'''
        return self._tls_session_resumption_support_statuses
    @property
    def mean_time(self):
        '''Mean load time across all trials.'''
        return numpy.mean(self.times)
    @property
    def median_time(self):
        '''Median load time across all trials.'''
        return numpy.median(self.times)
    @property
    def stddev_time(self):
        '''Standard deviation of load time across all trials.'''
        return numpy.std(self.times)
    def __str__(self):
        return 'PageResult (%s): %s' % (self._status,
                                        pprint.saferepr(self.__dict__))
    def __repr__(self):
        return str(self)
################################################################################
# #
# LOADER #
# #
################################################################################
class Loader(object):
    # NOTE: some parameters of the init function are obsolete
    '''Superclass for URL loader. Subclasses implement actual page load
    functionality (e.g., using Chrome, PhantomJS, etc.) by overriding
    :meth:`_load_page` (and optionally :meth:`_setup`/:meth:`_teardown`).

    :param outdir: directory for HAR files, screenshots, etc.
    :param num_trials: number of times to load each URL
    :param http2: use HTTP 2 (not all subclasses support this)
    :param timeout: timeout in seconds
    :param disable_local_cache: disable the local browser cache (RAM and disk)
    :param disable_network_cache: send "Cache-Control: max-age=0" header
    :param full_page: load page's subresources and render; if False, only the
        object is fetched
    :param user_agent: use custom user agent; if None, use browser's default
    :param headless: don't use GUI (if there normally is one -- e.g., browsers)
    :param restart_on_fail: if a load fails, set up the loader again (e.g.,
        reboot chrome)
    :param save_har: save a HAR file to the output directory
    :param save_screenshot: save a screenshot to the output directory
    :param retries_per_trial: if a trial fails, retry this many times (beyond
        first)
    :param stdout_filename: if the loader launches other procs (e.g., browser),
        send their stdout and stderr to this file. If None, use parent proc's
        stdout and stderr.
    :param check_protocol_availability: before loading the page, check to see
        if the specified protocol (HTTP or HTTPS) is supported. (otherwise, the
        loader might silently fall back to a different protocol.)
    :param save_packet_capture: save a pcap trace for each load (separate files)
    :param disable_quic: disable use of the QUIC transport protocol
    :param disable_spdy: disable use of SPDY/HTTP2
    :param log_ssl_keys: instruct browser to save SSL session keys (by setting
        SSLKEYLOGFILE environment variable)
    :param ignore_certificate_errors: continue loading page even if
        certificate check fails
    '''
    def __init__(self, outdir='.', num_trials=1, http2=False, timeout=61,
                 disable_local_cache=True, disable_network_cache=False,
                 full_page=True, user_agent=None, headless=True,
                 restart_on_fail=False, proxy=None, save_har=False,
                 save_screenshot=False, retries_per_trial=0,
                 stdout_filename=None, check_protocol_availability=True,
                 save_packet_capture=False, disable_quic=False,
                 disable_spdy=False, log_ssl_keys=False,
                 ignore_certificate_errors=False):
        '''Initialize a Loader object.'''
        # options
        self._outdir = outdir
        self._num_trials = num_trials
        self._http2 = http2
        self._timeout = timeout
        self._disable_local_cache = disable_local_cache
        self._disable_network_cache = disable_network_cache
        self._full_page = full_page
        self._user_agent = user_agent
        self._headless = headless
        self._restart_on_fail = restart_on_fail
        self._save_har = save_har
        self._save_screenshot = save_screenshot
        self._retries_per_trial = retries_per_trial
        self._stdout_filename = stdout_filename
        self._proxy = proxy
        self._check_protocol_availability = check_protocol_availability
        self._save_packet_capture = save_packet_capture
        self._disable_quic = disable_quic
        self._disable_spdy = disable_spdy
        self._log_ssl_keys = log_ssl_keys
        self._ignore_certificate_errors = ignore_certificate_errors
        # cummulative list of all URLs (one per trial)
        self._urls = []
        # Map URLs to lists of LoadResults (there will be multiple results per
        # URL if there are multiple trials)
        self._load_results = defaultdict(list)
        # Map URLs to lists of PageResults summarizing the LoadResults for the
        # individual trials (one PageResult appended per test for that URL)
        self._page_results = {}
        # count how many times we restarted the loader due to failure
        self._num_restarts = 0
        # if self._stdout_filename is set, this var will hold the file object
        self._stdout_file = None
        self.tcpdump_proc = None
        # nicely teardown
        # NOTE: do not SIGKILL
        signal.signal(signal.SIGINT, self.handle_kill)
        signal.signal(signal.SIGTERM, self.handle_kill)
    ##
    ## Internal helper methods
    ##
    # These two functions define how the loader names result files.
    # They are duplicated multiple times in other components of the whole test
    # suite. NOTE: change all of them if one wants to update them.
    def _sanitize_url(self, url):
        '''Returns a version of the URL suitable for use in a file name.'''
        return re.sub(r'[/\;,><&*:%=+@!#^()|?^]', '-', url)
    def _outfile_path(self, url, suffix=None, trial=None):
        '''Returns a path for an output file (e.g., HAR, screenshot, pcap).'''
        filename = self._sanitize_url(url)
        if trial is not None:
            filename += '_%d' % trial
        if suffix:
            filename += suffix
        return os.path.join(self._outdir, filename)
    def _check_url(self, url):
        '''Make sure URL is well-formed; prepend http:// if no scheme.'''
        if '://' not in url:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning('URL %s has no protocol; using http.', url)
            url = 'http://%s' % url
        return url
    # TODO: handle sites that sometimes return HTTP and sometimes HTTPS (YouTube)
    def _check_protocol_available(self, url):
        '''Check if the URL can be loaded over the specified protocol.

        For example, an HTTPS site might not respond or an HTTP URL might be
        redirected to an HTTPS one. Returns True only if the final protocol
        matches the requested one.
        '''
        orig_protocol = urlparse.urlparse(url).scheme
        logging.debug('Checking if %s can be accessed using %s',
                      url, orig_protocol)
        # try to fetch the page with the specified protocol
        response = None
        try:
            with Timeout(seconds=self._timeout + 5):
                headers = {}
                if self._user_agent:
                    headers['User-Agent'] = self._user_agent
                response = requests.get(url, timeout=self._timeout,
                                        headers=headers, verify=False)
        except requests.exceptions.ConnectionError as e:
            logging.debug('Could not connect to %s: %s', url, e)
            return False
        except requests.exceptions.Timeout as e:
            logging.debug('Timed out connecting to %s: %s', url, e)
            return False
        except TimeoutError:
            logging.debug('* Timed out connecting to %s', url)
            return False
        except Exception as e:
            logging.exception('Error requesting %s: %s', url, e)
            return False
        # if we got a response, check if we changed protocols
        final_protocol = urlparse.urlparse(response.url).scheme
        return orig_protocol == final_protocol
    def _preload_objects(self, _, __):
        '''Subclasses can override to warm caches before the measured load.'''
        return
    def _load_page(self, _, __, ___):
        '''Subclasses override to perform the actual load; returns LoadResult.'''
        return
    def _setup(self, _=0):
        '''Subclasses can override to prepare (e.g., launch Xvfb).'''
        return True
    def __setup(self, my_id=0):
        '''Private setup method for Loader superclass.'''
        if self._stdout_filename:
            try:
                self._stdout_file = open(self._stdout_filename, 'a')
            except Exception:
                logging.exception(
                    'Error opening stdout file: %s. Using parent\'s stdout.',
                    self._stdout_filename)
                self._stdout_file = None
        return self._setup(my_id)
    def setup(self, my_id=0):
        '''Prepare the loader. my_id is a unique value to avoid multiple
        browsers using the same port.'''
        return self.__setup(my_id)
    def _teardown(self):
        '''Subclasses can override to clean up (e.g., kill Xvfb).'''
        return True
    def __teardown(self):
        '''Private teardown method for Loader superclass.'''
        child_ret = self._teardown()
        if self._stdout_file:
            self._stdout_file.close()
        return child_ret
    def teardown(self):
        '''Clean up the loader (close files, subclass teardown).'''
        return self.__teardown()
    def handle_kill(self, __, _):
        '''SIGINT/SIGTERM handler: tear down nicely, then abort.'''
        self.teardown()
        raise KeyboardInterrupt('To be killed')
    def __getstate__(self):
        '''Override getstate so we don't try to pickle the stdout file object.'''
        state = dict(self.__dict__)
        state.pop('_stdout_file', None)
        return state
    def _start_tcpdump(self, test, url, trial):
        '''Launch tcpdump writing a pcap for this trial; stores the Popen
        handle in self.tcpdump_proc.'''
        # the prefix of the pcap file
        prefix = test['packet_capture_file_name']
        if not prefix:
            prefix = url
        pcap_path = self._outfile_path(prefix, suffix='.pcap', trial=trial)
        # start dump, for now we just filter out port 22
        # could be only 80 and 443
        tcpdump_command = [TCPDUMP, '-w', pcap_path, 'port not 22']
        logging.debug('Starting tcpdump: %s', ' '.join(tcpdump_command))
        self.tcpdump_proc = subprocess.Popen(tcpdump_command,
            stdout=self._stdout_file, stderr=self._stdout_file)
        # sometimes tcpdump is slower than chrome to startup
        sleep(0.5)
    def _stop_tcpdump(self):
        '''Terminate the tcpdump child process, if one is running.
        Best-effort: errors are logged, never raised.'''
        try:
            if self.tcpdump_proc:
                logging.debug('Stopping tcpdump')
                # BUGFIX: was os.system("kill <pid>"), which shells out and
                # never reaps the child (zombie). terminate() sends the same
                # SIGTERM; wait() reaps it.
                self.tcpdump_proc.terminate()
                self.tcpdump_proc.wait()
                self.tcpdump_proc = None
        except Exception:
            logging.exception('Error stopping tcpdump.')
    def _take_screenshot(self, test, url, trial):
        '''Best-effort screenshot of the loaded page; failures are only
        logged so they never abort the trial.'''
        try:
            prefix = test['screenshot_name'] if test['screenshot_name'] else url
            sspath = self._outfile_path(prefix, suffix='.png', trial=trial)
            # The best way would be 'scrot -u', which captures the currently
            # focused window instead of the full screen, but it sometimes
            # fails (window loses focus), so we take the full screen.
            # (The original code had identical per-browser branches here.)
            cmd = [SCREENSHOT, sspath]
            with Timeout(seconds=self._timeout + 5):
                subprocess.check_call(cmd, stdout=self._stdout_file,
                                      stderr=subprocess.STDOUT)
            logging.debug('Screenshot taken')
        except TimeoutError:
            logging.exception('* Timeout taking screenshot for %s', url)
        except subprocess.CalledProcessError as e:
            logging.exception('Error call %s: %s\n%s', SCREENSHOT, e, e.output)
        except Exception as e:
            logging.exception('Error taking screenshot for %s: %s', url, e)
    ##
    ## Properties
    ##
    @property
    def urls(self):
        '''A cummulative list of the URLs this instance has loaded
        in the order they were loaded. Each trial is listed separately.'''
        return self._urls
    @property
    def load_results(self):
        '''A dict mapping URLs to a list of :class:`LoadResult`.'''
        return self._load_results
    @property
    def page_results(self):
        '''A dict mapping URLs to a list of :class:`PageResult`.'''
        return self._page_results
    @property
    def num_restarts(self):
        '''Number of times the loader was restarted (e.g., rebooted browser
        process) due to failures if restart_on_fail is True.'''
        return self._num_restarts
    ##
    ## Public methods
    ##
    def load_page(self, the_test, trial_number):
        '''Run a single trial of one test.

        Returns the trial's LoadResult, or None if every attempt raised
        before a result could be produced.
        '''
        test = dict(the_test)  # shallow copy: we mutate 'fresh_view' below
        url = self._check_url(test['url'])
        i = trial_number
        # BUGFIX: 'result' was unbound (NameError at return) if an exception
        # fired before _load_page ran, e.g. in preload or tcpdump startup.
        result = None
        try:
            # if load fails, keep trying self._retries_per_trial times
            tries_so_far = 0
            while tries_so_far <= self._retries_per_trial:
                tries_so_far += 1
                # handle preload first
                if test['preload']:
                    self._preload_objects(test['preload'], test['fresh_view'])
                    # avoid clearing the cache again after preload
                    test['fresh_view'] = False
                # start tcpdump if we want a packet capture
                if test['save_packet_capture']:
                    self._start_tcpdump(test, url, i)
                # load the page; this function is overridden by ChromeLoader
                # and FirefoxLoader
                result = self._load_page(test, self._outdir, i)
                if test['save_screenshot']:
                    self._take_screenshot(test, url, i)
                logging.debug('Trial %d, try %d: %s', i, tries_so_far, result)
                # stop tcpdump (if it's running)
                self._stop_tcpdump()
                if result.status == LoadResult.SUCCESS:
                    self._urls.append(url)
                    self._load_results[url].append(result)
                    break  # success, don't retry
                elif tries_so_far > self._retries_per_trial:
                    # this was the last try, record the failure
                    self._urls.append(url)
                    self._load_results[url].append(result)
                if result.status == LoadResult.FAILURE_UNKNOWN \
                        and self._restart_on_fail:
                    self.__teardown()
                    self.__setup()
                    self._num_restarts += 1
        except Exception as e:
            logging.exception('Error loading page %s: %s\n%s', url, e,
                              traceback.format_exc())
        return result
    def load_pages(self, tests):
        '''Load each test's URL `num_trials` times and collect stats.'''
        self.tcpdump_proc = None  # handle to tcpdump process, if we use one
        try:
            if not self.__setup():
                logging.error('Error setting up loader')
                return
            for test in tests['tests']:
                # make sure URL is well-formed (e.g., has protocol, etc.)
                url = self._check_url(test['url'])
                # If all is well, load URL num_trials times
                for i in range(0, test['num_trials']):
                    self.load_page(test, i)
                # Save PageResult summarizing the individual trial LoadResults
                if url not in self._page_results:
                    self._page_results[url] = []
                self._page_results[url].append(
                    PageResult(url, load_results=self._load_results[url]))
        except Exception as e:
            logging.exception('Error loading pages: %s\n%s', e,
                              traceback.format_exc())
        finally:
            # stop tcpdump (if it's running)
            self._stop_tcpdump()
            self.__teardown()