def _RequestsSubcommand(args):
  r"""`loading_trace_analyzer.py requests` Command line tool entry point.

  Writes one formatted line per request event of the loading trace, optionally
  keeping only the events selected by the where statement.

  Args:
    args: Parsed command line arguments. The attributes read here are:
      where_statement: Falsy to disable filtering, otherwise a sequence whose
        item 0 is a format string and item 1 the source of a regex.
      loading_trace: Readable file object holding the loading trace's JSON.
      output_format: Format string expanded with the keys of the request
        event's JSON dictionary.
      output: Writable file object receiving the formatted lines.

  Returns:
    0 on success, 1 if the where statement's regex does not compile.

  Example:
    Lists all request with timing:
      ... requests --output-format "{timing} {url}"

    Lists HTTP/HTTPS requests that have used the cache:
      ... requests --where "{protocol} {from_disk_cache}" "https?\S* True"
  """
  where_format = None
  where_regex = None
  if args.where_statement:
    where_format = args.where_statement[0]
    where_pattern = args.where_statement[1]
    try:
      where_regex = re.compile(where_pattern)
    except re.error as e:
      # Report the bad regex before any trace loading work is done.
      sys.stderr.write("Invalid where statement REGEX: {}\n{}\n".format(
          where_pattern, str(e)))
      return 1

  loading_trace = LoadingTrace.FromJsonDict(json.load(args.loading_trace))
  for request_event in loading_trace.request_track.GetEvents():
    request_event_json = request_event.ToJsonDict()
    if where_regex is not None:
      # match() anchors at the beginning of the formatted string only.
      where_in = where_format.format(**request_event_json)
      if not where_regex.match(where_in):
        continue
    args.output.write(
        args.output_format.format(**request_event_json) + '\n')
  return 0
def ListRequests(loading_trace_path,
                 output_format='{url}',
                 where_format='{url}',
                 where_statement=None):
  r"""Lists the request events of a loading trace as formatted strings.

  This is the listing performed by `loading_trace_analyzer.py requests`.

  Args:
    loading_trace_path: Path of the loading trace.
    output_format: Output format of the generated strings.
    where_format: String formatted to be regex tested with <where_statement>.
    where_statement: Regex for selecting request events. None or an empty
      string selects every request event.

  Yields:
    Formatted string of the selected request event.

  Raises:
    re.error: If <where_statement> is not a valid regex. As this is a
      generator, the error surfaces when the iteration starts.

  Example:
    Lists all request with timing:
      ... requests --output-format "{timing} {url}"

    Lists HTTP/HTTPS requests that have used the cache:
      ... requests --where "{protocol} {from_disk_cache}" "https?\S* True"
  """
  # A falsy statement (None or '') means no filtering. Keeping the compiled
  # regex in its own variable guarantees that match() is never called on a
  # statement that was left uncompiled.
  where_regex = re.compile(where_statement) if where_statement else None
  loading_trace = LoadingTrace.FromJsonFile(loading_trace_path)
  for request_event in loading_trace.request_track.GetEvents():
    request_event_json = request_event.ToJsonDict()
    if where_regex is not None:
      where_in = where_format.format(**request_event_json)
      if not where_regex.match(where_in):
        continue
    yield output_format.format(**request_event_json)
def LoadRemoteTrace(storage_accessor, remote_trace_path, logger):
  """Loads and returns the LoadingTrace located at the remote trace path.

  Args:
    storage_accessor: (GoogleStorageAccessor) Used to download the trace from
      CloudStorage.
    remote_trace_path: (str) Path to the trace file. A leading
      gs://<bucket_name>/ prefix naming the accessor's bucket is removed.
    logger: Object whose error() method is called with a message describing
      each failure.

  Returns:
    The LoadingTrace, or None if the trace could not be downloaded, parsed or
    converted. Every None result is preceded by a logger.error() call.
  """
  # Cut the gs://<bucket_name> prefix from trace paths if needed.
  prefix = 'gs://%s/' % storage_accessor.BucketName()
  if remote_trace_path.startswith(prefix):
    remote_trace_path = remote_trace_path[len(prefix):]

  trace_string = storage_accessor.DownloadAsString(remote_trace_path)
  if not trace_string:
    logger.error('Failed to download: ' + remote_trace_path)
    return None

  try:
    trace_dict = json.loads(trace_string)
  except ValueError:
    # json.loads() signals malformed JSON by raising ValueError. Handle it
    # like an empty document so that it is reported as a parse failure
    # instead of escaping to the caller.
    trace_dict = None
  if not trace_dict:
    logger.error('Failed to parse: ' + remote_trace_path)
    return None

  trace = LoadingTrace.FromJsonDict(trace_dict)
  if not trace:
    logger.error('Invalid format for: ' + remote_trace_path)
    return None
  return trace
def ReadSubresourceMapFromBenchmarkOutput(benchmark_output_directory_path):
  """Builds the URL-to-subresources map of the navigations of a benchmark.

  Run sub-directories named 0, 1, 2, ... are visited in order until the first
  missing one. A run without a trace file is skipped, and only the first trace
  seen for a given URL contributes to the map.

  Args:
    benchmark_output_directory_path: Path of the benchmark output directory to
        read.

  Returns:
    {url -> [URLs of sub-resources]}
  """
  subresources_by_url = {}
  run_index = 0
  run_directory = os.path.join(benchmark_output_directory_path, str(run_index))
  while os.path.isdir(run_directory):
    trace_file_path = os.path.join(
        run_directory, sandwich_runner.TRACE_FILENAME)
    if os.path.isfile(trace_file_path):
      trace = LoadingTrace.FromJsonFile(trace_file_path)
      if trace.url not in subresources_by_url:
        logging.info(
            'lists resources of %s from %s' % (trace.url, trace_file_path))
        seen_urls = set()
        filtered_events = _FilterOutDataAndIncompleteRequests(
            trace.request_track.GetEvents())
        for event in filtered_events:
          if event.url in seen_urls:
            continue
          # Each distinct URL is logged once, the first time it is seen.
          logging.info('  %s' % event.url)
          seen_urls.add(event.url)
        subresources_by_url[trace.url] = list(seen_urls)
    # Move on to the next run directory.
    run_index += 1
    run_directory = os.path.join(
        benchmark_output_directory_path, str(run_index))
  return subresources_by_url
def PatchCacheArchive(cache_archive_path, loading_trace_path,
                      cache_archive_dest_path):
  """Patch the cache archive.

  Note: This method updates the raw response headers of the cache entries to
    store the ones such as Set-Cookie that were pruned by the
    net::HttpCacheTransaction, and removes the stream index 2 holding the
    resource's compile meta data.

  Args:
    cache_archive_path: Input archive's path to patch.
    loading_trace_path: Path of the loading trace that has recorded the cache
        archive <cache_archive_path>.
    cache_archive_dest_path: Archive destination's path.

  Raises:
    RuntimeError: A request kept from the trace has no cache entry under its
        URL although its method is not POST.
  """
  trace = LoadingTrace.FromJsonFile(loading_trace_path)
  with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path:
    cache_path = os.path.join(tmp_path, 'cache')
    chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_path)
    cache_backend = chrome_cache.CacheBackend(cache_path, 'simple')
    cache_entries = set(cache_backend.ListKeys())
    logging.info('Original cache size: %d bytes' % cache_backend.GetSize())
    for request in _FilterOutDataAndIncompleteRequests(
        trace.request_track.GetEvents()):
      # On requests having an upload data stream such as POST requests,
      # net::HttpCache::GenerateCacheKey() prefixes the cache entry's key with
      # the upload data stream's session unique identifier.
      #
      # It is fine to not patch these requests since when reopening Chrome,
      # there is no way the entry can be reused since the upload data stream's
      # identifier will be different.
      #
      # The fact that these entries are kept in the cache after closing Chrome
      # properly by closing the Chrome tab as the ChromeControler.SetSlowDeath()
      # does is a known Chrome bug (crbug.com/610725).
      #
      # TODO(gabadie): Add support in ValidateCacheArchiveContent() and in
      #   VerifyBenchmarkOutputDirectory() for POST requests to be known as
      #   impossible to use from cache.
      if request.url not in cache_entries:
        if request.method != 'POST':
          # The placeholder makes the offending method part of the message.
          raise RuntimeError(
              'Unexpected method {} that is not found in cache.'.format(
                  request.method))
        continue
      # Chrome prunes Set-Cookie from response headers before storing them in
      # disk cache. Also, it adds implicit "Vary: cookie" header to all redirect
      # response headers. Sandwich manages the cache, but between recording the
      # cache and benchmarking the cookie jar is invalidated. This leads to
      # invalidation of all cacheable redirects.
      raw_headers = request.GetRawResponseHeaders()
      cache_backend.UpdateRawResponseHeaders(request.url, raw_headers)
      # NoState-Prefetch would only fetch the resources, but not parse them.
      cache_backend.DeleteStreamForKey(request.url, 2)
    # Archive while the temporary directory still exists.
    chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path)
    logging.info('Patched cache size: %d bytes' % cache_backend.GetSize())
def ExtractDiscoverableUrls(loading_trace_path, subresource_discoverer):
  """Lists the URLs that a sub-resource discoverer finds in a loading trace.

  Args:
    loading_trace_path: The loading trace's path.
    subresource_discoverer: One of SUBRESOURCE_DISCOVERERS. It selects the
      requests whose resources should be white-listed in the cache for the
      NoState-Prefetch benchmarks.

  Returns:
    A set of urls.
  """
  assert subresource_discoverer in SUBRESOURCE_DISCOVERERS, (
      'unknown prefetch simulation {}'.format(subresource_discoverer))

  # Load the trace and the information derived from it.
  logging.info('loading %s' % loading_trace_path)
  trace = LoadingTrace.FromJsonFile(loading_trace_path)
  lens = RequestDependencyLens(trace)
  main_request = trace.request_track.GetFirstResourceRequest()

  # Pick the discovered requests according to the desired simulation. The
  # empty cache simulation discovers nothing, hence the empty default.
  discovered = []
  if subresource_discoverer == FULL_CACHE_DISCOVERER:
    discovered = trace.request_track.GetEvents()
  elif subresource_discoverer == REDIRECTED_MAIN_DISCOVERER:
    redirect_chain = lens.GetRedirectChain(main_request)
    discovered = [redirect_chain[-1]]
  elif subresource_discoverer == PARSER_DISCOVERER:
    discovered = PrefetchSimulationView.ParserDiscoverableRequests(
        main_request, lens)
  elif subresource_discoverer == HTML_PRELOAD_SCANNER_DISCOVERER:
    discovered = PrefetchSimulationView.PreloadedRequests(
        main_request, lens, trace)
  elif subresource_discoverer != EMPTY_CACHE_DISCOVERER:
    assert False

  # The first resource request's URL is only logged here; the returned set
  # holds the URLs of the filtered discovered requests.
  logging.info('white-listing %s' % main_request.url)
  whitelisted_urls = set()
  for discovered_request in _FilterOutDataAndIncompleteRequests(discovered):
    logging.info('white-listing %s' % discovered_request.url)
    whitelisted_urls.add(discovered_request.url)
  return whitelisted_urls
def CreateLoadingTrace(cls, trace_events=None):
  """Builds a new LoadingTrace out of the request constants of the class.

  Args:
    trace_events: Passed through as the last argument of
      test_utils.LoadingTraceFromEvents().

  Returns:
    A LoadingTrace rebuilt from the JSON dictionary of the created trace.
  """
  # The requests below have the following dependency structure.
  #
  # 1234.redirect.1 -> 1234.redirect.2
  # 1234.redirect.2 -> 1234.1
  # 1234.1 -> 1234.12
  # 1234.1 -> 1234.42
  # 1234.1 -> 1234.56
  # 1234.12 -> 1234.13
  requests = [
      cls.FIRST_REDIRECT_REQUEST,
      cls.SECOND_REDIRECT_REQUEST,
      cls.REDIRECTED_REQUEST,
      cls.REQUEST,
      cls.JS_REQUEST,
      cls.JS_REQUEST_2,
      cls.JS_REQUEST_OTHER_FRAME,
      cls.JS_REQUEST_UNRELATED_FRAME,
  ]
  source_trace = test_utils.LoadingTraceFromEvents(
      requests, cls.PAGE_EVENTS, trace_events)
  # Round-trip through JSON so that clients can change events without
  # affecting future tests.
  serialized_trace = source_trace.ToJsonDict()
  return LoadingTrace.FromJsonDict(serialized_trace)
def VerifyBenchmarkOutputDirectory(benchmark_setup_path,
                                   benchmark_output_directory_path):
  """Verifies that all runs inside the output directory worked as expected.

  The trace of every run sub-directory (named 0, 1, 2, ... up to the first
  missing one) is compared with the benchmark setup, then the requests of the
  WPR log are compared with the uncached requests seen in the traces. The
  comparisons are reported with _PrintUrlSetComparison(); a run without a
  trace file is logged as an error and skipped.

  Args:
    benchmark_setup_path: Path of the JSON of the benchmark setup. The keys
        'cache_whitelist' and 'url_resources' are read from it.
    benchmark_output_directory_path: Path of the benchmark output directory to
        verify.
  """
  # TODO(gabadie): What's the best way of propagating errors happening in here?
  # The with statement closes the setup file as soon as it has been parsed.
  with open(benchmark_setup_path) as benchmark_setup_file:
    benchmark_setup = json.load(benchmark_setup_file)
  cache_whitelist = set(benchmark_setup['cache_whitelist'])
  original_requests = set(benchmark_setup['url_resources'])
  original_cached_requests = original_requests.intersection(cache_whitelist)
  original_uncached_requests = original_requests.difference(cache_whitelist)
  all_sent_url_requests = set()

  # Verify requests from traces.
  run_id = -1
  while True:
    run_id += 1
    run_path = os.path.join(benchmark_output_directory_path, str(run_id))
    if not os.path.isdir(run_path):
      break
    trace_path = os.path.join(run_path, sandwich_runner.TRACE_FILENAME)
    if not os.path.isfile(trace_path):
      logging.error('missing trace %s' % trace_path)
      continue
    trace = LoadingTrace.FromJsonFile(trace_path)
    logging.info('verifying %s from %s' % (trace.url, trace_path))

    effective_requests = ListUrlRequests(trace, RequestOutcome.All)
    effective_cached_requests = ListUrlRequests(
        trace, RequestOutcome.ServedFromCache)
    effective_uncached_requests = ListUrlRequests(
        trace, RequestOutcome.NotServedFromCache)

    missing_requests = original_requests.difference(effective_requests)
    unexpected_requests = effective_requests.difference(original_requests)
    # A request that was never made cannot be expected from the cache.
    expected_cached_requests = original_cached_requests.difference(
        missing_requests)
    missing_cached_requests = expected_cached_requests.difference(
        effective_cached_requests)
    # Unexpected requests and requests missing from the cache are expected to
    # be found among the uncached requests.
    expected_uncached_requests = original_uncached_requests.union(
        unexpected_requests).union(missing_cached_requests)
    all_sent_url_requests.update(effective_uncached_requests)

    _PrintUrlSetComparison(original_requests, effective_requests,
                           'All resources')
    _PrintUrlSetComparison(expected_cached_requests, effective_cached_requests,
                           'Cached resources')
    _PrintUrlSetComparison(expected_uncached_requests,
                           effective_uncached_requests,
                           'Non cached resources')

  # Verify requests from WPR.
  wpr_log_path = os.path.join(
      benchmark_output_directory_path, sandwich_runner.WPR_LOG_FILENAME)
  logging.info('verifying requests from %s' % wpr_log_path)
  all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path)
  all_wpr_urls = set()
  unserved_wpr_urls = set()
  wpr_command_colliding_urls = set()

  for request in all_wpr_requests:
    if request.is_wpr_host:
      continue
    if urlparse(request.url).path.startswith('/web-page-replay'):
      wpr_command_colliding_urls.add(request.url)
    elif request.is_served is False:
      unserved_wpr_urls.add(request.url)
    all_wpr_urls.add(request.url)

  _PrintUrlSetComparison(set(), unserved_wpr_urls,
                         'Distinct unserved resources from WPR')
  _PrintUrlSetComparison(set(), wpr_command_colliding_urls,
                         'Distinct resources colliding to WPR commands')
  _PrintUrlSetComparison(all_wpr_urls, all_sent_url_requests,
                         'Distinct resource requests to WPR')