def _BuildPatchedCache(original_cache_run_path, original_cache_archive_path,
                       cache_archive_dest_path):
  CACHE_CONTROL_VALUE = 'max-age=0,stale-while-revalidate=315360000'
  trace_path = os.path.join(
      original_cache_run_path, '0', sandwich_runner.TRACE_FILENAME)
  trace = loading_trace.LoadingTrace.FromJsonFile(trace_path)
  patch_count = 0
  with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path:
    cache_path = os.path.join(tmp_path, 'cache')
    chrome_cache.UnzipDirectoryContent(
        original_cache_archive_path, cache_path)
    cache_backend = chrome_cache.CacheBackend(cache_path, 'simple')
    cache_keys = set(cache_backend.ListKeys())
    for request in trace.request_track.GetEvents():
      if request.url not in cache_keys:
        continue
      caching_policy = request_track.CachingPolicy(request)
      assert caching_policy.IsCacheable()
      freshness = caching_policy.GetFreshnessLifetimes()
      if freshness[0] == 0:
        continue
      request.SetHTTPResponseHeader('cache-control', CACHE_CONTROL_VALUE)
      raw_headers = request.GetRawResponseHeaders()
      cache_backend.UpdateRawResponseHeaders(request.url, raw_headers)
      patch_count += 1
    chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path)
  logging.info('Patched %d cached resources out of %d' % (patch_count,
                                                          len(cache_keys)))
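
# Illustrative sketch (not part of the original module): both
# _BuildPatchedCache() above and _BuildBenchmarkCache() below read the pair
# returned by request_track.CachingPolicy.GetFreshnessLifetimes(). The
# assumption here is that index 0 is the regular freshness lifetime and
# index 1 the stale-while-revalidate lifetime; the helper name below is
# hypothetical and only documents that convention.
def _AdvertisesStaleWhileRevalidateSketch(request):
  """Hypothetical helper: True if the response already advertises SWR."""
  freshness = request_track.CachingPolicy(request).GetFreshnessLifetimes()
  return freshness[1] > 0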
def PatchCacheArchive(cache_archive_path, loading_trace_path,
                      cache_archive_dest_path):
  """Patches the cache archive.

  Note: This method updates the raw response headers of the cache entries to
    restore the ones, such as Set-Cookie, that were pruned by
    net::HttpCacheTransaction, and removes stream index 2 that holds the
    resource's compiled metadata.

  Args:
    cache_archive_path: Input archive's path to patch.
    loading_trace_path: Path of the loading trace that has recorded the cache
        archive <cache_archive_path>.
    cache_archive_dest_path: Archive destination's path.
  """
  trace = loading_trace.LoadingTrace.FromJsonFile(loading_trace_path)
  with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path:
    cache_path = os.path.join(tmp_path, 'cache')
    chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_path)
    cache_backend = chrome_cache.CacheBackend(cache_path, 'simple')
    cache_entries = set(cache_backend.ListKeys())
    logging.info('Original cache size: %d bytes' % cache_backend.GetSize())
    for request in _FilterOutDataAndIncompleteRequests(
        trace.request_track.GetEvents()):
      # On requests having an upload data stream, such as POST requests,
      # net::HttpCache::GenerateCacheKey() prefixes the cache entry's key with
      # the upload data stream's session unique identifier.
      #
      # It is fine not to patch these requests: when reopening Chrome, there
      # is no way the entry can be reused since the upload data stream's
      # identifier will be different.
      #
      # The fact that these entries are kept in the cache after closing Chrome
      # properly by closing the Chrome tab, as ChromeController.SetSlowDeath()
      # does, is a known Chrome bug (crbug.com/610725).
      #
      # TODO(gabadie): Add support in ValidateCacheArchiveContent() and in
      #   VerifyBenchmarkOutputDirectory() for POST requests to be known as
      #   impossible to use from cache.
      if request.url not in cache_entries:
        if request.method != 'POST':
          raise RuntimeError(
              'Unexpected method {} that is not found in cache.'.format(
                  request.method))
        continue
      # Chrome prunes Set-Cookie from response headers before storing them in
      # disk cache. Also, it adds an implicit "Vary: cookie" header to all
      # redirect response headers. Sandwich manages the cache, but between
      # recording the cache and benchmarking, the cookie jar is invalidated.
      # This leads to invalidation of all cacheable redirects.
      raw_headers = request.GetRawResponseHeaders()
      cache_backend.UpdateRawResponseHeaders(request.url, raw_headers)
      # NoState-Prefetch would only fetch the resources, but not parse them.
      cache_backend.DeleteStreamForKey(request.url, 2)
    chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path)
    logging.info('Patched cache size: %d bytes' % cache_backend.GetSize())
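
# Illustrative sketch (not part of the original module): a minimal version of
# the request filtering done by _FilterOutDataAndIncompleteRequests() above,
# assuming it only needs to drop data: URIs (never stored in the disk cache)
# and requests that never received a response. The name below is hypothetical.
def _FilterOutDataAndIncompleteRequestsSketch(requests):
  for request in requests:
    if request.url.startswith('data:'):
      continue  # data: URIs are never stored in the disk cache.
    if not request.HasReceivedResponse():
      continue  # Incomplete requests have no response headers to patch.
    yield request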
def ValidateCacheArchiveContent(ref_urls, cache_archive_path):
  """Validates a cache archive content.

  Args:
    ref_urls: Reference list of urls.
    cache_archive_path: Cache archive's path to validate.
  """
  # TODO(gabadie): What's the best way of propagating errors happening in here?
  logging.info('lists cached urls from %s' % cache_archive_path)
  with common_util.TemporaryDirectory() as cache_directory:
    chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_directory)
    cached_urls = \
        chrome_cache.CacheBackend(cache_directory, 'simple').ListKeys()
  _PrintUrlSetComparison(set(ref_urls), set(cached_urls), 'cached resources')
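
# Illustrative sketch (not part of the original module): what
# _PrintUrlSetComparison(), used above and below, is assumed to do, namely log
# whether the two URL sets match and enumerate the differences. The name below
# is hypothetical.
def _PrintUrlSetComparisonSketch(ref_url_set, url_set, set_name):
  if ref_url_set == url_set:
    logging.info('  %d %s are matching.' % (len(ref_url_set), set_name))
    return
  logging.error('  %s are not matching (expected %d, had %d)' % (
      set_name, len(ref_url_set), len(url_set)))
  for url in sorted(ref_url_set - url_set):
    logging.error('    missing: %s' % url)
  for url in sorted(url_set - ref_url_set):
    logging.error('    unexpected: %s' % url)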
def _ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path):
  """Validates a cache archive content.

  Args:
    cache_build_trace_path: Path of the trace generated at cache build time.
    cache_archive_path: Cache archive's path to validate.

  Returns:
    {
      'effective_encoded_data_lengths':
          {URL of all requests: encoded_data_length},
      'effective_post_requests': [URLs of POST requests],
      'expected_cached_resources': [URLs of resources expected to be cached],
      'successfully_cached_resources': [URLs of cached sub-resources]
    }
  """
  # TODO(gabadie): What's the best way of propagating errors happening in here?
  logging.info('lists cached urls from %s' % cache_archive_path)
  with common_util.TemporaryDirectory() as cache_directory:
    chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_directory)
    cache_keys = set(
        chrome_cache.CacheBackend(cache_directory, 'simple').ListKeys())
  trace = loading_trace.LoadingTrace.FromJsonFile(cache_build_trace_path)
  effective_requests = sandwich_utils.ListUrlRequests(
      trace, sandwich_utils.RequestOutcome.All)
  effective_post_requests = sandwich_utils.ListUrlRequests(
      trace, sandwich_utils.RequestOutcome.Post)

  effective_encoded_data_lengths = {}
  for request in sandwich_utils.FilterOutDataAndIncompleteRequests(
      trace.request_track.GetEvents()):
    if request.from_disk_cache or request.served_from_cache:
      # At cache archive creation time, a request might be loaded several
      # times; skip the loads served from the cache to avoid recording
      # request.encoded_data_length == 0.
      continue
    if request.url in effective_encoded_data_lengths:
      effective_encoded_data_lengths[request.url] = max(
          effective_encoded_data_lengths[request.url],
          request.GetResponseTransportLength())
    else:
      effective_encoded_data_lengths[request.url] = (
          request.GetResponseTransportLength())

  upload_data_stream_cache_entry_keys = set()
  upload_data_stream_requests = set()
  for cache_entry_key in cache_keys:
    match = _UPLOAD_DATA_STREAM_REQUESTS_REGEX.match(cache_entry_key)
    if not match:
      continue
    upload_data_stream_cache_entry_keys.add(cache_entry_key)
    upload_data_stream_requests.add(match.group('url'))

  expected_cached_requests = effective_requests.difference(
      effective_post_requests)
  effective_cache_keys = cache_keys.difference(
      upload_data_stream_cache_entry_keys)

  _PrintUrlSetComparison(effective_post_requests, upload_data_stream_requests,
                         'POST resources')
  _PrintUrlSetComparison(expected_cached_requests, effective_cache_keys,
                         'Cached resources')

  return {
      'effective_encoded_data_lengths': effective_encoded_data_lengths,
      'effective_post_requests': [url for url in effective_post_requests],
      'expected_cached_resources': [url for url in expected_cached_requests],
      'successfully_cached_resources': [url for url in effective_cache_keys]
  }
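
# Illustrative sketch (not part of the original module): a plausible shape for
# _UPLOAD_DATA_STREAM_REQUESTS_REGEX used above. Per the comment in
# PatchCacheArchive(), net::HttpCache::GenerateCacheKey() prefixes the cache
# key of requests carrying an upload data stream (e.g. POST) with a numeric
# identifier, so the assumed pattern strips that prefix and captures the URL.
# This assumes `re` is imported at the top of the module; the name below is
# hypothetical.
_UPLOAD_DATA_STREAM_REQUESTS_REGEX_SKETCH = re.compile(r'^\d+/(?P<url>.*)$')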
def _BuildBenchmarkCache(
    original_wpr_trace_path, urls_to_enable_swr, original_cache_trace_path,
    original_cache_archive_path, cache_archive_dest_path):
  # Load the trace that was generated at the original WPR recording.
  logging.info('loading %s', original_wpr_trace_path)
  trace = loading_trace.LoadingTrace.FromJsonFile(original_wpr_trace_path)

  # List URLs that should not be in the cache or that already have SWR headers.
  urls_should_not_be_cached = set()
  urls_already_with_swr = set()
  for request in trace.request_track.GetEvents():
    caching_policy = request_track.CachingPolicy(request)
    if not caching_policy.IsCacheable():
      urls_should_not_be_cached.add(request.url)
    elif caching_policy.GetFreshnessLifetimes()[1] > 0:
      urls_already_with_swr.add(request.url)
  # Traces are fat; delete this one to save memory for the next one to load in
  # this scope.
  del trace

  # Load the trace that was generated at the original cache creation.
  logging.info('loading %s', original_cache_trace_path)
  trace = loading_trace.LoadingTrace.FromJsonFile(original_cache_trace_path)

  # Create the cache contents.
  delete_count = 0
  swr_patch_count = 0
  originally_swr_patch_count = 0
  noswr_patch_count = 0
  with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path:
    cache_path = os.path.join(tmp_path, 'cache')
    chrome_cache.UnzipDirectoryContent(
        original_cache_archive_path, cache_path)
    cache_backend = chrome_cache.CacheBackend(cache_path, 'simple')
    cache_keys = set(cache_backend.ListKeys())
    for request in trace.request_track.GetEvents():
      if request.url not in cache_keys:
        continue
      if request.url in urls_should_not_be_cached:
        cache_backend.DeleteKey(request.url)
        delete_count += 1
        continue
      if not request.HasReceivedResponse():
        continue
      if request.url in urls_to_enable_swr:
        request.SetHTTPResponseHeader(
            'cache-control', 'max-age=0,stale-while-revalidate=315360000')
        request.SetHTTPResponseHeader(
            'last-modified', 'Thu, 23 Jun 2016 11:30:00 GMT')
        swr_patch_count += 1
      elif request.url in urls_already_with_swr:
        # Force the use of SWR on resources that originally attempted to use
        # it.
        request.SetHTTPResponseHeader(
            'cache-control', 'max-age=0,stale-while-revalidate=315360000')
        # The resource originally had SWR enabled, therefore don't patch
        # Last-Modified, to reproduce exactly the performance impact in case
        # these headers were not set properly, causing an invalidation instead
        # of a revalidation.
        originally_swr_patch_count += 1
      else:
        # Force synchronous revalidation.
        request.SetHTTPResponseHeader('cache-control', 'max-age=0')
        noswr_patch_count += 1
      raw_headers = request.GetRawResponseHeaders()
      cache_backend.UpdateRawResponseHeaders(request.url, raw_headers)
    chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path)
  logging.info('patched %d cached resources with forced SWR', swr_patch_count)
  logging.info('patched %d cached resources with original SWR',
               originally_swr_patch_count)
  logging.info('patched %d cached resources without SWR', noswr_patch_count)
  logging.info('deleted %d cached resources', delete_count)
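
# Illustrative usage sketch (not part of the original module): how
# _BuildBenchmarkCache() might be driven. Every path, file name and URL below
# is an assumption made for illustration only.
def _BuildBenchmarkCacheExample():
  output_dir = '/tmp/sandwich-output'  # Hypothetical output directory.
  _BuildBenchmarkCache(
      original_wpr_trace_path=os.path.join(output_dir, 'wpr-trace.json'),
      urls_to_enable_swr={'https://example.com/style.css'},
      original_cache_trace_path=os.path.join(
          output_dir, 'original-cache', '0', sandwich_runner.TRACE_FILENAME),
      original_cache_archive_path=os.path.join(
          output_dir, 'original-cache', 'cache.zip'),
      cache_archive_dest_path=os.path.join(output_dir, 'patched-cache.zip'))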