Example #1
def _PatchCacheArchive(cache_archive_path, loading_trace_path,
                       cache_archive_dest_path):
    """Patch the cache archive.

  Note: This method update the raw response headers of cache entries' to store
    the ones such as Set-Cookie that were pruned by the
    net::HttpCacheTransaction, and remove the stream index 2 holding resource's
    compile meta data.

  Args:
    cache_archive_path: Input archive's path to patch.
    loading_trace_path: Path of the loading trace that have recorded the cache
        archive <cache_archive_path>.
    cache_archive_dest_path: Archive destination's path.
  """
    trace = loading_trace.LoadingTrace.FromJsonFile(loading_trace_path)
    with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path:
        cache_path = os.path.join(tmp_path, 'cache')
        chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_path)
        cache_backend = chrome_cache.CacheBackend(cache_path, 'simple')
        cache_entries = set(cache_backend.ListKeys())
        logging.info('Original cache size: %d bytes', cache_backend.GetSize())
        for request in sandwich_utils.FilterOutDataAndIncompleteRequests(
                trace.request_track.GetEvents()):
            # For requests that have an upload data stream, such as POST
            # requests, net::HttpCache::GenerateCacheKey() prefixes the cache
            # entry's key with the upload data stream's session-unique
            # identifier.
            #
            # It is fine not to patch these requests: when Chrome is reopened,
            # the entry can never be reused because the upload data stream's
            # identifier will be different.
            #
            # The fact that these entries remain in the cache after Chrome is
            # closed properly (by closing the Chrome tab, as
            # ChromeControler.SetSlowDeath() does) is a known Chrome bug
            # (crbug.com/610725).
            if request.url not in cache_entries:
                continue
            # Chrome prunes Set-Cookie from response headers before storing
            # them in the disk cache. It also adds an implicit "Vary: cookie"
            # header to all redirect response headers. Sandwich manages the
            # cache, but the cookie jar is invalidated between recording the
            # cache and benchmarking, which invalidates all cacheable
            # redirects.
            raw_headers = request.GetRawResponseHeaders()
            cache_backend.UpdateRawResponseHeaders(request.url, raw_headers)
            # NoState-Prefetch only fetches resources without parsing them, so
            # the cache should not contain the compile metadata held in stream
            # index 2.
            cache_backend.DeleteStreamForKey(request.url, 2)
        chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path)
        logging.info('Patched cache size: %d bytes', cache_backend.GetSize())
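A minimal usage sketch for the helper above; the paths below are placeholders and the logging setup is an assumption, not part of the original tooling:

import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # Hypothetical placeholder paths, for illustration only.
    _PatchCacheArchive(
        cache_archive_path='original-cache.zip',
        loading_trace_path='loading-trace.json',
        cache_archive_dest_path='patched-cache.zip')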
Example #2
def _ExtractDiscoverableUrls(original_headers_path, loading_trace_path,
                             subresource_discoverer):
    """Extracts discoverable resource urls from a loading trace according to a
  sub-resource discoverer.

  Args:
    original_headers_path: Path of JSON containing the original headers.
    loading_trace_path: Path of the loading trace recorded at original cache
      creation.
    subresource_discoverer: The sub-resources discoverer that should white-list
      the resources to keep in cache for the NoState-Prefetch benchmarks.

  Returns:
    A set of urls.
  """
    assert subresource_discoverer in SUBRESOURCE_DISCOVERERS, \
        'unknown prefetch simulation {}'.format(subresource_discoverer)
    logging.info('loading %s', loading_trace_path)
    trace = loading_trace.LoadingTrace.FromJsonFile(loading_trace_path)
    dependencies_lens = RequestDependencyLens(trace)

    # Build the list of discovered requests according to the desired simulation.
    discovered_requests = []
    if subresource_discoverer == Discoverer.HTMLPreloadScannerStore:
        requests = _DiscoverRequests(dependencies_lens,
                                     Discoverer.HTMLPreloadScanner)
        discovered_requests = _PruneOutOriginalNoStoreRequests(
            original_headers_path, requests)
    else:
        discovered_requests = _DiscoverRequests(dependencies_lens,
                                                subresource_discoverer)

    whitelisted_urls = set()
    for request in sandwich_utils.FilterOutDataAndIncompleteRequests(
            discovered_requests):
        logging.debug('white-listing %s', request.url)
        whitelisted_urls.add(request.url)
    logging.info('number of white-listed resources: %d', len(whitelisted_urls))
    return whitelisted_urls
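A hedged sketch of how the returned whitelist might be persisted for a later benchmark setup; the paths are placeholders and Discoverer.HTMLPreloadScanner is only an illustrative choice of discoverer:

import json

# Placeholder inputs, for illustration only.
whitelisted_urls = _ExtractDiscoverableUrls(
    original_headers_path='original-headers.json',
    loading_trace_path='loading-trace.json',
    subresource_discoverer=Discoverer.HTMLPreloadScanner)
# Sets are not JSON serializable, so dump a sorted list for stable output.
with open('cache-whitelist.json', 'w') as whitelist_file:
    json.dump(sorted(whitelisted_urls), whitelist_file)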
Example #3
def _ProcessRunOutputDir(cache_validation_result, benchmark_setup,
                         runner_output_dir):
    """Process benchmark's run output directory.

  Args:
    cache_validation_result: Same as for _RunOutputVerifier
    benchmark_setup: Same as for _RunOutputVerifier
    runner_output_dir: Same as for SandwichRunner.output_dir

  Returns:
    List of dictionary.
  """
    run_metrics_list = []
    run_output_verifier = _RunOutputVerifier(cache_validation_result,
                                             benchmark_setup)
    cached_encoded_data_lengths = (
        cache_validation_result['effective_encoded_data_lengths'])
    for repeat_id, repeat_dir in sandwich_runner.WalkRepeatedRuns(
            runner_output_dir):
        trace_path = os.path.join(repeat_dir, sandwich_runner.TRACE_FILENAME)

        logging.info('loading trace: %s', trace_path)
        trace = loading_trace.LoadingTrace.FromJsonFile(trace_path)

        logging.info('verifying trace: %s', trace_path)
        run_output_verifier.VerifyTrace(trace)

        logging.info('extracting metrics from trace: %s', trace_path)

        # Gather response size per URLs.
        response_sizes = {}
        for request in sandwich_utils.FilterOutDataAndIncompleteRequests(
                trace.request_track.GetEvents()):
            # Ignore requests served from the blink's cache.
            if request.served_from_cache:
                continue
            if request.from_disk_cache:
                if request.url in cached_encoded_data_lengths:
                    response_size = cached_encoded_data_lengths[request.url]
                elif request.url in response_sizes:
                    # Some fat web pages may overflow the memory cache, so a
                    # request might be served from the disk cache a couple of
                    # times per page load.
                    logging.warning(
                        'Looks like it could have been served from the memory '
                        'cache: %s', request.url)
                    response_size = response_sizes[request.url]
                else:
                    # No recorded size for this cached resource; default to 0
                    # rather than reuse a stale value from a previous request.
                    logging.warning('No recorded response size for: %s',
                                    request.url)
                    response_size = 0
            else:
                response_size = request.GetResponseTransportLength()
            response_sizes[request.url] = response_size

        # Sum the bytes served from the cache and from the network.
        served_from_network_bytes = 0
        served_from_cache_bytes = 0
        urls_hitting_network = set()
        for request in sandwich_utils.FilterOutDataAndIncompleteRequests(
                trace.request_track.GetEvents()):
            # Ignore requests served from the blink's cache.
            if request.served_from_cache:
                continue
            urls_hitting_network.add(request.url)
            if request.from_disk_cache:
                served_from_cache_bytes += response_sizes[request.url]
            else:
                served_from_network_bytes += response_sizes[request.url]

        # Make sure each request served from blink's cache has at least one
        # corresponding request for the same URL that was not served from
        # blink's cache.
        for request in sandwich_utils.FilterOutDataAndIncompleteRequests(
                trace.request_track.GetEvents()):
            assert (request.url in urls_hitting_network
                    or not request.served_from_cache)

        run_metrics = {
            'url': trace.url,
            'repeat_id': repeat_id,
            'subresource_discoverer':
                benchmark_setup['subresource_discoverer'],
            'cache_recording.subresource_count':
                len(cache_validation_result['effective_encoded_data_lengths']),
            'cache_recording.cached_subresource_count_theoretic':
                len(cache_validation_result['successfully_cached_resources']),
            'cache_recording.cached_subresource_count':
                len(cache_validation_result['expected_cached_resources']),
            'benchmark.subresource_count': len(sandwich_utils.ListUrlRequests(
                trace, sandwich_utils.RequestOutcome.All)),
            'benchmark.served_from_cache_count_theoretic':
                len(benchmark_setup['cache_whitelist']),
            'benchmark.served_from_cache_count': len(
                sandwich_utils.ListUrlRequests(
                    trace, sandwich_utils.RequestOutcome.ServedFromCache)),
            'benchmark.served_from_network_bytes': served_from_network_bytes,
            'benchmark.served_from_cache_bytes': served_from_cache_bytes
        }
        run_metrics.update(
            sandwich_metrics.ExtractCommonMetricsFromRepeatDirectory(
                repeat_dir, trace))
        run_metrics_list.append(run_metrics)
    run_metrics_list.sort(key=lambda e: e['repeat_id'])

    wpr_log_path = os.path.join(runner_output_dir,
                                sandwich_runner.WPR_LOG_FILENAME)
    logging.info('verifying wpr log: %s', wpr_log_path)
    run_output_verifier.VerifyWprLog(wpr_log_path)
    return run_metrics_list
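A small aggregation sketch over the returned list; the helper below is hypothetical and relies only on the 'benchmark.served_from_cache_bytes' key built above:

def _AverageServedFromCacheBytes(run_metrics_list):
    """Returns the mean bytes served from cache across repeated runs."""
    if not run_metrics_list:
        return 0.0
    total = sum(
        run_metrics['benchmark.served_from_cache_bytes']
        for run_metrics in run_metrics_list)
    return float(total) / len(run_metrics_list)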
Example #4
def _ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path):
    """Validates a cache archive content.

  Args:
    cache_build_trace_path: Path of the generated trace at the cache build time.
    cache_archive_path: Cache archive's path to validate.

  Returns:
    {
      'effective_encoded_data_lengths':
        {URL of all requests: encoded_data_length},
      'effective_post_requests': [URLs of POST requests],
      'expected_cached_resources': [URLs of resources expected to be cached],
      'successfully_cached': [URLs of cached sub-resources]
    }
  """
    # TODO(gabadie): What's the best way of propagating errors happening in here?
    logging.info('listing cached urls from %s', cache_archive_path)
    with common_util.TemporaryDirectory() as cache_directory:
        chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_directory)
        cache_keys = set(
            chrome_cache.CacheBackend(cache_directory, 'simple').ListKeys())
    trace = loading_trace.LoadingTrace.FromJsonFile(cache_build_trace_path)
    effective_requests = sandwich_utils.ListUrlRequests(
        trace, sandwich_utils.RequestOutcome.All)
    effective_post_requests = sandwich_utils.ListUrlRequests(
        trace, sandwich_utils.RequestOutcome.Post)
    effective_encoded_data_lengths = {}
    for request in sandwich_utils.FilterOutDataAndIncompleteRequests(
            trace.request_track.GetEvents()):
        if request.from_disk_cache or request.served_from_cache:
            # At cache archive creation time, a request might be loaded
            # several times; skip the loads served from cache to avoid
            # recording request.encoded_data_length == 0.
            continue
        if request.url in effective_encoded_data_lengths:
            effective_encoded_data_lengths[request.url] = max(
                effective_encoded_data_lengths[request.url],
                request.GetResponseTransportLength())
        else:
            effective_encoded_data_lengths[request.url] = (
                request.GetResponseTransportLength())

    upload_data_stream_cache_entry_keys = set()
    upload_data_stream_requests = set()
    for cache_entry_key in cache_keys:
        match = _UPLOAD_DATA_STREAM_REQUESTS_REGEX.match(cache_entry_key)
        if not match:
            continue
        upload_data_stream_cache_entry_keys.add(cache_entry_key)
        upload_data_stream_requests.add(match.group('url'))

    expected_cached_requests = effective_requests.difference(
        effective_post_requests)
    effective_cache_keys = cache_keys.difference(
        upload_data_stream_cache_entry_keys)

    _PrintUrlSetComparison(effective_post_requests,
                           upload_data_stream_requests, 'POST resources')
    _PrintUrlSetComparison(expected_cached_requests, effective_cache_keys,
                           'Cached resources')

    return {
        'effective_encoded_data_lengths': effective_encoded_data_lengths,
        'effective_post_requests': list(effective_post_requests),
        'expected_cached_resources': list(expected_cached_requests),
        'successfully_cached_resources': list(effective_cache_keys)
    }
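The dictionary returned here is the cache_validation_result consumed by _ProcessRunOutputDir in Example #3; a hedged end-to-end sketch, with placeholder paths and an illustrative benchmark_setup:

# Hypothetical paths and values, for illustration only.
cache_validation_result = _ValidateCacheArchiveContent(
    cache_build_trace_path='cache-build-trace.json',
    cache_archive_path='original-cache.zip')
benchmark_setup = {
    'subresource_discoverer': 'html-preload-scanner',  # illustrative name
    'cache_whitelist': cache_validation_result['expected_cached_resources'],
}
run_metrics_list = _ProcessRunOutputDir(
    cache_validation_result, benchmark_setup,
    runner_output_dir='runner-output')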