def from_dict(policy_dict: dict) -> "Policy":
    """
    Builds a Policy from its plain-dictionary representation.

    `policy_dict` maps a policy type ("push" or "preload") to a dictionary of
    source URL -> list of dependency entries. Each dependency entry is itself
    a dictionary with a "url" key and a "type" key, where "type" is looked up
    by name in ResourceType. Top-level keys other than "push" and "preload"
    are ignored.

    The Policy is created with a bare `Policy()` call; as with the original
    contract, the result is only meant for display purposes because no push
    groups are supplied for its action space.
    """

    def as_resource(entry):
        # The dictionary form carries no size information, so every resource gets size 0
        return Resource(url=entry["url"], size=0, type=ResourceType[entry["type"]])

    policy = Policy()
    for kind, sources in policy_dict.items():
        # pick the forward (source -> deps) and reverse (dep -> source) maps for this kind
        if kind == "push":
            forward = policy.source_to_push
            backward = policy.push_to_source
        elif kind == "preload":
            forward = policy.source_to_preload
            backward = policy.preload_to_source
        else:
            continue

        for source_url, entries in sources.items():
            dependents = {as_resource(entry) for entry in entries}
            forward[Resource(url=source_url, size=0, type=ResourceType.NONE)] = dependents
            for entry in entries:
                backward[as_resource(entry)] = Resource(url=source_url, size=0, type=ResourceType.NONE)

    # one step is counted per pushed resource, then one per preloaded resource
    policy.steps_taken += sum(len(targets) for targets in policy.source_to_push.values())
    policy.steps_taken += sum(len(targets) for targets in policy.source_to_preload.values())
    return policy
def har_entries_to_resources(har: Har) -> List[Resource]:
    """
    Builds the ordered Resource list for a captured HAR.

    Entries are kept only if their request URL starts with "http" and their
    response status is non-zero. The survivors are sorted by start time,
    de-duplicated by request URL through `ordered_uniq`, and numbered in that
    order (the position is used as both `order` and `source_id`). The result is
    passed through `compute_parent_child_relationships` with the HAR's timings.
    """

    def is_completed_http_request(entry):
        # http(s) requests only, and only those that actually received a response
        return entry.request.url.startswith("http") and entry.response.status != 0

    candidates = [entry for entry in har.log.entries if is_completed_http_request(entry)]
    # order the requests by the time they were initiated
    candidates.sort(key=lambda entry: entry.started_date_time)
    # the same URL may show up more than once; keep a single entry per URL
    unique_entries = ordered_uniq(candidates, key=lambda entry: entry.request.url)

    resources = [
        Resource(
            url=entry.request.url,
            # negative sizes are clamped to 0 before summing body and headers
            size=max(entry.response.body_size, 0) + max(entry.response.headers_size, 0),
            type=get_har_entry_type(entry),
            order=position,
            source_id=position,
            critical=entry.critical,
        )
        for position, entry in enumerate(unique_entries)
    ]
    return compute_parent_child_relationships(resources, har.timings)
def compute_parent_child_relationships(res_list: List[Resource], timings: Dict[str, Timing]) -> List[Resource]:
    """
    Rebuilds `res_list` with initiator and timing fields filled in from `timings`.

    `timings` is looked up by resource URL. When a (truthy) timing exists, the
    resource's initiator becomes the `order` of the resource whose URL equals
    `timing.initiator` (0 if that URL is not in `res_list`), and the execution,
    fetch-delay and time-to-first-byte values are copied from the timing.
    Without a timing, the initiator and all three timing values are 0.

    The input list is assumed to be ordered; the output keeps the same order
    and consists of newly created Resource objects.
    """
    # URL -> order, so that an initiator URL can be translated to its order
    url_to_order = {}
    for res in res_list:
        url_to_order[res.url] = res.order

    def with_relationships(res):
        timing = timings.get(res.url)
        if timing:
            initiator = url_to_order.get(timing.initiator, 0)
            execution_ms = timing.execution_ms
            fetch_delay_ms = timing.fetch_delay_ms
            time_to_first_byte_ms = timing.time_to_first_byte_ms
        else:
            initiator = 0
            execution_ms = 0
            fetch_delay_ms = 0
            time_to_first_byte_ms = 0
        return Resource(
            url=res.url,
            type=res.type,
            size=res.size,
            order=res.order,
            group_id=res.group_id,
            source_id=res.source_id,
            initiator=initiator,
            execution_ms=execution_ms,
            fetch_delay_ms=fetch_delay_ms,
            time_to_first_byte_ms=time_to_first_byte_ms,
            critical=res.critical,
        )

    return [with_relationships(res) for res in res_list]
def resource_list_to_push_groups(res_list: List[Resource], train_domain_globs=None) -> List[PushGroup]:
    """
    Groups an ordered list of resources into one PushGroup per domain.

    Domains are taken from `Url.parse(res.url).domain`, sorted, and numbered;
    that number is the group id. A group is trainable when `train_domain_globs`
    is empty/None, or when its domain matches at least one of the globs
    (matching is done with `pathlib.PurePath.match`).

    Every resource is re-created with its position in `res_list` as its new
    `order`, its position inside its group as `source_id`, and its initiator
    translated from the old order numbering to the new one.
    """

    def is_trainable(domain):
        if not train_domain_globs:
            return True
        domain_path = pathlib.PurePath(domain)
        return any(domain_path.match(glob) for glob in train_domain_globs)

    # unique domains in sorted order; the index of a domain is its group id
    domains = sorted({Url.parse(res.url).domain for res in res_list})
    group_id_by_domain = {domain: gid for gid, domain in enumerate(domains)}

    push_groups = [
        PushGroup(id=gid, name=domain, resources=[], trainable=is_trainable(domain))
        for gid, domain in enumerate(domains)
    ]

    # old order -> new order, so initiators keep pointing at the same resource
    new_order_by_old = {}
    for new_order, res in enumerate(res_list):
        new_order_by_old[res.order] = new_order

    for new_order, res in enumerate(res_list):
        group_id = group_id_by_domain[Url.parse(res.url).domain]
        group = push_groups[group_id]
        group.resources.append(
            Resource(
                url=res.url,
                size=res.size,
                type=res.type,
                order=new_order,
                group_id=group_id,
                # the next free slot in the group
                source_id=len(group.resources),
                initiator=new_order_by_old[res.initiator],
                execution_ms=res.execution_ms,
                fetch_delay_ms=res.fetch_delay_ms,
                time_to_first_byte_ms=res.time_to_first_byte_ms,
                critical=res.critical,
            )
        )
    return push_groups
def find_url_stable_set(url: str, config: Config) -> List[Resource]:
    """
    Loads `url` `STABLE_SET_NUM_RUNS` times back-to-back in the replay server
    and derives the set of resources common to every successful load.

    For each run the captured HAR is converted to a resource list; runs that
    yield no resources are skipped with a warning. The resources present in all
    remaining runs are ordered by pairwise majority (how often one URL was seen
    before another across the runs), renumbered, and returned as a list of
    Resources whose parent-child relationships are computed from the timings of
    the first successful HAR. An empty list is returned if no run succeeded.
    """
    log = logger.with_namespace("find_url_stable_set")
    captured_hars: List[Har] = []
    resource_sets: List[Set[Resource]] = []
    # seen_before[a][b] == number of runs in which URL a was listed before URL b
    seen_before = collections.defaultdict(lambda: collections.defaultdict(int))

    for run_index in range(STABLE_SET_NUM_RUNS):
        log.debug("capturing HAR...", run=run_index + 1, url=url)
        har = capture_har_in_replay_server(url, config, get_default_client_environment())
        resources = har_entries_to_resources(har)
        if not resources:
            log.warn("no response received", run=run_index + 1)
            continue

        log.debug("received resources", total=len(resources))
        for position, earlier in enumerate(resources):
            for later in resources[position + 1:]:
                seen_before[earlier.url][later.url] += 1
        resource_sets.append(set(resources))
        captured_hars.append(har)

    log.debug("resource set lengths", resource_lens=[len(res_set) for res_set in resource_sets])
    if not resource_sets:
        return []

    half_of_runs = len(resource_sets) // 2

    def by_majority_order(first, second):
        # negative (first sorts earlier) when first preceded second in more than half the runs
        return -seen_before[first.url][second.url] + half_of_runs

    stable = sorted(set.intersection(*resource_sets), key=functools.cmp_to_key(by_majority_order))

    # Hackily renumber the combined resources so that compute_parent_child_relationships works
    renumbered = []
    for new_order, res in enumerate(stable):
        fields = res._asdict()
        fields["order"] = new_order
        renumbered.append(Resource(**fields))
    return compute_parent_child_relationships(renumbered, captured_hars[0].timings)
def create_resource(url):
    """
    Returns an HTML Resource for `url` with fixed placeholder values:
    size 1024, order 1, group_id 0 and source_id 0.
    """
    fixed_fields = {
        "size": 1024,
        "order": 1,
        "group_id": 0,
        "source_id": 0,
        "type": ResourceType.HTML,
    }
    return Resource(url=url, **fixed_fields)
def get_push_groups() -> List[PushGroup]:
    """
    Returns a fixed, hard-coded list of four PushGroups (ids 0-3) for the
    domains example.com, img.example.com, serve.ads.googleads.com and
    static.example.com. Only the googleads group is marked non-trainable.

    Every resource has size 1024, a `group_id` equal to its group's id, and a
    `source_id` equal to its position within the group.
    """
    # (group name, trainable, [(url, order, initiator, type), ...]);
    # group ids and source ids are derived from list positions below
    layout = [
        ("example.com", True, [
            ("http://example.com/", 0, 0, ResourceType.HTML),
            ("http://example.com/A", 1, 0, ResourceType.IMAGE),
            ("http://example.com/B", 5, 0, ResourceType.IMAGE),
            ("http://example.com/C", 8, 0, ResourceType.IMAGE),
            ("http://example.com/F", 12, 0, ResourceType.IMAGE),
        ]),
        ("img.example.com", True, [
            ("http://img.example.com/D", 9, 0, ResourceType.IMAGE),
            ("http://img.example.com/E", 11, 0, ResourceType.IMAGE),
            ("http://img.example.com/G", 13, 0, ResourceType.IMAGE),
        ]),
        ("serve.ads.googleads.com", False, [
            ("http://serve.ads.googleads.com/script.1.js", 4, 0, ResourceType.SCRIPT),
            ("http://serve.ads.googleads.com/script.2.js", 7, 4, ResourceType.SCRIPT),
            ("http://serve.ads.googleads.com/script.3.js", 10, 7, ResourceType.SCRIPT),
        ]),
        ("static.example.com", True, [
            ("http://static.example.com/script.js", 2, 1, ResourceType.SCRIPT),
            ("http://static.example.com/font.woff", 3, 2, ResourceType.FONT),
            ("http://static.example.com/image.jpg", 6, 0, ResourceType.IMAGE),
        ]),
    ]

    return [
        PushGroup(
            id=group_id,
            name=name,
            trainable=trainable,
            resources=[
                Resource(
                    url=url,
                    size=1024,
                    order=order,
                    group_id=group_id,
                    source_id=source_id,
                    initiator=initiator,
                    type=res_type,
                )
                for source_id, (url, order, initiator, res_type) in enumerate(rows)
            ],
        )
        for group_id, (name, trainable, rows) in enumerate(layout)
    ]