def get_cache_times(file_dir: str) -> Dict[str, int]:
    """
    :return: a dictionary mapping each file name to its freshness using `findcacheable`
    """

    def demote():
        if os.path.isfile("/opt/entrypoint.sh"):
            os.setgid(27)
            os.setuid(103)

    path = os.environ.get("FINDCACHEABLE_BIN", os.path.join(os.path.dirname(__file__), "findcacheable"))
    proc = subprocess.run(
        f"{path} '{file_dir}/' | awk -F'/' '{{print $NF}}' | grep freshness",
        shell=True,
        preexec_fn=demote,
        stdout=subprocess.PIPE,
    )
    if proc.returncode != 0:
        log.with_namespace("get_cache_times").warn("failed to run findcacheable", code=proc.returncode)

    d = {}
    for line in proc.stdout.decode("utf-8").strip().split("\n"):
        try:
            fname, time = line.strip().split(" freshness=")
            d[fname] = int(time)
        except ValueError:
            continue
    return d
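
# Example (illustrative): get_cache_times assumes the findcacheable | awk | grep pipeline emits one
# line per cacheable file of the form "<file_name> freshness=<seconds>"; the parsing loop above maps
# file names to those freshness values. A minimal sketch of that contract on hypothetical output:
def _example_parse_findcacheable_output() -> Dict[str, int]:
    sample_output = "index.html freshness=3600\nstyle.css freshness=86400"
    parsed = {}
    for sample_line in sample_output.split("\n"):
        name, freshness = sample_line.split(" freshness=")
        parsed[name] = int(freshness)
    return parsed  # {"index.html": 3600, "style.css": 86400}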
def use_action(self, action: Action):
    """ Marks the action as used in both the push and preload spaces """
    self.push_space.use_action(action)
    self.preload_space.use_action(action)
    logger.with_namespace("action_space").info(
        "used_action",
        action=repr(action),
        new_push_size=len(self.push_space),
        new_preload_size=len(self.preload_space),
    )
def get_speed_index_in_replay_server(
    request_url: str,
    client_env: ClientEnvironment,
    config: Config,
    policy: Optional[Policy] = None,
    cache_time: Optional[int] = None,
    user_data_dir: Optional[str] = None,
    extract_critical_requests: Optional[bool] = False,
):
    """ Return the median page speed index across EXECUTION_CAPTURE_RUNS captures """
    log = logger.with_namespace("get_speed_index_in_replay_server")
    log.debug("using client environment", **client_env._asdict())

    speed_indices = []
    for i in range(EXECUTION_CAPTURE_RUNS):
        log.debug("recording page execution in Mahimahi", run=(i + 1), total_runs=EXECUTION_CAPTURE_RUNS)
        speed_index = capture_si_in_replay_server(
            url=request_url,
            config=config,
            client_env=client_env,
            policy=policy,
            cache_time=cache_time,
            user_data_dir=user_data_dir,
            extract_critical_requests=extract_critical_requests,
        )
        speed_indices.append(speed_index)
        log.debug("captured page execution", speed_index=speed_index)

    speed_indices.sort()
    median_speedindex = speed_indices[len(speed_indices) // 2]
    return median_speedindex
def capture_har_in_replay_server(
    url: str,
    config: Config,
    client_env: ClientEnvironment,
    policy: Optional[Policy] = None,
    cache_time: Optional[int] = None,
    user_data_dir: Optional[str] = None,
    extract_critical_requests: Optional[bool] = False,
) -> Har:
    """
    capture_har_in_replay_server spawns a headless Chrome instance and connects to its remote debugger
    in order to extract the HAR file generated by loading the given URL. The HAR capturer is launched
    inside a replay shell using the specified Mahimahi config, which means that the webpage must have
    been recorded before calling this method.
    """
    log = logger.with_namespace("capture_har_in_replay_server")
    if not config.env_config or not config.env_config.replay_dir:
        raise ValueError("replay_dir must be specified")

    policy = policy or Policy.from_dict({})
    mahimahi_config = MahiMahiConfig(config=config, policy=policy, client_environment=client_env)

    with tempfile.TemporaryDirectory() as temp_dir:
        policy_file = os.path.join(temp_dir, "policy.json")
        output_file = os.path.join(temp_dir, "har.json")
        trace_file = os.path.join(temp_dir, "trace_file")

        with open(policy_file, "w") as f:
            log.debug("writing push policy file", policy_file=policy_file)
            f.write(json.dumps(policy.as_dict))

        with open(trace_file, "w") as f:
            log.debug("writing trace file", trace_file=trace_file)
            f.write(mahimahi_config.formatted_trace_file)

        # configure the HAR capturer
        cmd = mahimahi_config.har_capture_cmd(
            share_dir=temp_dir,
            har_output_file_name="har.json",
            policy_file_name="policy.json",
            link_trace_file_name="trace_file",
            capture_url=url,
            cache_time=cache_time,
            user_data_dir=user_data_dir,
            extract_critical_requests=extract_critical_requests,
        )

        # spawn the HAR capturer process
        log.debug("spawning har capturer", url=url, cmd=cmd)
        har_capture_proc = subprocess.run(cmd, stdout=sys.stderr, stderr=sys.stderr, timeout=300)
        har_capture_proc.check_returncode()

        with open(output_file, "r") as f:
            return har_from_json(f.read())
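
# Usage sketch (illustrative): capturing a single HAR against a Mahimahi-recorded page. It assumes a
# Config whose env_config.replay_dir points at the recorded directory (capture_har_in_replay_server
# raises ValueError otherwise) and, as an additional assumption, that env_config.request_url holds the
# recorded page's URL.
def _example_capture_har(config: Config) -> float:
    client_env = get_default_client_environment()
    har = capture_har_in_replay_server(url=config.env_config.request_url, config=config, client_env=client_env)
    return har.page_load_time_ms  # page load time observed for this single capture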
def files(self) -> List[File]:
    """
    :return: A list of File objects corresponding to the files in self.path
    """
    if not self._files:
        self._cache_times = get_cache_times(self.path)
        self._files = list(map(File.read, glob.iglob(f"{self.path}/*")))
        for f in self._files:
            cache_time = self._cache_times.get(f.file_name, 0)
            if self.cache_time is None and cache_time > 0:
                f.set_cache_time(cache_time)
            elif self.cache_time is not None and cache_time > self.cache_time:
                f.set_cache_time(cache_time)
            else:
                log.with_namespace("filestore").debug(
                    "skipping setting cache", url=f.url, actual_cache_time=cache_time, cache_time=self.cache_time
                )
    return self._files
def decode_action(self, action: ActionIDType) -> Action:
    """ Decodes the given action ID into an Action object """
    # Temporary for compatibility:
    if len(action) == 6:
        action = (action[0], tuple(action[1:4]), tuple(action[4:]))

    (action_type, push_id, preload_id) = action
    action_type_id = 0 if action_type == 0 else ((action_type // 5) + 1)
    if action_type_id == 0:
        return Action()

    is_push = action_type_id == 1 and not self.disable_push
    try:
        if is_push:
            return self.push_space.decode_action_id(push_id)
        return self.preload_space.decode_action_id(preload_id)
    except KeyError:
        logger.with_namespace("action_space").warn("picked out of bounds action", action=action)
        return Action()
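
# Illustrative regrouping used by the compatibility branch above: a flat 6-element action ID is
# reshaped into (action_type, push_id, preload_id) before dispatch to the push or preload sub-space.
# The slot values below are placeholders; their meaning is defined by those sub-spaces.
def _example_regroup_action_id() -> tuple:
    flat_action = (6, 0, 1, 2, 0, 3)
    return (flat_action[0], tuple(flat_action[1:4]), tuple(flat_action[4:]))  # (6, (0, 1, 2), (0, 3))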
def record_webpage(url: str, save_dir: str, config: Config):
    """
    Given a URL and runtime configuration, record_webpage creates a Mahimahi record shell and records
    the web page load in Chrome. It saves the result to the given save directory, which is expected to
    be empty. A subprocess.CalledProcessError is raised if an error occurs.
    """
    with tempfile.TemporaryDirectory(prefix="blaze_record", dir="/tmp") as tmp_dir:
        chrome_flags = get_chrome_flags(tmp_dir)
        chrome_cmd = get_chrome_command(url, chrome_flags, config)

        mm_config = MahiMahiConfig(config)
        cmd = mm_config.record_shell_with_cmd(save_dir, chrome_cmd)

        logger.with_namespace("record_webpage").debug("spawning web recorder", url=url, cmd=cmd)
        proc = subprocess.run(cmd, stdout=sys.stderr, stderr=sys.stderr)
        proc.check_returncode()
def get_page_links(url: str, max_depth: int = 1) -> List[str]:
    """
    Performs DFS up to the given max_depth on the given URL to discover all <a href="..."> links in the page
    """
    if max_depth == 0:
        return []

    log = logger.with_namespace("get_page_links").with_context(depth_left=max_depth)
    try:
        log.info("fetching page", url=url)
        page = requests.get(url)
        page.raise_for_status()
        page_text = page.text
    except requests.exceptions.RequestException as err:
        log.warn("failed to fetch page", error=repr(err))
        return []

    try:
        log.debug("parsing http response", length=len(page_text))
        root = BeautifulSoup(page_text, "html.parser")
    except Exception as err:
        log.verbose(page_text)
        log.warn("failed to parse response", error=repr(err))
        return []

    parsed_links = root.find_all("a")
    log.info("found links", url=url, n_links=len(parsed_links))

    links = []
    domain = Url.parse(url).domain
    scheme = Url.parse(url).scheme
    for link in parsed_links:
        link_url = link.get("href")
        if not link_url:
            # skip anchors without an href attribute
            continue
        if link_url.startswith("http"):
            link_domain = Url.parse(link_url).domain
            if link_domain != domain:
                log.debug("ignoring found link (bad domain)", link=link_url)
                continue
        elif link_url.startswith("/"):
            link_url = f"{scheme}://{domain}{link_url}"
        else:
            log.debug("ignoring found link (bad prefix)", link=link_url)
            continue

        links.append(link_url)
        links.extend(get_page_links(link_url, max_depth - 1))

    return ordered_uniq(links)
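
# Usage sketch (hypothetical URL): crawl up to two levels of same-domain links. Relative hrefs such as
# "/about" are resolved against the page's scheme and domain, while off-domain and non-http(s) hrefs
# (e.g. "mailto:" or "#fragment") are skipped by the checks above; ordered_uniq keeps the first
# occurrence of each discovered link.
def _example_crawl_links() -> List[str]:
    return get_page_links("https://example.com", max_depth=2)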
def cluster(args):
    """ Clusters the given folder of pages """
    log = logger.with_namespace("cluster")
    log.info("clustering pages", folder=args.folder)

    def read_file(fpath):
        log.debug("reading file...", file=fpath)
        return EnvironmentConfig.load_file(fpath)

    files = list(map(read_file, glob.iglob(f"{args.folder}/*")))
    distance_func = create_apted_distance_function(args.apted_port)
    c = AgglomerativeCluster(distance_func)
    mapping = c.cluster(files)
    print(json.dumps({f.request_url: int(i) for f, i in zip(files, mapping)}, indent=4))
def __init__(
    self,
    config: Config,
    reward_func_num: int = 0,
    use_aft: bool = False,
    client_environment: Optional[ClientEnvironment] = None,
    cached_urls: Optional[Set[str]] = None,
):
    self.config = config
    self.use_aft = use_aft
    self.cached_urls = cached_urls
    self.client_environment = client_environment
    self.simulator = Simulator(config.env_config)
    self.reward_func_num = reward_func_num
    self.reward_func = REWARD_FUNCTIONS[self.reward_func_num](
        self.simulator, self.client_environment, self.cached_urls, self.use_aft
    )
    self.log = logger.with_namespace("analyzer")
def find_url_stable_set(url: str, config: Config) -> List[Resource]:
    """
    Loads the given URL `STABLE_SET_NUM_RUNS` times back-to-back and records the HAR file generated by
    Chrome. It then finds the common URLs across the page loads, computes their relative ordering, and
    returns a list of PushGroups for the webpage.
    """
    log = logger.with_namespace("find_url_stable_set")

    hars: List[Har] = []
    resource_sets: List[Set[Resource]] = []
    pos_dict = collections.defaultdict(lambda: collections.defaultdict(int))
    for n in range(STABLE_SET_NUM_RUNS):
        log.debug("capturing HAR...", run=n + 1, url=url)
        har = capture_har_in_replay_server(url, config, get_default_client_environment())
        resource_list = har_entries_to_resources(har)
        if not resource_list:
            log.warn("no response received", run=n + 1)
            continue

        log.debug("received resources", total=len(resource_list))
        for i in range(len(resource_list)):  # pylint: disable=consider-using-enumerate
            for j in range(i + 1, len(resource_list)):
                pos_dict[resource_list[i].url][resource_list[j].url] += 1

        resource_sets.append(set(resource_list))
        hars.append(har)

    log.debug("resource set lengths", resource_lens=list(map(len, resource_sets)))
    if not resource_sets:
        return []

    common_res = list(set.intersection(*resource_sets))
    common_res.sort(key=functools.cmp_to_key(lambda a, b: -pos_dict[a.url][b.url] + (len(resource_sets) // 2)))

    # Hackily reorder the combined resource sets so that compute_parent_child_relationships works
    common_res = [Resource(**{**r._asdict(), "order": i}) for (i, r) in enumerate(common_res)]
    return compute_parent_child_relationships(common_res, hars[0].timings)
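
# How the comparator above orders the common resources (illustrative): pos_dict[a.url][b.url] counts
# the runs in which `a` was observed before `b`, so -pos_dict[a][b] + (num_runs // 2) is negative
# (a sorts first) exactly when `a` preceded `b` in a majority of runs. A minimal sketch with plain
# strings standing in for Resource objects:
def _example_majority_order() -> List[str]:
    runs = [["a", "b", "c"], ["a", "c", "b"], ["b", "a", "c"], ["a", "b", "c"], ["a", "c", "b"]]
    counts = collections.defaultdict(lambda: collections.defaultdict(int))
    for run in runs:
        for i in range(len(run)):
            for j in range(i + 1, len(run)):
                counts[run[i]][run[j]] += 1
    common = ["c", "b", "a"]  # pretend intersection of the runs, in arbitrary order
    common.sort(key=functools.cmp_to_key(lambda a, b: -counts[a][b] + (len(runs) // 2)))
    return common  # "a" sorts first since it precedes the others in most runs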
def __init__(self, env_config: EnvironmentConfig):
    self.env_config = env_config
    self.log = logger.with_namespace("simulator")
    self.root = None
    self.node_map = {}
    self.url_to_node_map = {}
    self.create_execution_graph(env_config)

    self.pq: Optional[PriorityQueue] = None
    self.request_queue: Optional[RequestQueue] = None
    self.completed_nodes = {}
    self.pushed_nodes = {}
    self.total_time_ms = 0
    self.cached_urls = set()
    self.no_push: Optional[Simulator] = None
    self.client_env: Optional[ClientEnvironment] = None
    self.policy: Optional[Policy] = None
def get_page_load_time_in_replay_server(
    request_url: str,
    client_env: ClientEnvironment,
    config: Config,
    policy: Optional[Policy] = None,
    cache_time: Optional[int] = None,
    user_data_dir: Optional[str] = None,
    extract_critical_requests: Optional[bool] = False,
):
    """
    Return the median page load time and the list of page load times observed across
    EXECUTION_CAPTURE_RUNS loads of the page in the replay server
    """
    log = logger.with_namespace("get_page_load_time_in_replay_server")
    log.debug("using client environment", **client_env._asdict())

    hars = []
    for i in range(EXECUTION_CAPTURE_RUNS):
        log.debug("recording page execution in Mahimahi", run=(i + 1), total_runs=EXECUTION_CAPTURE_RUNS)
        har = capture_har_in_replay_server(
            url=request_url,
            config=config,
            client_env=client_env,
            policy=policy,
            cache_time=cache_time,
            user_data_dir=user_data_dir,
            extract_critical_requests=extract_critical_requests,
        )
        hars.append(har)
        log.debug("captured page execution", page_load_time=har.page_load_time_ms)

    hars.sort(key=lambda h: h.page_load_time_ms)
    plt_ms = [h.page_load_time_ms for h in hars]
    median_har = hars[len(hars) // 2]
    log.debug("recorded execution times", plt_ms=plt_ms)
    return median_har.page_load_time_ms, plt_ms
def stop_condition():
    """
    Implements a stateful stopping condition that automatically stops training based on the running
    episode reward mean over a certain window size. It also stops automatically if the number of
    training iterations exceeds some maximum, but not before it exceeds some minimum.
    """
    log = logger.with_namespace("stop_condition")
    num_iters = 0
    past_rewards = deque()

    def stopper(trial_id, result):
        nonlocal num_iters, past_rewards
        num_iters += 1

        if "time_since_restore" in result and result["time_since_restore"] >= MAX_TIME_SECONDS:
            log.info("auto stopping", time_seconds=result["time_since_restore"], iters=num_iters)
            return True

        if "episode_reward_max" in result and "episode_reward_min" in result and "episode_reward_mean" in result:
            rewards = (result["episode_reward_min"], result["episode_reward_mean"], result["episode_reward_max"])
            log.debug("recording trial result", trial_id=trial_id, num_iters=num_iters, rewards=rewards)
            past_rewards.append(rewards)
        else:
            log.warn("unable to record episode result", result=result, trial_id=trial_id)
            return False

        # truncate the rewards list to the past `WINDOW_SIZE` iterations only
        if len(past_rewards) > WINDOW_SIZE:
            past_rewards.popleft()

        if num_iters > MIN_ITERATIONS:
            stdev_min, stdev_mean, stdev_max = tuple(map(stdev, zip(*past_rewards)))
            log.debug("reward stats", stdev_min=stdev_min, stdev_mean=stdev_mean, stdev_max=stdev_max)

            relative_stdev_based_stop = stdev_mean <= 0.05 * abs(past_rewards[-1][1])
            if num_iters > MAX_ITERATIONS or relative_stdev_based_stop:
                log.info("auto stopping", time_seconds=result.get("time_since_restore", 0), iters=num_iters)
                return True

        return False

    stopper.past_rewards = past_rewards
    return stopper
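
# Usage sketch: stop_condition() returns a stateful callable with the (trial_id, result) signature
# that Ray Tune accepts for its `stop` argument. The trainer name and config below are hypothetical
# and only show where the stopper plugs in.
def _example_train_with_stop_condition():
    from ray import tune  # assumes ray[tune] is installed

    return tune.run("PPO", config={"env": "Example-v0"}, stop=stop_condition())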
def __init__(self, hosts: List[str]):
    """
    :param hosts: The hosts to manage IP addresses for
    """
    self.hosts = hosts
    self.log = logger.with_namespace("interface")
def start_server(
    replay_dir: str,
    cert_path: Optional[str] = None,
    key_path: Optional[str] = None,
    policy: Optional[Policy] = None,
    per_resource_latency: Optional[str] = None,
    cache_time: Optional[int] = None,
    extract_critical_requests: Optional[bool] = False,
    enable_http2: Optional[bool] = False,
):
    """
    Reads the given replay directory and sets up the NGINX server to replay it. It also creates the
    DNS server and network interfaces and writes all necessary temporary files.

    :param replay_dir: The directory to replay (should be Mahimahi-recorded)
    :param cert_path: The path to the SSL certificate for the HTTP/2 NGINX server
    :param key_path: The path to the SSL key for the HTTP/2 NGINX server
    :param policy: The push/preload policy to use for the server
    """
    log = logger.with_namespace("replay_server")
    push_policy = policy.as_dict["push"] if policy else {}
    preload_policy = policy.as_dict["preload"] if policy else {}

    res_latency_map = {}
    if per_resource_latency:
        with open(per_resource_latency, "r") as f:
            res_latency_map = json.load(f)

    # Load the file store into memory
    if not os.path.isdir(replay_dir):
        raise NotADirectoryError(f"{replay_dir} is not a directory")
    filestore = FileStore(replay_dir, cache_time=cache_time)

    # Create host-ip mapping
    hosts = filestore.hosts
    interfaces = Interfaces(hosts)
    host_ip_map = interfaces.mapping

    # Save files and create nginx configuration
    config = Config()
    with tempfile.TemporaryDirectory() as file_dir:
        log.debug("storing temporary files in", file_dir=file_dir)

        for host, files in filestore.files_by_host.items():
            log.info("creating host", host=host, address=host_ip_map[host])
            uris_served = set()
            host_res_lmap = res_latency_map.get(host, {})
            http2_directive = (
                "ssl http2"
                if enable_http2 and host != "archive.org" and host != "analytics.archive.org"
                else "ssl"
            )

            # Create a server block for this host
            server = config.http_block.add_server(
                server_name=host,
                server_addr=host_ip_map[host],
                cert_path=cert_path,
                key_path=key_path,
                root=file_dir,
                res_latency_map=host_res_lmap,
                enable_http2=http2_directive,
            )

            for file in files:
                # Handle the case where we may have duplicate URIs for a single host,
                # or where URIs are too long for nginx
                if file.uri in uris_served or len(file.uri) > 3600 or len(file.headers.get("location", "")) > 3600:
                    continue

                uris_served.add(file.uri + file.scheme)
                log.debug(
                    "serve",
                    file_name=file.file_name,
                    status=file.status,
                    method=file.method,
                    uri=file.uri,
                    host=file.host,
                )

                # Create an entry for this resource
                if file.status < 300 or file.status >= 400:
                    loc = server.add_location_block(
                        uri=file.uri,
                        scheme=file.scheme,
                        file_name=file.file_name,
                        content_type=file.headers.get("content-type", None),
                    )
                elif "location" in file.headers:
                    loc = server.add_location_block(
                        uri=file.uri, scheme=file.scheme, redirect_uri=file.headers["location"]
                    )
                else:
                    log.warn("skipping", file_name=file.file_name, method=file.method, uri=file.uri, host=file.host)
                    continue

                backup_file_body = file.body
                try:
                    if extract_critical_requests and "text/html" in file.headers.get("content-type", ""):
                        file.body = inject_extract_critical_requests_javascript(file)
                    file_path = os.path.join(file_dir, file.file_name)
                    with open(os.open(file_path, os.O_CREAT | os.O_WRONLY, 0o644), "wb") as f:
                        f.write(file.body)
                except TypeError:
                    # file.body is a str rather than bytes, so write it as text
                    file_path = os.path.join(file_dir, file.file_name)
                    with open(os.open(file_path, os.O_CREAT | os.O_WRONLY, 0o644), mode="w", encoding="utf8") as f:
                        f.write(file.body)
                except UnicodeEncodeError as e:
                    # file.body somehow became corrupted, which messes up the encoding. This happens if
                    # a file is a bytestream but does not have a gzip header, or vice versa. Save the
                    # file's original body instead.
                    log.warn("unable to inject critical requests for file", uri=file.uri, error=e)
                    file.body = backup_file_body
                    file_path = os.path.join(file_dir, file.file_name)
                    with open(os.open(file_path, os.O_CREAT | os.O_WRONLY, 0o644), "wb") as f:
                        f.write(file.body)

                # Add headers
                for key, value in file.headers.items():
                    loc.add_header(key, value)

                # Look up push and preload policy
                full_source = f"https://{file.host}{file.uri}"
                push_res_list = push_policy.get(full_source, push_policy.get(full_source + "/", []))
                preload_res_list = preload_policy.get(full_source, preload_policy.get(full_source + "/", []))

                for res in push_res_list:
                    path = urlparse(res["url"]).path
                    log.debug("create push rule", source=file.uri, push=path)
                    loc.add_push(path)
                for res in preload_res_list:
                    log.debug("create preload rule", source=file.uri, preload=res["url"], type=res["type"])
                    loc.add_preload(res["url"], res["type"])

        # Save the nginx configuration
        conf_file = os.path.join(file_dir, "nginx.conf")
        log.debug("writing nginx config", conf_file=conf_file)
        with open(conf_file, "w") as f:
            f.write(str(config))

        # Create the interfaces, start the DNS server, and start the NGINX server
        with interfaces:
            with DNSServer(host_ip_map):
                # If the wait lasts for more than 0.5 seconds, a subprocess.TimeoutExpired is raised,
                # which is okay since it means that nginx is running successfully. If it finishes
                # sooner, nginx crashed and we should raise an exception.
                try:
                    proc = subprocess.Popen(
                        ["/usr/local/openresty/nginx/sbin/nginx", "-c", conf_file],
                        stdout=sys.stderr,
                        stderr=sys.stderr,
                    )
                    proc.wait(0.5)
                    raise RuntimeError("nginx exited unsuccessfully")
                except subprocess.TimeoutExpired:
                    yield
                finally:
                    log.info("killing dns server and nginx server")
                    proc.terminate()
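
# Usage sketch: start_server yields exactly once while nginx and the DNS server are running, so it is
# meant to be driven as a context manager; since no decorator is shown above, the generator is wrapped
# explicitly here with contextlib.contextmanager. The replay directory path below is hypothetical.
def _example_replay_session():
    import contextlib

    replay_server = contextlib.contextmanager(start_server)
    with replay_server("/tmp/example_record_dir", policy=Policy.from_dict({})):
        # while inside this block, the recorded site is served by nginx behind the per-host
        # interfaces and DNS mapping created above
        pass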