def test_get_sld_top_convenience_function_is_the_same_as_PublicSuffixList_method(self):
    psl = publicsuffix.PublicSuffixList()
    # these functions should be identical
    assert psl.get_sld('www.google.com') == publicsuffix.get_sld('www.google.com')
    assert psl.get_sld('www.test.ak.us') == publicsuffix.get_sld('www.test.ak.us')
def benchmark_publicsuffix2(urls):
    start = time.perf_counter()
    for url in urls:
        publicsuffix2.get_sld(url)
    end = time.perf_counter()
    print(f'publicsuffix2: {end - start}s')

def benchmark_publicsuffix2(domains):
    start = time.perf_counter()
    for domain in domains:
        publicsuffix2.get_sld(domain)
    end = time.perf_counter()
    print(f'publicsuffix2: {end - start}s')
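# A minimal driver sketch for the benchmark above (the "domains.txt" file name
# and the one-domain-per-line format are assumptions, not part of the original
# code):
def run_benchmark():
    # Read a newline-delimited list of domains and time publicsuffix2 over it.
    with open("domains.txt", "rt", encoding="utf-8") as fd:
        domains = [line.strip() for line in fd if line.strip()]
    benchmark_publicsuffix2(domains)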
def get_cookies(self, url):
    """
    Return the cookies (name -> value) that would be sent to the server for the given URL.

    Ref: https://searchfox.org/mozilla-central/source/netwerk/cookie/nsCookieService.cpp#2952
    """
    us = urlsplit(url)
    domain = domain_to_ascii(us.hostname)
    base_domain = ps2.get_sld(domain)
    sql = '''select name, value, host, path from moz_cookies
             where (host = ? or host like ?)'''
    if us.scheme != 'https':
        sql += ' and isSecure != 1'
    if self._origin:
        sql += " and originAttributes = ''"
    cursor = self._db.execute(sql, (domain, f'%.{base_domain}'))
    candidates = cursor.fetchall()
    path = us.path or '/'
    candidates = [
        x for x in candidates
        if domain == x[2] or (x[2][0] == '.' and domain.endswith(x[2]))
    ]
    candidates = [x for x in candidates if path_matches(x[3], path)]
    return {x[0]: x[1] for x in candidates}
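# `path_matches` and `domain_to_ascii` are helpers defined elsewhere in the
# original project. A minimal sketch of what `path_matches` might look like,
# assuming it follows the RFC 6265 section 5.1.4 path-match rules (an
# assumption, not the project's actual implementation):
def path_matches(cookie_path, request_path):
    # Identical paths always match.
    if request_path == cookie_path:
        return True
    # Otherwise the cookie path must be a prefix ending on a "/" boundary.
    if request_path.startswith(cookie_path):
        return cookie_path.endswith('/') or request_path[len(cookie_path)] == '/'
    return False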
def get_console_bag_for_dir(directory: Optional[str]) -> multiset.Multiset:
    bag = multiset.Multiset()
    for graph in graphs_in_dir(directory):
        try:
            node_types = nx.get_node_attributes(graph, "node type")
            console_node_candidates = [
                k for k, v in node_types.items()
                if v == "web API" and graph.nodes[k].get("method") == "console.log"
            ]
            if console_node_candidates:
                cln = console_node_candidates[0]
                for u, v, eid, args in graph.in_edges(cln, data="args", keys=True):
                    if args:
                        jargs = json.loads(args)
                        url = jargs.get("location", {}).get("url")
                        if url:
                            bits = urlparse(url)
                            hostname = bits.hostname
                            upath = bits.path
                        else:
                            hostname = upath = None
                        bag.add(
                            ConsoleTuple(
                                jargs.get("source"),
                                jargs.get("level"),
                                get_sld(hostname) if hostname else None,
                                upath,
                            ))
        except Exception:
            logger.exception(f"error processing graph in {directory} (skipping)")
    return bag
def get_request_bag_for_dir(dirname: Optional[str]) -> multiset.Multiset:
    bag_map = multiset.Multiset()
    for graph in find_3p_nonad_graphs(dirname):
        for k, v in nx.get_node_attributes(graph, "node type").items():
            if v == 'resource':
                url_fields = urlparse(graph.nodes[k]["url"])
                etld1 = get_sld(url_fields.hostname)
                bag_map.update(
                    (etld1, rt)
                    for n1, n2, eid, rt in graph.in_edges(k, data="request type", keys=True))
    return bag_map
def get_meta_tags(filename: str) -> MetaTags:
    with open(filename, "rt", encoding="utf8") as fd:
        blob = fd.read()
    m = RE_EXTRACT_META_TAGS.search(blob)
    if not m:
        raise ValueError("no tags found")
    url = m.group(1)
    ubits = urlparse(url)
    etld1 = get_sld(ubits.hostname)
    is_root = m.group(2) == "true"
    return MetaTags(filename, url, etld1, is_root)
def _extract_base_domain_from_origin(self, origin):
    """Extract the base domain from an origin according to the public suffix
    list.

    Handles IDNs in both unicode and punycode form, but always returns the
    base domain in punycode."""
    hostname = urlparse(origin).hostname or ''
    # If the domain is an Internationalized Domain Name (IDN), we want to
    # return the punycode version. This follows publicsuffix2's default
    # behavior - idna=True is the default and that means it expects input
    # to be idna-encoded. That's the format we'd like to return anyway, to
    # make it obvious to reviewers/admins when the base domain is an IDN.
    hostname = self.punycode(hostname)
    return get_sld(hostname)
def get_base_domain(domain, use_fresh_psl=False):
    """
    Gets the base domain name for the given domain

    .. note::
        Results are based on a list of public domain suffixes at
        https://publicsuffix.org/list/public_suffix_list.dat.

    Args:
        domain (str): A domain or subdomain
        use_fresh_psl (bool): Download a fresh Public Suffix List

    Returns:
        str: The base domain of the given domain
    """
    psl_path = os.path.join(tempdir, "public_suffix_list.dat")

    def download_psl():
        url = "https://publicsuffix.org/list/public_suffix_list.dat"
        # Use a browser-like user agent string to bypass some proxy blocks
        headers = {"User-Agent": USER_AGENT}
        try:
            fresh_psl = requests.get(url, headers=headers).text
            with open(psl_path, "w", encoding="utf-8") as fresh_psl_file:
                fresh_psl_file.write(fresh_psl)
        except Exception as error:
            raise DownloadError(
                "Failed to download an updated PSL {0}".format(error))

    if use_fresh_psl:
        if not os.path.exists(psl_path):
            download_psl()
        else:
            psl_age = datetime.now() - datetime.fromtimestamp(
                os.stat(psl_path).st_mtime)
            if psl_age > timedelta(hours=24):
                download_psl()
        with open(psl_path, encoding="utf-8") as psl_file:
            psl = publicsuffix2.PublicSuffixList(psl_file)
            return psl.get_public_suffix(domain)
    else:
        return publicsuffix2.get_sld(domain)
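# get_base_domain() relies on several module-level names (tempdir, USER_AGENT,
# DownloadError) plus os/datetime/requests/publicsuffix2 imports. A hedged
# sketch of what those definitions might look like; the real module may differ:
import os
import tempfile
from datetime import datetime, timedelta

import publicsuffix2
import requests

tempdir = tempfile.gettempdir()
USER_AGENT = "Mozilla/5.0 (compatible; psl-fetcher-example/1.0)"  # placeholder UA string


class DownloadError(RuntimeError):
    """Raised when a fresh Public Suffix List cannot be downloaded."""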
def check_domain(domain):
    if domain in WHITELIST:
        return 'ok: domain is whitelisted\n'
    if not domain.startswith('openpgpkey.'):
        return 'domain must have "openpgpkey" prefix\n', 400
    if domain != ("openpgpkey." + get_sld(domain)):
        return ('subdomains can only be used upon request. send an email to '
                '<tt>support at keys dot openpgp dot org</tt>\n', 400)
    req = requests.get('https://cloudflare-dns.com/dns-query',
                       params={'name': domain, 'type': 'CNAME'},
                       headers={'accept': 'application/dns-json'})
    app.logger.debug(f'lookup url: {req.url}')
    if req.status_code != 200:
        app.logger.debug(f'dns error: {req.status_code} {req.text}')
        abort(400, f'CNAME lookup failed (http {req.status_code})')
    response = req.json()
    app.logger.debug(f'response json: {response}')
    if 'Status' not in response:
        return 'CNAME lookup failed (no status)\n', 400
    if response['Status'] != 0:
        return 'CNAME lookup failed (invalid domain?)\n', 400
    if 'Answer' not in response:
        return 'CNAME lookup failed: no CNAME record set\n', 400
    if len(response['Answer']) != 1:
        return 'CNAME lookup failed: ambiguous answer section\n', 400
    answer = response['Answer'][0]
    if answer['type'] != 5:
        return 'CNAME lookup failed: unexpected response (record type)\n', 400
    if answer['name'] != domain and answer['name'] != f'{domain}.':
        return (f'CNAME lookup failed: unexpected response '
                f'(domain response was for {escape(domain)})\n', 400)
    if not answer['data'].startswith(GATEWAY_DOMAIN):
        return (f'CNAME lookup failed: {escape(domain)} resolves to '
                f'{escape(answer["data"])} (expected {GATEWAY_DOMAIN})\n', 400)
    return f'CNAME lookup ok: {escape(domain)} resolves to {GATEWAY_DOMAIN}\n'
def colorize_host(host):
    tld = get_tld(host)
    sld = get_sld(host)
    attr = []
    tld_size = len(tld)
    sld_size = len(sld) - tld_size
    for letter in reversed(range(len(host))):
        character = host[letter]
        if tld_size > 0:
            style = 'url_domain'
            tld_size -= 1
        elif tld_size == 0:
            style = 'text'
            tld_size -= 1
        elif sld_size > 0:
            sld_size -= 1
            style = 'url_extension'
        else:
            style = 'text'
        rle_append_beginning_modify(attr, (style, len(character.encode())))
    return attr
def url_to_domain(url, psl):
    parsed_uri = urlparse(url)
    domain = publicsuffix2.get_sld(parsed_uri.netloc, psl)
    return domain
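# Hedged usage sketch for url_to_domain(): if I read the publicsuffix2 API
# correctly, the second argument is forwarded to get_sld() as the PSL source,
# and passing None falls back to the library's bundled list.
if __name__ == "__main__":
    print(url_to_domain('https://www.example.co.uk/index.html', None))
    # expected output: 'example.co.uk'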
def test_get_sld_tld_with_a_wildcard_rule_and_exceptions4(self):
    assert 'b.test.ck' == publicsuffix.get_sld('a.b.test.ck')

def url_etld1(url: str) -> str:
    bits = urlparse(url)
    return get_sld(bits.hostname)

def test_get_sld_tld_with_a_wildcard_rule_and_exceptions6(self):
    assert 'www.ck' == publicsuffix.get_sld('www.www.ck')

def test_get_sld_US_K127(self):
    assert 'k12.ak.us' == publicsuffix.get_sld('k12.ak.us')
def main(argv):
    try:
        json_file = argv[1]
    except IndexError:
        print(f"usage: {argv[0]} JSON_STATS_DUMP")
        return

    with open(json_file, 'rb') as fd:
        stats_dump = json.load(fd)

    per_policy = defaultdict(list)
    for i, visit_record in enumerate(stats_dump):
        try:
            visit_url = urlparse(visit_record["url"])
            visit_etld1 = get_sld(visit_url.hostname)
            policy_name = visit_record["policy"]
            policy_list = per_policy[policy_name]
            for ckey, cycle in visit_record["visits"].items():
                try:
                    labeled_stats = {}
                    for label, data in cycle["stats"].items():
                        labeled_list = {}
                        for raw_origin_url, stats in data["req"].items():
                            try:
                                origin_url = urlparse(raw_origin_url)
                            except ValueError as ux:
                                print(
                                    f"unparseable execution context URL: '{raw_origin_url}': {ux}",
                                    file=sys.stderr)
                                continue
                            # 3p only!
                            origin_etld1 = get_sld(origin_url.hostname)
                            if (origin_etld1 is not None) and (origin_etld1 != visit_etld1):
                                labeled_list[origin_etld1] = stats
                        if labeled_list:
                            labeled_stats[label] = labeled_list
                    if 'diff' in labeled_stats:
                        policy_list.append(labeled_stats)
                except KeyError as err:
                    print(
                        f"record[{i}], policy[{policy_name}], cycle[{ckey}] malformed, missing key {err}",
                        file=sys.stderr)
                    json.dump(cycle, sys.stderr, indent=2)
                    print(file=sys.stderr)
        except KeyError as err:
            print(
                f"record[{i}], policy[{policy_name}] malformed, missing key {err}",
                file=sys.stderr)
            json.dump(visit_record, sys.stderr, indent=2)
            print(file=sys.stderr)

    #return per_policy
    writer = csv.writer(sys.stdout)
    writer.writerow(['tpetld1', 'policy', 'temp', 'requests', 'bytes'])
    for policy_name, stats_list in per_policy.items():
        for cycle in stats_list:
            for tpetld1 in cycle["diff"]:
                writer.writerows([
                    [
                        tpetld1, policy_name, 'cold',
                        cycle["cold"][tpetld1]["count"],
                        cycle["cold"][tpetld1]["bytes"]
                    ],
                    [
                        tpetld1, policy_name, 'hot',
                        cycle["hot"][tpetld1]["count"],
                        cycle["hot"][tpetld1]["bytes"]
                    ],
                ])
def test_get_sld_More_complex_sld11(self):
    assert 'b.ide.kyoto.jp' == publicsuffix.get_sld('a.b.ide.kyoto.jp')

def test_get_sld_Same_as_above_but_punycoded9(self):
    assert 'xn--fiqs8s' == publicsuffix.get_sld('xn--fiqs8s')

def test_get_sld_Same_as_above_but_punycoded5(self):
    assert 'xn--55qx5d.cn' == publicsuffix.get_sld('xn--55qx5d.cn')

def test_get_sld_Same_as_above_but_punycoded3(self):
    assert 'xn--85x722f.xn--55qx5d.cn' == publicsuffix.get_sld(
        'www.xn--85x722f.xn--55qx5d.cn')

def test_get_sld_Same_as_above_but_punycoded1(self):
    assert 'xn--85x722f.com.cn' == publicsuffix.get_sld('xn--85x722f.com.cn')

def test_get_sld_US_K129(self):
    assert 'test.k12.ak.us' == publicsuffix.get_sld('www.test.k12.ak.us')

def test_get_sld_More_complex_sld16(self):
    assert 'city.kobe.jp' == publicsuffix.get_sld('www.city.kobe.jp')

def test_get_sld_More_complex_sld14(self):
    assert 'b.c.kobe.jp' == publicsuffix.get_sld('a.b.c.kobe.jp')

def test_get_sld_US_K124(self):
    assert 'ak.us' == publicsuffix.get_sld('ak.us')
def main(argv):
    # per-graph data
    csv_filename = argv[1]
    csv_stem = os.path.splitext(csv_filename)[0]
    orig_df = pd.read_csv(csv_filename)

    # optional per-site-tag error data used to filter out rows from URLs that encountered errors
    if len(argv) > 2:
        url_df = pd.read_csv(argv[2])
        err_df = url_df.set_index('site_tag').drop(['order', 'crawl_url'], axis=1).transpose().any().transpose()
        err_df = err_df[err_df == False]
        orig_df = orig_df[orig_df.site_tag.isin(err_df.index)]
    else:
        url_df = None

    # augment with eTLD+1 extracted from site-tag (i.e., from the crawl URL hostname)
    orig_df['site_etld1'] = orig_df['site_tag'].apply(lambda x: get_sld(x.split('/')[0]))

    # all graphs
    adf = orig_df.drop(['is_root', 'is_ad'], axis=1)
    # 1p-only graphs
    rdf = orig_df[orig_df.is_root == True].drop(['is_root', 'is_ad'], axis=1)
    # 3p-no-ad graphs
    tdf = orig_df[(orig_df.is_root == False) & (orig_df.is_ad == False)].drop(['is_root', 'is_ad'], axis=1)
    # 3p-ad graphs (just for fun)
    zdf = orig_df[(orig_df.is_root == False) & (orig_df.is_ad == True)].drop(['is_root', 'is_ad'], axis=1)

    # TEST: can we identify "first-use" subsets for each (profile/frame-etld1) tuple?
    #--------------------------------------------------------------------------------
    assert url_df is not None, "need URL list for order information..."
    FIELDS = """profile_tag,url_etld1,total_nodes,total_edges,total_dom_nodes,total_remote_frames,touched_dom_nodes,completed_requests,event_listenings,post_storage_script_edges,post_storage_console_errors""".split(",")
    for (dataset_label, wutdf) in [("3p-no-ad", tdf), ("3p-ad-only", zdf)]:
        mongo_df = wutdf.join(url_df.set_index("site_tag"), on="site_tag").sort_values('order')

        # sub-set test: (global-first-median, site-first-median, overall-median) comparisons? [bust]
        """
        global_firsts = [] #defaultdict(list)
        site_firsts = [] #defaultdict(list)
        for (url_etld1, profile_tag), records in mongo_df.groupby(['url_etld1', 'profile_tag']):
            ordered_records = records.sort_values('order')
            global_firsts.append(ordered_records.iloc[0][FIELDS])
            for site_etld1, site_records in ordered_records.groupby("site_etld1"):
                site_firsts.append(site_records.sort_values('order').iloc[0][FIELDS])
        gfdf = pd.DataFrame(global_firsts)
        sfdf = pd.DataFrame(site_firsts)
        print(gfdf.groupby('profile_tag').median())
        print(sfdf.groupby('profile_tag').median())
        print(mongo_df[FIELDS].groupby('profile_tag').median())
        """
        global_firsts = set()
        site_firsts = set()
        for (url_etld1, profile_tag), records in mongo_df[['order', 'site_tag', 'site_etld1', 'url_etld1', 'profile_tag']].groupby(['url_etld1', 'profile_tag']):
            ordered_records = records.sort_values('order')
            global_firsts.add((ordered_records.iloc[0].site_tag, url_etld1))
            for site_etld1, site_records in ordered_records.groupby("site_etld1"):
                site_firsts.add((site_records.sort_values('order').iloc[0].site_tag, url_etld1))
        gfdf = mongo_df.set_index(['site_tag', 'url_etld1']).loc[pd.MultiIndex.from_tuples(global_firsts)].reset_index()[FIELDS]
        sfdf = mongo_df.set_index(['site_tag', 'url_etld1']).loc[pd.MultiIndex.from_tuples(site_firsts)].reset_index()[FIELDS]
        #gfdf = pd.DataFrame([mongo_df.where((mongo_df.site_tag == site_tag) & (mongo_df.url_etld1 == url_etld1))[FIELDS] for (site_tag, url_etld1) in global_firsts])
        #sfdf = pd.DataFrame([mongo_df.where((mongo_df.site_tag == site_tag) & (mongo_df.url_etld1 == url_etld1))[FIELDS] for (site_tag, url_etld1) in site_firsts])
        #print(gfdf)
        #print(len(site_firsts))
        for field in FIELDS[2:]:
            fig, axen = plt.subplots(3, 1, sharex=True)
            gfdf.groupby(['url_etld1', 'profile_tag'])[field].sum().unstack(fill_value=0).cumsum().plot(ax=axen[0], title="Global-First Use", legend=False)
            sfdf.groupby(['url_etld1', 'profile_tag'])[field].sum().unstack(fill_value=0).cumsum().plot(ax=axen[1], title="Site-First Use", legend=False)
            mongo_df.groupby(['url_etld1', 'profile_tag'])[field].sum().unstack(fill_value=0).cumsum().plot(ax=axen[2], title="All Use", legend=False)
            handles, labels = axen[0].get_legend_handles_labels()
            fig.tight_layout()
            fig.legend(handles, labels, loc=(0, -0.01), ncol=4)
            fig.suptitle(f"Temporal Analysis of '{field}' ({dataset_label})")
            fig.savefig(f"{csv_stem}_temporal_{dataset_label}_{field}.pdf")
            plt.close(fig)
    return
    #--------------------------------------------------------------------------------
    # END TEST

    # computation: turn into graph counts
    c_adf = graph_counts_by_profile(adf)
    c_rdf = graph_counts_by_profile(rdf)
    c_tdf = graph_counts_by_profile(tdf)
    c_zdf = graph_counts_by_profile(zdf)

    # count all-profiles-same-count for each
    all_same_adf_mask = pred_all_same(c_adf)
    all_same_adf = sum(all_same_adf_mask)  # true == 1, false == 0; sum == number-of-trues
    all_same_rdf_mask = pred_all_same(c_rdf)
    all_same_rdf = sum(all_same_rdf_mask)
    all_same_tdf_mask = pred_all_same(c_tdf)
    all_same_tdf = sum(all_same_tdf_mask)
    all_same_zdf_mask = pred_all_same(c_zdf)
    all_same_zdf = sum(all_same_zdf_mask)
    print(f"Crawls with same-graph-count-across-all-profiles (all graphs): {all_same_adf:,}/{len(c_adf):,} ({(all_same_adf / len(c_adf)):%})")
    print(f"Crawls with same-graph-count-across-all-profiles (1p-only): {all_same_rdf:,}/{len(c_rdf):,} ({(all_same_rdf / len(c_rdf)):%})")
    print(f"Crawls with same-graph-count-across-all-profiles (3p-no-ad-only): {all_same_tdf:,}/{len(c_tdf):,} ({(all_same_tdf / len(c_tdf)):%})")
    print(f"Crawls with same-graph-count-across-all-profiles (3p-ad-only): {all_same_zdf:,}/{len(c_zdf):,} ({(all_same_zdf / len(c_zdf)):%})")

    # count number of graphs in balanced URLs vs total (for each)
    balanced_count_adf = c_adf[all_same_adf_mask].sum().sum()
    total_count_adf = c_adf.sum().sum()
    balanced_count_rdf = c_rdf[all_same_rdf_mask].sum().sum()
    total_count_rdf = c_rdf.sum().sum()
    balanced_count_tdf = c_tdf[all_same_tdf_mask].sum().sum()
    total_count_tdf = c_tdf.sum().sum()
    balanced_count_zdf = c_zdf[all_same_zdf_mask].sum().sum()
    total_count_zdf = c_zdf.sum().sum()
    print(f"Sum of graphs in balanced URLs (all graphs): {balanced_count_adf:,}/{total_count_adf:,} ({(balanced_count_adf / total_count_adf):%})")
    print(f"Sum of graphs in balanced URLs (1p-only): {balanced_count_rdf:,}/{total_count_rdf:,} ({(balanced_count_rdf / total_count_rdf):%})")
    print(f"Sum of graphs in balanced URLs (3p-no-ad-only): {balanced_count_tdf:,}/{total_count_tdf:,} ({(balanced_count_tdf / total_count_tdf):%})")
    print(f"Sum of graphs in balanced URLs (3p-ad-only): {balanced_count_zdf:,}/{total_count_zdf:,} ({(balanced_count_zdf / total_count_zdf):%})")

    # tally profile-with-most-graphs for each unbalanced crawl URL
    proclivity_sets = [
        ("all-graphs", c_adf, all_same_adf_mask),
        ("1p-only", c_rdf, all_same_rdf_mask),
        ("3p-no-ad", c_tdf, all_same_tdf_mask),
        ("3p-ad-only", c_zdf, all_same_zdf_mask),
    ]
    for name, df, mask in proclivity_sets:
        ppdf = df[~mask].transpose().apply(lambda x: x.sort_values().index[-1][:-1]).transpose().value_counts()
        ppdf_bottom = df[~mask].transpose().apply(lambda x: x.sort_values().index[0][:-1]).transpose().value_counts()
        ppdf_total = ppdf.sum()
        print(f"Most-prolific-profile-over-all-unbalanced-URLs ({name}):")
        for profile, count in ppdf.items():
            print(f"\t{profile:14} {count:8,}/{ppdf_total:,} ({count / ppdf_total:%})")
        print(f"Least-prolific-profile-over-all-unbalanced-URLs ({name}):")
        for profile, count in ppdf_bottom.items():
            print(f"\t{profile:14} {count:8,}/{ppdf_total:,} ({count / ppdf_total:%})")

    # experiment in identifying biggest steps in the url_etld1-cumsum-stairstep (for 3p-no-ad)
    """
    total_graphs = tdf.groupby(['url_etld1', 'profile_tag']).url.count().unstack(fill_value=0)
    top_dogs = {profile: set(series.sort_values(ascending=False).iloc[:10].index) for profile, series in total_graphs.items()}
    top_intersection = reduce(lambda a, b: a & b, top_dogs.values())
    print("intersection:")
    print("\t" + "\n\t".join(top_intersection))
    top_union = reduce(lambda a, b: a | b, top_dogs.values())
    print("union - intersection:")
    print("\t" + "\n\t".join(top_union - top_intersection))
    tui = list(sorted(top_union))
    print(total_graphs.loc[tui])
    total_graphs.loc[tui].cumsum().plot()
    print(total_graphs.loc[tui].transpose().var().transpose())
    plt.show()
    """

    # identify top-variance in cross-profile-graph-counts by url_etld1 (3p-no-ad only)
    TOP_N = 5
    DF = tdf
    total_graphs = DF.groupby(['url_etld1', 'profile_tag']).url.count().unstack(fill_value=0)
    reject = total_graphs.transpose().var().sort_values(ascending=False).iloc[:TOP_N]
    print(reject)
    """
    all_the_things = DF[~DF.url_etld1.isin(reject.index)].groupby(['site_tag', 'profile_tag']).url.count().unstack(fill_value=0)
    print(all_the_things)
    all_the_things.cumsum().plot()
    plt.show()
    """

    # graph all of our metrics (and the total number of graphs) as cumulative-sum curves across all crawled URLs
    DF = DF[~DF.url_etld1.isin(reject.index)]
    cdf = DF.groupby(['site_tag', 'profile_tag']).total_nodes.count().unstack(fill_value=0).cumsum()
    ax = cdf.plot(title="Cumulative Graphs Across All Crawled URLs")
    ax.set_xticklabels([])
    fig = ax.get_figure()
    fig.autofmt_xdate(rotation=45)
    fig.tight_layout()
    fig.savefig(f"{csv_stem}_cumulative_GRAPHS.pdf")
    plt.close(fig)

    DF = DF.groupby(['site_tag', 'profile_tag']).sum()
    fields = DF.columns
    for field in fields:
        cdf = DF[field].unstack(fill_value=0).cumsum()
        ax = cdf.plot(title=f"Cumulative '{field}' Across All Crawled URLs")
        ax.set_xticklabels([])
        fig = ax.get_figure()
        fig.autofmt_xdate(rotation=45)
        fig.tight_layout()
        fig.savefig(f"{csv_stem}_cumulative_{field}.pdf")
        plt.close(fig)
def test_get_sld_US_K121(self):
    assert 'us' == publicsuffix.get_sld('us')

def test_get_sld_Same_as_above_but_punycoded7(self):
    assert 'xn--85x722f.xn--fiqs8s' == publicsuffix.get_sld(
        'www.xn--85x722f.xn--fiqs8s')

def test_get_sld_US_K125(self):
    assert 'test.ak.us' == publicsuffix.get_sld('test.ak.us')