Example #1
 def test_get_sld_top_convenience_function_is_the_same_as_PublicSuffixList_method(
         self):
     psl = publicsuffix.PublicSuffixList()
     # these functions should be identical
     assert psl.get_sld('www.google.com') == publicsuffix.get_sld(
         'www.google.com')
     assert psl.get_sld('www.test.ak.us') == publicsuffix.get_sld(
         'www.test.ak.us')
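The test above exercises a property of the publicsuffix2 package: get_sld is available both as a module-level convenience function and as a PublicSuffixList method, and the two should agree. A minimal standalone sketch of that equivalence, using the bundled suffix list:

import publicsuffix2

psl = publicsuffix2.PublicSuffixList()  # loads the bundled public suffix list
for name in ('www.google.com', 'www.test.ak.us'):
    # module-level helper and method form should return the same eTLD+1
    assert publicsuffix2.get_sld(name) == psl.get_sld(name)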
def benchmark_publicsuffix2(urls):
    start = time.perf_counter()

    for url in urls:
        publicsuffix2.get_sld(url)

    end = time.perf_counter()

    print(f'publicsuffix2: {end - start}s')
Example #3
def benchmark_publicsuffix2(domains):
    start = time.perf_counter()

    for domain in domains:
        publicsuffix2.get_sld(domain)

    end = time.perf_counter()

    print(f'publicsuffix2: {end - start}s')
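The two benchmark variants above are identical apart from the parameter name; both expect an iterable of hostnames and assume time and publicsuffix2 are imported at module level. A minimal driver sketch (the sample list and repetition count are invented for illustration):

import time           # required by benchmark_publicsuffix2 above
import publicsuffix2  # likewise

# invented workload: a few hostnames repeated so the timing is measurable
domains = ['www.google.com', 'foo.example.co.uk', 'a.b.ide.kyoto.jp'] * 10_000
benchmark_publicsuffix2(domains)  # prints the elapsed wall-clock time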
Example #4
    def get_cookies(self, url):
        """
        Return the cookies to be sent to the server, as a dict of name/value pairs.

        Ref: https://searchfox.org/mozilla-central/source/netwerk/cookie/nsCookieService.cpp#2952
        """
        us = urlsplit(url)
        domain = domain_to_ascii(us.hostname)
        base_domain = ps2.get_sld(domain)

        sql = '''select name, value, host, path from moz_cookies
             where (host = ? or host like ?)'''
        if us.scheme != 'https':
            sql += ' and isSecure != 1'
        if self._origin:
            sql += " and originAttributes = ''"

        cursor = self._db.execute(sql, (domain, f'%.{base_domain}'))
        candidates = cursor.fetchall()

        path = us.path or '/'
        candidates = [
            x for x in candidates
            if domain == x[2] or (x[2][0] == '.' and domain.endswith(x[2]))
        ]
        candidates = [x for x in candidates if path_matches(x[3], path)]

        return {x[0]: x[1] for x in candidates}
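The cookie lookup above relies on a path_matches helper that is not part of the excerpt. One plausible implementation, following the path-match rules of RFC 6265 section 5.1.4 (this is an assumption, not the original author's code):

def path_matches(cookie_path, request_path):
    # identical paths always match
    if cookie_path == request_path:
        return True
    # otherwise the cookie path must be a prefix of the request path...
    if not request_path.startswith(cookie_path):
        return False
    # ...and either end in '/' or be followed by a '/' in the request path
    return cookie_path.endswith('/') or request_path[len(cookie_path)] == '/'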
Example #5
def get_console_bag_for_dir(directory: Optional[str]) -> multiset.Multiset:
    bag = multiset.Multiset()
    for graph in graphs_in_dir(directory):
        try:
            node_types = nx.get_node_attributes(graph, "node type")
            console_node_candidates = [
                k for k, v in node_types.items() if v == "web API"
                and graph.nodes[k].get("method") == "console.log"
            ]
            if console_node_candidates:
                cln = console_node_candidates[0]
                for u, v, eid, args in graph.in_edges(cln,
                                                      data="args",
                                                      keys=True):
                    if args:
                        jargs = json.loads(args)
                        url = jargs.get("location", {}).get("url")
                        if url:
                            bits = urlparse(url)
                            hostname = bits.hostname
                            upath = bits.path
                        else:
                            hostname = upath = None
                        bag.add(
                            ConsoleTuple(
                                jargs.get("source"),
                                jargs.get("level"),
                                get_sld(hostname) if hostname else None,
                                upath,
                            ))
        except Exception:
            logger.exception(
                f"error processing graph in {directory} (skipping)")
    return bag
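ConsoleTuple is used above but not defined in the excerpt; only its positional field order (source, level, eTLD+1, path) is visible. A hedged guess at its shape:

from typing import NamedTuple, Optional

class ConsoleTuple(NamedTuple):
    # field names are assumptions inferred from the call site above
    source: Optional[str]
    level: Optional[str]
    etld1: Optional[str]
    path: Optional[str]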
Example #6
def get_request_bag_for_dir(dirname: Optional[str]) -> multiset.Multiset:
    bag_map = multiset.Multiset()
    for graph in find_3p_nonad_graphs(dirname):
        resource_nodes = [k for k, v in nx.get_node_attributes(graph, "node type").items() if v == "resource"]
        for k in resource_nodes:
            url_fields = urlparse(graph.nodes[k]["url"])
            etld1 = get_sld(url_fields.hostname)
            bag_map.update((etld1, rt) for n1, n2, eid, rt in graph.in_edges(k, data="request type", keys=True))
    return bag_map
Example #7
def get_meta_tags(filename: str) -> MetaTags:
    with open(filename, "rt", encoding="utf8") as fd:
        blob = fd.read()
        m = RE_EXTRACT_META_TAGS.search(blob)
        if not m:
            raise ValueError("no tags found")
        url = m.group(1)
        ubits = urlparse(url)
        etld1 = get_sld(ubits.hostname)
        is_root = m.group(2) == "true"
        return MetaTags(filename, url, etld1, is_root)
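RE_EXTRACT_META_TAGS and MetaTags are assumed by this example but not shown; all the code tells us is that group 1 is a URL and group 2 is a true/false flag. A hedged sketch (the regex pattern below is invented):

import re
from typing import NamedTuple, Optional

# invented pattern: any regex exposing the URL as group 1 and "true"/"false"
# as group 2 would satisfy the function above
RE_EXTRACT_META_TAGS = re.compile(
    r'"url":\s*"([^"]+)".*?"is_root":\s*(true|false)', re.S)

class MetaTags(NamedTuple):
    filename: str
    url: str
    etld1: Optional[str]
    is_root: bool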
Example #8
 def _extract_base_domain_from_origin(self, origin):
     """Extract base domain from an origin according to publicsuffix list.
     Handles IDNs in both unicode and punycode form, but always returns the
     base domain in punycode."""
     hostname = urlparse(origin).hostname or ''
     # If the domain is an Internationalized Domain Name (IDN), we want to
     # return the punycode version. This follows publicsuffix2's default
     # behavior - idna=True is the default and that means it expects input
     # to be idna-encoded. That's the format we'd like to return anyway, to
     # make it obvious to reviewers/admins when the base domain is an IDN.
     hostname = self.punycode(hostname)
     return get_sld(hostname)
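self.punycode is not included in the excerpt. A standalone sketch of what such a helper could look like using Python's built-in idna codec (an assumption; the real project may rely on the third-party idna package instead):

def to_punycode(hostname):
    # IDNA-encode a unicode hostname; plain ASCII names pass through unchanged
    try:
        return hostname.encode('idna').decode('ascii')
    except UnicodeError:
        # empty or otherwise unencodable hostnames are returned as-is
        return hostname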
Example #9
def get_base_domain(domain, use_fresh_psl=False):
    """
    Gets the base domain name for the given domain

    .. note::
        Results are based on a list of public domain suffixes at
        https://publicsuffix.org/list/public_suffix_list.dat.

    Args:
        domain (str): A domain or subdomain
        use_fresh_psl (bool): Download a fresh Public Suffix List

    Returns:
        str: The base domain of the given domain

    """
    psl_path = os.path.join(tempdir, "public_suffix_list.dat")

    def download_psl():
        url = "https://publicsuffix.org/list/public_suffix_list.dat"
        # Use a browser-like user agent string to bypass some proxy blocks
        headers = {"User-Agent": USER_AGENT}
        try:
            fresh_psl = requests.get(url, headers=headers).text
            with open(psl_path, "w", encoding="utf-8") as fresh_psl_file:
                fresh_psl_file.write(fresh_psl)
        except Exception as error:
            raise DownloadError(
                "Failed to download an updated PSL: {0}".format(error))

    if use_fresh_psl:
        if not os.path.exists(psl_path):
            download_psl()
        else:
            psl_age = datetime.now() - datetime.fromtimestamp(
                os.stat(psl_path).st_mtime)
            if psl_age > timedelta(hours=24):
                download_psl()

        with open(psl_path, encoding="utf-8") as psl_file:
            psl = publicsuffix2.PublicSuffixList(psl_file)

        return psl.get_public_suffix(domain)
    else:
        return publicsuffix2.get_sld(domain)
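The function also depends on module-level names that are outside the excerpt: tempdir, USER_AGENT, and DownloadError. A hedged sketch of minimal definitions that would let it run (the values are placeholders, not the project's real ones):

import tempfile

tempdir = tempfile.gettempdir()  # directory where public_suffix_list.dat is cached
USER_AGENT = 'Mozilla/5.0 (compatible; psl-fetcher)'  # placeholder UA string

class DownloadError(RuntimeError):
    """Raised when the Public Suffix List cannot be downloaded."""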
Example #10
def check_domain(domain):
    if domain in WHITELIST:
        return 'ok: domain is whitelisted\n'

    if not domain.startswith('openpgpkey.'):
        return 'domain must have "openpgpkey" prefix\n', 400

    if domain != ("openpgpkey." + get_sld(domain)):
        return 'subdomains can only be used upon request. send an email to <tt>support at keys dot openpgp dot org</tt>\n', 400

    req = requests.get('https://cloudflare-dns.com/dns-query',
                       params={
                           'name': domain,
                           'type': 'CNAME'
                       },
                       headers={'accept': 'application/dns-json'})
    app.logger.debug(f'lookup url: {req.url}')

    if req.status_code != 200:
        app.logger.debug(f'dns error: {req.status_code} {req.text}')
        abort(400, f'CNAME lookup failed (http {req.status_code})')
    response = req.json()
    app.logger.debug(f'response json: {response}')

    if 'Status' not in response:
        return 'CNAME lookup failed (no status)\n', 400
    if response['Status'] != 0:
        return 'CNAME lookup failed (invalid domain?)\n', 400
    if 'Answer' not in response:
        return 'CNAME lookup failed: no CNAME record set\n', 400
    if len(response['Answer']) != 1:
        return 'CNAME lookup failed: ambiguous answer section\n', 400
    answer = response['Answer'][0]
    if answer['type'] != 5:
        return 'CNAME lookup failed: unexpected response (record type)\n', 400
    if answer['name'] != domain and answer['name'] != f'{domain}.':
        return f'CNAME lookup failed: unexpected response (domain response was for {escape(domain)})\n', 400
    if not answer['data'].startswith(GATEWAY_DOMAIN):
        return f'CNAME lookup failed: {escape(domain)} resolves to {escape(answer["data"])} (expected {GATEWAY_DOMAIN})\n', 400
    return f'CNAME lookup ok: {escape(domain)} resolves to {GATEWAY_DOMAIN}\n'
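check_domain leans on Flask scaffolding and a couple of constants that are not in the excerpt (app, abort, escape, WHITELIST, GATEWAY_DOMAIN). A hedged sketch of the surrounding module, with placeholder values:

from flask import Flask, abort
from markupsafe import escape
from publicsuffix2 import get_sld
import requests

app = Flask(__name__)
WHITELIST = set()                       # placeholder: explicitly approved domains
GATEWAY_DOMAIN = 'gateway.example.org'  # placeholder: expected CNAME target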
Example #11
def colorize_host(host):
    tld = get_tld(host)
    sld = get_sld(host)

    attr = []

    tld_size = len(tld)
    sld_size = len(sld) - tld_size

    for letter in reversed(range(len(host))):
        character = host[letter]
        if tld_size > 0:
            style = 'url_domain'
            tld_size -= 1
        elif tld_size == 0:
            style = 'text'
            tld_size -= 1
        elif sld_size > 0:
            sld_size -= 1
            style = 'url_extension'
        else:
            style = 'text'
        rle_append_beginning_modify(attr, (style, len(character.encode())))
    return attr
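For reference, get_tld returns the public suffix while get_sld returns the registrable domain (eTLD+1); the size arithmetic above relies on that difference. A quick illustration:

from publicsuffix2 import get_sld, get_tld

host = 'www.example.com'
assert get_tld(host) == 'com'          # public suffix
assert get_sld(host) == 'example.com'  # registrable domain (eTLD+1)
# len(sld) - len(tld) == 8 here: the 'example' label plus one dot,
# which colorize_host styles as url_extension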
Example #12
def url_to_domain(url, psl):
    parsed_uri = urlparse(url)
    domain = publicsuffix2.get_sld(parsed_uri.netloc, psl)
    return domain
 def test_get_sld_tld_with_a_wildcard_rule_and_exceptions4(self):
     assert 'b.test.ck' == publicsuffix.get_sld('a.b.test.ck')
Example #14
def url_etld1(url: str) -> str:
    bits = urlparse(url)
    return get_sld(bits.hostname)
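url_etld1 assumes the URL actually has a hostname; urlparse returns None for hostname when there is no network location, and get_sld would then receive None. A slightly defensive variant, as a sketch (the name url_etld1_safe and the Optional return type are my additions):

from typing import Optional
from urllib.parse import urlparse

from publicsuffix2 import get_sld

def url_etld1_safe(url: str) -> Optional[str]:
    # return None instead of failing when the URL has no hostname
    hostname = urlparse(url).hostname
    return get_sld(hostname) if hostname else None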
 def test_get_sld_tld_with_a_wildcard_rule_and_exceptions6(self):
     assert 'www.ck' == publicsuffix.get_sld('www.www.ck')
 def test_get_sld_US_K127(self):
     assert 'k12.ak.us' == publicsuffix.get_sld('k12.ak.us')
Example #17
def main(argv):
    try:
        json_file = argv[1]
    except IndexError:
        print(f"usage: {argv[0]} JSON_STATS_DUMP")
        return

    with open(json_file, 'rb') as fd:
        stats_dump = json.load(fd)

    per_policy = defaultdict(list)
    for i, visit_record in enumerate(stats_dump):
        try:
            visit_url = urlparse(visit_record["url"])
            visit_etld1 = get_sld(visit_url.hostname)

            policy_name = visit_record["policy"]
            policy_list = per_policy[policy_name]
            for ckey, cycle in visit_record["visits"].items():
                try:
                    labeled_stats = {}
                    for label, data in cycle["stats"].items():
                        labeled_list = {}
                        for raw_origin_url, stats in data["req"].items():
                            try:
                                origin_url = urlparse(raw_origin_url)
                            except ValueError as ux:
                                print(
                                    f"unparseable execution context URL: '{raw_origin_url}': {ux}",
                                    file=sys.stderr)
                                continue
                            # 3p only!
                            origin_etld1 = get_sld(origin_url.hostname)
                            if (origin_etld1 is not None) and (origin_etld1 !=
                                                               visit_etld1):
                                labeled_list[origin_etld1] = stats
                        if labeled_list:
                            labeled_stats[label] = labeled_list
                    if 'diff' in labeled_stats:
                        policy_list.append(labeled_stats)
                except KeyError as err:
                    print(
                        f"record[{i}], policy[{policy_name}], cycle[{ckey}] malformed, missing key {err}",
                        file=sys.stderr)
                    json.dump(cycle, sys.stderr, indent=2)
                    print(file=sys.stderr)
        except KeyError as err:
            print(
                f"record[{i}], policy[{policy_name}] malformed, missing key {err}",
                file=sys.stderr)
            json.dump(visit_record, sys.stderr, indent=2)
            print(file=sys.stderr)

    #return per_policy

    writer = csv.writer(sys.stdout)
    writer.writerow(['tpetld1', 'policy', 'temp', 'requests', 'bytes'])
    for policy_name, stats_list in per_policy.items():
        for cycle in stats_list:
            for tpetld1 in cycle["diff"]:
                writer.writerows([
                    [
                        tpetld1, policy_name, 'cold',
                        cycle["cold"][tpetld1]["count"],
                        cycle["cold"][tpetld1]["bytes"]
                    ],
                    [
                        tpetld1, policy_name, 'hot',
                        cycle["hot"][tpetld1]["count"],
                        cycle["hot"][tpetld1]["bytes"]
                    ],
                ])
 def test_get_sld_More_complex_sld11(self):
     assert 'b.ide.kyoto.jp' == publicsuffix.get_sld('a.b.ide.kyoto.jp')
 def test_get_sld_Same_as_above_but_punycoded9(self):
     assert 'xn--fiqs8s' == publicsuffix.get_sld('xn--fiqs8s')
 def test_get_sld_Same_as_above_but_punycoded5(self):
     assert 'xn--55qx5d.cn' == publicsuffix.get_sld('xn--55qx5d.cn')
 def test_get_sld_Same_as_above_but_punycoded3(self):
     assert 'xn--85x722f.xn--55qx5d.cn' == publicsuffix.get_sld(
         'www.xn--85x722f.xn--55qx5d.cn')
 def test_get_sld_Same_as_above_but_punycoded1(self):
     assert 'xn--85x722f.com.cn' == publicsuffix.get_sld(
         'xn--85x722f.com.cn')
 def test_get_sld_US_K129(self):
     assert 'test.k12.ak.us' == publicsuffix.get_sld('www.test.k12.ak.us')
 def test_get_sld_More_complex_sld16(self):
     assert 'city.kobe.jp' == publicsuffix.get_sld('www.city.kobe.jp')
 def test_get_sld_More_complex_sld14(self):
     assert 'b.c.kobe.jp' == publicsuffix.get_sld('a.b.c.kobe.jp')
 def test_get_sld_US_K124(self):
     assert 'ak.us' == publicsuffix.get_sld('ak.us')
def main(argv):
    # per-graph data
    csv_filename = argv[1]
    csv_stem = os.path.splitext(csv_filename)[0]
    orig_df = pd.read_csv(csv_filename)

    # optional per-site-tag error data used to filter out rows from URLs that encountered errors
    if len(argv) > 2:
        url_df = pd.read_csv(argv[2])
        err_df = url_df.set_index('site_tag').drop(['order', 'crawl_url'], axis=1).transpose().any().transpose()
        err_df = err_df[err_df == False]
        orig_df = orig_df[orig_df.site_tag.isin(err_df.index)]
    else:
        url_df = None
    
    # augment with eTLD+1 extracted from site-tag (i.e., from the crawl URL hostname)
    orig_df['site_etld1'] = orig_df['site_tag'].apply(lambda x: get_sld(x.split('/')[0]))

    # all graphs
    adf = orig_df.drop(['is_root', 'is_ad'], axis=1)

    # 1p-only graphs
    rdf = orig_df[orig_df.is_root == True].drop(['is_root', 'is_ad'], axis=1)

    # 3p-no-ad graphs
    tdf = orig_df[(orig_df.is_root == False) & (orig_df.is_ad == False)].drop(['is_root', 'is_ad'], axis=1)

    # 3p-ad graphs (just for fun)
    zdf = orig_df[(orig_df.is_root == False) & (orig_df.is_ad == True)].drop(['is_root', 'is_ad'], axis=1)


    # TEST: can we identify "first-use" subsets for each (profile/frame-etld1) tuple?
    #--------------------------------------------------------------------------------
    assert url_df is not None, "need URL list for order information..."
    FIELDS = """profile_tag,url_etld1,total_nodes,total_edges,total_dom_nodes,total_remote_frames,touched_dom_nodes,completed_requests,event_listenings,post_storage_script_edges,post_storage_console_errors""".split(",")


    for (dataset_label, wutdf) in [("3p-no-ad", tdf), ("3p-ad-only", zdf)]:
        mongo_df = wutdf.join(url_df.set_index("site_tag"), on="site_tag").sort_values('order')

        # sub-set test: (global-first-median, site-first-median, overall-median) comparisons? [bust]
        """ global_firsts = [] #defaultdict(list)
        site_firsts = [] #defaultdict(list)
        for (url_etld1, profile_tag), records in mongo_df.groupby(['url_etld1', 'profile_tag']):
            ordered_records = records.sort_values('order')
            global_firsts.append(ordered_records.iloc[0][FIELDS])
            for site_etld1, site_records in ordered_records.groupby("site_etld1"):
                site_firsts.append(site_records.sort_values('order').iloc[0][FIELDS])
        gfdf = pd.DataFrame(global_firsts)
        sfdf = pd.DataFrame(site_firsts)
        print(gfdf.groupby('profile_tag').median())
        print(sfdf.groupby('profile_tag').median())
        print(mongo_df[FIELDS].groupby('profile_tag').median()) """

        global_firsts = set()
        site_firsts = set()
        for (url_etld1, profile_tag), records in mongo_df[['order', 'site_tag', 'site_etld1', 'url_etld1', 'profile_tag']].groupby(['url_etld1', 'profile_tag']):
            ordered_records = records.sort_values('order')
            global_firsts.add((ordered_records.iloc[0].site_tag, url_etld1))
            for site_etld1, site_records in ordered_records.groupby("site_etld1"):
                site_firsts.add((site_records.sort_values('order').iloc[0].site_tag, url_etld1))
        

        gfdf = mongo_df.set_index(['site_tag', 'url_etld1']).loc[pd.MultiIndex.from_tuples(global_firsts)].reset_index()[FIELDS]
        sfdf = mongo_df.set_index(['site_tag', 'url_etld1']).loc[pd.MultiIndex.from_tuples(site_firsts)].reset_index()[FIELDS]
        #gfdf = pd.DataFrame([mongo_df.where((mongo_df.site_tag == site_tag) & (mongo_df.url_etld1 == url_etld1))[FIELDS] for (site_tag, url_etld1) in global_firsts])
        #sfdf = pd.DataFrame([mongo_df.where((mongo_df.site_tag == site_tag) & (mongo_df.url_etld1 == url_etld1))[FIELDS] for (site_tag, url_etld1) in site_firsts])
        #print(gfdf)
        #print(len(site_firsts))

        for field in FIELDS[2:]:
            fig, axen = plt.subplots(3, 1, sharex=True)
            gfdf.groupby(['url_etld1', 'profile_tag'])[field].sum().unstack(fill_value=0).cumsum().plot(ax=axen[0], title="Global-First Use", legend=False)
            sfdf.groupby(['url_etld1', 'profile_tag'])[field].sum().unstack(fill_value=0).cumsum().plot(ax=axen[1], title="Site-First Use", legend=False)
            mongo_df.groupby(['url_etld1', 'profile_tag'])[field].sum().unstack(fill_value=0).cumsum().plot(ax=axen[2], title="All Use", legend=False)
            handles, labels = axen[0].get_legend_handles_labels()
            fig.tight_layout()
            fig.legend(handles, labels, loc=(0, -0.01), ncol=4)
            fig.suptitle(f"Temporal Analysis of '{field}' ({dataset_label})")
            fig.savefig(f"{csv_stem}_temporal_{dataset_label}_{field}.pdf")
            plt.close(fig)
    

    return
    #--------------------------------------------------------------------------------
    # END TEST

    # computation: turn into graph counts
    c_adf = graph_counts_by_profile(adf)
    c_rdf = graph_counts_by_profile(rdf)
    c_tdf = graph_counts_by_profile(tdf)
    c_zdf = graph_counts_by_profile(zdf)

    # count all-profiles-same-count for each
    all_same_adf_mask = pred_all_same(c_adf)
    all_same_adf = sum(all_same_adf_mask)   # true == 1, false == 0; sum == number-of-trues
    all_same_rdf_mask = pred_all_same(c_rdf)
    all_same_rdf = sum(all_same_rdf_mask)
    all_same_tdf_mask = pred_all_same(c_tdf)
    all_same_tdf = sum(all_same_tdf_mask)
    all_same_zdf_mask = pred_all_same(c_zdf)
    all_same_zdf = sum(all_same_zdf_mask)
    print(f"Crawls with same-graph-count-across-all-profiles (all graphs): {all_same_adf:,}/{len(c_adf):,} ({(all_same_adf / len(c_adf)):%})")
    print(f"Crawls with same-graph-count-across-all-profiles (1p-only): {all_same_rdf:,}/{len(c_rdf):,} ({(all_same_rdf / len(c_rdf)):%})")
    print(f"Crawls with same-graph-count-across-all-profiles (3p-no-ad-only): {all_same_tdf:,}/{len(c_tdf):,} ({(all_same_tdf / len(c_tdf)):%})")
    print(f"Crawls with same-graph-count-across-all-profiles (3p-ad-only): {all_same_zdf:,}/{len(c_zdf):,} ({(all_same_zdf / len(c_zdf)):%})")

    # count number of graphs in balanced URLs vs total (for each)
    balanced_count_adf = c_adf[all_same_adf_mask].sum().sum()
    total_count_adf = c_adf.sum().sum()
    balanced_count_rdf = c_rdf[all_same_rdf_mask].sum().sum()
    total_count_rdf = c_rdf.sum().sum()
    balanced_count_tdf = c_tdf[all_same_tdf_mask].sum().sum()
    total_count_tdf = c_tdf.sum().sum()
    balanced_count_zdf = c_zdf[all_same_zdf_mask].sum().sum()
    total_count_zdf = c_zdf.sum().sum()
    print(f"Sum of graphs in balanced URLs (all graphs): {balanced_count_adf:,}/{total_count_adf:,} ({(balanced_count_adf / total_count_adf):%})")
    print(f"Sum of graphs in balanced URLs (1p-only): {balanced_count_rdf:,}/{total_count_rdf:,} ({(balanced_count_rdf / total_count_rdf):%})")
    print(f"Sum of graphs in balanced URLs (3p-no-ad-only): {balanced_count_tdf:,}/{total_count_tdf:,} ({(balanced_count_tdf / total_count_tdf):%})")
    print(f"Sum of graphs in balanced URLs (3p-ad-only): {balanced_count_zdf:,}/{total_count_zdf:,} ({(balanced_count_zdf / total_count_zdf):%})")

    # tally profile-with-most-graphs for each unbalanced crawl URL
    proclivity_sets = [
        ("all-graphs", c_adf, all_same_adf_mask),
        ("1p-only", c_rdf, all_same_rdf_mask),
        ("3p-no-ad", c_tdf, all_same_tdf_mask),
        ("3p-ad-only", c_zdf, all_same_zdf_mask),
    ]
    for name, df, mask in proclivity_sets:
        ppdf = df[~mask].transpose().apply(lambda x: x.sort_values().index[-1][:-1]).transpose().value_counts()
        ppdf_bottom = df[~mask].transpose().apply(lambda x: x.sort_values().index[0][:-1]).transpose().value_counts()
        ppdf_total = ppdf.sum()
        print(f"Most-prolific-profile-over-all-unbalanced-URLs ({name}):")
        for profile, count in ppdf.items():
            print(f"\t{profile:14} {count:8,}/{ppdf_total:,} ({count / ppdf_total:%})")
        print(f"Least-prolific-profile-over-all-unbalanced-URLs ({name}):")
        for profile, count in ppdf_bottom.items():
            print(f"\t{profile:14} {count:8,}/{ppdf_total:,} ({count / ppdf_total:%})")
    

    # experiment in identifying biggest steps in the url_etld1-cumsum-stairstep (for 3p-no-ad)
    """ total_graphs = tdf.groupby(['url_etld1', 'profile_tag']).url.count().unstack(fill_value=0)
    top_dogs = {profile: set(series.sort_values(ascending=False).iloc[:10].index) for profile, series in total_graphs.items()}
    
    top_intersection = reduce(lambda a, b: a & b, top_dogs.values())
    print("intersection:")
    print("\t" + "\n\t".join(top_intersection))
    top_union = reduce(lambda a, b: a | b, top_dogs.values())
    print("union - intersection:")
    print("\t" + "\n\t".join(top_union - top_intersection))

    tui = list(sorted(top_union))
    print(total_graphs.loc[tui])
    total_graphs.loc[tui].cumsum().plot()
    print(total_graphs.loc[tui].transpose().var().transpose())
    plt.show() """

    # identify top-variance in cross-profile-graph-counts by url_etld1 (3p-no-ad only)
    TOP_N = 5
    DF = tdf
    total_graphs = DF.groupby(['url_etld1', 'profile_tag']).url.count().unstack(fill_value=0)
    reject = total_graphs.transpose().var().sort_values(ascending=False).iloc[:TOP_N]
    print(reject)

    """ 
    all_the_things = DF[~DF.url_etld1.isin(reject.index)].groupby(['site_tag', 'profile_tag']).url.count().unstack(fill_value=0)
    print(all_the_things)
    all_the_things.cumsum().plot()
    plt.show() """

    # graph all of our metrics (and the total number of graphs) as cumulative-sum curves across all crawled URLs
    DF = DF[~DF.url_etld1.isin(reject.index)]
    cdf = DF.groupby(['site_tag', 'profile_tag']).total_nodes.count().unstack(fill_value=0).cumsum()
    ax = cdf.plot(title=f"Cumulative Graphs Across All Crawled URLs")
    ax.set_xticklabels([])
    fig = ax.get_figure()
    fig.autofmt_xdate(rotation=45)
    fig.tight_layout()
    fig.savefig(f"{csv_stem}_cumulative_GRAPHS.pdf")
    plt.close(fig)
    
    DF = DF.groupby(['site_tag', 'profile_tag']).sum()
    fields = DF.columns
    for field in fields:
        cdf = DF[field].unstack(fill_value=0).cumsum()
        ax = cdf.plot(title=f"Cumulative '{field}' Across All Crawled URLs")
        ax.set_xticklabels([])
        fig = ax.get_figure()
        fig.autofmt_xdate(rotation=45)
        fig.tight_layout()
        fig.savefig(f"{csv_stem}_cumulative_{field}.pdf")
        plt.close(fig)
 def test_get_sld_US_K121(self):
     assert 'us' == publicsuffix.get_sld('us')
 def test_get_sld_Same_as_above_but_punycoded7(self):
     assert 'xn--85x722f.xn--fiqs8s' == publicsuffix.get_sld(
         'www.xn--85x722f.xn--fiqs8s')
 def test_get_sld_US_K125(self):
     assert 'test.ak.us' == publicsuffix.get_sld('test.ak.us')