Exemple #1
0
def get_public_suffix_list():
    """Build the Public Suffix List, preferring a freshly fetched copy.

    Calls ``fetch()`` and builds a ``PublicSuffixList`` from the file it
    returns. If anything on that path raises, the failure is logged as a
    warning and ``PublicSuffixList()`` (constructed with no arguments) is
    returned instead.

    Returns:
        A ``PublicSuffixList`` instance.
    """
    try:
        return PublicSuffixList(psl_file=fetch())
    except Exception as exc:
        # The fallback is deliberate (best effort), but the reason must not be
        # hidden: otherwise a stale list is indistinguishable from a fresh one.
        import logging

        logging.getLogger(__name__).warning(
            "Unable to fetch the Public Suffix List, using the default one: %s", exc
        )
        return PublicSuffixList()
Exemple #2
0
def get_public_suffix_list() -> PublicSuffixList:
    """Return a ``PublicSuffixList``, built from a fetched file when possible.

    The list file obtained from ``fetch()`` is used first. Any exception on
    that path is reported through the module logger as a warning, and a
    ``PublicSuffixList()`` constructed with no arguments is returned instead.
    """
    try:
        fetched_file = fetch()
        return PublicSuffixList(psl_file=fetched_file)
    except Exception as err:
        module_logger = logging.getLogger(__name__)
        module_logger.warning(f'Unable to fetch the PublicSuffixList: {err}')
        return PublicSuffixList()
def vendors_violations_check(browser, website, args):
    """Collect domains of vendor script files seen in the browser log.

    Waits ``SLEEP_TIME_GET_LOGS`` seconds, then reads ``browser.get_log('browser')``.
    Every entry for which ``get_info(entry, re_script_file, ...)`` returns a
    value is converted with ``url_to_domain`` and stored under the ``"direct"``
    key. The resulting mapping is handed to ``check_violation_vendors_2_4_5``
    together with ``website`` before being returned.

    Args:
        browser: Object exposing ``get_log('browser')``.
        website: Passed through to ``check_violation_vendors_2_4_5``.
        args: Not used by this function.

    Returns:
        ``{"direct": set_of_domains}``.
    """
    time.sleep(SLEEP_TIME_GET_LOGS)
    # NOTE(review): the list is fetched on every call; presumably url_to_domain
    # accepts whatever publicsuffix2.fetch() returns -- confirm.
    suffix_list = publicsuffix2.fetch()
    direct_domains = set()
    for log_entry in browser.get_log('browser'):
        script_url = get_info(log_entry, re_script_file, "Vendor script file")
        if script_url is None:
            continue
        direct_domains.add(url_to_domain(script_url, suffix_list))
    domains = {"direct": direct_domains}
    check_violation_vendors_2_4_5(website, domains)
    return domains
def _flag_gdpr_does_not_apply(website):
    """Record on *website* that a request claimed GDPR does not apply."""
    if not website.violation_gdpr_does_not_apply_this_session:
        website.violation_gdpr_does_not_apply_this_session = True
    website.violation_gdpr_does_not_apply = True


def _extract_post_gdpr_fields(raw_post_data):
    """Return ``(consent_string, gdpr_flag)`` found in a logged POST body.

    The body is first unescaped with the ``unicode_escape`` decoder. If the
    result parses as JSON, ``extract_from_json`` is asked for ``"gdpr_consent"``
    and no ``gdpr`` flag is reported. Otherwise the body is parsed as
    form-encoded data (``a=1&b=2``) and the first ``gdpr_consent`` / ``gdpr``
    values are returned. Either element is ``None`` when not found.
    """
    if not raw_post_data:
        return None, None
    decoded_body = codecs.getdecoder('unicode_escape')(raw_post_data)[0]
    try:
        post_data = json.loads(decoded_body)
        if post_data is None:
            return None, None
        # extract_from_json does not work for integers, so the "gdpr" flag is
        # not looked up in JSON bodies. We do not look for embedded parameters.
        return extract_from_json(post_data, "gdpr_consent"), None
    except json.decoder.JSONDecodeError:
        # Parameters are not necessarily in JSON format: fall back to a
        # form-encoded body. (The previous `raw_post_data is list` test was
        # always False, so this fallback never ran.)
        form_fields = parse_qs(decoded_body)
        consent_values = form_fields.get("gdpr_consent")
        gdpr_values = form_fields.get("gdpr")
        return (
            consent_values[0] if consent_values else None,
            gdpr_values[0] if gdpr_values else None,
        )


def get_through_logs_for_violations(browser, website, seen_consent_strings, try_getting_consent_string=False):
    """Scan the browser log for consent-string sharing and record findings.

    After sleeping ``SLEEP_TIME_GET_LOGS`` seconds, reads
    ``browser.get_log('browser')`` and, for each entry:

    * if ``get_info(entry, re_postmessage, "")`` matches, adds the origin's
      domain to ``domains["postmessage"]``;
    * otherwise, if ``get_info_multiple(entry, re_requests, "")`` matches,
      adds the request domain to ``all_domains`` and inspects its GET query
      string and (when present) POST body for ``gdpr_consent`` / ``gdpr``.

    Side effects on ``website``: may set ``shared_cookie_set``,
    ``violation_gdpr_does_not_apply_this_session``,
    ``violation_gdpr_does_not_apply``, ``redirector_seen`` and
    ``other_consensu_seen``. Finally calls
    ``check_violation_vendors_2_4_5(website, domains)``.

    Args:
        browser: Object exposing ``get_log('browser')``.
        website: Object receiving the flags listed above.
        seen_consent_strings: Mapping whose ``"cookie"``, ``"direct"``,
            ``"postmessage"``, ``"GET"`` and ``"POST"`` values are sets
            that are extended in place.
        try_getting_consent_string: When true, calls
            ``call_cmp_to_get_consent_string`` before reading the log and
            ``get_consent_string`` afterwards.

    Returns:
        ``(all_domains, domains)`` where ``all_domains`` is the set of domains
        of all matched requests and ``domains`` maps ``"postmessage"``,
        ``"get"`` and ``"post"`` to sets of domains.
    """
    # Test case (postmessages) : www.cotemaison.fr (lots of postmessages)
    # Test case (URL-based, GET) : lepoint.fr
    # Test case (URL-based, POST) : lepoint.fr
    # Test case (Ignored GDPR, GET) : lepoint.fr
    # Test case (Ignored GDPR, POST) : lepoint.fr
    time.sleep(SLEEP_TIME_GET_LOGS)
    # NOTE(review): fetched on every call; presumably url_to_domain accepts
    # whatever publicsuffix2.fetch() returns -- confirm.
    psl = publicsuffix2.fetch()
    domains = {"postmessage": set(), "get": set(), "post": set()}
    all_domains = set()
    if try_getting_consent_string:
        call_cmp_to_get_consent_string(browser)
    # Some doc: https://github.com/SeleniumHQ/selenium/wiki/Logging
    # The log is read once and shared with the helpers below.
    logs = browser.get_log('browser')
    cookie = get_shared_cookie(browser, website, logs=logs)
    if cookie is not None:
        seen_consent_strings["cookie"].add(cookie)
        website.shared_cookie_set = True
    if try_getting_consent_string:
        consent_string, consent_strings_postmessage = get_consent_string(browser, website, normal=False, logs=logs)
        seen_consent_strings["direct"].add(consent_string)
        seen_consent_strings["postmessage"].update(consent_strings_postmessage)
    for entry in logs:
        # checking postmessages
        postmessage_origin = get_info(entry, re_postmessage, "")
        if postmessage_origin is not None:
            domains["postmessage"].add(url_to_domain(postmessage_origin, psl))
            continue
        # checking requests
        request = get_info_multiple(entry, re_requests, "")
        if request is None:
            continue
        request_url = request[0]
        parsed = urlparse(request_url)
        domain = url_to_domain(request_url, psl)
        all_domains.add(domain)
        # GET parameters
        parameters = parse_qs(parsed.query)
        consent_values = parameters.get("gdpr_consent")
        gdpr_values = parameters.get("gdpr")
        if consent_values is not None or gdpr_values is not None:
            domains["get"].add(domain)  # add domain to vendors list
            if consent_values:
                seen_consent_strings["GET"].add(consent_values[0])
            if gdpr_values and gdpr_values[0] == '0':
                # A request (GET) pretends GDPR does not apply
                _flag_gdpr_does_not_apply(website)
        # consensu.org redirector checks
        if "consensu.org" in domain:
            redirect_targets = parameters.get("redirect")
            if redirect_targets is None:
                print("Found a request to consensu.org not respecting the redirector specification. Request: %s" % request_url)
                website.other_consensu_seen = True
            elif redirect_targets and str(redirect_targets[0]).startswith("http"):
                # Examples: sports.fr, lematin.ch (sddan)
                print("Found a consensu.org redirector respecting the specification. Request: %s" % request_url)
                website.redirector_seen = True
        # POST parameters
        if len(request) > 1:
            raw_consent_string, gdpr_param = _extract_post_gdpr_fields(request[1])
            if raw_consent_string is not None:
                domains["post"].add(domain)  # add domain to vendors list
                seen_consent_strings["POST"].add(raw_consent_string)
            if gdpr_param == '0':
                # A request (POST) pretends GDPR does not apply
                _flag_gdpr_does_not_apply(website)
    check_violation_vendors_2_4_5(website, domains)
    return all_domains, domains