def get_public_suffix_list():
    """Initialize the Public Suffix List.

    Tries to build the list from the file returned by ``fetch()``. If
    anything goes wrong, the failure is logged as a warning and a
    ``PublicSuffixList`` constructed without arguments is returned instead.

    Returns:
        A ``PublicSuffixList`` instance.
    """
    import logging  # function-scope import keeps this block self-contained

    try:
        psl_file = fetch()
        psl = PublicSuffixList(psl_file=psl_file)
    except Exception as err:
        # Deliberate best-effort fallback: keep going with the default list,
        # but leave a trace so a failed fetch does not go unnoticed.
        logging.getLogger(__name__).warning(
            "Unable to fetch the PublicSuffixList: %s", err
        )
        psl = PublicSuffixList()
    return psl
def get_public_suffix_list() -> PublicSuffixList:
    """Initialize the Public Suffix List.

    Builds the list from the file returned by ``fetch()``. On any exception
    a warning is logged and a ``PublicSuffixList`` constructed without
    arguments is returned instead.
    """
    try:
        return PublicSuffixList(psl_file=fetch())
    except Exception as err:
        logging.getLogger(__name__).warning(f'Unable to fetch the PublicSuffixList: {err}')
        return PublicSuffixList()
def vendors_violations_check(browser, website, args):
    """Collect the domains of script files found in the browser log.

    Waits ``SLEEP_TIME_GET_LOGS`` seconds, then walks the browser's
    ``'browser'`` log. Every entry for which ``get_info`` finds a match of
    ``re_script_file`` contributes its domain to the ``"direct"`` set. The
    result is handed to ``check_violation_vendors_2_4_5`` before being
    returned.

    Args:
        browser: object exposing ``get_log('browser')``.
        website: passed through to ``check_violation_vendors_2_4_5``.
        args: not used by this function.

    Returns:
        A dict with a single key ``"direct"`` mapping to a set of domains.
    """
    time.sleep(SLEEP_TIME_GET_LOGS)
    psl = publicsuffix2.fetch()
    domains = {"direct": set()}
    direct_domains = domains["direct"]
    for log_entry in browser.get_log('browser'):
        script_url = get_info(log_entry, re_script_file, "Vendor script file")
        if script_url is None:
            continue
        direct_domains.add(url_to_domain(script_url, psl))
    check_violation_vendors_2_4_5(website, domains)
    return domains
def _flag_gdpr_does_not_apply(website):
    """Record on *website* that a request claimed GDPR does not apply."""
    if not website.violation_gdpr_does_not_apply_this_session:
        website.violation_gdpr_does_not_apply_this_session = True
        website.violation_gdpr_does_not_apply = True


def get_through_logs_for_violations(browser, website, seen_consent_strings, try_getting_consent_string=False):
    """Scan the browser log for postMessages and requests carrying consent data.

    After sleeping ``SLEEP_TIME_GET_LOGS`` seconds, the ``'browser'`` log is
    read once and each entry is inspected:

    * entries matching ``re_postmessage`` add their domain to
      ``domains["postmessage"]``;
    * entries matching ``re_requests`` add their domain to ``all_domains``
      and are checked for ``gdpr`` / ``gdpr_consent`` parameters, both in the
      URL query string and (when the match has a second element) in the POST
      body.

    Args:
        browser: object exposing ``get_log('browser')``.
        website: object whose flags are updated in place
            (``shared_cookie_set``, ``violation_gdpr_does_not_apply``,
            ``violation_gdpr_does_not_apply_this_session``,
            ``redirector_seen``, ``other_consensu_seen``).
        seen_consent_strings: dict of sets, updated in place under the keys
            ``"cookie"``, ``"direct"``, ``"postmessage"``, ``"GET"`` and
            ``"POST"``.
        try_getting_consent_string: when true, the CMP is queried for the
            consent string before the log is read.

    Returns:
        A tuple ``(all_domains, domains)``: the set of every requested domain
        and a dict of sets keyed by ``"postmessage"``, ``"get"``, ``"post"``.
    """
    # Test case (postmessages) : www.cotemaison.fr (lots of postmessages)
    # Test case (URL-based, GET) : lepoint.fr
    # Test case (URL-based, POST) : lepoint.fr
    # Test case (Ignored GDPR, GET) : lepoint.fr
    # Test case (Ignored GDPR, POST) : lepoint.fr
    time.sleep(SLEEP_TIME_GET_LOGS)
    psl = publicsuffix2.fetch()
    domains = {"postmessage": set(), "get": set(), "post": set()}
    all_domains = set()
    if try_getting_consent_string:
        call_cmp_to_get_consent_string(browser)
    # Some doc: https://github.com/SeleniumHQ/selenium/wiki/Logging
    logs = browser.get_log('browser')

    cookie = get_shared_cookie(browser, website, logs=logs)
    if cookie is not None:
        seen_consent_strings["cookie"].add(cookie)
        website.shared_cookie_set = True

    if try_getting_consent_string:
        consent_string, consent_strings_postmessage = get_consent_string(
            browser, website, normal=False, logs=logs
        )
        seen_consent_strings["direct"].add(consent_string)
        for consent_string in consent_strings_postmessage:
            seen_consent_strings["postmessage"].add(consent_string)

    for entry in logs:
        # checking postmessages
        postmessage_origin = get_info(entry, re_postmessage, "")
        if postmessage_origin is not None:
            domains["postmessage"].add(url_to_domain(postmessage_origin, psl))
            continue

        # checking requests
        request = get_info_multiple(entry, re_requests, "")
        if request is None:
            continue
        parsed = urlparse(request[0])
        domain = url_to_domain(request[0], psl)
        all_domains.add(domain)

        # GET parameters
        parameters = parse_qs(parsed.query)
        if "gdpr_consent" in parameters or "gdpr" in parameters:
            domains["get"].add(domain)  # add domain to vendors list
        if "gdpr_consent" in parameters and len(parameters["gdpr_consent"]) > 0:
            seen_consent_strings["GET"].add(parameters["gdpr_consent"][0])
        if "gdpr" in parameters and len(parameters["gdpr"]) > 0 and parameters["gdpr"][0] == '0':
            # A request (GET) pretends GDPR does not apply
            _flag_gdpr_does_not_apply(website)
        if ("redirect" in parameters and "consensu.org" in domain
                and len(parameters["redirect"]) > 0
                and str(parameters["redirect"][0]).startswith("http")):
            print("Found a consensu.org redirector respecting the specification. Request: %s" % request[0])
            website.redirector_seen = True
        # Examples: sports.fr, lematin.ch (sddan)
        elif "consensu.org" in domain and "redirect" not in parameters:
            print("Found a request to consensu.org not respecting the redirector specification. Request: %s" % request[0])
            website.other_consensu_seen = True

        # POST parameters
        if len(request) <= 1:
            continue
        raw_post_data = request[1]
        raw_consent_string = None
        gdpr_param = None
        try:
            post_data = json.loads(codecs.getdecoder('unicode_escape')(raw_post_data)[0])
            if post_data is not None:
                raw_consent_string = extract_from_json(post_data, "gdpr_consent")
                # extract_from_json does not work for integers.
                # We do not look for embedded parameters.
        except (json.decoder.JSONDecodeError, UnicodeDecodeError):
            # Parameters are not necessarily in JSON format: fall back to
            # reading the body as form-encoded data. The previous test
            # (`raw_post_data is list`) compared against the type object
            # itself and was therefore never true, so this path never ran.
            form_fields = parse_qs(raw_post_data)
            if "gdpr_consent" in form_fields:
                raw_consent_string = form_fields["gdpr_consent"][0]
            if "gdpr" in form_fields:
                gdpr_param = form_fields["gdpr"][0]
        if raw_consent_string is not None:
            domains["post"].add(domain)  # add domain to vendors list
            seen_consent_strings["POST"].add(raw_consent_string)
        if gdpr_param == '0' or gdpr_param == False:
            # A request (POST) pretends GDPR does not apply
            _flag_gdpr_does_not_apply(website)

    check_violation_vendors_2_4_5(website, domains)
    return all_domains, domains