Example #1
    def extract_information(self):
        third_parties = {
            'fqdns': set(),
            'num_http_requests': 0,
            'num_https_requests': 0
        }
        first_party_domains = set()
        for url in (self.result['site_url'], self.result['final_url']):
            extracted = parse_domain(url)
            first_party_domains.add(extracted.registered_domain)
        for request in self.page.request_log:
            request['is_thirdparty'] = False
            extracted_url = parse_domain(request['url'])
            parsed_url = request['parsed_url']
            if extracted_url.registered_domain in first_party_domains:
                continue
            if request['url'].startswith('data:'):
                continue
            request['is_thirdparty'] = True
            third_parties['fqdns'].add(extracted_url.fqdn)
            if parsed_url.scheme not in ('http', 'https'):
                continue
            third_parties['num_{}_requests'.format(parsed_url.scheme)] += 1
        third_parties['fqdns'] = sorted(third_parties['fqdns'])
        self.result['third_parties'] = third_parties

        for cookie in self.result['cookies']:
            domain = cookie['domain']
            if domain.startswith('.'):
                domain = domain[1:]
            domain = parse_domain(domain).registered_domain
            cookie['is_thirdparty'] = domain not in first_party_domains
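
All of these extract_information examples rely on a parse_domain helper that is not shown here. A minimal sketch, assuming it simply wraps tldextract, whose ExtractResult already carries the registered_domain and fqdn attributes used above:

    import tldextract

    # Hypothetical helper: split a URL or hostname into subdomain, domain
    # and suffix using the Public Suffix List. The returned ExtractResult
    # provides the registered_domain and fqdn attributes the extractors
    # rely on.
    _extract = tldextract.TLDExtract(include_psl_private_domains=True)

    def parse_domain(url):
        return _extract(url)

Whether include_psl_private_domains is set is a guess; treating private PSL entries (like *.github.io) as separate registrable domains changes what counts as third party.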
Example #2
    def extract_information(self):
        self._load_rules()
        trackers_fqdn = set()
        trackers_domain = set()
        num_tracker_requests = 0
        blacklist = set()
        num_evaluations = 0
        for request in self.page.request_log:
            request['is_tracker'] = False
            if not request['is_thirdparty'] or request['url'].startswith(
                    'data:'):
                continue
            is_tracker = request['parsed_url'].netloc in blacklist
            if not is_tracker:
                # Matching only the first 150 characters of a URL is
                # sufficient to get good matches, so this speeds up
                # checking quite a bit!
                match_result = self.rules.match(request['url'][:150],
                                                request['document_url'])
                is_tracker = match_result.is_match
                num_evaluations += 1
            if is_tracker:
                request['is_tracker'] = True
                extracted = parse_domain(request['url'])
                if extracted.fqdn:
                    trackers_fqdn.add(extracted.fqdn)
                trackers_domain.add(extracted.registered_domain)
                num_tracker_requests += 1
                blacklist.add(request['parsed_url'].netloc)

        num_tracker_cookies = 0
        for cookie in self.result['cookies']:
            is_tracker = False
            domain = cookie['domain']
            if domain in trackers_fqdn or domain in trackers_domain:
                is_tracker = True
            elif domain.startswith('.'):
                reg_domain = parse_domain(domain[1:]).registered_domain
                if reg_domain in trackers_domain:
                    is_tracker = True

            if is_tracker:
                num_tracker_cookies += 1
            cookie['is_tracker'] = is_tracker

        self.result['tracking'] = {
            'trackers': sorted(trackers_fqdn),
            'num_tracker_requests': num_tracker_requests,
            'num_tracker_cookies': num_tracker_cookies
        }
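
The _load_rules method and the matcher behind self.rules are not shown. As a rough stand-in, here is a sketch built on the adblockparser library and the public EasyList/EasyPrivacy rule files; note that adblockparser exposes should_block(), which returns a plain bool, rather than a match() method returning an object with an is_match attribute, so the call site above would need adapting:

    from pathlib import Path
    from adblockparser import AdblockRules

    # Hypothetical sketch of _load_rules: read EasyList/EasyPrivacy rule
    # files (assumed to have been downloaded to storage_path beforehand,
    # e.g. from https://easylist.to/) and build a matcher from them.
    def _load_rules(self):
        raw_rules = []
        for filename in ('easylist.txt', 'easyprivacy.txt'):
            rules_path = Path(self.options['storage_path']) / filename
            raw_rules.extend(rules_path.read_text().splitlines())
        self.rules = AdblockRules(raw_rules)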
Example #3
    def extract_information(self):
        requests_lookup = {request['requestId']: request
                           for request in self.page.request_log}
        failed_requests = []
        for failed_request in self.page.failed_request_log:
            error_text = failed_request['errorText']
            ignored_errors = ('net::ERR_CACHE_MISS', 'net::ERR_ABORTED')
            if any(error in error_text for error in ignored_errors):
                # Requests that were aborted by the site (e.g. an XHR
                # request that was canceled) and cache misses are
                # not considered failed.
                continue
            extra = None
            try:
                request = requests_lookup[failed_request['requestId']]
            except KeyError:
                self.logger.error(
                    'Could not find request: {}'.format(failed_request))
                continue
            if 'net::ERR_NAME_NOT_RESOLVED' in error_text:
                error_type = 'dns-not-resolved'
                # We could not resolve the IP address of this host. One
                # reason might be that the domain is not registered.
                # To check whether this is the case, we check for the
                # absence of a SOA record for the domain itself, i.e.,
                # not the netloc of the URL. Unregistered domains
                # should have no SOA entry, while registered ones should.
                domain = parse_domain(request['url']).registered_domain
                try:
                    dns.resolver.query(domain, 'SOA')
                    domain_registered = True
                # If we run into a timeout, we'd better not say anything
                # about this domain rather than give a wrong impression of
                # whether it is registered or not.
                except dns.resolver.Timeout:
                    domain_registered = None
                # Nameservers behave oddly if the domain is not registered.
                # Some send NXDOMAIN as expected, others prefer to give an
                # answer but do not include a SOA entry in the response.
                # Sometimes none of the nameservers answers at all if the
                # domain is not registered. It is a real mess.
                except (dns.resolver.NXDOMAIN, dns.resolver.NoNameservers,
                        dns.resolver.NoAnswer):
                    domain_registered = False
                extra = {'domain_registered': domain_registered}
            elif 'net::ERR_UNKNOWN_URL_SCHEME' in error_text:
                error_type = 'unknown-url-scheme'
            else:
                error_type = 'unknown'
            error = {
                'url': request['url'],
                'error_type': error_type,
            }
            if extra is not None:
                error.update(extra)
            if error_type == 'unknown':
                error['error_text'] = error_text
            failed_requests.append(error)
        self.result['failed_requests'] = failed_requests
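
A caveat on the SOA check: dnspython 2.x deprecated dns.resolver.query() in favor of dns.resolver.resolve(). A version-agnostic variant of the check could look like this:

    import dns.resolver

    def has_soa_record(domain):
        # dnspython 2.x renamed query() to resolve(); fall back for 1.x.
        resolve = getattr(dns.resolver, 'resolve', dns.resolver.query)
        try:
            resolve(domain, 'SOA')
            return True
        except dns.resolver.Timeout:
            # Better to report nothing than to guess wrongly.
            return None
        except (dns.resolver.NXDOMAIN, dns.resolver.NoNameservers,
                dns.resolver.NoAnswer):
            return False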
Example #4
    def extract_information(self):
        stats = {}
        for party in ('first', 'third'):
            for duration in ('short', 'long'):
                stats['{}_party_{}'.format(party, duration)] = 0
        cookie_trackers = set()
        for cookie in self.result['cookies']:
            prefix = 'third' if cookie['is_thirdparty'] else 'first'
            suffix = ('long' if cookie['lifetime'] > self.long_cookie_time
                      else 'short')
            stats['{}_party_{}'.format(prefix, suffix)] += 1
            if cookie['is_tracker']:
                tracker = parse_domain(cookie['domain'])
                cookie_trackers.add(tracker.registered_domain)
        stats['trackers'] = sorted(cookie_trackers)
        self.result['cookiestats'] = stats
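
The lifetime field on each cookie is assumed to be computed earlier in the pipeline. A plausible sketch, assuming Chrome DevTools-style cookies, where expires is a Unix timestamp and -1 marks a session cookie:

    import time

    # Hypothetical pre-processing step: derive each cookie's remaining
    # lifetime in seconds from its expiry timestamp. Session cookies
    # (expires == -1 in the DevTools protocol) get a lifetime of 0.
    def annotate_cookie_lifetimes(cookies):
        now = time.time()
        for cookie in cookies:
            expires = cookie.get('expires', -1)
            cookie['lifetime'] = max(0, expires - now) if expires != -1 else 0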
Example #5
    def extract_information(self):
        global _hsts_lookup

        hsts_preload = {'is_ready': False, 'is_preloaded': False}
        self.result['https']['hsts_preload'] = hsts_preload
        self.result.mark_dirty('https')

        if _hsts_lookup is None:
            lookup_file = self.options['storage_path'] / 'hsts.json'
            with lookup_file.open() as f:
                _hsts_lookup = json.load(f)

        domain = parse_domain(self.result['final_url']).registered_domain
        is_preloaded = domain in _hsts_lookup

        # Walk over the parent domains (starting at the TLD) and check
        # whether any of them is preloaded. We have to handle three cases:
        # 1) No parent domain is in the preload list. Reject.
        # 2) A parent domain is in the preload list, and include_subdomains
        #    is set. Accept in this case.
        # 3) A parent domain is in the preload list, but include_subdomains
        #    is not set. Then we have to do two things. Firstly, continue
        #    searching: maybe a longer parent domain is in the list and has
        #    include_subdomains. See case 2. Secondly, the full domain might
        #    be in the lookup. This has already been checked beforehand, so
        #    there is nothing to do.
        current_domain = ''
        for part in reversed(domain.split('.')):
            current_domain = (part + '.' + current_domain
                              if current_domain else part)
            if current_domain in _hsts_lookup:
                include_subdomains = _hsts_lookup[current_domain]
                if include_subdomains:
                    is_preloaded = True
                    break
            else:
                break

        hsts_header = self.result['security_headers'][
            'Strict-Transport-Security']
        if hsts_header is None:
            return

        # There are some big players that were granted exceptions from the
        # standard requirements for HSTS readiness, so we treat them as HSTS
        # ready if they are already in the preload list. However, we still
        # require the HSTS header to be set (see the return statement above).
        hsts_preload['is_preloaded'] = is_preloaded
        if is_preloaded:
            hsts_preload['is_ready'] = True
            return

        # According to hstspreload.org, these are the criteria for being ready
        # to be included in the HSTS preload list:
        #
        # 1. Serve a valid certificate.
        # 2. Redirect from HTTP to HTTPS on the same host, if you are listening
        #    on port 80.
        # 3. Serve all subdomains over HTTPS.
        #    In particular, you must support HTTPS for the www subdomain if a
        #    DNS record for that subdomain exists.
        # 4. Serve an HSTS header on the base domain for HTTPS requests:
        #    4.1 The max-age must be at least 31536000 seconds (1 year).
        #    4.2 The includeSubDomains directive must be specified.
        #    4.3 The preload directive must be specified.
        #    4.4 If you are serving an additional redirect from your HTTPS site,
        #        that redirect must still have the HSTS header (rather than the
        #        page it redirects to).

        fail_reasons = []
        if not self.result['final_url'].startswith('https://'):
            fail_reasons.append('no-https-redirect')
        if not hsts_header['includeSubDomains']:
            fail_reasons.append('no-include-subdomains')
        if hsts_header['max-age'] is None:
            fail_reasons.append('no-max-age')
        elif hsts_header['max-age'] < 31536000:
            fail_reasons.append('max-age-too-short')
        if not hsts_header['preload']:
            fail_reasons.append('missing-preload')

        fail_reasons.sort()
        hsts_preload['is_ready'] = not fail_reasons
        if fail_reasons:
            hsts_preload['fail_reasons'] = fail_reasons
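
The format of hsts.json is implied by the lookup code above: a flat mapping from preloaded domain to its include_subdomains flag. A sketch of how such a file could be generated from Chromium's transport_security_state_static.json (the file name and the 'force-https' mode filter come from Chromium; everything else here is an assumption):

    import json

    def build_hsts_lookup(chromium_json_path, output_path):
        with open(chromium_json_path) as f:
            # Chromium's preload list contains //-style comments that have
            # to be stripped before the file parses as JSON.
            text = '\n'.join(line for line in f
                             if not line.lstrip().startswith('//'))
        entries = json.loads(text)['entries']
        lookup = {entry['name']: entry.get('include_subdomains', False)
                  for entry in entries
                  if entry.get('mode') == 'force-https'}
        with open(output_path, 'w') as f:
            json.dump(lookup, f)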