Example 1
    def test_grouped_ua_ends_with_rule(self):
        robots_txt = "User-agent: *\nDisallow: /\nUser-agent: Google\nUser-agent: Bingbot\nAllow: /"
        ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

        url = "https://www.example.com/test/is/allowed.html"
        robot = GFlareRobots(robots_txt, user_agent=ua)
        self.assertEqual(robot.is_allowed(url), True, "Should be allowed")
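The test above (and the examples that follow) all exercise the same pattern: construct GFlareRobots from a raw robots.txt string plus a user-agent string, then query is_allowed() per URL. A minimal standalone sketch of that pattern, assuming GFlareRobots is importable from greenflare's core package (the import path and the sample robots.txt are assumptions, not taken from the tests):

from greenflare.core.gflarerobots import GFlareRobots  # assumed import path

robots_txt = "User-agent: *\nDisallow: /private/"
ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

robot = GFlareRobots(robots_txt, user_agent=ua)
print(robot.is_allowed("https://www.example.com/private/page.html"))  # expected: False
print(robot.is_allowed("https://www.example.com/public/page.html"))   # expected: True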
Example 2
    def test_specificity_two(self):
        robots_txt = "User-agent: *\nDisallow: /test/corner/\nAllow: /test/\nDisallow: /test/is/"
        ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

        url = "https://www.example.com/test/corner/funpart.html"
        robot = GFlareRobots(robots_txt, user_agent=ua)
        self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")
Example 3
    def test_ua_override(self):
        robots_txt = "User-agent: *\nAllow: /test*\nUser-agent: Google\nDisallow: /"
        ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

        url = "https://www.example.com/test/is/disallowed.html"
        robot = GFlareRobots(robots_txt, user_agent=ua)
        self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")
Example 4
    def test_least_restrictive(self):
        robots_txt = "User-agent: *\nDisallow: /test*\nAllow: /test/"
        ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

        allowed_url = "https://www.example.com/test/is/allowed.html"
        robot = GFlareRobots(robots_txt, user_agent=ua)
        self.assertEqual(robot.is_allowed(allowed_url), True,
                         "Should be allowed")
Example 5
    def test_broken_robots_txt(self):
        robots_txt = "User-agent: *Allow: /\n\nDisallow: /test/is/disallowed\nUser-agent: Yandex\nDisallow: /*test"
        ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

        url = "https://www.example.com/test/is/disallowed.html"

        robot = GFlareRobots(robots_txt, user_agent=ua)
        self.assertEqual(robot.is_allowed(url), True, "Should be allowed")
Example 6
    def test_ua_as_submatch(self):
        robots_txt = "User-agent: *\nAllow: /\nUser-agent: Google\nDisallow: /test/"
        ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

        allowed_url = "https://www.example.com/is/allowed.html"
        disallowed_url = "https://www.example.com/test/is/not/allowed.html"

        robot = GFlareRobots(robots_txt, user_agent=ua)
        self.assertEqual(robot.is_allowed(allowed_url), True,
                         "Should be allowed")
        self.assertEqual(robot.is_allowed(disallowed_url), False,
                         "Should be disallowed")
Example 7
    def test_rogue_sitemap_entry(self):
        robots_txt = "User-agent: *\nAllow: /allowed/section\nSitemap: https://www.example.com/sitemap.xml\nDisallow: /disallowed/section\nDisallow: /*section"
        ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

        allowed_url = "https://www.example.com/allowed/section"
        disallowed_url = "https://www.example.com/disallowed/section"

        robot = GFlareRobots(robots_txt, user_agent=ua)
        self.assertEqual(robot.is_allowed(allowed_url), True,
                         "Should be allowed")
        self.assertEqual(robot.is_allowed(disallowed_url), False,
                         "Should be disallowed")
Example 8
    def __init__(self, settings, columns):
        self.settings = settings
        self.all_items = columns
        self.response = None
        self.url = ''
        self.base_url = ''
        self.host = ''
        self.robots_txt_ua = "Googlebot"
        self.gfrobots = GFlareRobots('', self.settings.get("USER_AGENT", ''))

        self.spider_links = "Spider" in self.settings.get("MODE", "")

        self.extraction_separator = self.settings.get('EXTRACTION_SEPARATOR',
                                                      '; ')

        self.xpath_mapping = {
            'href_all': '//a/@href',
            'href_respect_nofollow':
            '//a[not(contains(@rel, "nofollow"))]/@href',
            'canonical_tag': '/html/head/link[@rel="canonical"]/@href',
            'hreflang':
            '/html/head/link[@rel="alternate" and @hreflang]/@href',
            'pagination':
            '/html/head/link[@rel="next"]/@href|/html/head/link[@rel="prev"]/@href',
            'images': '//img/@src',
            'stylesheets': '//link[@rel="stylesheet"]/@href',
            'javascript': '//script/@src',
            'h1': '//h1/text()',
            'h2': '//h2/text()',
            'page_title': '/html/head/title/text()',
            'meta_description':
            '/html/head/meta[@name="description"]/@content',
            'base_url': '/html/head/base/@href'
        }

        self.xpath_link_extraction = self.get_link_extraction_xpath()

        self.exclusions_regex = self.exclusions_to_regex(
            self.settings.get('EXCLUSIONS', []))

        self.crawlable_schemes = ('http', 'https', '')
Example 9
    def __init__(self, settings, columns):
        self.settings = settings
        self.all_items = columns
        self.response = None
        self.url = None
        self.url_components = None
        self.robots_txt_ua = "Googlebot"
        self.gfrobots = GFlareRobots('', self.settings.get("USER_AGENT", ''))
        self.robots_txt_status = None

        self.spider_links = "Spider" in self.settings.get("MODE", "")

        if self.robots_txt_status == "BLOCKED" and 'respect_robots_txt' in self.settings.get(
                'CRAWL_ITEMS', ''):
            self.spider_links = False

        self.extraction_separator = self.settings.get('EXTRACTION_SEPARATOR',
                                                      '; ')

        self.xpath_mapping = {
            'canonical_tag': '/html/head/link[@rel="canonical"]/@href',
            'hreflang': '/html/head/link[@rel="alternate"]/@href',
            'pagination':
            '/html/head/link[@rel="next"]/@href|//link[@rel="prev"]/@href',
            'images': '//img/@src',
            'stylesheets': '//link[@rel="stylesheet"]/@href',
            'javascript': '//script/@src',
            'h1': '//h1/text()',
            'h2': '//h2/text()',
            'page_title': '/html/head/title/text()',
            'meta_description': '/html/head/meta[@name="description"]/@content'
        }

        self.xpath_link_extraction = self.get_link_extraction_xpath()

        self.exclusions_regex = self.exclusions_to_regex(
            self.settings.get('EXCLUSIONS', []))
Example 10
class GFlareResponse:
    def __init__(self, settings, columns):
        self.settings = settings
        self.all_items = columns
        self.response = None
        self.url = ''
        self.base_url = ''
        self.host = ''
        self.robots_txt_ua = "Googlebot"
        self.gfrobots = GFlareRobots('', self.settings.get("USER_AGENT", ''))

        self.spider_links = "Spider" in self.settings.get("MODE", "")

        self.extraction_separator = self.settings.get('EXTRACTION_SEPARATOR',
                                                      '; ')

        self.xpath_mapping = {
            'href_all': '//a/@href',
            'href_respect_nofollow':
            '//a[not(contains(@rel, "nofollow"))]/@href',
            'canonical_tag': '/html/head/link[@rel="canonical"]/@href',
            'hreflang':
            '/html/head/link[@rel="alternate" and @hreflang]/@href',
            'pagination':
            '/html/head/link[@rel="next"]/@href|/html/head/link[@rel="prev"]/@href',
            'images': '//img/@src',
            'stylesheets': '//link[@rel="stylesheet"]/@href',
            'javascript': '//script/@src',
            'h1': '//h1/text()',
            'h2': '//h2/text()',
            'page_title': '/html/head/title/text()',
            'meta_description':
            '/html/head/meta[@name="description"]/@content',
            'base_url': '/html/head/base/@href'
        }

        self.xpath_link_extraction = self.get_link_extraction_xpath()

        self.exclusions_regex = self.exclusions_to_regex(
            self.settings.get('EXCLUSIONS', []))

        self.crawlable_schemes = ('http', 'https', '')

    def timing(f):
        @wraps(f)
        def wrap(*args, **kw):
            ts = time()
            result = f(*args, **kw)
            te = time()
            print(f'func:{f.__name__} took: {te - ts}')
            return result

        return wrap

    def set_response(self, response):
        self.response = response
        self.url = self.response.url
        self.host = self.get_domain(self.url)
        self.spider_links = "Spider" in self.settings.get("MODE", "")

        if self.get_robots_txt_status(self.url) == 'blocked' and 'respect_robots_txt' in self.settings.get(
                'CRAWL_ITEMS', ''):
            self.spider_links = False

        if self.is_robots_txt():
            self.response_to_robots_txt()

    def response_to_robots_txt(self):
        if self.response.status_code == 200:
            self.robots_txt = self.response.text
            self.gfrobots.set_robots_txt(self.robots_txt,
                                         user_agent=self.settings.get(
                                             "USER_AGENT", ''))
            self.robots_txt_ua = self.gfrobots.get_short_ua(
                self.settings.get("USER_AGENT", ''))

    def get_initial_url(self):
        if len(self.response.history) == 0:
            return str(self.response.url).strip()
        return str(self.response.history[0].url).strip()

    def get_link_extraction_xpath(self):

        xpaths = []

        crawl_items = self.settings['CRAWL_ITEMS']

        if 'respect_nofollow' not in crawl_items:
            xpaths.append(self.xpath_mapping['href_all'])
        else:
            xpaths.append(self.xpath_mapping['href_respect_nofollow'])
        if 'canonical_tag' in crawl_items:
            xpaths.append(self.xpath_mapping['canonical_tag'])
        if 'hreflang' in crawl_items:
            xpaths.append(self.xpath_mapping['hreflang'])
        if 'pagination' in crawl_items:
            xpaths.append(self.xpath_mapping['pagination'])
        if 'images' in crawl_items:
            xpaths.append(self.xpath_mapping['images'])
        if 'stylesheets' in crawl_items:
            xpaths.append(self.xpath_mapping['stylesheets'])
        if 'javascript' in crawl_items:
            xpaths.append(self.xpath_mapping['javascript'])

        return '|'.join(xpaths)

    def get_data(self):

        d = {'url': self.url}
        d['data'] = self.get_header_info()

        if len(self.response.content) > 0:
            self.tree = self.get_tree()
            self.base_url = self.get_base_url()

            if self.spider_links:
                d['links'] = self.extract_links()
            d['data'] = {**d['data'], **self.get_crawl_data()}

        d['data'] = {
            **d['data'],
            **{
                'crawl_status': self.get_full_status(self.url, d['data'])
            }
        }

        d['data'] = [self.dict_to_row(d['data'])]

        if self.has_redirected():
            d['data'] += self.get_redirects()

        return d

    def get_tree(self):
        try:
            # We need to use page.content rather than page.text because
            # html.fromstring implicitly expects bytes as input.
            return fromstring(self.response.content)
        except Exception as e:
            print('Error parsing', self.url, 'with lxml')
            print(e)

    def get_domain(self, url):
        try:
            _, _, domain, _, _, _, _ = parse_url(url)
        except:
            return ''

        if not domain:
            return ''

        if domain.startswith('www.'):
            return domain.replace('www.', '')

        return domain

    def get_robots_txt_url(self, url):
        comps = parse_url(url)
        url = requote_uri(
            urlunparse([
                comps.scheme, comps.host, 'robots.txt', None, comps.query,
                comps.fragment
            ]))
        return url

    def is_external(self, url):
        if self.settings.get('MODE') == 'List':
            return False

        domain = self.get_domain(url)

        if not domain:
            return False

        return domain != self.settings.get('ROOT_DOMAIN', '')

    def is_excluded(self, url):
        if self.exclusions_regex:
            return bool(match(self.exclusions_regex, url))
        return False

    def get_base_url(self) -> str:
        extraction = self.extract_xpath(self.xpath_mapping['base_url'])

        if extraction:
            return self.sanitise_url(extraction[0], base_url=self.url)

        return self.url

    def exclusions_to_regex(self, exclusions):

        rules = []

        for exclusion in exclusions:
            operator, value = exclusion

            if operator == 'Equal to (=)':
                value = escape(value)
                rules.append(f"^{value}$")
            elif operator == 'Contain':
                value = escape(value)
                rules.append(f".*{value}.*")
            elif operator == 'Start with':
                value = escape(value)
                rules.append(f"^{value}.*")
            elif operator == 'End with':
                value = escape(value)
                rules.append(f".*{value}$")
            elif operator == 'Regex match':
                rules.append(value)

        return '|'.join(rules)

    def is_robots_txt(self, url=None):
        if not url:
            url = self.url

        if self.is_external(url):
            return False
        return parse_url(url).path == '/robots.txt'

    def get_final_url(self):
        return self.response.url

    def get_text(self):
        return self.response.text

    def get_canonical_http_header(self):
        header = self.response.headers.get("Link", "")
        if "rel=" in header:
            return header.split(";")[0].replace("<", "").replace(">", "")
        return ""

    def get_header_info(self):
        header = {
            'url': self.url,
            'status_code': self.response.status_code,
            'content_type': self.response.headers.get('content-type', ''),
            'robots_txt': self.get_robots_txt_status(self.url),
            'x_robots_tag': self.response.headers.get('x-robots-tag', ''),
            'canonical_http': self.get_canonical_http_header()
        }
        return header

    def valid_url(self, url):
        try:
            cmps = parse_url(url)
            if cmps.scheme and not cmps.host:
                return False
            if not cmps.scheme and cmps.host:
                return False
        except:
            return False

        if cmps.scheme and cmps.scheme not in self.crawlable_schemes:
            return False

        # Filter out external links if needed
        if self.settings.get('MODE') != 'List':
            if "external_links" not in self.settings.get(
                    "CRAWL_ITEMS", "") and self.is_external(url):
                return False

        if self.is_excluded(url):
            return False

        # Do not check or report on-page links blocked by robots.txt
        if "check_blocked_urls" not in self.settings.get(
                "CRAWL_ITEMS", "") and not self.allowed_by_robots_txt(url):
            return False
        return True

    def sanitise_url(self, url: str, base_url='') -> str:
        """Cleans a given input URL and returns a RFC compliant URL as a string."""

        if isinstance(url, bytes):
            url = url.decode('utf8')
        else:
            url = str(url)

        # Remove leading whitespaces from url
        url = url.lstrip()

        if base_url:
            url = urljoin(base_url, url)

        try:
            scheme, auth, host, port, path, query, fragment = parse_url(url)
        except:
            return None

        # Carefully reconstruct the network location
        netloc = auth or ''
        if netloc:
            netloc += '@'
        netloc += host
        if port:
            # Only report on ports if they are used in a non-standard way
            if scheme == 'http' and port == 80:
                pass
            elif scheme == 'https' and port == 443:
                pass
            else:
                netloc += ':' + str(port)

        # Bare domains aren't valid URLs.
        if not path:
            path = '/'

        url = requote_uri(
            urlunparse([scheme, netloc, path, None, query, fragment]))

        # Search engines ignore hash fragments hence we remove them from URLs
        url = url.split('#')[0]

        return url

    def extract_links(self):
        links = [
            self.sanitise_url(url, base_url=self.base_url)
            for url in self.extract_xpath(self.xpath_link_extraction)
            if self.valid_url(url)
        ]
        return list(set(links))

    def get_txt_by_selector(self, selector, method="css", get="txt"):
        try:
            if method == "css":
                tree_result = self.tree.cssselect(selector)
            elif method == "xpath":
                tree_result = self.tree.xpath(selector)
            else:
                pass

            txt = ""

            if len(tree_result) > 0:
                if get == "href":
                    txt = tree_result[0].attrib['href']
                elif get != "txt":
                    txt = tree_result[0].get(get)
                else:
                    txt = tree_result[0].text_content()

            if txt is None:
                return ""

            return ' '.join(txt.split())

        except:
            print(f"{selector} failed")
            return ""

    def extract_onpage_elements(self):
        d = {}
        if 'h1' in self.all_items:
            d['h1'] = self.extraction_separator.join(
                self.clean_list(self.extract_xpath(self.xpath_mapping['h1'])))

        if 'h2' in self.all_items:
            d['h2'] = self.extraction_separator.join(
                self.clean_list(self.extract_xpath(self.xpath_mapping['h2'])))

        if 'page_title' in self.all_items:
            d['page_title'] = self.extraction_separator.join(
                self.clean_list(
                    self.extract_xpath(self.xpath_mapping['page_title'])))

        if 'meta_description' in self.all_items:
            d['meta_description'] = self.extraction_separator.join(
                self.clean_list(
                    self.extract_xpath(
                        self.xpath_mapping['meta_description'])))

        return d

    def extract_directives(self):
        d = {}
        if 'canonical_tag' in self.all_items:
            canonicals = self.extract_xpath(
                self.xpath_mapping['canonical_tag'])
            if len(canonicals) > 0:
                d['canonical_tag'] = self.sanitise_url(canonicals[0],
                                                       base_url=self.base_url)
            else:
                d['canonical_tag'] = ''

        if 'canonical_http_header' in self.all_items:
            d['canonical_http_header'] = self.get_canonical_http_header()

        if 'meta_robots' in self.all_items:
            all_fields = self.get_meta_name_fields()
            matching_ua = [
                f for f in all_fields
                if f.lower() in self.robots_txt_ua.lower()
            ]
            rules = []

            if len(matching_ua) > 0:
                ua = matching_ua[0]
                rules = self.extract_xpath(f'//meta[@name="{ua}"]/@content')

            rules += self.extract_xpath('//meta[@name="robots"]/@content')

            d['meta_robots'] = ', '.join(rules)

        return d

    def custom_extractions(self):
        # Accumulate results for every configured extraction, so multiple
        # EXTRACTIONS entries are all honoured.
        extractions = {}

        for extraction_name, selector, value in self.settings.get(
                'EXTRACTIONS', []):
            if selector == 'CSS Selector':
                extractions[extraction_name] = self.get_txt_by_selector(
                    value, method='css', get='txt')
            elif selector == 'XPath':
                extractions[extraction_name] = self.extraction_separator.join(
                    self.clean_list(self.extract_xpath(value)))
            else:
                print('WARNING: regex extraction is not implemented yet')
                extractions[extraction_name] = ''

        return extractions

    def get_crawl_data(self):
        return {
            **self.extract_onpage_elements(),
            **self.extract_directives(),
            **self.custom_extractions()
        }

    def is_canonicalised(self, url, canonical):
        if not canonical:
            return False
        if canonical != url:
            return True
        return False

    def get_full_status(self, url, seo_items):
        status = []

        # Evaluate status code
        try:
            code_description = status_codes._codes[
                seo_items['status_code']][0].replace('_', ' ')
        except KeyError:
            code_description = 'non-standard response'
        status.append(code_description)

        # Check against X-Robots-Tag
        # No per-user-agent checking is done, as a setup like the following
        # cannot be evaluated:
        # X-Robots-Tag: bingbot: noindex
        # X-Robots-Tag: nofollow, nosnippet
        # response.headers['X-Robots-Tag'] returns the combined result
        # 'X-Robots-Tag': 'bingbot: noindex, nofun, norisk, nofollow, nosnippet'
        # which cannot be split back into the original header lines.
        # This combining is compliant with RFC 2616:
        # https://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.2
        if 'noindex' in seo_items.get('x_robots_tag', ''):
            status.append('blocked by x-robots-tag')

        # Check against robots.txt
        if 'blocked' in seo_items.get('robots_txt', ''):
            status.append('blocked by robots.txt')

        # Check against meta robots
        if 'noindex' in seo_items.get('meta_robots', ''):
            status.append('noindex')

        # Canonical Tag
        if self.is_canonicalised(url, seo_items.get('canonical_tag', '')):
            status.append('canonicalised')

        # Canonical Header
        if self.is_canonicalised(url, seo_items.get('canonical_http_header',
                                                    '')):
            status.append('header canonicalised')

        # Avoid 'ok, blocked by robots.txt' and show 'blocked by robots.txt'
        # instead
        if len(status) != 1 and status[0] == 'ok':
            status.pop(0)

        return ', '.join(status)

    def get_meta_name_fields(self):
        fields = []
        try:
            fields = self.tree.xpath('//meta/@name')
        except:
            pass
        return fields

    def dict_to_row(self, data):
        out = tuple(data.get(item, "") for item in self.all_items)
        return out

    def has_redirected(self):
        return len(self.response.history) > 0

    def get_redirects(self):
        data = []
        hist = self.response.history

        for i in range(len(hist)):
            hob_url = self.sanitise_url(hist[i].url)

            if 'external_links' not in self.settings.get('CRAWL_ITEMS', ''):
                if self.is_external(hob_url):
                    break

            robots_status = self.get_robots_txt_status(hob_url)
            if 'respect_robots_txt' in self.settings.get(
                    'CRAWL_ITEMS', ''
            ) and 'follow_blocked_redirects' not in self.settings.get(
                    'CRAWL_ITEMS', '') and robots_status == 'blocked':
                continue

            if i + 1 < len(hist):
                redirect_to_url = self.sanitise_url(str(hist[i + 1].url))
            else:
                redirect_to_url = self.get_final_url()

            hob_data = {
                "url": hob_url,
                "content_type": hist[i].headers.get('Content-Type', ""),
                'status_code': hist[i].status_code,
                'x_robots_tag': hist[i].headers.get('X-Robots-Tag', ''),
                'redirect_url': redirect_to_url,
                'robots_txt': robots_status
            }

            hob_data['crawl_status'] = self.get_full_status(hob_url, hob_data)
            hob_row = self.dict_to_row(hob_data)

            data.append(hob_row)

        return data

    def allowed_by_robots_txt(self, url):
        return self.gfrobots.is_allowed(url)

    def get_robots_txt_status(self, url):
        if self.allowed_by_robots_txt(url):
            return "allowed"
        return "blocked"

    def extract_xpath(self, path):
        try:
            return self.tree.xpath(path)
        except:
            return []

    def clean_list(self, inp):
        try:
            return [' '.join(i.split()) for i in inp if i.strip()]
        except Exception as e:
            print(f'ERROR: cleaning list {inp} failed!')
            return inp

    def get_hreflang_links(self):
        return self.extract_xpath(self.xpath_mapping['hreflang'])

    def get_canonical_links(self):
        return self.extract_xpath(self.xpath_mapping['canonical_tag'])

    def get_pagination_links(self):
        return self.extract_xpath(self.xpath_mapping['pagination'])
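To show how the class above fits together end to end, here is a hedged usage sketch: instantiate GFlareResponse with a settings dict and a column list, feed it a requests response via set_response(), then read get_data(). The import path, settings values, and column names are assumptions for illustration, not part of the original source.

import requests

from greenflare.core.gflareresponse import GFlareResponse  # assumed import path

# Hypothetical settings limited to the keys the class reads above.
settings = {
    'MODE': 'Spider',
    'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'ROOT_DOMAIN': 'example.com',
    'CRAWL_ITEMS': ['respect_nofollow', 'canonical_tag'],
    'EXCLUSIONS': [],
    'EXTRACTION_SEPARATOR': '; ',
    'EXTRACTIONS': [],
}
columns = ['url', 'status_code', 'content_type', 'page_title',
           'canonical_tag', 'crawl_status']

gf = GFlareResponse(settings, columns)
response = requests.get('https://www.example.com/',
                        headers={'User-Agent': settings['USER_AGENT']})
gf.set_response(response)

result = gf.get_data()
print(result['data'][0])        # one row tuple per name in `columns`
print(result.get('links', []))  # on-page links found for further crawling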
Example 11
class GFlareResponse:
    def __init__(self, settings, columns):
        self.settings = settings
        self.all_items = columns
        self.response = None
        self.url = None
        self.url_components = None
        self.robots_txt_ua = "Googlebot"
        self.gfrobots = GFlareRobots('', self.settings.get("USER_AGENT", ''))
        self.robots_txt_status = None

        self.spider_links = "Spider" in self.settings.get("MODE", "")

        if self.robots_txt_status == "BLOCKED" and 'respect_robots_txt' in self.settings.get(
                'CRAWL_ITEMS', ''):
            self.spider_links = False

        self.extraction_separator = self.settings.get('EXTRACTION_SEPARATOR',
                                                      '; ')

        self.xpath_mapping = {
            'canonical_tag': '/html/head/link[@rel="canonical"]/@href',
            'hreflang': '/html/head/link[@rel="alternate"]/@href',
            'pagination':
            '/html/head/link[@rel="next"]/@href|//link[@rel="prev"]/@href',
            'images': '//img/@src',
            'stylesheets': '//link[@rel="stylesheet"]/@href',
            'javascript': '//script/@src',
            'h1': '//h1/text()',
            'h2': '//h2/text()',
            'page_title': '/html/head/title/text()',
            'meta_description': '/html/head/meta[@name="description"]/@content'
        }

        self.xpath_link_extraction = self.get_link_extraction_xpath()

        self.exclusions_regex = self.exclusions_to_regex(
            self.settings.get('EXCLUSIONS', []))

    def timing(f):
        @wraps(f)
        def wrap(*args, **kw):
            ts = time()
            result = f(*args, **kw)
            te = time()
            print(f'func:{f.__name__} took: {te - ts}')
            return result

        return wrap

    def set_response(self, response):
        self.response = response
        # requests.get() percent-encodes the path, so an already encoded path
        # gets double encoded (e.g. '%20' becomes '%2520'), which would keep
        # generating new URLs. Decode the path back to what it was before
        # requests.get() encoded it.
        self.url = self.url_components_to_str(
            self.parse_url(self.unencode_url(self.response.url)))

        if self.is_robots_txt():
            self.response_to_robots_txt()

    def response_to_robots_txt(self):
        if self.response.status_code == 200:
            self.robots_txt = self.response.text
            self.gfrobots.set_robots_txt(self.robots_txt,
                                         user_agent=self.settings.get(
                                             "USER_AGENT", ''))
            self.robots_txt_ua = self.gfrobots.get_short_ua(
                self.settings.get("USER_AGENT", ''))

    def get_initial_url(self):
        if len(self.response.history) == 0:
            return str(self.response.url).strip()
        return str(self.response.history[0].url).strip()

    def get_link_extraction_xpath(self):

        xpaths = []
        xpaths.append('//a/@href')

        crawl_items = self.settings['CRAWL_ITEMS']

        if 'canonical_tag' in crawl_items:
            xpaths.append(self.xpath_mapping['canonical_tag'])
        if 'hreflang' in crawl_items:
            xpaths.append(self.xpath_mapping['hreflang'])
        if 'pagination' in crawl_items:
            xpaths.append(self.xpath_mapping['pagination'])
        if 'images' in crawl_items:
            xpaths.append(self.xpath_mapping['images'])
        if 'stylesheets' in crawl_items:
            xpaths.append(self.xpath_mapping['stylesheets'])
        if 'javascript' in crawl_items:
            xpaths.append(self.xpath_mapping['javascript'])

        return '|'.join(xpaths)

    # @timing
    def get_data(self):

        self.url_components = urllib.parse.urlsplit(self.url)
        d = {'url': self.url}
        d['data'] = self.get_header_info()

        if len(self.response.content) > 0:
            self.tree = self.get_tree()
            if self.spider_links:
                d['links'] = self.extract_links()
            d['data'] = {**d['data'], **self.get_crawl_data()}

        d['data'] = {
            **d['data'],
            **{
                'crawl_status': self.get_full_status(self.url, d['data'])
            }
        }

        d['data'] = [self.dict_to_row(d['data'])]

        if self.has_redirected():
            d['data'] += self.get_redirects()

        return d

    # @timing
    def get_tree(self):
        try:
            # We need to use page.content rather than page.text because
            # html.fromstring implicitly expects bytes as input.
            return fromstring(self.response.content)
        except Exception as e:
            print("Error parsing", self.url, "with lxml")
            print(e)

    def parse_url(self, url):
        try:
            scheme, netloc, path, query, frag = urllib.parse.urlsplit(
                url.strip())
        except:
            print(f'Error parsing {url}')
            return {
                "scheme": '',
                "netloc": '',
                "path": '',
                "query": '',
                "frag": ''
            }
        if not scheme and not netloc:
            # Hack needed as non-RFC tel references are not detected by
            # urlsplit
            if path.startswith("tel:"):
                path = path.replace("tel:", "")
                scheme = "tel"
            else:
                absolute_url = urllib.parse.urljoin(self.url, url)
                scheme, netloc, path, query, frag = urllib.parse.urlsplit(
                    absolute_url)
        if ':' in netloc:
            if scheme == 'https' and ':443' in netloc:
                netloc = netloc.replace(':443', '')
            elif scheme == 'http' and ':80' in netloc:
                netloc = netloc.replace(':80', '')

        return {
            "scheme": scheme,
            "netloc": netloc,
            "path": path.strip(),
            "query": query,
            "frag": frag
        }

    def url_components_to_str(self, comp):
        url = str(
            urllib.parse.urlunsplit((comp["scheme"], comp["netloc"],
                                     comp["path"], comp["query"], "")))
        if comp['path'] == '':
            url += '/'
        return url

    def unencode_url(self, url):
        parsed = self.parse_url(url)
        parsed["path"] = urllib.parse.unquote(parsed["path"])
        return self.url_components_to_str(parsed)

    def get_domain(self, url):
        domain = self.parse_url(url)["netloc"]
        if "www." in domain:
            return domain.replace("www.", "")
        return domain

    def get_robots_txt_url(self, url):
        comps = self.parse_url(url)
        comps["path"] = "robots.txt"
        return self.url_components_to_str(comps)

    def is_external(self, url):
        if self.settings.get("ROOT_DOMAIN", "") == "":
            return False
        return self.get_domain(url) != self.settings.get("ROOT_DOMAIN", "")

    def is_excluded(self, url):
        if self.exclusions_regex:
            return bool(match(self.exclusions_regex, url))
        return False

    def exclusions_to_regex(self, exclusions):

        rules = []

        for exclusion in exclusions:
            operator, value = exclusion

            if operator == 'Equal to (=)':
                value = escape(value)
                rules.append(f"^{value}$")
            elif operator == 'Contain':
                value = escape(value)
                rules.append(f".*{value}.*")
            elif operator == 'Start with':
                value = escape(value)
                rules.append(f"^{value}.*")
            elif operator == 'End with':
                value = escape(value)
                rules.append(f".*{value}$")
            elif operator == 'Regex match':
                rules.append(value)

        return '|'.join(rules)

    def is_robots_txt(self, url=None):
        if not url:
            url = self.url

        if self.is_external(url):
            return False
        return self.parse_url(url)["path"] == "/robots.txt"

    def get_final_url(self):
        return self.url_components_to_str(self.parse_url(self.response.url))

    def get_text(self):
        return self.response.text

    def get_canonical_http_header(self):
        header = self.response.headers.get("Link", "")
        if "rel=" in header:
            return header.split(";")[0].replace("<", "").replace(">", "")
        return ""

    def get_header_info(self):
        header = {
            'url': self.url,
            'status_code': self.response.status_code,
            'content_type': self.response.headers.get('content-type', ''),
            'robots_txt': self.get_robots_txt_status(self.url),
            'x_robots_tag': self.response.headers.get('x-robots-tag', ''),
            'canonical_http': self.get_canonical_http_header()
        }
        return header

    def valid_url(self, components):
        if not "http" in components['scheme']:
            return False

        url = self.url_components_to_str(components)

        if ' ' in url:
            return False

        # Filter out external links if needed
        if "external_links" not in self.settings.get(
                "CRAWL_ITEMS", "") and self.is_external(url):
            return False

        if self.is_excluded(url):
            return False

        # Do not check or report on-page links blocked by robots.txt
        if "check_blocked_urls" not in self.settings.get(
                "CRAWL_ITEMS", "") and not self.allowed_by_robots_txt(url):
            return False
        return True

    # @timing
    def extract_links(self):
        parsed_links = [
            self.parse_url(l)
            for l in self.extract_xpath(self.xpath_link_extraction)
        ]
        links = list(
            set([
                self.url_components_to_str(l) for l in parsed_links
                if self.valid_url(l)
            ]))
        return links

    def get_txt_by_selector(self, selector, method="css", get="txt"):
        try:
            if method == "css":
                tree_result = self.tree.cssselect(selector)
            elif method == "xpath":
                tree_result = self.tree.xpath(selector)
            else:
                pass

            txt = ""

            if len(tree_result) > 0:
                if get == "href":
                    txt = tree_result[0].attrib['href']
                elif get != "txt":
                    txt = tree_result[0].get(get)
                else:
                    txt = tree_result[0].text_content()

            if txt is None:
                return ""

            return ' '.join(txt.split())

        except:
            print(f"{selector} failed")
            return ""

    def extract_onpage_elements(self):
        d = {}
        if 'h1' in self.all_items:
            d['h1'] = self.extraction_separator.join(
                self.clean_list(self.extract_xpath(self.xpath_mapping['h1'])))

        if 'h2' in self.all_items:
            d['h2'] = self.extraction_separator.join(
                self.clean_list(self.extract_xpath(self.xpath_mapping['h2'])))

        if 'page_title' in self.all_items:
            d['page_title'] = self.extraction_separator.join(
                self.clean_list(
                    self.extract_xpath(self.xpath_mapping['page_title'])))

        if 'meta_description' in self.all_items:
            d['meta_description'] = self.extraction_separator.join(
                self.clean_list(
                    self.extract_xpath(
                        self.xpath_mapping['meta_description'])))

        return d

    def extract_directives(self):
        d = {}
        if 'canonical_tag' in self.all_items:
            canonicals = self.extract_xpath(
                self.xpath_mapping['canonical_tag'])
            if len(canonicals) > 0:
                d['canonical_tag'] = canonicals[0]
            else:
                d['canonical_tag'] = ''

        if 'canonical_http_header' in self.all_items:
            d['canonical_http_header'] = self.get_canonical_http_header()

        if 'meta_robots' in self.all_items:
            all_fields = self.get_meta_name_fields()
            matching_ua = [
                f for f in all_fields
                if f.lower() in self.robots_txt_ua.lower()
            ]
            rules = []

            if len(matching_ua) > 0:
                ua = matching_ua[0]
                rules = self.extract_xpath(f'//meta[@name="{ua}"]/@content')

            rules += self.extract_xpath('//meta[@name="robots"]/@content')

            d['meta_robots'] = ', '.join(rules)

        return d

    def custom_extractions(self):
        # Accumulate results for every configured extraction, so multiple
        # EXTRACTIONS entries are all honoured.
        extractions = {}

        for extraction_name, selector, value in self.settings.get(
                'EXTRACTIONS', []):
            if selector == 'CSS Selector':
                extractions[extraction_name] = self.get_txt_by_selector(
                    value, method='css', get='txt')
            elif selector == 'XPath':
                extractions[extraction_name] = self.extraction_separator.join(
                    self.clean_list(self.extract_xpath(value)))
            else:
                print('WARNING: regex extraction is not implemented yet')
                extractions[extraction_name] = ''

        return extractions

    def get_crawl_data(self):
        return {
            **self.extract_onpage_elements(),
            **self.extract_directives(),
            **self.custom_extractions()
        }

    def is_canonicalised(self, url, canonical):
        if not canonical:
            return False
        if self.url_components_to_str(
                self.parse_url(canonical)) != self.url_components_to_str(
                    self.parse_url(url)):
            return True
        return False

    def get_full_status(self, url, seo_items):
        status = []

        # Evaluate status code
        try:
            code_description = status_codes._codes[
                seo_items['status_code']][0].replace('_', ' ')
        except KeyError:
            code_description = 'non-standard response'
        status.append(code_description)

        # Check against X-Robots-Tag
        # No per-user-agent checking is done, as a setup like the following
        # cannot be evaluated:
        # X-Robots-Tag: bingbot: noindex
        # X-Robots-Tag: nofollow, nosnippet
        # response.headers['X-Robots-Tag'] returns the combined result
        # 'X-Robots-Tag': 'bingbot: noindex, nofun, norisk, nofollow, nosnippet'
        # which cannot be split back into the original header lines.
        # This combining is compliant with RFC 2616:
        # https://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.2
        if 'noindex' in seo_items.get('x_robots_tag', ''):
            status.append('blocked by x-robots-tag')

        # Check against robots.txt
        if 'blocked' in seo_items.get('robots_txt', ''):
            status.append('blocked by robots.txt')

        # Check against meta robots
        if 'noindex' in seo_items.get('meta_robots', ''):
            status.append('noindex')

        # Canonical Tag
        if self.is_canonicalised(url, seo_items.get('canonical_tag', '')):
            status.append('canonicalised')

        # Canonical Header
        if self.is_canonicalised(url, seo_items.get('canonical_http_header',
                                                    '')):
            status.append('header canonicalised')

        # Avoid 'ok, blocked by robots.txt' and show 'blocked by robots.txt'
        # instead
        if len(status) != 1 and status[0] == 'ok':
            status.pop(0)

        return ', '.join(status)

    def get_meta_name_fields(self):
        fields = []
        try:
            fields = self.tree.xpath('//meta/@name')
        except:
            pass
        return fields

    def dict_to_row(self, data):
        out = tuple(data.get(item, "") for item in self.all_items)
        return out

    def has_redirected(self):
        return len(self.response.history) > 0

    # @timing
    def get_redirects(self):
        data = []
        hist = self.response.history

        if len(hist) > 0:
            for i in range(len(hist)):
                hob_url = self.url_components_to_str(
                    self.parse_url(hist[i].url))

                if 'external_links' not in self.settings.get(
                        'CRAWL_ITEMS', ''):
                    if self.is_external(hob_url):
                        break

                robots_status = self.get_robots_txt_status(hob_url)
                if 'respect_robots_txt' in self.settings.get(
                        'CRAWL_ITEMS', ''
                ) and 'follow_blocked_redirects' not in self.settings.get(
                        'CRAWL_ITEMS', '') and robots_status == 'blocked':
                    continue

                if i + 1 < len(hist):
                    redirect_to_url = self.url_components_to_str(
                        self.parse_url(str(hist[i + 1].url).strip()))
                else:
                    redirect_to_url = self.get_final_url()

                hob_data = {
                    "url": hob_url,
                    "content_type": hist[i].headers.get('Content-Type', ""),
                    'status_code': hist[i].status_code,
                    'x_robots_tag': hist[i].headers.get('X-Robots-Tag', ''),
                    'redirect_url': redirect_to_url,
                    'robots_txt': robots_status
                }

                hob_data['crawl_status'] = self.get_full_status(
                    hob_url, hob_data)
                hob_row = self.dict_to_row(hob_data)

                data.append(hob_row)

        return data

    def allowed_by_robots_txt(self, url):
        return self.gfrobots.is_allowed(url)

    def get_robots_txt_status(self, url):
        if self.allowed_by_robots_txt(url):
            return "allowed"
        return "blocked"

    def attrib_to_list(self, xpath, attrib):
        try:
            return [
                self.url_components_to_str(self.parse_url(l.attrib[attrib]))
                for l in self.tree.xpath(xpath)
            ]
        except:
            return []

    def extract_xpath(self, path):
        try:
            return self.tree.xpath(path)
        except:
            return []

    def clean_list(self, inp):
        try:
            return [' '.join(i.split()) for i in inp if i.strip()]
        except Exception as e:
            print(f'ERROR: cleaning list {inp} failed!')
            return inp

    def get_hreflang_links(self):
        return self.extract_xpath(self.xpath_mapping['hreflang'])

    def get_canonical_links(self):
        return self.extract_xpath(self.xpath_mapping['canonical_tag'])

    def get_pagination_links(self):
        return self.extract_xpath(self.xpath_mapping['pagination'])
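Both versions of the class above compile the EXCLUSIONS setting into a single alternation pattern in exclusions_to_regex() and apply it with re.match() in is_excluded(). A small self-contained sketch of that operator-to-regex mapping, using hypothetical exclusion rules:

from re import escape, match

# Hypothetical (operator, value) pairs in the same shape as the EXCLUSIONS setting.
exclusions = [
    ('Start with', 'https://www.example.com/private/'),
    ('Contain', 'sessionid='),
]

rules = []
for operator, value in exclusions:
    if operator == 'Start with':
        rules.append(f'^{escape(value)}.*')
    elif operator == 'Contain':
        rules.append(f'.*{escape(value)}.*')

exclusions_regex = '|'.join(rules)

# is_excluded() evaluates bool(match(exclusions_regex, url))
print(bool(match(exclusions_regex, 'https://www.example.com/private/area.html')))   # True
print(bool(match(exclusions_regex, 'https://www.example.com/cart?sessionid=abc')))  # True
print(bool(match(exclusions_regex, 'https://www.example.com/blog/post.html')))      # False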
Example 12
    def test_grouped_ua_ends_with_additional_uas_two(self):
        robots_txt = "User-agent: *\nAllow: /\nUser-agent: Google\n\n\n\nUser-agent:       	Bingbot\nUser-agent: Greenflare      # My own crawler\nDisallow: /test/is/disallowed\nUser-agent: Yandex\nDisallow: /*test"
        ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
        ua_greenflare = "Greenflare SEO Spider/1.0"
        ua_bingbot = "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
        ua_yandex = "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
        ua_firefox = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0"

        url = "https://www.example.com/test/is/disallowed.html"

        robot = GFlareRobots(robots_txt, user_agent=ua)
        self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")

        robot = GFlareRobots(robots_txt, user_agent=ua_greenflare)
        self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")

        robot = GFlareRobots(robots_txt, user_agent=ua_bingbot)
        self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")

        robot = GFlareRobots(robots_txt, user_agent=ua_yandex)
        self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")

        robot = GFlareRobots(robots_txt, user_agent=ua_firefox)
        self.assertEqual(robot.is_allowed(url), True, "Should be allowed")