Ejemplo n.º 1
0
 def check_self_ref(self, page: Page) -> typing.Iterable[CheckResult]:
     """Yield one result saying whether ``page`` lists itself as an alternate.

     The result is valid when ``page.url`` equals the ``href`` of at least
     one entry in ``page.alts``.
     """
     has_self_reference = any(alt.href == page.url for alt in page.alts)
     if has_self_reference:
         yield CheckResult(msg=f"{page.url} has self reference", valid=True)
     else:
         yield CheckResult(msg=f"{page.url} is missing self reference",
                           valid=False)
Ejemplo n.º 2
0
 def check_alts_indexable(self, alt: "PageCheck") -> CheckResult:
     """Report whether the alternate page this page points to is indexable.

     The verdict is read from ``alt.indexable().valid``.
     """
     is_indexable = alt.indexable().valid
     description = (
         f"{self.page} points to indexable page {alt.page}"
         if is_indexable
         else f"{self.page} points to non-indexable page {alt.page}"
     )
     return CheckResult(msg=description, valid=is_indexable)
Ejemplo n.º 3
0
 def check_alts_self(self, alt: "PageCheck") -> CheckResult:
     """Report whether the alternate page carries its own self reference.

     The verdict is whatever ``alt.check_self()`` returns.
     """
     has_self_reference = alt.check_self()
     description = (
         f"{self.page} points to page {alt.page} which has it's self-reference"
         if has_self_reference
         else f"{self.page} points to page {alt.page} which is missing it's self references"
     )
     return CheckResult(msg=description, valid=has_self_reference)
Ejemplo n.º 4
0
 def check_return(self, alt: "PageCheck") -> CheckResult:
     """Report whether ``alt`` links back to this page in its hreflang entries.

     The verdict comes from ``is_page_in_hreflang`` applied to this page and
     the entries returned by ``alt.get_hreflangs()``.
     """
     links_back = is_page_in_hreflang(self.page, alt.get_hreflangs())
     if not links_back:
         return CheckResult(
             valid=links_back,
             msg=f"{self.page} missing return link from {alt.page}",
         )
     return CheckResult(
         valid=links_back,
         msg=f"{self.page} has return link from {alt.page}",
     )
Ejemplo n.º 5
0
 def check_robots(self) -> CheckResult:
     """Check that the page's robots meta tag does not forbid indexing.

     Looks up ``<meta name="robots">`` in ``self.soup`` and reads its
     ``content`` attribute. The result is valid when the tag (or its
     ``content``) is absent, or when the directive contains neither
     ``noindex`` nor ``no index`` (case-insensitive substring match).

     Returns:
         A ``CheckResult`` whose message echoes the directive that was
         found, or ``No robot info`` when there is none.
     """
     # Fall back to an empty dict so ``.get`` still works without a tag.
     robots_element = self.soup.find("meta", {"name": "robots"}) or {}
     robot_directive = robots_element.get("content", None)
     # A noindex directive is what makes a page non-indexable, so its
     # presence must fail the check rather than pass it.
     valid = robot_directive is None or not any(
         token in robot_directive.lower()
         for token in ("no index", "noindex"))
     return CheckResult(
         valid=valid,
         msg=f"{self.page} has robot {robot_directive or 'No robot info'}",
     )
Ejemplo n.º 6
0
 def check_targeting(self, alt: "PageCheck") -> CheckResult:
     """Report whether ``alt`` targets this page with the same hreflang entry.

     The entry this page declares for itself is compared with the entry
     ``alt`` declares for this page; both are looked up with
     ``get_hreflang_for_page``. The message shows the ``language`` of the
     entry found on ``alt``, or ``Not found`` when it has no such attribute.
     """
     own_entry = get_hreflang_for_page(self.page, self.get_hreflangs())
     entry_on_alt = get_hreflang_for_page(self.page, alt.get_hreflangs())
     same_targeting = entry_on_alt == own_entry
     description = (
         f"{self.page} has page {alt.page} pointing to it with hreflang {getattr(entry_on_alt, 'language', 'Not found')}"
         if same_targeting
         else f"{self.page} has page {alt.page} pointing to it with the wrong locale {getattr(entry_on_alt, 'language', 'Not found')}"
     )
     return CheckResult(msg=description, valid=same_targeting)
Ejemplo n.º 7
0
 def check_link_in_map(
         self, page: Page,
         pages: typing.List[Page]) -> typing.Iterable[CheckResult]:
     """Yield, per alternate of ``page``, whether its target is in the sitemap.

     Args:
         page: The page whose ``alts`` are inspected.
         pages: The pages to search; their ``url`` values are the known
             locations.

     Yields:
         One ``CheckResult`` per entry in ``page.alts``, valid when the
         alternate's ``href`` equals the ``url`` of some page in ``pages``.
     """
     # Built once: a set gives constant-time membership tests in the loop.
     known_urls = {entry.url for entry in pages}
     for alt in page.alts:
         if alt.href in known_urls:
             yield CheckResult(
                 msg=alt.href + " is in and has a corresponding url element",
                 valid=True)
         else:
             yield CheckResult(
                 msg=alt.href + " is is pointed to but has no corresponding url element",
                 valid=False)
Ejemplo n.º 8
0
    def check_return(self, page: Page,
                     pages: typing.List[Page]) -> typing.Iterable[CheckResult]:
        """Yield, per alternate of ``page``, whether that alternate links back.

        An alternate links back when ``page.url`` is the ``href`` of one of
        the entries ``get_alts_for_link`` returns for the alternate's href.
        """
        for alt in page.alts:
            link_text = page.url + " points to " + alt.href
            return_hrefs = [
                back.href for back in get_alts_for_link(alt.href, pages)
            ]
            if page.url in return_hrefs:
                yield CheckResult(
                    msg=link_text + " and has link pointing back to it",
                    valid=True)
            else:
                yield CheckResult(msg=link_text + " but not return",
                                  valid=False)
Ejemplo n.º 9
0
    def check_target(self, page: Page,
                     pages: typing.List[Page]) -> typing.Iterable[CheckResult]:
        """Yield whether each reciprocal alternate is targeted consistently.

        Alternates that are missing from ``pages`` or that do not point back
        to ``page`` are skipped here; ``check_link_in_map`` and
        ``check_return`` report those cases.

        Args:
            page: The origin page whose ``alts`` are inspected.
            pages: The pages used to resolve alternates and their backlinks.

        Yields:
            One ``CheckResult`` per reciprocal alternate, valid when
            ``get_hreflang_for_link(alt.href, backlinks)`` returns an entry
            whose ``language`` equals ``alt.language``.
        """
        # Built once: a set gives constant-time membership tests in the loop.
        known_urls = {entry.url for entry in pages}
        for alt in page.alts:
            if alt.href not in known_urls:
                continue  # Ensure link in map: check_link_in_map
            backlinks = get_alts_for_link(alt.href, pages)
            if page.url not in [_.href for _ in backlinks]:
                continue  # Ensure alternates exist and point back: check_return

            hreflang = get_hreflang_for_link(alt.href, backlinks)
            # The lookup may find nothing; read the language defensively so
            # building the message cannot raise before validity is decided.
            found_language = getattr(hreflang, "language", "Not found")
            msg = (page.url + " url with target " + alt.language +
                   " is pointed to with target " + found_language)
            # Coerce to a real bool instead of leaking None or the entry.
            valid = bool(hreflang) and alt.language == hreflang.language
            yield CheckResult(msg=msg, valid=valid)
Ejemplo n.º 10
0
    def crawl(
            self,
            page: typing.Optional[str] = None,
    ) -> typing.Iterable[CheckResult]:
        """Crawl from ``page`` and yield every check result along the way.

        Each visited URL is validated with ``validators.url``. A well-formed
        URL is wrapped in a ``PageCheck``; its indexability results are
        yielded, and when it is indexable its alternate checks are yielded
        too and its links are added to ``self.to_crawl``. A malformed URL
        yields a single failing result. Crawling continues until
        ``self.to_crawl`` is empty.

        The queue is drained with a loop rather than by recursion, so the
        number of pages is not bounded by the interpreter's recursion limit.

        Args:
            page: URL to start from; ``self.root`` is used when omitted.

        Yields:
            The ``CheckResult`` objects produced for every crawled page.
        """
        pending = page
        while True:
            current = pending or self.root

            logger.info(
                f"{len(self.to_crawl)} pages in queue, {len(self.crawled)} done, current page {current}"
            )

            if validators.url(current):
                instance = PageCheck(current, self.rp)
                indexable = instance.indexable()
                yield from indexable
                if indexable.valid:
                    yield from instance.validate_alts()
                    # Exclude the page being processed as well as finished
                    # ones, so a self-linking page is not queued again.
                    self.to_crawl.update(instance.get_links().difference(
                        self.crawled, {current}))
            else:
                yield CheckResult(valid=False,
                                  msg=f"{current} is a badly formed url")

            self.crawled.add(current)

            if not self.to_crawl:
                return
            pending = self.to_crawl.pop()
Ejemplo n.º 11
0
 def check_txt(self) -> CheckResult:
     """Report the verdict of ``self.rp.can_fetch("*", self.page)``."""
     allowed = self.rp.can_fetch("*", self.page)
     if allowed:
         verdict = "is allowed by robots.txt"
     else:
         verdict = "is forbidden by robots.txt"
     return CheckResult(valid=allowed, msg=f"{self.page} {verdict}")
Ejemplo n.º 12
0
 def check_canonical(self) -> CheckResult:
     """Report whether the page's canonical link points at the page itself.

     Reads the ``href`` of the ``<link rel="canonical">`` element found in
     ``self.soup``; when the element or its ``href`` is missing, the
     placeholder ``No canonical link`` is compared and reported instead.
     """
     link_tag = self.soup.find("link", {"rel": "canonical"})
     if link_tag:
         target = link_tag.get("href", "No canonical link")
     else:
         target = "No canonical link"
     is_self_canonical = target == self.page
     return CheckResult(
         valid=is_self_canonical,
         msg=f"{self.page} has canonical {target}",
     )
Ejemplo n.º 13
0
 def check_status(self) -> CheckResult:
     """Report whether ``self.request.status_code`` equals 200."""
     status_code = self.request.status_code
     is_ok = status_code == 200
     description = f"{self.page} returned status code {status_code}"
     return CheckResult(valid=is_ok, msg=description)