def check_self_ref(self, page: Page) -> typing.Iterable[CheckResult]:
    """Yield one result telling whether the page references itself.

    The page's own ``url`` is looked up among the ``href`` values of the
    entries in ``page.alts``; exactly one ``CheckResult`` is yielded.
    """
    alternate_hrefs = [alternate.href for alternate in page.alts]
    has_self_reference = page.url in alternate_hrefs
    if has_self_reference:
        text = f"{page.url} has self reference"
    else:
        text = f"{page.url} is missing self reference"
    yield CheckResult(msg=text, valid=has_self_reference)
def check_alts_indexable(self, alt: "PageCheck") -> CheckResult:
    """Report whether the alternate page this page points to is indexable.

    The verdict is taken from ``alt.indexable().valid``.
    """
    is_indexable = alt.indexable().valid
    text = (
        f"{self.page} points to indexable page {alt.page}"
        if is_indexable
        else f"{self.page} points to non-indexable page {alt.page}"
    )
    return CheckResult(msg=text, valid=is_indexable)
def check_alts_self(self, alt: "PageCheck") -> CheckResult:
    """Report whether the alternate page carries its own self-reference.

    The verdict is whatever ``alt.check_self()`` returns; it is passed
    through unchanged as the result's ``valid`` value.
    """
    has_self_reference = alt.check_self()
    text = (
        f"{self.page} points to page {alt.page} which has it's self-reference"
        if has_self_reference
        else f"{self.page} points to page {alt.page} which is missing it's self references"
    )
    return CheckResult(msg=text, valid=has_self_reference)
def check_return(self, alt: "PageCheck") -> CheckResult:
    """Report whether the alternate page links back to this page.

    This page is searched for in the alternate's hreflang entries via
    ``is_page_in_hreflang``.
    """
    links_back = is_page_in_hreflang(self.page, alt.get_hreflangs())
    text = (
        f"{self.page} has return link from {alt.page}"
        if links_back
        else f"{self.page} missing return link from {alt.page}"
    )
    return CheckResult(valid=links_back, msg=text)
def check_robots(self) -> CheckResult:
    """Check the page's robots meta tag for a directive that blocks indexing.

    Looks up the ``meta`` element named ``robots`` in ``self.soup`` and
    reads its ``content`` attribute.  The result is valid when no
    directive is present, or when the directive contains neither
    ``noindex`` nor ``no index`` (case-insensitive substring match).

    Returns:
        A ``CheckResult`` whose message reports the raw directive, or
        ``No robot info`` when none was found.
    """
    robots_element = self.soup.find("meta", {"name": "robots"}) or {}
    robot_directive = robots_element.get("content", None)
    # A noindex directive means the page must NOT be treated as indexable,
    # so finding one has to invalidate the check rather than validate it.
    blocks_indexing = robot_directive is not None and any(
        marker in robot_directive.lower()
        for marker in ("no index", "noindex")
    )
    return CheckResult(
        valid=not blocks_indexing,
        msg=f"{self.page} has robot {robot_directive or 'No robot info'}",
    )
def check_targeting(self, alt: "PageCheck") -> CheckResult:
    """Report whether the alternate targets this page with the same hreflang
    entry this page uses for itself.

    Both entries are looked up with ``get_hreflang_for_page``: once in this
    page's own hreflangs and once in the alternate's.
    """
    own_entry = get_hreflang_for_page(self.page, self.get_hreflangs())
    alt_entry = get_hreflang_for_page(self.page, alt.get_hreflangs())
    same_target = alt_entry == own_entry
    # Fall back to a placeholder when the entry has no ``language``.
    language = getattr(alt_entry, 'language', 'Not found')
    if same_target:
        text = f"{self.page} has page {alt.page} pointing to it with hreflang {language}"
    else:
        text = f"{self.page} has page {alt.page} pointing to it with the wrong locale {language}"
    return CheckResult(msg=text, valid=same_target)
def check_link_in_map(
        self, page: Page,
        pages: typing.List[Page]) -> typing.Iterable[CheckResult]:
    """Yield, per alternate of ``page``, whether its href is also a page url.

    One ``CheckResult`` is produced for every entry in ``page.alts``; it is
    valid when the alternate's ``href`` equals the ``url`` of some entry in
    ``pages``.
    """
    known_urls = [entry.url for entry in pages]
    for alternate in page.alts:
        in_map = alternate.href in known_urls
        if in_map:
            text = alternate.href + " is in and has a corresponding url element"
        else:
            text = alternate.href + " is is pointed to but has no corresponding url element"
        yield CheckResult(msg=text, valid=in_map)
def check_return(self, page: Page,
                 pages: typing.List[Page]) -> typing.Iterable[CheckResult]:
    """Yield, per alternate of ``page``, whether that alternate points back.

    For every entry in ``page.alts`` the alternates registered for its
    ``href`` are fetched with ``get_alts_for_link``; the result is valid
    when ``page.url`` is among their ``href`` values.
    """
    for alternate in page.alts:
        prefix = page.url + " points to " + alternate.href
        return_hrefs = [
            back.href for back in get_alts_for_link(alternate.href, pages)
        ]
        points_back = page.url in return_hrefs
        if points_back:
            text = prefix + " and has link pointing back to it"
        else:
            text = prefix + " but not return"
        yield CheckResult(msg=text, valid=points_back)
def check_target(self, page: Page,
                 pages: typing.List[Page]) -> typing.Iterable[CheckResult]:
    """Yield, per reciprocated alternate, whether both sides agree on language.

    Alternates whose ``href`` is not the ``url`` of any entry in ``pages``
    are skipped, as are alternates that do not point back to ``page.url``.
    For the remaining ones, the language ``page`` assigns to the alternate
    is compared with the language of the entry that
    ``get_hreflang_for_link`` returns for the alternate's ``href``.

    Yields:
        One ``CheckResult`` per alternate that passed both filters;
        ``valid`` is always a ``bool``.
    """
    urls = [_.url for _ in pages]
    for alt in page.alts:
        if alt.href not in urls:
            continue  # Ensure link in map: check_link_in_map
        backlinks = get_alts_for_link(alt.href, pages)
        if page.url not in [_.href for _ in backlinks]:
            continue  # Ensure alternates exist and point back: check_return
        hreflang = get_hreflang_for_link(alt.href, backlinks)
        # The lookup may come back empty; fall back to a placeholder so that
        # building the message cannot raise AttributeError.
        found_language = getattr(hreflang, "language", "Not found")
        # NOTE(review): there is no separator between the url and
        # "url with target"; the text is left unchanged in case callers
        # match on it.
        msg = (page.url + "url with target " + alt.language +
               " is pointed to with target " + found_language)
        valid = bool(hreflang) and alt.language == hreflang.language
        yield CheckResult(msg=msg, valid=valid)
def crawl(self,
          page: typing.Optional[str] = None) -> typing.Iterable[CheckResult]:
    """Crawl from ``page`` (default ``self.root``) and yield every check result.

    Each visited url is validated with ``validators.url``.  A well-formed
    url is wrapped in a ``PageCheck``; its indexability results are yielded
    and, when the page is indexable, so are the results of
    ``validate_alts``.  Links not yet crawled are added to
    ``self.to_crawl``.  A malformed url yields a single invalid result.

    The queue is drained with a loop rather than one recursive call per
    page, so the number of pages is not bounded by the interpreter's
    recursion limit.

    Args:
        page: Url to start from; falls back to ``self.root`` when falsy.

    Yields:
        ``CheckResult`` objects in crawl order.
    """
    page = page or self.root
    while True:
        logger.info(
            f"{len(self.to_crawl)} pages in queue, {len(self.crawled)} done, current page {page}"
        )
        # Mark the page as crawled before collecting its links so that a
        # link from the page to itself is not queued for a second visit.
        self.crawled.add(page)
        if validators.url(page):
            instance = PageCheck(page, self.rp)
            indexable = instance.indexable()
            yield from indexable
            if indexable.valid:
                yield from instance.validate_alts()
                # NOTE(review): links are queued only for indexable pages;
                # confirm this matches the intended crawl policy.
                self.to_crawl.update(
                    instance.get_links().difference(self.crawled))
        else:
            yield CheckResult(valid=False,
                              msg=f"{page} is a badly formed url")
        if not self.to_crawl:
            return
        # Same fallback as on entry: a falsy queued value means the root.
        page = self.to_crawl.pop() or self.root
def check_txt(self) -> CheckResult:
    """Report whether ``self.rp.can_fetch`` permits this page for agent ``*``."""
    allowed = self.rp.can_fetch("*", self.page)
    if allowed:
        verdict = "is allowed by robots.txt"
    else:
        verdict = "is forbidden by robots.txt"
    return CheckResult(valid=allowed, msg=f"{self.page} {verdict}")
def check_canonical(self) -> CheckResult:
    """Report whether the page's canonical link equals the page itself.

    The ``href`` of the ``link`` element with ``rel`` ``canonical`` is read
    from ``self.soup``; a placeholder text is used when the element or its
    ``href`` is absent.
    """
    placeholder = "No canonical link"
    link_element = self.soup.find("link", {"rel": "canonical"})
    if link_element:
        canonical_href = link_element.get("href", placeholder)
    else:
        canonical_href = placeholder
    return CheckResult(
        valid=canonical_href == self.page,
        msg=f"{self.page} has canonical {canonical_href}",
    )
def check_status(self) -> CheckResult:
    """Report whether the stored response for this page has status code 200."""
    status_ok = self.request.status_code == 200
    text = f"{self.page} returned status code {self.request.status_code}"
    return CheckResult(valid=status_ok, msg=text)