def validate(self):
    # Flag same-domain links that carry rel="nofollow".
    links = self.get_links()

    page_domain, domain_url = get_domain_from_url(self.review.page_url)

    rel_nofollow = []
    for link in links:
        href = link.get('href')

        if not is_valid(href):
            continue

        link_domain, link_domain_url = get_domain_from_url(href)

        if link.get('rel') == 'nofollow' and page_domain == link_domain:
            rel_nofollow.append(href)

    if rel_nofollow:
        self.add_violation(
            key='invalid.links.nofollow',
            value=rel_nofollow,
            points=10 * len(rel_nofollow)
        )

def normalize_url(self, url):
    parse = is_valid(url)
    if parse:
        if not self.is_absolute(url):
            url = self.rebase(url)
        return self.url_ends_with_slash(url)
    return None

def normalize_url(self, url):
    parse = is_valid(url)
    if parse:
        if not self.is_absolute(url):
            url = self.rebase(url)
        return url
    return None

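# The snippets in this section lean on two module-level URL helpers,
# is_valid() and get_domain_from_url(), whose definitions are not shown
# here. A minimal sketch of what they might look like follows; these are
# assumed, illustrative implementations, not the project's own code.

from urllib.parse import urlparse


def is_valid(url):
    # Return the parse result for a usable URL (so callers can both test
    # truthiness and read .path/.query), or None for empty or unparseable
    # values.
    if not url:
        return None
    try:
        return urlparse(url)
    except ValueError:
        return None


def get_domain_from_url(url):
    # Return a (domain, domain_url) pair, e.g. ('example.com',
    # 'http://example.com') for 'http://example.com/page'. Relative URLs
    # yield an empty domain.
    parsed = urlparse(url)
    domain = parsed.netloc.split(':')[0]
    scheme = parsed.scheme or 'http'
    return domain, '{}://{}'.format(scheme, parsed.netloc)
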
def validate(self):
    # Underscores used as word separators in the URL path are flagged.
    url = self.reviewer.page_url
    parsed_url = is_valid(url)
    path = parsed_url.path

    if '_' in path:
        self.add_violation(key='invalid.url_word_separator', value=url, points=10)

def validate(self):
    url = self.reviewer.page_url
    parsed_url = is_valid(url)
    path = parsed_url.path

    if '_' in path:
        self.add_violation(
            key='invalid.url_word_separator',
            value=url,
            points=10
        )

def get_canonical_urls(self):
    url = self.reviewer.page_url
    parsed_url = is_valid(url)
    scheme_url = parsed_url.scheme
    domain_url = parsed_url.netloc

    if domain_url.startswith('www.'):
        www_url = url.rstrip('/')
        no_www_url = '{}://{}'.format(scheme_url, domain_url[4:])
    else:
        no_www_url = url.rstrip('/')
        www_url = '{}://www.{}'.format(scheme_url, domain_url)

    return {'www_url': www_url, 'no_www_url': no_www_url}

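# For illustration, with example values for the reviewer's page_url, the
# method above is expected to return:
#
#   'http://www.example.com/'  -> {'www_url': 'http://www.example.com',
#                                  'no_www_url': 'http://example.com'}
#   'http://example.com/'      -> {'www_url': 'http://www.example.com',
#                                  'no_www_url': 'http://example.com'}
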
def validate(self):
    if not self.config.FORCE_CANONICAL:
        # Only pages with query string parameters
        if self.page_url:
            if not is_valid(self.page_url):
                return
            if not urlparse(self.page_url).query:
                return

    head = self.get_head()

    if head:
        canonical = [item for item in head if item.get('rel') == 'canonical']

        if not canonical:
            self.add_violation(
                key='absent.meta.canonical',
                value=None,
                points=30
            )

def validate(self):
    # Report links whose domain appears in the configured blacklist.
    blacklist_domains = self.get_violation_pref('blacklist.domains')

    domains = []

    links = self.get_links()
    for link in links:
        href = link.get('href')

        if not is_valid(href):
            continue

        link_domain, link_domain_url = get_domain_from_url(href)
        if link_domain in blacklist_domains:
            domains.append(href)

    if domains:
        self.add_violation(key='blacklist.domains', value=domains, points=100 * len(domains))

def validate(self):
    force_canonical = self.get_violation_pref('absent.meta.canonical')

    if not force_canonical:
        # Only pages with query string parameters
        if self.page_url:
            if not is_valid(self.page_url):
                return
            if not urlparse(self.page_url).query:
                return

    head = self.get_head()

    if head:
        canonical = [
            item for item in head if item.get('rel') == 'canonical'
        ]

        if not canonical:
            self.add_violation(key='absent.meta.canonical', value=None, points=30)

def validate(self):
    blacklist_domains = self.get_violation_pref('blacklist.domains')

    domains = []

    links = self.get_links()
    for link in links:
        href = link.get('href')

        if not is_valid(href):
            continue

        link_domain, link_domain_url = get_domain_from_url(href)
        if link_domain in blacklist_domains:
            domains.append(href)

    if domains:
        self.add_violation(
            key='blacklist.domains',
            value=domains,
            points=100 * len(domains)
        )

def validate(self):
    links = self.get_links()

    page_domain, domain_url = get_domain_from_url(self.review.page_url)

    rel_nofollow = []
    for link in links:
        href = link.get('href')

        if not is_valid(href):
            continue

        link_domain, link_domain_url = get_domain_from_url(href)

        if link.get('rel') == 'nofollow' and page_domain == link_domain:
            rel_nofollow.append(href)

    if rel_nofollow:
        self.add_violation(
            key='invalid.links.nofollow',
            value=rel_nofollow,
            points=10 * len(rel_nofollow)
        )

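# For illustration (hypothetical values): with page_url
# 'http://example.com/index.html', a link such as
# <a href="http://example.com/about" rel="nofollow"> is collected, because
# its domain matches the page's own domain, while a nofollow link pointing
# to another domain is ignored. Each collected link adds 10 points to the
# reported violation.
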